In [1]:
import json
import numpy as np
import pandas as pd
from datetime import datetime , timedelta
from pytz import timezone
from sklearn.ensemble import RandomForestRegressor

In [2]:
# aggregate
# Read every hashtag dump (one JSON object per line) and collect one row per
# tweet. After each file the cumulative number of rows read so far is printed.
files = ['tweets_#gopatriots.txt','tweets_#gohawks.txt','tweets_#sb49.txt',
            'tweets_#patriots.txt','tweets_#nfl.txt','tweets_#superbowl.txt']
feature_names = ['hours','tweets','retweet','followers','maxfollowers','timeofday']
tweet_columns = {name: [] for name in feature_names}

# The zone object is the same for every tweet, so build it once instead of
# once per input line.
pst_tz = timezone('US/Pacific')

for filename in files:
    with open(filename) as file:
        for line in file:
            data_line = json.loads(line)
            # 'citation_date' is passed to datetime.fromtimestamp, i.e. it is
            # treated as a POSIX timestamp and converted to US/Pacific time.
            posted_at = datetime.fromtimestamp(data_line['citation_date'], pst_tz)
            author_followers = data_line['author']['followers']
            tweet_columns['hours'].append(posted_at)
            tweet_columns['tweets'].append(1)  # each line counts as one tweet
            tweet_columns['retweet'].append(data_line['metrics']['citations']['total'])
            # The follower count is stored twice: one copy is summed per hour
            # and the other is reduced with max() per hour downstream.
            tweet_columns['followers'].append(author_followers)
            tweet_columns['maxfollowers'].append(author_followers)
            tweet_columns['timeofday'].append(posted_at.hour)
            # Not collected here: len(data_line['tweet']['entities']['user_mentions'])
            # and data_line['metrics']['ranking_score'].
    print(len(tweet_columns['hours']))
rawdata = pd.DataFrame(tweet_columns, columns=feature_names)

26232
214368
1041319
1531032
1790056
3138823


In [3]:
# Bucket the per-tweet rows into consecutive one-hour windows.
data_gp = rawdata.groupby(pd.Grouper(key='hours', freq='3600s'))

hourly_features = ['tweets','retweet','followers','maxfollowers','timeofday']
data = {name: [] for name in hourly_features}
for _, bucket in data_gp:
    # Counts are summed over the hour; follower peak and hour-of-day use max().
    data['tweets'].append(bucket['tweets'].sum())
    data['retweet'].append(bucket['retweet'].sum())
    data['followers'].append(bucket['followers'].sum())
    data['maxfollowers'].append(bucket['maxfollowers'].max())
    data['timeofday'].append(bucket['timeofday'].max())
data = pd.DataFrame(data, columns = hourly_features)
# Replace NaN entries with 0 and continue with a plain ndarray.
data = np.nan_to_num(data.values)
print(data)

# Column 4 holds the hour of day for each hourly row.
time_line = data[:, 4]
num_dates = 0  # not referenced anywhere else in this cell

# Period 1: the remainder of the first calendar day, then 17 full days plus
# 8 more hours.
# NOTE(review): the 17*24 + 8 offset is hard-coded; confirm which date/time
# boundary it is meant to hit.
hours_left_in_first_day = 24 - int(time_line[0])
data1_end = hours_left_in_first_day + 17*24 + 8
data1 = data[:data1_end, :]

# Period 2: the following 11 hourly rows.
# NOTE(review): 11 is hard-coded; confirm the intended length of this period.
data2_end = data1_end + 11
data2 = data[data1_end:data2_end, :]

# Period 3: every remaining hourly row.
data3 = data[data2_end:len(time_line), :]

# Indexed as train_data[period - 1].
train_data = [data1, data2, data3]

[[1.200000e+02 7.590000e+02 2.472950e+05 4.181800e+04 0.000000e+00]
 [9.500000e+01 8.040000e+02 2.124180e+05 1.955800e+04 1.000000e+00]
 [1.160000e+02 6.560000e+02 1.659488e+06 1.362401e+06 2.000000e+00]
 ...
 [6.300000e+01 4.740000e+02 1.494713e+06 1.267836e+06 8.000000e+00]
 [5.300000e+01 1.100000e+02 6.238810e+05 1.590740e+05 9.000000e+00]
 [5.500000e+01 8.700000e+01 6.808590e+05 1.607590e+05 1.000000e+01]]


In [4]:
def reshape(data):
    """Turn an hourly feature matrix into sliding-window samples.

    Every sample is five consecutive rows of ``data`` flattened into a
    25-element vector; its target is column 0 of the row that immediately
    follows the window.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array. Each five-row window must hold exactly 25 values, i.e. the
        array is expected to have five columns.

    Returns
    -------
    data_x : numpy.ndarray
        Shape ``(n_rows - 5, 25)`` when there are at least two windows. When
        only one window exists (five or six rows) the flat ``(25,)`` vector is
        returned as-is, so callers that need 2-D input must reshape it.
    data_y : numpy.ndarray
        ``data[5:, 0]``. Empty when ``data`` has only five rows.

    Raises
    ------
    ValueError
        Raised by ``np.reshape`` when a window does not contain 25 values.
    """
    window = 5
    n_rows = data.shape[0]
    # The first window is always produced, even if no row follows it.
    n_windows = max(1, n_rows - window)
    flat_windows = [np.reshape(data[start:start + window, :], 25)
                    for start in range(n_windows)]
    if n_windows == 1:
        # Keep the flat single-window result that existing callers expect.
        data_x = flat_windows[0]
    else:
        # Stack once instead of re-copying the growing matrix on every step.
        data_x = np.vstack(flat_windows)
    data_y = data[window:n_rows, 0]
    return data_x, data_y

def reshape_sample8(data):
    """Build window samples whose target is the LAST row of each window.

    Variant of ``reshape`` for a sample that has only five hourly rows: the
    five rows are flattened into one 25-element vector and the target is
    column 0 of the fifth row (``data[4, 0]``).

    NOTE(review): the target row is part of the feature window, so the value
    being predicted is also present in the features. Confirm this is the
    intended handling for the short sample.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array. Each five-row window must hold exactly 25 values, i.e. the
        array is expected to have five columns.

    Returns
    -------
    data_x : numpy.ndarray
        Flat ``(25,)`` vector when ``data`` has five rows; shape
        ``(n_rows - 4, 25)`` when it has more.
    data_y : numpy.ndarray
        ``data[4:, 0]`` - one target per window.

    Raises
    ------
    ValueError
        Raised by ``np.reshape`` when a window does not contain 25 values.
    """
    window = 5
    n_rows = data.shape[0]
    # One window ends on each row from index 4 onwards; the first window is
    # always produced.
    n_windows = max(1, n_rows - (window - 1))
    # Every window spans five rows. The previous loop sliced only four rows
    # and then failed to reshape them into 25 values.
    flat_windows = [np.reshape(data[start:start + window, :], 25)
                    for start in range(n_windows)]
    if n_windows == 1:
        # Keep the flat single-window result that existing callers expect.
        data_x = flat_windows[0]
    else:
        data_x = np.vstack(flat_windows)
    data_y = data[window - 1:n_rows, 0]
    return data_x, data_y

def SamplePredict(period, samplename):
    """Fit a random forest on one training period and predict one test sample.

    Reads ``test_data/<samplename>.txt`` (one JSON object per line),
    aggregates the tweets into hourly rows with the same five features as the
    training matrix, and uses the first hourly rows as a single test window.
    The forest is trained on ``train_data[period - 1]`` and the predicted and
    true tweet counts are printed.

    Parameters
    ----------
    period : int
        1-based index into the module-level ``train_data`` list.
    samplename : str
        Test file name without directory or ``.txt`` extension. The name
        ``'sample8_period1'`` is special-cased: only five hourly rows are used
        and the window is built with ``reshape_sample8``.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    feature_names = ['hours','tweets','retweet','followers','maxfollowers','timeofday']
    hourly_names = feature_names[1:]

    # The zone object is the same for every tweet, so build it once.
    pst_tz = timezone('US/Pacific')

    tweet_columns = {name: [] for name in feature_names}
    with open('test_data/'+samplename+'.txt') as file:
        for line in file:
            data_line = json.loads(line)
            posted_at = datetime.fromtimestamp(data_line['citation_date'], pst_tz)
            author_followers = data_line['author']['followers']
            tweet_columns['hours'].append(posted_at)
            tweet_columns['tweets'].append(1)  # each line counts as one tweet
            tweet_columns['retweet'].append(data_line['metrics']['citations']['total'])
            tweet_columns['followers'].append(author_followers)
            tweet_columns['maxfollowers'].append(author_followers)
            tweet_columns['timeofday'].append(posted_at.hour)
    tweets = pd.DataFrame(tweet_columns, columns=feature_names)

    # Aggregate into one-hour buckets: sums for the counts, max for the
    # follower peak and the hour of day.
    hourly_columns = {name: [] for name in hourly_names}
    for _, group in tweets.groupby(pd.Grouper(key='hours', freq='3600s')):
        hourly_columns['tweets'].append(group.tweets.sum())
        hourly_columns['retweet'].append(group.retweet.sum())
        hourly_columns['followers'].append(group.followers.sum())
        hourly_columns['maxfollowers'].append(group.maxfollowers.max())
        hourly_columns['timeofday'].append(group.timeofday.max())
    hourly = pd.DataFrame(hourly_columns, columns=hourly_names)

    # Replace NaN with 0 exactly as the training matrix does, so the test
    # features are preprocessed the same way as the training features.
    hourly = np.nan_to_num(hourly.values)

    train_x, train_y = reshape(train_data[period-1])
    if samplename == 'sample8_period1':
        # This sample is limited to five hourly rows; the fifth row supplies
        # the reported true value.
        test_x, test_y = reshape_sample8(hourly[0:5, :])
    else:
        # Five rows of features plus the sixth row as the true value.
        test_x, test_y = reshape(hourly[0:6, :])

    rf = RandomForestRegressor(n_estimators=20, max_features=4, max_depth=4,
                               bootstrap=True, oob_score=True, n_jobs=-1, random_state=42)
    rf.fit(train_x, train_y)
    # test_x is a flat 25-vector for a single window; the model needs 2-D input.
    predict_y = rf.predict(test_x.reshape(1,-1))
    # No rf.score() call here: R^2 is not defined for a single test sample and
    # the value was never used.
    print(samplename+": ")
    print('predict value =',predict_y[0])
    print('true value =',test_y[0])
    print()

### Period 1

In [5]:
# Evaluate every period-1 test sample against the period-1 model.
for sample_name in ('sample1_period1', 'sample4_period1',
                    'sample5_period1', 'sample8_period1'):
    SamplePredict(1, sample_name)

sample1_period1: 
predict value = 507.8646761617787
true value = 177.0

sample4_period1: 
predict value = 560.6267008397111
true value = 201.0

sample5_period1: 
predict value = 484.0790834812191
true value = 215.0

sample8_period1: 
predict value = 345.4584684684563
true value = 11.0



### Period 2

In [6]:
# Evaluate every period-2 test sample against the period-2 model.
for sample_name in ('sample2_period2', 'sample6_period2', 'sample9_period2'):
    SamplePredict(2, sample_name)

sample2_period2: 
predict value = 249004.2
true value = 83440.0

sample6_period2: 
predict value = 175661.4
true value = 37199.0

sample9_period2: 
predict value = 237987.2
true value = 2788.0



### Period 3

In [7]:
# Evaluate every period-3 test sample against the period-3 model.
for sample_name in ('sample3_period3', 'sample7_period3', 'sample10_period3'):
    SamplePredict(3, sample_name)

sample3_period3: 
predict value = 1296.6450157475554
true value = 523.0

sample7_period3: 
predict value = 121.20988042599456
true value = 120.0

sample10_period3: 
predict value = 97.14126556112969
true value = 61.0

