In [1]:
import csv 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import datetime
import math
import pytz
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from statsmodels.api import OLS
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# Helper functions that read the tweet files and combine their data
def readfile_aggregate(file_list):
    """Load per-tweet metrics from one or more tweet dump files.

    Every non-blank line of every file is parsed as one JSON tweet record.
    All tweets from all files end up in a single frame, in file order.

    Parameters
    ----------
    file_list : iterable of str
        Paths of UTF-8 text files holding one JSON object per line.

    Returns
    -------
    pandas.DataFrame
        One row per tweet. The index (named 'Date') is the tweet's
        'citation_date' converted to America/Los_Angeles wall-clock time with
        the timezone information dropped. Columns, in order: num_tweets
        (always 1, so that summing counts tweets), num_followers,
        num_retweets, num_replies, num_mentions, num_urls, num_hashtags,
        num_favorites, num_impressions, num_rank.

    Raises
    ------
    KeyError
        If a record lacks one of the fields read below.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    time_list=[]
    metrics={
        'num_followers':[],
        'num_retweets':[],
        'num_replies':[],
        'num_mentions':[],
        'num_urls':[],
        'num_hashtags':[],
        'num_favorites':[],
        'num_impressions':[],
        'num_rank':[],
    }
    for file_name in file_list:
        # The context manager closes the file; iterating the handle streams it
        # line by line instead of holding the whole dump in memory.
        with open(file_name,encoding='utf8') as file:
            for line in file:
                if not line.strip():
                    continue#tolerate blank lines such as a trailing newline
                json_object=json.loads(line)
                entities=json_object['tweet']['entities']
                citations=json_object['metrics']['citations']
                time_list.append(json_object['citation_date'])#posting time (epoch seconds)
                metrics['num_followers'].append(json_object['author']['followers'])
                metrics['num_retweets'].append(citations['total'])
                metrics['num_replies'].append(citations['replies'])
                # entity counts are the lengths of the corresponding lists
                metrics['num_mentions'].append(len(entities['user_mentions']))
                metrics['num_urls'].append(len(entities['urls']))
                metrics['num_hashtags'].append(len(entities['hashtags']))
                metrics['num_favorites'].append(json_object['tweet']['favorite_count'])
                metrics['num_impressions'].append(json_object['metrics']['impressions'])
                metrics['num_rank'].append(json_object['metrics']['ranking_score'])
    # Epoch seconds -> Pacific wall-clock time, then drop the tz so the index is
    # naive local time (same values as datetime.fromtimestamp(t, tz).replace(tzinfo=None)).
    date_index=pd.to_datetime(time_list,unit='s',utc=True)
    date_index=date_index.tz_convert('America/Los_Angeles').tz_localize(None).rename('Date')
    date_df=pd.DataFrame(metrics,index=date_index)
    date_df.insert(0,'num_tweets',1)#each row is one tweet
    return date_df

def readfile_HM(file_name):
    """Load per-tweet metrics from a single tweet dump file.

    Thin wrapper around readfile_aggregate for the one-file case; see that
    function for the layout of the returned frame.

    Parameters
    ----------
    file_name : str
        Path of a UTF-8 text file holding one JSON object per line.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as readfile_aggregate([file_name]).
    """
    return readfile_aggregate([file_name])

def feature_extraction(date_df,time_period):
    """Aggregate per-tweet rows into one feature row per time window.

    Parameters
    ----------
    date_df : pandas.DataFrame
        Per-tweet frame with a datetime index and the num_* columns read below.
    time_period : str
        Resampling rule passed to DataFrame.resample (window length).

    Returns
    -------
    pandas.DataFrame
        Per window: the sum of every input column, followed by the per-window
        maximum of selected columns (max_* columns, NaN for windows with no
        tweets), and finally 'time', the hour of day (0-23) of the window label.
    """
    # (output column, input column) pairs, in the order the columns are appended
    peak_columns=(
        ('max_followers','num_followers'),
        ('max_replies','num_replies'),
        ('max_mentions','num_mentions'),
        ('max_urls','num_urls'),
        ('max_hashtags','num_hashtags'),
        ('max_favorites','num_favorites'),
        ('max_impressions','num_impressions'),
        ('max_rank','num_rank'),
    )
    # window totals: tweets, followers, retweets, replies, mentions, urls, ...
    feature_map=date_df.resample(time_period).sum()
    for target,source in peak_columns:
        feature_map[target]=date_df[source].resample(time_period).max()
    hour_of_day=feature_map.index.hour
    feature_map['time']=hour_of_day
    return feature_map

def divide_three(date_df,start_time='2015-02-01 08:00:00',end_time='2015-02-01 20:00:00'):
    """Split the tweet stream into three feature maps around a central window.

    Parameters
    ----------
    date_df : pandas.DataFrame
        Per-tweet frame accepted by feature_extraction.
    start_time, end_time : str or datetime-like, optional
        Bounds of the central window. The defaults are the values that were
        previously hard-coded.

    Returns
    -------
    tuple of pandas.DataFrame
        (first, second, third):
        first  - hourly windows whose label is before start_time;
        second - 5-minute windows whose label lies in [start_time, end_time];
        third  - hourly windows whose label is after end_time.
    """
    # The hourly map is shared by the first and third segments, so build it once.
    # 'h' / '5min' are the non-deprecated spellings of the 'H' / '5T' aliases.
    hourly_map=feature_extraction(date_df,'h')
    five_minute_map=feature_extraction(date_df,'5min')
    #the first part
    feature_map_first=hourly_map.loc[hourly_map.index<start_time]
    #the second part
    in_window=(five_minute_map.index>=start_time)&(five_minute_map.index<=end_time)
    feature_map_second=five_minute_map.loc[in_window]
    #the third part
    # NOTE(review): the hourly window labelled exactly end_time is excluded here
    # while only its first 5-minute window is kept above, so the remainder of
    # that hour appears in no segment. Behaviour kept as-is; confirm intent.
    feature_map_third=hourly_map.loc[hourly_map.index>end_time]

    return feature_map_first, feature_map_second, feature_map_third

In [3]:
# Hashtag dumps that are combined into the aggregated training corpus.
file_list=[
    'tweets_#gohawks.txt',
    'tweets_#gopatriots.txt',
    'tweets_#nfl.txt',
    'tweets_#patriots.txt',
    'tweets_#sb49.txt',
    'tweets_#superbowl.txt',
]
date_df_agg=readfile_aggregate(file_list)

In [4]:
# Feature maps of the aggregated corpus for the three time segments
# returned by divide_three, in chronological order.
(feature_map_all_first,
 feature_map_all_second,
 feature_map_all_third)=divide_three(date_df_agg)


In [5]:
# Training set for the first time segment (test samples: s0_p1, s1_p1, s2_p1).
# Row i of X_train holds the features of window i; Y_train[i] is the tweet
# count of window i+1, so the model predicts the next window.
X_train=np.nan_to_num(feature_map_all_first.iloc[:-1])
Y_train=np.nan_to_num(feature_map_all_first['num_tweets'].iloc[1:])
# Standardised copy of the training features.
scalar=StandardScaler()
X_train_scale=scalar.fit_transform(X_train)


In [6]:
# Sample 0, period 1: evaluate the first-segment model on hourly windows.
date_df_s0_p1=readfile_HM('sample0_period1.txt')
feature_s0_p1=feature_extraction(date_df_s0_p1,'h')#'h' is the non-deprecated spelling of 'H'
Y_test=feature_s0_p1['num_tweets'][1:]#tweet count of the window following each feature row
feature_s0_p1=np.nan_to_num(feature_s0_p1)
# The unused StandardScaler was dropped: the model is fitted on the raw features.
# rf_mdl holds a gradient-boosting model here despite its name.
# NOTE(review): no random_state is set, so the numbers printed below will
# presumably vary between runs.
rf_mdl=GradientBoostingRegressor(max_depth=70,max_features='sqrt',min_samples_leaf=1,min_samples_split=10,n_estimators=1000)
rf_mdl.fit(X_train,Y_train)
Y_pred=rf_mdl.predict(feature_s0_p1)
# The last prediction has no observed next window, so it is left out of the
# score. Arguments follow the (y_true, y_pred) order of mean_squared_error.
MSE=mean_squared_error(Y_test,Y_pred[:-1])
print(MSE)
print(Y_pred)

27916.621093240265
[159.19841027 373.06134633 211.64989163 260.19709543 274.70698026
 297.52282734]


In [7]:
# Sample 1, period 1: evaluate the first-segment model on hourly windows.
date_df_s1_p1=readfile_HM('sample1_period1.txt')
feature_s1_p1=feature_extraction(date_df_s1_p1,'h')#'h' is the non-deprecated spelling of 'H'
Y_test=feature_s1_p1['num_tweets'][1:]#tweet count of the window following each feature row
feature_s1_p1=np.nan_to_num(feature_s1_p1)
# The unused StandardScaler was dropped: the model is fitted on the raw features.
# rf_mdl holds a gradient-boosting model here despite its name.
# NOTE(review): no random_state is set, so the numbers printed below will
# presumably vary between runs.
rf_mdl=GradientBoostingRegressor(max_depth=70,max_features='sqrt',min_samples_leaf=1,min_samples_split=10,n_estimators=1000)
rf_mdl.fit(X_train,Y_train)
Y_pred=rf_mdl.predict(feature_s1_p1)
# The last prediction has no observed next window, so it is left out of the
# score. Arguments follow the (y_true, y_pred) order of mean_squared_error.
MSE=mean_squared_error(Y_test,Y_pred[:-1])
print(MSE)
print(Y_pred)

12961.697368590507
[215.36516421 175.69361579 184.80403928 543.17412863 620.60666141
 870.5883959 ]


In [8]:
# Sample 2, period 1: evaluate the first-segment model on hourly windows.
date_df_s2_p1=readfile_HM('sample2_period1.txt')
feature_s2_p1=feature_extraction(date_df_s2_p1,'h')#'h' is the non-deprecated spelling of 'H'
Y_test=feature_s2_p1['num_tweets'][1:]#tweet count of the window following each feature row
feature_s2_p1=np.nan_to_num(feature_s2_p1)
# The StandardScaler and the scaled copy of the features were dropped: the
# scaled array was never passed to the model, which uses the raw features.
# rf_mdl holds a gradient-boosting model here despite its name.
# NOTE(review): no random_state is set, so the numbers printed below will
# presumably vary between runs.
rf_mdl=GradientBoostingRegressor(max_depth=70,max_features='sqrt',min_samples_leaf=1,min_samples_split=10,n_estimators=1000)
rf_mdl.fit(X_train,Y_train)
Y_pred=rf_mdl.predict(feature_s2_p1)
# The last prediction has no observed next window, so it is left out of the
# score. Arguments follow the (y_true, y_pred) order of mean_squared_error.
MSE=mean_squared_error(Y_test,Y_pred[:-1])
print(MSE)
print(Y_pred)

11614.880866328454
[353.49830566 178.24384163 134.66626599 152.8285168  129.07648929
  88.38401825]


In [9]:
# Training set for the second time segment (test samples: s0_p2, s1_p2, s2_p2).
# Row i of X_train holds the features of window i; Y_train[i] is the tweet
# count of window i+1, so the model predicts the next window.
X_train=np.nan_to_num(feature_map_all_second.iloc[:-1])
Y_train=np.nan_to_num(feature_map_all_second['num_tweets'].iloc[1:])
# Standardised copy of the training features.
scalar=StandardScaler()
X_train_scale=scalar.fit_transform(X_train)


In [11]:
# Sample 0, period 2: evaluate the second-segment model on 5-minute windows.
date_df_s0_p2=readfile_HM('sample0_period2.txt')
feature_s0_p2=feature_extraction(date_df_s0_p2,'5min')#'5min' is the non-deprecated spelling of '5T'
Y_test=feature_s0_p2['num_tweets'][1:]#tweet count of the window following each feature row
feature_s0_p2=np.nan_to_num(feature_s0_p2)
# The unused StandardScaler was dropped: the model is fitted on the raw features.
# NOTE(review): no random_state is set, so the numbers printed below will
# presumably vary between runs.
rf_mdl=RandomForestRegressor(max_depth=50,max_features='sqrt',min_samples_leaf=2,min_samples_split=2,n_estimators=1800)
rf_mdl.fit(X_train,Y_train)
Y_pred=rf_mdl.predict(feature_s0_p2)
# The last prediction has no observed next window, so it is left out of the
# score. Arguments follow the (y_true, y_pred) order of mean_squared_error.
MSE=mean_squared_error(Y_test,Y_pred[:-1])
print(MSE)
print(Y_pred)

1302087.5458595278
[1781.21660692 2229.09451058 1949.30689729 2275.79838069 2106.91156367
 2040.13959678]


In [13]:
# Sample 1, period 2: evaluate the second-segment model on 5-minute windows.
date_df_s1_p2=readfile_HM('sample1_period2.txt')
feature_s1_p2=feature_extraction(date_df_s1_p2,'5min')#'5min' is the non-deprecated spelling of '5T'
Y_test=feature_s1_p2['num_tweets'][1:]#tweet count of the window following each feature row
feature_s1_p2=np.nan_to_num(feature_s1_p2)
# The unused StandardScaler was dropped: the model is fitted on the raw features.
# NOTE(review): no random_state is set, so the numbers printed below will
# presumably vary between runs.
rf_mdl=RandomForestRegressor(max_depth=50,max_features='sqrt',min_samples_leaf=2,min_samples_split=2,n_estimators=1800)
rf_mdl.fit(X_train,Y_train)
Y_pred=rf_mdl.predict(feature_s1_p2)
# The last prediction has no observed next window, so it is left out of the
# score. Arguments follow the (y_true, y_pred) order of mean_squared_error.
MSE=mean_squared_error(Y_test,Y_pred[:-1])
print(MSE)
print(Y_pred)

2099291.3707208107
[2518.82275353 2502.29474559 2507.02886332 1737.15778329 2435.67223104
 2215.61793871]


In [14]:
# Sample 2, period 2: evaluate the second-segment model on 5-minute windows.
date_df_s2_p2=readfile_HM('sample2_period2.txt')
feature_s2_p2=feature_extraction(date_df_s2_p2,'5min')#'5min' is the non-deprecated spelling of '5T'
Y_test=feature_s2_p2['num_tweets'][1:]#tweet count of the window following each feature row
feature_s2_p2=np.nan_to_num(feature_s2_p2)
# The unused StandardScaler was dropped: the model is fitted on the raw features.
# NOTE(review): no random_state is set, so the numbers printed below will
# presumably vary between runs.
rf_mdl=RandomForestRegressor(max_depth=50,max_features='sqrt',min_samples_leaf=2,min_samples_split=2,n_estimators=1800)
rf_mdl.fit(X_train,Y_train)
Y_pred=rf_mdl.predict(feature_s2_p2)
# The last prediction has no observed next window, so it is left out of the
# score. Arguments follow the (y_true, y_pred) order of mean_squared_error.
MSE=mean_squared_error(Y_test,Y_pred[:-1])
print(MSE)
print(Y_pred)

2607849.013290526
[1464.9920496  1755.58385538 1765.23672178 1702.23260141 1488.22509877
 1510.5283772 ]


In [15]:
# Training set for the third time segment (test samples: s0_p3, s1_p3, s2_p3).
# Row i of X_train holds the features of window i; Y_train[i] is the tweet
# count of window i+1, so the model predicts the next window.
X_train=np.nan_to_num(feature_map_all_third.iloc[:-1])
Y_train=np.nan_to_num(feature_map_all_third['num_tweets'].iloc[1:])
# Standardised copy of the training features.
scalar=StandardScaler()
X_train_scale=scalar.fit_transform(X_train)



In [84]:
#For the third time segment
#s0_p3,s1_p3,s2_p3
# Sample 0, period 3: evaluate the third-segment model on hourly windows.
date_df_s0_p3=readfile_HM('sample0_period3.txt')
feature_s0_p3=feature_extraction(date_df_s0_p3,'h')#'h' is the non-deprecated spelling of 'H'
Y_test=feature_s0_p3['num_tweets'][1:]#tweet count of the window following each feature row
feature_s0_p3=np.nan_to_num(feature_s0_p3)
# The unused StandardScaler was dropped: the model is fitted on the raw features.
# rf_mdl holds a gradient-boosting model here despite its name.
# NOTE(review): no random_state is set, so the numbers printed below will
# presumably vary between runs.
rf_mdl=GradientBoostingRegressor(max_depth=70,max_features='sqrt',min_samples_leaf=4,min_samples_split=5,n_estimators=400)
rf_mdl.fit(X_train,Y_train)
Y_pred=rf_mdl.predict(feature_s0_p3)
# The last prediction has no observed next window, so it is left out of the
# score. Arguments follow the (y_true, y_pred) order of mean_squared_error.
MSE=mean_squared_error(Y_test,Y_pred[:-1])
print(MSE)
print(Y_pred)

305.6043929183226
[ 28.63610658 100.88628185  57.59947799  53.25457925  67.42836577
 144.49891697]


In [88]:
# Sample 1, period 3: evaluate the third-segment model on hourly windows.
date_df_s1_p3=readfile_HM('sample1_period3.txt')
feature_s1_p3=feature_extraction(date_df_s1_p3,'h')#'h' is the non-deprecated spelling of 'H'
Y_test=feature_s1_p3['num_tweets'][1:]#tweet count of the window following each feature row
feature_s1_p3=np.nan_to_num(feature_s1_p3)
# The unused StandardScaler was dropped: the model is fitted on the raw features.
# rf_mdl holds a gradient-boosting model here despite its name.
# NOTE(review): no random_state is set, so the numbers printed below will
# presumably vary between runs.
rf_mdl=GradientBoostingRegressor(max_depth=70,max_features='sqrt',min_samples_leaf=4,min_samples_split=5,n_estimators=400)
rf_mdl.fit(X_train,Y_train)
Y_pred=rf_mdl.predict(feature_s1_p3)
# The last prediction has no observed next window, so it is left out of the
# score. Arguments follow the (y_true, y_pred) order of mean_squared_error.
MSE=mean_squared_error(Y_test,Y_pred[:-1])
print(MSE)
print(Y_pred)

15416.402362682824
[354.61637145 -12.51105875  13.55221866  -2.05479516  37.04874762
  19.55290619]


In [98]:
# Sample 2, period 3: evaluate the third-segment model on hourly windows.
date_df_s2_p3=readfile_HM('sample2_period3.txt')
feature_s2_p3=feature_extraction(date_df_s2_p3,'h')#'h' is the non-deprecated spelling of 'H'
Y_test=feature_s2_p3['num_tweets'][1:]#tweet count of the window following each feature row
feature_s2_p3=np.nan_to_num(feature_s2_p3)
# The unused StandardScaler was dropped: the model is fitted on the raw features.
# rf_mdl holds a gradient-boosting model here despite its name.
# NOTE(review): no random_state is set, so the numbers printed below will
# presumably vary between runs.
rf_mdl=GradientBoostingRegressor(max_depth=70,max_features='sqrt',min_samples_leaf=4,min_samples_split=5,n_estimators=400)
rf_mdl.fit(X_train,Y_train)
Y_pred=rf_mdl.predict(feature_s2_p3)
# The last prediction has no observed next window, so it is left out of the
# score. Arguments follow the (y_true, y_pred) order of mean_squared_error.
MSE=mean_squared_error(Y_test,Y_pred[:-1])
print(MSE)
print(Y_pred)

10237.412680761923
[ 45.66088963  41.4898875   63.66547167 307.89828437  23.23292322
  35.1644546 ]
