### QUESTION 7: 
    
Aggregate the data of all hashtags and train 3 models (one for each of the time intervals mentioned above) to predict the number of tweets in the next hour on the aggregated data.
Perform the same evaluations on the combined models and compare them with the models you trained for the individual hashtags.

In [1]:
# Hashtags whose tweets are aggregated; each one is used to build a
# 'data/tweets_<tag>.txt' path by the loaders in this notebook.
hash_tags = [
    '#gohawks',
    '#gopatriots',
    '#nfl',
    '#patriots',
    '#sb49',
    '#superbowl',
]

In [2]:
import pickle

def save_object(data, fileName):
    """Pickle ``data`` to ``pynb_data/<fileName>.pickle``.

    Args:
        data: Any picklable object.
        fileName: Name of the cache entry, without the ``.pickle`` suffix.

    The target directory is created when missing, so the call also works on
    a fresh checkout where ``pynb_data/`` has never been created. An existing
    file with the same name is overwritten.
    """
    import os  # local import: no notebook import cell brings ``os`` into scope

    path = 'pynb_data/'+fileName + ".pickle"
    # Without this, open(..., 'wb') raises FileNotFoundError on first use.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        
def load_object(fileName):
    """Read back the object pickled at ``pynb_data/<fileName>.pickle``.

    Args:
        fileName: Name of the cache entry, without the ``.pickle`` suffix.

    Returns:
        The unpickled object, or ``None`` (after printing a message) when the
        file cannot be read.

    NOTE: unpickling can execute arbitrary code; only load files that this
    notebook wrote itself.
    """
    pickle_path = 'pynb_data/'+fileName + ".pickle"
    try:
        with open(pickle_path, 'rb') as source:
            return pickle.load(source)
    except IOError:
        # Best-effort: report the problem and let the caller see None.
        print("Could not read file: " + fileName)
    return None

In [3]:
import json

def getMinAndMaxTs(tag):
    """Return ``[min_ts, max_ts]`` of ``citation_date`` for one hashtag file.

    Reads ``data/tweets_<tag>.txt`` line by line, parsing each non-blank line
    as a JSON object and tracking the smallest and largest ``citation_date``.

    Args:
        tag: Hashtag used to build the file name (e.g. ``'#nfl'``).

    Returns:
        A two-element list ``[min_ts, max_ts]``. When the file holds no
        records, the legacy placeholder ``[1552522378, 0]`` is returned so
        existing callers keep seeing the same value as before.

    Raises:
        IOError/OSError if the file cannot be opened, ``ValueError`` if a
        non-blank line is not valid JSON, ``KeyError`` if a record lacks
        ``citation_date``.
    """
    filename = 'data/tweets_'+tag+'.txt'
    # Start from "nothing seen" instead of a hard-coded epoch value: the old
    # 1552522378 sentinel silently capped the minimum for any newer data.
    min_ts = None
    max_ts = None
    with open(filename) as f:
        for line in f:
            if not line.strip():
                # Blank lines would make json.loads raise; skip them.
                continue
            timestamp = json.loads(line)['citation_date']
            if min_ts is None or timestamp < min_ts:
                min_ts = timestamp
            if max_ts is None or timestamp > max_ts:
                max_ts = timestamp

    if min_ts is None:
        # No records at all: keep the historical return value.
        return [1552522378, 0]
    return [min_ts, max_ts]

# Per-hashtag first/last tweet timestamps, plus the overall range across
# every hashtag in hash_tags.
tagsToMinTs = {}
tagsToMaxTs = {}
# Starting bounds: an epoch value from March 2019 as the upper bound for the
# minimum, and 0 as the lower bound for the maximum.
globalMinTs = 1552522378
globalMaxTs = 0
for tag in hash_tags:
    ts_list = getMinAndMaxTs(tag)
    tagsToMinTs[tag], tagsToMaxTs[tag] = ts_list[0], ts_list[1]
    # Widen the global range whenever this hashtag extends it.
    globalMinTs = min(globalMinTs, ts_list[0])
    globalMaxTs = max(globalMaxTs, ts_list[1])

In [4]:
import math
import datetime
import pytz


def getLocalHour(timestamp):
    """Return the hour of day (0-23) in America/Los_Angeles for a Unix timestamp.

    Args:
        timestamp: Seconds since the Unix epoch.

    Returns:
        The wall-clock hour in the America/Los_Angeles time zone.
    """
    pst = pytz.timezone('America/Los_Angeles')
    # Convert the epoch value directly into the target zone. The previous
    # fromtimestamp(ts) + localize() pair produced the hour of whatever time
    # zone the machine runs in and merely relabelled it as Pacific time.
    return datetime.datetime.fromtimestamp(timestamp, pst).hour

def getWindowNumber(start_ts, curr_ts, window):
    """Return the index of the window that ``curr_ts`` falls into.

    The index is the offset from ``start_ts`` measured in windows and rounded
    up, so ``curr_ts == start_ts`` maps to 0 and any timestamp in
    ``(start_ts + (k-1)*window, start_ts + k*window]`` maps to ``k``.
    """
    offset_in_windows = (curr_ts - start_ts) / float(window)
    return math.ceil(offset_in_windows)

def getFeatures(start_ts,end_ts,window):
    """Build per-window feature rows and next-window labels over all hashtags.

    Every file ``data/tweets_<tag>.txt`` for ``tag`` in ``hash_tags`` is
    streamed once. Tweets whose ``citation_date`` lies in
    ``[start_ts, end_ts]`` are assigned to a window via ``getWindowNumber``
    and aggregated per window.

    Args:
        start_ts: First timestamp (inclusive) of the period.
        end_ts: Last timestamp (inclusive) of the period.
        window: Window length in seconds; also the step of the output rows.

    Returns:
        ``(features, labels)`` with one entry per ``period`` in
        ``range(start_ts, end_ts, window)``. Each feature row is
        ``[tweet count, summed metrics.citations.total, summed author
        followers, max author followers, getLocalHour(period)]`` for the
        window indexed by ``period``; the label is the tweet count of the
        window indexed by ``period + window``. Windows without tweets
        contribute zeros.
    """
    tweets_by_window = {}
    citations_by_window = {}
    followers_by_window = {}
    top_followers_by_window = {}

    # Pass 1: accumulate the per-window aggregates across every hashtag file.
    for tag in hash_tags:
        with open('data/tweets_'+tag+'.txt') as tweet_file:
            for raw_line in tweet_file:
                tweet = json.loads(raw_line)
                posted_at = tweet['citation_date']
                if posted_at < start_ts or posted_at > end_ts:
                    # Outside the requested period.
                    continue

                slot = getWindowNumber(start_ts, posted_at, window)
                citations = tweet['metrics']['citations']['total']
                followers = tweet['author']['followers']

                tweets_by_window[slot] = tweets_by_window.get(slot, 0) + 1
                citations_by_window[slot] = citations_by_window.get(slot, 0) + citations
                followers_by_window[slot] = followers_by_window.get(slot, 0) + followers
                top_followers_by_window[slot] = max(
                    top_followers_by_window.get(slot, 0), followers
                )

    # Pass 2: emit one row per window start, labelled with the tweet count of
    # the window that follows it.
    features = []
    labels = []
    for period in range(start_ts, end_ts, window):
        slot = getWindowNumber(start_ts, period, window)
        following_slot = getWindowNumber(start_ts, period + window, window)
        features.append([
            tweets_by_window.get(slot, 0),
            citations_by_window.get(slot, 0),
            followers_by_window.get(slot, 0),
            top_followers_by_window.get(slot, 0),
            getLocalHour(period),
        ])
        labels.append(tweets_by_window.get(following_slot, 0))

    return features, labels

In [5]:
import statsmodels.api as sm



# Time period 1: from the earliest tweet (rounded down to a whole window) up
# to 1422806400 (2015-02-01 16:00:00 UTC), in 1-hour windows.
tp1_window_size = 60 * 60
tp1_end_ts = 1422806400
# Align the start to a window boundary at or before globalMinTs.
tp1_start_ts = math.floor(globalMinTs / float(tp1_window_size)) * tp1_window_size

features, labels = getFeatures(tp1_start_ts, tp1_end_ts, tp1_window_size)

# Cache both vectors so later runs can reload them instead of re-reading the data.
save_object(features, "q7_tp1_features")
save_object(labels, "q7_tp1_labels")
print("Finished Building feature vectors for time period 1")

Finished Building feature vectors for time period 1


In [6]:
import statsmodels.api as sm
import statsmodels.tools.eval_measures as ste

# Fit and report an OLS model on the time-period-1 vectors currently held in
# `features` / `labels` (the cached copies can be reloaded instead with
# load_object("q7_tp1_features") and load_object("q7_tp1_labels")).
print('\nLinear Regression Model for Time period 1')
X_orig, y = features, labels

# statsmodels' OLS has no intercept unless a constant column is added; see
# https://becominghuman.ai/stats-models-vs-sklearn-for-linear-regression-f19df95ad99b
X = sm.add_constant(X_orig)

model = sm.OLS(y, X)
results = model.fit()
# Predictions are made on the same rows the model was fitted on (in-sample).
pred_y = results.predict(X)

print(
    "\nMSE : {}".format(ste.mse(pred_y, y, axis=0)),
    "R-squared : {}".format(results.rsquared),
    "P values for the features are \n {} \n\n".format(results.pvalues),
    sep='\n',
)

print(results.summary())
print('---'*20, '\n\n', sep='\n')


Linear Regression Model for Time period 1

MSE : 4418015.068686534
R-squared : 0.40979828340375035
P values for the features are 
 [4.49388527e-02 2.08213571e-05 7.68533992e-01 3.23580150e-02
 4.56836862e-01 8.63099801e-01] 


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.410
Model:                            OLS   Adj. R-squared:                  0.403
Method:                 Least Squares   F-statistic:                     60.27
Date:                Wed, 20 Mar 2019   Prob (F-statistic):           1.30e-47
Time:                        12:13:49   Log-Likelihood:                -3990.6
No. Observations:                 440   AIC:                             7993.
Df Residuals:                     434   BIC:                             8018.
Df Model:                           5                                         
Covariance Type:            nonrobust                        

In [7]:
# Time period 2: the 12 hours from 1422806400 (2015-02-01 16:00:00 UTC) to
# 1422849600, in 5-minute windows.
tp2_window_size = 5 * 60
tp2_start_ts = 1422806400
tp2_end_ts = tp2_start_ts + 12 * 60 * 60

features, labels = getFeatures(tp2_start_ts, tp2_end_ts, tp2_window_size)

# Cache both vectors so later runs can reload them instead of re-reading the data.
save_object(features, "q7_tp2_features")
save_object(labels, "q7_tp2_labels")
print("Finished Building feature vectors for time period 2")

Finished Building feature vectors for time period 2


In [8]:
import statsmodels.api as sm
import statsmodels.tools.eval_measures as ste

# Fit and report an OLS model on the time-period-2 vectors currently held in
# `features` / `labels` (the cached copies can be reloaded instead with
# load_object("q7_tp2_features") and load_object("q7_tp2_labels")).
print('\nLinear Regression Model for Time period 2')
X_orig, y = features, labels

# statsmodels' OLS has no intercept unless a constant column is added; see
# https://becominghuman.ai/stats-models-vs-sklearn-for-linear-regression-f19df95ad99b
X = sm.add_constant(X_orig)

model = sm.OLS(y, X)
results = model.fit()
# Predictions are made on the same rows the model was fitted on (in-sample).
pred_y = results.predict(X)

print(
    "\nMSE : {}".format(ste.mse(pred_y, y, axis=0)),
    "R-squared : {}".format(results.rsquared),
    "P values for the features are \n {} \n\n".format(results.pvalues),
    sep='\n',
)

print(results.summary())
print('---'*20, '\n\n', sep='\n')


Linear Regression Model for Time period 2

MSE : 17345388.875838757
R-squared : 0.8481182442791568
P values for the features are 
 [5.88070117e-01 5.66227916e-24 2.55305227e-02 8.10938051e-01
 3.96822925e-01 3.19727598e-01] 


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.848
Model:                            OLS   Adj. R-squared:                  0.843
Method:                 Least Squares   F-statistic:                     154.1
Date:                Wed, 20 Mar 2019   Prob (F-statistic):           1.16e-54
Time:                        12:18:48   Log-Likelihood:                -1404.5
No. Observations:                 144   AIC:                             2821.
Df Residuals:                     138   BIC:                             2839.
Df Model:                           5                                         
Covariance Type:            nonrobust                        

In [9]:
# Time period 3: from 1422849600 (2015-02-02 04:00:00 UTC) to the latest
# tweet (rounded up to a whole window), in 1-hour windows.
tp3_window_size = 60 * 60
tp3_start_ts = 1422849600
# Align the end to a window boundary at or after globalMaxTs.
tp3_end_ts = math.ceil(globalMaxTs / float(tp3_window_size)) * tp3_window_size

features, labels = getFeatures(tp3_start_ts, tp3_end_ts, tp3_window_size)

# Cache both vectors so later runs can reload them instead of re-reading the data.
save_object(features, "q7_tp3_features")
save_object(labels, "q7_tp3_labels")
print("Finished Building feature vectors for time period 3")

Finished Building feature vectors for time period 3


In [10]:
import statsmodels.api as sm
import statsmodels.tools.eval_measures as ste

# Fit and report an OLS model on the time-period-3 vectors currently held in
# `features` / `labels` (the cached copies can be reloaded instead with
# load_object("q7_tp3_features") and load_object("q7_tp3_labels")).
print('\nLinear Regression Model for Time period 3')
X_orig, y = features, labels

# statsmodels' OLS has no intercept unless a constant column is added; see
# https://becominghuman.ai/stats-models-vs-sklearn-for-linear-regression-f19df95ad99b
X = sm.add_constant(X_orig)

model = sm.OLS(y, X)
results = model.fit()
# Predictions are made on the same rows the model was fitted on (in-sample).
pred_y = results.predict(X)

print(
    "\nMSE : {}".format(ste.mse(pred_y, y, axis=0)),
    "R-squared : {}".format(results.rsquared),
    "P values for the features are \n {} \n\n".format(results.pvalues),
    sep='\n',
)

print(results.summary())
print('---'*20, '\n\n', sep='\n')


Linear Regression Model for Time period 3

MSE : 2332820.8808568306
R-squared : 0.5084587539500655
P values for the features are 
 [0.20245378 0.04086468 0.12992942 0.02705564 0.49855171 0.46134929] 


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.508
Model:                            OLS   Adj. R-squared:                  0.489
Method:                 Least Squares   F-statistic:                     26.69
Date:                Wed, 20 Mar 2019   Prob (F-statistic):           1.90e-18
Time:                        12:22:13   Log-Likelihood:                -1181.3
No. Observations:                 135   AIC:                             2375.
Df Residuals:                     129   BIC:                             2392.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
       