### QUESTION 6: 

We define three time periods and their corresponding window length as follows:
1. Before Feb. 1, 8:00 a.m.: 1-hour window
2. Between Feb. 1, 8:00 a.m. and 8:00 p.m.: 5-minute window 
3. After Feb. 1, 8:00 p.m.: 1-hour window

For each hashtag, train 3 regression models, one for each of these time periods (the times are all in PST). Report the MSE and R-squared score for each case.


In [26]:
hash_tags = ['#gohawks','#gopatriots','#nfl','#patriots','#sb49','#superbowl']

In [27]:
import pickle

def save_object(data, fileName):
    with open('pynb_data/'+fileName + ".pickle", 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        
def load_object(fileName):
    try:
        with open('pynb_data/'+fileName + ".pickle", 'rb') as f:
            data = pickle.load(f)
            return data
    except IOError:
        print("Could not read file: " + fileName)

In [28]:
import json

def getMinAndMaxTs(tag):
    filename = 'data/tweets_'+tag+'.txt'
    max_ts = 0
    min_ts = 1552522378
    with open(filename) as f:
        for line in f:
            json_object = json.loads(line)
            timestamp = json_object['citation_date']
            if(timestamp < min_ts):                
                min_ts = timestamp
            
            if(timestamp > max_ts):
                max_ts = timestamp
                
    return [min_ts,max_ts]

tagsToMinTs = {}
tagsToMaxTs = {}
for tag in hash_tags:
    ts_list = getMinAndMaxTs(tag)
    tagsToMinTs[tag] = (ts_list[0])
    tagsToMaxTs[tag] = (ts_list[1])    

In [29]:
import math
import datetime
import pytz


def getLocalHour(timestamp):
    d = datetime.datetime.fromtimestamp(timestamp)
    pst = pytz.timezone('America/Los_Angeles')
    d = pst.localize(d)
    return d.hour

def getWindowNumber(start_ts, curr_ts, window):
    elapsed = (curr_ts - start_ts)/(window*1.0)
    windowNum = math.ceil(elapsed)
    return windowNum    

def getFeatures(tag,start_ts,end_ts,window):
    windowToTweets = {}
    windowToRetweets = {}
    windowToFollowerCount = {}
    windowToMaxFollowers = {}
    features = []
    labels = []
    
    filename = 'data/tweets_'+tag+'.txt'
    with open(filename) as f:
        for line in f:
            json_object = json.loads(line)
            timestamp = json_object['citation_date']
            
            if timestamp < start_ts or timestamp > end_ts:                            
                continue
                
            key = getWindowNumber(start_ts,timestamp,window)
#             print(key)
            if key not in windowToTweets.keys():
                windowToTweets[key]=0
            windowToTweets[key]+=1
            
            retweetCount = json_object['metrics']['citations']['total']        
            
            if key not in windowToRetweets.keys():
                windowToRetweets[key]=0
            windowToRetweets[key]+=retweetCount
        
            followerCount = json_object['author']['followers']
            if key not in windowToFollowerCount.keys():
                windowToFollowerCount[key]=0
            windowToFollowerCount[key]+=followerCount
        
            if key not in windowToMaxFollowers.keys():
                windowToMaxFollowers[key]=0
            windowToMaxFollowers[key] = max(windowToMaxFollowers[key],followerCount)            
            
        for period in range(start_ts,end_ts,window):
            key = getWindowNumber(start_ts,period,window)
            tweetCount = windowToTweets.get(key, 0)
            retweetCount = windowToRetweets.get(key,0)
            followerCount = windowToFollowerCount.get(key,0)
            maxFollowers = windowToMaxFollowers.get(key,0)

            h = getLocalHour(key)
            
            feature = [tweetCount, retweetCount, followerCount, maxFollowers, h]
            features.append(feature)
                
            nextKey = getWindowNumber(start_ts, period + window, window)
            labels.append(windowToTweets.get(nextKey,0))
                
    return features,labels

In [30]:
print("Building feature vectors for time period 1")

for tag in hash_tags:
    print("Started building feature vector for {}".format(tag))
    tp1_window_size = 3600 # 1 hour window size
    tp1_start_ts = tp1_window_size * math.floor(tagsToMinTs[tag]/(tp1_window_size*1.0))
    tp1_end_ts = 1422806400
    features,labels = getFeatures(tag,tp1_start_ts,tp1_end_ts,tp1_window_size)
    print(labels)
    print("No. of features: "+str(len(features)))
    save_object(features, "tp1_features_{}".format(tag))
    save_object(labels, "tp1_labels_{}".format(tag))
    print("Finished building feature vector for {}".format(tag))

Building feature vectors for time period 1
Started building feature vector for #gohawks
[29, 13, 14, 9, 24, 29, 43, 71, 127, 140, 123, 115, 125, 162, 131, 123, 85, 77, 67, 122, 126, 94, 80, 37, 10, 19, 12, 13, 17, 23, 40, 89, 155, 133, 75, 151, 164, 147, 151, 133, 101, 150, 129, 158, 91, 82, 82, 20, 56, 9, 4, 4, 22, 31, 71, 278, 436, 453, 327, 419, 450, 346, 380, 348, 241, 377, 385, 235, 220, 182, 151, 98, 53, 36, 16, 19, 18, 15, 39, 111, 182, 244, 164, 359, 354, 311, 278, 264, 173, 300, 254, 353, 272, 276, 309, 247, 275, 124, 67, 3, 75, 133, 334, 792, 1094, 1467, 1316, 3653, 6678, 3893, 4392, 18606, 3673, 1598, 1488, 1309, 964, 696, 570, 350, 226, 139, 75, 86, 93, 94, 180, 282, 362, 440, 245, 375, 362, 333, 251, 256, 132, 196, 204, 218, 195, 226, 133, 120, 79, 33, 26, 23, 27, 26, 67, 110, 150, 177, 103, 158, 5, 19, 144, 162, 103, 216, 282, 228, 258, 201, 157, 89, 44, 19, 22, 24, 24, 27, 49, 71, 67, 88, 82, 87, 112, 83, 85, 122, 161, 182, 185, 192, 150, 124, 77, 70, 31, 15, 14, 12, 12,

[8, 6, 12, 10, 11, 10, 19, 31, 55, 51, 63, 49, 48, 60, 46, 28, 22, 20, 16, 31, 28, 16, 15, 14, 8, 15, 7, 13, 23, 16, 25, 41, 53, 64, 33, 41, 57, 54, 52, 47, 30, 32, 25, 20, 17, 25, 32, 9, 12, 2, 1, 5, 13, 22, 26, 70, 82, 81, 71, 82, 106, 102, 67, 55, 44, 59, 34, 39, 31, 22, 20, 17, 7, 5, 6, 8, 12, 12, 23, 24, 45, 41, 39, 60, 52, 51, 45, 46, 22, 46, 58, 29, 43, 44, 37, 18, 23, 60, 6, 4, 18, 34, 89, 116, 185, 217, 180, 523, 589, 435, 581, 9327, 2416, 1091, 5214, 6969, 1469, 794, 512, 392, 307, 255, 285, 216, 849, 463, 447, 500, 374, 383, 220, 394, 365, 293, 266, 297, 824, 648, 7045, 697, 122, 127, 82, 80, 39, 42, 51, 44, 78, 61, 89, 156, 202, 220, 136, 198, 23, 24, 167, 202, 112, 179, 155, 115, 307, 230, 150, 91, 61, 43, 48, 63, 135, 103, 113, 158, 167, 171, 246, 210, 194, 199, 153, 254, 219, 234, 204, 192, 139, 105, 83, 66, 36, 31, 33, 37, 61, 74, 163, 190, 329, 336, 207, 148, 123, 28, 25, 26, 136, 389, 209, 223, 203, 152, 114, 62, 67, 61, 51, 86, 84, 118, 233, 362, 320, 227, 305, 290, 

In [31]:
import statsmodels.api as sm

for tag in hash_tags:
    print('\nLinear Regression Model for {}'.format(tag))
    X_orig = load_object("tp1_features_{}".format(tag))
    y = load_object("tp1_labels_{}".format(tag))
    
    #     https://becominghuman.ai/stats-models-vs-sklearn-for-linear-regression-f19df95ad99b
    X = sm.add_constant(X_orig)
    
    model = sm.OLS(y,X)
    results = model.fit()
    #     print(results.pvalues)
    print(results.summary())
    print('---'*20)
    print('\n\n')
    


Linear Regression Model for #gohawks
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.303
Model:                            OLS   Adj. R-squared:                  0.297
Method:                 Least Squares   F-statistic:                     47.32
Date:                Thu, 14 Mar 2019   Prob (F-statistic):           5.01e-33
Time:                        20:51:22   Log-Likelihood:                -3590.5
No. Observations:                 440   AIC:                             7191.
Df Residuals:                     435   BIC:                             7211.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1            

In [32]:
print("Building feature vectors for time period 2")

#tp2
for tag in hash_tags:
    print("Started building feature vector for {}".format(tag))
    tp2_window_size = 300 # 5 minute window size
    tp2_start_ts = 1422806400
    tp2_end_ts = 1422849600
    features,labels = getFeatures(tag,tp2_start_ts,tp2_end_ts,tp2_window_size)
    print(labels)
    print("No. of features: "+str(len(features)))
    save_object(features, "tp2_features_{}".format(tag))
    save_object(labels, "tp2_labels_{}".format(tag))
    print("Finished {}".format(tag))

Building feature vectors for time period 2
Started building feature vector for #gohawks
[27, 46, 35, 36, 32, 9, 1, 13, 5, 3, 9, 15, 16, 6, 11, 15, 45, 38, 59, 37, 36, 43, 35, 54, 232, 212, 201, 203, 220, 255, 236, 219, 228, 235, 258, 225, 164, 231, 234, 251, 234, 250, 249, 250, 228, 273, 213, 255, 263, 290, 279, 301, 271, 289, 282, 262, 255, 312, 290, 295, 301, 296, 287, 287, 314, 352, 311, 287, 220, 279, 273, 297, 342, 298, 315, 320, 312, 325, 427, 483, 448, 432, 491, 598, 827, 791, 1121, 826, 1071, 1330, 1329, 680, 474, 482, 1438, 893, 427, 406, 488, 406, 373, 346, 1757, 1020, 426, 337, 324, 1738, 1608, 731, 393, 230, 231, 383, 452, 818, 465, 825, 1848, 1316, 548, 511, 665, 413, 487, 457, 407, 349, 292, 739, 918, 1615, 247, 501, 478, 384, 340, 303, 236, 251, 214, 206, 104, 16]
No. of features: 144
Finished #gohawks
Started building feature vector for #gopatriots
[4, 7, 6, 5, 4, 0, 1, 1, 2, 0, 0, 1, 1, 0, 2, 3, 4, 1, 6, 4, 2, 7, 0, 9, 28, 23, 25, 26, 29, 24, 29, 20, 26, 28, 36, 31, 15

In [33]:
for tag in hash_tags:
    print('\nLinear Regression Model for {}'.format(tag))
    X_orig = load_object("tp2_features_{}".format(tag))
    y = load_object("tp2_labels_{}".format(tag))
    
    #     https://becominghuman.ai/stats-models-vs-sklearn-for-linear-regression-f19df95ad99b
    X = sm.add_constant(X_orig)
    
    model = sm.OLS(y,X)
    results = model.fit()
    #     print(results.pvalues)
    print(results.summary())
    print('---'*20)
    print('\n\n')
    


Linear Regression Model for #gohawks
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.474
Model:                            OLS   Adj. R-squared:                  0.458
Method:                 Least Squares   F-statistic:                     31.26
Date:                Thu, 14 Mar 2019   Prob (F-statistic):           1.45e-18
Time:                        20:56:41   Log-Likelihood:                -1012.4
No. Observations:                 144   AIC:                             2035.
Df Residuals:                     139   BIC:                             2050.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1            

In [34]:
print("Building feature vectors for time period 3")

#tp3
for tag in hash_tags:
    print("Started building feature vector for {}".format(tag))
    tp3_window_size = 3600 # 1 hour window size
    tp3_start_ts = 1422849600
    tp3_end_ts = tp3_window_size * math.ceil(tagsToMaxTs[tag]/(tp3_window_size*1.0))
    features,labels = getFeatures(tag,tp3_start_ts,tp3_end_ts,tp3_window_size)
    save_object(features, "tp3_features_{}".format(tag))
    save_object(labels, "tp3_labels_{}".format(tag))
    print("Finished {}".format(tag))

Building feature vectors for time period 3
Started building feature vector for #gohawks
Finished #gohawks
Started building feature vector for #gopatriots
Finished #gopatriots
Started building feature vector for #nfl
Finished #nfl
Started building feature vector for #patriots
Finished #patriots
Started building feature vector for #sb49
Finished #sb49
Started building feature vector for #superbowl
Finished #superbowl


In [35]:
for tag in hash_tags:
    print('\nLinear Regression Model for {}'.format(tag))
    X_orig = load_object("tp3_features_{}".format(tag))
    y = load_object("tp3_labels_{}".format(tag))
    
    #     https://becominghuman.ai/stats-models-vs-sklearn-for-linear-regression-f19df95ad99b
    X = sm.add_constant(X_orig)
    
    model = sm.OLS(y,X)
    results = model.fit()
    #     print(results.pvalues)
    print(results.summary())
    print('---'*20)
    print('\n\n')
    


Linear Regression Model for #gohawks
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.489
Model:                            OLS   Adj. R-squared:                  0.473
Method:                 Least Squares   F-statistic:                     29.23
Date:                Thu, 14 Mar 2019   Prob (F-statistic):           4.82e-17
Time:                        21:00:31   Log-Likelihood:                -758.84
No. Observations:                 127   AIC:                             1528.
Df Residuals:                     122   BIC:                             1542.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1            