### QUESTION 7: 
    
Also, aggregate the data of all hashtags, and train 3 models (for the intervals mentioned above) to predict the number of tweets in the next hour on the aggregated data.
Perform the same evaluations on your combined model and compare with models you trained for individual hashtags.

In [1]:
hash_tags = ['#gohawks','#gopatriots','#nfl','#patriots','#sb49','#superbowl']

In [2]:
import pickle

def save_object(data, fileName):
    with open('pynb_data/'+fileName + ".pickle", 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        
def load_object(fileName):
    try:
        with open('pynb_data/'+fileName + ".pickle", 'rb') as f:
            data = pickle.load(f)
            return data
    except IOError:
        print("Could not read file: " + fileName)

In [3]:
import json

def getMinAndMaxTs(tag):
    filename = 'data/tweets_'+tag+'.txt'
    max_ts = 0
    min_ts = 1552522378
    with open(filename) as f:
        for line in f:
            json_object = json.loads(line)
            timestamp = json_object['citation_date']
            if(timestamp < min_ts):                
                min_ts = timestamp
            
            if(timestamp > max_ts):
                max_ts = timestamp
                
    return [min_ts,max_ts]

tagsToMinTs = {}
tagsToMaxTs = {}
for tag in hash_tags:
    ts_list = getMinAndMaxTs(tag)
    tagsToMinTs[tag] = (ts_list[0])
    tagsToMaxTs[tag] = (ts_list[1])    

In [4]:
import math
import datetime
import pytz


def getLocalHour(timestamp):
    d = datetime.datetime.fromtimestamp(timestamp)
    pst = pytz.timezone('America/Los_Angeles')
    d = pst.localize(d)
    return d.hour

def getWindowNumber(start_ts, curr_ts, window):
    elapsed = (curr_ts - start_ts)/(window*1.0)
    windowNum = math.ceil(elapsed)
    return windowNum    

def getFeatures(start_ts,end_ts,window):
    windowToTweets = {}
    windowToRetweets = {}
    windowToFollowerCount = {}
    windowToMaxFollowers = {}
    features = []
    labels = []
    
    for tag in hash_tags:
        filename = 'data/tweets_'+tag+'.txt'
        with open(filename) as f:
            for line in f:
                json_object = json.loads(line)
                timestamp = json_object['citation_date']
            
                if timestamp < start_ts or timestamp > end_ts:                            
                    continue
                
                key = getWindowNumber(start_ts,timestamp,window)
    #             print(key)
                if key not in windowToTweets.keys():
                    windowToTweets[key]=0
                windowToTweets[key]+=1
            
                retweetCount = json_object['metrics']['citations']['total']        
            
                if key not in windowToRetweets.keys():
                    windowToRetweets[key]=0
                windowToRetweets[key]+=retweetCount
        
                followerCount = json_object['author']['followers']
                if key not in windowToFollowerCount.keys():
                    windowToFollowerCount[key]=0
                windowToFollowerCount[key]+=followerCount
        
                if key not in windowToMaxFollowers.keys():
                    windowToMaxFollowers[key]=0
                windowToMaxFollowers[key] = max(windowToMaxFollowers[key],followerCount)            
            
    for period in range(start_ts,end_ts,window):
        key = getWindowNumber(start_ts,period,window)
        tweetCount = windowToTweets.get(key, 0)
        retweetCount = windowToRetweets.get(key,0)
        followerCount = windowToFollowerCount.get(key,0)
        maxFollowers = windowToMaxFollowers.get(key,0)

        h = getLocalHour(key)
            
        feature = [tweetCount, retweetCount, followerCount, maxFollowers, h]
        features.append(feature)
                
        nextKey = getWindowNumber(start_ts, period + window, window)
        labels.append(windowToTweets.get(nextKey,0))
                
    return features,labels

In [7]:
import statsmodels.api as sm

# print("Building feature vectors for time period 1")

#tp1
tp1_window_size = 3600 # 1 hour window size
tp1_start_ts = tp1_window_size * math.floor(tagsToMinTs[tag]/(tp1_window_size*1.0))
tp1_end_ts = 1422806400
features,labels = getFeatures(tp1_start_ts,tp1_end_ts,tp1_window_size)
print(len(features))
print(labels)
save_object(features, "q7_tp1_features")
save_object(labels, "q7_tp1_labels")

X_orig = features
y = labels
    
    #     https://becominghuman.ai/stats-models-vs-sklearn-for-linear-regression-f19df95ad99b
X = sm.add_constant(X_orig)
    
model = sm.OLS(y,X)
results = model.fit()
print(results.summary())
print('---'*20)
print('\n\n')

440
[111, 89, 110, 100, 137, 169, 215, 353, 569, 533, 530, 544, 525, 628, 611, 675, 260, 256, 233, 342, 402, 334, 258, 119, 88, 275, 155, 173, 336, 160, 291, 479, 632, 563, 304, 528, 651, 534, 626, 614, 376, 398, 496, 387, 243, 258, 203, 65, 145, 34, 20, 42, 180, 115, 185, 687, 885, 954, 704, 981, 1032, 825, 884, 799, 474, 799, 724, 473, 459, 337, 334, 236, 125, 103, 82, 133, 218, 93, 192, 309, 500, 612, 430, 710, 783, 687, 686, 670, 428, 712, 697, 614, 619, 542, 582, 417, 434, 347, 139, 21, 312, 544, 1141, 1839, 2459, 3087, 2493, 6380, 10260, 6702, 7271, 39421, 14572, 8757, 22584, 18564, 4389, 2640, 1997, 1413, 914, 709, 721, 666, 1510, 1123, 1475, 2183, 1964, 1975, 1158, 1718, 1546, 1413, 1080, 1221, 1293, 1270, 7962, 1361, 654, 658, 443, 409, 287, 204, 219, 221, 336, 282, 434, 690, 840, 957, 511, 775, 46, 85, 705, 786, 524, 817, 811, 670, 3310, 2254, 995, 609, 329, 261, 343, 667, 975, 739, 838, 928, 870, 858, 844, 843, 842, 767, 783, 1087, 1101, 1263, 931, 966, 814, 793, 416, 409, 2

In [9]:
print("Building feature vectors for time period 2")

#tp2
tp2_window_size = 300 # 5 minute window size
tp2_start_ts = 1422806400
tp2_end_ts = 1422849600
features,labels = getFeatures(tp2_start_ts,tp2_end_ts,tp2_window_size)
print(len(features))
print(labels)
save_object(features, "q7_tp2_features")
save_object(labels, "q7_tp2_labels")
X_orig = features
y = labels
    
    #     https://becominghuman.ai/stats-models-vs-sklearn-for-linear-regression-f19df95ad99b
X = sm.add_constant(X_orig)
    
model = sm.OLS(y,X)
results = model.fit()
print(results.summary())
print('---'*20)
print('\n\n')

Building feature vectors for time period 2
144
[1300, 1343, 1226, 1302, 1224, 319, 245, 231, 230, 237, 284, 266, 470, 281, 332, 400, 1401, 1493, 1709, 1494, 1593, 1463, 1107, 1716, 1861, 1611, 1544, 1560, 1770, 1676, 1820, 1569, 1697, 8499, 9648, 9991, 5736, 9791, 14318, 15471, 15151, 15241, 13539, 13771, 13805, 15622, 14181, 14508, 13067, 13880, 12876, 12221, 11966, 12615, 11745, 11469, 11083, 11119, 10466, 11276, 10371, 11089, 10611, 10115, 9768, 9986, 9730, 8193, 7300, 7410, 11060, 9837, 9391, 9201, 9217, 8763, 9215, 8532, 9360, 10176, 10267, 10557, 10779, 12747, 20744, 19360, 27623, 25137, 34867, 32175, 34416, 23369, 22546, 20954, 27243, 25419, 23214, 24047, 36027, 25872, 23175, 26587, 30303, 28063, 22709, 26830, 23483, 27450, 34990, 25681, 35025, 45904, 45773, 37186, 23920, 23983, 19550, 17554, 24798, 22708, 13866, 14376, 12927, 12568, 14721, 19292, 17790, 14340, 12789, 28000, 16210, 21143, 35905, 41702, 24998, 17300, 13897, 12540, 9347, 8027, 6915, 5890, 3177, 610]
              

In [10]:
print("Building feature vectors for time period 3")

#tp3
tp3_window_size = 3600 # 1 hour window size
tp3_start_ts = 1422849600
tp3_end_ts = tp3_window_size * math.ceil(tagsToMaxTs[tag]/(tp3_window_size*1.0))
features,labels = getFeatures(tp3_start_ts,tp3_end_ts,tp3_window_size)
print(len(features))
print(labels)
save_object(features, "q7_tp3_features")
save_object(labels, "q7_tp3_labels")

X_orig = features
y = labels
    
    #     https://becominghuman.ai/stats-models-vs-sklearn-for-linear-regression-f19df95ad99b
X = sm.add_constant(X_orig)
    
model = sm.OLS(y,X)
results = model.fit()
print(results.summary())
print('---'*20)
print('\n\n')

Building feature vectors for time period 3
135
[16271, 6497, 9039, 5013, 3293, 2602, 2544, 3310, 4070, 4689, 5023, 5052, 4529, 6075, 5143, 7565, 7467, 6555, 6482, 5822, 5175, 4380, 4126, 3566, 2994, 2276, 1863, 1404, 1347, 1225, 1157, 1310, 1397, 1741, 2162, 2377, 2657, 3563, 2801, 2517, 2663, 2253, 1784, 1926, 1609, 1617, 1116, 526, 1268, 1182, 973, 549, 646, 450, 815, 634, 780, 884, 1137, 1407, 1696, 1884, 1489, 1618, 1075, 1305, 1209, 1299, 1167, 909, 1037, 897, 812, 783, 681, 539, 449, 352, 405, 385, 540, 683, 776, 872, 1045, 992, 1080, 1055, 1120, 1154, 983, 920, 906, 869, 865, 537, 679, 727, 524, 256, 406, 416, 441, 429, 514, 380, 915, 1091, 1213, 1150, 1242, 1421, 1558, 1160, 1253, 942, 936, 1220, 983, 852, 856, 882, 503, 411, 30, 28, 26, 13, 17, 36, 36, 85, 59, 49, 52]
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.506
Model:                            OLS   Adj. R-squared