### QUESTION 13: 
Using grid search, find the best architecture (for scaled data) for each period (with corresponding window length) described in Question 6.

In [1]:
# Hashtags to process; each one names a tweet file 'data/tweets_<tag>.txt'
# that the cells below read.
hash_tags = [
    '#gohawks',
    '#gopatriots',
    '#nfl',
    '#patriots',
    '#sb49',
    '#superbowl',
]

In [None]:
import pickle

def save_object(data, fileName):
    """Pickle ``data`` to ``pynb_data/<fileName>.pickle``.

    The target directory is created when it is missing, so the first save on
    a fresh checkout does not fail with ``FileNotFoundError``.

    Args:
        data: Any picklable object.
        fileName: Base name of the output file (no directory, no extension).
    """
    import os  # local import so the cell does not depend on another cell

    path = 'pynb_data/' + fileName + ".pickle"
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        
def load_object(fileName):
    """Load and return the object pickled at ``pynb_data/<fileName>.pickle``.

    If opening or reading the file raises ``IOError``, a message is printed
    and ``None`` is returned instead of propagating the error.

    NOTE: unpickling executes arbitrary code; only load files written by
    this notebook.
    """
    path = 'pynb_data/' + fileName + ".pickle"
    try:
        with open(path, 'rb') as handle:
            return pickle.load(handle)
    except IOError:
        print("Could not read file: " + fileName)
    return None

In [None]:
import json

def getMinAndMaxTs(tag):
    """Scan ``data/tweets_<tag>.txt`` and return ``[min_ts, max_ts]``.

    Each line of the file is parsed as a JSON object whose ``citation_date``
    field is compared against the running minimum and maximum.

    The minimum starts at the sentinel 1552522378 and the maximum at 0, so an
    empty file yields ``[1552522378, 0]`` and timestamps at or above the
    sentinel never lower the minimum.
    """
    path = 'data/tweets_' + tag + '.txt'
    earliest = 1552522378
    latest = 0
    with open(path) as tweet_file:
        for raw_line in tweet_file:
            posted_at = json.loads(raw_line)['citation_date']
            earliest = min(earliest, posted_at)
            latest = max(latest, posted_at)
    return [earliest, latest]

# Earliest/latest tweet timestamp per hashtag, plus the overall extremes
# across every hashtag file.
tagsToMinTs = {}
tagsToMaxTs = {}
globalMinTs = 1552522378  # starting upper bound, same sentinel as getMinAndMaxTs
globalMaxTs = 0
for tag in hash_tags:
    ts_list = getMinAndMaxTs(tag)
    tag_min, tag_max = ts_list[0], ts_list[1]

    tagsToMinTs[tag] = tag_min
    tagsToMaxTs[tag] = tag_max

    globalMinTs = min(globalMinTs, tag_min)
    globalMaxTs = max(globalMaxTs, tag_max)

In [None]:
import math
import datetime
import pytz


def getLocalHour(timestamp):
    """Return the hour of day (0-23) in US Pacific time for a Unix timestamp.

    Args:
        timestamp: Seconds since the Unix epoch.

    Returns:
        The wall-clock hour in the 'America/Los_Angeles' zone.
    """
    pst = pytz.timezone('America/Los_Angeles')
    # Convert the epoch instant straight into Pacific time.  The previous
    # implementation built a naive datetime in the *machine's* local zone and
    # then merely labelled it as Pacific, which gave the wrong hour on any
    # host whose system timezone is not Pacific.
    return datetime.datetime.fromtimestamp(timestamp, tz=pst).hour

def getWindowNumber(start_ts, curr_ts, window):
    """Return the index of the window that ``curr_ts`` falls into.

    The index is the elapsed time since ``start_ts`` divided by ``window``,
    rounded up.  ``start_ts`` itself therefore maps to 0, and a timestamp
    lying exactly on a window boundary maps to the window ending there.
    """
    return math.ceil((curr_ts - start_ts) / float(window))

def getFeatures(start_ts,end_ts,window):
    """Build per-window feature rows and next-window tweet-count labels.

    Every hashtag file in ``hash_tags`` is read; tweets whose
    ``citation_date`` lies outside ``[start_ts, end_ts]`` are skipped.  The
    remaining tweets are bucketed by ``getWindowNumber`` and aggregated over
    all hashtags together.

    Returns:
        ``(features, labels)`` with one entry per step of
        ``range(start_ts, end_ts, window)``.  Each feature row is
        ``[tweet count, total retweets, total followers, max followers,
        hour of day]`` for that step's window index; the matching label is
        the tweet count of the following window index.  Windows without any
        tweets contribute zeros.
    """
    tweets_per_window = {}
    retweets_per_window = {}
    followers_per_window = {}
    max_followers_per_window = {}

    # Pass 1: aggregate the raw tweets into per-window totals.
    for tag in hash_tags:
        with open('data/tweets_' + tag + '.txt') as tweet_file:
            for raw_line in tweet_file:
                tweet = json.loads(raw_line)
                posted_at = tweet['citation_date']
                if posted_at < start_ts or posted_at > end_ts:
                    continue

                slot = getWindowNumber(start_ts, posted_at, window)
                tweets_per_window[slot] = tweets_per_window.get(slot, 0) + 1

                citations = tweet['metrics']['citations']['total']
                retweets_per_window[slot] = (
                    retweets_per_window.get(slot, 0) + citations
                )

                followers = tweet['author']['followers']
                followers_per_window[slot] = (
                    followers_per_window.get(slot, 0) + followers
                )
                max_followers_per_window[slot] = max(
                    max_followers_per_window.get(slot, 0), followers
                )

    # Pass 2: emit one feature row and one label per window step.
    features = []
    labels = []
    for period in range(start_ts, end_ts, window):
        slot = getWindowNumber(start_ts, period, window)
        row = [
            tweets_per_window.get(slot, 0),
            retweets_per_window.get(slot, 0),
            followers_per_window.get(slot, 0),
            max_followers_per_window.get(slot, 0),
            getLocalHour(period),
        ]
        features.append(row)

        following_slot = getWindowNumber(start_ts, period + window, window)
        labels.append(tweets_per_window.get(following_slot, 0))

    return features, labels

In [None]:
# https://www.kaggle.com/grfiv4/displaying-the-results-of-a-grid-search
import numpy  as np
import pandas as pd

from sklearn.model_selection import GridSearchCV

def GridSearch_table_plot(grid_clf, param_name,
                          num_results=15,
                          negative=True,
                          graph=True,
                          display_all_params=True):
    """Print the best grid-search result and display the top-ranked rows.

    Args:
        grid_clf: A fitted grid-search object exposing ``best_estimator_``,
            ``best_params_``, ``best_score_``, ``best_index_`` and
            ``cv_results_``.
        param_name: Name of the searched parameter; ``cv_results_`` must
            contain the column ``'param_' + param_name``.
        num_results: Number of top-ranked rows to display.
        negative: When true, the best score is sign-flipped before printing.
        graph: Accepted but not used by this version of the function.
        display_all_params: When true, pretty-print every parameter of the
            best estimator.
    """
    from matplotlib      import pyplot as plt
    from IPython.display import display
    import pandas as pd

    best_estimator = grid_clf.best_estimator_
    best_params = grid_clf.best_params_
    best_score = -grid_clf.best_score_ if negative else grid_clf.best_score_
    results = grid_clf.cv_results_
    best_score_std = results['std_test_score'][grid_clf.best_index_]

    print("best parameters: {}".format(best_params))
    print("best score:      {:0.5f} (+/-{:0.5f})".format(best_score, best_score_std))
    if display_all_params:
        import pprint
        pprint.pprint(best_estimator.get_params())

    # Rank every candidate, best first.
    ranked = pd.DataFrame(results).sort_values(by='rank_test_score')

    # Summary of the top-ranked row.  These values are not used further, but
    # the lookups are kept: an unknown param_name still raises KeyError here.
    top_row = ranked.iloc[0, :]
    top_mean = top_row['mean_test_score']
    if negative:
        top_mean = -top_mean
    top_std = top_row['std_test_score']
    top_param = top_row['param_' + param_name]

    # Show the 'num_results' best candidates.
    display(ranked.head(num_results))

### Timeperiod 1

In [None]:

#tp1
# Time period 1: from the first tweet up to 1422806400 (2015-02-01 16:00 UTC),
# using 1-hour windows.
tp1_window_size = 3600 # 1 hour window size
# Round the earliest timestamp down to a multiple of the window size.
tp1_start_ts = tp1_window_size * math.floor(globalMinTs/(tp1_window_size*1.0))
tp1_end_ts = 1422806400

features,labels = getFeatures(tp1_start_ts,tp1_end_ts,tp1_window_size)
save_object(features,"q13_tp1_features")
# Fix: the labels file previously received a second copy of `features`.
save_object(labels,"q13_tp1_labels")
# features = load_object("q7_tp1_features")
# labels = load_object("q7_tp1_labels")
print("Finished Building feature vectors for time period 1")

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# Standardise the features.  The scaler is fitted on the full feature matrix
# before the cross-validation split.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Candidate hidden-layer architectures to compare.
hidden_layer_sizes = [
    (100,),
    (100, 100),
    (100, 100, 100),
    (100, 100, 100, 100, 100),
    (200,),
    (200, 200),
    (200, 200, 200),
    (500, 250, 125, 63),
    (800, 700, 600, 500, 400, 300, 200, 100),
    (1000, 100, 10),
]

param_grid = [{'hidden_layer_sizes': hidden_layer_sizes}]

# Base estimator; the grid overrides hidden_layer_sizes for every candidate.
nn = MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=500)

# 5-fold shuffled cross-validation, scored by negated mean squared error.
gs = GridSearchCV(
    estimator=nn,
    param_grid=param_grid,
    cv=KFold(5, shuffle=True),
    scoring='neg_mean_squared_error',
)

gs = gs.fit(scaled_features, labels)
GridSearch_table_plot(gs, "hidden_layer_sizes", negative=False)

save_object(gs, "q13_tp1_grid")


In [None]:
# GridSearch_table_plot(gs, "hidden_layer_sizes", negative=False)


### Timeperiod 2:

In [None]:
#tp2
# Time period 2: 1422806400 .. 1422849600 (a 12-hour span) in 5-minute windows.
tp2_window_size = 300  # 5 minute window size
tp2_start_ts = 1422806400
tp2_end_ts = 1422849600

features, labels = getFeatures(tp2_start_ts, tp2_end_ts, tp2_window_size)

# Cache the feature rows and their labels for later reuse.
save_object(features, "q13_tp2_features")
save_object(labels, "q13_tp2_labels")
print("Finished Building feature vectors for time period 2")

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# Standardise the features.  The scaler is fitted on the full feature matrix
# before the cross-validation split.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Candidate hidden-layer architectures to compare.
hidden_layer_sizes = [
    (100,),
    (100, 100),
    (100, 100, 100),
    (100, 100, 100, 100, 100),
    (200,),
    (200, 200),
    (200, 200, 200),
    (500, 250, 125, 63),
    (800, 700, 600, 500, 400, 300, 200, 100),
    (1000, 100, 10),
]

param_grid = [{'hidden_layer_sizes': hidden_layer_sizes}]

# Base estimator; the grid overrides hidden_layer_sizes for every candidate.
nn = MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=500)

# 5-fold shuffled cross-validation, scored by negated mean squared error.
gs = GridSearchCV(
    estimator=nn,
    param_grid=param_grid,
    cv=KFold(5, shuffle=True),
    scoring='neg_mean_squared_error',
)

gs = gs.fit(scaled_features, labels)

GridSearch_table_plot(gs, "hidden_layer_sizes", negative=False)

save_object(gs, "q13_tp2_grid")

In [None]:
# Re-display the most recent grid-search results held in `gs`.
# negative=False prints the best score with the sign the scorer reported.
GridSearch_table_plot(gs, "hidden_layer_sizes", negative=False)


In [None]:
#tp3
# Time period 3: from 1422849600 to the last tweet, using 1-hour windows.
tp3_window_size = 3600  # 1 hour window size
tp3_start_ts = 1422849600
# Round the latest timestamp up to a multiple of the window size.
tp3_end_ts = tp3_window_size * math.ceil(globalMaxTs / (tp3_window_size * 1.0))

features, labels = getFeatures(tp3_start_ts, tp3_end_ts, tp3_window_size)

# Cache the feature rows and their labels for later reuse.
save_object(features, "q13_tp3_features")
save_object(labels, "q13_tp3_labels")

print("Finished Building feature vectors for time period 3")

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# Standardise the features.  The scaler is fitted on the full feature matrix
# before the cross-validation split.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Candidate hidden-layer architectures to compare.
hidden_layer_sizes = [
    (100,),
    (100, 100),
    (100, 100, 100),
    (100, 100, 100, 100, 100),
    (200,),
    (200, 200),
    (200, 200, 200),
    (500, 250, 125, 63),
    (800, 700, 600, 500, 400, 300, 200, 100),
    (1000, 100, 10),
]

param_grid = [{'hidden_layer_sizes': hidden_layer_sizes}]

# Base estimator; the grid overrides hidden_layer_sizes for every candidate.
nn = MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=500)

# 5-fold shuffled cross-validation, scored by negated mean squared error.
gs = GridSearchCV(
    estimator=nn,
    param_grid=param_grid,
    cv=KFold(5, shuffle=True),
    scoring='neg_mean_squared_error',
)

gs = gs.fit(scaled_features, labels)

GridSearch_table_plot(gs, "hidden_layer_sizes", negative=False)

save_object(gs, "q13_tp3_grid")

In [None]:
# Re-display the most recent grid-search results held in `gs`.
# negative=False prints the best score with the sign the scorer reported.
GridSearch_table_plot(gs, "hidden_layer_sizes", negative=False)