### QUESTION 10: 

For each time period described in Question 6, perform the same grid search above for GradientBoostingRegressor (with corresponding time window length). Does the cross-validation test error change? Are the best parameter set you find in each period agree with those you found above?

In [1]:
hash_tags = ['#gohawks','#gopatriots','#nfl','#patriots','#sb49','#superbowl']

In [2]:
import pickle

def save_object(data, fileName):
    with open('pynb_data/'+fileName + ".pickle", 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        
def load_object(fileName):
    try:
        with open('pynb_data/'+fileName + ".pickle", 'rb') as f:
            data = pickle.load(f)
            return data
    except IOError:
        print("Could not read file: " + fileName)

In [3]:
# https://www.kaggle.com/grfiv4/displaying-the-results-of-a-grid-search
import numpy  as np
import pandas as pd

from sklearn.model_selection import GridSearchCV

def GridSearch_table_plot(grid_clf,
                          num_results=15,
                          negative=True,
                          graph=True,
                          display_all_params=True):

    from matplotlib      import pyplot as plt
    from IPython.display import display
    import pandas as pd

    clf = grid_clf.best_estimator_
    clf_params = grid_clf.best_params_
    if negative:
        clf_score = -grid_clf.best_score_
    else:
        clf_score = grid_clf.best_score_
    clf_stdev = grid_clf.cv_results_['std_test_score'][grid_clf.best_index_]
    cv_results = grid_clf.cv_results_

    print("best parameters: {}".format(clf_params))
    print("best score:      {:0.5f} (+/-{:0.5f})".format(clf_score, clf_stdev))
    if display_all_params:
        import pprint
        pprint.pprint(clf.get_params())

    # pick out the best results
    # =========================
    scores_df = pd.DataFrame(cv_results).sort_values(by='rank_test_score')

    best_row = scores_df.iloc[0, :]
    if negative:
        best_mean = -best_row['mean_test_score']
    else:
        best_mean = best_row['mean_test_score']
    best_stdev = best_row['std_test_score']

    # display the top 'num_results' results
    # =====================================
    display(pd.DataFrame(cv_results) \
            .sort_values(by='rank_test_score').head(num_results))

In [4]:
import json

def getMinAndMaxTs(tag):
    filename = 'data/tweets_'+tag+'.txt'
    max_ts = 0
    min_ts = 1552522378
    with open(filename) as f:
        for line in f:
            json_object = json.loads(line)
            timestamp = json_object['citation_date']
            if(timestamp < min_ts):                
                min_ts = timestamp
            
            if(timestamp > max_ts):
                max_ts = timestamp
                
    return [min_ts,max_ts]

tagsToMinTs = {}
tagsToMaxTs = {}
for tag in hash_tags:
    ts_list = getMinAndMaxTs(tag)
    tagsToMinTs[tag] = (ts_list[0])
    tagsToMaxTs[tag] = (ts_list[1])    

In [5]:
import math
import datetime
import pytz


def getLocalHour(timestamp):
    d = datetime.datetime.fromtimestamp(timestamp)
    pst = pytz.timezone('America/Los_Angeles')
    d = pst.localize(d)
    return d.hour

def getWindowNumber(start_ts, curr_ts, window):
    elapsed = (curr_ts - start_ts)/(window*1.0)
    windowNum = math.ceil(elapsed)
    return windowNum    

def getFeatures(start_ts,end_ts,window):
    windowToTweets = {}
    windowToRetweets = {}
    windowToFollowerCount = {}
    windowToMaxFollowers = {}
    features = []
    labels = []
    
    for tag in hash_tags:
        filename = 'data/tweets_'+tag+'.txt'
        with open(filename) as f:
            for line in f:
                json_object = json.loads(line)
                timestamp = json_object['citation_date']
            
                if timestamp < start_ts or timestamp > end_ts:                            
                    continue
                
                key = getWindowNumber(start_ts,timestamp,window)
    #             print(key)
                if key not in windowToTweets.keys():
                    windowToTweets[key]=0
                windowToTweets[key]+=1
            
                retweetCount = json_object['metrics']['citations']['total']        
            
                if key not in windowToRetweets.keys():
                    windowToRetweets[key]=0
                windowToRetweets[key]+=retweetCount
        
                followerCount = json_object['author']['followers']
                if key not in windowToFollowerCount.keys():
                    windowToFollowerCount[key]=0
                windowToFollowerCount[key]+=followerCount
        
                if key not in windowToMaxFollowers.keys():
                    windowToMaxFollowers[key]=0
                windowToMaxFollowers[key] = max(windowToMaxFollowers[key],followerCount)            
            
    for period in range(start_ts,end_ts,window):
        key = getWindowNumber(start_ts,period,window)
        tweetCount = windowToTweets.get(key, 0)
        retweetCount = windowToRetweets.get(key,0)
        followerCount = windowToFollowerCount.get(key,0)
        maxFollowers = windowToMaxFollowers.get(key,0)

        h = getLocalHour(period)
            
        feature = [tweetCount, retweetCount, followerCount, maxFollowers, h]
        features.append(feature)
                
        nextKey = getWindowNumber(start_ts, period + window, window)
        labels.append(windowToTweets.get(nextKey,0))
                
    return features,labels

In [8]:
# http://aplunket.com/random-forest-regressor/
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

min_ts = min(list(tagsToMinTs.values()))

#tp1
tp1_window_size = 3600 # 1 hour window size
tp1_start_ts = tp1_window_size * math.floor(min_ts/(tp1_window_size*1.0))
tp1_end_ts = 1422806400
features,labels = getFeatures(tp1_start_ts,tp1_end_ts,tp1_window_size)
save_object(features, "q10_tp1_features")
save_object(labels, "q10_tp1_labels")


param_grid = {
  'max_depth': [10, 20, 40, 60, 80, 100, 200, None],
  'max_features': ['auto', 'sqrt'],
  'min_samples_leaf': [1, 2, 4],
  'min_samples_split': [2, 5, 10],
  'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

model = GradientBoostingRegressor(random_state=0)
tp1grid = GridSearchCV(estimator=model, param_grid=param_grid, cv = KFold(5, shuffle=True), scoring='neg_mean_squared_error')
tp1grid.fit(features, labels)
save_object(tp1grid,"q10_tp1_grid")

print(tp1grid.best_score_)
print(tp1grid.best_params_)
GridSearch_table_plot(tp1grid, negative=False)

best parameters: {'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
best score:      -3867938.33908 (+/-4734295.30616)
{'alpha': 0.9,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 20,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_iter_no_change': None,
 'presort': 'auto',
 'random_state': 0,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
260,0.159741,0.006179,0.001894,0.000349,20.0,auto,4,10,200,"{'max_depth': 20, 'max_features': 'auto', 'min...",...,-3867938.0,4734295.0,1,-12016.447178,-93888.958363,-128475.223342,-123103.341607,-120669.156117,-95630.625321,43485.192822
80,0.093424,0.001688,0.001016,6.6e-05,10.0,auto,4,10,200,"{'max_depth': 10, 'max_features': 'auto', 'min...",...,-3873890.0,4731017.0,2,-7047.165787,-102295.987017,-122162.170474,-108621.887923,-132364.919963,-94498.426233,44960.318439
980,0.186834,0.015937,0.002043,0.000219,100.0,auto,4,10,200,"{'max_depth': 100, 'max_features': 'auto', 'mi...",...,-3879074.0,4752090.0,3,-12505.678127,-91336.198942,-124652.782795,-119788.338596,-118694.038286,-93395.407349,42097.55266
800,0.211172,0.00521,0.002225,0.000154,80.0,auto,4,10,200,"{'max_depth': 80, 'max_features': 'auto', 'min...",...,-3879074.0,4752090.0,3,-12505.678127,-91336.198942,-124652.782795,-119788.338596,-118694.038286,-93395.407349,42097.55266
1160,0.150805,0.004149,0.001939,0.000156,200.0,auto,4,10,200,"{'max_depth': 200, 'max_features': 'auto', 'mi...",...,-3879074.0,4752090.0,3,-12505.678127,-91336.198942,-124652.782795,-119788.338596,-118694.038286,-93395.407349,42097.55266
620,0.208082,0.01906,0.002276,0.000573,60.0,auto,4,10,200,"{'max_depth': 60, 'max_features': 'auto', 'min...",...,-3879074.0,4752090.0,3,-12505.678127,-91336.198942,-124652.782795,-119788.338596,-118694.038286,-93395.407349,42097.55266
1340,0.150916,0.004295,0.001858,2.7e-05,,auto,4,10,200,"{'max_depth': None, 'max_features': 'auto', 'm...",...,-3879074.0,4752090.0,3,-12505.678127,-91336.198942,-124652.782795,-119788.338596,-118694.038286,-93395.407349,42097.55266
440,0.199614,0.035945,0.002037,0.000284,40.0,auto,4,10,200,"{'max_depth': 40, 'max_features': 'auto', 'min...",...,-3881329.0,4751357.0,8,-12505.678127,-91336.198942,-124652.782795,-124956.640096,-118694.038286,-94429.067649,42790.657916
790,0.29583,0.062642,0.002632,0.000112,80.0,auto,4,5,200,"{'max_depth': 80, 'max_features': 'auto', 'min...",...,-3903497.0,4572764.0,9,-6728.920111,-21890.605232,-36626.307098,-102375.714199,-59994.553944,-45523.220117,33419.392115
1140,0.152377,0.004004,0.001931,3.9e-05,200.0,auto,4,2,200,"{'max_depth': 200, 'max_features': 'auto', 'mi...",...,-3903497.0,4572764.0,9,-6728.920111,-21890.605232,-36626.307098,-102375.714199,-59994.553944,-45523.220117,33419.392115


In [9]:
# http://aplunket.com/random-forest-regressor/
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

#tp2
tp2_window_size = 300 # 5 minute window size
tp2_start_ts = 1422806400
tp2_end_ts = 1422849600
features,labels = getFeatures(tp2_start_ts,tp2_end_ts,tp2_window_size)
save_object(features, "q10_tp2_features")
save_object(labels, "q10_tp2_labels")    


param_grid = {
  'max_depth': [10, 20, 40, 60, 80, 100, 200, None],
  'max_features': ['auto', 'sqrt'],
  'min_samples_leaf': [1, 2, 4],
  'min_samples_split': [2, 5, 10],
  'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

model = GradientBoostingRegressor(random_state=0)
tp2grid = GridSearchCV(estimator=model, param_grid=param_grid, cv = KFold(5, shuffle=True), scoring='neg_mean_squared_error')
tp2grid.fit(features, labels)
save_object(tp2grid,"q10_tp2_grid")
print(tp2grid.best_score_)
print(tp2grid.best_params_)
GridSearch_table_plot(tp2grid, negative=False)

-25204887.61461869
{'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
best parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
best score:      -25204887.61462 (+/-10271181.37371)
{'alpha': 0.9,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 20,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_iter_no_change': None,
 'presort': 'auto',
 'random_state': 0,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
280,0.043533,0.002382,0.000818,4.5e-05,20.0,sqrt,1,5,200,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-25204890.0,10271180.0,1,-0.001082882,-4.247733e-05,-3.666804e-05,-0.001208076,-5.910688e-05,-0.000485842,0.0005400945
288,0.218177,0.024602,0.001316,7.4e-05,20.0,sqrt,1,5,1800,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-25204910.0,10271220.0,2,-9.943964e-08,-9.493291e-08,-9.925159e-08,-9.878407e-08,-9.845827e-08,-9.81733e-08,1.656545e-09
287,0.194438,0.017846,0.001309,0.000135,20.0,sqrt,1,5,1600,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-25204910.0,10271220.0,2,-9.943964e-08,-9.493291e-08,-9.925159e-08,-9.878407e-08,-9.845827e-08,-9.81733e-08,1.656545e-09
286,0.182101,0.014147,0.001321,0.000136,20.0,sqrt,1,5,1400,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-25204910.0,10271220.0,2,-9.943964e-08,-9.493291e-08,-9.925159e-08,-9.878407e-08,-9.845827e-08,-9.81733e-08,1.656545e-09
284,0.139,0.003523,0.0012,4.6e-05,20.0,sqrt,1,5,1000,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-25204910.0,10271220.0,2,-9.943964e-08,-9.493291e-08,-9.925159e-08,-9.878407e-08,-9.845827e-08,-9.81733e-08,1.656545e-09
283,0.10937,0.001825,0.001102,3.4e-05,20.0,sqrt,1,5,800,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-25204910.0,10271220.0,2,-9.943964e-08,-9.493291e-08,-9.925159e-08,-9.878407e-08,-9.845827e-08,-9.81733e-08,1.656545e-09
282,0.088429,0.002752,0.001064,2.8e-05,20.0,sqrt,1,5,600,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-25204910.0,10271220.0,2,-9.943964e-08,-9.493291e-08,-9.925159e-08,-9.878407e-08,-9.845827e-08,-9.81733e-08,1.656545e-09
281,0.068588,0.001243,0.000986,4.6e-05,20.0,sqrt,1,5,400,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-25204910.0,10271220.0,2,-9.943964e-08,-9.493291e-08,-9.925159e-08,-9.878407e-08,-9.845827e-08,-9.81733e-08,1.656545e-09
289,0.257103,0.026273,0.001516,0.00014,20.0,sqrt,1,5,2000,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-25204910.0,10271220.0,2,-9.943964e-08,-9.493291e-08,-9.925159e-08,-9.878407e-08,-9.845827e-08,-9.81733e-08,1.656545e-09
285,0.173859,0.018815,0.001289,0.000133,20.0,sqrt,1,5,1200,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-25204910.0,10271220.0,2,-9.943964e-08,-9.493291e-08,-9.925159e-08,-9.878407e-08,-9.845827e-08,-9.81733e-08,1.656545e-09


In [12]:
# http://aplunket.com/random-forest-regressor/
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

max_ts = min(list(tagsToMaxTs.values()))

#tp3
tp3_window_size = 3600 # 1 hour window size
tp3_start_ts = 1422849600
tp3_end_ts = tp3_window_size * math.ceil(max_ts/(tp3_window_size*1.0))
features,labels = getFeatures(tp3_start_ts,tp3_end_ts,tp3_window_size)
save_object(features, "q10_tp3_features")
save_object(labels, "q10_tp3_labels")
print("Finished Building feature vectors for time period 3")

param_grid = {
  'max_depth': [10, 20, 40, 60, 80, 100, 200, None],
  'max_features': ['auto', 'sqrt'],
  'min_samples_leaf': [1, 2, 4],
  'min_samples_split': [2, 5, 10],
  'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

model = GradientBoostingRegressor(random_state=0)
tp3grid = GridSearchCV(estimator=model, param_grid=param_grid, cv = KFold(5, shuffle=True), scoring='neg_mean_squared_error')
tp3grid.fit(features, labels)
save_object(tp3grid,"q10_tp3_grid")
print(tp3grid.best_score_)
print(tp3grid.best_params_)
GridSearch_table_plot(tp3grid, negative=False)

-2455227.1022114074
{'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
best parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
best score:      -2455227.10221 (+/-4004378.34050)
{'alpha': 0.9,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 20,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_iter_no_change': None,
 'presort': 'auto',
 'random_state': 0,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
272,0.078267,0.001489,0.000734,7e-06,20,sqrt,1,2,600,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-2455227.0,4004378.0,1,-9.783298e-08,-9.838896e-08,-9.700794e-08,-9.881365e-08,-9.839766e-08,-9.808824e-08,6.236345e-10
278,0.202923,0.00772,0.001064,0.000139,20,sqrt,1,2,1800,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-2455227.0,4004378.0,1,-9.783298e-08,-9.838896e-08,-9.700794e-08,-9.881365e-08,-9.839766e-08,-9.808824e-08,6.236345e-10
277,0.202947,0.014828,0.001022,6.2e-05,20,sqrt,1,2,1600,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-2455227.0,4004378.0,1,-9.783298e-08,-9.838896e-08,-9.700794e-08,-9.881365e-08,-9.839766e-08,-9.808824e-08,6.236345e-10
276,0.167688,0.009809,0.000982,0.000101,20,sqrt,1,2,1400,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-2455227.0,4004378.0,1,-9.783298e-08,-9.838896e-08,-9.700794e-08,-9.881365e-08,-9.839766e-08,-9.808824e-08,6.236345e-10
275,0.15536,0.013306,0.000986,8.2e-05,20,sqrt,1,2,1200,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-2455227.0,4004378.0,1,-9.783298e-08,-9.838896e-08,-9.700794e-08,-9.881365e-08,-9.839766e-08,-9.808824e-08,6.236345e-10
274,0.124059,0.006336,0.000838,4.5e-05,20,sqrt,1,2,1000,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-2455227.0,4004378.0,1,-9.783298e-08,-9.838896e-08,-9.700794e-08,-9.881365e-08,-9.839766e-08,-9.808824e-08,6.236345e-10
273,0.099187,0.000975,0.000765,1.7e-05,20,sqrt,1,2,800,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-2455227.0,4004378.0,1,-9.783298e-08,-9.838896e-08,-9.700794e-08,-9.881365e-08,-9.839766e-08,-9.808824e-08,6.236345e-10
279,0.222535,0.000808,0.001058,5.9e-05,20,sqrt,1,2,2000,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-2455227.0,4004378.0,1,-9.783298e-08,-9.838896e-08,-9.700794e-08,-9.881365e-08,-9.839766e-08,-9.808824e-08,6.236345e-10
270,0.036775,0.000827,0.000759,0.00011,20,sqrt,1,2,200,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-2455227.0,4004378.0,1,-9.783298e-08,-9.838896e-08,-9.700794e-08,-9.881365e-08,-9.839766e-08,-9.808824e-08,6.236345e-10
271,0.058816,0.000756,0.000751,0.000101,20,sqrt,1,2,400,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-2455227.0,4004378.0,1,-9.783298e-08,-9.838896e-08,-9.700794e-08,-9.881365e-08,-9.839766e-08,-9.808824e-08,6.236345e-10
