### QUESTION 8: 

Use grid search to find the best parameter set for RandomForestRegressor and GradientBoostingRegressor respectively. Use the following param_grid

```
{
  'max_depth': [10, 20, 40, 60, 80, 100, 200, None],
  'max_features': ['auto', 'sqrt'],
  'min_samples_leaf': [1, 2, 4],
  'min_samples_split': [2, 5, 10],
  'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}
```

In [1]:
hash_tags = ['#gohawks','#gopatriots','#nfl','#patriots','#sb49','#superbowl']

In [2]:
import pickle

def save_object(data, fileName):
    with open('pynb_data/'+fileName + ".pickle", 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        
def load_object(fileName):
    try:
        with open('pynb_data/'+fileName + ".pickle", 'rb') as f:
            data = pickle.load(f)
            return data
    except IOError:
        print("Could not read file: " + fileName)

In [3]:
# https://www.kaggle.com/grfiv4/displaying-the-results-of-a-grid-search
import numpy  as np
import pandas as pd

from sklearn.model_selection import GridSearchCV

def GridSearch_table_plot(grid_clf,
                          num_results=15,
                          negative=True,
                          graph=True,
                          display_all_params=True):

    from matplotlib      import pyplot as plt
    from IPython.display import display
    import pandas as pd

    clf = grid_clf.best_estimator_
    clf_params = grid_clf.best_params_
    if negative:
        clf_score = -grid_clf.best_score_
    else:
        clf_score = grid_clf.best_score_
    clf_stdev = grid_clf.cv_results_['std_test_score'][grid_clf.best_index_]
    cv_results = grid_clf.cv_results_

    print("best parameters: {}".format(clf_params))
    print("best score:      {:0.5f} (+/-{:0.5f})".format(clf_score, clf_stdev))
    if display_all_params:
        import pprint
        pprint.pprint(clf.get_params())

    # pick out the best results
    # =========================
    scores_df = pd.DataFrame(cv_results).sort_values(by='rank_test_score')

    best_row = scores_df.iloc[0, :]
    if negative:
        best_mean = -best_row['mean_test_score']
    else:
        best_mean = best_row['mean_test_score']
    best_stdev = best_row['std_test_score']

    # display the top 'num_results' results
    # =====================================
    display(pd.DataFrame(cv_results) \
            .sort_values(by='rank_test_score').head(num_results))

In [4]:
import json

def getMinAndMaxTs(tag):
    filename = 'data/tweets_'+tag+'.txt'
    max_ts = 0
    min_ts = 1552522378
    with open(filename) as f:
        for line in f:
            json_object = json.loads(line)
            timestamp = json_object['citation_date']
            if(timestamp < min_ts):                
                min_ts = timestamp
            
            if(timestamp > max_ts):
                max_ts = timestamp
                
    return [min_ts,max_ts]

tagsToMinTs = {}
tagsToMaxTs = {}
for tag in hash_tags:
    ts_list = getMinAndMaxTs(tag)
    tagsToMinTs[tag] = (ts_list[0])
    tagsToMaxTs[tag] = (ts_list[1])    

In [6]:
import math
import datetime
import pytz


def getLocalHour(timestamp):
    d = datetime.datetime.fromtimestamp(timestamp)
    pst = pytz.timezone('America/Los_Angeles')
    d = pst.localize(d)
    return d.hour

def getWindowNumber(start_ts, curr_ts, window):
    elapsed = (curr_ts - start_ts)/(window*1.0)
    windowNum = math.ceil(elapsed)
    return windowNum    

def getFeatures(start_ts,end_ts,window):
    windowToTweets = {}
    windowToRetweets = {}
    windowToFollowerCount = {}
    windowToMaxFollowers = {}
    features = []
    labels = []
    
    for tag in hash_tags:
        filename = 'data/tweets_'+tag+'.txt'
        with open(filename) as f:
            for line in f:
                json_object = json.loads(line)
                timestamp = json_object['citation_date']
            
                if timestamp < start_ts or timestamp > end_ts:                            
                    continue
                
                key = getWindowNumber(start_ts,timestamp,window)
                if key not in windowToTweets.keys():
                    windowToTweets[key]=0
                windowToTweets[key]+=1
            
                retweetCount = json_object['metrics']['citations']['total']        
            
                if key not in windowToRetweets.keys():
                    windowToRetweets[key]=0
                windowToRetweets[key]+=retweetCount
        
                followerCount = json_object['author']['followers']
                if key not in windowToFollowerCount.keys():
                    windowToFollowerCount[key]=0
                windowToFollowerCount[key]+=followerCount
        
                if key not in windowToMaxFollowers.keys():
                    windowToMaxFollowers[key]=0
                windowToMaxFollowers[key] = max(windowToMaxFollowers[key],followerCount)            
            
    for period in range(start_ts,end_ts,window):
        key = getWindowNumber(start_ts,period,window)
        tweetCount = windowToTweets.get(key, 0)
        retweetCount = windowToRetweets.get(key,0)
        followerCount = windowToFollowerCount.get(key,0)
        maxFollowers = windowToMaxFollowers.get(key,0)

        h = getLocalHour(period)
            
        feature = [tweetCount, retweetCount, followerCount, maxFollowers, h]
        features.append(feature)
                
        nextKey = getWindowNumber(start_ts, period + window, window)
        labels.append(windowToTweets.get(nextKey,0))
                
    return features,labels

In [7]:
min_ts = min(list(tagsToMinTs.values()))
max_ts = max(list(tagsToMaxTs.values()))
tp1_window_size = 3600 
tp1_start_ts = tp1_window_size * math.floor(min_ts/(tp1_window_size*1.0))
tp1_end_ts = tp1_window_size * math.ceil(max_ts/(tp1_window_size*1.0))
features,labels = getFeatures(tp1_start_ts,tp1_end_ts,tp1_window_size)
# save_object(features,"q8_features")
# save_object(labels,"q8_labels")

In [8]:
# http://aplunket.com/random-forest-regressor/
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

param_grid = {
  'max_depth': [10, 20, 40, 60, 80, 100, 200, None],
  'max_features': ['auto', 'sqrt'],
  'min_samples_leaf': [1, 2, 4],
  'min_samples_split': [2, 5, 10],
  'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

model = RandomForestRegressor(random_state=0)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv = KFold(5, shuffle=True), scoring='neg_mean_squared_error')
grid.fit(features, labels)

print(grid.best_score_)
print(grid.best_params_)

GridSearch_table_plot(grid, negative=False)



-180257932.325396
{'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
best parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
best score:      -180257932.32540 (+/-175667410.37540)
{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': 40,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
990,0.28971,0.026831,0.019583,0.003989,100.0,sqrt,1,2,200,"{'max_depth': 100, 'max_features': 'sqrt', 'mi...",...,-180257900.0,175667400.0,1,-52409620.0,-17842420.0,-52214160.0,-52356920.0,-36765230.0,-42317670.0,13641420.0
450,0.183841,0.000932,0.010678,0.000172,40.0,sqrt,1,2,200,"{'max_depth': 40, 'max_features': 'sqrt', 'min...",...,-180257900.0,175667400.0,1,-52409620.0,-17842420.0,-52214160.0,-52356920.0,-36765230.0,-42317670.0,13641420.0
1170,0.186662,0.002706,0.010314,0.000199,200.0,sqrt,1,2,200,"{'max_depth': 200, 'max_features': 'sqrt', 'mi...",...,-180257900.0,175667400.0,1,-52409620.0,-17842420.0,-52214160.0,-52356920.0,-36765230.0,-42317670.0,13641420.0
1350,0.183386,0.000981,0.01061,0.000123,,sqrt,1,2,200,"{'max_depth': None, 'max_features': 'sqrt', 'm...",...,-180257900.0,175667400.0,1,-52409620.0,-17842420.0,-52214160.0,-52356920.0,-36765230.0,-42317670.0,13641420.0
630,0.273457,0.054088,0.016758,0.005295,60.0,sqrt,1,2,200,"{'max_depth': 60, 'max_features': 'sqrt', 'min...",...,-180257900.0,175667400.0,1,-52409620.0,-17842420.0,-52214160.0,-52356920.0,-36765230.0,-42317670.0,13641420.0
810,0.191708,0.007534,0.011074,0.000889,80.0,sqrt,1,2,200,"{'max_depth': 80, 'max_features': 'sqrt', 'min...",...,-180257900.0,175667400.0,1,-52409620.0,-17842420.0,-52214160.0,-52356920.0,-36765230.0,-42317670.0,13641420.0
270,0.206599,0.00857,0.011924,0.001286,20.0,sqrt,1,2,200,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-180611800.0,174019600.0,7,-53032540.0,-17681750.0,-53594450.0,-53004150.0,-36276440.0,-42717870.0,14133640.0
90,0.181074,0.015392,0.010734,0.001675,10.0,sqrt,1,2,200,"{'max_depth': 10, 'max_features': 'sqrt', 'min...",...,-181278200.0,179475500.0,8,-54924020.0,-16920610.0,-53757700.0,-49652830.0,-37038500.0,-42458730.0,14257490.0
93,0.732189,0.058637,0.045089,0.014729,10.0,sqrt,1,2,800,"{'max_depth': 10, 'max_features': 'sqrt', 'min...",...,-182556900.0,178985200.0,9,-51926160.0,-18309610.0,-46582430.0,-47371890.0,-35806660.0,-39999350.0,12066180.0
991,0.527839,0.025612,0.025182,0.004011,100.0,sqrt,1,2,400,"{'max_depth': 100, 'max_features': 'sqrt', 'mi...",...,-183028800.0,179025700.0,10,-49918070.0,-18189640.0,-48709080.0,-51365920.0,-37411220.0,-41118790.0,12486280.0


In [9]:
# save_object(grid,"q8_grid1")

In [7]:
# features = load_object("q8_features")
# labels = load_object("q8_labels")
# grid=load_object("q8_grid1")
# GridSearch_table_plot(grid, negative=False)

In [24]:
from sklearn.metrics import mean_squared_error

y_pred = grid.predict(features)
y_true = labels

print("Mean squared error for RandomForestRegressor grid search was "+str(mean_squared_error(y_true, y_pred)))

import statsmodels.api as sm
import statsmodels.tools.eval_measures as ste


# print('\nTesting OLS model')
X = features
y = labels
    
X = sm.add_constant(X)
    
model = sm.OLS(y,X)
results = model.fit()
pred_y = results.predict(X)
print("Mean squared error for OLS was : {}".format(ste.mse(pred_y, y,axis=0)))

print('---'*20)

Mean squared error for RandomForestRegressor grid search was 100190562.76714939
Mean squared error for OLS was : 137228551.19951168
------------------------------------------------------------


In [9]:
# http://aplunket.com/random-forest-regressor/
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

param_grid = {
  'max_depth': [10, 20, 40, 60, 80, 100, 200, None],
  'max_features': ['auto', 'sqrt'],
  'min_samples_leaf': [1, 2, 4],
  'min_samples_split': [2, 5, 10],
  'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

model = GradientBoostingRegressor(random_state=0)
newgrid = GridSearchCV(estimator=model, param_grid=param_grid, cv = KFold(5, shuffle=True), scoring='neg_mean_squared_error')
newgrid.fit(features, labels)

GridSearch_table_plot(newgrid, negative=False)



best parameters: {'max_depth': 60, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 2000}
best score:      -236408802.62328 (+/-286006774.67193)
{'alpha': 0.9,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 60,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 2000,
 'n_iter_no_change': None,
 'presort': 'auto',
 'random_state': 0,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
719,1.425425,0.052532,0.021487,0.00163,60.0,sqrt,4,10,2000,"{'max_depth': 60, 'max_features': 'sqrt', 'min...",...,-236408800.0,286006800.0,1,-79.504452,-1.464418,-0.130069,-48.490404,-19.189662,-29.755801,30.383032
1259,1.295127,0.040703,0.020831,0.00112,200.0,sqrt,4,10,2000,"{'max_depth': 200, 'max_features': 'sqrt', 'mi...",...,-236408800.0,286006800.0,1,-79.504452,-1.464418,-0.130069,-48.490404,-19.189662,-29.755801,30.383032
1079,1.484065,0.031649,0.022094,0.001505,100.0,sqrt,4,10,2000,"{'max_depth': 100, 'max_features': 'sqrt', 'mi...",...,-236408800.0,286006800.0,1,-79.504452,-1.464418,-0.130069,-48.490404,-19.189662,-29.755801,30.383032
899,1.329578,0.070811,0.021393,0.001408,80.0,sqrt,4,10,2000,"{'max_depth': 80, 'max_features': 'sqrt', 'min...",...,-236408800.0,286006800.0,1,-79.504452,-1.464418,-0.130069,-48.490404,-19.189662,-29.755801,30.383032
1439,1.287792,0.041935,0.020906,0.001149,,sqrt,4,10,2000,"{'max_depth': None, 'max_features': 'sqrt', 'm...",...,-236408800.0,286006800.0,1,-79.504452,-1.464418,-0.130069,-48.490404,-19.189662,-29.755801,30.383032
1258,1.178348,0.027755,0.01925,0.001035,200.0,sqrt,4,10,1800,"{'max_depth': 200, 'max_features': 'sqrt', 'mi...",...,-236419000.0,286023500.0,6,-276.661494,-7.846374,-0.738628,-120.793606,-81.243248,-97.45667,100.305774
1078,1.360885,0.062085,0.020136,0.001798,100.0,sqrt,4,10,1800,"{'max_depth': 100, 'max_features': 'sqrt', 'mi...",...,-236419000.0,286023500.0,6,-276.661494,-7.846374,-0.738628,-120.793606,-81.243248,-97.45667,100.305774
898,1.181797,0.029768,0.019242,0.000959,80.0,sqrt,4,10,1800,"{'max_depth': 80, 'max_features': 'sqrt', 'min...",...,-236419000.0,286023500.0,6,-276.661494,-7.846374,-0.738628,-120.793606,-81.243248,-97.45667,100.305774
718,1.363421,0.079847,0.019916,0.001466,60.0,sqrt,4,10,1800,"{'max_depth': 60, 'max_features': 'sqrt', 'min...",...,-236419000.0,286023500.0,6,-276.661494,-7.846374,-0.738628,-120.793606,-81.243248,-97.45667,100.305774
1438,1.179601,0.033053,0.019266,0.000918,,sqrt,4,10,1800,"{'max_depth': None, 'max_features': 'sqrt', 'm...",...,-236419000.0,286023500.0,6,-276.661494,-7.846374,-0.738628,-120.793606,-81.243248,-97.45667,100.305774


In [11]:
# save_object(newgrid,"q8_grid2")

In [9]:
# features = load_object("q8_features")
# labels = load_object("q8_labels")
# newgrid=load_object("q8_grid2")
# GridSearch_table_plot(newgrid, negative=False)