### QUESTION 10: 

For each time period described in Question 6, perform the same grid search above for GradientBoostingRegressor (with corresponding time window length). Does the cross-validation test error change? Are the best parameter set you find in each period agree with those you found above?

In [1]:
hash_tags = ['#gohawks','#gopatriots','#nfl','#patriots','#sb49','#superbowl']

In [2]:
import pickle

def save_object(data, fileName):
    with open('pynb_data/'+fileName + ".pickle", 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        
def load_object(fileName):
    try:
        with open('pynb_data/'+fileName + ".pickle", 'rb') as f:
            data = pickle.load(f)
            return data
    except IOError:
        print("Could not read file: " + fileName)

In [3]:
# https://www.kaggle.com/grfiv4/displaying-the-results-of-a-grid-search
import numpy  as np
import pandas as pd

from sklearn.model_selection import GridSearchCV

def GridSearch_table_plot(grid_clf,
                          num_results=15,
                          negative=True,
                          graph=True,
                          display_all_params=True):

    from matplotlib      import pyplot as plt
    from IPython.display import display
    import pandas as pd

    clf = grid_clf.best_estimator_
    clf_params = grid_clf.best_params_
    if negative:
        clf_score = -grid_clf.best_score_
    else:
        clf_score = grid_clf.best_score_
    clf_stdev = grid_clf.cv_results_['std_test_score'][grid_clf.best_index_]
    cv_results = grid_clf.cv_results_

    print("best parameters: {}".format(clf_params))
    print("best score:      {:0.5f} (+/-{:0.5f})".format(clf_score, clf_stdev))
    if display_all_params:
        import pprint
        pprint.pprint(clf.get_params())

    # pick out the best results
    # =========================
    scores_df = pd.DataFrame(cv_results).sort_values(by='rank_test_score')

    best_row = scores_df.iloc[0, :]
    if negative:
        best_mean = -best_row['mean_test_score']
    else:
        best_mean = best_row['mean_test_score']
    best_stdev = best_row['std_test_score']

    # display the top 'num_results' results
    # =====================================
    display(pd.DataFrame(cv_results) \
            .sort_values(by='rank_test_score').head(num_results))

In [4]:
import json

def getMinAndMaxTs(tag):
    filename = 'data/tweets_'+tag+'.txt'
    max_ts = 0
    min_ts = 1552522378
    with open(filename) as f:
        for line in f:
            json_object = json.loads(line)
            timestamp = json_object['citation_date']
            if(timestamp < min_ts):                
                min_ts = timestamp
            
            if(timestamp > max_ts):
                max_ts = timestamp
                
    return [min_ts,max_ts]

tagsToMinTs = {}
tagsToMaxTs = {}
for tag in hash_tags:
    ts_list = getMinAndMaxTs(tag)
    tagsToMinTs[tag] = (ts_list[0])
    tagsToMaxTs[tag] = (ts_list[1])    

In [5]:
import math
import datetime
import pytz


def getLocalHour(timestamp):
    d = datetime.datetime.fromtimestamp(timestamp)
    pst = pytz.timezone('America/Los_Angeles')
    d = pst.localize(d)
    return d.hour

def getWindowNumber(start_ts, curr_ts, window):
    elapsed = (curr_ts - start_ts)/(window*1.0)
    windowNum = math.ceil(elapsed)
    return windowNum    

def getFeatures(start_ts,end_ts,window):
    windowToTweets = {}
    windowToRetweets = {}
    windowToFollowerCount = {}
    windowToMaxFollowers = {}
    features = []
    labels = []
    
    for tag in hash_tags:
        filename = 'data/tweets_'+tag+'.txt'
        with open(filename) as f:
            for line in f:
                json_object = json.loads(line)
                timestamp = json_object['citation_date']
            
                if timestamp < start_ts or timestamp > end_ts:                            
                    continue
                
                key = getWindowNumber(start_ts,timestamp,window)
    #             print(key)
                if key not in windowToTweets.keys():
                    windowToTweets[key]=0
                windowToTweets[key]+=1
            
                retweetCount = json_object['metrics']['citations']['total']        
            
                if key not in windowToRetweets.keys():
                    windowToRetweets[key]=0
                windowToRetweets[key]+=retweetCount
        
                followerCount = json_object['author']['followers']
                if key not in windowToFollowerCount.keys():
                    windowToFollowerCount[key]=0
                windowToFollowerCount[key]+=followerCount
        
                if key not in windowToMaxFollowers.keys():
                    windowToMaxFollowers[key]=0
                windowToMaxFollowers[key] = max(windowToMaxFollowers[key],followerCount)            
            
    for period in range(start_ts,end_ts,window):
        key = getWindowNumber(start_ts,period,window)
        tweetCount = windowToTweets.get(key, 0)
        retweetCount = windowToRetweets.get(key,0)
        followerCount = windowToFollowerCount.get(key,0)
        maxFollowers = windowToMaxFollowers.get(key,0)

        h = getLocalHour(period)
            
        feature = [tweetCount, retweetCount, followerCount, maxFollowers, h]
        features.append(feature)
                
        nextKey = getWindowNumber(start_ts, period + window, window)
        labels.append(windowToTweets.get(nextKey,0))
                
    return features,labels

In [6]:
# http://aplunket.com/random-forest-regressor/
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

min_ts = min(list(tagsToMinTs.values()))

#tp1
tp1_window_size = 3600 # 1 hour window size
tp1_start_ts = tp1_window_size * math.floor(min_ts/(tp1_window_size*1.0))
tp1_end_ts = 1422806400
features,labels = getFeatures(tp1_start_ts,tp1_end_ts,tp1_window_size)
save_object(features, "q10_tp1_features")
save_object(labels, "q10_tp1_labels")


param_grid = {
  'max_depth': [10, 20, 40, 60, 80, 100, 200, None],
  'max_features': ['auto', 'sqrt'],
  'min_samples_leaf': [1, 2, 4],
  'min_samples_split': [2, 5, 10],
  'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

model = GradientBoostingRegressor(random_state=0)
tp1grid = GridSearchCV(estimator=model, param_grid=param_grid, cv = KFold(5, shuffle=True), scoring='neg_mean_squared_error')
tp1grid.fit(features, labels)
save_object(tp1grid,"q10_tp1_grid")

print(tp1grid.best_score_)
print(tp1grid.best_params_)
GridSearch_table_plot(tp1grid, negative=False)

-5205833.936769421
{'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
best parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
best score:      -5205833.93677 (+/-3851746.31762)
{'alpha': 0.9,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 20,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_iter_no_change': None,
 'presort': 'auto',
 'random_state': 0,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
350,0.083136,0.000678,0.001678,2.1e-05,20.0,sqrt,4,10,200,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-5205834.0,3851746.0,1,-14380.53,-3648.297,-5270.572,-8932.388,-18852.14,-10216.79,5677.428
710,0.084725,0.00191,0.001704,3e-05,60.0,sqrt,4,10,200,"{'max_depth': 60, 'max_features': 'sqrt', 'min...",...,-5270269.0,3825482.0,2,-15037.91,-4765.791,-5884.027,-9897.276,-17519.02,-10620.8,4986.74
1070,0.097746,0.016656,0.001931,0.000321,100.0,sqrt,4,10,200,"{'max_depth': 100, 'max_features': 'sqrt', 'mi...",...,-5270269.0,3825482.0,2,-15037.91,-4765.791,-5884.027,-9897.276,-17519.02,-10620.8,4986.74
1430,0.090434,0.005297,0.00183,0.00025,,sqrt,4,10,200,"{'max_depth': None, 'max_features': 'sqrt', 'm...",...,-5270269.0,3825482.0,2,-15037.91,-4765.791,-5884.027,-9897.276,-17519.02,-10620.8,4986.74
1250,0.091686,0.002917,0.001816,9.4e-05,200.0,sqrt,4,10,200,"{'max_depth': 200, 'max_features': 'sqrt', 'mi...",...,-5270269.0,3825482.0,2,-15037.91,-4765.791,-5884.027,-9897.276,-17519.02,-10620.8,4986.74
890,0.089678,0.003241,0.00174,0.000153,80.0,sqrt,4,10,200,"{'max_depth': 80, 'max_features': 'sqrt', 'min...",...,-5270269.0,3825482.0,2,-15037.91,-4765.791,-5884.027,-9897.276,-17519.02,-10620.8,4986.74
530,0.084457,0.00125,0.001587,1.9e-05,40.0,sqrt,4,10,200,"{'max_depth': 40, 'max_features': 'sqrt', 'min...",...,-5270269.0,3825482.0,2,-15037.91,-4765.791,-5884.027,-9897.276,-17519.02,-10620.8,4986.74
351,0.169247,0.001208,0.003001,8.2e-05,20.0,sqrt,4,10,400,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-5289115.0,3823887.0,8,-475.5595,-65.75884,-54.78322,-141.1387,-254.7885,-198.4057,155.8816
352,0.255423,0.000874,0.004292,7.8e-05,20.0,sqrt,4,10,600,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-5296608.0,3822706.0,9,-14.82558,-1.477281,-1.94053,-2.226468,-2.531708,-4.600313,5.124413
353,0.341228,0.001444,0.005613,8.4e-05,20.0,sqrt,4,10,800,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-5297102.0,3822537.0,10,-0.3469281,-0.04428356,-0.08485817,-0.03077382,-0.02127267,-0.1056233,0.1225881


In [7]:
# http://aplunket.com/random-forest-regressor/
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

#tp2
tp2_window_size = 300 # 5 minute window size
tp2_start_ts = 1422806400
tp2_end_ts = 1422849600
features,labels = getFeatures(tp2_start_ts,tp2_end_ts,tp2_window_size)
save_object(features, "q10_tp2_features")
save_object(labels, "q10_tp2_labels")    


param_grid = {
  'max_depth': [10, 20, 40, 60, 80, 100, 200, None],
  'max_features': ['auto', 'sqrt'],
  'min_samples_leaf': [1, 2, 4],
  'min_samples_split': [2, 5, 10],
  'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

model = GradientBoostingRegressor(random_state=0)
tp2grid = GridSearchCV(estimator=model, param_grid=param_grid, cv = KFold(5, shuffle=True), scoring='neg_mean_squared_error')
tp2grid.fit(features, labels)
save_object(tp2grid,"q10_tp2_grid")
print(tp2grid.best_score_)
print(tp2grid.best_params_)
GridSearch_table_plot(tp2grid, negative=False)

-23558349.6669906
{'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
best parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
best score:      -23558349.66699 (+/-8666798.41696)
{'alpha': 0.9,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 20,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_iter_no_change': None,
 'presort': 'auto',
 'random_state': 0,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
330,0.133599,0.020495,0.001047,0.000197,20.0,sqrt,4,2,200,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-23558350.0,8666798.0,1,-139299.808515,-79597.197553,-55006.512585,-71238.340574,-370257.213208,-143079.814487,117108.501373
340,0.065123,0.00267,0.000892,2e-05,20.0,sqrt,4,5,200,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",...,-23558350.0,8666798.0,1,-139299.808515,-79597.197553,-55006.512585,-71238.340574,-370257.213208,-143079.814487,117108.501373
1230,0.036593,0.000245,0.000615,6e-06,200.0,sqrt,4,2,200,"{'max_depth': 200, 'max_features': 'sqrt', 'mi...",...,-23624070.0,8609108.0,3,-139296.402748,-79597.197553,-55006.512585,-71238.340574,-398393.794167,-148706.449525,128054.202469
520,0.054824,0.001491,0.000935,6.1e-05,40.0,sqrt,4,5,200,"{'max_depth': 40, 'max_features': 'sqrt', 'min...",...,-23624070.0,8609108.0,3,-139296.402748,-79597.197553,-55006.512585,-71238.340574,-398393.794167,-148706.449525,128054.202469
1420,0.036579,0.000368,0.000621,9e-06,,sqrt,4,5,200,"{'max_depth': None, 'max_features': 'sqrt', 'm...",...,-23624070.0,8609108.0,3,-139296.402748,-79597.197553,-55006.512585,-71238.340574,-398393.794167,-148706.449525,128054.202469
1410,0.03668,0.000304,0.000625,1.3e-05,,sqrt,4,2,200,"{'max_depth': None, 'max_features': 'sqrt', 'm...",...,-23624070.0,8609108.0,3,-139296.402748,-79597.197553,-55006.512585,-71238.340574,-398393.794167,-148706.449525,128054.202469
700,0.050872,0.002234,0.000792,0.000111,60.0,sqrt,4,5,200,"{'max_depth': 60, 'max_features': 'sqrt', 'min...",...,-23624070.0,8609108.0,3,-139296.402748,-79597.197553,-55006.512585,-71238.340574,-398393.794167,-148706.449525,128054.202469
1050,0.056643,0.0032,0.000997,0.000129,100.0,sqrt,4,2,200,"{'max_depth': 100, 'max_features': 'sqrt', 'mi...",...,-23624070.0,8609108.0,3,-139296.402748,-79597.197553,-55006.512585,-71238.340574,-398393.794167,-148706.449525,128054.202469
690,0.051117,0.001003,0.000856,8.9e-05,60.0,sqrt,4,2,200,"{'max_depth': 60, 'max_features': 'sqrt', 'min...",...,-23624070.0,8609108.0,3,-139296.402748,-79597.197553,-55006.512585,-71238.340574,-398393.794167,-148706.449525,128054.202469
510,0.055854,0.003431,0.000886,5.2e-05,40.0,sqrt,4,2,200,"{'max_depth': 40, 'max_features': 'sqrt', 'min...",...,-23624070.0,8609108.0,3,-139296.402748,-79597.197553,-55006.512585,-71238.340574,-398393.794167,-148706.449525,128054.202469


In [8]:
# http://aplunket.com/random-forest-regressor/
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

max_ts = min(list(tagsToMaxTs.values()))

#tp3
tp3_window_size = 3600 # 1 hour window size
tp3_start_ts = 1422849600
tp3_end_ts = tp3_window_size * math.ceil(max_ts/(tp3_window_size*1.0))
features,labels = getFeatures(tp3_start_ts,tp3_end_ts,tp3_window_size)
save_object(features, "q10_tp3_features")
save_object(labels, "q10_tp3_labels")
print("Finished Building feature vectors for time period 3")

param_grid = {
  'max_depth': [10, 20, 40, 60, 80, 100, 200, None],
  'max_features': ['auto', 'sqrt'],
  'min_samples_leaf': [1, 2, 4],
  'min_samples_split': [2, 5, 10],
  'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

model = GradientBoostingRegressor(random_state=0)
tp3grid = GridSearchCV(estimator=model, param_grid=param_grid, cv = KFold(5, shuffle=True), scoring='neg_mean_squared_error')
tp3grid.fit(features, labels)
save_object(tp3grid,"q10_tp3_grid")
print(tp3grid.best_score_)
print(tp3grid.best_params_)
GridSearch_table_plot(tp3grid, negative=False)

Finished Building feature vectors for time period 3
-2375903.408523168
{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
best parameters: {'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
best score:      -2375903.40852 (+/-3820061.35853)
{'alpha': 0.9,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_iter_no_change': None,
 'presort': 'auto',
 'random_state': 0,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
20,0.047813,0.001285,0.00062,9.2e-05,10,auto,1,10,200,"{'max_depth': 10, 'max_features': 'auto', 'min...",...,-2375903.0,3820061.0,1,-0.08472608,-0.2277975,-0.1220762,-0.03439058,-0.06783809,-0.1073657,0.06651403
23,0.141472,0.008107,0.000961,0.000144,10,auto,1,10,800,"{'max_depth': 10, 'max_features': 'auto', 'min...",...,-2375982.0,3820158.0,2,-9.900963e-08,-9.888689e-08,-9.703609e-08,-9.835045e-08,-9.94135e-08,-9.853931e-08,8.248119e-10
24,0.150608,0.004526,0.000911,5.1e-05,10,auto,1,10,1000,"{'max_depth': 10, 'max_features': 'auto', 'min...",...,-2375982.0,3820158.0,2,-9.900963e-08,-9.888689e-08,-9.703609e-08,-9.835045e-08,-9.94135e-08,-9.853931e-08,8.248119e-10
25,0.16734,0.002764,0.000906,2.8e-05,10,auto,1,10,1200,"{'max_depth': 10, 'max_features': 'auto', 'min...",...,-2375982.0,3820158.0,2,-9.900963e-08,-9.888689e-08,-9.703609e-08,-9.835045e-08,-9.94135e-08,-9.853931e-08,8.248119e-10
26,0.186351,0.001917,0.000942,3.9e-05,10,auto,1,10,1400,"{'max_depth': 10, 'max_features': 'auto', 'min...",...,-2375982.0,3820158.0,2,-9.900963e-08,-9.888689e-08,-9.703609e-08,-9.835045e-08,-9.94135e-08,-9.853931e-08,8.248119e-10
27,0.20481,0.002532,0.000972,2.6e-05,10,auto,1,10,1600,"{'max_depth': 10, 'max_features': 'auto', 'min...",...,-2375982.0,3820158.0,2,-9.900963e-08,-9.888689e-08,-9.703609e-08,-9.835045e-08,-9.94135e-08,-9.853931e-08,8.248119e-10
28,0.225701,0.002553,0.001041,9.3e-05,10,auto,1,10,1800,"{'max_depth': 10, 'max_features': 'auto', 'min...",...,-2375982.0,3820158.0,2,-9.900963e-08,-9.888689e-08,-9.703609e-08,-9.835045e-08,-9.94135e-08,-9.853931e-08,8.248119e-10
29,0.249362,0.010997,0.001032,1.7e-05,10,auto,1,10,2000,"{'max_depth': 10, 'max_features': 'auto', 'min...",...,-2375982.0,3820158.0,2,-9.900963e-08,-9.888689e-08,-9.703609e-08,-9.835045e-08,-9.94135e-08,-9.853931e-08,8.248119e-10
22,0.124826,0.003669,0.001006,0.000142,10,auto,1,10,600,"{'max_depth': 10, 'max_features': 'auto', 'min...",...,-2375982.0,3820158.0,2,-9.900963e-08,-9.888689e-08,-9.703609e-08,-9.835045e-08,-9.94135e-08,-9.853931e-08,8.248119e-10
21,0.099546,0.003574,0.000763,1.5e-05,10,auto,1,10,400,"{'max_depth': 10, 'max_features': 'auto', 'min...",...,-2375982.0,3820158.0,10,-8.847576e-07,-2.700952e-06,-1.984245e-06,-2.310233e-07,-3.008384e-07,-1.220363e-06,9.708358e-07


## Comparison with Q7
Performing cross validation for question 7

In [9]:
# https://stackoverflow.com/questions/41045752/using-statsmodel-estimations-with-scikit-learn-cross-validation-is-it-possible
import statsmodels.api as sm
from sklearn.base import BaseEstimator, RegressorMixin

class SMWrapper(BaseEstimator, RegressorMixin):
    """ A universal sklearn-style wrapper for statsmodels regressors """
    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept
    def fit(self, X, y):
        if self.fit_intercept:
            X = sm.add_constant(X)
        self.model_ = self.model_class(y, X)
        self.results_ = self.model_.fit()
    def predict(self, X):
        if self.fit_intercept:
            X = sm.add_constant(X)
        return self.results_.predict(X)

### Cross validation for timeperiod 1

In [11]:
from sklearn.model_selection import cross_val_score
# features,labels = getFeatures(tp1_start_ts,tp1_end_ts,tp1_window_size)
features = load_object("q10_tp1_features")
labels = load_object("q10_tp1_labels")
tp1_cross_val_score = cross_val_score(SMWrapper(sm.OLS), features, labels, scoring='neg_mean_squared_error')
print(tp1_cross_val_score)

[-13651136.84635217   -331190.35619098  -3216831.56470503]




### Cross validation for timeperiod 2

In [12]:
# features,labels = getFeatures(tp1_start_ts,tp1_end_ts,tp1_window_size)
features = load_object("q10_tp2_features")
labels = load_object("q10_tp2_labels")
tp2_cross_val_score = cross_val_score(SMWrapper(sm.OLS), features, labels, scoring='neg_mean_squared_error')
print(tp2_cross_val_score)

[-3.39534800e+06 -1.10900854e+07 -1.20488027e+08]




### Cross validation for timeperiod 3

In [13]:
# features,labels = getFeatures(tp1_start_ts,tp1_end_ts,tp1_window_size)
features = load_object("q10_tp3_features")
labels = load_object("q10_tp3_labels")
tp3_cross_val_score = cross_val_score(SMWrapper(sm.OLS), features, labels, scoring='neg_mean_squared_error')
print(tp3_cross_val_score)

[-9149627.39063494  -400244.03042572  -449373.08022806]


