In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import sys
import json
from datetime import datetime
from tqdm import tqdm
import pytz
%matplotlib inline

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Training and Testing directories
training_dir = os.path.join("Datasets", "Training")
testing_dir = os.path.join("Datasets", "Testing")
if not os.path.isdir(training_dir):
    raise Exception("ERROR: training dataset not found")
if not os.path.isdir(testing_dir):
    raise Exception("ERROR: testing dataset not found")

In [3]:
# iterate over all hashtag files 
for root, dirs, files in os.walk(training_dir, topdown=False):
    for file in files:
        print(os.path.join(root, file))

Datasets\Training\tweets_#gohawks.txt
Datasets\Training\tweets_#gopatriots.txt
Datasets\Training\tweets_#nfl.txt
Datasets\Training\tweets_#patriots.txt
Datasets\Training\tweets_#sb49.txt
Datasets\Training\tweets_#superbowl.txt


In [4]:
# Initialize dictionaries and Unix times for Feb 1, 8 am and Feb 1, 8 pm. 
# Dictionary keys: hashtag.
# Dictionary values: [time of tweet (Unix), number of retweets for tweet, number of followers for tweeter]
# Each row in dictionary value is an individual tweet.

hashtag_dict_before = {}
hashtag_dict_during = {}
hashtag_dict_after = {}
start_unix_time = 1422806400 # 8 am, Feb 1, PST
end_unix_time = 1422849600 # 8 pm, Feb 1, PST
pst_tz = pytz.timezone('America/Los_Angeles')


In [5]:
""" Parse files to get necessary data """

for root, dirs, files in os.walk(training_dir, topdown=False):
    for file in files:
        filename = os.path.splitext(file)[0].replace('tweets_#', '')
        print('Parsing {}...'.format(filename))
        
        hashtag_dict_before[filename] = []
        hashtag_dict_during[filename] = []
        hashtag_dict_after[filename] = []
        
        # open the file and read all lines:
        with open(os.path.join(root, file), "r", encoding="utf-8") as hashtag:
            # read line-by-line
            for line in hashtag:
                json_obj = json.loads(line)
                
                # get desired statistics
                citation_date = json_obj['citation_date'] # Unix time
                num_retweets = json_obj['metrics']['citations']['total'] # Number of retweets for this tweet
                num_followers = json_obj['author']['followers'] # Number of followers for tweeter
                
                # Check when tweet was made and add it to corresponding dictionary
                if citation_date < start_unix_time:
                    hashtag_dict_before[filename].append([citation_date, num_retweets, num_followers])
                elif citation_date > end_unix_time:
                    hashtag_dict_after[filename].append([citation_date, num_retweets, num_followers])
                else:
                    hashtag_dict_during[filename].append([citation_date, num_retweets, num_followers])
    print('done')

Parsing gohawks...
Parsing gopatriots...
Parsing nfl...
Parsing patriots...
Parsing sb49...
Parsing superbowl...
done


## Organize Data

##### Variables:
<span>
key = one of the hashtags <br \> <br \>
</span>

<span>
data_hashtag_before[key] = data before 2/1 8am, split into 1-hour windows (separated by hashtag) <br \>
data_hashtag_during[key] = data between 2/1 8am and 8pm, split into 5-min windows (separated by hashtag) <br \>
data_hashtag_after[key] = data after 2/1 8pm, split into 1-hour windows (separated by hashtag) <br \> <br \>
</span>

<span>
data_aggregate_before = data before 2/1 8am, split into 1-hour windows (all hashtags combined) <br \>
data_aggregate_during = data between 2/1 8am and 8pm, split into 5-min windows (all hashtags combined) <br \>
data_aggregate_after = data after 2/1 8pm, split into 1-hour windows (all hashtags combined) <br \> <br \>
</span>

<span>
data_hashtag_all[key] = all data, split into 1-hour windows (separated by hashtag) <br \> <br \>
</span>

<span>
data_all = all data, split into 1-hour windows (all hashtags combined) <br \>
</span>

In [6]:
# Explicitly list hashtags. 
# Convert each value in dictionary to numpy arrays.

hashtags = ['gohawks', 'gopatriots', 'nfl', 'patriots', 'sb49', 'superbowl']

for key in hashtags:
    hashtag_dict_before[key] = np.array(hashtag_dict_before[key])
    hashtag_dict_during[key] = np.array(hashtag_dict_during[key])
    hashtag_dict_after[key] = np.array(hashtag_dict_after[key])

In [7]:
# Find how many time windows there are

ftt = int(np.min([np.min(hashtag_dict_before[key][:,0]) for key in hashtags])) # first tweet time
ltt = int(np.max([np.max(hashtag_dict_after[key][:,0]) for key in hashtags])) # last tweet time

num_windows_before = int(np.max([((start_unix_time - ftt) // 3600) + 1 for key in hashtags]))
num_windows_during = int(np.max([((end_unix_time - start_unix_time) // 3600 * 12) for key in hashtags]))
num_windows_after = int(np.max([((ltt - end_unix_time) // 3600) + 1 for key in hashtags]))

In [8]:
""" Organize data into specific time periods:
     before 2/1 8am with 1-hour windows, 
     between 2/1 8am and 2/1 8pm with 5-min windows,
     and after 2/1 8pm with 1-hour windows """

# Initialize dictionary for each time frame.
data_hashtag_before = {}
data_hashtag_during = {}
data_hashtag_after = {}

# Iterate through each hashtag.
for key in hashtags:
    print(key)
    
    # Rename the dictionary value for readability
    temp_before = hashtag_dict_before[key]
    temp_during = hashtag_dict_during[key]
    temp_after = hashtag_dict_after[key]
    
    data_hashtag_before[key] = np.zeros((num_windows_before, 5)) # Initialize array: rows = time window, columns = feature
    num_followers_before = {} # Initialize dictionary to count # of followers for each tweet
    
    
    
    # Iterate through all elements before start time
    for i in range(np.shape(temp_before)[0]):
        # Get row number
        item_before = int(num_windows_before - 1 - ((start_unix_time - temp_before[i,0] - 1) // 3600))
        # Update first 3 elements (# of tweets, total # retweets, total # followers)
        data_hashtag_before[key][item_before] += np.array([1, int(temp_before[i, 1]), int(temp_before[i, 2]), 0, 0])
        # Get time of day (hour)
        dt_obj_pst = datetime.fromtimestamp(temp_before[i,0], pst_tz)
        data_hashtag_before[key][item_before][4] = int(datetime.strftime(dt_obj_pst, '%H'))
        # Get number of followers
        if item_before not in num_followers_before.keys():
            num_followers_before[item_before] = []
        num_followers_before[item_before].append(temp_before[i,2])
    for i in num_followers_before.keys():
        data_hashtag_before[key][i][3] = np.max(num_followers_before[i])
        
        
    # Iterate through all elements during time
    data_hashtag_during[key] = np.zeros((num_windows_during, 5))
    num_followers_during = {}
    for i in range(np.shape(temp_during)[0]):
        item_during = int(((temp_during[i,0] - start_unix_time) * 12) // 3600)
        data_hashtag_during[key][item_during] += np.array([1, int(temp_during[i, 1]), int(temp_during[i, 2]), 0, 0])
        dt_obj_pst = datetime.fromtimestamp(temp_during[i,0], pst_tz)
        data_hashtag_during[key][item_during][4] = int(datetime.strftime(dt_obj_pst, '%H'))
        
        if item_during not in num_followers_during.keys():
            num_followers_during[item_during] = []
        num_followers_during[item_during].append(temp_during[i,2])
    for i in num_followers_during.keys():
        data_hashtag_during[key][i][3] = np.max(num_followers_during[i])
        
    # Iterate through all elements after end time
    data_hashtag_after[key] = np.zeros((num_windows_after, 5))
    num_followers_after = {}
    for i in range(np.shape(temp_after)[0]):
        item_after = int((temp_after[i,0] - end_unix_time) // 3600)
        data_hashtag_after[key][item_after] += np.array([1, int(temp_after[i, 1]), int(temp_after[i, 2]), 0, 0])
        dt_obj_pst = datetime.fromtimestamp(temp_after[i,0], pst_tz)
        data_hashtag_after[key][item_after][4] = int(datetime.strftime(dt_obj_pst, '%H'))
        
        if item_after not in num_followers_after.keys():
            num_followers_after[item_after] = []
        num_followers_after[item_after].append(temp_after[i,2])
    for i in num_followers_after.keys():
        data_hashtag_after[key][i][3] = np.max(num_followers_after[i])
        
print('done')

gohawks
gopatriots
nfl
patriots
sb49
superbowl
done


In [9]:
""" Aggregate data within each time period by combining all hashtags. """

# Initialize aggregated data variables
data_aggregate_before = np.zeros([num_windows_before, 5])
data_aggregate_during = np.zeros([num_windows_during, 5])
data_aggregate_after = np.zeros([num_windows_after, 5])

# Sum the # of tweets, total # of retweets, and # of followers
for key in hashtags:
    data_aggregate_before[:,0:3] += data_hashtag_before[key][:,0:3]
    data_aggregate_during[:,0:3] += data_hashtag_during[key][:,0:3]
    data_aggregate_after[:,0:3] += data_hashtag_after[key][:,0:3]
# Find the max # of followers for each
data_aggregate_before[:,3] = np.amax([data_hashtag_before[key][:,3] for key in hashtags], axis=0)
data_aggregate_during[:,3] = np.amax([data_hashtag_during[key][:,3] for key in hashtags], axis=0)
data_aggregate_after[:,3] = np.amax([data_hashtag_after[key][:,3] for key in hashtags], axis=0)

# Copy over the same time frames
data_aggregate_before[:,4] = data_hashtag_before['superbowl'][:,4]
data_aggregate_during[:,4] = data_hashtag_during['superbowl'][:,4]
data_aggregate_after[:,4] = data_hashtag_after['superbowl'][:,4]

In [10]:
""" Get data for the whole time frame with 1-hour windows, separated by hashtag """

# Initialize dictionary to store data.
# Key: hashtag
# Value: data separated by 1-hour time windows
data_hashtag_all = {}

for key in hashtags: # Iterate through all hashtags
    temp_during = np.zeros([12, 5]) # Initialize array to store data in the middle time period
    # Combine data in the middle time period
    for i in range(np.shape(data_hashtag_during[key])[0]):
        hour = int(data_hashtag_during[key][i,4] - 8)
        temp_during[hour, :3] += data_hashtag_during[key][i, :3]
        if not i % 12:
            temp_during[hour, 3] = np.max(data_hashtag_during[key][i:(i+12), 3])
            temp_during[hour, 4] = data_hashtag_during[key][i,4]
    data_hashtag_all[key] = np.vstack((data_hashtag_before[key], temp_during, data_hashtag_after[key]))

In [11]:
""" Combine data for the whole time frame from all hashtags """

data_all = np.zeros([587, 5])
for key in hashtags:
    data_all += data_hashtag_all[key]

### Nonlinear Regressions: Ensemble methods

In [17]:
# Perform GridSearch
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import KFold
import time
import pandas as pd

# Just initialize the pipeline with any estimator you like    
pipe = Pipeline(steps=[('estimator', RandomForestRegressor())])

param_grid = [
    {
        'estimator':[RandomForestRegressor(), GradientBoostingRegressor()],
        'estimator__max_depth': [10, 20, 40, 60, 80, 100, 200, None],
        'estimator__max_features': ['auto', 'sqrt'],
        'estimator__min_samples_leaf': [1, 2, 4],
        'estimator__min_samples_split': [2, 5, 10],
        'estimator__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
    }
]

In [18]:
def show_results(grid_search_cv, top_results=15):
    print("Top {} grid search scores on the basis of mean validation accuracy: ".format(top_results))
    means = grid_search_cv.cv_results_['mean_test_score']
    stds = grid_search_cv.cv_results_['std_test_score']
    params = grid_search_cv.cv_results_['params']
    
    np_rep = np.array([means, stds, params])
    np_rep = np_rep.T
    
    # sort this array
    sorted_gs = np_rep[(-np_rep[:,0]).argsort()]
    
    for i in range(top_results):
        mean, std, param = sorted_gs[i]
        print("%0.6f (+/-%0.06f) for %r" % (mean, std * 2, param))
    print()
    
    print("Best estimator: ")
    print(grid_search_cv.best_estimator_)
    print()

    print("Best cross-val score: ")
    print(grid_search_cv.best_score_)
    print()

    print("Optimal params: ")
    print(grid_search_cv.best_params_)
    print()

##### Question 8

##### Analysis of aggregated data in 1-hour windows

In [19]:
# Define train data and targets
y = data_all[1:,0] # Number of tweets (except first)
X = np.delete(data_all, -1, 0) # Delete last row

print('X shape:', X.shape)
print('y shape:', y.shape)

X shape: (586, 5)
y shape: (586,)


In [20]:
# Random Forest regressor grid search
print('performing grid search...')

gs_cv_all = GridSearchCV(pipe, param_grid, cv=KFold(5, shuffle=True), scoring='neg_mean_squared_error',
                         verbose=1, n_jobs=4)
gs_cv_all.fit(X, y)

performing grid search...
Fitting 5 folds for each of 2880 candidates, totalling 14400 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   18.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  2.8min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  4.9min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  7.8min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed: 11.1min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed: 15.2min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed: 19.8min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed: 25.1min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed: 31.1min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed: 37.5min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed: 44.4min
[Parallel(n_jobs=4)]: Done 8442 tasks      | elapsed: 47.3min
[Parallel(n_jobs=4)]: Done 9792 tasks      | elapsed: 51.1min
[Parallel(n_jobs=4)]: Done 11242 tasks      | elapsed: 56.0mi

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('estimator', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid=[{'estimator__max_features': ['auto', 'sqrt'], 'estimator': [RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_sample...0, 200, None], 'estimator__n_estimators': [200, 400, 600, 800, 1000, 1200, 14

In [26]:
show_results(gs_cv_all)
results = pd.DataFrame(gs_cv_all.cv_results_)
results.to_csv('gs_q8.csv')

Top 15 grid search scores on the basis of mean validation accuracy: 
-206197689.412178 (+/-618071666.379951) for {'estimator__max_features': 'sqrt', 'estimator': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=60,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=4, min_samples_split=5,
             min_weight_fraction_leaf=0.0, n_estimators=800,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False), 'estimator__min_samples_leaf': 4, 'estimator__min_samples_split': 5, 'estimator__max_depth': 60, 'estimator__n_estimators': 800}
-206520173.441280 (+/-621256005.337476) for {'estimator__max_features': 'sqrt', 'estimator': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=Non

             warm_start=False), 'estimator__min_samples_leaf': 4, 'estimator__min_samples_split': 5, 'estimator__max_depth': 80, 'estimator__n_estimators': 1600}
-210140360.833795 (+/-622016728.199475) for {'estimator__max_features': 'sqrt', 'estimator': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=60,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=4, min_samples_split=5,
             min_weight_fraction_leaf=0.0, n_estimators=800,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False), 'estimator__min_samples_leaf': 4, 'estimator__min_samples_split': 2, 'estimator__max_depth': 20, 'estimator__n_estimators': 1400}
-210616476.425294 (+/-623228268.572052) for {'estimator__max_features'

In [34]:
results.sort_values('mean_test_score', ascending=False).iloc[:, 0:12]

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_estimator,param_estimator__max_depth,param_estimator__max_features,param_estimator__min_samples_leaf,param_estimator__min_samples_split,param_estimator__n_estimators,params,rank_test_score
2143,0.565929,0.010804,-2.061977e+08,-5.684975e+04,([DecisionTreeRegressor(criterion='friedman_ms...,60,sqrt,4,5,800,"{'estimator__max_features': 'sqrt', 'estimator...",1
2866,0.977620,0.017604,-2.065202e+08,-2.931519e+03,([DecisionTreeRegressor(criterion='friedman_ms...,,sqrt,4,5,1400,"{'estimator__max_features': 'sqrt', 'estimator...",2
2695,0.831192,0.015205,-2.066646e+08,-8.750317e+03,([DecisionTreeRegressor(criterion='friedman_ms...,200,sqrt,4,10,1200,"{'estimator__max_features': 'sqrt', 'estimator...",3
2139,1.338304,0.023205,-2.074178e+08,-1.533645e+02,([DecisionTreeRegressor(criterion='friedman_ms...,60,sqrt,4,2,2000,"{'estimator__max_features': 'sqrt', 'estimator...",4
1609,0.736765,0.009202,-2.079053e+08,-1.304299e+02,([DecisionTreeRegressor(criterion='friedman_ms...,10,sqrt,4,5,2000,"{'estimator__max_features': 'sqrt', 'estimator...",5
1598,0.665551,0.008600,-2.088445e+08,-4.364090e+02,([DecisionTreeRegressor(criterion='friedman_ms...,10,sqrt,4,2,1800,"{'estimator__max_features': 'sqrt', 'estimator...",6
1954,0.704565,0.013003,-2.088784e+08,-1.806359e+04,([DecisionTreeRegressor(criterion='friedman_ms...,40,sqrt,4,2,1000,"{'estimator__max_features': 'sqrt', 'estimator...",7
2864,0.706164,0.013203,-2.091391e+08,-2.033681e+04,([DecisionTreeRegressor(criterion='friedman_ms...,,sqrt,4,5,1000,"{'estimator__max_features': 'sqrt', 'estimator...",8
1955,0.844197,0.015603,-2.092728e+08,-7.293323e+03,([DecisionTreeRegressor(criterion='friedman_ms...,40,sqrt,4,2,1200,"{'estimator__max_features': 'sqrt', 'estimator...",9
1596,0.519716,0.006801,-2.095378e+08,-2.680139e+03,([DecisionTreeRegressor(criterion='friedman_ms...,10,sqrt,4,2,1400,"{'estimator__max_features': 'sqrt', 'estimator...",10


#### Question 9

In [12]:
# Define train data and targets
y = data_all[1:,0] # Number of tweets (except first)
X = np.delete(data_all, -1, 0) # Delete last row

print('X shape:', X.shape)
print('y shape:', y.shape)

X shape: (586, 5)
y shape: (586,)


In [13]:
import statsmodels.api as sm

model = sm.OLS(y,X)
ols_results = model.fit()

In [14]:
print(ols_results.params)
print(ols_results.tvalues)

[ 1.61392815e+00 -4.77959471e-01  5.35862092e-05 -8.73389384e-06
  2.03607311e+00]
[ 25.6642281  -10.69125407   4.00537976  -0.11285542   0.26233519]


In [16]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

errors = []

for train, test in KFold(n_splits=10).split(X):
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]

    lineal_model = sm.OLS(y_train, X_train).fit()
    y_pred = lineal_model.predict(X_test)
    errors.append(mean_squared_error(y_test, y_pred))
    
print(np.average(np.array(errors)))

215518342.9196384


#### Question 10

In [38]:
# only testing GradientBoostingRegressor, like the specs mention
param_grid = [
    {
        'estimator':[GradientBoostingRegressor()],
        'estimator__max_depth': [10, 20, 40, 60, 80, 100, 200, None],
        'estimator__max_features': ['auto', 'sqrt'],
        'estimator__min_samples_leaf': [1, 2, 4],
        'estimator__min_samples_split': [2, 5, 10],
        'estimator__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
    }
]

##### Analysis of data aggregated before Feb 1, 8:00 am

In [39]:
# Define train data and targets for BEFORE period
y_before = data_aggregate_before[1:,0] # Number of tweets (except first)
X_before = np.delete(data_aggregate_before, -1, 0) # Delete last row

print('X shape:', X_before.shape)
print('y shape:', y_before.shape)

X shape: (439, 5)
y shape: (439,)


In [40]:
# Random Forest regressor grid search
print('performing grid search...')

gs_cv_all = GridSearchCV(pipe, param_grid, cv=KFold(5, shuffle=True), scoring='neg_mean_squared_error',
                         verbose=1, n_jobs=4)
gs_cv_all.fit(X_before, y_before)

performing grid search...
Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   18.5s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   47.6s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:  4.2min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:  5.6min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:  7.2min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed:  8.9min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed: 10.7min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed: 12.8min
[Parallel(n_jobs=4)]: Done 7200 out of 7200 | elapsed: 12.8min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('estimator', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid=[{'estimator__max_features': ['auto', 'sqrt'], 'estimator': [GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=10,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, mi...0, 200, None], 'estimator__n_estimators': [200, 400, 600, 800, 1000, 1200, 14

In [41]:
show_results(gs_cv_all)
results = pd.DataFrame(gs_cv_all.cv_results_)
results.to_csv('gs_q10_1.csv')

Top 15 grid search scores on the basis of mean validation accuracy: 
-5689844.462268 (+/-11366914.605459) for {'estimator__max_features': 'sqrt', 'estimator': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=10,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=2, min_samples_split=10,
             min_weight_fraction_leaf=0.0, n_estimators=400,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False), 'estimator__min_samples_leaf': 2, 'estimator__min_samples_split': 10, 'estimator__max_depth': 10, 'estimator__n_estimators': 400}
-5695530.230750 (+/-11499751.227911) for {'estimator__max_features': 'sqrt', 'estimator': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
 

             warm_start=False), 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__max_depth': 20, 'estimator__n_estimators': 800}
-5897017.168356 (+/-11137393.370677) for {'estimator__max_features': 'sqrt', 'estimator': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=10,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=2, min_samples_split=10,
             min_weight_fraction_leaf=0.0, n_estimators=400,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False), 'estimator__min_samples_leaf': 2, 'estimator__min_samples_split': 2, 'estimator__max_depth': 20, 'estimator__n_estimators': 2000}
-5901773.987906 (+/-10866920.077775) for {'estimator__max_features': 'sqr

In [42]:
results.sort_values('mean_test_score', ascending=False).iloc[:, 0:12]

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_estimator,param_estimator__max_depth,param_estimator__max_features,param_estimator__min_samples_leaf,param_estimator__min_samples_split,param_estimator__n_estimators,params,rank_test_score
141,0.128429,0.002200,-5.689844e+06,-5.711139e-01,([DecisionTreeRegressor(criterion='friedman_ms...,10,sqrt,2,10,400,"{'estimator__max_features': 'sqrt', 'estimator...",1
321,0.180044,0.003001,-5.695530e+06,-7.716574e-02,([DecisionTreeRegressor(criterion='friedman_ms...,20,sqrt,2,10,400,"{'estimator__max_features': 'sqrt', 'estimator...",2
685,0.386286,0.005601,-5.742796e+06,-9.941796e-08,([DecisionTreeRegressor(criterion='friedman_ms...,60,sqrt,2,10,1200,"{'estimator__max_features': 'sqrt', 'estimator...",3
140,0.065815,0.001200,-5.775355e+06,-2.062176e+02,([DecisionTreeRegressor(criterion='friedman_ms...,10,sqrt,2,10,200,"{'estimator__max_features': 'sqrt', 'estimator...",4
1223,0.330282,0.005601,-5.813588e+06,-5.018112e-07,([DecisionTreeRegressor(criterion='friedman_ms...,200,sqrt,2,10,800,"{'estimator__max_features': 'sqrt', 'estimator...",5
1351,0.123027,0.002203,-5.815243e+06,-9.846533e-08,([DecisionTreeRegressor(criterion='friedman_ms...,,sqrt,1,2,400,"{'estimator__max_features': 'sqrt', 'estimator...",6
688,0.460103,0.006001,-5.863930e+06,-9.950413e-08,([DecisionTreeRegressor(criterion='friedman_ms...,60,sqrt,2,10,1800,"{'estimator__max_features': 'sqrt', 'estimator...",7
502,0.280263,0.005001,-5.866801e+06,-5.576346e-04,([DecisionTreeRegressor(criterion='friedman_ms...,40,sqrt,2,10,600,"{'estimator__max_features': 'sqrt', 'estimator...",8
142,0.192243,0.003001,-5.870778e+06,-2.062107e-03,([DecisionTreeRegressor(criterion='friedman_ms...,10,sqrt,2,10,600,"{'estimator__max_features': 'sqrt', 'estimator...",9
1225,0.379095,0.005401,-5.883174e+06,-9.916614e-08,([DecisionTreeRegressor(criterion='friedman_ms...,200,sqrt,2,10,1200,"{'estimator__max_features': 'sqrt', 'estimator...",10


##### Analysis of data aggregated between Feb 1, 8:00 am and Feb 1, 8:00 pm

In [43]:
y_during = data_aggregate_during[1:,0]
X_during = np.delete(data_aggregate_during, -1, 0)

print('X shape:', X_during.shape)
print('y shape:', y_during.shape)

X shape: (143, 5)
y shape: (143,)


In [44]:
# Random Forest regressor grid search
print('performing grid search...')

gs_cv_all = GridSearchCV(pipe, param_grid, cv=KFold(5, shuffle=True), scoring='neg_mean_squared_error',
                         verbose=1, n_jobs=4)
gs_cv_all.fit(X_during, y_during)

performing grid search...
Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    3.3s
[Parallel(n_jobs=4)]: Done 376 tasks      | elapsed:   20.3s
[Parallel(n_jobs=4)]: Done 876 tasks      | elapsed:   46.8s
[Parallel(n_jobs=4)]: Done 1576 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done 2476 tasks      | elapsed:  2.3min
[Parallel(n_jobs=4)]: Done 3576 tasks      | elapsed:  3.3min
[Parallel(n_jobs=4)]: Done 4876 tasks      | elapsed:  4.5min
[Parallel(n_jobs=4)]: Done 6376 tasks      | elapsed:  5.9min
[Parallel(n_jobs=4)]: Done 7200 out of 7200 | elapsed:  6.7min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('estimator', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid=[{'estimator__max_features': ['auto', 'sqrt'], 'estimator': [GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=20,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, mi...0, 200, None], 'estimator__n_estimators': [200, 400, 600, 800, 1000, 1200, 14

In [45]:
show_results(gs_cv_all)
results = pd.DataFrame(gs_cv_all.cv_results_)
results.to_csv('gs_q10_2.csv')

Top 15 grid search scores on the basis of mean validation accuracy: 
-28482971.277054 (+/-24536737.150611) for {'estimator__max_features': 'sqrt', 'estimator': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=20,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=200,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False), 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__max_depth': 20, 'estimator__n_estimators': 200}
-29086568.141361 (+/-22003396.812168) for {'estimator__max_features': 'sqrt', 'estimator': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
 

             warm_start=False), 'estimator__min_samples_leaf': 2, 'estimator__min_samples_split': 2, 'estimator__max_depth': 100, 'estimator__n_estimators': 400}
-30186568.644052 (+/-23521339.243215) for {'estimator__max_features': 'sqrt', 'estimator': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=20,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=200,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False), 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__max_depth': 80, 'estimator__n_estimators': 1000}
-30201730.540017 (+/-24775286.582863) for {'estimator__max_features': 's

In [46]:
results.sort_values('mean_test_score', ascending=False).iloc[:, 0:12]

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_estimator,param_estimator__max_depth,param_estimator__max_features,param_estimator__min_samples_leaf,param_estimator__min_samples_split,param_estimator__n_estimators,params,rank_test_score
270,0.045214,0.001000,-2.848297e+07,-9.900099e-08,([DecisionTreeRegressor(criterion='friedman_ms...,20,sqrt,1,2,200,"{'estimator__max_features': 'sqrt', 'estimator...",1
92,0.097826,0.001000,-2.908657e+07,-9.872871e-08,([DecisionTreeRegressor(criterion='friedman_ms...,10,sqrt,1,2,600,"{'estimator__max_features': 'sqrt', 'estimator...",2
1375,0.192643,0.001800,-2.913824e+07,-9.808062e-08,([DecisionTreeRegressor(criterion='friedman_ms...,,sqrt,1,10,1200,"{'estimator__max_features': 'sqrt', 'estimator...",3
1357,0.228451,0.002000,-2.946787e+07,-9.836797e-08,([DecisionTreeRegressor(criterion='friedman_ms...,,sqrt,1,2,1600,"{'estimator__max_features': 'sqrt', 'estimator...",4
520,0.041209,0.000600,-2.952291e+07,-1.194109e+05,([DecisionTreeRegressor(criterion='friedman_ms...,40,sqrt,4,5,200,"{'estimator__max_features': 'sqrt', 'estimator...",5
845,0.201045,0.001600,-2.961121e+07,-9.886103e-08,([DecisionTreeRegressor(criterion='friedman_ms...,80,sqrt,2,2,1200,"{'estimator__max_features': 'sqrt', 'estimator...",6
279,0.283263,0.001600,-2.986481e+07,-9.895955e-08,([DecisionTreeRegressor(criterion='friedman_ms...,20,sqrt,1,2,2000,"{'estimator__max_features': 'sqrt', 'estimator...",7
1174,0.150634,0.001200,-2.990555e+07,-9.868195e-08,([DecisionTreeRegressor(criterion='friedman_ms...,200,sqrt,1,2,1000,"{'estimator__max_features': 'sqrt', 'estimator...",8
678,0.281063,0.001800,-2.991034e+07,-9.950403e-08,([DecisionTreeRegressor(criterion='friedman_ms...,60,sqrt,2,5,1800,"{'estimator__max_features': 'sqrt', 'estimator...",9
994,0.149834,0.001400,-3.014752e+07,-9.901552e-08,([DecisionTreeRegressor(criterion='friedman_ms...,100,sqrt,1,2,1000,"{'estimator__max_features': 'sqrt', 'estimator...",10


##### Analysis of data aggregated after Feb 1, 8:00 pm

In [47]:
y_after = data_aggregate_after[1:,0]
X_after = np.delete(data_aggregate_after, -1, 0)

print('X shape:', X_after.shape)
print('y shape:', y_after.shape)

X shape: (134, 5)
y shape: (134,)


In [48]:
# Random Forest regressor grid search
print('performing grid search...')

gs_cv_all = GridSearchCV(pipe, param_grid, cv=KFold(5, shuffle=True), scoring='neg_mean_squared_error',
                         verbose=1, n_jobs=4)
gs_cv_all.fit(X_after, y_after)

performing grid search...
Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 144 tasks      | elapsed:    6.6s
[Parallel(n_jobs=4)]: Done 562 tasks      | elapsed:   28.0s
[Parallel(n_jobs=4)]: Done 1062 tasks      | elapsed:   52.5s
[Parallel(n_jobs=4)]: Done 1762 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 2662 tasks      | elapsed:  2.3min
[Parallel(n_jobs=4)]: Done 3762 tasks      | elapsed:  3.2min
[Parallel(n_jobs=4)]: Done 5062 tasks      | elapsed:  4.3min
[Parallel(n_jobs=4)]: Done 6562 tasks      | elapsed:  5.6min
[Parallel(n_jobs=4)]: Done 7200 out of 7200 | elapsed:  6.2min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('estimator', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid=[{'estimator__max_features': ['auto', 'sqrt'], 'estimator': [GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=10,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, mi...0, 200, None], 'estimator__n_estimators': [200, 400, 600, 800, 1000, 1200, 14

In [49]:
show_results(gs_cv_all)
results = pd.DataFrame(gs_cv_all.cv_results_)
results.to_csv('gs_q10_3.csv')

Top 15 grid search scores on the basis of mean validation accuracy: 
-331572.339233 (+/-302518.998931) for {'estimator__max_features': 'sqrt', 'estimator': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=10,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=4, min_samples_split=5,
             min_weight_fraction_leaf=0.0, n_estimators=2000,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False), 'estimator__min_samples_leaf': 4, 'estimator__min_samples_split': 5, 'estimator__max_depth': 10, 'estimator__n_estimators': 2000}
-336085.514476 (+/-283271.289248) for {'estimator__max_features': 'sqrt', 'estimator': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
       

             warm_start=False), 'estimator__min_samples_leaf': 4, 'estimator__min_samples_split': 5, 'estimator__max_depth': 10, 'estimator__n_estimators': 1600}
-345104.871926 (+/-297758.086928) for {'estimator__max_features': 'sqrt', 'estimator': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=10,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=4, min_samples_split=5,
             min_weight_fraction_leaf=0.0, n_estimators=2000,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False), 'estimator__min_samples_leaf': 4, 'estimator__min_samples_split': 5, 'estimator__max_depth': 20, 'estimator__n_estimators': 400}
-345130.447981 (+/-314781.234756) for {'estimator__max_features': 'sqrt', 'e

In [50]:
results.sort_values('mean_test_score', ascending=False).iloc[:, 0:12]

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_estimator,param_estimator__max_depth,param_estimator__max_features,param_estimator__min_samples_leaf,param_estimator__min_samples_split,param_estimator__n_estimators,params,rank_test_score
169,0.345678,0.003001,-331572.339233,-1.110052e-07,([DecisionTreeRegressor(criterion='friedman_ms...,10,sqrt,4,5,2000,"{'estimator__max_features': 'sqrt', 'estimator...",1
1236,0.265866,0.003000,-336085.514476,-8.297246e-07,([DecisionTreeRegressor(criterion='friedman_ms...,200,sqrt,4,2,1400,"{'estimator__max_features': 'sqrt', 'estimator...",2
700,0.039609,0.000400,-337121.354505,-4.634730e+02,([DecisionTreeRegressor(criterion='friedman_ms...,60,sqrt,4,5,200,"{'estimator__max_features': 'sqrt', 'estimator...",3
344,0.197244,0.002400,-338091.737329,-3.207575e-04,([DecisionTreeRegressor(criterion='friedman_ms...,20,sqrt,4,5,1000,"{'estimator__max_features': 'sqrt', 'estimator...",4
525,0.231651,0.003001,-338362.920420,-1.345420e-05,([DecisionTreeRegressor(criterion='friedman_ms...,40,sqrt,4,5,1200,"{'estimator__max_features': 'sqrt', 'estimator...",5
1410,0.039409,0.000600,-338972.416711,-4.568849e+02,([DecisionTreeRegressor(criterion='friedman_ms...,,sqrt,4,2,200,"{'estimator__max_features': 'sqrt', 'estimator...",6
890,0.038813,0.000600,-340831.686698,-6.154387e+02,([DecisionTreeRegressor(criterion='friedman_ms...,80,sqrt,4,10,200,"{'estimator__max_features': 'sqrt', 'estimator...",7
888,0.323273,0.003201,-343294.875047,-1.255015e-07,([DecisionTreeRegressor(criterion='friedman_ms...,80,sqrt,4,5,1800,"{'estimator__max_features': 'sqrt', 'estimator...",8
521,0.078820,0.001000,-343313.807463,-7.297854e+00,([DecisionTreeRegressor(criterion='friedman_ms...,40,sqrt,4,5,400,"{'estimator__max_features': 'sqrt', 'estimator...",9
1237,0.292070,0.003000,-344354.934503,-2.197247e-07,([DecisionTreeRegressor(criterion='friedman_ms...,200,sqrt,4,2,1600,"{'estimator__max_features': 'sqrt', 'estimator...",10
