In [2]:
import json
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import datetime, time
import pytz
from itertools import compress
from sklearn import datasets, linear_model
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
import matplotlib.pyplot as plt
import os
import math
from sklearn import cross_validation


def cv(X, y, n_splits=10, verbose=True):
    kf = KFold(n_splits=n_splits)
    rmses = [] 
    for reg_name in ['Linear','RandomForest','KNeighbors']:
        mses = []
        if reg_name == 'Linear':
            regr = linear_model.LinearRegression()
        elif reg_name == 'RandomForest':
            regr = RandomForestRegressor()
        elif reg_name == 'KNeighbors':
            regr = KNeighborsRegressor(n_neighbors=5)
        scores =cross_val_score(regr, X, y, scoring='neg_mean_squared_error',  cv=10)
        print(reg_name)
        print(math.sqrt(np.mean(np.abs(scores))))
        
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            #lr = sm.OLS(y_train, X_train).fit()
            #lr = sm.RLM(y_train, X_train).fit()
            #lr = sm.GLM(y_train, X_train).fit()
            regr.fit(X_train, y_train)
            y_pred = regr.predict(X_test)
            mses.append(mean_squared_error(y_test, y_pred))
        
        '''
        if display_summary:
            print(lr.summary())
        '''
        
        rmses.append(math.sqrt(np.mean(np.abs(scores))))
        if verbose: 
            print(reg_name, " RMSE: ", math.sqrt(np.mean(mses)))
        
    return rmses

def build_matrix(raw_df, index='date'):
    
    raw_df = raw_df.set_index(index)
    time_series = raw_df.groupby(pd.Grouper(freq='60Min'))
    print(time_series)
    X = np.zeros((len(time_series), 8))
    y = np.zeros((len(time_series), 1))
    for i, (time_interval, g) in enumerate(time_series):
    
        X[i, 0] = g.total_tweets.sum()
        X[i, 1] = g.total_retweets.sum()
        X[i, 2] = g.sum_followers.sum()
        X[i, 3] = g.max_followers.max()
        X[i, 4] = time_interval.hour     #store the hour of the day -> preserve order
        X[i, 5] = g.total_replies.sum()
        X[i, 6] = g.total_ranking.sum()
        X[i, 7] = g.total_impressions.sum()
        
        y[i, 0] = g.total_tweets.sum()
    print('finished building matrix')
    return np.nan_to_num(X[:-1]), y[1:]

def filter_and_test(df):
    #before Feb 1, 8:00am
    first_date_marker = datetime.datetime(2015, 2, 1, 8, 0, 0, 0)

    #end at 8pm
    second_date_marker = datetime.datetime(2015, 2, 1, 20, 0, 0, 0)

    ###Set up the data by filtering via index
    #Before Feb. 1, 8:00 a.m.
    #sort out the times in the dataframe before this period
    df_p1 = df[df.date < first_date_marker]


    #Between Feb. 1, 8:00 a.m. and 8:00 p.m. 
    df_p2 = df[(df.date > first_date_marker) &
               (df.date < second_date_marker)]

    #After Feb. 1, 8:00 p.m.
    df_p3 = df[df.date > second_date_marker]
    
    print("Before Feb. 1, 8:00 a.m.")
    X_df_p1, y_df_p1 = build_matrix(df_p1, index='date')
    errors_df_p1 = cv(X_df_p1, y_df_p1) #default splits = 10 no need to specify
    #errors_df_p1 = cross_val(X_df_p1, y_df_p1)
    
    print("Between Feb. 1, 8:00 a.m. and 8:00 p.m.")
    X_df_p2, y_df_p2 = build_matrix(df_p2, index='date')
    errors_df_p2 = cv(X_df_p2, y_df_p2)
    #errors_df_p2 = cross_val(X_df_p2, y_df_p2)
    
    print("After Feb. 1, 8:00 p.m.")
    X_df_p3, y_df_p3 = build_matrix(df_p3, index='date')
    errors_df_p3 = cv(X_df_p3, y_df_p3)
    #errors_df_p3 = cross_val(X_df_p3, y_df_p3)
    
    return[errors_df_p1, errors_df_p2, errors_df_p3]
    #return [X_df_p1, y_df_p1, X_df_p2, y_df_p2, X_df_p3, y_df_p3]

data_dir = 'C:/Users/tians/Desktop/tweet_data' # MAKE SURE TO CHANGE THIS TO WHERE EVER YOUR DATA IS. 
# The total data size is ~14gb which is too large to be committed into github

hashtags = ['gohawks', 'gopatriots', 'nfl', 'patriots', 'sb49', 'superbowl'] 
#hashtags = ['gopatriots']

data = {}
for hashtag in hashtags:
    file_name = data_dir + '/tweets_#' + hashtag + '.txt' 
    with open(file_name, 'rb') as f:
        tweets = []
        for i, l in enumerate(f):
            tweet = json.loads(l)
            tweets.append(tweet)
        data[hashtag] = tweets
        
print('done loading')

pst_tz = pytz.timezone('US/Pacific')
hashtag_dict = {}

for hashtag in hashtags:
    total_tweets = len(data[hashtag])
    hashtag_dict[hashtag] = total_tweets
    min_utc = data[hashtag][0]['citation_date']//3600*3600
    max_utc = data[hashtag][-1]['citation_date']//3600*3600
    print(min_utc, max_utc)
    bins = np.arange(min_utc, max_utc+3600, 3600)
    x = []

max_utc = data[hashtag][-1]['citation_date']//3600*3600
bins = np.arange(min_utc, max_utc+3600, 3600) # [0, 5, 10, 15]


print(datetime.datetime.fromtimestamp(min_utc, pst_tz))
#datetime args: Attributes: year, month, day, hour, minute, second, microsecond, and tzinfo.


df_aggregated = None
cv_dict = {}

for hashtag, num_tweets in hashtag_dict.items():
    print("---")
    print("Porcessing: ", hashtag)
    print("---")
    with open(os.path.join('C:/Users/tians/Desktop/tweet_data','tweets_#' + hashtag +'.txt'), 'rb') as file:
        df = pd.DataFrame(index=range(num_tweets),
                         columns=['date', 'total_tweets', 'total_retweets', 'sum_followers',
                                 'max_followers', 'total_replies', 'total_ranking',
                                 'total_impressions'])
        
#total_tweets, total_retweets, sum_followers, 
#max_followers, time_of_day, toatl_replies, total_ranking, total_impressions
        for i, l in enumerate(file):
            tweet = json.loads(l)
            
            #get the date for sorting
            date = datetime.datetime.fromtimestamp(tweet['firstpost_date'])
            df.set_value(i, 'date', date)
            df.set_value(i, 'total_tweets', 1)
            df.set_value(i, 'total_retweets', tweet['metrics']['citations']['total'])

            #will sum and take max in post processing
            df.set_value(i, 'sum_followers', tweet['author']['followers'])
            df.set_value(i, 'max_followers', tweet['author']['followers'])
            df.set_value(i, 'total_replies', tweet['metrics']['citations']['replies'])
            df.set_value(i, 'total_ranking', tweet['metrics']['ranking_score'])
            df.set_value(i, 'total_impressions', tweet['metrics']['impressions'])

        print('/tweets_#',hashtag)
        #temps = filter_and_test(df)
        cv_dict[hashtag] = filter_and_test(df)
        if df_aggregated is None: #first iteration 
            df_aggregated = df
        else: #aggregate
            df_aggregated = pd.concat([df_aggregated, df])


all_cv_scores = filter_and_test(df_aggregated)



print('-----------Problem 1.4 Results ---------------')
print(cv_dict)
print('aggregate the data of all hashtags cv scores: ', all_cv_scores)
print('----------------------------------------------')

done loading
1421517600 1423303200
1421254800 1423292400
1421514000 1423332000
1421247600 1423332000
1421236800 1423332000
1421366400 1423332000
2015-01-15 16:00:00-08:00
---
Porcessing:  gohawks
---




/tweets_# gohawks
Before Feb. 1, 8:00 a.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x0000014A59146AC8>
finished building matrix
Linear
1173.4968218378565
Linear  RMSE:  1173.4968218378565


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RandomForest
634.0600558475293




RandomForest  RMSE:  693.7336452887937
KNeighbors
804.2340663910537
KNeighbors  RMSE:  804.2340663910537
Between Feb. 1, 8:00 a.m. and 8:00 p.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581A91A7B8>
finished building matrix
Linear
77499.57334265517
Linear  RMSE:  77499.57334265517


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RandomForest
4087.8909606299435
RandomForest  RMSE:  3898.621236539913
KNeighbors
3581.8054869018224
KNeighbors  RMSE:  3581.8054869018224
After Feb. 1, 8:00 p.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581A91AD68>
finished building matrix
Linear
582.4558967268821
Linear  RMSE:  582.4558967268821
RandomForest
78.15511730433715


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RandomForest  RMSE:  80.46802687913873
KNeighbors
102.2995725233568
KNeighbors  RMSE:  102.2995725233568
---
Porcessing:  gopatriots
---
/tweets_# gopatriots
Before Feb. 1, 8:00 a.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581C43DE48>
finished building matrix
Linear
49.13674446518472
Linear  RMSE:  49.13674446518472


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RandomForest
49.61253820471025
RandomForest  RMSE:  53.97560228479587
KNeighbors
58.23872267534049
KNeighbors  RMSE:  58.23872267534049
Between Feb. 1, 8:00 a.m. and 8:00 p.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581A91A438>
finished building matrix
Linear
65794.42955988712
Linear  RMSE:  65794.42955988712
RandomForest
958.1497769138185


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RandomForest  RMSE:  1072.9051677571508
KNeighbors
1249.5188249882433
KNeighbors  RMSE:  1249.5188249882433
After Feb. 1, 8:00 p.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581BCC1FD0>
finished building matrix
Linear
9.10416922951089
Linear  RMSE:  9.10416922951089


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RandomForest
9.93727765915867
RandomForest  RMSE:  8.42873977515677
KNeighbors
11.699920008492716
KNeighbors  RMSE:  11.699920008492716
---
Porcessing:  nfl
---




/tweets_# nfl
Before Feb. 1, 8:00 a.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x0000014A59146D30>
finished building matrix
Linear
241.16188169003343
Linear  RMSE:  241.16188169003343


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RandomForest
229.06756420464734


  estimator.fit(X_train, y_train, **fit_params)


RandomForest  RMSE:  222.2214271482158
KNeighbors
279.636909518102
KNeighbors  RMSE:  279.636909518102
Between Feb. 1, 8:00 a.m. and 8:00 p.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581BCC1B38>
finished building matrix
Linear
26189.972048220006
Linear  RMSE:  26189.972048220006


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RandomForest
2839.493118850616
RandomForest  RMSE:  2979.1219694735564
KNeighbors
3356.9990125110257
KNeighbors  RMSE:  3356.9990125110257
After Feb. 1, 8:00 p.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581C43D668>
finished building matrix
Linear
157.91836699830068
Linear  RMSE:  157.91836699830068
RandomForest
202.16810522243802


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RandomForest  RMSE:  205.02497529448817
KNeighbors
201.61990967665366
KNeighbors  RMSE:  201.61990967665366
---
Porcessing:  patriots
---




/tweets_# patriots
Before Feb. 1, 8:00 a.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581A91A6D8>
finished building matrix
Linear
610.7295029502072
Linear  RMSE:  610.7295029502072


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RandomForest
671.3943943149602




RandomForest  RMSE:  686.0134082027565
KNeighbors
691.4366702216025
KNeighbors  RMSE:  691.4366702216025
Between Feb. 1, 8:00 a.m. and 8:00 p.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581C43DF60>
finished building matrix
Linear
136601.2215009904
Linear  RMSE:  136601.2215009904
RandomForest
19936.486211667292
RandomForest  RMSE:  18365.267154032907
KNeighbors
17696.35031067141


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


KNeighbors  RMSE:  17696.35031067141
After Feb. 1, 8:00 p.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x0000015818914F98>
finished building matrix
Linear
212.84589902429693
Linear  RMSE:  212.84589902429693


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RandomForest
253.6703831465437
RandomForest  RMSE:  246.69686981581884
KNeighbors
290.1033621027111
KNeighbors  RMSE:  290.1033621027111




---
Porcessing:  sb49
---




/tweets_# sb49
Before Feb. 1, 8:00 a.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581BCC16A0>
finished building matrix
Linear
107.44757759610891
Linear  RMSE:  107.44757759610891


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RandomForest
135.75005292223975
RandomForest  RMSE:  135.33637345548055
KNeighbors
165.44717293996484




KNeighbors  RMSE:  165.44717293996484
Between Feb. 1, 8:00 a.m. and 8:00 p.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581BCC1DA0>
finished building matrix
Linear
166621.5307068395
Linear  RMSE:  166621.5307068395
RandomForest
37056.226089282216
RandomForest  RMSE:  40495.66203597615


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


KNeighbors
42303.2171929039
KNeighbors  RMSE:  42303.2171929039
After Feb. 1, 8:00 p.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581A91AE10>
finished building matrix
Linear
346.7104236331856
Linear  RMSE:  346.7104236331856
RandomForest
331.076746456099


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RandomForest  RMSE:  330.9205280080746
KNeighbors
467.98786154071405
KNeighbors  RMSE:  467.98786154071405
---
Porcessing:  superbowl
---




/tweets_# superbowl
Before Feb. 1, 8:00 a.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581C43DF60>
finished building matrix
Linear
645.0439503201999
Linear  RMSE:  645.0439503201999


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RandomForest
698.2976925803789




RandomForest  RMSE:  755.5943893050702
KNeighbors
766.9877367914282
KNeighbors  RMSE:  766.9877367914282
Between Feb. 1, 8:00 a.m. and 8:00 p.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581C43D0F0>
finished building matrix
Linear
165518.8950515691
Linear  RMSE:  165518.8950515691
RandomForest
72556.44733851707
RandomForest

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


  RMSE:  80547.73214619391
KNeighbors
85425.4580802702
KNeighbors  RMSE:  85425.4580802702
After Feb. 1, 8:00 p.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581BCC1630>
finished building matrix
Linear
593.4084238528479
Linear  RMSE:  593.4084238528479
RandomForest
604.0354304068704


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RandomForest  RMSE:  492.98363629419407
KNeighbors
654.9769957205191
KNeighbors  RMSE:  654.9769957205191
Before Feb. 1, 8:00 a.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581A91A048>
finished building matrix
Linear
2068.4410773514073
Linear  RMSE:  2068.4410773514073


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RandomForest
1885.6833750875996




RandomForest  RMSE:  1805.1997598235175
KNeighbors
2489.7495280306293
KNeighbors  RMSE:  2489.7495280306293
Between Feb. 1, 8:00 a.m. and 8:00 p.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581A91A940>
finished building matrix
Linear
3880609.9905768223
Linear  RMSE:  3880609.9905768223
RandomForest
101039.95485940698
RandomForest  RMSE:  102715.1711233691


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


KNeighbors
118672.92632181107
KNeighbors  RMSE:  118672.92632181107
After Feb. 1, 8:00 p.m.
<pandas.core.groupby.DataFrameGroupBy object at 0x000001581C43D6D8>
finished building matrix
Linear
1091.064020170654
Linear  RMSE:  1091.064020170654
RandomForest
729.5184218199876


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RandomForest  RMSE:  646.4092669560001
KNeighbors
832.6019908352712
KNeighbors  RMSE:  832.6019908352712
-----------Problem 1.4 Results ---------------
{'gohawks': [[1173.4968218378565, 634.0600558475293, 804.2340663910537], [77499.57334265517, 4087.8909606299435, 3581.8054869018224], [582.4558967268821, 78.15511730433715, 102.2995725233568]], 'gopatriots': [[49.13674446518472, 49.61253820471025, 58.23872267534049], [65794.42955988712, 958.1497769138185, 1249.5188249882433], [9.10416922951089, 9.93727765915867, 11.699920008492716]], 'nfl': [[241.16188169003343, 229.06756420464734, 279.636909518102], [26189.972048220006, 2839.493118850616, 3356.9990125110257], [157.91836699830068, 202.16810522243802, 201.61990967665366]], 'patriots': [[610.7295029502072, 671.3943943149602, 691.4366702216025], [136601.2215009904, 19936.486211667292, 17696.35031067141], [212.84589902429693, 253.6703831465437, 290.1033621027111]], 'sb49': [[107.44757759610891, 135.75005292223975, 165.44717293996484], [1666