### QUESTION 14: 
Report the model you use. For each test file, provide your predictions on the number of tweets in the next time window.
Note: Test data should not be used as a source for training. You are not bounded to only linear models. You can find your best model through cross validation of your training data.

In [1]:
tp1_files = ['sample0_period1.txt','sample1_period1.txt','sample2_period1.txt']
tp2_files = ['sample0_period2.txt','sample1_period2.txt','sample2_period2.txt']
tp3_files = ['sample0_period3.txt','sample1_period3.txt','sample2_period3.txt']

In [2]:
import pickle

def save_object(data, fileName):
    with open('pynb_data/'+fileName + ".pickle", 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        
def load_object(fileName):
    try:
        with open('pynb_data/'+fileName + ".pickle", 'rb') as f:
            data = pickle.load(f)
            return data
    except IOError:
        print("Could not read file: " + fileName)

In [3]:
hash_tags = ['#gohawks','#gopatriots','#nfl','#patriots','#sb49','#superbowl']

In [4]:
import json

def getMinAndMaxTs(tag,filename=None):
    if filename is None:
        filename = 'data/tweets_'+tag+'.txt'
    max_ts = 0
    min_ts = 1552522378
    with open(filename) as f:
        for line in f:
            json_object = json.loads(line)
            timestamp = json_object['citation_date']
            if(timestamp < min_ts):                
                min_ts = timestamp
            
            if(timestamp > max_ts):
                max_ts = timestamp
                
    return [min_ts,max_ts]

tagsToMinTs = {}
tagsToMaxTs = {}
globalMinTs = 1552522378
globalMaxTs = 0
for tag in hash_tags:
    ts_list = getMinAndMaxTs(tag)
    
    if(ts_list[0]<globalMinTs):
        globalMinTs = ts_list[0]
    
    if(ts_list[1]>globalMaxTs):
        globalMaxTs = ts_list[1]
        
    tagsToMinTs[tag] = (ts_list[0])
    tagsToMaxTs[tag] = (ts_list[1])


In [5]:

filesToMinTs = {}
filesToMaxTs = {}

for file in tp1_files:
    ts_list = getMinAndMaxTs(None,'test_data/'+file)
    filesToMinTs[file] = (ts_list[0])
    filesToMaxTs[file] = (ts_list[1])    
    
for file in tp2_files:
    ts_list = getMinAndMaxTs(None,'test_data/'+file)
    filesToMinTs[file] = (ts_list[0])
    filesToMaxTs[file] = (ts_list[1])    
    
for file in tp3_files:
    ts_list = getMinAndMaxTs(None,'test_data/'+file)
    filesToMinTs[file] = (ts_list[0])
    filesToMaxTs[file] = (ts_list[1])    


In [6]:
import math
import datetime
import pytz


def getLocalHour(timestamp):
    d = datetime.datetime.fromtimestamp(timestamp)
    pst = pytz.timezone('America/Los_Angeles')
    d = pst.localize(d)
    return d.hour

def getWindowNumber(start_ts, curr_ts, window):
    elapsed = (curr_ts - start_ts)/(window*1.0)
    windowNum = math.ceil(elapsed)
    return windowNum    

def getFeatures(start_ts,end_ts,window):
    windowToTweets = {}
    windowToRetweets = {}
    windowToFollowerCount = {}
    windowToMaxFollowers = {}
    features = []
    labels = []
    
    for tag in hash_tags:
        filename = 'data/tweets_'+tag+'.txt'
        with open(filename) as f:
            for line in f:
                json_object = json.loads(line)
                timestamp = json_object['citation_date']
            
                if timestamp < start_ts or timestamp > end_ts:                            
                    continue
                
                key = getWindowNumber(start_ts,timestamp,window)
    #             print(key)
                if key not in windowToTweets.keys():
                    windowToTweets[key]=0
                windowToTweets[key]+=1
            
                retweetCount = json_object['metrics']['citations']['total']        
            
                if key not in windowToRetweets.keys():
                    windowToRetweets[key]=0
                windowToRetweets[key]+=retweetCount
        
                followerCount = json_object['author']['followers']
                if key not in windowToFollowerCount.keys():
                    windowToFollowerCount[key]=0
                windowToFollowerCount[key]+=followerCount
        
                if key not in windowToMaxFollowers.keys():
                    windowToMaxFollowers[key]=0
                windowToMaxFollowers[key] = max(windowToMaxFollowers[key],followerCount)            
            
    for period in range(start_ts,end_ts,window):
        key = getWindowNumber(start_ts,period,window)
        tweetCount = windowToTweets.get(key, 0)
        retweetCount = windowToRetweets.get(key,0)
        followerCount = windowToFollowerCount.get(key,0)
        maxFollowers = windowToMaxFollowers.get(key,0)

        h = getLocalHour(period)
            
        feature = [tweetCount, retweetCount, followerCount, maxFollowers, h]
        features.append(feature)
                
        nextKey = getWindowNumber(start_ts, period + window, window)
        labels.append(windowToTweets.get(nextKey,0))
                
    return features,labels

## Timeperiod 1

### Building training set for Timeperiod 1(same as q7)

In [7]:
import statsmodels.api as sm

#tp1
tp1_window_size = 3600 # 1 hour window size
tp1_start_ts = tp1_window_size * math.floor(globalMinTs/(tp1_window_size*1.0))
tp1_end_ts = 1422806400
# tp1_train_features,tp1_train_labels = getFeatures(tp1_start_ts,tp1_end_ts,tp1_window_size)
# save_object(tp1_train_features, "q14_tp1_train_features")
# save_object(tp1_train_labels, "q14_tp1_train_labels")
tp1_train_features = load_object("q14_tp1_train_features")
tp1_train_labels = load_object("q14_tp1_train_labels")
print("Finished Building feature vectors for training set for time period 1")

Finished Building feature vectors for training set for time period 1


### Building test set for Timeperiod 1(sample 0)

In [8]:
import statsmodels.api as sm

file = 'sample0_period1.txt'

#tp1
tp1_window_size = 3600 # 1 hour window size
tp1_start_ts = tp1_window_size * math.floor(filesToMinTs[file]/(tp1_window_size*1.0))
tp1_end_ts = 1422806400
# tp1_test_p1_s0_features,tp1_test_p1_s0_labels = getFeatures(tp1_start_ts,tp1_end_ts,tp1_window_size)
# save_object(tp1_test_p1_s0_features, "q14_tp1_test_p1_s0_features")
# save_object(tp1_test_p1_s0_labels, "q14_tp1_test_p1_s0_labels")
tp1_test_p1_s0_features = load_object("q14_tp1_test_p1_s0_features")
tp1_test_p1_s0_labels = load_object("q14_tp1_test_p1_s0_labels")
print("Finished Building test feature vectors for time period 1 sample 0")

Finished Building test feature vectors for time period 1 sample 0


### Building test set for Timeperiod 1(sample 1)

In [9]:
import statsmodels.api as sm

file = 'sample1_period1.txt'

#tp1
tp1_window_size = 3600 # 1 hour window size
tp1_start_ts = tp1_window_size * math.floor(filesToMinTs[file]/(tp1_window_size*1.0))
tp1_end_ts = 1422806400
# tp1_test_p1_s1_features,tp1_test_p1_s1_labels = getFeatures(tp1_start_ts,tp1_end_ts,tp1_window_size)
# save_object(tp1_test_p1_s1_features, "q14_tp1_test_p1_s1_features")
# save_object(tp1_test_p1_s1_labels, "q14_tp1_test_p1_s1_labels")
tp1_test_p1_s1_features = load_object("q14_tp1_test_p1_s1_features")
tp1_test_p1_s1_labels = load_object("q14_tp1_test_p1_s1_labels")
print("Finished Building test feature vectors for time period 1 sample 1")

Finished Building test feature vectors for time period 1 sample 1


### Building test set for Timeperiod 1(sample 2)

In [10]:
import statsmodels.api as sm

file = 'sample2_period1.txt'

#tp1
tp1_window_size = 3600 # 1 hour window size
tp1_start_ts = tp1_window_size * math.floor(filesToMinTs[file]/(tp1_window_size*1.0))
tp1_end_ts = 1422806400
# tp1_test_p1_s2_features,tp1_test_p1_s2_labels = getFeatures(tp1_start_ts,tp1_end_ts,tp1_window_size)
# save_object(tp1_test_p1_s2_features, "q14_tp1_test_p1_s2_features")
# save_object(tp1_test_p1_s2_labels, "q14_tp1_test_p1_s2_labels")
tp1_test_p1_s2_features = load_object("q14_tp1_test_p1_s2_features")
tp1_test_p1_s2_labels = load_object("q14_tp1_test_p1_s2_labels")
print("Finished Building test feature vectors for time period 1 sample 2")

Finished Building test feature vectors for time period 1 sample 2


#### Training Linear Regression Model

In [11]:
import statsmodels.api as sm
import statsmodels.tools.eval_measures as ste

print('\nLinear Regression Model for Time period 1')
X_orig = tp1_train_features
y = tp1_train_labels

# https://becominghuman.ai/stats-models-vs-sklearn-for-linear-regression-f19df95ad99b
X = sm.add_constant(X_orig)

model = sm.OLS(y,X)
results = model.fit()


Linear Regression Model for Time period 1


#### Testing Linear Regression Model

In [12]:
##sample 0
pred_y = results.predict(tp1_test_p1_s0_features)
y = tp1_test_p1_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp1_test_p1_s1_features)
y = tp1_test_p1_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp1_test_p1_s2_features)
y = tp1_test_p1_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 2410661.7048371565
[ 430.58850546  900.99058911 1146.58159312 1249.77711228 1946.72375783
 1638.45979224 1951.18951237 1998.29524492 1979.50374369 2568.14402217
 2074.86567082 2081.05684259 3728.85610001 4267.01924102 4002.51662366
 2407.89135211 3183.67049467 2963.50059119 2548.0745483  2079.8197556
 1795.37007641 1818.92894834 2312.00535483 2067.44313895 3025.8626828
 4825.05466881 7055.87086711]

Sample 1:
 MSE : 7843671.629176044
[ 430.58850546 1818.92894834 2312.00535483 2067.44313895 3025.8626828
 4825.05466881 7055.87086711]

Sample 2:
 MSE : 944410.0671393679
[ 431.76418551 3303.93979602 1475.87917732 1395.84119575 1288.22100284
 1020.84572585  861.72120263  764.96207345  572.59082317  617.61611741
  600.27548467  695.314635    792.88435659  696.29768776 1014.80250125
 1298.12736775 1527.98535483 1957.83568752 1940.82796962 2347.33832833
 1931.34126823 1617.48031802 1634.11244264 2240.11549617 1213.85722866
 1406.67471408 1431.34180469 1192.44759854 1197.10408

#### Training Random forest

In [13]:
from sklearn.ensemble import GradientBoostingRegressor

# {'max_depth': 60, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}


print('\nRandom forest Model for Time period 1')
X = tp1_train_features
y = tp1_train_labels
model = GradientBoostingRegressor(max_depth=60,max_features='auto',min_samples_leaf=1,min_samples_split=10,n_estimators=200)
results = model.fit(X,y)


Random forest Model for Time period 1


#### Testing Random Forest

In [14]:
##sample 0
pred_y = results.predict(tp1_test_p1_s0_features)
y = tp1_test_p1_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp1_test_p1_s1_features)
y = tp1_test_p1_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp1_test_p1_s2_features)
y = tp1_test_p1_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 6348.203948116693
[  110.99336068   708.01421034   915.99867541  1111.01084224
  1262.01253369  1328.00938891  1184.9836888   1620.00586256
  1432.01483713  1573.0212808   1469.99893579  3975.98645912
  4931.96767842  4863.99266945  2368.00940688  3730.00278486
  3317.98337974  3031.01676957  2143.00343633  2012.0113689
  2173.00032284  2266.99211755  2504.99133584  3267.01206458
  5792.92731914  9021.99786868 12047.99664101]

Sample 1:
 MSE : 607410.1980253566
[  110.99336068  2266.99211755  2504.99133584  3267.01206458
  5792.92731914  9021.99786868 12047.99664101]

Sample 2:
 MSE : 128005.21643993311
[  110.99336068  1784.02665884  1441.99306424  1420.00850994
   952.00060794   693.01227673   536.00795201   288.0134846
   301.98966018   314.99253715   418.99331616   559.99421605
   703.99499371   908.99276281  1245.98589484  1411.99461125
  2327.99066631  2168.00571583  2895.99799457  2196.99148076
  1689.00529309  1706.9993143   1584.01512781  1216.01483414
  1241

#### Training Neural Network

In [15]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

X = tp1_train_features
y = tp1_train_labels
nn = MLPRegressor(hidden_layer_sizes=(200, 200, 200),max_iter=500)
results = nn.fit(X,y)

#### Testing Neural Network

In [16]:
##sample 0
pred_y = results.predict(tp1_test_p1_s0_features)
y = tp1_test_p1_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp1_test_p1_s1_features)
y = tp1_test_p1_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp1_test_p1_s2_features)
y = tp1_test_p1_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 24880060.546742223
[    71.37138047  -2324.5428235    3284.46576366   2516.57819823
   4589.01149889   2050.26765551   9186.07551887  11521.71156865
   3192.46664662   7659.83277786   6546.5870827    3197.06751246
  12790.97941201   3447.47220446   3253.7773082    4328.33232902
   4836.14045807   5932.06400685   6013.04613349   5122.68709206
 -10495.46480678  -5068.87671951   6797.98012373  -5733.71731698
   6497.84112338   9203.42156192  11873.14225979]

Sample 1:
 MSE : 22604972.954695467
[   71.37138047 -5068.87671951  6797.98012373 -5733.71731698
  6497.84112338  9203.42156192 11873.1422598 ]

Sample 2:
 MSE : 18550365.870708916
[ 1.26808578e-01  4.42550640e+03  7.61529712e+03  4.23202998e+02
  1.71075029e+03  3.21138479e+02  1.17115418e+02  1.69363850e+02
 -5.94441766e+02  6.51978197e+01 -9.23227514e+02  6.29102932e+01
  2.29146856e+02  4.28301192e+03  1.79037882e+02  1.41319069e+03
  3.49991634e+03  2.84157569e+03  3.03006794e+03  7.77941436e+02
  1.47000783e+03

## Timeperiod 2

### Building training set for Timeperiod 2(same as q7)

In [17]:
import statsmodels.api as sm

#tp2
tp2_window_size = 300 # 5 minute window size
tp2_start_ts = 1422806400
tp2_end_ts = 1422849600
tp2_train_features,tp2_train_labels = getFeatures(tp2_start_ts,tp2_end_ts,tp2_window_size)
save_object(tp2_train_features, "q14_tp2_train_features")
save_object(tp2_train_labels, "q14_tp2_train_labels")
print("Finished Building feature vectors for training set for time period 2")

Finished Building feature vectors for training set for time period 2


### Building test set for Timeperiod 2(sample 0)

In [18]:
import statsmodels.api as sm

file = 'sample0_period2.txt'

#tp1
tp2_window_size = 300 # 5 minute window size
tp2_start_ts = 1422806400
tp2_end_ts = 1422849600
tp2_test_p2_s0_features,tp2_test_p2_s0_labels = getFeatures(tp2_start_ts,tp2_end_ts,tp2_window_size)
save_object(tp2_test_p2_s0_features, "q14_tp2_test_p2_s0_features")
save_object(tp2_test_p2_s0_labels, "q14_tp2_test_p2_s0_labels")
print("Finished Building test feature vectors for time period 2 sample 0")

Finished Building test feature vectors for time period 2 sample 0


### Building test set for Timeperiod 2(sample 1)

In [19]:
import statsmodels.api as sm

file = 'sample1_period2.txt'

tp2_window_size = 300 # 5 minute window size
tp2_start_ts = 1422806400
tp2_end_ts = 1422849600
tp2_test_p2_s1_features,tp2_test_p2_s1_labels = getFeatures(tp2_start_ts,tp2_end_ts,tp2_window_size)
save_object(tp2_test_p2_s1_features, "q14_tp2_test_p2_s1_features")
save_object(tp2_test_p2_s1_labels, "q14_tp2_test_p2_s1_labels")
print("Finished Building test feature vectors for time period 2 sample 1")

Finished Building test feature vectors for time period 2 sample 1


### Building test set for Timeperiod 2(sample 2)

In [20]:
import statsmodels.api as sm

file = 'sample2_period2.txt'


tp2_window_size = 300 # 5 minute window size
tp2_start_ts = 1422806400
tp2_end_ts = 1422849600
tp2_test_p2_s2_features,tp2_test_p2_s2_labels = getFeatures(tp2_start_ts,tp2_end_ts,tp2_window_size)
save_object(tp2_test_p2_s2_features, "q14_tp2_test_p2_s2_features")
save_object(tp2_test_p2_s2_labels, "q14_tp2_test_p2_s2_labels")
print("Finished Building test feature vectors for time period 2 sample 2")

Finished Building test feature vectors for time period 2 sample 2


#### Training Linear Regression Model

In [21]:
import statsmodels.api as sm
import statsmodels.tools.eval_measures as ste

print('\nLinear Regression Model for Time period 2')
X_orig = tp2_train_features
y = tp2_train_labels

# https://becominghuman.ai/stats-models-vs-sklearn-for-linear-regression-f19df95ad99b
X = sm.add_constant(X_orig)

model = sm.OLS(y,X)
results = model.fit()


Linear Regression Model for Time period 2


#### Testing Linear Regression Model

In [22]:
##sample 0
pred_y = results.predict(tp2_test_p2_s0_features)
y = tp2_test_p2_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp2_test_p2_s1_features)
y = tp2_test_p2_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp2_test_p2_s2_features)
y = tp2_test_p2_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 17470732.924933676
[  640.70870533  2522.47331865  2064.55554919  1811.75928556
  1888.76871169  1819.22140524  1049.76894955  1430.68373013
  1427.28387691  1071.47698452  1450.96959242  1450.17584133
   899.7833565   1184.83545807   928.6401269   1189.96315097
  1146.23167347  2065.40494332  2068.49004696  2484.53219625
  2070.28993844  2146.7540324   2158.7838928   2203.36655421
  2487.42058577  3234.28871981  2224.47939685  2106.81222021
  2649.37578006  2256.87300645  2453.30508351  2830.12627476
  2155.40795806  2247.40461376  8884.62370744  9365.54655641
 10107.58213277  6274.36210345 10254.46072641 14686.19854017
 15342.84513193 15160.44573019 15086.75773787 13824.04880977
 13736.42206374 13953.55100892 15695.86074547 14619.37510443
 14354.14651277 13414.62104207 13812.85419065 13329.88159274
 12313.93636829 12010.46166643 12612.36133162 12089.0392491
 11543.94314792 11193.79504809 11352.66947284 10600.23694607
 11432.60349998 10460.91613395 11456.31704573 110

#### Training Random forest

In [25]:
# {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}

from sklearn.ensemble import GradientBoostingRegressor

# {'max_depth': 60, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}


print('\nRandom forest Model for Time period 2')
X = tp2_train_features
y = tp2_train_labels
model = GradientBoostingRegressor(max_depth=20,max_features='sqrt',min_samples_leaf=1,min_samples_split=5,n_estimators=200)
results = model.fit(X,y)


Random forest Model for Time period 2


#### Testing Random Forest

In [26]:
##sample 0
pred_y = results.predict(tp2_test_p2_s0_features)
y = tp2_test_p2_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp2_test_p2_s1_features)
y = tp2_test_p2_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp2_test_p2_s2_features)
y = tp2_test_p2_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 0.00033024215369161533
[ 1299.99797923  1342.99818058  1226.00205988  1302.00238647
  1223.99899977   318.99971415   245.0017484    231.00570369
   230.00068857   237.00244558   283.99564121   265.99918893
   469.9963885    280.9999222    332.00071973   400.0022418
  1400.99524927  1492.99618304  1708.99547117  1494.00241589
  1593.00189101  1463.00132118  1107.00345718  1716.00016575
  1860.99675291  1611.00095501  1543.9986567   1560.00080508
  1769.99610982  1676.00124515  1820.00284903  1569.00090012
  1697.0056598   8498.9961299   9648.00047906  9991.00049911
  5736.00167218  9790.99378172 14317.99348357 15471.00366332
 15150.99877189 15240.99812585 13539.00013146 13771.00421306
 13805.00000749 15621.99515314 14181.00261504 14507.99684213
 13067.00157364 13879.99254356 12876.00215046 12221.00776686
 11965.99901406 12614.99817298 11745.00077776 11468.9932806
 11083.00042296 11118.99877619 10466.00018656 11275.99477531
 10371.00487545 11089.00246184 10611.00487316 

#### Training Neural Network

In [27]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

X = tp2_train_features
y = tp2_train_labels
nn = MLPRegressor(hidden_layer_sizes=(500, 250, 125, 63),max_iter=500)
results = nn.fit(X,y)

#### Testing Neural Network

In [28]:
##sample 0
pred_y = results.predict(tp2_test_p2_s0_features)
y = tp2_test_p2_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp2_test_p2_s1_features)
y = tp2_test_p2_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp2_test_p2_s2_features)
y = tp2_test_p2_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 142190872890.59784
[-7.51560643e+02 -3.54304515e+04 -8.85583292e+03 -1.57963898e+04
 -2.73523993e+04 -2.01301754e+04 -2.03935405e+04 -2.65184042e+04
 -5.79443591e+04 -2.37513347e+04 -2.44536936e+04 -4.49707472e+04
 -1.27930055e+04 -9.19365598e+04 -1.36705338e+04 -1.28551383e+04
 -9.86388581e+03 -2.04286307e+04 -2.13017016e+04 -5.31621911e+04
 -1.94361249e+04 -2.61934401e+04 -1.47850169e+04 -4.67439147e+04
 -1.24889952e+04 -3.77304336e+04 -1.50206746e+04 -2.39407984e+04
 -5.03557747e+04 -1.36282569e+04 -3.02003151e+04 -3.74633796e+04
 -3.44858617e+04 -9.86076273e+03 -1.29549515e+04 -3.38082963e+04
 -3.88784141e+04 -4.61727501e+04 -4.45734785e+04 -2.72576615e+04
 -3.24994484e+04 -3.62151675e+04 -4.36898323e+04 -2.39578173e+04
 -4.23730702e+04 -3.52782699e+04 -4.49713785e+04 -2.36968271e+04
 -4.43641685e+04 -4.94002776e+04 -4.93274841e+04 -4.54832473e+04
 -2.91378261e+04 -5.19012351e+04 -3.79244128e+04 -2.37133650e+04
 -3.34791441e+04 -4.69298292e+04 -4.88617659e+04 -4.0

## Timeperiod 3

### Building training set for Timeperiod 3(same as q7)

In [29]:
#tp3
tp3_window_size = 3600 # 1 hour window size
tp3_start_ts = 1422849600
tp3_end_ts = tp3_window_size * math.ceil(globalMaxTs/(tp3_window_size*1.0))
tp3_train_features,tp3_train_labels = getFeatures(tp3_start_ts,tp3_end_ts,tp3_window_size)

save_object(tp3_train_features, "q14_tp3_train_features")
save_object(tp3_train_labels, "q14_tp3_train_labels")

print("Finished Building feature vectors for training set for time period 2")

Finished Building feature vectors for training set for time period 2


### Building test set for Timeperiod 3(sample 0)

In [30]:
import statsmodels.api as sm

file = 'sample0_period3.txt'

tp3_window_size = 3600 # 1 hour window size
tp3_start_ts = 1422849600
tp3_end_ts = tp3_window_size * math.ceil(globalMaxTs/(tp3_window_size*1.0))
tp3_test_p3_s0_features,tp3_test_p3_s0_labels = getFeatures(tp3_start_ts,tp3_end_ts,tp3_window_size)

save_object(tp3_test_p3_s0_features, "q14_tp3_test_p3_s0_features")
save_object(tp3_test_p3_s0_labels, "q14_tp3_test_p3_s0_labels")
print("Finished Building test feature vectors for time period 3 sample 0")

Finished Building test feature vectors for time period 3 sample 0


### Building test set for Timeperiod 3(sample 1)

In [31]:
import statsmodels.api as sm

file = 'sample1_period3.txt'

tp3_window_size = 3600 # 1 hour window size
tp3_start_ts = 1422849600
tp3_end_ts = tp3_window_size * math.ceil(globalMaxTs/(tp3_window_size*1.0))
tp3_test_p3_s1_features,tp3_test_p3_s1_labels = getFeatures(tp3_start_ts,tp3_end_ts,tp3_window_size)

save_object(tp3_test_p3_s1_features, "q14_tp3_test_p3_s1_features")
save_object(tp3_test_p3_s1_labels, "q14_tp3_test_p3_s1_labels")
print("Finished Building test feature vectors for time period 3 sample 1")

Finished Building test feature vectors for time period 3 sample 1


### Building test set for Timeperiod 3(sample 2)

In [32]:
import statsmodels.api as sm

file = 'sample2_period3.txt'

tp3_window_size = 3600 # 1 hour window size
tp3_start_ts = 1422849600
tp3_end_ts = tp3_window_size * math.ceil(globalMaxTs/(tp3_window_size*1.0))
tp3_test_p3_s2_features,tp3_test_p3_s2_labels = getFeatures(tp3_start_ts,tp3_end_ts,tp3_window_size)

save_object(tp3_test_p3_s2_features, "q14_tp3_test_p3_s2_features")
save_object(tp3_test_p3_s2_labels, "q14_tp3_test_p3_s2_labels")
print("Finished Building test feature vectors for time period 3 sample 2")

Finished Building test feature vectors for time period 3 sample 2


#### Training Linear Regression Model

In [33]:
import statsmodels.api as sm
import statsmodels.tools.eval_measures as ste

print('\nLinear Regression Model for Time period 3')
X_orig = tp3_train_features
y = tp3_train_labels

# https://becominghuman.ai/stats-models-vs-sklearn-for-linear-regression-f19df95ad99b
X = sm.add_constant(X_orig)

model = sm.OLS(y,X)
results = model.fit()


Linear Regression Model for Time period 3


#### Testing Linear Regression Model

In [34]:
##sample 0
pred_y = results.predict(tp3_test_p3_s0_features)
y = tp3_test_p3_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp3_test_p3_s1_features)
y = tp3_test_p3_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp3_test_p3_s2_features)
y = tp3_test_p3_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 2342692.4629372233
[ 526.82117027 9082.84056009 4584.27948952 4919.49676512 4107.09562305
 2562.32062628 2018.55868467 2513.59230503 2935.20688273 3435.79809984
 4852.5611094  4660.83601093 4320.49640057 3435.91795707 4712.39929162
 4124.78589035 7740.57637216 7110.07082011 6142.03175499 5929.47879199
 5712.09436694 4410.69959764 3629.23932998 3850.24031715 3384.37834685
 3296.6146317  3033.56916609 1770.36837763 1731.88494142 1588.07616929
 1383.27428818 1370.29316864 1062.41421746 1471.70364319 1678.05833336
 2738.90004154 2359.76376465 2338.27203269 2865.99414818 2752.0645245
 2695.61215512 2960.84680565 1954.71695029 2276.91475283 2018.59881373
 2221.41427978 1544.10129702 1330.60421104 1001.57163592 1275.15153735
 1267.94932014 1064.39248253  834.39072793  865.61170595  770.12139064
  970.84667855  897.75254861  912.38995955 1063.46079774 1374.46133498
 1526.81029051 1560.82472822 1700.16302128 1763.90513032 1675.72627961
 1321.13242493 1305.5128619  1537.1438629

#### Training Random forest

In [35]:
# {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}

from sklearn.ensemble import GradientBoostingRegressor

# {'max_depth': 60, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}


print('\nRandom forest Model for Time period 3')
X = tp3_train_features
y = tp3_train_labels
model = GradientBoostingRegressor(max_depth=20,max_features='sqrt',min_samples_leaf=1,min_samples_split=2,n_estimators=200)
results = model.fit(X,y)


Random forest Model for Time period 3


#### Testing Random Forest

In [36]:
##sample 0
pred_y = results.predict(tp3_test_p3_s0_features)
y = tp3_test_p3_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp3_test_p3_s1_features)
y = tp3_test_p3_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp3_test_p3_s2_features)
y = tp3_test_p3_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 9.835521900158752e-08
[1.62709990e+04 6.49699977e+03 9.03899925e+03 5.01299980e+03
 3.29300032e+03 2.60200003e+03 2.54399989e+03 3.30999961e+03
 4.06999985e+03 4.68899977e+03 5.02299980e+03 5.05199980e+03
 4.52899988e+03 6.07499929e+03 5.14299978e+03 7.56499926e+03
 7.46699925e+03 6.55499973e+03 6.48199972e+03 5.82199987e+03
 5.17499978e+03 4.38000024e+03 4.12600003e+03 3.56600004e+03
 2.99399989e+03 2.27600027e+03 1.86300025e+03 1.40400027e+03
 1.34700009e+03 1.22500006e+03 1.15699993e+03 1.30999989e+03
 1.39700011e+03 1.74099956e+03 2.16199960e+03 2.37699973e+03
 2.65699991e+03 3.56299969e+03 2.80100005e+03 2.51700002e+03
 2.66299976e+03 2.25300029e+03 1.78399985e+03 1.92599977e+03
 1.60900015e+03 1.61700003e+03 1.11600018e+03 5.26000620e+02
 1.26799968e+03 1.18199988e+03 9.73000269e+02 5.49000436e+02
 6.46000013e+02 4.50000330e+02 8.14999677e+02 6.34000086e+02
 7.79999715e+02 8.83999967e+02 1.13699966e+03 1.40699948e+03
 1.69599960e+03 1.88399963e+03 1.48900002e+03

#### Training Neural Network

In [37]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

X = tp3_train_features
y = tp3_train_labels
nn = MLPRegressor(hidden_layer_sizes=(200,200),max_iter=500)
results = nn.fit(X,y)

#### Testing Neural Network

In [38]:
##sample 0
pred_y = results.predict(tp3_test_p3_s0_features)
y = tp3_test_p3_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp3_test_p3_s1_features)
y = tp3_test_p3_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp3_test_p3_s2_features)
y = tp3_test_p3_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 1817388020.0557055
[ 1.84629038e+01  1.09644394e+04  1.61719221e+05  9.62021406e+04
  9.81289416e+04  5.05255494e+04  2.51083356e+04  3.85426313e+04
  4.44434222e+04  1.15405432e+05  7.56205965e+04  1.48881349e+04
  9.14634636e+04  5.93336765e+04  9.58077710e+04  9.77467197e+04
  2.07973492e+04  4.41375408e+04  1.47371410e+04  6.07811158e+04
  2.32609510e+03  3.23822208e+04  8.69919521e+04  2.04491141e+04
  1.15658507e+04  2.87666523e+04  9.57072846e+04  3.20631944e+04
  2.91938112e+04 -8.66891035e+03 -7.26134632e+04  2.93446690e+04
  3.42191594e+04  2.02326607e+04  2.12428553e+04  8.45143231e+04
  7.65686614e+04  1.23705458e+04  7.45681684e+04  7.20466584e+04
  7.04234301e+04  8.17840933e+04 -4.90354689e+04  7.30672673e+04
  4.38499955e+04  6.97325914e+04 -9.54621295e+04  2.02344806e+04
  1.73066660e+04  1.38634705e+04 -7.72451647e+04  3.70738101e+03
 -1.80745960e+04  4.40321354e+04 -1.01975367e+02  5.47471500e+03
  5.07598349e+03  7.12505970e+03  7.97569448e+03  2.5