### QUESTION 14: 
Report the model you use. For each test file, provide your predictions on the number of tweets in the next time window.
Note: Test data should not be used as a source for training. You are not bounded to only linear models. You can find your best model through cross validation of your training data.

In [7]:
tp1_files = ['sample0_period1.txt','sample1_period1.txt','sample2_period1.txt']
tp2_files = ['sample0_period2.txt','sample1_period2.txt','sample2_period2.txt']
tp3_files = ['sample0_period3.txt','sample1_period3.txt','sample2_period3.txt']

In [8]:
import pickle

def save_object(data, fileName):
    with open('pynb_data/'+fileName + ".pickle", 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        
def load_object(fileName):
    try:
        with open('pynb_data/'+fileName + ".pickle", 'rb') as f:
            data = pickle.load(f)
            return data
    except IOError:
        print("Could not read file: " + fileName)

In [9]:
hash_tags = ['#gohawks','#gopatriots','#nfl','#patriots','#sb49','#superbowl']

In [10]:
import json

def getMinAndMaxTs(tag,filename=None):
    if filename is None:
        filename = 'data/tweets_'+tag+'.txt'
    max_ts = 0
    min_ts = 1552522378
    with open(filename) as f:
        for line in f:
            json_object = json.loads(line)
            timestamp = json_object['citation_date']
            if(timestamp < min_ts):                
                min_ts = timestamp
            
            if(timestamp > max_ts):
                max_ts = timestamp
                
    return [min_ts,max_ts]

tagsToMinTs = {}
tagsToMaxTs = {}
globalMinTs = 1552522378
globalMaxTs = 0
for tag in hash_tags:
    ts_list = getMinAndMaxTs(tag)
    
    if(ts_list[0]<globalMinTs):
        globalMinTs = ts_list[0]
    
    if(ts_list[1]>globalMaxTs):
        globalMaxTs = ts_list[1]
        
    tagsToMinTs[tag] = (ts_list[0])
    tagsToMaxTs[tag] = (ts_list[1])


In [11]:

filesToMinTs = {}
filesToMaxTs = {}

for file in tp1_files:
    ts_list = getMinAndMaxTs(None,'test_data/'+file)
    filesToMinTs[file] = (ts_list[0])
    filesToMaxTs[file] = (ts_list[1])    
    
for file in tp2_files:
    ts_list = getMinAndMaxTs(None,'test_data/'+file)
    filesToMinTs[file] = (ts_list[0])
    filesToMaxTs[file] = (ts_list[1])    
    
for file in tp3_files:
    ts_list = getMinAndMaxTs(None,'test_data/'+file)
    filesToMinTs[file] = (ts_list[0])
    filesToMaxTs[file] = (ts_list[1])    


In [31]:
import math
import datetime
import pytz


def getLocalHour(timestamp):
    d = datetime.datetime.fromtimestamp(timestamp)
    pst = pytz.timezone('America/Los_Angeles')
    d = pst.localize(d)
    return d.hour

def getWindowNumber(start_ts, curr_ts, window):
    elapsed = (curr_ts - start_ts)/(window*1.0)
    windowNum = math.ceil(elapsed)
    return windowNum    

def getFeatures(start_ts,end_ts,window):
    windowToTweets = {}
    windowToRetweets = {}
    windowToFollowerCount = {}
    windowToMaxFollowers = {}
    features = []
    labels = []
    
    for tag in hash_tags:
        filename = 'data/tweets_'+tag+'.txt'
        with open(filename) as f:
            for line in f:
                json_object = json.loads(line)
                timestamp = json_object['citation_date']
            
                if timestamp < start_ts or timestamp > end_ts:                            
                    continue
                
                
                key = getWindowNumber(start_ts,timestamp,window)
    #             print(key)
                if key not in windowToTweets.keys():
                    windowToTweets[key]=0
                windowToTweets[key]+=1
            
                retweetCount = json_object['metrics']['citations']['total']        
            
                if key not in windowToRetweets.keys():
                    windowToRetweets[key]=0
                windowToRetweets[key]+=retweetCount
        
                followerCount = json_object['author']['followers']
                if key not in windowToFollowerCount.keys():
                    windowToFollowerCount[key]=0
                windowToFollowerCount[key]+=followerCount
        
                if key not in windowToMaxFollowers.keys():
                    windowToMaxFollowers[key]=0
                windowToMaxFollowers[key] = max(windowToMaxFollowers[key],followerCount)            
            
    for period in range(start_ts,end_ts,window):
        key = getWindowNumber(start_ts,period,window)
        tweetCount = windowToTweets.get(key, 0)
        retweetCount = windowToRetweets.get(key,0)
        followerCount = windowToFollowerCount.get(key,0)
        maxFollowers = windowToMaxFollowers.get(key,0)

        h = getLocalHour(period)
            
        feature = [tweetCount, retweetCount, followerCount, maxFollowers, h]
        features.append(feature)
                
        nextKey = getWindowNumber(start_ts, period + window, window)
        labels.append(windowToTweets.get(nextKey,0))
                
    return features,labels

def getFeaturesFromFile(filename,start_ts,end_ts,window):
    windowToTweets = {}
    windowToRetweets = {}
    windowToFollowerCount = {}
    windowToMaxFollowers = {}
    features = []
    labels = []
    
    with open(filename) as f:
        for line in f:            
            json_object = json.loads(line)
            timestamp = json_object['citation_date']
            
            if timestamp < start_ts or timestamp > end_ts:                            
                continue
                
            key = getWindowNumber(start_ts,timestamp,window)
            if key not in windowToTweets.keys():
                windowToTweets[key]=0
            windowToTweets[key]+=1
            
            retweetCount = json_object['metrics']['citations']['total']        
            
            if key not in windowToRetweets.keys():
                windowToRetweets[key]=0
            windowToRetweets[key]+=retweetCount
        
            followerCount = json_object['author']['followers']
            if key not in windowToFollowerCount.keys():
                windowToFollowerCount[key]=0
            windowToFollowerCount[key]+=followerCount
        
            if key not in windowToMaxFollowers.keys():
                windowToMaxFollowers[key]=0
            windowToMaxFollowers[key] = max(windowToMaxFollowers[key],followerCount)            
    
#     print(len([i for i in range(start_ts,end_ts,window)]
            
    for period in range(start_ts,end_ts,window):
        key = getWindowNumber(start_ts,period,window)
        tweetCount = windowToTweets.get(key, 0)
        retweetCount = windowToRetweets.get(key,0)
        followerCount = windowToFollowerCount.get(key,0)
        maxFollowers = windowToMaxFollowers.get(key,0)

        h = getLocalHour(period)
            
        feature = [tweetCount, retweetCount, followerCount, maxFollowers, h]
        if tweetCount > 0:
            features.append(feature)
                
            nextKey = getWindowNumber(start_ts, period + window, window)
            labels.append(windowToTweets.get(nextKey,0))
                
    return features,labels

## Timeperiod 1

### Building training set for Timeperiod 1(same as q7)

In [14]:
import statsmodels.api as sm

#tp1
tp1_window_size = 3600 # 1 hour window size
tp1_start_ts = tp1_window_size * math.floor(globalMinTs/(tp1_window_size*1.0))
tp1_end_ts = 1422806400
# tp1_train_features,tp1_train_labels = getFeatures(tp1_start_ts,tp1_end_ts,tp1_window_size)
# save_object(tp1_train_features, "q14_tp1_train_features")
# save_object(tp1_train_labels, "q14_tp1_train_labels")
tp1_train_features = load_object("q14_tp1_train_features")
tp1_train_labels = load_object("q14_tp1_train_labels")
print("Finished Building feature vectors for training set for time period 1")

Finished Building feature vectors for training set for time period 1


### Building test set for Timeperiod 1(sample 0)

In [34]:
import statsmodels.api as sm

file = 'sample0_period1.txt'

print(filesToMinTs[file])
#tp1
tp1_window_size = 3600 # 1 hour window size
tp1_start_ts = tp1_window_size * math.floor(filesToMinTs[file]/(tp1_window_size*1.0))
print(tp1_start_ts)
tp1_end_ts = 1422806400
tp1_test_p1_s0_features,tp1_test_p1_s0_labels = getFeaturesFromFile('test_data/'+file,tp1_start_ts,tp1_end_ts,tp1_window_size)
print(tp1_test_p1_s0_features)
save_object(tp1_test_p1_s0_features, "q14_tp1_test_p1_s0_features")
save_object(tp1_test_p1_s0_labels, "q14_tp1_test_p1_s0_labels")
# tp1_test_p1_s0_features = load_object("q14_tp1_test_p1_s0_features")
# tp1_test_p1_s0_labels = load_object("q14_tp1_test_p1_s0_labels")
print("Finished Building test feature vectors for time period 1 sample 0")

1422709237
1422709200
[[52, 109, 424498.0, 168371.0, 6], [79, 761, 2975692.0, 2034387.0, 7], [94, 226, 860594.0, 328882.0, 8], [101, 258, 2349147.0, 368626.0, 9], [122, 483, 1369748.0, 291130.0, 10], [120, 322, 9022480.0, 5883161.0, 11]]
Finished Building test feature vectors for time period 1 sample 0


### Building test set for Timeperiod 1(sample 1)

In [37]:
import statsmodels.api as sm

file = 'sample1_period1.txt'

#tp1
tp1_window_size = 3600 # 1 hour window size
tp1_start_ts = tp1_window_size * math.floor(filesToMinTs[file]/(tp1_window_size*1.0))
tp1_end_ts = 1422806400
tp1_test_p1_s1_features,tp1_test_p1_s1_labels = getFeaturesFromFile('test_data/'+file,tp1_start_ts,tp1_end_ts,tp1_window_size)
save_object(tp1_test_p1_s1_features, "q14_tp1_test_p1_s1_features")
save_object(tp1_test_p1_s1_labels, "q14_tp1_test_p1_s1_labels")
print(len(tp1_test_p1_s1_features))
# tp1_test_p1_s1_features = load_object("q14_tp1_test_p1_s1_features")
# tp1_test_p1_s1_labels = load_object("q14_tp1_test_p1_s1_labels")
print("Finished Building test feature vectors for time period 1 sample 1")

6
Finished Building test feature vectors for time period 1 sample 1


### Building test set for Timeperiod 1(sample 2)

In [38]:
import statsmodels.api as sm

file = 'sample2_period1.txt'

#tp1
tp1_window_size = 3600 # 1 hour window size
tp1_start_ts = tp1_window_size * math.floor(filesToMinTs[file]/(tp1_window_size*1.0))
tp1_end_ts = 1422806400
tp1_test_p1_s2_features,tp1_test_p1_s2_labels = getFeaturesFromFile('test_data/'+file,tp1_start_ts,tp1_end_ts,tp1_window_size)
save_object(tp1_test_p1_s2_features, "q14_tp1_test_p1_s2_features")
save_object(tp1_test_p1_s2_labels, "q14_tp1_test_p1_s2_labels")
print(len(tp1_test_p1_s2_features))
# tp1_test_p1_s2_features = load_object("q14_tp1_test_p1_s2_features")
# tp1_test_p1_s2_labels = load_object("q14_tp1_test_p1_s2_labels")
print("Finished Building test feature vectors for time period 1 sample 2")

6
Finished Building test feature vectors for time period 1 sample 2


#### Training Linear Regression Model

In [39]:
import statsmodels.api as sm
import statsmodels.tools.eval_measures as ste

print('\nLinear Regression Model for Time period 1')
X_orig = tp1_train_features
y = tp1_train_labels

# https://becominghuman.ai/stats-models-vs-sklearn-for-linear-regression-f19df95ad99b
X = sm.add_constant(X_orig)

model = sm.OLS(y,X)
results = model.fit()


Linear Regression Model for Time period 1


#### Testing Linear Regression Model

In [40]:
##sample 0
pred_y = results.predict(tp1_test_p1_s0_features)
y = tp1_test_p1_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp1_test_p1_s1_features)
y = tp1_test_p1_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp1_test_p1_s2_features)
y = tp1_test_p1_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 40197.85888844652
[192.63069413 196.82015362 271.93418781 332.04939508 349.71110204
 302.64963134]

Sample 1:
 MSE : 125828.83613818546
[172.6480249  179.28249985 217.94306262 332.11548282 529.50718169
 773.80424573]

Sample 2:
 MSE : 298897.98915656423
[682.77077015 588.38237051 598.9167575  650.10339882 635.2280852
 654.51393027]
------------------------------------------------------------





#### Training Random forest

In [41]:
from sklearn.ensemble import GradientBoostingRegressor

# {'max_depth': 60, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
# 'max_depth': 60, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 2000}

print('\nRandom forest Model for Time period 1')
X = tp1_train_features
y = tp1_train_labels
model = GradientBoostingRegressor(max_depth=60,max_features='sqrt',min_samples_leaf=4,min_samples_split=10,n_estimators=2000)
results = model.fit(X,y)


Random forest Model for Time period 1


#### Testing Random Forest

In [42]:
##sample 0
pred_y = results.predict(tp1_test_p1_s0_features)
y = tp1_test_p1_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp1_test_p1_s1_features)
y = tp1_test_p1_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp1_test_p1_s2_features)
y = tp1_test_p1_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 231255.68839953945
[ 180.08841007  275.62248463 1121.61081071  569.60182612  175.89926275
  314.97616984]

Sample 1:
 MSE : 144006.480175517
[194.80398337 253.74421148 309.06175016 639.49496427 478.64147017
 847.83472149]

Sample 2:
 MSE : 24095.608009303236
[477.56468142 162.11062202  78.50567379 134.49515699 198.16343835
  60.42603916]
------------------------------------------------------------





#### Training Neural Network

In [43]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

X = tp1_train_features
y = tp1_train_labels
nn = MLPRegressor(hidden_layer_sizes=(100,100),max_iter=500)
results = nn.fit(X,y)

#### Testing Neural Network

In [44]:
##sample 0
pred_y = results.predict(tp1_test_p1_s0_features)
y = tp1_test_p1_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp1_test_p1_s1_features)
y = tp1_test_p1_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp1_test_p1_s2_features)
y = tp1_test_p1_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 219834.05763586448
[272.80979269  60.14484789 522.29080646 582.34456939 338.67834234
 918.18987969]

Sample 1:
 MSE : 812685.1415464861
[ 160.92315844  -22.40645831 -180.62544082  561.67383681  991.39088493
 2139.84492575]

Sample 2:
 MSE : 314074.3196384067
[1498.9679989   242.35242446   52.6121603    67.90406497  -40.77353314
   25.64220967]
------------------------------------------------------------





## Timeperiod 2

### Building training set for Timeperiod 2(same as q7)

In [46]:
import statsmodels.api as sm

#tp2
tp2_window_size = 300 # 5 minute window size
tp2_start_ts = 1422806400
tp2_end_ts = 1422849600
# tp2_train_features,tp2_train_labels = getFeatures(tp2_start_ts,tp2_end_ts,tp2_window_size)
# save_object(tp2_train_features, "q14_tp2_train_features")
# save_object(tp2_train_labels, "q14_tp2_train_labels")
tp2_train_features = load_object("q14_tp2_train_features")
tp2_train_labels = load_object("q14_tp2_train_labels")
print("Finished Building feature vectors for training set for time period 2")

Finished Building feature vectors for training set for time period 2


### Building test set for Timeperiod 2(sample 0)

In [47]:
import statsmodels.api as sm

file = 'sample0_period2.txt'

#tp1
tp2_window_size = 300 # 5 minute window size
tp2_start_ts = 1422806400
tp2_end_ts = 1422849600
tp2_test_p2_s0_features,tp2_test_p2_s0_labels = getFeaturesFromFile('test_data/'+file,tp2_start_ts,tp2_end_ts,tp2_window_size)
print(len(tp2_test_p2_s0_features))
save_object(tp2_test_p2_s0_features, "q14_tp2_test_p2_s0_features")
save_object(tp2_test_p2_s0_labels, "q14_tp2_test_p2_s0_labels")
print("Finished Building test feature vectors for time period 2 sample 0")

7
Finished Building test feature vectors for time period 2 sample 0


### Building test set for Timeperiod 2(sample 1)

In [49]:
import statsmodels.api as sm

file = 'sample1_period2.txt'

tp2_window_size = 300 # 5 minute window size
tp2_start_ts = 1422806400
tp2_end_ts = 1422849600
tp2_test_p2_s1_features,tp2_test_p2_s1_labels = getFeaturesFromFile('test_data/'+file,tp2_start_ts,tp2_end_ts,tp2_window_size)
print(len(tp2_test_p2_s1_features))
save_object(tp2_test_p2_s1_features, "q14_tp2_test_p2_s1_features")
save_object(tp2_test_p2_s1_labels, "q14_tp2_test_p2_s1_labels")
print("Finished Building test feature vectors for time period 2 sample 1")

7
Finished Building test feature vectors for time period 2 sample 1


### Building test set for Timeperiod 2(sample 2)

In [50]:
import statsmodels.api as sm

file = 'sample2_period2.txt'


tp2_window_size = 300 # 5 minute window size
tp2_start_ts = 1422806400
tp2_end_ts = 1422849600
tp2_test_p2_s2_features,tp2_test_p2_s2_labels = getFeaturesFromFile('test_data/'+file,tp2_start_ts,tp2_end_ts,tp2_window_size)
print(len(tp2_test_p2_s2_features))
save_object(tp2_test_p2_s2_features, "q14_tp2_test_p2_s2_features")
save_object(tp2_test_p2_s2_labels, "q14_tp2_test_p2_s2_labels")
print("Finished Building test feature vectors for time period 2 sample 2")

6
Finished Building test feature vectors for time period 2 sample 2


#### Training Linear Regression Model

In [51]:
import statsmodels.api as sm
import statsmodels.tools.eval_measures as ste

print('\nLinear Regression Model for Time period 2')
X_orig = tp2_train_features
y = tp2_train_labels

# https://becominghuman.ai/stats-models-vs-sklearn-for-linear-regression-f19df95ad99b
X = sm.add_constant(X_orig)

model = sm.OLS(y,X)
results = model.fit()


Linear Regression Model for Time period 2


#### Testing Linear Regression Model

In [52]:
##sample 0
pred_y = results.predict(tp2_test_p2_s0_features)
y = tp2_test_p2_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp2_test_p2_s1_features)
y = tp2_test_p2_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp2_test_p2_s2_features)
y = tp2_test_p2_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 3026058.6288454114
[ 739.02639811 4519.08107638 4710.64660738 3038.95175338 2087.72055165
 1964.02475419 1789.15749376]

Sample 1:
 MSE : 526889.139481663
[ 471.75038795 1372.46862476 1473.4414827  1318.29903241 1678.67058252
 1284.38389114 1416.10150819]

Sample 2:
 MSE : 115165.93501020844
[334.09457888 328.32251906 341.18019657 360.8365362  345.05382627
 436.30110523]
------------------------------------------------------------





#### Training Random forest

In [53]:
# {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}

from sklearn.ensemble import GradientBoostingRegressor

# {'max_depth': 60, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
# {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
# {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
print('\nRandom forest Model for Time period 2')
X = tp2_train_features
y = tp2_train_labels
model = GradientBoostingRegressor(max_depth=20,max_features='sqrt',min_samples_leaf=4,min_samples_split=2,n_estimators=200)
results = model.fit(X,y)


Random forest Model for Time period 2


#### Testing Random Forest

In [54]:
##sample 0
pred_y = results.predict(tp2_test_p2_s0_features)
y = tp2_test_p2_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp2_test_p2_s1_features)
y = tp2_test_p2_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp2_test_p2_s2_features)
y = tp2_test_p2_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 4014921.5498217354
[1161.90341733 1501.96529687 1732.65913814 2987.46086705 4659.30816717
 2546.23430965  963.23523849]

Sample 1:
 MSE : 1978235.2283417343
[1161.90341733 1069.88520001 2462.75739337 3200.02013853  911.83124961
 1069.88520001 2490.58831054]

Sample 2:
 MSE : 2886768.425517559
[1161.90341733 1161.90341733 1161.90341733 1161.90341733 1161.90341733
 3296.15231321]
------------------------------------------------------------





#### Training Neural Network

In [55]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

X = tp2_train_features
y = tp2_train_labels
nn = MLPRegressor(hidden_layer_sizes=(100, 100, 100, 100, 100),max_iter=500)
results = nn.fit(X,y)

#### Testing Neural Network

In [56]:
##sample 0
pred_y = results.predict(tp2_test_p2_s0_features)
y = tp2_test_p2_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp2_test_p2_s1_features)
y = tp2_test_p2_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp2_test_p2_s2_features)
y = tp2_test_p2_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 3786581.888847169
[5.32159002e-01 5.61460959e+02 3.86421987e+03 1.77988228e+03
 5.41987015e+02 6.29308533e+02 5.43647912e+02]

Sample 1:
 MSE : 1084366.5479518687
[  24.30751355  222.57564731 -266.60479676  804.68047688 2761.84764243
   86.44898596  752.12550257]

Sample 2:
 MSE : 13757.488097773421
[ -59.13686614   10.46342818  -16.59884067  235.60186686  145.82387177
 -133.26887018]
------------------------------------------------------------





## Timeperiod 3

### Building training set for Timeperiod 3(same as q7)

In [57]:
#tp3
tp3_window_size = 3600 # 1 hour window size
tp3_start_ts = 1422849600
tp3_end_ts = tp3_window_size * math.ceil(globalMaxTs/(tp3_window_size*1.0))
# tp3_train_features,tp3_train_labels = getFeatures(tp3_start_ts,tp3_end_ts,tp3_window_size)
# save_object(tp3_train_features, "q14_tp3_train_features")
# save_object(tp3_train_labels, "q14_tp3_train_labels")
tp3_train_features = load_object("q14_tp3_train_features")
tp3_train_labels = load_object("q14_tp3_train_labels")

print("Finished Building feature vectors for training set for time period 2")

Finished Building feature vectors for training set for time period 2


### Building test set for Timeperiod 3(sample 0)

In [59]:
import statsmodels.api as sm

file = 'sample0_period3.txt'

tp3_window_size = 3600 # 1 hour window size
tp3_start_ts = 1422849600
tp3_end_ts = tp3_window_size * math.ceil(globalMaxTs/(tp3_window_size*1.0))
tp3_test_p3_s0_features,tp3_test_p3_s0_labels = getFeaturesFromFile('test_data/'+file,tp3_start_ts,tp3_end_ts,tp3_window_size)
print(len(tp3_test_p3_s0_features))
save_object(tp3_test_p3_s0_features, "q14_tp3_test_p3_s0_features")
save_object(tp3_test_p3_s0_labels, "q14_tp3_test_p3_s0_labels")
print("Finished Building test feature vectors for time period 3 sample 0")

6
Finished Building test feature vectors for time period 3 sample 0


### Building test set for Timeperiod 3(sample 1)

In [60]:
import statsmodels.api as sm

file = 'sample1_period3.txt'

tp3_window_size = 3600 # 1 hour window size
tp3_start_ts = 1422849600
tp3_end_ts = tp3_window_size * math.ceil(globalMaxTs/(tp3_window_size*1.0))
tp3_test_p3_s1_features,tp3_test_p3_s1_labels = getFeaturesFromFile('test_data/'+file,tp3_start_ts,tp3_end_ts,tp3_window_size)
print(len(tp3_test_p3_s1_features))
save_object(tp3_test_p3_s1_features, "q14_tp3_test_p3_s1_features")
save_object(tp3_test_p3_s1_labels, "q14_tp3_test_p3_s1_labels")
print("Finished Building test feature vectors for time period 3 sample 1")

6
Finished Building test feature vectors for time period 3 sample 1


### Building test set for Timeperiod 3(sample 2)

In [62]:
import statsmodels.api as sm

file = 'sample2_period3.txt'

tp3_window_size = 3600 # 1 hour window size
tp3_start_ts = 1422849600
tp3_end_ts = tp3_window_size * math.ceil(globalMaxTs/(tp3_window_size*1.0))
tp3_test_p3_s2_features,tp3_test_p3_s2_labels = getFeaturesFromFile('test_data/'+file,tp3_start_ts,tp3_end_ts,tp3_window_size)
print(len(tp3_test_p3_s2_features))
save_object(tp3_test_p3_s2_features, "q14_tp3_test_p3_s2_features")
save_object(tp3_test_p3_s2_labels, "q14_tp3_test_p3_s2_labels")
print("Finished Building test feature vectors for time period 3 sample 2")

6
Finished Building test feature vectors for time period 3 sample 2


#### Training Linear Regression Model

In [63]:
import statsmodels.api as sm
import statsmodels.tools.eval_measures as ste

print('\nLinear Regression Model for Time period 3')
X_orig = tp3_train_features
y = tp3_train_labels

# https://becominghuman.ai/stats-models-vs-sklearn-for-linear-regression-f19df95ad99b
X = sm.add_constant(X_orig)

model = sm.OLS(y,X)
results = model.fit()


Linear Regression Model for Time period 3


#### Testing Linear Regression Model

In [64]:
##sample 0
pred_y = results.predict(tp3_test_p3_s0_features)
y = tp3_test_p3_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp3_test_p3_s1_features)
y = tp3_test_p3_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp3_test_p3_s2_features)
y = tp3_test_p3_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 14853.867722326375
[ 69.98283104  89.23745877 145.55940581 155.14803824 206.22973913
 241.20994798]

Sample 1:
 MSE : 251157.80755276975
[713.62824577 777.67642828 778.95562111  13.89773527  56.60245611
  89.88113324]

Sample 2:
 MSE : 437919.34939268
[633.10659979 678.57527832 678.95785623 713.62824577 777.67642828
 778.95562111]
------------------------------------------------------------





#### Training Random forest

In [65]:
# {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}

from sklearn.ensemble import GradientBoostingRegressor

# {'max_depth': 60, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
# {'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}

print('\nRandom forest Model for Time period 3')
X = tp3_train_features
y = tp3_train_labels
model = GradientBoostingRegressor(max_depth=10,max_features='auto',min_samples_leaf=1,min_samples_split=10,n_estimators=200)
results = model.fit(X,y)


Random forest Model for Time period 3


#### Testing Random Forest

In [66]:
##sample 0
pred_y = results.predict(tp3_test_p3_s0_features)
y = tp3_test_p3_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp3_test_p3_s1_features)
y = tp3_test_p3_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp3_test_p3_s2_features)
y = tp3_test_p3_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 846.331871640485
[67.64548512 64.54813869 52.84695803 54.84532447 65.10414108 52.84695803]

Sample 1:
 MSE : 219792.98579223524
[1234.04437618   53.14238311   37.90781635   22.96061632   54.84532447
   48.00385414]

Sample 2:
 MSE : 219671.5664696974
[  66.14743631   52.32299923   65.6109342  1234.04437618   53.14238311
   37.90781635]
------------------------------------------------------------





#### Training Neural Network

In [67]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

X = tp3_train_features
y = tp3_train_labels
nn = MLPRegressor(hidden_layer_sizes=(200,200),max_iter=500)
results = nn.fit(X,y)

#### Testing Neural Network

In [68]:
##sample 0
pred_y = results.predict(tp3_test_p3_s0_features)
y = tp3_test_p3_s0_labels
print("\nSample 0:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 1
pred_y = results.predict(tp3_test_p3_s1_features)
y = tp3_test_p3_s1_labels
print("\nSample 1:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)

##sample 2
pred_y = results.predict(tp3_test_p3_s2_features)
y = tp3_test_p3_s2_labels
print("\nSample 2:\n MSE : {}".format(ste.mse(pred_y, y,axis=0)))
print(pred_y)
print('---'*20)
print('\n\n')


Sample 0:
 MSE : 684099756.299843
[-63424.51208286  -2041.40480846  -1397.65843932  -1769.20167756
  -7692.13524975  -2296.54626633]

Sample 1:
 MSE : 5757993790.952148
[-182664.00795983  -33736.04092973    -645.94643409   -1027.44448818
   -2294.23713599   -1389.48522874]

Sample 2:
 MSE : 5795040167.005656
[  -2214.53237908  -14645.66951      -3086.75441492 -182664.00795983
  -33736.04092973    -645.94643409]
------------------------------------------------------------



