### QUESTION 11: 
Now try to regress the aggregated data with MLPRegressor. Try different architectures (i.e. the structure of the network) by adjusting hidden_layer_sizes. You should try at least 5 architectures with various numbers of layers and layer sizes. Report the architectures you tried, as well as its MSE of fitting the entire aggregated data.

In [1]:
hash_tags = ['#gohawks','#gopatriots','#nfl','#patriots','#sb49','#superbowl']

In [2]:
import pickle

def save_object(data, fileName):
    with open('pynb_data/'+fileName + ".pickle", 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        
def load_object(fileName):
    try:
        with open('pynb_data/'+fileName + ".pickle", 'rb') as f:
            data = pickle.load(f)
            return data
    except IOError:
        print("Could not read file: " + fileName)

In [3]:
import json

def getMinAndMaxTs(tag):
    filename = 'data/tweets_'+tag+'.txt'
    max_ts = 0
    min_ts = 1552522378
    with open(filename) as f:
        for line in f:
            json_object = json.loads(line)
            timestamp = json_object['citation_date']
            if(timestamp < min_ts):                
                min_ts = timestamp
            
            if(timestamp > max_ts):
                max_ts = timestamp
                
    return [min_ts,max_ts]

tagsToMinTs = {}
tagsToMaxTs = {}
for tag in hash_tags:
    ts_list = getMinAndMaxTs(tag)
    tagsToMinTs[tag] = (ts_list[0])
    tagsToMaxTs[tag] = (ts_list[1])    

In [4]:
import math
import datetime
import pytz


def getLocalHour(timestamp):
    d = datetime.datetime.fromtimestamp(timestamp)
    pst = pytz.timezone('America/Los_Angeles')
    d = pst.localize(d)
    return d.hour

def getWindowNumber(start_ts, curr_ts, window):
    elapsed = (curr_ts - start_ts)/(window*1.0)
    windowNum = math.ceil(elapsed)
    return windowNum    

def getFeatures(start_ts,end_ts,window):
    windowToTweets = {}
    windowToRetweets = {}
    windowToFollowerCount = {}
    windowToMaxFollowers = {}
    features = []
    labels = []
    
    for tag in hash_tags:
        filename = 'data/tweets_'+tag+'.txt'
        with open(filename) as f:
            for line in f:
                json_object = json.loads(line)
                timestamp = json_object['citation_date']
            
                if timestamp < start_ts or timestamp > end_ts:                            
                    continue
                
                key = getWindowNumber(start_ts,timestamp,window)
                if key not in windowToTweets.keys():
                    windowToTweets[key]=0
                windowToTweets[key]+=1
            
                retweetCount = json_object['metrics']['citations']['total']        
            
                if key not in windowToRetweets.keys():
                    windowToRetweets[key]=0
                windowToRetweets[key]+=retweetCount
        
                followerCount = json_object['author']['followers']
                if key not in windowToFollowerCount.keys():
                    windowToFollowerCount[key]=0
                windowToFollowerCount[key]+=followerCount
        
                if key not in windowToMaxFollowers.keys():
                    windowToMaxFollowers[key]=0
                windowToMaxFollowers[key] = max(windowToMaxFollowers[key],followerCount)            
            
    for period in range(start_ts,end_ts,window):
        key = getWindowNumber(start_ts,period,window)
        tweetCount = windowToTweets.get(key, 0)
        retweetCount = windowToRetweets.get(key,0)
        followerCount = windowToFollowerCount.get(key,0)
        maxFollowers = windowToMaxFollowers.get(key,0)

        h = getLocalHour(period)
            
        feature = [tweetCount, retweetCount, followerCount, maxFollowers, h]
        features.append(feature)
                
        nextKey = getWindowNumber(start_ts, period + window, window)
        labels.append(windowToTweets.get(nextKey,0))
                
    return features,labels

In [5]:
min_ts = min(list(tagsToMinTs.values()))
max_ts = max(list(tagsToMaxTs.values()))
tp1_window_size = 3600 
tp1_start_ts = tp1_window_size * math.floor(min_ts/(tp1_window_size*1.0))
tp1_end_ts = tp1_window_size * math.ceil(max_ts/(tp1_window_size*1.0))
features,labels = getFeatures(tp1_start_ts,tp1_end_ts,tp1_window_size)
save_object(features,"q11_features")
save_object(labels,"q11_labels")
print("Done building feature vectors")

Done building feature vectors


### Architecture 1:

* No. of hidden layers = 2 
* No. of neurons in hidden layer = 50

In [6]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

nn = MLPRegressor(hidden_layer_sizes=(50,50 ))
nn.fit(features, labels)
train_predict = nn.predict(features)
print('For architecture 1 :')
print('    MSE train:'+ str(mean_squared_error(train_predict,labels)))
minmse = mean_squared_error(train_predict,labels)

For architecture 1 :
    MSE train:1025879749.074185




### Architecture 2:

* No. of hidden layers = 4
* No. of neurons in hidden layer = 100, 100, 50, 50

In [7]:
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

nn = MLPRegressor(hidden_layer_sizes=(100,100,50,50 ))
nn.fit(features, labels)
train_predict = nn.predict(features)
print('For architecture 2:')
print('    MSE train:'+ str(mean_squared_error(train_predict,labels)))
minmse = min(minmse,mean_squared_error(train_predict,labels))

For architecture 2:
    MSE train:1262779447.9312394


### Architecture 3:

* No. of hidden layers = 6
* No. of neurons in hidden layer = 200, 200, 100, 100, 50, 50

In [8]:
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

nn = MLPRegressor(hidden_layer_sizes=(200, 200, 100, 100, 50, 50 ))
nn.fit(features, labels)
train_predict = nn.predict(features)
print('For architecture 3:')
print('    MSE train:'+ str(mean_squared_error(train_predict,labels)))
minmse = min(minmse,mean_squared_error(train_predict,labels))

For architecture 3:
    MSE train:1403523405791.3384


### Architecture 4:

* No. of hidden layers = 8
* No. of neurons in hidden layer = 800,700,600,500,400,300,200,100

In [9]:
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

nn = MLPRegressor(hidden_layer_sizes=(800,700,600,500,400,300,200,100 ))
nn.fit(features, labels)
train_predict = nn.predict(features)
print('For architecture 4:')
print('    MSE train:'+ str(mean_squared_error(train_predict,labels)))
minmse = min(minmse,mean_squared_error(train_predict,labels))

For architecture 4:
    MSE train:397760478.2553001


### Architecture 5:

* No. of hidden layers = 10
* No. of neurons in hidden layer = 1000,500,250,125,63,30,15,8,4,2

In [10]:
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

nn = MLPRegressor(hidden_layer_sizes=(1000,500,250,125,63,30,15,8,4,2 ))
nn.fit(features, labels)
train_predict = nn.predict(features)
print('For architecture 5:')
print('    MSE train:'+ str(mean_squared_error(train_predict,labels)))
minmse = min(minmse,mean_squared_error(train_predict,labels))

For architecture 5:
    MSE train:817676426.0232316




### Architecture 6:

* No. of hidden layers = 3
* No. of neurons in hidden layer = 50,50,50

In [11]:
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

nn = MLPRegressor(hidden_layer_sizes=(50, 50, 50 ))
nn.fit(features, labels)
train_predict = nn.predict(features)
print('For architecture 6:')
print('    MSE train:'+ str(mean_squared_error(train_predict,labels)))
minmse = min(minmse,mean_squared_error(train_predict,labels))

For architecture 6:
    MSE train:11238562630.030281


### Architecture 7:

* No. of hidden layers = 3
* No. of neurons in hidden layer = 100,100,100

In [12]:
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error


nn = MLPRegressor(hidden_layer_sizes=(100, 100, 100 ))
nn.fit(features, labels)
train_predict = nn.predict(features)
print('For architecture 7:')
print('    MSE train:'+ str(mean_squared_error(train_predict,labels)))
minmse = min(minmse,mean_squared_error(train_predict,labels))

For architecture 7:
    MSE train:3715327038083.21


### Architecture 8:

* No. of hidden layers = 3
* No. of neurons in hidden layer = 1000,100,10

In [13]:
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

nn = MLPRegressor(hidden_layer_sizes=(1000, 100, 10 ))
nn.fit(features, labels)
train_predict = nn.predict(features)
print('For architecture 8:')
print('    MSE train:'+ str(mean_squared_error(train_predict,labels)))

minmse = min(minmse,mean_squared_error(train_predict,labels))

For architecture 8:
    MSE train:817689466.367445


In [14]:
print(minmse)

397760478.2553001


In [15]:
print("Best score found for architecture 4")
print("* No. of hidden layers = 8 \n* No. of neurons in hidden layer = 800,700,600,500,400,300,200,100")

Best score found for architecture 4
* No. of hidden layers = 8 
* No. of neurons in hidden layer = 800,700,600,500,400,300,200,100


### QUESTION 12: 

Use StandardScaler to scale the data before feeding it to MLPRegressor (with the best architecture you got above). Does its performance increase?

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

nn = MLPRegressor(hidden_layer_sizes=(800,700,600,500,400,300,200,100 ))
nn.fit(scaled_features, labels)
train_predict = nn.predict(scaled_features)
print("After scaling, architecture 4")
print('    MSE train:'+ str(mean_squared_error(train_predict,labels)))
minmse = min(minmse,mean_squared_error(train_predict,labels))

After scaling, architecture 4
    MSE train:20006205.59679127


In [17]:
print(minmse)

20006205.59679127


In [19]:
print("Magnitude of improvement is "+ str(397760478.2553001 - 20006205.59679127))

Magnitude of improvement is 377754272.65850884
