In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

In [33]:
file = open("../../data/features/2019_05_07_all_features.pickle",'rb')
df = pickle.load(file, encoding='utf-8')
df.columns

Index(['id', 'Datetime', 'text', 'sentiment', 'subjectivity', 'retweets',
       'favorites', 'isreply', 'year', 'month', 'day', 'date', 'DayofWeek',
       'Weekend', 'Time', 'BinaryTrading', 'DayDistance', 'Hour',
       'Time_Difference', 'gtrend', 'countComment', 'posCommentAmount',
       'negCommentAmount', 'neuCommentAmount', 'posCommentSum',
       'negCommentSum', 'CommentSD', 'posCommentSD', 'negCommentSD'],
      dtype='object')

In [34]:
df.shape

(854, 29)

In [35]:
print(df.isnull().sum())

id                  0
Datetime            0
text                0
sentiment           0
subjectivity        0
retweets            0
favorites           0
isreply             0
year                0
month               0
day                 0
date                0
DayofWeek           0
Weekend             0
Time                0
BinaryTrading       0
DayDistance         0
Hour                0
Time_Difference     0
gtrend              0
countComment        0
posCommentAmount    0
negCommentAmount    0
neuCommentAmount    0
posCommentSum       0
negCommentSum       0
CommentSD           0
posCommentSD        0
negCommentSD        0
dtype: int64


In [10]:
# drop one observation with NaN 'employer_yr_established'
df = df[np.isfinite(df['That_hour_residual'])]

In [11]:
df[['origin_x','origin_y']].head()

Unnamed: 0,origin_x,origin_y
0,1.117554e+18,1117553530615648256
1,1.116887e+18,1116886883903078401
2,1.11751e+18,1117510077030834176
3,1.116887e+18,1116886883903078401
4,1.117443e+18,1117442574615183362


In [12]:
df[['posCommentAmount', 'negCommentAmount', 'neuCommentAmount',
       'posCommentSum', 'negCommentSum', 'CommentSD', 'posCommentSD',
       'negCommentSD']].head()

Unnamed: 0,posCommentAmount,negCommentAmount,neuCommentAmount,posCommentSum,negCommentSum,CommentSD,posCommentSD,negCommentSD
0,153,74,500,50.753336,-20.457826,0.211838,0.228215,0.21688
1,63,30,59,18.228486,-4.240006,0.237073,0.234062,0.131819
2,137,89,193,35.958364,-18.38285,0.22834,0.209668,0.194167
3,34,22,47,9.681724,-3.011236,0.212194,0.216935,0.120033
4,62,28,81,20.153503,-5.281033,0.252651,0.241471,0.19637


In [13]:
df['Time_Difference'].describe()

count     529.000000
mean     2017.733459
std         0.901827
min      2016.000000
25%      2017.000000
50%      2018.000000
75%      2018.000000
max      2019.000000
Name: Time_Difference, dtype: float64

In [27]:
# prepare X and Y

X = df[['sentiment', 'subjectivity',
        'DayofWeek', 'Weekend', 'TradingHour', 'DayDistance', 
        'gtrend', 
        'countComment', 'CommentSD', 'neuCommentAmount',
        'posCommentAmount', 'posCommentSum', 'posCommentSD',
        'negCommentAmount', 'negCommentSum',  'negCommentSD']]

Y = df['That_hour_volume_perc']
#Y = df[['That_hour_volume', 'That_hour_volume_perc', 'That_hour_residual']]

In [23]:
num_features = len(X.columns)

In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=100)

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(476, 16) (476,)
(53, 16) (53,)


### 1. Linear Regression

In [28]:
linear_reg = LinearRegression()
linear_reg.fit(X_train, Y_train)

print("Linear Reg R^2 training is: {}".format(linear_reg.score(X_train, Y_train)))

print("Linear Reg Train RMSE is: {}".format(np.sqrt(mean_squared_error(Y_train, linear_reg.predict(X_train)))))
print("Linear Reg Test RMSE is: {}".format(np.sqrt(mean_squared_error(Y_test, linear_reg.predict(X_test)))))

Linear Reg R^2 training is: 0.31492609593309706
Linear Reg Train RMSE is: 0.0997545518677333
Linear Reg Test RMSE is: 0.09194339345354793


### 2a. Simple Random Forest

In [29]:
# Random Forest
random_forest = RandomForestRegressor(n_estimators=100)     # instantiate
random_forest.fit(X_train, Y_train)                         # fit

print("Random Forest R^2 training is: {}".format(random_forest.score(X_train, Y_train)))                

print("Random Forest Train RMSE is: {}".format(np.sqrt(mean_squared_error(Y_train, random_forest.predict(X_train)))))
print("Random Forest Test RMSE is: {}".format(np.sqrt(mean_squared_error(Y_test, random_forest.predict(X_test)))))

Random Forest R^2 training is: 0.8952098212720323
Random Forest Train RMSE is: 0.039014343213655295
Random Forest Test RMSE is: 0.09547893014691154


### 2b. Random Forest with Cross Validation

In [66]:
rf_cv = RandomForestRegressor(n_estimators=1000, max_depth=70, max_features=np.sqrt(num_features), random_state=10)

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)]
max_features = [10, 20, 30]
max_depth = [int(x) for x in np.linspace(10, 50, num = 5)]

# create the hyperparameter grid
hyperparam_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth}

print(hyperparam_grid)

{'n_estimators': [200, 400, 600, 800, 1000], 'max_features': [10, 20, 30], 'max_depth': [10, 20, 30, 40, 50]}


In [68]:
rf_grid = GridSearchCV(estimator = rf_cv, 
                       param_grid = hyperparam_grid,
                       cv = 5,
                       verbose=2,
                       n_jobs = -1)

# fit the grid search model
rf_grid.fit(X_train, Y_train)

# best hyperparameters
rf_grid.best_params_

Fitting 5 folds for each of 75 candidates, totalling 375 fits
[CV] max_depth=10, max_features=10, n_estimators=200 .................


[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .. max_depth=10, max_features=10, n_estimators=200, total=   0.8s
[CV] max_depth=10, max_features=10, n_estimators=200 .................


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV] .. max_depth=10, max_features=10, n_estimators=200, total=   0.8s
[CV] max_depth=10, max_features=10, n_estimators=200 .................
[CV] .. max_depth=10, max_features=10, n_estimators=200, total=   0.8s
[CV] max_depth=10, max_features=10, n_estimators=200 .................
[CV] .. max_depth=10, max_features=10, n_estimators=200, total=   0.8s
[CV] max_depth=10, max_features=10, n_estimators=200 .................
[CV] .. max_depth=10, max_features=10, n_estimators=200, total=   0.8s
[CV] max_depth=10, max_features=10, n_estimators=400 .................
[CV] .. max_depth=10, max_features=10, n_estimators=400, total=   1.5s
[CV] max_depth=10, max_features=10, n_estimators=400 .................
[CV] .. max_depth=10, max_features=10, n_estimators=400, total=   1.6s
[CV] max_depth=10, max_features=10, n_estimators=400 .................
[CV] .. max_depth=10, max_features=10, n_estimators=400, total=   1.6s
[CV] max_depth=10, max_features=10, n_estimators=400 .................
[CV] .



ValueError: max_features must be in (0, n_features]

In [None]:
# use optimal hyperparameters
rf_cv = RandomForestRegressor(n_estimators=1550, max_depth=70, max_features=5, random_state=10)
rf_cv.fit(X_train, Y_train)                         # fit

print("Random Forest R^2 training is: {}".format(rf_cv.score(X_train, Y_train)))                

print("Random Forest Train RMSE is: {}".format(np.sqrt(mean_squared_error(Y_train, rf_cv.predict(X_train)))))
print("Random Forest Test RMSE is: {}".format(np.sqrt(mean_squared_error(Y_test, rf_cv.predict(X_test)))))

### 3. Xtreme Gradient Boosting

Parameter Reference: https://www.datacamp.com/community/tutorials/xgboost-in-python

Default Hyperparams:  
 - XGB Train RMSE is: 0.0627184664815691  
 - XGB Test RMSE is: 0.10047705976306533

In [53]:
xgb = XGBRegressor(objective ='reg:linear', 
                   colsample_bytree = 0.3, 
                   learning_rate = 0.1,
                   max_depth = 5, 
                   alpha = 10, 
                   n_estimators = 40)

xgb.fit(X_train, Y_train)

#print("XGB R^2 training is: {}".format(xgb.score(X_train, Y_train)))                

print("XGB Train RMSE is: {}".format(np.sqrt(mean_squared_error(Y_train, xgb.predict(X_train)))))
print("XGB Test RMSE is: {}".format(np.sqrt(mean_squared_error(Y_test, xgb.predict(X_test)))))

XGB Train RMSE is: 0.055905365001703025
XGB Test RMSE is: 0.09439305722879879


  if getattr(data, 'base', None) is not None and \
