In [None]:
import pandas as pd

import sklearn.metrics
from sklearn.datasets import make_regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [None]:
# Input files, resolved relative to the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
raw_pickle_path = 'dataraw_to2018.pkl'
gdp_csv_path = 'GDP.csv'

data = pd.read_pickle(raw_pickle_path)
gdp = pd.read_csv(gdp_csv_path)

# Preview the first rows of the GDP table.
gdp.head()

In [None]:
# Split the DATE strings into 'year' (characters 0-3) and 'month'
# (characters 5-6); both stay strings at this point.
# Presumably DATE is ISO formatted ('YYYY-MM-DD') -- TODO confirm against GDP.csv.
# Vectorised .str slicing replaces the per-row apply(lambda ...) calls and
# yields NaN for a missing DATE instead of raising TypeError.
gdp['year'] = gdp['DATE'].str[:4]
gdp['month'] = gdp['DATE'].str[5:7]
gdp.head()

In [None]:
# Keep only the rows whose month string is '10'
# (presumably this leaves one GDP observation per year -- verify against GDP.csv).
gdp = gdp.loc[gdp['month'].eq('10')]

In [None]:
# Reduce the GDP table to the merge key and the value, and make the key an
# integer.  Selecting and casting in one chained expression returns a fresh
# frame, so no column is assigned on a slice of the filtered table (the
# original assignment triggered pandas' SettingWithCopyWarning), and astype
# replaces the per-row apply(lambda x: int(x)).
gdp = gdp[['year', 'GDP']].astype({'year': 'int64'})

In [None]:
# Shape of `data` before the GDP merge in the next cell (reference point for
# checking row/column counts afterwards).
data.shape

In [None]:
# Attach the GDP value to every row of `data` by matching on 'year'.
# how='left' keeps all rows of `data`; a year with no GDP entry gets NaN.
# validate='many_to_one' makes pandas raise MergeError if `gdp` holds more
# than one row for a year, which would otherwise silently duplicate rows.
# NOTE(review): re-running this cell merges GDP a second time (suffixing the
# column names); restart and run all cells in order to rebuild `data`.
data = data.merge(gdp, on='year', how='left', validate='many_to_one')

In [None]:
# Columns are selected by position:
#   features = columns 3 .. third-from-last, plus the last column
#              (the last column is the GDP column appended by the merge,
#              assuming `data` had no GDP column of its own -- TODO confirm);
#   target   = the second-to-last column.
X = data.iloc[:, 3:-2].join(data.iloc[:, -1])

# Take the target as a 1-D Series rather than a one-column DataFrame:
# scikit-learn regressors expect y of shape (n_samples,) and emit a
# DataConversionWarning on every fit() when handed a column vector.
y = data.iloc[:, -2]

In [None]:
# Standardise every feature column, then hold out 25% of the rows for testing
# (fixed seed so the split is reproducible).
# NOTE(review): the scaler is fitted on all rows before the split, so the
# test rows contribute to the scaling statistics; fitting on the training
# rows only would avoid that leakage.
feature_scaler = StandardScaler()
X_norm = feature_scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_norm,
    y,
    test_size=0.25,
    random_state=42,
)



In [None]:
# Baseline random forest: tree depth capped at 10, fixed seed, every other
# setting left at the library default.  fit() returns the estimator itself,
# so the trailing expression displays the same repr as before.
regr = RandomForestRegressor(random_state=0, max_depth=10).fit(X_train, y_train)
regr

In [None]:
# Evaluate on the held-out test rows; score() returns R^2 for
# scikit-learn regressors.
score = regr.score(X_test, y_test)
print(f"Random Forest Regressor Test score: {score}\n")

## 0.5669654667807917

In [None]:
# parameters = {'n_estimators': [100, 500, 1000],
#                 'max_depth':[5, 10, 15], 
#                 'max_features': ['auto', 'sqrt', 'log2'],
#                 'bootstrap': [True, False]}
# model = RandomForestRegressor()
# grid = GridSearchCV(model, parameters)
# grid.fit(X, y)

In [None]:
# grid.cv_results_

In [None]:
# grid.best_estimator_

## Recorded output of grid.best_estimator_:
## RandomForestRegressor(bootstrap=False, max_depth=15, max_features='sqrt',
##                       n_estimators=1000)

In [None]:
# Refit the random forest with the hyper-parameters noted from the grid
# search result, plus a fixed seed.  This rebinds `regr`, replacing the
# baseline model.
tuned_rf_params = {
    'n_estimators': 1000,
    'max_depth': 15,
    'max_features': 'sqrt',
    'bootstrap': False,
    'random_state': 0,
}
regr = RandomForestRegressor(**tuned_rf_params)
regr.fit(X_train, y_train)

In [None]:
# Test-set score (R^2 for scikit-learn regressors) of the refitted forest.
score = regr.score(X_test, y_test)
print(f"Random Forest Regressor Test score: {score}\n")

## 0.5554493055329761

In [None]:
## Try an extra-trees regressor as an alternative to the random forest.
# ExtraTreesRegressor is imported in the notebook's import cell instead of
# mid-notebook, so every dependency is visible at the top of the file.
et = ExtraTreesRegressor(n_estimators=100, random_state=0)
et.fit(X_train, y_train)

# Test-set score (R^2 for scikit-learn regressors), shown as the cell output.
et.score(X_test, y_test)

## 0.6272394296098471

In [None]:
# parameters = {'n_estimators': [100, 500, 1000],
#                 'max_depth':[5, 10, 15], 
#                 'max_features': ['auto', 'sqrt', 'log2'],
#                 'bootstrap': [True, False]}
# model = ExtraTreesRegressor()
# grid_et = GridSearchCV(model, parameters)
# grid_et.fit(X, y)

In [None]:
# grid_et.best_estimator_

## Recorded output of grid_et.best_estimator_:
## ExtraTreesRegressor(max_depth=15, n_estimators=500)

In [None]:
# Refit extra-trees with the settings noted from the grid search result
# (500 trees, depth capped at 15) and a fixed seed; rebinds `et`.
et = ExtraTreesRegressor(random_state=0, max_depth=15, n_estimators=500)
et.fit(X_train, y_train)

# Test-set score, shown as the cell output.
et.score(X_test, y_test)

## 0.6294494812587801

In [None]:
# Separate pickle used further down as a validation set.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
validation_pickle_path = 'dataraw_2019.pkl'
df2019 = pd.read_pickle(validation_pickle_path)
df2019.head()

In [None]:
# Show the GDP value stored for year 2019 (single .loc lookup instead of
# chained [] indexing).
gdp.loc[gdp['year'] == 2019, 'GDP']

In [None]:
# Give every 2019 row the 2019 GDP value, appended as the last column.
# The value is read from `gdp` rather than pasted in as a literal (the cell
# previously hard-coded 21694.458), so it cannot drift from GDP.csv.
gdp_2019 = gdp.loc[gdp['year'] == 2019, 'GDP']
if len(gdp_2019) != 1:
    # Fail loudly instead of broadcasting a missing or ambiguous value.
    raise ValueError(
        f"expected exactly one GDP row for 2019, found {len(gdp_2019)}"
    )
df2019['GDP'] = gdp_2019.iloc[0]

In [None]:
# Build the validation features/target with the same positional column
# selection that was used for X and y.
X_val_raw = df2019.iloc[:, 3:-2].join(df2019.iloc[:, -1])

# Both models were fitted on standardised features (X_norm), so the
# validation features must go through the same transformation; scoring the
# raw values feeds the trees inputs on a different scale from the one they
# were trained on.  A scaler fitted on X reproduces the statistics used for
# X_norm.  Columns are matched by position, as they were for training --
# TODO confirm df2019 has the same column layout as `data`.
X_val = StandardScaler().fit(X.to_numpy()).transform(X_val_raw.to_numpy())

y_val = df2019.iloc[:, -2:-1]

In [None]:
# Score the random forest on the validation set built from df2019
# (score() returns R^2 for scikit-learn regressors).
score = regr.score(X_val, y_val)
print(f"Random Forest Regressor Validation score: {score}\n")

## -0.16832835442893224

In [None]:
# Same check for the extra-trees model: score() on the validation
# features/target built from df2019, shown as the cell output.
et.score(X_val, y_val)

## -0.00022179764539975722

In [None]:
# Map each feature column to the random forest's importance for it, then
# print the mapping ordered from least to most important.
rf_importances = dict(zip(X.columns, regr.feature_importances_))

print(dict(sorted(rf_importances.items(), key=lambda item: item[1])))

In [None]:
# Map each feature column to the extra-trees model's importance for it, then
# print the mapping ordered from least to most important.
et_importances = dict(zip(X.columns, et.feature_importances_))

print(dict(sorted(et_importances.items(), key=lambda item: item[1])))