# Imports

In [1]:
import pandas as pd
import numpy as np
np.random.seed(2121)

from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [2]:
# Read the preprocessed train/test feature and target CSVs for both datasets.
# Each group is read in the order X_train, y_train, X_test, y_test.
rdf_X_train, rdf_y_train, rdf_X_test, rdf_y_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/rdf_x_train_preprocessed.csv',
                     '../data/rdf_y_train_preprocessed.csv',
                     '../data/rdf_x_test_preprocessed.csv',
                     '../data/rdf_y_test_preprocessed.csv')
)

ydf_X_train, ydf_y_train, ydf_X_test, ydf_y_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/ydf_x_train_preprocessed.csv',
                     '../data/ydf_y_train_preprocessed.csv',
                     '../data/ydf_x_test_preprocessed.csv',
                     '../data/ydf_y_test_preprocessed.csv')
)

In [3]:
# Use the 'Unnamed: 0' column as the index of every frame and clear the index name.
# NOTE(review): 'Unnamed: 0' is presumably the index written out when the CSVs
# were saved -- confirm against the preprocessing notebook.
for frame in [rdf_X_train, rdf_y_train, rdf_X_test, rdf_y_test,
              ydf_X_train, ydf_y_train, ydf_X_test, ydf_y_test]:

    # set_index(..., inplace=True) consumes the column, so running this cell a
    # second time would raise KeyError. The guard makes the cell safe to re-run.
    if 'Unnamed: 0' in frame.columns:
        frame.set_index('Unnamed: 0', inplace=True)
    frame.index.name = None

# Linear Regression

## Linear Regression (no regularization)

In [4]:
# Plain linear regression (no regularization) on the rdf training split.
lr_rdf = LinearRegression(n_jobs=-1)
lr_rdf.fit(X=rdf_X_train, y=rdf_y_train)

# Report the score on the fitted data first, then on the held-out test split.
lr_rdf_train_score = lr_rdf.score(rdf_X_train, rdf_y_train)
print(f'Train Score: {lr_rdf_train_score:.4f}')
lr_rdf_test_score = lr_rdf.score(rdf_X_test, rdf_y_test)
print(f'Test Score: {lr_rdf_test_score:.4f}')

Train Score: 0.3709
Test Score: 0.3486


In [5]:
# Plain linear regression (no regularization) on the ydf training split.
lr_ydf = LinearRegression(n_jobs=-1)
lr_ydf.fit(X=ydf_X_train, y=ydf_y_train)

# Report the score on the fitted data first, then on the held-out test split.
lr_ydf_train_score = lr_ydf.score(ydf_X_train, ydf_y_train)
print(f'Train Score: {lr_ydf_train_score:.4f}')
lr_ydf_test_score = lr_ydf.score(ydf_X_test, ydf_y_test)
print(f'Test Score: {lr_ydf_test_score:.4f}')

Train Score: 0.5910
Test Score: 0.5893


## Linear Regression (Lasso)

In [6]:
# Lasso with the regularization strength picked by 5-fold cross-validation.
lasso_cv_rdf = LassoCV(cv=5, max_iter=10000, n_jobs=-1)

# The target frame is flattened to 1-D before fitting.
rdf_y_train_flat = rdf_y_train.values.ravel()
lasso_cv_rdf.fit(rdf_X_train, rdf_y_train_flat)

# Selected alpha, then the score on the training and test splits.
print(f'Best Alpha: {lasso_cv_rdf.alpha_:.4f}')
lasso_rdf_train_score = lasso_cv_rdf.score(rdf_X_train, rdf_y_train)
print(f'Train Score: {lasso_rdf_train_score:.4f}')
lasso_rdf_test_score = lasso_cv_rdf.score(rdf_X_test, rdf_y_test)
print(f'Test Score: {lasso_rdf_test_score:.4f}')

Best Alpha: 0.0008
Train Score: 0.3692
Test Score: 0.3474


In [7]:
# Lasso with the regularization strength picked by 5-fold cross-validation.
lasso_cv_ydf = LassoCV(cv=5, n_jobs=-1)

# The target frame is flattened to 1-D before fitting.
ydf_y_train_flat = ydf_y_train.values.ravel()
lasso_cv_ydf.fit(ydf_X_train, ydf_y_train_flat)

# Selected alpha, then the score on the training and test splits.
print(f'Best Alpha: {lasso_cv_ydf.alpha_:.4f}')
lasso_ydf_train_score = lasso_cv_ydf.score(ydf_X_train, ydf_y_train)
print(f'Train Score: {lasso_ydf_train_score:.4f}')
lasso_ydf_test_score = lasso_cv_ydf.score(ydf_X_test, ydf_y_test)
print(f'Test Score: {lasso_ydf_test_score:.4f}')

Best Alpha: 0.0003
Train Score: 0.5910
Test Score: 0.5893


# Random Forest

In [60]:
# Hyper-parameter values searched for the random forest (4 x 4 x 4 = 64 combinations).
param_grid = dict(
    max_features=[5, 10, 15, 25],
    max_depth=[3, 5, 10, None],
    min_samples_leaf=[1, 2, 3, 5],
)

In [61]:
# Base random forest handed to the grid search; the seed is fixed so runs are repeatable.
rf_rdf = RandomForestRegressor(n_estimators=100, random_state=1212, n_jobs=-1)

In [62]:
# Exhaustive search over param_grid for the random forest, logging every fold.
clf = GridSearchCV(estimator=rf_rdf, param_grid=param_grid, verbose=3)

In [63]:
# Run the search on the rdf training split (target flattened to 1-D).
clf.fit(X=rdf_X_train, y=rdf_y_train.values.ravel())

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV 1/5] END max_depth=3, max_features=5, min_samples_leaf=1;, score=0.208 total time=   6.1s
[CV 2/5] END max_depth=3, max_features=5, min_samples_leaf=1;, score=0.217 total time=   0.2s
[CV 3/5] END max_depth=3, max_features=5, min_samples_leaf=1;, score=0.225 total time=   0.2s
[CV 4/5] END max_depth=3, max_features=5, min_samples_leaf=1;, score=0.219 total time=   0.2s
[CV 5/5] END max_depth=3, max_features=5, min_samples_leaf=1;, score=0.219 total time=   0.2s
[CV 1/5] END max_depth=3, max_features=5, min_samples_leaf=2;, score=0.208 total time=   0.1s
[CV 2/5] END max_depth=3, max_features=5, min_samples_leaf=2;, score=0.218 total time=   0.2s
[CV 3/5] END max_depth=3, max_features=5, min_samples_leaf=2;, score=0.226 total time=   0.2s
[CV 4/5] END max_depth=3, max_features=5, min_samples_leaf=2;, score=0.219 total time=   0.1s
[CV 5/5] END max_depth=3, max_features=5, min_samples_leaf=2;, score=0.219 total time=   0.2

[CV 2/5] END max_depth=5, max_features=5, min_samples_leaf=2;, score=0.274 total time=   0.2s
[CV 3/5] END max_depth=5, max_features=5, min_samples_leaf=2;, score=0.284 total time=   0.2s
[CV 4/5] END max_depth=5, max_features=5, min_samples_leaf=2;, score=0.275 total time=   0.2s
[CV 5/5] END max_depth=5, max_features=5, min_samples_leaf=2;, score=0.276 total time=   0.2s
[CV 1/5] END max_depth=5, max_features=5, min_samples_leaf=3;, score=0.270 total time=   0.2s
[CV 2/5] END max_depth=5, max_features=5, min_samples_leaf=3;, score=0.274 total time=   0.2s
[CV 3/5] END max_depth=5, max_features=5, min_samples_leaf=3;, score=0.285 total time=   0.2s
[CV 4/5] END max_depth=5, max_features=5, min_samples_leaf=3;, score=0.273 total time=   0.2s
[CV 5/5] END max_depth=5, max_features=5, min_samples_leaf=3;, score=0.276 total time=   0.2s
[CV 1/5] END max_depth=5, max_features=5, min_samples_leaf=5;, score=0.271 total time=   0.2s
[CV 2/5] END max_depth=5, max_features=5, min_samples_leaf=5

[CV 4/5] END max_depth=10, max_features=5, min_samples_leaf=3;, score=0.312 total time=   0.3s
[CV 5/5] END max_depth=10, max_features=5, min_samples_leaf=3;, score=0.317 total time=   0.3s
[CV 1/5] END max_depth=10, max_features=5, min_samples_leaf=5;, score=0.318 total time=   0.3s
[CV 2/5] END max_depth=10, max_features=5, min_samples_leaf=5;, score=0.318 total time=   0.3s
[CV 3/5] END max_depth=10, max_features=5, min_samples_leaf=5;, score=0.326 total time=   0.3s
[CV 4/5] END max_depth=10, max_features=5, min_samples_leaf=5;, score=0.310 total time=   0.3s
[CV 5/5] END max_depth=10, max_features=5, min_samples_leaf=5;, score=0.319 total time=   0.3s
[CV 1/5] END max_depth=10, max_features=10, min_samples_leaf=1;, score=0.336 total time=   0.5s
[CV 2/5] END max_depth=10, max_features=10, min_samples_leaf=1;, score=0.329 total time=   0.5s
[CV 3/5] END max_depth=10, max_features=10, min_samples_leaf=1;, score=0.335 total time=   0.5s
[CV 4/5] END max_depth=10, max_features=10, min

[CV 5/5] END max_depth=None, max_features=5, min_samples_leaf=5;, score=0.320 total time=   0.3s
[CV 1/5] END max_depth=None, max_features=10, min_samples_leaf=1;, score=0.339 total time=   0.8s
[CV 2/5] END max_depth=None, max_features=10, min_samples_leaf=1;, score=0.332 total time=   0.7s
[CV 3/5] END max_depth=None, max_features=10, min_samples_leaf=1;, score=0.329 total time=   0.7s
[CV 4/5] END max_depth=None, max_features=10, min_samples_leaf=1;, score=0.315 total time=   0.7s
[CV 5/5] END max_depth=None, max_features=10, min_samples_leaf=1;, score=0.320 total time=   0.7s
[CV 1/5] END max_depth=None, max_features=10, min_samples_leaf=2;, score=0.331 total time=   0.6s
[CV 2/5] END max_depth=None, max_features=10, min_samples_leaf=2;, score=0.333 total time=   0.6s
[CV 3/5] END max_depth=None, max_features=10, min_samples_leaf=2;, score=0.331 total time=   0.6s
[CV 4/5] END max_depth=None, max_features=10, min_samples_leaf=2;, score=0.315 total time=   0.6s
[CV 5/5] END max_dept

GridSearchCV(estimator=RandomForestRegressor(n_jobs=-1, random_state=1212),
             param_grid={'max_depth': [3, 5, 10, None],
                         'max_features': [5, 10, 15, 25],
                         'min_samples_leaf': [1, 2, 3, 5]},
             verbose=3)

In [64]:
# Parameter combination the random-forest grid search selected as best.
clf.best_params_

{'max_depth': None, 'max_features': 15, 'min_samples_leaf': 5}

In [8]:
# Random forest for rdf, configured with the hyper-parameters the grid search
# reported as best (max_features=15, max_depth=None, min_samples_leaf=5).
rf_rdf = RandomForestRegressor(n_jobs=-1,
                               n_estimators=100,
                               random_state=1212,
                               max_features=15,
                               max_depth=None,
                               min_samples_leaf=5)

# cross_val_score scores each fold on data held out from that fold's fit, so the
# mean is a cross-validation score. It was previously printed as "Train Score",
# which overstated what the number measures.
scores = cross_val_score(rf_rdf, rdf_X_train, rdf_y_train.values.ravel())
print(f'CV Score: {scores.mean():.4f}')

# Refit on the full training split before scoring on the held-out test split.
rf_rdf.fit(rdf_X_train, rdf_y_train.values.ravel())
print(f'Test Score: {rf_rdf.score(rdf_X_test, rdf_y_test.values.ravel()):.4f}')

Train Score: 0.3302
Test Score: 0.3222


In [9]:
# Random forest for ydf with library-default hyper-parameters and a fixed seed.
rf_ydf = RandomForestRegressor(n_jobs=-1,
                               random_state=1212)

# cross_val_score scores each fold on data held out from that fold's fit, so the
# mean is a cross-validation score. It was previously printed as "Train Score",
# which overstated what the number measures.
scores = cross_val_score(rf_ydf, ydf_X_train, ydf_y_train.values.ravel())
print(f'CV Score: {scores.mean():.4f}')

# Refit on the full training split before scoring on the held-out test split.
# The test target is flattened the same way as in the other model cells.
rf_ydf.fit(ydf_X_train, ydf_y_train.values.ravel())
print(f'Test Score: {rf_ydf.score(ydf_X_test, ydf_y_test.values.ravel()):.4f}')

Train Score: 0.9487
Test Score: 0.9475


In [46]:
# Random-forest predictions for the ydf test features, labelled with the index
# of ydf_y_test (assumes the rows of ydf_X_test and ydf_y_test are aligned).
ydf_rf_predictions = rf_ydf.predict(ydf_X_test)
ydf_X_test_rf = pd.DataFrame(data=ydf_rf_predictions, index=ydf_y_test.index)

# SVM

In [32]:
# Quick look at the values a 3-point log-spaced range from 1e-5 to 1e-1 produces.
np.logspace(start=-5, stop=-1, num=3)

array([1.e-05, 1.e-03, 1.e-01])

In [39]:
# Candidate values for the SVR search: three settings per hyper-parameter.
C_range = [.1, 1, 10]
gamma_range = np.logspace(start=-4, stop=-2, num=3)
epsilon_range = np.logspace(start=-2, stop=0, num=3)

In [40]:
# Search space for the SVR: every combination of epsilon, C and gamma (3 x 3 x 3 = 27).
param_grid = {'epsilon': epsilon_range,
              'C': C_range,
              'gamma': gamma_range}

svr_rdf = SVR()

# 5-fold grid search scored by R^2, logging every fold.
grid = GridSearchCV(estimator=svr_rdf,
                    param_grid=param_grid,
                    scoring='r2',
                    cv=5,
                    verbose=3)

# Run the search on the rdf training split (target flattened to 1-D).
grid.fit(X=rdf_X_train, y=rdf_y_train.values.ravel())

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END .C=0.1, epsilon=0.01, gamma=0.0001;, score=0.248 total time=   2.1s
[CV 2/5] END .C=0.1, epsilon=0.01, gamma=0.0001;, score=0.240 total time=   2.1s
[CV 3/5] END .C=0.1, epsilon=0.01, gamma=0.0001;, score=0.257 total time=   2.2s
[CV 4/5] END .C=0.1, epsilon=0.01, gamma=0.0001;, score=0.242 total time=   2.1s
[CV 5/5] END .C=0.1, epsilon=0.01, gamma=0.0001;, score=0.247 total time=   2.0s
[CV 1/5] END ..C=0.1, epsilon=0.01, gamma=0.001;, score=0.370 total time=   2.1s
[CV 2/5] END ..C=0.1, epsilon=0.01, gamma=0.001;, score=0.348 total time=   2.3s
[CV 3/5] END ..C=0.1, epsilon=0.01, gamma=0.001;, score=0.348 total time=   2.2s
[CV 4/5] END ..C=0.1, epsilon=0.01, gamma=0.001;, score=0.337 total time=   2.2s
[CV 5/5] END ..C=0.1, epsilon=0.01, gamma=0.001;, score=0.374 total time=   2.1s
[CV 1/5] END ...C=0.1, epsilon=0.01, gamma=0.01;, score=0.375 total time=   2.2s
[CV 2/5] END ...C=0.1, epsilon=0.01, gamma=0.01

[CV 2/5] END ....C=10, epsilon=0.01, gamma=0.01;, score=0.147 total time=   9.9s
[CV 3/5] END ....C=10, epsilon=0.01, gamma=0.01;, score=0.199 total time=   9.6s
[CV 4/5] END ....C=10, epsilon=0.01, gamma=0.01;, score=0.154 total time=   9.9s
[CV 5/5] END ....C=10, epsilon=0.01, gamma=0.01;, score=0.161 total time=   9.6s
[CV 1/5] END ...C=10, epsilon=0.1, gamma=0.0001;, score=0.375 total time=   0.9s
[CV 2/5] END ...C=10, epsilon=0.1, gamma=0.0001;, score=0.350 total time=   0.9s
[CV 3/5] END ...C=10, epsilon=0.1, gamma=0.0001;, score=0.351 total time=   0.8s
[CV 4/5] END ...C=10, epsilon=0.1, gamma=0.0001;, score=0.346 total time=   0.9s
[CV 5/5] END ...C=10, epsilon=0.1, gamma=0.0001;, score=0.379 total time=   0.9s
[CV 1/5] END ....C=10, epsilon=0.1, gamma=0.001;, score=0.375 total time=   1.2s
[CV 2/5] END ....C=10, epsilon=0.1, gamma=0.001;, score=0.348 total time=   1.1s
[CV 3/5] END ....C=10, epsilon=0.1, gamma=0.001;, score=0.354 total time=   1.2s
[CV 4/5] END ....C=10, epsil

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': [0.1, 1, 10],
                         'epsilon': array([0.01, 0.1 , 1.  ]),
                         'gamma': array([0.0001, 0.001 , 0.01  ])},
             scoring='r2', verbose=3)

In [41]:
# Parameter combination the SVR grid search selected as best.
grid.best_params_

{'C': 1, 'epsilon': 0.1, 'gamma': 0.001}

In [42]:
# SVR for rdf, configured with the hyper-parameters the grid search reported as
# best (C=1, epsilon=0.1, gamma=0.001).
svr_rdf = SVR(C=1, epsilon=0.1, gamma=0.001)

# cross_val_score scores each fold on data held out from that fold's fit, so the
# mean is a cross-validation score. It was previously printed as "Train Score",
# which overstated what the number measures.
scores = cross_val_score(svr_rdf, rdf_X_train, rdf_y_train.values.ravel())
print(f'CV Score: {scores.mean():.4f}')

# Refit on the full training split before scoring on the held-out test split.
svr_rdf.fit(rdf_X_train, rdf_y_train.values.ravel())
print(f'Test Score: {svr_rdf.score(rdf_X_test, rdf_y_test.values.ravel()):.4f}')

Train Score: 0.3608
Test Score: 0.3515


In [44]:
# SVR predictions for the rdf test features, labelled with the index of
# rdf_y_test (assumes the rows of rdf_X_test and rdf_y_test are aligned).
rdf_svr_predictions = svr_rdf.predict(rdf_X_test)
rdf_X_test_svr = pd.DataFrame(data=rdf_svr_predictions, index=rdf_y_test.index)

In [79]:
# SVR for ydf with library-default hyper-parameters.
svr_ydf = SVR()

# cross_val_score scores each fold on data held out from that fold's fit, so the
# mean is a cross-validation score. It was previously printed as "Train Score",
# which overstated what the number measures.
scores = cross_val_score(svr_ydf, ydf_X_train, ydf_y_train.values.ravel())
print(f'CV Score: {scores.mean():.4f}')

# Refit on the full training split before scoring on the held-out test split.
svr_ydf.fit(ydf_X_train, ydf_y_train.values.ravel())
print(f'Test Score: {svr_ydf.score(ydf_X_test, ydf_y_test.values.ravel()):.4f}')

Train Score: 0.8205
Test Score: 0.8126


# Output

In [48]:
# Persist the chosen model's test-set predictions for each dataset
# (SVR for rdf, random forest for ydf).
for prediction_frame, destination in ((rdf_X_test_svr, '../data/rdf_X_test_svr.csv'),
                                      (ydf_X_test_rf, '../data/ydf_X_test_rf.csv')):
    prediction_frame.to_csv(destination)