In [3]:
import numpy as np
import pandas as pd
np.random.seed(2121)

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [5]:
# Read the six CSV inputs (features, targets, baseline predictions) in one
# pass; each file keeps its row labels in the 'Unnamed: 0' column, which is
# promoted to the index.
X_train, X_test, y_train, y_test, base_train, base_test = (
    pd.read_csv(csv_path).set_index('Unnamed: 0')
    for csv_path in (
        '../../data/x_train_preprocessed.csv',
        '../../data/x_test_preprocessed.csv',
        '../../data/y_train.csv',
        '../../data/y_test.csv',
        '../../data/baseline_train.csv',
        '../../data/baseline_test.csv',
    )
)

In [6]:
# Drop the leftover index label on every loaded frame so it does not show up
# in displays.
for frame in (X_train, X_test, y_train, y_test, base_train, base_test):
    frame.index.name = None

In [7]:
# R^2 of the baseline predictions against the test targets; the models below
# are compared to this number.
r2_score(y_true=y_test, y_pred=base_test)

0.2998481414188715

In [8]:
# Display the training feature frame as a visual sanity check.
X_train

Unnamed: 0,year_id,age,waa_pg_1yr,waa_pg_2yr,waa_pg_3yr,team_gp_3yr,inn_pg_1yr,runs_bat_pg_1yr,runs_bat_pg_2yr,runs_bat_pg_3yr,...,g_dh_share_1yr,g_dh_share_2yr,g_dh_share_3yr,pc_0,pc_1,pc_2,pc_3,pc_4,pc_5,pc_6
2021,-1.673800,0.170476,1.036871,0.033849,-0.204624,0.251271,-1.283726,0.801821,-0.050087,0.702537,...,-0.306257,-0.281748,-0.269488,5.831705,-1.794716,2.350115,0.961168,1.428859,-1.651573,-0.184321
2334,-1.870756,0.750317,-0.583988,0.123513,0.433228,0.251271,-1.365044,-0.241208,0.016080,0.196966,...,-0.306257,-0.281748,-0.269488,2.891187,-1.163036,-1.544370,-1.411330,-0.918092,1.019907,-1.150856
167,0.952289,-0.699287,-1.228515,0.704435,-0.328919,0.251271,0.487675,-0.043979,1.145227,-0.537878,...,-0.306257,-0.281748,-0.269488,-2.651506,1.967115,-0.622855,1.771594,-1.704079,-0.365019,0.319205
1070,-1.870756,0.460397,-1.400310,-0.607957,0.817766,0.251271,0.523218,-0.774254,-0.643920,0.320787,...,-0.306257,-0.281748,-0.269488,-1.680922,-2.048990,-0.810785,1.133113,-1.537679,0.193505,0.043850
4349,0.558375,0.460397,-0.993270,-0.458138,-0.413009,0.155255,0.834774,-1.359065,-0.646386,-0.015492,...,0.274963,-0.237298,-0.269488,-3.783996,3.978561,-0.760553,-0.407349,-1.821573,0.242835,-0.350231
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3213,-1.017278,-0.989208,-0.875004,0.643226,1.088266,0.251271,-0.442153,-0.037916,1.319862,1.423243,...,-0.306257,-0.281748,-0.269488,0.582370,-1.298605,-0.907015,0.354861,-1.380179,-0.281663,0.529179
1125,-1.870756,-0.119445,1.599090,0.668580,0.251219,0.155255,0.846228,1.246760,0.865330,-0.217631,...,-0.306257,-0.281748,-0.269488,-1.320241,-2.727891,3.369144,0.775558,0.610927,-0.853164,-0.352463
2626,-0.492060,2.779763,-0.818940,-1.565914,-2.559286,0.251271,-1.461607,0.437881,-0.954095,-1.581492,...,2.160961,6.544147,4.084106,1.566285,1.107926,1.735406,1.872444,-0.808249,0.150345,0.206176
1690,0.033158,-0.409366,-0.040957,0.995936,0.755840,0.251271,-0.322957,-1.103126,0.207144,0.025843,...,-0.306257,-0.281748,-0.269488,-1.735564,-2.149114,-0.112767,0.527561,2.639416,1.166603,-0.565456


# Linear Regression

In [9]:
# Linear regression model constructed with scikit-learn's default settings.
lr = LinearRegression()

In [10]:
# Fit the linear model on the training features and targets.
lr.fit(X=X_train, y=y_train)

LinearRegression()

In [11]:
# Linear-model predictions for the held-out test features.
y_hat = lr.predict(X=X_test)

In [12]:
# Test-set score (R^2) of the fitted linear model.
lr.score(X=X_test, y=y_test)

0.36503329044763344

# Random Forest

In [9]:
# Random forest regressor constructed with scikit-learn's default settings.
rf = RandomForestRegressor()

In [10]:
# Fit the forest on a flattened 1-D target. Passing the target DataFrame
# directly makes scikit-learn emit a DataConversionWarning (a column vector
# was given where a 1-D array is expected); flattening mirrors how the SVR
# cells pass the target.
rf.fit(X_train, y_train.values.ravel())

  rf.fit(X_train,y_train)


RandomForestRegressor()

In [11]:
# Random-forest predictions for the held-out test features.
y_hat_rf = rf.predict(X=X_test)

In [12]:
# Test-set R^2 of the random-forest predictions.
r2_score(y_true=y_test, y_pred=y_hat_rf)

0.3623127255317514

In [13]:
# Ten largest feature importances of the fitted forest, labelled by the
# training column names (the single value column is named 0).
importances = pd.DataFrame(rf.feature_importances_, index=X_train.columns)
importances.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0
waa_pg_1yr,0.270876
waa_pg_2yr,0.113211
waa_pg_3yr,0.044219
age,0.036384
runs_defense_pg_3yr,0.027269
runs_defense_pg_2yr,0.025231
pc_5,0.02499
runs_br_pg_1yr,0.024581
runs_br_pg_3yr,0.024423
runs_defense_pg_1yr,0.024376


# SVM

In [19]:
# Preview a three-point log-spaced range (one value per decade, 1e-6..1e-4).
np.logspace(start=-6, stop=-4, num=3)

array([1.e-06, 1.e-05, 1.e-04])

In [30]:
# Exhaustive 5-fold grid search over SVR hyper-parameters, scored by R^2.
# Each range holds three log-spaced values, one per decade.
gamma_range = np.logspace(-5, -3, num=3)
epsilon_range = np.logspace(-3, -1, num=3)
C_range = np.logspace(-1, 1, num=3)

param_grid = {
    'epsilon': epsilon_range,
    'C': C_range,
    'gamma': gamma_range,
}

reg = SVR()
grid = GridSearchCV(estimator=reg,
                    param_grid=param_grid,
                    scoring='r2',
                    cv=5,
                    verbose=3)

# The target frame is flattened to a 1-D array before fitting.
grid.fit(X_train, y_train.values.ravel())

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END .C=0.1, epsilon=0.001, gamma=1e-05;, score=0.322 total time=   2.2s
[CV 2/5] END .C=0.1, epsilon=0.001, gamma=1e-05;, score=0.285 total time=   2.2s
[CV 3/5] END .C=0.1, epsilon=0.001, gamma=1e-05;, score=0.305 total time=   2.2s
[CV 4/5] END .C=0.1, epsilon=0.001, gamma=1e-05;, score=0.300 total time=   2.4s
[CV 5/5] END .C=0.1, epsilon=0.001, gamma=1e-05;, score=0.295 total time=   2.3s
[CV 1/5] END C=0.1, epsilon=0.001, gamma=0.0001;, score=0.382 total time=   2.2s
[CV 2/5] END C=0.1, epsilon=0.001, gamma=0.0001;, score=0.328 total time=   2.3s
[CV 3/5] END C=0.1, epsilon=0.001, gamma=0.0001;, score=0.362 total time=   2.3s
[CV 4/5] END C=0.1, epsilon=0.001, gamma=0.0001;, score=0.359 total time=   2.4s
[CV 5/5] END C=0.1, epsilon=0.001, gamma=0.0001;, score=0.348 total time=   2.6s
[CV 1/5] END .C=0.1, epsilon=0.001, gamma=0.001;, score=0.377 total time=   2.6s
[CV 2/5] END .C=0.1, epsilon=0.001, gamma=0.001

[CV 2/5] END C=10.0, epsilon=0.001, gamma=0.001;, score=0.188 total time=  12.2s
[CV 3/5] END C=10.0, epsilon=0.001, gamma=0.001;, score=0.250 total time=  12.4s
[CV 4/5] END C=10.0, epsilon=0.001, gamma=0.001;, score=0.256 total time=  12.3s
[CV 5/5] END C=10.0, epsilon=0.001, gamma=0.001;, score=0.257 total time=  13.0s
[CV 1/5] END .C=10.0, epsilon=0.01, gamma=1e-05;, score=0.384 total time=   0.9s
[CV 2/5] END .C=10.0, epsilon=0.01, gamma=1e-05;, score=0.329 total time=   1.0s
[CV 3/5] END .C=10.0, epsilon=0.01, gamma=1e-05;, score=0.365 total time=   0.9s
[CV 4/5] END .C=10.0, epsilon=0.01, gamma=1e-05;, score=0.360 total time=   0.9s
[CV 5/5] END .C=10.0, epsilon=0.01, gamma=1e-05;, score=0.351 total time=   0.9s
[CV 1/5] END C=10.0, epsilon=0.01, gamma=0.0001;, score=0.380 total time=   1.3s
[CV 2/5] END C=10.0, epsilon=0.01, gamma=0.0001;, score=0.318 total time=   1.3s
[CV 3/5] END C=10.0, epsilon=0.01, gamma=0.0001;, score=0.362 total time=   1.3s
[CV 4/5] END C=10.0, epsilon

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': array([ 0.1,  1. , 10. ]),
                         'epsilon': array([0.001, 0.01 , 0.1  ]),
                         'gamma': array([1.e-05, 1.e-04, 1.e-03])},
             scoring='r2', verbose=3)

In [31]:
grid.best_params_

{'C': 1.0, 'epsilon': 0.01, 'gamma': 0.0001}

In [32]:
grid.best_score_

0.3586030193497778

In [54]:
# Stand-alone SVR with its hyper-parameters spelled out explicitly.
# The previous version indexed into `kernel_range`, which is not defined
# anywhere in the visible notebook, so the cell raised NameError on a fresh
# kernel (Restart & Run All). The literals below are the values shown in the
# recorded repr of this model.
# NOTE(review): these are not the grid-search winner; use
# SVR(**grid.best_params_) if the tuned model is what should be evaluated.
reg = SVR(gamma=1e-05,
          C=0.001,
          kernel='linear')

In [55]:
# Fit the SVR on the training features with the target flattened to 1-D.
reg.fit(X=X_train, y=y_train.to_numpy().ravel())

SVR(C=0.001, gamma=1e-05, kernel='linear')

In [56]:
# Test-set score (R^2) of the fitted SVR against the flattened test target.
reg.score(X=X_test, y=y_test.to_numpy().ravel())

-0.10666975517075472

In [46]:
# Inspect the training target as a flat 1-D NumPy array.
y_train.to_numpy().ravel()

array([-0.01900826, -0.0045082 , -0.01458065, ..., -0.00318182,
        0.01425532,  0.01054422])

In [48]:
# Training-set score (R^2) of the fitted SVR, for comparison with the test score.
reg.score(X=X_train, y=y_train)

-0.07090936700738926