In [12]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, cross_validate
import pickle
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from data_preprocess import load_and_preprocess_data

In [13]:
(train_x, train_y, test_x) = load_and_preprocess_data()

In [14]:
def get_PCA_SVR(comp_count, params):
    scaler_X = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(train_x)

    pca = PCA(n_components=comp_count)
    X_train_pca = pca.fit_transform(X_train_scaled)

    grid_search_pca_svr = RandomizedSearchCV(SVR(), params, n_iter=50, cv=5, scoring=['r2', 'neg_mean_squared_error'], verbose=3, refit=False)
    grid_search_pca_svr.fit(X_train_pca, train_y)
    return grid_search_pca_svr.cv_results_

### All components

In [16]:
params = {
    'kernel': ['rbf'],
    'gamma': ['scale', 0.03, 0.0375, 0.045],
    'epsilon': np.linspace(0.1, 1, num=15),
    'C': np.linspace(2, 6, num=15)
}


pca_all = get_PCA_SVR(30, params)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END C=5.142857142857142, epsilon=0.2928571428571429, gamma=0.045, kernel=rbf; neg_mean_squared_error: (test=-2.399) r2: (test=0.757) total time=  39.0s
[CV 2/5] END C=5.142857142857142, epsilon=0.2928571428571429, gamma=0.045, kernel=rbf; neg_mean_squared_error: (test=-2.516) r2: (test=0.741) total time=  36.3s
[CV 3/5] END C=5.142857142857142, epsilon=0.2928571428571429, gamma=0.045, kernel=rbf; neg_mean_squared_error: (test=-2.441) r2: (test=0.750) total time=  33.7s
[CV 4/5] END C=5.142857142857142, epsilon=0.2928571428571429, gamma=0.045, kernel=rbf; neg_mean_squared_error: (test=-2.589) r2: (test=0.736) total time=  29.0s
[CV 5/5] END C=5.142857142857142, epsilon=0.2928571428571429, gamma=0.045, kernel=rbf; neg_mean_squared_error: (test=-2.328) r2: (test=0.757) total time=  28.9s
[CV 1/5] END C=4.571428571428571, epsilon=0.55, gamma=0.045, kernel=rbf; neg_mean_squared_error: (test=-2.389) r2: (test=0.758) total

In [24]:
pca_random_search_tuning_results = pd.DataFrame({
    'R2': pca_all['mean_test_r2'],
    'MSE': pca_all['mean_test_neg_mean_squared_error'],
    'rank R2': pca_all['rank_test_r2'],
    'rank MSE': pca_all['rank_test_neg_mean_squared_error'],
})

pca_random_search_tuning_results.sort_values('rank R2').head(10)

Unnamed: 0,R2,MSE,rank R2,rank MSE
25,0.751277,-2.425268,1,1
7,0.751046,-2.427551,2,2
31,0.751026,-2.427729,3,3
47,0.751025,-2.42774,4,4
27,0.751011,-2.427878,5,5
33,0.750951,-2.428466,6,6
5,0.750883,-2.429131,7,7
4,0.750879,-2.429171,8,8
6,0.750726,-2.430669,9,9
40,0.750579,-2.432092,10,10


### 99.9% of components

In [5]:
pca_promille = get_PCA_SVR(27, params)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END svr__C=1.5, svr__epsilon=1.577777777777778, svr__gamma=0.06, svr__kernel=rbf; neg_mean_squared_error: (test=-2.473) r2: (test=0.750) total time=  10.4s
[CV 2/5] END svr__C=1.5, svr__epsilon=1.577777777777778, svr__gamma=0.06, svr__kernel=rbf; neg_mean_squared_error: (test=-2.578) r2: (test=0.735) total time=  10.3s
[CV 3/5] END svr__C=1.5, svr__epsilon=1.577777777777778, svr__gamma=0.06, svr__kernel=rbf; neg_mean_squared_error: (test=-2.567) r2: (test=0.737) total time=   9.9s
[CV 4/5] END svr__C=1.5, svr__epsilon=1.577777777777778, svr__gamma=0.06, svr__kernel=rbf; neg_mean_squared_error: (test=-2.654) r2: (test=0.729) total time=   9.9s
[CV 5/5] END svr__C=1.5, svr__epsilon=1.577777777777778, svr__gamma=0.06, svr__kernel=rbf; neg_mean_squared_error: (test=-2.441) r2: (test=0.745) total time=  10.4s
[CV 1/5] END svr__C=3.0, svr__epsilon=0.5222222222222223, svr__gamma=0.045, svr__kernel=rbf; neg_mean_squared_err

In [25]:
pca_promille_results = pd.DataFrame({
    'R2': pca_promille['mean_test_r2'],
    'MSE': pca_promille['mean_test_neg_mean_squared_error'],
    'rank R2': pca_promille['rank_test_r2'],
    'rank MSE': pca_promille['rank_test_neg_mean_squared_error'],
})

pca_promille_results.sort_values('rank R2').head(10)

Unnamed: 0,R2,MSE,rank R2,rank MSE
14,0.745384,-2.48273,1,1
38,0.745066,-2.485784,2,2
23,0.744825,-2.488207,3,3
9,0.744745,-2.48894,4,4
31,0.74474,-2.488979,5,5
49,0.744567,-2.490708,6,6
22,0.744356,-2.492813,7,7
3,0.7443,-2.493237,8,8
35,0.744165,-2.49462,9,9
18,0.744083,-2.495417,10,10


### 99%

In [22]:
pca_percent = get_PCA_SVR(23, params)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END C=2.2857142857142856, epsilon=0.8714285714285716, gamma=0.03, kernel=rbf; neg_mean_squared_error: (test=-2.415) r2: (test=0.756) total time=  33.2s
[CV 2/5] END C=2.2857142857142856, epsilon=0.8714285714285716, gamma=0.03, kernel=rbf; neg_mean_squared_error: (test=-2.575) r2: (test=0.735) total time=  32.1s
[CV 3/5] END C=2.2857142857142856, epsilon=0.8714285714285716, gamma=0.03, kernel=rbf; neg_mean_squared_error: (test=-2.494) r2: (test=0.745) total time=  31.7s
[CV 4/5] END C=2.2857142857142856, epsilon=0.8714285714285716, gamma=0.03, kernel=rbf; neg_mean_squared_error: (test=-2.653) r2: (test=0.729) total time=  32.1s
[CV 5/5] END C=2.2857142857142856, epsilon=0.8714285714285716, gamma=0.03, kernel=rbf; neg_mean_squared_error: (test=-2.385) r2: (test=0.751) total time=  28.4s
[CV 1/5] END C=2.0, epsilon=0.7428571428571429, gamma=0.03, kernel=rbf; neg_mean_squared_error: (test=-2.418) r2: (test=0.755) total 

In [26]:
pca_percent_results = pd.DataFrame({
    'R2': pca_percent['mean_test_r2'],
    'MSE': pca_percent['mean_test_neg_mean_squared_error'],
    'rank R2': pca_percent['rank_test_r2'],
    'rank MSE': pca_percent['rank_test_neg_mean_squared_error'],
})

print(pca_percent_results.sort_values('rank R2').head(10))

          R2       MSE  rank R2  rank MSE
32  0.745643 -2.480190        1         1
18  0.745498 -2.481611        2         2
43  0.745285 -2.483693        3         3
31  0.745045 -2.486028        4         4
38  0.744962 -2.486845        5         5
2   0.744925 -2.487214        6         6
35  0.744815 -2.488280        7         7
12  0.744810 -2.488347        8         8
28  0.744799 -2.488447        9         9
14  0.744714 -2.489278       10        10


In [27]:
pca_10 = get_PCA_SVR(10, params)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END C=3.7142857142857144, epsilon=0.6142857142857143, gamma=0.045, kernel=rbf; neg_mean_squared_error: (test=-2.648) r2: (test=0.732) total time=  22.6s
[CV 2/5] END C=3.7142857142857144, epsilon=0.6142857142857143, gamma=0.045, kernel=rbf; neg_mean_squared_error: (test=-2.774) r2: (test=0.715) total time=  23.1s
[CV 3/5] END C=3.7142857142857144, epsilon=0.6142857142857143, gamma=0.045, kernel=rbf; neg_mean_squared_error: (test=-2.694) r2: (test=0.724) total time=  23.7s
[CV 4/5] END C=3.7142857142857144, epsilon=0.6142857142857143, gamma=0.045, kernel=rbf; neg_mean_squared_error: (test=-2.827) r2: (test=0.711) total time=  23.6s
[CV 5/5] END C=3.7142857142857144, epsilon=0.6142857142857143, gamma=0.045, kernel=rbf; neg_mean_squared_error: (test=-2.570) r2: (test=0.731) total time=  23.0s
[CV 1/5] END C=3.4285714285714284, epsilon=0.4214285714285715, gamma=scale, kernel=rbf; neg_mean_squared_error: (test=-2.647) r2