In [1]:
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import pickle

In [2]:
# Load the feature matrix and the target array (file names suggest scaled training data).
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code; only load these files from a trusted source.
X, y = (
    np.load(npy_path, allow_pickle=True)
    for npy_path in ("../data/X_train_scaled.npy", "../data/y_train_scaled.npy")
)

In [3]:
# Sanity-check the dimensions of the loaded arrays.
for label, array in (("X shape:", X), ("y shape:", y)):
    print(label, array.shape)

X shape: (3104579, 26)
y shape: (3104579, 1)


In [4]:
# Fix the global NumPy seed so the same subset is drawn on every run.
np.random.seed(2907)

# Reduce sample: keep the first N positions of a random permutation of X's row indices.
N = 10_000
indexes = np.random.permutation(X.shape[0])[:N]

In [5]:
# Pick the sampled rows; the same positions index both arrays so features and targets stay aligned.
X_red = X[indexes]
y_red = y[indexes]

In [6]:
# Confirm the reduced sample has the expected number of rows.
for label, array in (("X_red shape:", X_red), ("y_red shape:", y_red)):
    print(label, array.shape)

X_red shape: (10000, 26)
y_red shape: (10000, 1)


In [7]:
# Base estimator for the search: an SVR constructed with no arguments (library defaults).
svm = SVR()

In [8]:
# Candidate hyperparameter values; the grid search tries every C/gamma combination.
param_grid = dict(C=[1, 100], gamma=[0.8, 1])

In [9]:
# Exhaustive search over param_grid, scored by negative mean squared error with
# cv=10; runs with n_jobs=10 and verbose progress logging.
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    cv=10,
    scoring="neg_mean_squared_error",
    n_jobs=10,
    verbose=10,
)

In [10]:
# Flatten the sampled targets to one dimension (they were printed as (10000, 1)) before fitting.
y_red = np.ravel(y_red)

In [11]:
# Re-seed the global NumPy RNG right before fitting.
# NOTE(review): presumably meant to make the search reproducible; whether anything
# in this fit actually draws from the global NumPy RNG is not visible here — confirm.
np.random.seed(2907)
# Run the grid search on the reduced sample; as the cell's last expression,
# the fitted GridSearchCV object is also displayed.
grid_search.fit(X_red, y_red)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


GridSearchCV(cv=10, estimator=SVR(), n_jobs=10,
             param_grid={'C': [1, 100], 'gamma': [0.8, 1]},
             scoring='neg_mean_squared_error', verbose=10)

In [12]:
# Print a header line, then display the full cv_results_ dict as the cell output.
print("GridSearch results:")
grid_search.cv_results_

GridSearch results:


{'mean_fit_time': array([ 5.94035838,  6.09487348, 20.17899635, 17.55943074]),
 'std_fit_time': array([0.80734614, 0.85440633, 1.48817653, 1.31835443]),
 'mean_score_time': array([0.54175723, 0.54212289, 0.53832903, 0.47576966]),
 'std_score_time': array([0.09370023, 0.10060939, 0.08816931, 0.11158534]),
 'param_C': masked_array(data=[1, 1, 100, 100],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_gamma': masked_array(data=[0.8, 1, 0.8, 1],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'gamma': 0.8},
  {'C': 1, 'gamma': 1},
  {'C': 100, 'gamma': 0.8},
  {'C': 100, 'gamma': 1}],
 'split0_test_score': array([-0.61123554, -0.65207678, -0.62312954, -0.65137606]),
 'split1_test_score': array([-0.5951875 , -0.63861147, -0.58578268, -0.61705487]),
 'split2_test_score': array([-0.56587934, -0.60564552, -0.55575005, -0.58795952]),
 'split3_test_score': array(

In [13]:
# Persist the whole fitted search object so its results can be inspected later.
with open("../grids/grid_search_svm_02.pkl", "wb") as grid_file:
    pickle.dump(grid_search, grid_file)

In [14]:
# Pull out the estimator the fitted search exposes as its best candidate.
best_regressor = grid_search.best_estimator_

In [15]:
# Show the selected estimator; its repr lists the chosen hyperparameters.
print("Best regressor", best_regressor)

Best regressor SVR(C=1, gamma=0.8)


In [16]:
# Persist only the selected estimator, separately from the full search object.
with open("../models/model_svm_02.pkl", "wb") as model_file:
    pickle.dump(best_regressor, model_file)

In [17]:
# Predict on the full array X rather than only the reduced sample.
# NOTE(review): X contains the rows at `indexes` that the model was fitted on,
# so metrics computed from y_hat are not purely out-of-sample.
y_hat = best_regressor.predict(X)

[CV 5/10; 1/4] START C=1, gamma=0.8.............................................
[CV 5/10; 1/4] END .............C=1, gamma=0.8;, score=-0.560 total time=   7.3s
[CV 8/10; 2/4] START C=1, gamma=1...............................................
[CV 8/10; 2/4] END ...............C=1, gamma=1;, score=-0.583 total time=   4.8s
[CV 2/10; 3/4] START C=100, gamma=0.8...........................................
[CV 2/10; 3/4] END ...........C=100, gamma=0.8;, score=-0.586 total time=  20.6s
[CV 4/10; 4/4] START C=100, gamma=1.............................................
[CV 4/10; 4/4] END .............C=100, gamma=1;, score=-0.620 total time=  17.2s
[CV 2/10; 1/4] START C=1, gamma=0.8.............................................
[CV 2/10; 1/4] END .............C=1, gamma=0.8;, score=-0.595 total time=   4.9s
[CV 1/10; 2/4] START C=1, gamma=1...............................................
[CV 1/10; 2/4] END ...............C=1, gamma=1;, score=-0.652 total time=   5.5s
[CV 1/10; 3/4] START C=100, 

In [18]:
# Score the predictions for every row of X against the corresponding targets in y.
r2, mse = r2_score(y, y_hat), mean_squared_error(y, y_hat)

# One print call; sep="\n" keeps each metric on its own line.
print(f"R2 Score: {r2:.3f}", f"MSE: {mse:.3f}", sep="\n")

R2 Score: 0.430
MSE: 0.570
