# Support Vector Regression

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_validate, GridSearchCV
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_absolute_percentage_error
from time import time

## Data

In [6]:
# Read the preprocessed train/test tables (semicolon-delimited CSV files).
newborn_train = pd.read_csv('../data/preprocessed/newborntrain_processed.csv', sep=";")
newborn_test = pd.read_csv('../data/preprocessed/newborntest_processed.csv', sep=";")

# Split the training table into the regression target and its predictors.
y_train = newborn_train['newborn_weight']
X_train = newborn_train.drop(columns="newborn_weight")

# The test table is used as-is for the predictors (same object, not a copy).
X_test = newborn_test

# Quick sanity check on the resulting shapes.
print(X_train.shape, y_train.shape, X_test.shape)

(2398116, 17) (2398116,) (599561, 17)


## Grid search

Using KFold Cross Validation with k=5

In [7]:
# Grid search results to df
def gs_to_df(grid_search):
    """Tabulate a fitted search's cross-validation results, one row per candidate.

    Parameters
    ----------
    grid_search : object
        Fitted search object exposing a ``cv_results_`` mapping with a
        ``"params"`` entry (a sequence of dicts) and per-candidate score
        arrays. If it also has a ``param_grid`` attribute, the key order of
        that grid is used to order the hyper-parameter columns.

    Returns
    -------
    pandas.DataFrame
        One column per hyper-parameter, followed by whichever of
        ``mean_train_score``, ``std_train_score``, ``mean_test_score`` and
        ``std_test_score`` are present in ``cv_results_``. A hyper-parameter
        that a candidate does not define is left missing in that row.
    """
    results = grid_search.cv_results_
    candidates = results["params"]

    # Take the hyper-parameter names from the search object itself rather than
    # from a notebook-level `param_grid` variable: that global may have been
    # reassigned (or not exist yet) when cells are executed out of order.
    declared = getattr(grid_search, "param_grid", None)
    declared = [declared] if isinstance(declared, dict) else list(declared or [])
    param_columns = []
    for mapping in (*declared, *candidates):
        for name in mapping:
            if name not in param_columns:
                param_columns.append(name)

    # Train scores are absent when the search ran with return_train_score=False,
    # so only keep the score columns that were actually recorded.
    score_columns = [
        column
        for column in ("mean_train_score", "std_train_score",
                       "mean_test_score", "std_test_score")
        if column in results
    ]

    # Build all rows first and create the frame once instead of enlarging it
    # row by row.
    rows = []
    for position, params in enumerate(candidates):
        row = {name: params.get(name) for name in param_columns}
        for column in score_columns:
            row[column] = results[column][position]
        rows.append(row)

    return pd.DataFrame(rows, columns=param_columns + score_columns)
    

In [20]:
# Wall-clock timing of the whole search (reported at the end of the cell).
start_time = time()
# Grid: a single candidate (epsilon = 0, up to 2000 solver iterations)
param_grid = {'C': [1], 'epsilon': [0], 'loss': ['epsilon_insensitive'], 'dual':[True], 'max_iter':[2000], 'random_state': [12345]}
# Cross Validation: plain 5-fold, shuffled with a fixed seed.
# KFold is the splitter imported at the top of the notebook; RepeatedKFold was
# never imported, so the previous line raised NameError on a fresh kernel and
# would have produced 10 fits instead of the 5 reported in the recorded output.
# shuffle=True is required for KFold to accept a random_state.
rskf = KFold(n_splits=5, shuffle=True, random_state=12345)
# Grid Search: scored with negated MAPE (values closer to 0 are better);
# train scores are kept so they can be compared with the validation scores.
grid_search = GridSearchCV(estimator=LinearSVR(), param_grid=param_grid, scoring="neg_mean_absolute_percentage_error",
                            n_jobs=12, cv=rskf, verbose=3, return_train_score=True)
grid_search.fit(X_train,y_train)
print("--- %s seconds ---" % (time() - start_time))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
--- 3536.3326892852783 seconds ---




In [22]:
# Tabulate the CV results, rank the candidates best-first (highest mean
# validation score), then persist the ranking and display it.
df = gs_to_df(grid_search=grid_search)
ranked = df.sort_values(by="mean_test_score", ascending=False)
ranked.to_csv("../Regression/Validation/SVR0b_5cv.csv", sep=';', index=False)
ranked

Unnamed: 0,C,epsilon,loss,dual,max_iter,random_state,mean_train_score,std_train_score,mean_test_score,std_test_score
0,1,0,epsilon_insensitive,True,2000,12345,-1.196848,1.42126,-1.19586,1.418981


In [18]:
# Wall-clock timing of the whole search (reported at the end of the cell).
start_time = time()
# Grid: a single candidate (epsilon = 0.01, solver iteration limit left at its default)
param_grid = {'C': [1], 'epsilon': [0.01], 'loss': ['epsilon_insensitive'], 'dual':[True], 'random_state': [12345]}
# Cross Validation: plain 5-fold, shuffled with a fixed seed.
# KFold is the splitter imported at the top of the notebook; RepeatedKFold was
# never imported, so the previous line raised NameError on a fresh kernel and
# would have produced 10 fits instead of the 5 reported in the recorded output.
# shuffle=True is required for KFold to accept a random_state.
rskf = KFold(n_splits=5, shuffle=True, random_state=12345)
# Grid Search: scored with negated MAPE (values closer to 0 are better);
# train scores are kept so they can be compared with the validation scores.
grid_search = GridSearchCV(estimator=LinearSVR(), param_grid=param_grid, scoring="neg_mean_absolute_percentage_error",
                            n_jobs=12, cv=rskf, verbose=3, return_train_score=True)
grid_search.fit(X_train,y_train)
print("--- %s seconds ---" % (time() - start_time))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
--- 1801.6556894779205 seconds ---




In [19]:
# Tabulate the CV results, rank the candidates best-first (highest mean
# validation score), then persist the ranking and display it.
df = gs_to_df(grid_search=grid_search)
ranked = df.sort_values(by="mean_test_score", ascending=False)
ranked.to_csv("../Regression/Validation/SVR1_5cv.csv", sep=';', index=False)
ranked

Unnamed: 0,C,epsilon,loss,dual,random_state,mean_train_score,std_train_score,mean_test_score,std_test_score
0,1,0.01,epsilon_insensitive,True,12345,-0.189858,0.035672,-0.189888,0.035643
