In [1]:
import random
import math
import csv
import time
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import accuracy_score,top_k_accuracy_score, mean_squared_error, mean_absolute_error
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multioutput import MultiOutputRegressor
from joblib import dump, load


In [2]:
numexpr=10000
noise=0.001
csv_path=f"./data/train_data{numexpr}_noise{noise}.csv"
df = pd.read_csv(csv_path,index_col=0)

In [3]:
#Split label from features
X = df.drop(['mean','sd'],axis=1)
y = df[['mean','sd']]

In [4]:
y.tail()

Unnamed: 0,mean,sd
9994,4.11,0.163914
9995,5.74,0.025522
9996,5.89,0.080218
9997,3.34,0.052107
9998,2.9,0.00565


In [5]:
#Split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)

In [6]:
#Scale data
scaler = MinMaxScaler()#StandardScaler()#
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [7]:
scaled_max=scaled_X_train[0].max()
unscaled_max=X_train.iloc[0].values.reshape(1, -1).max()
print(f"scaled: {scaled_max} not scaled: {unscaled_max}")

scaled: 0.6067711416580831 not scaled: 26.13788638675147


In [133]:
#Define svc
svr = SVR()

In [134]:
# Create the Multioutput Regressor
mor = MultiOutputRegressor(svr)

In [135]:
#Define grid search parameters
param_grid = {'estimator__kernel':['linear', 'poly', 'rbf'],'estimator__C':[0.05,0.075,0.1,0.125,0.175,0.25],'estimator__gamma':['scale','auto'],
              'estimator__degree':[2,3,4],'estimator__epsilon':[0.005,0.075,0.01,0.0125,0.025]}
grid = GridSearchCV(mor,param_grid,n_jobs=-1,cv=5)

In [136]:
#Run Grid search
grid.fit(scaled_X_train,y_train)

GridSearchCV(cv=5, estimator=MultiOutputRegressor(estimator=SVR()), n_jobs=-1,
             param_grid={'estimator__C': [0.05, 0.075, 0.1, 0.125, 0.175, 0.25],
                         'estimator__degree': [2, 3, 4],
                         'estimator__epsilon': [0.005, 0.075, 0.01, 0.0125,
                                                0.025],
                         'estimator__gamma': ['scale', 'auto'],
                         'estimator__kernel': ['linear', 'poly', 'rbf']})

In [137]:
#Display best parameter combination
grid.best_params_

{'estimator__C': 0.125,
 'estimator__degree': 2,
 'estimator__epsilon': 0.0125,
 'estimator__gamma': 'scale',
 'estimator__kernel': 'rbf'}

In [138]:
#Calcualte predictions based on test data
grid_pred = grid.predict(scaled_X_test)

In [139]:
#Display accuracy
# Evaluate the regressor
mse_one = mean_squared_error(y_test['mean'], grid_pred[:,0])
mse_two = mean_squared_error(y_test['sd'], grid_pred[:,1])
print(f'MSE for first regressor: {mse_one} - second regressor: {mse_two}')
mae_one = mean_absolute_error(y_test['mean'], grid_pred[:,0])
mae_two = mean_absolute_error(y_test['sd'], grid_pred[:,1])
print(f'MAE for first regressor: {mae_one} - second regressor: {mae_two}')

MSE for first regressor: 0.41851039882238233 - second regressor: 0.0021424036292900426
MAE for first regressor: 0.484755609498351 - second regressor: 0.0376605007485025


In [140]:
y_test_single_diam=y_test['mean'].iloc[10]
y_test_single_sd=y_test['sd'].iloc[10]

In [141]:
y_pred_single_diam = grid.predict(scaled_X_test[10].reshape(1, -1))[0][0]
y_pred_single_sd = grid.predict(scaled_X_test[10].reshape(1, -1))[0][1]

In [142]:
print(f"The actual diameter was {y_test_single_diam} and the model predicted {y_pred_single_diam}")
print(f"The actual sd was {y_test_single_sd} and the model predicted {y_pred_single_sd}")

The actual diameter was 0.72 and the model predicted 0.7650615923277133
The actual sd was 0.0916959415779256 and the model predicted 0.07228016206507633


## Optimal SVM Regression parameters
- kernel= 'rbf',C= 0.125,epsilon= 0.0125,gamma= 'scale'

In [8]:
#Define final SVC
svr_fin=SVR(kernel= 'rbf',C= 0.125,epsilon= 0.0125,gamma= 'scale')
mor_fin = MultiOutputRegressor(svr_fin)


In [9]:
#Fit final svc
start = time.time()
mor_fin.fit(scaled_X_train,y_train)
end = time.time()
print('SVM fitting taken ', end-start,' seconds')

SVM fitting taken  15.90216851234436  seconds


In [11]:
y_test_single_diam=y_test['mean'].iloc[20]
y_test_single_sd=y_test['sd'].iloc[20]

y_pred_single_diam = mor_fin.predict(scaled_X_test[20].reshape(1, -1))[0][0]
y_pred_single_sd = mor_fin.predict(scaled_X_test[20].reshape(1, -1))[0][1]

In [12]:
print(f"The actual diameter was {y_test_single_diam} and the model predicted {y_pred_single_diam}")
print(f"The actual sd was {y_test_single_sd} and the model predicted {y_pred_single_sd}")

The actual diameter was 7.62 and the model predicted 8.1677963059781
The actual sd was 0.1326669214983239 and the model predicted 0.11684243147345584


In [13]:
y_pred_fin=mor_fin.predict(scaled_X_test)

In [14]:
# Evaluate the regressor
mse_one = mean_squared_error(y_test['mean'], y_pred_fin[:,0])
mse_two = mean_squared_error(y_test['sd'], y_pred_fin[:,1])
print(f'MSE for first regressor: {mse_one} - second regressor: {mse_two}')
mae_one = mean_absolute_error(y_test['mean'], y_pred_fin[:,0])
mae_two = mean_absolute_error(y_test['sd'], y_pred_fin[:,1])
print(f'MAE for first regressor: {mae_one} - second regressor: {mae_two}')

MSE for first regressor: 0.8263753787523851 - second regressor: 0.0043906403507364775
MAE for first regressor: 0.6928008851399665 - second regressor: 0.05488356951935218


## Save model with joblib for persistence

In [15]:
dump(mor_fin, 'models/SVM_REGR.joblib')

['SVM_REGR.joblib']