In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy.random as nr
import math
import joblib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV

Support Vector Machines can also be used for regression (Support Vector Regression, SVR), employing largely the same principles as SVM for classification: namely, finding an optimal hyperplane in an n-dimensional space, where n is the number of features.

In SVR, a kernel is used to find a hyperplane in a higher-dimensional space while keeping the computational cost low. A kernel is a function that implicitly maps a lower-dimensional dataset into a higher-dimensional one, without computing the mapped coordinates explicitly.

The hyperplane is the line (or surface) used to predict the continuous target value. The boundary lines are the two lines on either side of the hyperplane that form the margin; in an SVM classifier, positive samples lie on one side of the margin and negative samples on the other. Finally, support vectors are the data points that lie closest to the boundary.

In [2]:
def mape(actual, pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are converted to NumPy arrays. The error of each prediction
    is taken relative to the corresponding actual value, its absolute value
    is averaged over all elements, and the result is scaled by 100.

    Parameters
    ----------
    actual : array-like
        Observed values; also used as the denominator of the relative error.
    pred : array-like
        Predicted values.

    Returns
    -------
    The mean of ``abs((actual - pred) / actual)`` multiplied by 100.
    """
    truth = np.array(actual)
    estimate = np.array(pred)
    relative_error = np.abs((truth - estimate) / truth)
    return np.mean(relative_error) * 100

In [3]:
%matplotlib inline
# Load the training data from train.csv into a DataFrame.
train_data = pd.read_csv('train.csv')
# Preview the first 20 rows as the cell output.
train_data.head(20)

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.0,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.35585,0,12.234987
5,6,1,20,50,0.169698,27.259866,0,12.867706
6,7,1,20,50,0.203708,27.127486,0,14.695562
7,8,1,20,50,0.237723,26.807732,0,15.890699
8,9,1,20,50,0.271776,27.864715,0,15.539188
9,10,1,20,50,0.305732,28.313036,0,15.750094


In [4]:
# Build the modelling frame without the identifier and time columns.
# `drop` returns a new DataFrame, so `train_data` is left untouched without
# needing an explicit copy followed by an in-place mutation. `columns=`
# already selects the column axis, so no `axis` argument is required.
data_train = train_data.drop(columns=['id', 'breath_id', 'time_step'])

In [5]:
# The target is the `pressure` column; the features are every remaining column.
y = data_train['pressure']
X = data_train.drop(columns=['pressure'])

In [6]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=6,
)
# Report the shape of each split, one per output line.
print(
    'Dimension of X_train:', X_train.shape,
    '\nDimension of X_test:', X_test.shape,
    '\nDimension of y_train:', y_train.shape,
    '\nDimension of y_test:', y_test.shape,
)

Dimension of X_train: (4225200, 4) 
Dimension of X_test: (1810800, 4) 
Dimension of y_train: (4225200,) 
Dimension of y_test: (1810800,)


In [7]:
# Fit the standardiser on the training features only, then apply that same
# fitted transformation to both the training and the test features.
sc = StandardScaler()
scaler = sc.fit(X_train)
X_train_sc, X_test_sc = (scaler.transform(split) for split in (X_train, X_test))

In [8]:
# Only the first N_TRAIN_LIMIT rows of the training split are used for the
# SVR grid search instead of the full training set.
N_TRAIN_LIMIT = 80_000
X_train_limited = X_train_sc[:N_TRAIN_LIMIT]
# `y_train_sc` is not defined in any earlier cell, so referencing it raises a
# NameError on a fresh kernel. The evaluation metrics are computed against the
# unscaled `y_test`, so the target must stay on its original scale: take the
# matching rows directly from `y_train`. `.iloc` makes the slice positional so
# it lines up row-for-row with `X_train_sc`.
y_train_limited = y_train.iloc[:N_TRAIN_LIMIT]

## Hyperparameter Tuning for SVR model

The C parameter adds a penalty for each misclassified data point (in regression, for each point whose error falls outside the margin). When C is low, the penalty for incorrectly categorized points is low, so a decision boundary with a large margin is selected at the expense of a greater number of misclassifications.

The RBF kernel is popular because it resembles the K-Nearest Neighbors (K-NN) algorithm. RBF-kernel Support Vector Machines have the benefits of K-NN and solve its space-complexity issue, because only the support vectors found during training need to be stored rather than the complete dataset.

The gamma parameter of the RBF kernel controls the distance of influence of a single training point. Low values of gamma indicate a large similarity radius, which results in more points being grouped together.

In [9]:
%%time
SVR_model=GridSearchCV(
    estimator = SVR(),
    param_grid={
       'C': [0.1,1, 10, 100], 
       'gamma': [1,0.1,0.01,0.001],
       'kernel': ['rbf']},
    refit=True,
    cv=2,
    verbose=2,
    n_jobs=-1,
)

Wall time: 0 ns


In [10]:
# Run the grid search on the reduced training subset and keep the value
# returned by fit() under the name SVR_result.
SVR_result = SVR_model.fit(X_train_limited, y_train_limited)
# Summarized result: the best cross-validation score and the parameters that produced it
print('Best Score: %s' % SVR_result.best_score_)
print('Best Hyperparameters: %s' % SVR_result.best_params_)

Fitting 2 folds for each of 16 candidates, totalling 32 fits
Best Score: 0.4606024175339485
Best Hyperparameters: {'C': 100, 'gamma': 1, 'kernel': 'rbf'}


In [11]:
%%time
# Predict the target for the whole scaled test split with the tuned search object.
# NOTE(review): the recorded wall time for this cell is over 3 hours; consider
# predicting on a sample or caching the predictions if this must be re-run.
SVR_predict = SVR_model.predict(X_test_sc)

Wall time: 3h 21min 20s


In [12]:
# Score the test-set predictions against the held-out targets.
# Squared-error metrics first; RMSE is derived directly from MSE.
SVR_MSE = mean_squared_error(y_test, SVR_predict)
SVR_RMSE = np.sqrt(SVR_MSE)
# Absolute-error metrics: in target units (MAE) and in percent (MAPE).
SVR_MAE = mean_absolute_error(y_test, SVR_predict)
SVR_MAPE = mape(y_test, SVR_predict)
# Coefficient of determination.
SVR_R2 = r2_score(y_test, SVR_predict)

In [13]:
# One-row summary table: one column per metric, indexed by the model name.
SVR_results = pd.DataFrame(
    {
        'R Squared': SVR_R2,
        'Mean Absolute Percentage Error': SVR_MAPE,
        'Mean Absolute Error': SVR_MAE,
        'Mean Squared Error': SVR_MSE,
        'Root Mean Squared Error': SVR_RMSE,
    },
    index=['Support Vector Regression'],
)
SVR_results

Unnamed: 0,R Squared,Mean Absolute Percentage Error,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error
Support Vector Regression,0.46354,34.686393,3.170965,35.312361,5.94242


In [14]:
# Persist the fitted grid-search object to disk so it can be reloaded later
# without repeating the search.
# NOTE(review): joblib files are pickle-based; only load this file from a trusted source.
filename = 'SVR_model.sav'
joblib.dump(SVR_model, filename)

['SVR_model.sav']