# Support Vector Regression
Overview of Implementation
1. <a href="#section1">Import Dataset</a>
2. <a href="#section2">Cleaning the Data for Model Training</a>
3. <a href="#section3">Support Vector Regression</a>

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import researchpy as rp
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statistics import mean, median, mode, stdev
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline

## <a id='section1'>Import Dataset</a>

In [2]:
train = pd.read_csv('train.csv')
train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


## <a id='section2'>Cleaning the Data for Model Training</a>
Remove the NA data and the highly skewed categorical variables.<br>
Then, perform One Hot Encoding.

In [3]:
nullData = [['LotFrontage', 259], ['MasVnrArea', 8], ['Electrical', 1], ['GarageYrBlt', 81]]
n = len(train)
treshold = 0.1
drop = []

print('Drop feature - too many nulls:')
for i in nullData:
    if i[1]/n > treshold: # Arbitrary treshold: 10%
        print(i[0])
        train.drop(columns=[i[0]], inplace=True)
    else:
        drop.append(i[0])
        
print('Remove data point:')
print(drop)
train.dropna(subset=drop, inplace=True)

train

Drop feature - too many nulls:
LotFrontage
Remove data point:
['MasVnrArea', 'Electrical', 'GarageYrBlt']


Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


In [4]:
categorical = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond','Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
#skewness of categorical variables
max_percent = []
catogorical_skewed=[]
for i in categorical: 
    if rp.summary_cat(train[i])["Percent"].max() >= 90: 
        max_percent.append(rp.summary_cat(train[i])["Percent"].max())
print ("The number of variables with one category of data which occupies >= 90% of data =", len(max_percent))
#highly skewed categorical variables
for i in categorical: 
    if rp.summary_cat(train[i])["Percent"].max() >= 90:
        catogorical_skewed.append(i)
        print (i,"/ratio of the dominant category = ", rp.summary_cat(train[i])["Percent"].max()/100)

The number of variables with one category of data which occupies >= 90% of data = 15
Street /ratio of the dominant category =  0.9964
LandContour /ratio of the dominant category =  0.9015000000000001
Utilities /ratio of the dominant category =  0.9993000000000001
LandSlope /ratio of the dominant category =  0.9467
Condition2 /ratio of the dominant category =  0.9898
RoofMatl /ratio of the dominant category =  0.9818000000000001
BsmtCond /ratio of the dominant category =  0.9246
Heating /ratio of the dominant category =  0.981
CentralAir /ratio of the dominant category =  0.9495999999999999
Electrical /ratio of the dominant category =  0.9226000000000001
Functional /ratio of the dominant category =  0.9336
GarageQual /ratio of the dominant category =  0.9504
GarageCond /ratio of the dominant category =  0.9612999999999999
PavedDrive /ratio of the dominant category =  0.9372
MiscFeature /ratio of the dominant category =  0.9216


In [5]:
#One-Hot encoding
categoricalcolumns = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond','Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
train1 = pd.get_dummies(train, columns= categoricalcolumns, prefix= categoricalcolumns)
print(train1)

        Id  LotArea  YearBuilt  YearRemodAdd  MasVnrArea  BsmtFinSF1  \
0        1     8450       2003          2003       196.0         706   
1        2     9600       1976          1976         0.0         978   
2        3    11250       2001          2002       162.0         486   
3        4     9550       1915          1970         0.0         216   
4        5    14260       2000          2000       350.0         655   
...    ...      ...        ...           ...         ...         ...   
1455  1456     7917       1999          2000         0.0           0   
1456  1457    13175       1978          1988       119.0         790   
1457  1458     9042       1941          2006         0.0         275   
1458  1459     9717       1950          1996         0.0          49   
1459  1460     9937       1965          1965         0.0         830   

      BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  1stFlrSF  ...  SaleType_ConLw  \
0              0        150          856       856  ...     

## <a id='section4'>Support Vector Regression</a>
Support Vector Regression (SVR) uses the same principle as Support Vector Machine (SVM), finding a suitable line/hyperplane for regression/classification. <br> The aim of SVR is to decide a decision boundary with a distance (epsilon) away from the original hyperplane such that data points closest to the hyperplane or the support vectors are within that boundary line.

In [6]:
#train-test split
TEST_SIZE = 0.2

filteredData1 = train1.drop(['Id'], axis=1)
train_df, test_df = train_test_split(filteredData1, test_size=TEST_SIZE)

train_X = train_df.drop('SalePrice', axis=1)
train_Y = train_df['SalePrice']
test_X = test_df.drop('SalePrice', axis=1)
test_Y = test_df['SalePrice']

# initialisation for training data
x_train=train_X.to_numpy()
y_train=train_Y.to_numpy()[:,None]
n_train=x_train.shape[0]
f_train=x_train.shape[1]
X = np.ones([n_train,f_train+1]) 
X[:,1:] = x_train

# initialisation for test data 
x_test=test_X.to_numpy()
y_test=test_Y.to_numpy()[:,None]
n_test=x_test.shape[0]
f_test=x_test.shape[1]
X_test = np.ones([n_test,f_test+1]) 
X_test[:,1:] = x_test

We apply the model With different kernel type used in the algorithm.

In [7]:
# applying svr with default rbf kernel
regr = make_pipeline(StandardScaler(), SVR(C=0.0001, epsilon=10))
regr.fit(x_train, y_train.ravel())
# calculating accuracy
print ("Train accuracy = ", regr.score(x_train, y_train))
print ("Test accuracy = ", regr.score(x_test,y_test))

Train accuracy =  -0.04267525597345867
Test accuracy =  -0.02017474607070735


In [8]:
# applying svr with polynomial kernel
regr = make_pipeline(StandardScaler(), SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=.1,
               coef0=1))
regr.fit(x_train, y_train.ravel())
# calculating accuracy
print ("Train accuracy = ", regr.score(x_train, y_train))
print ("Test accuracy = ", regr.score(x_test,y_test))

Train accuracy =  0.24312890481878702
Test accuracy =  0.21135200130041953


In [9]:
# applying svr with sigmoid kernel
regr = make_pipeline(StandardScaler(), SVR(kernel='sigmoid', C=100, gamma='auto'))
regr.fit(x_train, y_train.ravel())
# calculating accuracy
print ("Train accuracy = ", regr.score(x_train, y_train))
print ("Test accuracy = ", regr.score(x_test,y_test))

Train accuracy =  0.05186877797730449
Test accuracy =  0.06098117880466292


In [10]:
# applying svr with linear kernel
regr = make_pipeline(StandardScaler(), SVR(kernel='linear', C=100, gamma='auto'))
regr.fit(x_train, y_train.ravel())
# calculating accuracy
print ("Train accuracy = ", regr.score(x_train, y_train))
print ("Test accuracy = ", regr.score(x_test,y_test))

Train accuracy =  0.8768636844067218
Test accuracy =  0.859659067974357


Best performing SVR is with linear kernel