### Notes - SVM

In [1]:
# SVM is supervised learning under ML
# supervised learning can be classification (cat vars) or regression (continous variables)
# SVM can do both
# let us consider classification dataset (females on left and males on right - suppose the images are on a graph)
# there is a decision boundary that separates males and females - we have to find that decision boundary
# even if for training dataset, a number of decision boundaries can classify males-females with 100% accurcy
# but this may not be very well for test dataset
# so how to find the appropriate decision boundary

# calculate distance of the left and right points (vector) from the decision boundary
# we create a margin using the left vectors and another margin using right vectors and create 2 lines (margins)
# now for another decision boundary, we do the same thing - we want to see which decision boundary has larger width/margin
# the margin width is called rho
# the decision boundary is known as hyper plane
# equation of hyper plane is w^T (transpose).x + b = 0
# left margin of hyperplane is w^T (transpose).x + b = -1 and right margin is w^T (transpose).x + b = 1
# the distance of left margin from hyper plane is called D- and right margin - D+
# equation to calculate distance of a datapoint to the hyper plane is r = w^T (transpose).x(sub-i) + b / |w| (magnitude of w)
# the data points closest to the hyperplane are called support vectors - because we take their support to develop the hyperplane
# this method is called linear support vector classifier (LSVC)
# mathematical equation is f(x) = sign(w^T.x + b) (T is transpose)

# sometimes we have noisy dataset (e.g. female image in males and vice-versa) - then we add a slack variable
# wrongly place male image datapoint distance from male margin is Ej and 
# wrongly place female image datapoint distance from female margin is Ei - these 2 make the slack variable
# as the name suggests, slack variables allow certain constraints to be violated

# when linear classification cannot be used - then we use higher degree to classify the dataset
# using degree we can make the classes which were in a line (linear) deviate along a curve
# then we can easily use decsion boundary hyperplane

# another example is when 2 types of images / classes are concentric (one within another)
# circular fashion
# but circle cannot be used as a boundary - it can cause errors
# we convert 2d dataset to multiple dimensions, 3d in this case
# there will be a dimension, along which the 2 classses can be separated
# hyper plane can be used to separate the 2
# to implement this practically, we use mathematics - e.g. Kernel tricks (linear, polynomial, gaussian, sigmoid, etc.)

# linear SV regression (LSVR)
# here also we have margins, let us assume we have a single variable in linear formar
# we create a hyperplane such that maximum datapoints come within the margins. left and right margins are - and + epsilon apart from hyperplane
# if we use epsilon value higher, we will have wide band and vice-versa
# this technique is different than linear regression (best fit line) such that the error should not exceed the threshold
# means error should not be greater than epsilon
# points outside the band are called support vector
# if dataset is not in linear format, then we use kernels (e.g. polynomial with degree 2)


### Predict house prices - Regression

In [2]:
# import libraries
import pandas as pd

In [3]:
# load dataset (from link - it is cleaned)
path = r"https://drive.google.com/uc?export=download&id=1xxDtrZKfuWQfl-6KA9XEd_eatitNPnkB" 
df = pd.read_csv(path)

In [4]:
df.head()

Unnamed: 0,bath,balcony,price,total_sqft_int,bhk,price_per_sqft,area_typeSuper built-up Area,area_typeBuilt-up Area,area_typePlot Area,availability_Ready To Move,...,location_Kalena Agrahara,location_Horamavu Agara,location_Vidyaranyapura,location_BTM 2nd Stage,location_Hebbal Kempapura,location_Hosur Road,location_Horamavu Banaswadi,location_Domlur,location_Mahadevpura,location_Tumkur Road
0,3.0,2.0,150.0,1672.0,3,8971.291866,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,3.0,3.0,149.0,1750.0,3,8514.285714,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3.0,2.0,150.0,1750.0,3,8571.428571,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,2.0,2.0,40.0,1250.0,2,3200.0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,2.0,2.0,83.0,1200.0,2,6916.666667,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


#### SPlitting the data

In [5]:
X = df.drop('price', axis=1)
y = df['price'] # target feature

print('Shape of X = ', X.shape)
print('Shape of y = ', y.shape)

Shape of X =  (7120, 107)
Shape of y =  (7120,)


In [6]:
# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=108)

print('Shape of X_train = ', X_train.shape)
print('Shape of y_train = ', y_train.shape)
print('Shape of X_test = ', X_test.shape)
print('Shape of y_test = ', y_test.shape)

Shape of X_train =  (5696, 107)
Shape of y_train =  (5696,)
Shape of X_test =  (1424, 107)
Shape of y_test =  (1424,)


In [7]:
# feature scaling (because of ranges / units of different features)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

sc.fit(X_train)


StandardScaler()

In [8]:
# now we transform data / scale the data
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [9]:
# performing SVR
from sklearn.svm import SVR # for classifier write SVC

In [11]:
svr_rbf = SVR(kernel='rbf') # rbf is default value - radial basis function - also called Gaussian
# KERNEL IS VERY IMPORTANT
# now training model
svr_rbf.fit(X_train, y_train)
svr_rbf.score(X_test, y_test)

0.2975977969870901

In [12]:
# this is poor value
# let us try linear kernel
svr_lin = SVR(kernel='linear') 
svr_lin.fit(X_train, y_train)
svr_lin.score(X_test, y_test)

0.8191327965862402

In [13]:
# this is great accuracy
# let us try poly reg
svr_poly = SVR(kernel='poly', degree = 2) 
svr_poly.fit(X_train, y_train)
svr_poly.score(X_test, y_test)

0.12076539068611114

In [None]:
# this is very poor accuracy - linear kernel is the best
# we can also use hyper-parameter tuning to set values of all parameters within SVR function (later)

In [15]:
# let us predict house prices values - linear SVR model
svr_lin.predict([X_test[0]]) # testing price of first house


array([68.82842776])

In [27]:
# it is 68.83 lakhs - let us see what the actual value was
y_test

3588     68.20
139     110.00
5601    180.00
853      71.73
2792    130.00
         ...  
2157     30.75
174      73.00
6644     52.00
1965     65.00
3205     66.00
Name: price, Length: 1424, dtype: float64

In [29]:
# it is 68.2, which is close
# getting predicted values
y_pred = svr_lin.predict(X_test)
y_pred # viewing

array([ 68.82842776, 124.76134169, 168.20828547, ...,  51.7263627 ,
        66.23143847,  67.47637549])

In [30]:
# comparing y_test and Y_pred
from sklearn.metrics import mean_squared_error
import numpy as np

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('MSE = ', mse)
print('RMSE = ', rmse)

MSE =  1917.0108741434606
RMSE =  43.78368273847531


In [31]:
# RMSE seems high, but R2 is ok