In [8]:
# Model based on a geometric intuition not a probabilistic one.
# You are trying to find the hyperplane that maximizes the margin, i.e. the distance between the decision boundary and the closest points of each class.
# min w*w/2 subject to yi*(w*xi+b) >= 1
# The points lying on the margin are known as support vectors; there can be more than one, and these vectors support the decision boundary.
# The model is not affected by points far away from the decision boundary. Therefore it is known as one of the robust classifiers.
# The formulation for the linearly separable problem is known as the hard margin; if the data is not linearly separable, the hard-margin SVM has no feasible solution and cannot return a result.
# Therefore we change the optimization problem by adding slack variables (soft margin).
# min w*w/2 + C*sum(slacks) s.t. yj*(w*xj+b) >= 1-slackj and slackj >= 0
# To solve the problem you create a slack variable for every point in your data. If a slack variable equals 0, everything is the same as before; if it is greater than 0, that point is allowed to violate the margin and the decision boundary starts to shift.
# In summary, you are bending the decision boundary and penalizing the violations in the objective by multiplying them with C.
# If you keep C low, you tolerate violations; if you make C infinite, it becomes the hard-margin SVM case: the w's become very big and the decision boundary tries to pass through a very narrow margin.

# If the decision boundary is not linear then we are using feature transformations.
# Feature transformation: a mapping function such as phi(x1,x2) = x1**2 + x2**2. We can transform the original feature space x in R^n into a new feature space phi(x).
# For example, map x to (x, x**2).
# Transforming features to a high-dimensional space makes the feature space grow very fast, and you would need to compute the new features explicitly; therefore the kernel trick is needed.

# By using the kernel trick, we are trying to find a solution which is linear in the high-dimensional space but non-linear in the original lower-dimensional space.
# You write K(xi,xj) instead of xi*xj in the dual optimization problem. Instead of computing the mapping explicitly, the kernel allows you to do it implicitly. We have some special kernels.
# In summary, you take features in the original space, map them into the higher dimension, and take their dot product to measure similarity.

# Linear kernel: k(xt,xp) = xt*xp. It shows how similar these feature vectors are.
# Polynomial kernel: k(xt,xp) = (1+xt*xp)**d for any d>0. Contains all polynomial terms up to degree d.

# Gaussian kernel: Also called the radial basis function (RBF) kernel. k(x,xp) = e**(-norm(x-xp)**2/(2*sigma**2)) for sigma > 0. It implicitly maps your data to an infinite-dimensional space.

# C: Regularization parameter
# Choosing a better kernel
# Varying parameters of the kernel (width of Gaussian,sigma)

# If you have multiple class labels you can choose one-vs-all, which transforms the task into a set of binary problems.
# The other choice is all-vs-all (one-vs-one). For example, red vs green, red vs blue, green vs blue.

# If you have an imbalanced dataset, then most probably SVM will find a decision boundary too far away from the points and predict the majority class. You need to adjust the C parameter, and you can adjust it per class. If the negative labels are the minority, then penalize errors on them more.
# If you have a large number of features and a small training set, then use no kernel (i.e. a linear kernel).
# If you have a small number of features and a large dataset, then use the Gaussian kernel.

# One of the most critical parts of training an SVM is feature scaling.



In [9]:
!pip install opendatasets
import opendatasets as od
od.download("https://www.kaggle.com/datasets/rajyellow46/wine-quality") # download data from kaggle datasets

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: uguryigit34
Your Kaggle Key: ········
Downloading wine-quality.zip to ./wine-quality


100%|███████████████████████████████████████| 98.0k/98.0k [00:00<00:00, 279kB/s]







In [54]:
import pandas as pd
import seaborn as sns
from sklearn import model_selection as mod
from sklearn.svm import LinearSVC,SVC
from sklearn import metrics as met
from sklearn import preprocessing as pre

In [12]:
# Load the CSV from the folder created by od.download, relative to the working directory,
# so the notebook does not depend on one user's home directory.
data = pd.read_csv("wine-quality/winequalityN.csv")

In [14]:
# Summary statistics of the numeric columns; the differing `count` values per column reveal missing entries.
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,6487.0,6489.0,6494.0,6495.0,6495.0,6497.0,6497.0,6497.0,6488.0,6493.0,6497.0,6497.0
mean,7.216579,0.339691,0.318722,5.444326,0.056042,30.525319,115.744574,0.994697,3.218395,0.531215,10.491801,5.818378
std,1.29675,0.164649,0.145265,4.758125,0.035036,17.7494,56.521855,0.002999,0.160748,0.148814,1.192712,0.873255
min,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.72,0.22,8.0,3.0
25%,6.4,0.23,0.25,1.8,0.038,17.0,77.0,0.99234,3.11,0.43,9.5,5.0
50%,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.99489,3.21,0.51,10.3,6.0
75%,7.7,0.4,0.39,8.1,0.065,41.0,156.0,0.99699,3.32,0.6,11.3,6.0
max,15.9,1.58,1.66,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.9,9.0


In [28]:
# Drop every row that contains at least one missing value.
# Rebinding the name (rather than inplace=True) avoids mutating the frame object in place.
data = data.dropna()

In [30]:
# Remaining missing values per column (all zeros once incomplete rows are dropped).
data.isna().sum()

type                    0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [31]:
# Class distribution of the target column.
data["type"].value_counts()

white    4870
red      1593
Name: type, dtype: int64

In [44]:
# Target: the wine type label. Features: every remaining column.
y = data["type"]
X = data.drop(columns = "type")

In [45]:
# Hold out 20% of the rows for testing; stratify on the label so both splits keep
# the same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = mod.train_test_split(
    X,
    y,
    test_size = 0.2,
    stratify = y,
    random_state = 20,
)

In [46]:
# Min-max scaler: rescales each feature to the 0-1 range. SVMs are sensitive to feature scale, so scale before training.
scaler = pre.MinMaxScaler()

In [47]:
# Fit the scaler on the training split only, so nothing from the test split leaks into the scaling.
scaler.fit(X_train)

MinMaxScaler()

In [48]:
# Apply the scaling learned from the training split to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [49]:
# Linear SVM baseline. A fixed random_state makes the solver's internal data shuffling
# repeatable, in line with the seeded train/test split.
model = LinearSVC(random_state = 20)

In [51]:
# Train the linear SVM on the scaled training features.
model.fit(X_train_scaled,y_train)

LinearSVC()

In [52]:
# Predict wine type for the scaled test features with the linear model.
pred = model.predict(X_test_scaled)

In [53]:
# Share of test samples the linear model labels correctly.
met.accuracy_score(y_true = y_test, y_pred = pred)

0.9953596287703016

In [None]:
#### GAUSSIAN KERNEL

In [56]:
# SVM with the Gaussian (RBF) kernel; all other hyperparameters are left at their defaults.
model_rbf = SVC(kernel = "rbf")

In [59]:
# Train the RBF-kernel SVM on the same scaled training features.
model_rbf.fit(X_train_scaled,y_train)

SVC()

In [60]:
# Predict wine type for the scaled test features with the RBF model.
pred_rbf = model_rbf.predict(X_test_scaled)

In [61]:
# Share of test samples the RBF model labels correctly.
met.accuracy_score(y_true = y_test, y_pred = pred_rbf)

0.9984532095901005

In [62]:
#### HYPERPARAMETER TUNING WITH GRIDSEARCH

In [63]:
# Candidate hyperparameter values for the grid search.
kernel = ["linear","rbf"]
# NOTE(review): 0.1 is absent from this otherwise decade-spaced grid - confirm whether that is intentional.
C = [0.001,0.01,1,10,100]
# gamma controls how far the influence of a single training point reaches:
# a low gamma means far-away points still affect the boundary (smoother boundary),
# a high gamma means only points close to the boundary affect it (more flexible, can overfit).
gamma = ["scale",0.1,0.01,"auto"]


In [66]:
# Search space: every kernel / C / gamma combination is evaluated.
# NOTE(review): gamma is a kernel coefficient that the linear kernel presumably ignores,
# so the linear candidates would be repeated once per gamma value - confirm.
param_grid = dict(kernel = kernel, C = C, gamma = gamma)

In [67]:
# Exhaustive search over param_grid for an SVC, using GridSearchCV's default cross-validation and scoring.
grid = mod.GridSearchCV(estimator = SVC(), param_grid = param_grid)

In [68]:
# gamma and C are coupled, so they are tuned jointly: every combination in param_grid is evaluated on the scaled training split.
grid.fit(X_train_scaled,y_train)


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.001, 0.01, 1, 10, 100],
                         'gamma': ['scale', 0.1, 0.01, 'auto'],
                         'kernel': ['linear', 'rbf']})

In [74]:
# The estimator with the best-scoring parameter combination found by the search.
# Only parameters that differ from the SVC defaults appear in the displayed repr.
grid.best_estimator_

SVC(C=10)

In [70]:
# Predict with the tuned model. NOTE: this rebinds `pred`, replacing the earlier LinearSVC predictions.
pred = grid.predict(X_test_scaled)

In [71]:
# Share of test samples the tuned model labels correctly.
met.accuracy_score(y_true = y_test, y_pred = pred)

0.9969064191802011

In [75]:
# decision_function returns a signed score for each sample that grows with its distance from the decision boundary; it is not a probability. The sign of the score determines the predicted class.

In [76]:
# Raw decision scores of the tuned model for the test samples (signed: one class per side of the boundary).
grid.decision_function(X_test_scaled)

array([ 1.55827425, -1.59770566,  1.23117017, ...,  2.84907503,
       -2.08063848,  2.04105297])

In [77]:
# Display the tuned model's predicted labels, for comparison with the signs of the decision scores.
pred

array(['white', 'red', 'white', ..., 'white', 'red', 'white'],
      dtype=object)