# Support vector machine classifier and the kernel trick
#### Part of the course on "Foundations of machine learning", Department of Mathematics and Statistics, University of Turku, Finland
#### Lectures available on YouTube: https://youtube.com/playlist?list=PLbkSohdmxoVAZ9DEHEWHjeGK7Ei-DjKHI&si=Msu74_I0qhLrRWcu
#### Code available on GitHub: https://github.com/ionpetre/FoundML_course_assignments

Support Vector Machines (SVMs) are powerful supervised learning models. They work by finding the optimal hyperplane that best separates data points into distinct classes while maximizing the margin, the distance between the hyperplane and the closest points (support vectors). They use a kernel trick to transform data into a higher-dimensional space, enabling the creation of nonlinear decision boundaries, effectively handling complex relationships in the data. 

Datasets used in this notebook: Iris, the two-spirals dataset, and the checkerboard dataset.

#### Load the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC

import seaborn as sns

In [None]:
# Reset the seed of the random number generator, for reproducibility purposes

import os

def reset_seed(SEED = 0):
    """Seed the random sources used in this notebook, so that runs are reproducible.

    Arguments:
        SEED: the seed value. Its string form is written to the PYTHONHASHSEED
              environment variable, and numpy's global random generator is seeded with it.
    """
    seed_text = str(SEED)
    os.environ['PYTHONHASHSEED'] = seed_text
    np.random.seed(SEED)


# Seed once right away, before any data is loaded or split
reset_seed(2023)

#### Load the data: the Iris dataset

In [None]:
# Import the Iris dataset from the sklearn library. 

from sklearn.datasets import load_iris

# Fetch the Iris features (X) and the class labels (y) as pandas objects
X, y = load_iris(as_frame=True, return_X_y=True)

# Put features and labels side by side for a moment, only to look at the table and
# to draw the pairwise scatter plots coloured by class; the joint table is dropped afterwards
Xy = pd.concat([X, y], axis="columns")
display(Xy)
sns.pairplot(Xy, hue="target", palette='tab10')
del Xy

In [None]:
# Split into train and test
from sklearn.model_selection import train_test_split

# Keep 20% of the samples aside for testing. The split is shuffled with a fixed
# random_state (so it is repeatable) and stratified on the class labels y.
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(), y,
    stratify=y, shuffle=True, random_state=100, test_size=0.20,
)


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Rescale the features with a min-max scaler. The scaler is fitted on the training
# data only; the same fitted transformation is then applied to both train and test data.
# (StandardScaler, imported above, can be swapped in here as an alternative.)
my_scaler = MinMaxScaler().fit(X_train)
X_train, X_test = my_scaler.transform(X_train), my_scaler.transform(X_test)

#### Train several support vector classifiers to demonstrate the role of the kernels

In [None]:
# We evaluate the models using a 5-fold cross-validation strategy to get their performance
# Note: this is not the same as training a single model and getting its metrics

from sklearn.model_selection import cross_val_score

def cv_evaluation(estimators, X_train, y_train):
    """Cross-validate every estimator and report the resulting scores.

    Each estimator is scored with 5-fold cross-validation on (X_train, y_train).
    The mean and the variance of its fold scores go into one row of a table; the
    table is printed sorted by decreasing mean score and drawn as a bar plot.

    Arguments:
        estimators: list of models to evaluate
        X_train, y_train: features and labels used for the cross-validation

    The row labels are read, position by position, from the global list estimator_type.
    """
    scores = pd.DataFrame(columns=['Estimator', 'CV Scores mean', 'CV Scores Variance'])

    for row, model in enumerate(estimators):
        label = estimator_type[row]
        fold_scores = cross_val_score(model, X_train, y_train, cv=5, n_jobs=-1)
        spread = fold_scores.std()
        # variance = squared standard deviation of the fold scores
        scores.loc[row] = [label, fold_scores.mean(), spread**2]

    # Best model first
    scores = scores.sort_values(by='CV Scores mean', ascending=False)
    print(scores)

    plt.figure(figsize=(5, 2))
    sns.barplot(x=scores['CV Scores mean'], y=scores['Estimator'])
    plt.show()

In [None]:
# We plot the decision boundaries of our models

from sklearn.inspection import DecisionBoundaryDisplay

def plot_decision_boundary(estimators, X_train, y_train):
    """Fit the estimators on two features and draw the decision regions of each.

    Arguments:
        estimators: list of models; each one is (re-)fitted here on (X_train, y_train).
                    The plots are laid out on a 2 x 2 grid, one panel per model.
        X_train: array whose columns 0 and 1 are the two features to plot
        y_train: labels, also used to colour the training points

    Panel titles come from the global list estimator_type and the axis labels
    from the first two column names of the global DataFrame X.
    """
    first, second = X_train[:, 0], X_train[:, 1]

    # Axis limits: the range of each feature, widened by 1 on both sides
    x_limits = (first.min() - 1, first.max() + 1)
    y_limits = (second.min() - 1, second.max() + 1)

    # Train the models
    for model in estimators:
        model.fit(X_train, y_train)

    fig, axes = plt.subplots(2, 2, figsize=(10,10))

    for idx, model in enumerate(estimators):
        # Panels are filled row by row: (0,0), (0,1), (1,0), (1,1)
        panel = axes[divmod(idx, 2)]
        panel.set(title=estimator_type[idx])
        panel.set_xlim(*x_limits)
        panel.set_ylim(*y_limits)
        disp = DecisionBoundaryDisplay.from_estimator(
            model, X_train,
            response_method="predict",
            plot_method='contourf',
            alpha=0.5,
            xlabel=X.columns[0], ylabel=X.columns[1],
            ax=panel,
        )
        # Overlay the training points on top of the predicted regions
        disp.ax_.scatter(first, second, c=y_train, edgecolor="k")

    plt.show()

In [None]:
# We will test several levels of regularization, indicated through parameter C

def my_estimators(C):
    """Build one support vector classifier per kernel, all with regularization level C.

    Arguments:
        C: regularization parameter passed to every SVC. The strength of the
           regularization is inversely proportional to C.

    Returns:
        A list of four unfitted SVC models, in the order
        linear, polynomial (degree 3), sigmoid, rbf.
    """
    # Settings common to all four models
    shared = dict(C=C, decision_function_shape='ovr', random_state=150)

    estimators = [
        SVC(kernel='linear', **shared),
        # degree: degree of the polynomial kernel function
        # gamma: kernel coefficient for 'rbf', 'poly' and 'sigmoid' ('scale', 'auto', or a float)
        SVC(kernel='poly', degree=3, gamma='scale', **shared),
        SVC(kernel='sigmoid', gamma='auto', **shared),
        # The rbf model follows C like the other kernels (it used to be fixed at C=1.0,
        # so changing C had no effect on it)
        SVC(kernel='rbf', gamma='scale', **shared),
    ]

    return estimators 

In [None]:
# Display names of the kernels, in the same order as the models returned by my_estimators()
estimator_type = [
    'linear',
    'poly',
    'sigmoid',
    'rbf',
]

In [None]:
# A decision boundary can only be drawn in two dimensions, so for the
# visualisation we keep just the first two columns of the training features
X_train_2 = X_train[:, 0:2]

In [None]:
# We will test several levels of regularization.
# We start with C=1, our base level of regularization.

# Re-seed first, so that the run is reproducible
reset_seed(2023)
estimators = my_estimators(1.0)
# Cross-validate on all the features ...
cv_evaluation(estimators, X_train, y_train)
# ... but draw the decision boundaries on the first two features only
# (this call re-fits the estimators on X_train_2)
plot_decision_boundary(estimators, X_train_2, y_train)

In [None]:
# Test C=10
# This gives less regularization; we expect the models (the decision boundaries) to become more complex.

# Re-seed first, so that the run is reproducible
reset_seed(2023)
estimators = my_estimators(10.0)
# Cross-validate on all the features ...
cv_evaluation(estimators, X_train, y_train)
# ... but draw the decision boundaries on the first two features only
# (this call re-fits the estimators on X_train_2)
plot_decision_boundary(estimators, X_train_2, y_train)

In [None]:
# Test C=0.1
# This gives more regularization; we expect the models and the decision boundaries to be simpler.

# Re-seed first, so that the run is reproducible
reset_seed(2023)
estimators = my_estimators(0.1)
# Cross-validate on all the features ...
cv_evaluation(estimators, X_train, y_train)
# ... but draw the decision boundaries on the first two features only
# (this call re-fits the estimators on X_train_2)
plot_decision_boundary(estimators, X_train_2, y_train)

In [None]:
# Drop everything that belongs to the Iris demo, so that the challenges below start from fresh names
del X, y, X_train, X_train_2, y_train, X_test, y_test, estimators

## Challenge 1: train support vector classifiers for the two spirals dataset
We train an SVC for a synthetic dataset that is designed to be "difficult": it is a 2-class dataset consisting of points spiraling around each other. Obviously, the dataset is not linearly separable.

In [None]:
# Generate the 2-spirals dataset X

import math

def spiral_xy(i, spiral_num):
    """
    Return the (x, y) coordinates of point number i on one of the two spirals.

    Arguments:
        i: index of the point along the spiral (this notebook uses 0 to 96)
        spiral_num: 1 or -1; it multiplies the offset from the centre (0.5, 0.5),
                    so the two values give points on opposite sides of the centre

    The angle grows by pi/16 per step, while the radius shrinks linearly with i.
    """
    angle = i/16 * math.pi
    radius = 6.5 * ((104 - i)/104)
    # Offsets from the centre, scaled down by 13
    dx = (radius * math.cos(angle) * spiral_num)/13
    dy = (radius * math.sin(angle) * spiral_num)/13
    return (dx + 0.5, dy + 0.5)

def spiral(spiral_num):
    """Return the list of the 97 points (i = 0, ..., 96) of the spiral selected by spiral_num."""
    indices = range(97)
    return [spiral_xy(index, spiral_num) for index in indices]

# Build the two arms of the dataset: the spiral with spiral_num=1 gets label 1,
# the spiral with spiral_num=-1 gets label -1
a = pd.DataFrame(np.array(spiral(1)), columns=['x', 'y']).assign(label=1)
b = pd.DataFrame(np.array(spiral(-1)), columns=['x', 'y']).assign(label=-1)

# Stack both arms into one table (which still contains the label column)
X = pd.concat([a, b], axis="index")

X.plot.scatter(x='x', y='y', c='label', colormap='jet')

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Separate the labels (kept as a one-column table) from the two coordinates
y = X[['label']]
X.drop(columns=['label'], inplace=True)
y_train = y.to_numpy()

# Standardise the coordinates; the whole dataset is used as training data here
my_scaler = StandardScaler().fit(X)
X_train = my_scaler.transform(X)

### Challenge: learn the 2-spiral dataset using a support vector classifier
>Train a support vector classifier to learn the 2-spirals dataset perfectly. Try the four kernels demonstrated in this notebook. 

>You can keep C=1 and check different values for parameter gamma for the polynomial, the sigmoid and the radial basis kernels. Use degree 10 for the polynomial kernel.

>In each case, train the model and display its decision boundary support. Get its predictions on X_train and get the classification report to obtain the accuracy.

> Use reset_seed(2023) before each call to model fitting, as done in the demo part of this notebook.

> Q1. What is the accuracy of the linear kernel? 

> Q2-4. For gamma='scale' what is the accuracy for the poly/sigmoid/rbf SVC? 

> Q5-6. For gamma=150 what is the accuracy for the sigmoid/rbf SVC? 

In [None]:
# We will test several levels of regularization, indicated through parameters C and gamma

def my_estimators(C, gamma):
    """Build one support vector classifier per kernel for the given C and gamma.

    Arguments:
        C: regularization parameter passed to every SVC. The strength of the
           regularization is inversely proportional to C.
        gamma: kernel coefficient ('scale', 'auto', or a float) used by the
               sigmoid and the rbf models. The polynomial model always uses
               gamma='scale'; the linear model has no gamma.

    Returns:
        A list of four unfitted SVC models, in the order
        linear, polynomial (degree 10), sigmoid, rbf.
    """
    # Settings common to all four models
    shared = dict(C=C, decision_function_shape='ovr', random_state=150)

    estimators = [
        SVC(kernel='linear', **shared),
        # degree: degree of the polynomial kernel function
        SVC(kernel='poly', degree=10, gamma='scale', **shared),
        SVC(kernel='sigmoid', gamma=gamma, **shared),
        # The rbf model follows C like the other kernels (it used to be fixed at C=1.0,
        # so changing C had no effect on it)
        SVC(kernel='rbf', gamma=gamma, **shared),
    ]

    return estimators 


In [None]:
# We evaluate our models

from sklearn.metrics import classification_report, confusion_matrix, RocCurveDisplay

def evaluate(estimators, X_train, y_train):
    """Fit every estimator on the training data and report how well it reproduces it.

    For each estimator this prints the classification report and the confusion
    matrix of its predictions on X_train, and draws its ROC curve. All the
    figures are shown together at the end.

    Arguments:
        estimators: list of models; each one is fitted here
        X_train: training features
        y_train: training labels (flattened with ravel() before fitting)

    The names printed in the reports are read from the global list estimator_type.
    """
    for idx, model in enumerate(estimators):
        model.fit(X_train, y_train.ravel())
        predicted = model.predict(X_train)

        kernel_name = estimator_type[idx]
        print("\n The classification results on the train data (",kernel_name,"):")
        print(classification_report(y_train,predicted))
        print("Confusion matrix on the train data(",kernel_name,"):\n", confusion_matrix(y_train,predicted))

        RocCurveDisplay.from_estimator(model, X_train, y_train)

    plt.show()

In [None]:
# Your code here


### Challenge 2:  train support vector classifiers for the checkerboard dataset. 
>Train an SVC for another synthetic dataset that is designed to be "difficult": the checkerboard dataset. Obviously, the dataset is not linearly separable. Try the four kernels demonstrated in this notebook. 

>You can keep C=1 and check different values for parameter gamma for the polynomial, the sigmoid and the radial basis kernels. Use degree 10 for the polynomial kernel.

>In each case, train the model and display its decision boundary support. Get its predictions on X_train and get the classification report to obtain the accuracy.

> Use reset_seed(2023) before each call to model fitting, as done in the demo part of this notebook.

> Q7. What is the accuracy of the linear kernel? 

> Q8-10. For gamma='scale' what is the accuracy for the poly/sigmoid/rbf SVC?

> Q11-12. For gamma=50 what is the accuracy for the sigmoid/rbf SVC? 


In [None]:
# We generate a checkerboard 4 x 4, where each cell (i,j) consists of 100 points with 
# coordinates (x,y), all with the same label. 
# The labels alternate between 0 and 1 from cell to cell, horizontally and vertically. 
# Each cell has size width x height

width = 2
height = 2 

from numpy.random import rand

# The cells are collected in a list and concatenated once at the end. This avoids
# growing a DataFrame inside the loop, and avoids starting from an empty frame
# whose integer x/y columns do not match the real-valued coordinates.
cells = []
for i in range(4):
    for j in range(4):
        # Generate the points in cell (i,j): uniform random points in the unit square,
        # stretched to the cell size and shifted to the position of the cell
        ij = rand(100, 2)
        ij[:, 0] = i*width + ij[:, 0]*width
        ij[:, 1] = j*height + ij[:, 1]*height
        ij_df = pd.DataFrame(ij, columns=['x', 'y'])
        # Neighbouring cells get different labels
        ij_df['color'] = (i + j) % 2
        cells.append(ij_df)

checkerboard = pd.concat(cells, axis=0)
        

# Show the generated points, coloured by their label
checkerboard.plot(kind='scatter', x='x', y='y', c='color', colormap='jet')

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Separate the labels (kept as a one-column table) from the two coordinates
y = checkerboard[['color']]
X = checkerboard.drop(columns=['color'])
y_train = y.to_numpy()

# Standardise the coordinates; the whole dataset is used as training data here
my_scaler = StandardScaler().fit(X)
X_train = my_scaler.transform(X)

In [None]:
# We will test several levels of regularization, indicated through parameters C and gamma

def my_estimators(C, gamma):
    """Build one support vector classifier per kernel for the given C and gamma.

    Arguments:
        C: regularization parameter passed to every SVC. The strength of the
           regularization is inversely proportional to C.
        gamma: kernel coefficient ('scale', 'auto', or a float) used by the
               sigmoid and the rbf models. The polynomial model always uses
               gamma='scale'; the linear model has no gamma.

    Returns:
        A list of four unfitted SVC models, in the order
        linear, polynomial (degree 10), sigmoid, rbf.
    """
    # Settings common to all four models
    shared = dict(C=C, decision_function_shape='ovr', random_state=150)

    estimators = [
        SVC(kernel='linear', **shared),
        # degree: degree of the polynomial kernel function
        SVC(kernel='poly', degree=10, gamma='scale', **shared),
        SVC(kernel='sigmoid', gamma=gamma, **shared),
        # The rbf model follows C like the other kernels (it used to be fixed at C=1.0,
        # so changing C had no effect on it)
        SVC(kernel='rbf', gamma=gamma, **shared),
    ]

    return estimators 

In [None]:
# Your code here
