# <center>IEE 520: Fall 2019</center>

# <center> Support Vector Machines (10/10/19)</center>

## <center>Klim Drobnyh (klim.drobnyh@asu.edu)</center>

**NOTE: TO SUPPORT INTERACTIVE PLOTS IN JUPYTER LAB, RUN**

conda install -c conda-forge nodejs

jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [None]:
# For compatibility with Python 2
from __future__ import print_function

# To load datasets
from sklearn import datasets

# To import the classifier (SVM classifier)
from sklearn.svm import SVC

# To measure accuracy
from sklearn import metrics

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

# To support plots
from ipywidgets import interact
import ipywidgets as widgets
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

import numpy as np

# To display all the plots inline
%matplotlib inline 

In [None]:
# Enlarge the default figure size (matplotlib reads figsize as width, height)
# so that the plots below are easier to read
plt.rcParams["figure.figsize"] = (20, 10)

In [None]:
# To import the scalers
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Binarizer


class DummyScaler:
    """No-op scaler with the same ``fit``/``transform`` calls as the sklearn scalers.

    It lets the notebook switch scaling off by swapping the scaler factory,
    without changing any of the code that fits and applies the scaler.
    """

    def fit(self, data, y=None):
        """Learn nothing from ``data`` and return ``self``.

        Returning ``self`` allows the chained form ``scaler.fit(X).transform(X)``
        to work. ``y`` is accepted and ignored.
        """
        return self

    def transform(self, data):
        """Return ``data`` unchanged (the very same object, not a copy)."""
        return data

    def fit_transform(self, data, y=None):
        """Fit (a no-op) and return ``data`` unchanged."""
        return self.fit(data, y).transform(data)

def create_scaler_dummy():
    """Return a new ``DummyScaler``, i.e. a scaler that leaves the data as is."""
    scaler = DummyScaler()
    return scaler
    
def create_scaler_standard():
    """Return a new ``StandardScaler`` constructed with its default arguments."""
    scaler = StandardScaler()
    return scaler

def create_scaler_minmax():
    """Return a new ``MinMaxScaler`` constructed with its default arguments."""
    scaler = MinMaxScaler()
    return scaler

def create_scaler_binarizer():
    """Return a new ``Binarizer`` constructed with its default arguments."""
    return Binarizer()


# Backward-compatible alias: the factory was originally defined under this
# misspelled name, so keep it callable for any existing code that uses it.
crete_scaler_binarizer = create_scaler_binarizer

## <center>Toy dataset</center>

### <center>Prepare the dataset</center>

The Iris flower data set or Fisher's Iris data set is a multivariate data set introduced by the British statistician and biologist Ronald Fisher in his 1936 paper "The use of multiple measurements in taxonomic problems" as an example of linear discriminant analysis.

The data set consists of 50 samples from each of three species of Iris (Iris setosa, Iris virginica and Iris versicolor). Four features were measured from each sample: the length and the width of the sepals and petals, in centimeters. Based on the combination of these four features, Fisher developed a linear discriminant model to distinguish the species from each other.

In [None]:
# Load the features and labels as two arrays. ``return_X_y`` is passed by
# keyword: newer scikit-learn releases no longer accept it positionally.
X, y = datasets.load_iris(return_X_y=True)

Let's trim the data to have just 2 variables and 2 classes.

In [None]:
# Drop the samples labelled 0 so that a two-class problem remains, and keep
# only feature columns 1 and 2 (labelled sepal width / petal length below).
y_trimmed = y[y != 0]
X_trimmed = X[y != 0, 1:3]

# Rescale the two remaining features with the selected scaler factory.
create_scaler = create_scaler_minmax
scaler = create_scaler()
scaler.fit(X_trimmed)
X_trimmed = scaler.transform(X_trimmed)

In [None]:
# Scatter the two retained features (column 0 on x, column 1 on y),
# coloured by class label.
plt.scatter(*X_trimmed.T, c=y_trimmed, s=30, cmap=plt.cm.bwr)
plt.xlabel('Sepal width')
plt.ylabel('Petal length')
plt.title('Visualization of reduced iris problem')
plt.show()

### <center>Support Vector Machines Classifier (choice of kernel)</center>

You can find a full list of parameters here:

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [None]:
import matplotlib
# Here we use closure to store the related variables
def create_plot_svm_classification_kernels(_X, _y):
    """Build a plotting callback that fits and draws an SVC for a chosen kernel.

    The returned function closes over the data, so it can be handed to
    ``interact`` with only the hyperparameters as arguments.

    Parameters
    ----------
    _X : array indexed as ``_X[:, 0]`` and ``_X[:, 1]``
        Feature matrix; column 0 is drawn on the x axis, column 1 on the y axis.
    _y : array
        Labels passed to ``SVC.fit`` and used to colour the points.

    Returns
    -------
    callable
        ``plot_svc_kernel(C=1, kernel='linear')``.
    """
    X, y = _X, _y

    def plot_svc_kernel(C=1, kernel='linear'):
        """Fit an SVC on the captured data and plot its decision function.

        ``kernel`` is either a kernel name passed straight to ``SVC`` or
        ``'poly<d>'``, where ``<d>`` is the polynomial degree.
        """
        if kernel.startswith('poly'):
            # The degree is encoded in the suffix. A bare 'poly' used to crash
            # on int(''); fall back to degree 3 (SVC's default degree).
            suffix = kernel[4:]
            degree = int(suffix) if suffix else 3
            clf = SVC(kernel='poly', C=C, gamma='auto', degree=degree)
        else:
            clf = SVC(kernel=kernel, C=C, gamma='auto')
        clf.fit(X, y)

        _, ax = plt.subplots()
        # Draw a fully transparent diagonal across the data's bounding box so
        # the axes pick their limits before the evaluation grid is built.
        ax.plot((np.min(X[:, 0]), np.max(X[:, 0])), (np.min(X[:, 1]), np.max(X[:, 1])), alpha=0.0)
        xlim = ax.get_xlim()
        ylim = ax.get_ylim()

        # Evaluate the decision function on a 50 x 50 grid covering the axes.
        xx = np.linspace(xlim[0], xlim[1], 50)
        yy = np.linspace(ylim[0], ylim[1], 50)
        YY, XX = np.meshgrid(yy, xx)
        xy = np.vstack([XX.ravel(), YY.ravel()]).T
        Z = clf.decision_function(xy).reshape(XX.shape)

        # Symmetric colour range so that a decision value of 0 sits at the
        # centre of the diverging colormap.
        v = max(np.max(Z), -np.min(Z))
        ax.contourf(XX, YY, Z, 100, cmap='coolwarm', norm=matplotlib.colors.Normalize(vmin=-v, vmax=v), alpha=0.1)
        ax.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=plt.cm.bwr)
        # Solid line: decision value 0; dashed lines: decision values -1 and +1.
        ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
                   linestyles=['--', '-', '--'], linewidths=[2, 5, 2])

        # Circle the support vectors.
        ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100,
                   linewidth=1, facecolors='none', edgecolors='k')

        plt.xlabel('Sepal width')
        plt.ylabel('Petal length')
        plt.title('Support Vector Machines Classifier: C=%s, %s kernel.' % (str(C), kernel))
        plt.show()
    return plot_svc_kernel

In [None]:
# Kernel choices offered in the dropdown: linear, polynomial of degree 1..8
# (encoded as 'poly<degree>') and RBF.
kernels = ['linear'] + ['poly' + str(degree) for degree in range(1, 9)] + ['rbf']
# Log-scale slider for C (base 10, exponents -4 .. 4 in steps of 0.5).
# The variable name is deliberately plain ASCII, so it cannot be confused
# with a look-alike non-ASCII letter and stays valid in Python 2 as well.
C_widget = widgets.FloatLogSlider(
    value=10,
    base=10,
    min=-4,
    max=4,
    step=0.5,
    continuous_update=False,
    description='C')
interact(create_plot_svm_classification_kernels(X_trimmed, y_trimmed), C=C_widget, kernel=kernels)

### <center>Support Vector Machines Classifier (RBF)</center>

In [None]:
# Here we use closure to store the related variables
def create_plot_svm_classification_rbf(_X, _y):
    """Build a plotting callback that fits and draws an RBF-kernel SVC.

    The returned function closes over the data, so it can be handed to
    ``interact`` with only the hyperparameters as arguments.

    Parameters
    ----------
    _X : array indexed as ``_X[:, 0]`` and ``_X[:, 1]``
        Feature matrix; column 0 is drawn on the x axis, column 1 on the y axis.
    _y : array
        Labels passed to ``SVC.fit`` and used to colour the points.

    Returns
    -------
    callable
        ``plot_svc_rbf(C=1, gamma=1)``.
    """
    X, y = _X, _y

    def plot_svc_rbf(C=1, gamma=1):
        """Fit ``SVC(kernel='rbf', C=C, gamma=gamma)`` and plot its decision function."""
        clf = SVC(kernel='rbf', C=C, gamma=gamma)
        clf.fit(X, y)

        # Plot the data first so the current axes get their limits from it.
        plt.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=plt.cm.bwr)

        ax = plt.gca()
        xlim = ax.get_xlim()
        ylim = ax.get_ylim()

        # Evaluate the decision function once on a 50 x 50 grid over the axes
        # (the original evaluated it twice with identical arguments).
        xx = np.linspace(xlim[0], xlim[1], 50)
        yy = np.linspace(ylim[0], ylim[1], 50)
        YY, XX = np.meshgrid(yy, xx)
        xy = np.vstack([XX.ravel(), YY.ravel()]).T
        Z = clf.decision_function(xy).reshape(XX.shape)

        # Symmetric colour range so that a decision value of 0 sits at the
        # centre of the diverging colormap. ``mpl`` is the alias from the
        # notebook's first import cell, so this cell does not depend on an
        # ``import matplotlib`` executed in another cell.
        v = max(np.max(Z), -np.min(Z))
        ax.contourf(XX, YY, Z, 100, cmap='coolwarm', norm=mpl.colors.Normalize(vmin=-v, vmax=v), alpha=0.1)
        # Solid line: decision value 0; dashed lines: decision values -1 and +1.
        ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
                   linestyles=['--', '-', '--'], linewidths=[2, 5, 2])

        # Circle the support vectors.
        ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100,
                   linewidth=1, facecolors='none', edgecolors='k')

        plt.xlabel('Sepal width')
        plt.ylabel('Petal length')
        plt.title('Support Vector Machines Classifier: C=%s, Gamma=%s.' % (str(C), str(gamma)))
        plt.show()
    return plot_svc_rbf

In [None]:
# Log-scale sliders for C and gamma (base 10, exponents -4 .. 3, step 0.5).
# The variable name is deliberately plain ASCII, so it cannot be confused
# with a look-alike non-ASCII letter and stays valid in Python 2 as well.
C_widget = widgets.FloatLogSlider(
    value=1,
    base=10,
    min=-4,
    max=3,
    step=0.5,
    continuous_update=False,
    description='C:')
gamma_widget = widgets.FloatLogSlider(
    value=1,
    base=10,
    min=-4,
    max=3,
    step=0.5,
    continuous_update=False,
    description='gamma:')
interact(create_plot_svm_classification_rbf(X_trimmed, y_trimmed), C=C_widget, gamma=gamma_widget)

### <center>SVM accuracy on a grid of parameter values</center>

In [None]:
# Reload the iris data (all features and classes) and hold out 20% for testing.
X, y = datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=520, test_size=0.2)

# Fit the scaler on the training split only, then apply it to both splits.
create_scaler = create_scaler_minmax
scaler = create_scaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

def plot_param_search_rbf(X_train, y_train, X_test, y_test, Cs, gammas):
    """Plot the test accuracy of an RBF-kernel SVC over a grid of (C, gamma).

    For every pair taken from ``Cs`` x ``gammas`` an ``SVC(kernel='rbf')`` is
    fitted on the training data and scored on the test data; the scores are
    shown as a filled contour plot with log-scaled axes.

    Parameters
    ----------
    X_train, y_train : training data passed to ``SVC.fit``.
    X_test, y_test : evaluation data passed to ``SVC.score``.
    Cs : sequence of values for ``C`` (plotted on the y axis).
    gammas : sequence of values for ``gamma`` (plotted on the x axis).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Use the grids supplied by the caller. The original version overwrote
    # both arguments with a fixed 20-point grid, so they were silently ignored.
    Cs = np.asarray(Cs)
    gammas = np.asarray(gammas)

    def compute_accuracy(C, gamma):
        """Fit on the training split and return the score on the test split."""
        clf = SVC(kernel='rbf', C=C, gamma=gamma)
        clf.fit(X_train, y_train)
        return clf.score(X_test, y_test)

    # Row i corresponds to gammas[i], column j to Cs[j]; this is the layout
    # produced by np.meshgrid(Cs, gammas).
    C_mesh, gamma_mesh = np.meshgrid(Cs, gammas)
    Z = np.zeros(C_mesh.shape)
    for i, gamma in enumerate(gammas):
        for j, C in enumerate(Cs):
            Z[i, j] = compute_accuracy(C, gamma)

    fig, ax = plt.subplots()
    plt.contourf(gamma_mesh, C_mesh, Z, 50, cmap='gray')
    plt.colorbar()
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('Gamma')
    ax.set_ylabel('C')
    plt.show()

# 20 log-spaced candidates between 10**-3 and 10**4 for each hyperparameter.
exponents = np.linspace(-3, 4, num=20)
Cs = 10.0 ** exponents
gammas = 10.0 ** exponents
plot_param_search_rbf(X_train, y_train, X_test, y_test, Cs, gammas)

## <center>Hyperparameter search</center>

### <center>Dataset</center>

In [None]:
import pandas as pd

# This dataset is originally from the National Institute of Diabetes and
# Digestive and Kidney Diseases. The objective is to diagnostically predict
# whether or not a patient has diabetes, based on the diagnostic measurements
# included in the dataset. Several constraints were placed on the selection
# of these instances from a larger database. In particular, all patients here
# are females at least 21 years old of Pima Indian heritage.
#
# Columns:
#   1. Number of times pregnant
#   2. Plasma glucose concentration at 2 hours in an oral glucose tolerance test
#   3. Diastolic blood pressure (mm Hg)
#   4. Triceps skin fold thickness (mm)
#   5. 2-Hour serum insulin (mu U/ml)
#   6. Body mass index (weight in kg/(height in m)^2)
#   7. Diabetes pedigree function
#   8. Age (years)
#   9. Class variable (0 or 1)

names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Class',
]
# The first 9 lines of the file are skipped and the column names above are
# used instead of a header row.
data = pd.read_csv(
    'https://gist.githubusercontent.com/ktisha/c21e73a1bd1700294ef790c56c8aec1f/raw/819b69b5736821ccee93d05b51de0510bea00294/pima-indians-diabetes.csv',
    names=names,
    header=None,
    skiprows=9,
)

In [None]:
# Show the loaded table to check that the columns were read as expected
print(data)

Let's remove all the rows with missing values (in this dataset they are encoded as zeros).

In [None]:
# Drop every row that has a zero in any of these columns; a zero there is
# treated as a missing value.
for column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age']:
    data = data[data[column] != 0]

Split the data into X (features) and y (class labels).

In [None]:
# The last column holds the class label; every column before it is a feature.
vals = data.values
X, y = vals[:, :-1], vals[:, -1]

Let's split the data into train and test. Also, we need to scale the data before using SVM.

In [None]:
# Hold out 10% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=520, test_size=0.1)

# Fit the scaler on the training split only, then apply it to both splits.
create_scaler = create_scaler_minmax
scaler = create_scaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

### <center>Hyperparameter search (rbf)</center>

In [None]:
# Randomised search with 5-fold cross-validation: 40 of the 7 x 7 candidate
# (C, gamma) pairs are sampled, with both parameters on the grid 10**-3 .. 10**3.
# The former ``iid=True`` argument is not passed: it was deprecated and later
# removed from scikit-learn, where passing it raises a TypeError.
# ``random_state`` makes the sampled candidates reproducible between runs.
model = RandomizedSearchCV(SVC(kernel='rbf', random_state=520),
                           cv=5,
                           n_iter=40,
                           n_jobs=-1,
                           random_state=520,
                           param_distributions={
                               'C': [10**x for x in range(-3, 4)],
                               'gamma': [10**x for x in range(-3, 4)]
                           })

# Alternative: exhaustive search over the same grid.
# model = GridSearchCV(SVC(kernel='rbf', random_state=520),
#                          cv=5,
#                          n_jobs=-1,
#                          param_grid={
#                              'C': [10**x for x in range(-3, 4)],
#                              'gamma': [10**x for x in range(-3, 4)]
#                          })

model.fit(X_train, y_train)
print('Optimal parameters:', model.best_params_)

# Evaluate the selected model on the held-out test split.
y_test_hat = model.predict(X_test)
print('Accuracy:', metrics.accuracy_score(y_test, y_test_hat))

print(metrics.classification_report(y_test, y_test_hat))

In [None]:
# 40 log-spaced candidates between 10**-3 and 10**4 for each hyperparameter.
exponents = np.linspace(-3, 4, num=40)
Cs = 10.0 ** exponents
gammas = 10.0 ** exponents
plot_param_search_rbf(X_train, y_train, X_test, y_test, Cs, gammas)