In [48]:
from sklearn.datasets import make_blobs
import numpy as np
from sklearn.model_selection import KFold
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, recall_score,precision_score,f1_score

In [49]:
X, y = make_blobs(n_samples=1000, centers=2, random_state=0)

In [50]:
# Scale the data
scaler = StandardScaler()
scaler_fit =scaler.fit(X)
X_scaled = scaler_fit.transform(X)

# Relabel the y-targets
y = np.where(y == 0, -1, 1)

In [51]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

In [52]:
# Add the bias
X_train = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_test =  np.c_[np.ones((X_test.shape[0], 1)), X_test]

In [53]:
# loss function for svm
def loss_function(X, y, C, theta):
    m = X.shape[0]
    loss = 0
    reg_term = (1/2)*(theta.T.dot(theta))
    for i in range(m):
        loss += reg_term + (C * max(0, 1-(y[i]*(theta.dot(X[i])))))
    
    loss = loss / m
    return loss

In [54]:
# gradient function for svm
def gradient(X_i, y_i, theta, C, N):
    if max(0, 1-(y_i*(X_i.dot(theta)))) == 0:
        return theta / N
    return (theta - (C*y_i*X_i)) / N

##### Decision Function
Trained weight vector multiplied by the data

In [55]:
# prediction function for svm
def predict(theta, X):
    pred = X.dot(theta)
    return np.where(pred <= 0, -1, 1)

In [56]:
# calculating precision, recall, f_score, accuracy
def scores(y, y_pred):
    tp,tn,fp,fn = 0,0,0,0
    for i in range(len(y)):
        if y[i] == 1 and y_pred[i] == 1:
            tp += 1
        elif y[i] == 1 and y_pred[i] == -1:
            fn += 1
        elif y[i] == -1 and y_pred[i] == 1:
            fp += 1
        elif y[i] == -1 and y_pred[i] == -1:
            tn += 1
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f_score = 2*precision*recall/(precision+recall)
    accuracy = (tp+tn) / (tp+tn+fp+fn)
    return precision, recall, f_score, accuracy

In [57]:
t0, t1 = 5,50
n_iterations = 500
C = 100
def learning_schedule(t):
    return t0 / (t + t1)

# calculating the coefficient vector
def weight(X_train, y_train, n_iterations, C):
    m = X_train.shape[0]
    n = X_train.shape[1]
    theta = np.random.rand(n) # random initialisation
    random_idx = []
    loss_list = []

    for epoch in range(1, n_iterations):
        for i in range(m):
            random_index = np.random.randint(m)
            while random_index in random_idx: # getting unique random index
                random_index = np.random.randint(m)
            xi = X_train[random_index]
            yi = y_train[random_index]
            gi = gradient(xi, yi, theta, C, m)
            eta = learning_schedule(epoch * m + i)
            theta = theta - eta * gi
        loss_list.append(loss_function(X_train, y_train, C, theta))
    return theta, loss_list

theta, loss_list = weight(X_train, y_train, n_iterations, C)

In [58]:
# predicting on the testing set and evaluating the predictions
y_pred = predict(theta, X_test)
precision, recall, f_score, accuracy = scores(y_test, y_pred)
print(precision, recall, f_score, accuracy)

0.9534883720930233 0.9534883720930233 0.9534883720930233 0.952


In [59]:
# predicting on the training set and evaluating the predictions
y_pred_train = predict(theta, X_train)
precision_t, recall_t, f_score_t, accuracy_t = scores(y_train, y_pred_train)
print(precision_t, recall_t, f_score_t, accuracy_t)

0.9514824797843666 0.9514824797843666 0.9514824797843666 0.952


In [60]:
# confusion matrix for testing predictions
from sklearn.metrics import confusion_matrix
confusionMatrix = confusion_matrix(y_test, y_pred)
print(confusionMatrix)

[[115   6]
 [  6 123]]


In [61]:
# confusion matrix for training predictions
confusionMatrix_train = confusion_matrix(y_train, y_pred_train)
print(confusionMatrix_train)

[[361  18]
 [ 18 353]]


In [62]:
# calculating the distances of each point in the training dataset from the decision boundary
theta_norm = np.linalg.norm(theta)
distances = []

for i in range(len(X_train)):
    d = abs((X_train[i].dot(theta)) / theta_norm)
    distances.append(d)
    
min_distance = min(distances)
min_idx = distances.index(min_distance)

print("Minimum Distance: ", min_distance)
print("Index of the instance with minimum distance: ", min_idx)
print("Instance with the minimum distance: ", X_train[min_idx])

Minimum Distance:  0.003603874975510126
Index of the instance with minimum distance:  692
Instance with the minimum distance:  [1.         0.57447184 0.25745982]


Which of the training examples are closest to the decision boundary in the SVM
primal problem?

Minimum Distance:  0.003603874975510126

Index of the instance with minimum distance:  692

Instance with the minimum distance:  [1.         0.57447184 0.25745982]

In [63]:
# Testing the model using k fold validation
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
n_iterations = 900
C = 20

for i, (train_index, test_index) in enumerate(kf.split(X_scaled)):
    X_train = X[train_index]
    X_train = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    y_train = y[train_index]
    theta, loss_list = weight(X_train, y_train, n_iterations, C)
    y_pred = predict(theta, X_train)
    p, r, f, a = scores(y_train, y_pred)
    print("Training Precision: ", p)
    print("Traning Recall: ", r)
    print("Traning F score: ", f)
    print("Traning Accuracy: ", a)
    confusionMatrix = confusion_matrix(y_train, y_pred)
    print("Traning Confusion Matrix: ", confusionMatrix)
    print("----------------------------------------")

    
    X_test = X[test_index]
    X_test =  np.c_[np.ones((X_test.shape[0], 1)), X_test]
    y_test = y[test_index]
    y_pred = predict(theta, X_test)
    p, r, f, a = scores(y_test, y_pred)
    print("Testing Precision: ", p)
    print("Testing Recall: ", r)
    print("Testing F score: ", f)
    print("Testing Accuracy: ", a)
    confusionMatrix = confusion_matrix(y_test, y_pred)
    print("Testing Confusion Matrix: ", confusionMatrix)
    print("========================================")
    print("========================================")


Training Precision:  0.9592875318066157
Traning Recall:  0.9308641975308642
Traning F score:  0.9448621553884712
Traning Accuracy:  0.945
Traning Confusion Matrix:  [[379  16]
 [ 28 377]]
----------------------------------------
Testing Precision:  0.9770114942528736
Testing Recall:  0.8947368421052632
Testing F score:  0.9340659340659342
Testing Accuracy:  0.94
Testing Confusion Matrix:  [[103   2]
 [ 10  85]]
Training Precision:  0.972972972972973
Traning Recall:  0.9113924050632911
Traning F score:  0.9411764705882353
Traning Accuracy:  0.94375
Traning Confusion Matrix:  [[395  10]
 [ 35 360]]
----------------------------------------
Testing Precision:  0.9504950495049505
Testing Recall:  0.9142857142857143
Testing F score:  0.9320388349514563
Testing Accuracy:  0.93
Testing Confusion Matrix:  [[90  5]
 [ 9 96]]
Training Precision:  0.9699453551912568
Traning Recall:  0.9033078880407125
Traning F score:  0.9354413702239789
Traning Accuracy:  0.93875
Traning Confusion Matrix:  [[396 