In [540]:
from sklearn.datasets import make_blobs
import numpy as np
from sklearn.model_selection import KFold
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, recall_score,precision_score,f1_score

#### Explore Dataset

In [541]:
data = load_breast_cancer()
print(data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [542]:
import pandas as pd
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
df.head(10)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0
5,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244,0
6,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742,...,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368,0
7,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,...,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151,0
8,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,...,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072,0
9,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,...,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075,0


In [543]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [544]:
X, y = load_breast_cancer(return_X_y=True)

In [545]:
# Scale the data
scaler = StandardScaler()
scaler_fit =scaler.fit(X)
X_scaled = scaler_fit.transform(X)

# Relabel the y-targets
y = np.where(y == 0, -1, 1)

In [546]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

In [547]:
# Add the bias
X_train = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_test =  np.c_[np.ones((X_test.shape[0], 1)), X_test]

In [548]:
# loss function for svm
def loss_function(X, y, C, theta):
    m = X.shape[0]
    loss = 0
    reg_term = (1/2)*(theta.T.dot(theta))
    for i in range(m):
        loss += reg_term + (C * max(0, 1-(y[i]*(theta.dot(X[i])))))
    
    loss = loss / m
    return loss

In [549]:
# gradient function for svm
def gradient(X_i, y_i, theta, C, N):
    if max(0, 1-(y_i*(X_i.dot(theta)))) == 0:
        return theta / N
    return (theta - (C*y_i*X_i)) / N

##### Decision Function
Trained weight vector multiplied by the data (predict function below)

In [550]:
# prediction function for svm
def predict(theta, X):
    pred = X.dot(theta)
    return np.where(pred <= 0, -1, 1)

In [551]:
# calculating precision, recall, f_score, accuracy
def scores(y, y_pred):
    tp,tn,fp,fn = 0,0,0,0
    for i in range(len(y)):
        if y[i] == 1 and y_pred[i] == 1:
            tp += 1
        elif y[i] == 1 and y_pred[i] == -1:
            fn += 1
        elif y[i] == -1 and y_pred[i] == 1:
            fp += 1
        elif y[i] == -1 and y_pred[i] == -1:
            tn += 1
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f_score = 2*precision*recall/(precision+recall)
    accuracy = (tp+tn) / (tp+tn+fp+fn)
    return precision, recall, f_score, accuracy

In [552]:
t0, t1 = 5,50
n_iterations = 1000
C = 100
def learning_schedule(t):
    return t0 / (t + t1)

# calculating the coefficient vector
def weight(X_train, y_train, n_iterations, C):
    m = X_train.shape[0]
    n = X_train.shape[1]
    theta = np.random.rand(n) # random initialisation
    random_idx = []
    loss_list = []

    for epoch in range(1, n_iterations):
        for i in range(m):
            random_index = np.random.randint(m)
            while random_index in random_idx: # getting unique random index
                random_index = np.random.randint(m)
            xi = X_train[random_index]
            yi = y_train[random_index]
            gi = gradient(xi, yi, theta, C, m)
            eta = learning_schedule(epoch * m + i)
            theta = theta - eta * gi
        loss_list.append(loss_function(X_train, y_train, C, theta))
    return theta, loss_list

theta, loss_list = weight(X_train, y_train, n_iterations, C)

In [553]:
# predicting on the testing set and evaluating the predictions
y_pred = predict(theta, X_test)
precision, recall, f_score, accuracy = scores(y_test, y_pred)
print(precision, recall, f_score, accuracy)

0.9479166666666666 1.0 0.9732620320855615 0.965034965034965


In [554]:
# predicting on the training set and evaluating the predictions
y_pred_train = predict(theta, X_train)
precision_t, recall_t, f_score_t, accuracy_t = scores(y_train, y_pred_train)
print(precision_t, recall_t, f_score_t, accuracy_t)

0.9601449275362319 0.9962406015037594 0.977859778597786 0.971830985915493


In [555]:
# confusion matrix for testing predictions
from sklearn.metrics import confusion_matrix
confusionMatrix = confusion_matrix(y_test, y_pred)
print(confusionMatrix)

[[47  5]
 [ 0 91]]


In [556]:
# confusion matrix for training predictions
confusionMatrix_train = confusion_matrix(y_train, y_pred_train)
print(confusionMatrix_train)

[[149  11]
 [  1 265]]


In [557]:
# calculating the distances of each point in the training dataset from the decision boundary
theta_norm = np.linalg.norm(theta)
distances = []

for i in range(len(X_train)):
    d = abs((X_train[i].dot(theta)) / theta_norm)
    distances.append(d)
    
min_distance = min(distances)
min_idx = distances.index(min_distance)

print("Minimum Distance: ", min_distance)
print("Index of the instance with minimum distance: ", min_idx)
print("Instance with the minimum distance: ", X_train[min_idx])

Minimum Distance:  0.0014740073597155079
Index of the instance with minimum distance:  27
Instance with the minimum distance:  [ 1.          0.13425586  0.93090865  0.08242022  0.0279032  -0.67893833
 -0.71979584 -0.06151077  0.09778122 -0.67403217 -1.22447108  0.03799756
  0.74414693  0.02373215 -0.16120827  0.42369087 -0.45036567  0.06619082
  0.64297688 -0.38046483 -0.34946788 -0.09923184  0.98240025 -0.15075246
 -0.21513886 -0.05122636 -0.61174404 -0.02249311  0.32423177 -0.68554377
 -0.8637008 ]


Which of the training examples are closest to the decision boundary in the SVM
primal problem?

Minimum Distance:  0.0014740073597155079

Index of the instance with minimum distance:  27

Instance with the minimum distance:  [ 1.          0.13425586  0.93090865  0.08242022  0.0279032  -0.67893833
 -0.71979584 -0.06151077  0.09778122 -0.67403217 -1.22447108  0.03799756
  0.74414693  0.02373215 -0.16120827  0.42369087 -0.45036567  0.06619082
  0.64297688 -0.38046483 -0.34946788 -0.09923184  0.98240025 -0.15075246
 -0.21513886 -0.05122636 -0.61174404 -0.02249311  0.32423177 -0.68554377
 -0.8637008 ]

In [558]:
# Testing the model using k fold validation
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
n_iterations = 900
C = 1

for i, (train_index, test_index) in enumerate(kf.split(X_scaled)):
    X_train = X[train_index]
    X_train = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    y_train = y[train_index]
    theta, loss_list = weight(X_train, y_train, n_iterations, C)
    y_pred = predict(theta, X_train)
    p, r, f, a = scores(y_train, y_pred)
    print("Training Precision: ", p)
    print("Traning Recall: ", r)
    print("Traning F score: ", f)
    print("Traning Accuracy: ", a)
    confusionMatrix = confusion_matrix(y_train, y_pred)
    print("Traning Confusion Matrix: ", confusionMatrix)
    print("----------------------------------------")

    
    X_test = X[test_index]
    X_test =  np.c_[np.ones((X_test.shape[0], 1)), X_test]
    y_test = y[test_index]
    y_pred = predict(theta, X_test)
    p, r, f, a = scores(y_test, y_pred)
    print("Testing Precision: ", p)
    print("Testing Recall: ", r)
    print("Testing F score: ", f)
    print("Testing Accuracy: ", a)
    confusionMatrix = confusion_matrix(y_test, y_pred)
    print("Testing Confusion Matrix: ", confusionMatrix)
    print("========================================")
    print("========================================")


Training Precision:  0.9090909090909091
Traning Recall:  0.9003215434083601
Traning F score:  0.9046849757673666
Traning Accuracy:  0.8703296703296703
Traning Confusion Matrix:  [[116  28]
 [ 31 280]]
----------------------------------------
Testing Precision:  0.7894736842105263
Testing Recall:  0.9782608695652174
Testing F score:  0.8737864077669902
Testing Accuracy:  0.8859649122807017
Testing Confusion Matrix:  [[56 12]
 [ 1 45]]
Training Precision:  0.9251700680272109
Traning Recall:  0.9315068493150684
Traning F score:  0.9283276450511946
Traning Accuracy:  0.9076923076923077
Traning Confusion Matrix:  [[141  22]
 [ 20 272]]
----------------------------------------
Testing Precision:  0.8571428571428571
Testing Recall:  0.9230769230769231
Testing F score:  0.888888888888889
Testing Accuracy:  0.868421052631579
Testing Confusion Matrix:  [[39 10]
 [ 5 60]]
Training Precision:  0.911660777385159
Traning Recall:  0.911660777385159
Traning F score:  0.911660777385159
Traning Accuracy