# Lesson 5 - Assignment

In this assignment, you will implement a Support Vector Machine Classifier  from scratch and compare the results to existing sklearn algorithm. 

In [1]:
# import packages
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib.legend_handler import HandlerLine2D
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# make this notebook's output stable across runs
np.random.seed(0)

Question 1.1: Implement the cost function cost/objective function:
<img src="https://miro.medium.com/max/688/1*JAS6rUTO7TDlrv4XZSMbsA.png" alt="drawing" width="600"/>


In [33]:
def compute_cost(W, X, Y,reg_strength=1000):
    # TODO calculate cost function
    N = X.shape[0]
    
    distances = 1 - Y * (np.dot(X, W)) # 1 - y *(W . X)
    distances = np.maximum(np.zeros(distances.shape), distances) # max(0, distances)
    loss = (np.sum(distances) / N) * reg_strength

    return 1 / 2 * np.dot(W, W) + loss

Question 1.2: Write a method that calculate the cost gradient:
<img src="https://miro.medium.com/max/866/1*ww3F21VMVGp2NKhm0VTesA.png" alt="drawing" width="600"/>

In [55]:
def calculate_cost_gradient(W, X_batch, Y_batch, reg_strength=1000):
    
    # got an iteration error in the case of a single sample, so have to add this check
    if Y_batch.shape == ():
        Y_batch = np.array([Y_batch])
        X_batch = np.array([X_batch])
    
    distance = 1 - (Y_batch * np.dot(X_batch, W))
    delta_w = np.zeros(W.shape[0])

    for i, dist in enumerate(distance):
        if max(0, dist) == 0:
            delta_i = W 
        else:
            delta_i = W - (reg_strength * Y_batch[i] * X_batch[i])
        delta_w += delta_i
        
    delta_w = delta_w / len(Y_batch)
    return delta_w

Question 1.3: Write a method that performs stochastic Gradient descent as follows:
- Caluclate the gradient of cost function i.e. ∇J(w)
- Update the weights in the opposite direction to the gradient: w = w — ∝(∇J(w))
- Repeat until conversion or until 5000 epochs are reached

In [27]:
def sgd(data, outputs, learning_rate = 0.0001, max_epochs = 5000, cost_threshold= 0.01):
    weights = np.zeros(data.shape[1])
    nth = 0
    prev_cost = np.inf
    # stochastic gradient descent
    for epoch in range(1, max_epochs):
        # shuffle to prevent repeating update cycles
        X, Y = shuffle(data, outputs)
        for i, x in enumerate(X):
            ascent = calculate_cost_gradient(weights, x, Y[i])
            weights = weights - (ascent * learning_rate)
            # convergence check on 2^nth epoch
            if epoch == max_epochs-1 or epoch == 2 ** nth:
                cost = compute_cost(weights, data, outputs)
                print(f"Epoch is:{epoch} and Cost is: {cost}")
                # stoppage criterion
                if abs(prev_cost - cost) < cost_threshold * prev_cost:
                    return weights
                prev_cost = cost
                nth += 1
               
    return weights

# Dataset

In [10]:
data = pd.read_csv('data_banknote_authentication.csv')

Y = data.iloc[:, -1]  
X = data.iloc[:, 1:4]
X.insert(loc=len(X.columns), column='intercept', value=1)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

Question 4: Train and evaluate an SVC using the banknote_authentication data

In [11]:
data

Unnamed: 0,3.6216,8.6661,-2.8073,-0.44699,0
0,4.54590,8.16740,-2.4586,-1.46210,0
1,3.86600,-2.63830,1.9242,0.10645,0
2,3.45660,9.52280,-4.0112,-3.59440,0
3,0.32924,-4.45520,4.5718,-0.98880,0
4,4.36840,9.67180,-3.9606,-3.16250,0
...,...,...,...,...,...
1366,0.40614,1.34920,-1.4501,-0.55949,1
1367,-1.38870,-4.87730,6.4774,0.34179,1
1368,-3.75030,-13.45860,17.5932,-2.77710,1
1369,-3.56370,-8.38270,12.3930,-1.28230,1


In [18]:
np.float64(3.2).shape == ()

True

In [56]:
# train the model
print("training started...")
W = sgd(X_train.to_numpy(), y_train.to_numpy(), cost_threshold = 0.01)
print("training finished.")
print("weights are: {}".format(W))

# testing the model on test set
y_pred = np.zeros(X_test.shape[0])
for i in range(X_test.shape[0]):
    y_p = np.sign(np.dot(W, X_test.to_numpy()[i]))
    y_pred[i] = y_p
print("accuracy on test dataset: {}".format(accuracy_score(y_test.to_numpy(), y_pred)))

training started...
Epoch is:1 and Cost is: 1710.8711649919944
Epoch is:2 and Cost is: 560.1173728901427
Epoch is:4 and Cost is: 559.2935797652925
training finished.
weights are: [-0.17135254  0.05008014 -0.55367713  1.97319546]
accuracy on test dataset: 0.44808743169398907


[Bonus] Question 5: Train and evaluate an SKLEARN SVC model, and compare the results to your model 

In [48]:
from sklearn.svm import LinearSVC

clf = LinearSVC(C=1000, max_iter=1000)
clf.fit(X_train, y_train)
accuracy = accuracy_score(y_test, clf.predict(X_test))
print(f'Accuracy: {accuracy}')

Accuracy: 0.7377049180327869




In [49]:
clf.coef_

array([[-0.37237628, -0.40422869, -0.43944554,  0.09369893]])

Question 6: Create a new text cell in your Notebook: Complete a 50-100 word summary (or short description of your thinking in applying this week's learning to the solution) of your experience in this assignment. Include: What was your incoming experience with this model, if any? what steps you took, what obstacles you encountered. how you link this exercise to real-world, machine learning problem-solving. (What steps were missing? What else do you need to learn?) This summary allows your instructor to know how you are doing and allot points for your effort in thinking and planning, and making connections to real-world work.

I had some previous experience using SVM for text classification, though I had never implemented it myself. It was interesting to walk through each step and consider what is actually happening behind the scenes. As is often the case, the hardest thing was getting numpy to cooperate, especially when it comes to array shapes and sizes. 

I'm not sure why the hand-built model performed so poorly compared to sklearn's LinearSVC model. I played around with it a bit trying to figure that out but I'm still unsure. I'd be curious to get an explanation for what sklearn is doing differently under the hood. 