# The SVM Classifier in PyTorch

Now that you've seen how much work it takes to build an SVM classifier
in NumPy, let's convert it to PyTorch and see how much time we save in
both implementation and training.

# Initial Setup

In [52]:
import datasets
import torch
import numpy as np

from matplotlib import pyplot as plt
from numpy.linalg import norm
from pathlib import Path
from torch import nn
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load in the Data

In [61]:
def get_dataloaders(
		ds_name='mnist', 
		data_dir='./datasets', 
		batch_size=16, 
		get_mean=False):
	"""
	downloads the dataset and returns dataloaders for batching

	:param ds_name: name of the dataset. check huggingface for valid dataset names
	:param data_dir: the directory to download and store the data
	:param batch_size: the batch size for the classifier to train on
	:param get_mean: if True, also compute the mean value over every element
		of every training image (a scalar float32 tensor) so the caller
		can zero-center the inputs

	:return: (ds, train_dl, test_dl) when get_mean is False, or
		(ds, mean_x, train_dl, test_dl) when get_mean is True
	"""
	# create path for dataset if it doesn't exist
	Path(data_dir).mkdir(parents=True, exist_ok=True)

	# download or read in dataset; samples are returned as pytorch tensors
	ds = datasets.load_dataset(
		ds_name,
		cache_dir=data_dir,
		keep_in_memory=True).with_format('pt')

	train_dl = DataLoader(ds['train'], batch_size=batch_size)
	test_dl = DataLoader(ds['test'], batch_size=batch_size)

	if not get_mean:
		return ds, train_dl, test_dl

	# zero mean procedure. the mean is computed on the training split only
	# so that no statistics of the test split leak into preprocessing.
	# torch.stack (not torch.tensor) is needed to combine a list of tensors,
	# and the cast to float32 comes first because integer tensors have no mean
	train_images = torch.stack(
		[torch.as_tensor(sample['image']) for sample in ds['train']]
	)
	mean_x = train_images.to(torch.float32).mean()

	return ds, mean_x, train_dl, test_dl

# build the dataloaders for the default dataset ('mnist'); with get_mean left
# at its default of False the call returns (ds, train_dl, test_dl)
ds, train_dl, test_dl = get_dataloaders(batch_size=4096)

# Define the SVM class

Here we define the architecture, loss and gradient calculation, and
optimization steps for the classifier.

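We train our model on the loss function
$$
\mathcal{L}_\theta(x)=\lambda R_2(\theta)+\frac{1}{N}\sum\limits_{i=1}^N \max(0, 1-y_i f_\theta(x_i))
$$
where $R_2(\theta)$ is the L2 penalty on the weights, $\lambda$ is its
strength, and $y_i\in\{-1,+1\}$ is the label of sample $x_i$.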

In [49]:
class SVM(nn.Module):
    """
    implementation of the SVM classifier

    the input is flattened, padded with a constant 1 and passed through a
    single linear layer, so the last weight column plays the role of the
    bias. the model is trained with a one-vs-rest hinge loss (see `loss`).
    """

    def __init__(
            self, 
            C=10, dim=784, shape=(28, 28), 
            reg=1, lr=1, loc=0, scale=1, 
            decay=1, mean_x=None):
        """
        initialize the weights

        :param C: number of classes
        :param dim: dimensionality of the (flattened) input
        :param shape: not used by this implementation
        :param reg: the regularization term. not used by this implementation
        :param lr: not used by this implementation
        :param loc: argument for initialization using numpy.random.normal().
            not used by this implementation
        :param scale: standard deviation for initialization using
            numpy.random.normal(). not used by this implementation
        :param decay: not used by this implementation
        :param mean_x: optional mean (scalar or anything that broadcasts
            against the input) subtracted from every input in `forward`
        """

        super().__init__()

        # save dimensionality for reshaping input
        self.dim=dim

        # initialize weights. use +1 for the bias weights
        self.w = nn.Linear(dim+1, C)

        # set bias weights to 0. nn.Linear stores its weight as
        # (out_features, in_features), so the weights that multiply the
        # padded 1 are the last COLUMN, not the last row
        with torch.no_grad():
            self.w.weight[:, -1].zero_()

        # sets the mean of the data
        self.mean_x = mean_x
    
    def forward(self, x):
        """
        generates classifier scores. computes x@w+b

        :param x: the input data. the first axis indexes the samples and the
            remaining axes must hold `dim` values per sample

        :return: prediction scores, one row per sample and one column per class
        """

        # get number of samples
        N=x.shape[0]

        # work in the dtype of the weights so integer images can be fed directly
        x=x.to(self.w.weight.dtype)

        # zero-center the data. compare against None explicitly: the truth
        # value of a tensor is ambiguous (and a mean of exactly 0 is falsy)
        if self.mean_x is not None:
            x=x-torch.as_tensor(self.mean_x, dtype=x.dtype, device=x.device)

        # reshape x to an Nxdim matrix and then pad with 1s
        xpad=torch.hstack((
            x.reshape(N, self.dim),
            torch.ones(N, 1, dtype=x.dtype, device=x.device)
        ))

        return self.w(xpad)

    def loss(self, x, y=None):
        """
        this is used for calculating the loss function

        every class is treated as its own binary problem with target +1 for
        the true class and -1 for all others; the hinge max(0, 1 - t*score)
        is averaged over all samples and classes. no regularization term is
        added here.

        :param x: the input data
        :param y: the integer class labels for the input data (required)

        :return loss: the output of the loss function
        :return acc: the accuracy of the model

        :raises ValueError: if y is not provided
        """

        if y is None:
            raise ValueError('labels y are required to compute the loss')

        # calculate scores
        scores=self(x)

        # get the number of samples
        N=scores.shape[0]

        preds=torch.argmax(scores, dim=-1)

        # compute accuracy
        acc=(preds==y).to(scores.dtype).mean()

        # build the +1/-1 target matrix: +1 at (i, y[i]), -1 everywhere else
        ys=-torch.ones_like(scores)
        ys[torch.arange(N, device=scores.device), y]=1

        # compute loss
        svm_scores=torch.clamp(1-ys*scores, min=0)
        loss=torch.mean(svm_scores)

        return loss, acc

# Define Optimization Parameters and Train Model

In [65]:
# parameters
lr = .01            # initial SGD step size
weight_decay = 1    # L2 penalty applied by the optimizer
gamma = .995        # multiplicative learning-rate decay per scheduler step
num_epochs = 40
batch_size = 4096

# define model and load into GPU when one is available; fall back to the CPU
# so the cell does not crash on machines without CUDA
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SVM().to(device)

# define optimizer
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=lr,
    weight_decay=weight_decay
)
scheduler = torch.optim.lr_scheduler.ExponentialLR(
    optimizer,
    gamma=gamma,
)

# define evaluation procedure
def evaluate(model, data):
    """
    computes the average loss and accuracy of a model over a dataloader

    :param model: a module exposing `loss(x, y)` that returns `(loss, acc)`
        as scalar tensors; both are assumed to be means over the batch
    :param data: iterable of batches, each a dict holding an 'image' and a
        'label' tensor

    :return: `(loss, acc)` as python floats averaged over all samples, or
        `(nan, nan)` when `data` yields no samples
    """

    # run on whatever device the model lives on instead of assuming a GPU
    first_param = next(model.parameters(), None)
    device = None if first_param is None else first_param.device

    # set model to eval mode, remembering the mode it was in
    was_training = model.training
    model.eval()

    total_loss = 0.0
    total_acc = 0.0
    num_samples = 0
    try:
        # no gradients are needed for evaluation
        with torch.no_grad():
            for batch in data:

                # split into data and labels.
                x = batch['image']
                y = batch['label']
                if device is not None:
                    x = x.to(device)
                    y = y.to(device)

                loss_i, acc_i = model.loss(x, y)

                # weight each batch by its size so a short final batch does
                # not count as much as a full one
                n_i = x.shape[0]
                total_loss += loss_i.item() * n_i
                total_acc += acc_i.item() * n_i
                num_samples += n_i
    finally:
        # restore the caller's mode even if a batch raised
        model.train(was_training)

    if num_samples == 0:
        return float('nan'), float('nan')

    return total_loss / num_samples, total_acc / num_samples


# Training loop
# per-step history of the metrics and of the weight matrix
history={
    'train_loss':[],
    'train_acc':[],
    'test_loss':[],
    'test_acc':[],
    'w':[],
}

# run on whatever device the model lives on instead of assuming a GPU
device = next(model.parameters()).device

bar=tqdm(total=num_epochs*len(train_dl))
for epoch in range(num_epochs):

    for batch in train_dl:
        bar.update()

        # split into data and labels. 
        data = batch['image'].to(device)
        labels = batch['label'].to(device)

        # clear the gradients of the previous step BEFORE backpropagating.
        # zeroing them between backward() and step() throws away the freshly
        # computed gradients, so the loss never drives an update
        optimizer.zero_grad()

        # a single forward pass yields both the loss and the accuracy
        loss, acc = model.loss(data, labels)

        # compute backpropagation and update the weights
        loss.backward()
        optimizer.step()

        # collect historical data
        history['train_loss'].append(loss.item())
        history['train_acc'].append(acc.item())

        # collect historical data
        test_loss, test_acc = evaluate(model, test_dl)
        history['test_loss'].append(test_loss)
        history['test_acc'].append(test_acc)
        # copy so the snapshot never shares memory with the live parameter
        # (on the CPU .numpy() would otherwise alias it)
        history['w'].append(
            model.w.weight.detach().cpu().numpy().copy()
        )
            
        # update progress bar
        bar.set_postfix({
            'loss_trn':history['train_loss'][-1],
            'loss_tst':history['test_loss'][-1],
            'acc_trn':history['train_acc'][-1],
            'acc_tst':history['test_acc'][-1],
            'lr': scheduler.get_last_lr()[0],
            '||w||':np.linalg.norm(history['w'][-1], ord=2),
        })

    # decay the learning rate once per epoch
    scheduler.step()
bar.close()

  0%|          | 0/600 [00:00<?, ?it/s]

tensor(17.1423, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(17.2301, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(17.4797, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(17.5394, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(17.4867, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(17.7299, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(17.7073, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(17.0781, device='cuda:0', grad_fn=<MeanBackward0>)


KeyboardInterrupt: 

In [60]:

# number of batches the training dataloader yields per epoch
# NOTE(review): the saved output below (3750) matches 60000 rows at
# batch_size=16 rather than the 4096 used when the loaders were built, so it
# looks stale -- re-run to confirm
print(len(train_dl))

3750
Dataset({
    features: ['image', 'label'],
    num_rows: 60000
})


3750.0