## Gradient Descent

In [160]:
# Let's start by importing the relevant packages
# matplotlib for plots
import matplotlib as mpl
from matplotlib import pyplot as plt
# pandas to read in some data
import pandas as pd
# numpy to build our first perceptron
import numpy as np
# Train test split to do validate our findings from the perceptron training
from sklearn.model_selection import train_test_split
# MinMaxScaler to normalise the data before inputting them to the perceptron
from sklearn.preprocessing import MinMaxScaler
# PyTorch for neural networks
import torch
import time
from torch import nn
%matplotlib inline
mpl.rcParams['figure.figsize'] = (16, 9)

In [161]:
# Load the occupancy data so we have something to predict
df = pd.read_csv('../data/occupancy_data/datatraining.txt')
target = 'Occupancy'
features = [col for col in df.columns if target not in col and 'date' not in col]
df.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
1,2015-02-04 17:51:00,23.18,27.272,426.0,721.25,0.004793,1
2,2015-02-04 17:51:59,23.15,27.2675,429.5,714.0,0.004783,1
3,2015-02-04 17:53:00,23.15,27.245,426.0,713.5,0.004779,1
4,2015-02-04 17:54:00,23.15,27.2,426.0,708.25,0.004772,1
5,2015-02-04 17:55:00,23.1,27.2,426.0,704.5,0.004757,1


In [162]:
print(df[target].value_counts())

0    6414
1    1729
Name: Occupancy, dtype: int64


In [163]:
x_train, x_test, y_train, y_test = train_test_split(df[features], df[target], shuffle=False)
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [164]:
x_train[:5]

array([[1.        , 0.70391174, 0.27549041, 0.42478485, 0.98861629],
       [0.99282297, 0.70361083, 0.27775383, 0.41480207, 0.98416176],
       [0.99282297, 0.70210632, 0.27549041, 0.4141136 , 0.98230597],
       [0.99282297, 0.69909729, 0.27549041, 0.40688468, 0.97859448],
       [0.98086124, 0.69909729, 0.27549041, 0.40172117, 0.97182167]])

In [165]:
y_train[:5]

1    1
2    1
3    1
4    1
5    1
Name: Occupancy, dtype: int64

## Build the artificial neuron
To build and train a neuron we have to perform three steps:
- Calculate the perceptron's output $\hat{y} = \sigma\left(\sum_i w_i X_i)\right)$
- Determine the error: $E(w) = \frac12 \sum_{(x,y) \in D} (y-a)^2$
- Calculate the weight gradient with: $\sum_{(x,y) \in D} (y-a)$
- Repeat the above steps until there occur no more updates (we will iterate once over the dataset instead)

PyTorch abstracts neural networks using the nn.Module class. Every neural network has to subclass from it for PyTorch mechanisms to work perfecty. Let us start by using this to build out neuron.

In [166]:
class Neuron(nn.Module):
    
    def __init__(self, number_of_inputs):
        super().__init__()
        # Build the neuron using nn.Linear
        self.neuron = nn.Linear(number_of_inputs, 1, bias=True)
        # use nn.Sigmoid as an activation function
        self.act = nn.Sigmoid()
    
    def logit(self, inp):
        return self.neuron(inp)
    
    def forward(self, inp):
        return self.act(self.logit(inp))
    

Let us now select a random selection of the training data and calculate the gradients for the neuron:

In [191]:
loss = nn.BCEWithLogitsLoss()
neuron = Neuron(5)
select = np.random.randint(0, len(x_train), 2014)
x = torch.from_numpy(x_train[select]).float()
y = torch.from_numpy(y_train.iloc[select].values).float().unsqueeze(1)
y_logits = neuron.logit(x)
err = loss(y_logits, y)
err.backward()
for name, param in neuron.named_parameters():
    print(f'Parameter {name}\n{param}\nGradient {param.grad}')
    param = param - 5e-2*param.grad


Parameter neuron.weight
Parameter containing:
tensor([[-0.1690,  0.3154,  0.0627,  0.0407,  0.0373]], requires_grad=True)
Gradient tensor([[ 0.0794,  0.1658, -0.0177, -0.0275,  0.1240]])
Parameter neuron.bias
Parameter containing:
tensor([-0.0248], requires_grad=True)
Gradient tensor([0.3323])


In [209]:
optim = torch.optim.SGD(neuron.parameters(), lr=5e-2)

In [210]:
def fit_batch(optim, loss, neuron, x, y):
    optim.zero_grad()
    y_pred = neuron.logit(x)
    #print(y, y_pred, y.sum())
    err = loss(y_pred, y)
    #err = err * (y * 3 + 1)
    err.mean().backward()
    optim.step()
    return y_pred

start = time.time()  
for i in range(20):
    acc = None
    for i in range(200):
        select = np.random.randint(0, len(x_train), 2048)
        x = torch.from_numpy(x_train[select]).float()
        y = torch.from_numpy(y_train.iloc[select].values).float().unsqueeze(1)
        y_pred = fit_batch(optim, loss, neuron, x, y)
        if acc is None:
            acc = (y==(y_pred > .5).float()).float().mean()
        else:
            acc += (y==(y_pred > .5).float()).float().mean()
        #y_pred = y_pred.argmax(dim=-1)
        #acc += (y==y_pred).float().mean()
    print(f'accuracy {acc/200}')
print(f'Training time: {time.time() - start}')


RuntimeError: Expected object of backend CUDA but got backend CPU for argument #4 'mat1'

Why did we use the logits function instead of calling forward including the sigmoid function?
Chaining a Sigmoid and the Cross Entropy Loss can lead to instabilities, if calculated numerically. 
This can be solved analytically and is done directly in the BCELoss function.

## Move the neuron to the GPU
PyTorch tensors and modules allow us to call .cuda() on them to move the computations to the GPU.
This makes it really easy to perform any calculation on the GPU (which is super handy even if you do not use neural networks.


In [196]:
if torch.cuda.is_available():
    neuron = Neuron(5).cuda()
    optim = torch.optim.SGD(neuron.parameters(), lr=5e-2)
    start = time.time()
    for i in range(20):
        acc = None
        for i in range(200):
            select = np.random.randint(0, len(x_train), 2048)
            x = torch.from_numpy(x_train[select]).float().cuda()
            y = torch.from_numpy(y_train.iloc[select].values).float().unsqueeze(1).cuda()
            y_pred = fit_batch(optim, loss, neuron, x, y)
            if acc is None:
                acc = (y==(y_pred > .5).float()).float().mean()
            else:
                acc += (y==(y_pred > .5).float()).float().mean()
        print(f'{acc.data.cpu().numpy()/200}')
    print(f'Training time: {time.time() - start}')

0.81374267578125
0.81352294921875
0.81407470703125
0.8188525390625
0.855341796875
0.87596435546875
0.89211669921875
0.90404052734375
0.9098583984375
0.91836669921875
0.92681396484375
0.93129150390625
0.934775390625
0.93648193359375
0.93915283203125
0.9420703125
0.9422705078125
0.942626953125
0.943828125
0.9444189453125
Training time: 4.190782785415649


## Why is the GPU version slower?

Well, we need to move the data to the GPU and back. This costs us time. It normally pays off, as the computations take way longer than moving the data. In our current case the computation is very simple and the amount of data very small. This nothing the GPU is well suited for, because it can not use its advantage of performing a lot of computations in parallel.