# Deep Learning with PyTorch Step-by-Step: A Beginner's Guide

# Chapter 3

In [None]:
from config_dl import *
from plots.chapter3 import *

In [None]:
import numpy as np

import torch
import torch.optim as optim
import torch.nn as nn
import torch.functional as F
from torch.utils.data import DataLoader, TensorDataset

from sklearn.datasets import make_moons
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, precision_recall_curve, auc

from stepbystep.v0 import StepByStep

# A Simple Classification Problem

## Data Generation

In [None]:
# Generates 100 noisy "two moons" points, then holds out 20% of them
# for validation (both steps are seeded for reproducibility)
X, y = make_moons(n_samples=100, noise=0.3, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

In [None]:
# Fits the scaler on the training split only, so nothing about the
# validation split leaks into the preprocessing step
sc = StandardScaler()
sc.fit(X_train)

# Applies the same training-derived scaling to both splits
X_train, X_val = sc.transform(X_train), sc.transform(X_val)

In [None]:
fig = figure1(X_train, y_train, X_val, y_val)

## Data Preparation

In [None]:
torch.manual_seed(13)

# Features: numpy arrays -> float tensors
x_train_tensor = torch.as_tensor(X_train).float()
x_val_tensor = torch.as_tensor(X_val).float()

# Labels: reshaped into a single column, then converted to float tensors
y_train_tensor = torch.as_tensor(y_train.reshape(-1, 1)).float()
y_val_tensor = torch.as_tensor(y_val.reshape(-1, 1)).float()

# One dataset per split, pairing features with labels
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)

# One loader per split; only the training loader shuffles its data
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=16,
)

## Model

$$
\Large
y = b + w_1x_1 + w_2x_2 + \epsilon
$$

$$
\Large
y =
\begin{cases}
1,\ if\ \ b + w_1x_1 + w_2x_2 \ge 0
\\
0,\ if \ \ b + w_1x_1 + w_2x_2 \lt 0
\end{cases}
$$

### Logits

$$
\Large
z = b + w_1x_1 + w_2x_2
$$

### Probabilities

$$
\Large
\begin{array}{cccc}
& P(y=1) \approx 1.0, & if\ &z \gg 0
\\
& P(y=1) = 0.5, & if\ &z = 0
\\
& P(y=1) \approx 0.0, & if\ &z \ll 0
\end{array}
$$

### Odds Ratio

$$
\Large
odds\ ratio(p) = \frac{p}{q} = \frac{p}{1-p}
$$

In [None]:
def odds_ratio(prob):
    """Return the odds p / (1 - p) of an event that has probability `prob`."""
    complement = 1 - prob
    return prob / complement

In [None]:
p = .75
q = 1 - p
odds_ratio(p), odds_ratio(q)

In [None]:
fig = figure2(p)

### Log Odds Ratio

$$
\Large
log\ odds\ ratio(p) = log\left(\frac{p}{1-p}\right)
$$

In [None]:
def log_odds_ratio(prob):
    """Return the natural logarithm of the odds of probability `prob`."""
    odds = odds_ratio(prob)
    return np.log(odds)

p = .75
q = 1 - p
log_odds_ratio(p), log_odds_ratio(q)

In [None]:
fig = figure3(p)

### From Logits to Probabilities

$$
\Large
\begin{array}{rrll}
b + w_1x_1 + w_2x_2 = &z& = &log\left(\frac{p}{1-p}\right)
\\
e^{b + w_1x_1 + w_2x_2} = &e^z& = &\frac{p}{1-p}
\end{array}
$$

$$
\Large
\begin{array}{rll}
\frac{1}{e^z}& = &\frac{1-p}{p}
\\
e^{-z}& = &\frac{1}{p} - 1
\\
1 + e^{-z}& = &\frac{1}{p}
\\
p& = &\frac{1}{1 + e^{-z}}
\end{array}
$$

$$
\Large
p = \sigma(z) = \frac{1}{1+e^{-z}}
$$

In [None]:
def sigmoid(z):
    """Map a logit `z` (scalar or numpy array) to 1 / (1 + e^(-z))."""
    exp_neg_z = np.exp(-z)
    return 1 / (1 + exp_neg_z)

p = .75
q = 1 - p
sigmoid(log_odds_ratio(p)), sigmoid(log_odds_ratio(q))

### Sigmoid

In [None]:
torch.sigmoid(torch.tensor(1.0986)), torch.sigmoid(torch.tensor(-1.0986))

In [None]:
fig = figure4(p)

### Logistic Regression

$$
\Large P(y=1) = \sigma(z) = \sigma(b+w_1x_1+w_2x_2)
$$

![](images/logistic_model.png)

In [None]:
# Seeds the generator before the layers are created
torch.manual_seed(42)
# Logistic regression as a sequence of two named layers: a linear layer
# (two inputs, one output) followed by a sigmoid
model1 = nn.Sequential()
model1.add_module('linear', nn.Linear(2, 1))
model1.add_module('sigmoid', nn.Sigmoid())
# Shows the parameters currently stored in the model
print(model1.state_dict())

### A Note on Notation

$$
\large W =
\underset{(3 \times 1)}{
\begin{bmatrix}
b \\
w_1 \\
w_2
\end{bmatrix}};
X = 
\underset{(3 \times 1)}{
\begin{bmatrix}
1 \\
x_1 \\
x_2
\end{bmatrix}}
$$

$$
\large 
\begin{aligned}
z
& = W^T \cdot X
=
\underset{(1 \times 3)}{
\begin{bmatrix}
- & w^{T} & -\\
\end{bmatrix}}
\cdot
\underset{(3 \times 1)}{
\begin{bmatrix}
1 \\
x_1 \\
x_2
\end{bmatrix}}
= \underset{(1 \times 3)}{
\begin{bmatrix}
b & w_1 & w_2
\end{bmatrix}}
\cdot
\underset{(3 \times 1)}{
\begin{bmatrix}
1 \\
x_1 \\
x_2
\end{bmatrix}}\\
& = b + w_1x_1 + w_2x_2
\end{aligned}
$$

## Loss

$$
\Large y_i = 1 \Rightarrow error_i=log(P(y_i=1))
$$

$$
\Large P(y_i=0)=1-P(y_i=1)
$$

$$
\Large y_i = 0 \Rightarrow error_i=log(1-P(y_i=1))
$$

$$
\Large BCE(y)={-\frac{1}{(N_{pos}+N_{neg})}\Bigg[{\sum_{i=1}^{N_{pos}}{log(P(y_i=1))} + \sum_{i=1}^{N_{neg}}{log(1 - P(y_i=1))}}\Bigg]}
$$

In [None]:
dummy_labels = torch.tensor([1.0, 0.0])
dummy_predictions = torch.tensor([.9, .2])

# Splits the predictions by the class of their label
positive_pred = dummy_predictions[dummy_labels == 1]
negative_pred = dummy_predictions[dummy_labels == 0]

# First sum: log-probabilities of the positive class (labels == 1)
first_summation = positive_pred.log().sum()
# Second sum: log-probabilities of the negative class (labels == 0)
second_summation = (1 - negative_pred).log().sum()

# n_total = n_pos + n_neg
n_total = dummy_labels.shape[0]

# Negative average of all the log-probabilities
loss = -(first_summation + second_summation) / n_total
loss

$$
\Large BCE(y)={-\frac{1}{N}\sum_{i=1}^{N}{\left[y_i \cdot log(P(y_i=1)) + (1-y_i) \cdot log(1-P(y_i=1))\right]}}
$$

In [None]:
# Each label switches on the log-probability of its own class:
# y * log(p) for positives, (1 - y) * log(1 - p) for negatives
summation = (
    dummy_labels * dummy_predictions.log()
    + (1 - dummy_labels) * (1 - dummy_predictions).log()
).sum()
loss = -summation / n_total
loss

### BCELoss

In [None]:
loss_fn = nn.BCELoss(reduction='mean')

loss_fn

In [None]:
dummy_labels = torch.tensor([1.0, 0.0])
dummy_predictions = torch.tensor([.9, .2])

# RIGHT: predictions go first, labels go second
right_loss = loss_fn(dummy_predictions, dummy_labels)

# WRONG: arguments swapped on purpose - the loss is not symmetric in its
# arguments, so this prints a different value than the call above
wrong_loss = loss_fn(dummy_labels, dummy_predictions)

print(right_loss, wrong_loss)

### BCEWithLogitsLoss

In [None]:
loss_fn_logits = nn.BCEWithLogitsLoss(reduction='mean')

loss_fn_logits

In [None]:
# Logits (log odds ratios) of the probabilities 0.9 and 0.2
logit1 = log_odds_ratio(.9)
logit2 = log_odds_ratio(.2)

# Labels paired with logits instead of probabilities
dummy_labels = torch.tensor([1.0, 0.0])
dummy_logits = torch.tensor([logit1, logit2])

print(dummy_logits)

In [None]:
loss = loss_fn_logits(dummy_logits, dummy_labels)
loss

### Imbalanced Dataset

In [None]:
dummy_imb_labels = torch.tensor([1.0, 0.0, 0.0, 0.0])
dummy_imb_logits = torch.tensor([logit1, logit2, logit2, logit2])

$$
\Large pos\_weight = \frac{\# points\ in\ negative\ class}{\# points\ in\ positive\ class}
$$

In [None]:
# Number of points in each class, converted to float tensors
n_neg = (dummy_imb_labels == 0).sum().float()
n_pos = (dummy_imb_labels == 1).sum().float()

# Ratio of negative to positive points, reshaped into a one-element tensor
pos_weight = (n_neg / n_pos).view(1,)
pos_weight

In [None]:
loss_fn_imb = nn.BCEWithLogitsLoss(reduction='mean', pos_weight=pos_weight)

In [None]:
loss = loss_fn_imb(dummy_imb_logits, dummy_imb_labels)
loss

$$
\Large weighted\ average = \frac{pos\_weight \cdot \sum_{i=1}^{N_{pos}}{loss_i}+\sum_{i=1}^{N_{neg}}{loss_i}}{pos\_weight \cdot N_{pos}+N_{neg}}
$$

$$
\Large BCEWithLogitsLoss = \frac{pos\_weight \cdot \sum_{i=1}^{N_{pos}}{loss_i}+\sum_{i=1}^{N_{neg}}{loss_i}}{N_{pos}+N_{neg}}
$$

In [None]:
# Same weighted loss, but summed instead of averaged over the points
loss_fn_imb_sum = nn.BCEWithLogitsLoss(reduction='sum', pos_weight=pos_weight)

loss = loss_fn_imb_sum(dummy_imb_logits, dummy_imb_labels)

# Divides the weighted sum by the weighted number of points
# (pos_weight * n_pos + n_neg) to get the weighted average
loss = loss / (pos_weight * n_pos + n_neg)
loss

## Model Configuration

In [None]:
# Sets learning rate - this is "eta" ~ the "n" like Greek letter
lr = 0.1

# Seeds the generator before the model is created
torch.manual_seed(42)
# A single linear layer (two inputs, one output) and no sigmoid layer:
# the model outputs logits
model = nn.Sequential()
model.add_module('linear', nn.Linear(2, 1))

# Defines a SGD optimizer to update the parameters
optimizer = optim.SGD(model.parameters(), lr=lr)

# Defines a BCE loss function that takes logits
loss_fn = nn.BCEWithLogitsLoss()

## Model Training

In [None]:
n_epochs = 100

# Hands model, loss function and optimizer to the StepByStep helper,
# attaches the training and validation loaders, and runs the training
sbs = StepByStep(model, loss_fn, optimizer)
sbs.set_loaders(train_loader, val_loader)
sbs.train(n_epochs)

In [None]:
fig = sbs.plot_losses()

In [None]:
print(model.state_dict())

$$
\large
\begin{array}{ccccccc}
z & = & b & + & w_1x_1 & + & w_2x_2
\\
z & = & -0.0587 & + & 1.1822x_1 & - & 1.8684x_2
\end{array}
$$

#### Making Predictions (Logits)

In [None]:
predictions = sbs.predict(x_train_tensor[:4])
predictions

#### Making Predictions (Probabilities)

In [None]:
probabilities = sigmoid(predictions)
probabilities

#### Making Predictions (Classes)

$$
\Large y =
\begin{cases}
1,\ if\ P(y=1) \ge 0.5
\\
0,\ if \ P(y=1) < 0.5
\end{cases}
$$

$$
\Large y =
\begin{cases}
1,\ if\ \sigma(z) \ge 0.5
\\
0,\ if \ \sigma(z) < 0.5
\end{cases}
$$

$$
\Large y =
\begin{cases}
1,\ if\ z \ge 0
\\
0,\ if \ z < 0
\end{cases}
$$

In [None]:
# A logit of zero corresponds to a probability of 0.5, so thresholding the
# logits at zero yields the predicted classes directly.
# The builtin `int` replaces `np.int`, which was only an alias for it: the
# alias was deprecated in NumPy 1.20 and removed in NumPy 1.24, where
# `np.int` raises an AttributeError.
classes = (predictions >= 0).astype(int)
classes

## Decision Boundary

$$
\Large
\begin{array}{ccccccccc}
z & = &   0 & =   & b & + & w_1x_1 & + & w_2x_2
\\
& & -w_2x_2 & = & b & + & w_1x_1 & &
\\
& & x_2 & = & -\frac{b}{w_2} & - &\frac{w_1}{w_2}x_1 & &
\end{array}
$$

$$
\Large
\begin{array}{ccccccccc}
& & x_2 & = & -\frac{0.0587}{1.8684} & + &\frac{1.1822}{1.8684}x_1 & &
\\
& & x_2 & = & -0.0314 & + &0.6327x_1 & &
\end{array}
$$

In [None]:
# Training set
fig = figure7(X_train, y_train, sbs.model, sbs.device)

In [None]:
# Validation set
fig = figure7(X_val, y_val, sbs.model, sbs.device)

### Are my data points separable?

In [None]:
x = np.array([-2.8, -2.2, -1.8, -1.3, -.4, 0.3, 0.6, 1.3, 1.9, 2.5])
y = np.array([0., 0., 0., 0., 1., 1., 1., 0., 0., 0.])

fig = one_dimension(x, y)

$$
\Large X_2 = f(X_1)= X_1^2
$$

In [None]:
fig = two_dimensions(x, y)

In [None]:
# A deeper model: hidden layer (two inputs, ten units), ReLU activation,
# output layer (ten inputs, one unit) and a final sigmoid
# NOTE(review): this rebinds `model` and `loss_fn`; the cells below keep
# using `sbs`, presumably still holding the model trained above - confirm
model = nn.Sequential()
model.add_module('hidden', nn.Linear(2, 10))
model.add_module('activation', nn.ReLU())
model.add_module('output', nn.Linear(10, 1))
model.add_module('sigmoid', nn.Sigmoid())

# The model ends in a sigmoid, so it is paired with BCELoss rather than
# with the logits version of the loss
loss_fn = nn.BCELoss()

## Classification Threshold

In [None]:
# Predictions (logits) for the validation points
logits_val = sbs.predict(X_val)
# Logits -> probabilities; squeeze() drops the size-one dimensions
probabilities_val = sigmoid(logits_val).squeeze()
# Probabilities at or above the threshold are classified as positive
threshold = 0.5

In [None]:
fig = figure9(X_val, y_val, sbs.model, sbs.device, probabilities_val, threshold)

In [None]:
fig = figure10(y_val, probabilities_val, threshold, 0.04, False)

### Confusion Matrix

In [None]:
fig = figure10(y_val, probabilities_val, threshold, 0.04, True)

In [None]:
cm_thresh50 = confusion_matrix(y_val, (probabilities_val >= .5))
cm_thresh50

#### True and False Positives and Negatives

In [None]:
def split_cm(cm):
    """Split a 2x2 confusion matrix into (tn, fp, fn, tp).

    Rows hold the actual classes (negatives in the top row, positives in
    the bottom row); columns hold the predicted classes (negatives in the
    first column, positives in the second).
    """
    actual_negative, actual_positive = cm[0], cm[1]

    # Top row: actual negatives, predicted negative / predicted positive
    tn, fp = actual_negative[0], actual_negative[1]
    # Bottom row: actual positives, predicted negative / predicted positive
    fn, tp = actual_positive[0], actual_positive[1]

    return tn, fp, fn, tp

### Metrics

### True and False Positive Rates

$$
\Large TPR = \frac{TP}{TP + FN} \ \ \  FPR = \frac{FP}{FP + TN}
$$

In [None]:
def tpr_fpr(cm):
    """Return (TPR, FPR) computed from a 2x2 confusion matrix.

    TPR = TP / (TP + FN) and FPR = FP / (FP + TN).
    """
    tn, fp, fn, tp = split_cm(cm)

    actual_positives = tp + fn
    actual_negatives = fp + tn

    return tp / actual_positives, fp / actual_negatives

In [None]:
tpr_fpr(cm_thresh50)

### Precision and Recall

$$
\Large Recall = \frac{TP}{TP + FN} \ \ \  Precision = \frac{TP}{TP + FP}
$$

In [None]:
def precision_recall(cm):
    """Return (precision, recall) computed from a 2x2 confusion matrix.

    Precision = TP / (TP + FP) and Recall = TP / (TP + FN).
    """
    tn, fp, fn, tp = split_cm(cm)

    predicted_positives = tp + fp
    actual_positives = tp + fn

    return tp / predicted_positives, tp / actual_positives

In [None]:
precision_recall(cm_thresh50)

### Accuracy

$$
\Large Accuracy = \frac{TP+TN}{TP+TN+FP+FN}
$$

## Trade-offs and Curves

In [None]:
fig = eval_curves_from_probs(y_val, probabilities_val, [.5], annot=True)

### Low Threshold

In [None]:
fig = figure9(X_val, y_val, sbs.model, sbs.device, probabilities_val, threshold=0.3, shift=0.04, annot=True)

In [None]:
confusion_matrix(y_val, (probabilities_val >= 0.3))

In [None]:
fig = eval_curves_from_probs(y_val, probabilities_val, [.3, .5], annot=True)

### High Threshold

In [None]:
fig = figure9(X_val, y_val, sbs.model, sbs.device, probabilities_val, threshold=0.7, shift=0.04, annot=True)

In [None]:
confusion_matrix(y_val, (probabilities_val >= 0.7))

In [None]:
fig = eval_curves_from_probs(y_val, probabilities_val, [.3, .5, .7], annot=True)

### ROC and PR Curves

In [None]:
threshs = np.linspace(0.,1,11)

In [None]:
fig = figure17(y_val, probabilities_val, threshs)

In [None]:
fpr, tpr, thresholds1 = roc_curve(y_val, probabilities_val)
prec, rec, thresholds2 = precision_recall_curve(y_val, probabilities_val)

In [None]:
fig = eval_curves(fpr, tpr, rec, prec, thresholds1, thresholds2, line=True)

### The Precision Quirk

In [None]:
fig = figure19(y_val, probabilities_val)

$$
\Large Precision(thresh=0.40)=\frac{13}{13+2}\approx 0.8667
$$

$$
\Large Precision(thresh=0.50)=\frac{(13-1)}{(13-1)+2}=\frac{12}{12+2}=0.8571
$$

$$
\Large Precision(thresh=0.57)=\frac{12}{12+(2-1)}=\frac{12}{12+1}\approx 0.9231
$$

### Best and Worst Curves

In [None]:
# Best
fig = figure20(y_val)

In [None]:
# Seeded so the random "predictions" are reproducible
np.random.seed(39)
# One uniformly drawn value per validation label, standing in for the
# probabilities of a model that guesses at random
random_probs = np.random.uniform(size=y_val.shape)

In [None]:
fpr_random, tpr_random, thresholds1_random = roc_curve(y_val, random_probs)
prec_random, rec_random, thresholds2_random = precision_recall_curve(y_val, random_probs)

In [None]:
# Worst
fig = figure21(y_val, random_probs)

### Comparing Models

In [None]:
# Area under the curves of our model
auroc = auc(fpr, tpr)
aupr = auc(rec, prec)
print(auroc, aupr)

In [None]:
# Area under the curves of the random model
auroc_random = auc(fpr_random, tpr_random)
aupr_random = auc(rec_random, prec_random)
print(auroc_random, aupr_random)

## Putting It All Together

In [None]:
torch.manual_seed(13)

# Builds float tensors from numpy arrays (labels reshaped into one column)
x_train_tensor = torch.as_tensor(X_train).float()
y_train_tensor = torch.as_tensor(y_train.reshape(-1, 1)).float()

x_val_tensor = torch.as_tensor(X_val).float()
y_val_tensor = torch.as_tensor(y_val.reshape(-1, 1)).float()

# Builds one dataset per split, containing ALL of its data points
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)

# Builds a loader of each set; only the training loader shuffles
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True
)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)

In [None]:
# Sets learning rate - this is "eta" ~ the "n"-like Greek letter
lr = 0.1

# Seeds the generator before the model is created
torch.manual_seed(42)
# A single linear layer (two inputs, one output) and no sigmoid layer:
# the model outputs logits
model = nn.Sequential()
model.add_module('linear', nn.Linear(2, 1))

# Defines a SGD optimizer to update the parameters
optimizer = optim.SGD(model.parameters(), lr=lr)

# Defines a BCE loss function that takes logits
loss_fn = nn.BCEWithLogitsLoss()

In [None]:
n_epochs = 100

# Hands model, loss function and optimizer to the StepByStep helper,
# attaches the training and validation loaders, and runs the training
sbs = StepByStep(model, loss_fn, optimizer)
sbs.set_loaders(train_loader, val_loader)
sbs.train(n_epochs)

In [None]:
print(model.state_dict())

In [None]:
# Predictions (logits) for the validation points
logits_val = sbs.predict(X_val)
# Logits -> probabilities; squeeze() drops the size-one dimensions
probabilities_val = sigmoid(logits_val).squeeze()
# Confusion matrix of actual labels vs. classes predicted at threshold 0.5
cm_thresh50 = confusion_matrix(y_val, (probabilities_val >= 0.5))
cm_thresh50