(As always, a new environment specifically for this course is recommended!)

## Handling Data in PyTorch

The torch ```tensor``` is the fundamental datatype used by PyTorch
- works very similarly to NumPy arrays
- designed to work with GPUs
- optimized for automatic differentiation

In [2]:
import torch
import numpy as np

# array: a 2x2 NumPy array built from a nested Python list
X_array = np.array([[1,0],[0,1]])

# tensor: the same nested list turned into a torch tensor
X_tensor = torch.tensor([[1,0],[0,1]])

# bare tuple as the last expression so both objects are shown side by side
X_array, X_tensor

ModuleNotFoundError: No module named 'torch'

In [None]:
# can easily convert back and forth:
# tensor -> NumPy array via .numpy(), NumPy array -> tensor via torch.from_numpy
X_tensor.numpy(), torch.from_numpy(X_array)

If you want to use a GPU, you can use one for free in Google Colab or Kaggle for a limited amount of time each week.

In [None]:
# if a GPU is available we can send the tensor to the GPU
# (when no GPU is available, neither `device` nor `X_tensor_cuda` gets defined)
if torch.cuda.is_available():
    # device with index 0
    device = torch.device(0)
    # .to(device) hands back the tensor for that device; X_tensor itself is not reassigned
    X_tensor_cuda = X_tensor.to(device)

The ```Dataset``` is an abstract class which holds the recipe for producing your data
- can do complex operations to retrieve/transform your data in parallel
- You must implement the following methods:
 - ```__init__```
 - ```__len__```: length of the dataset
 - ```__getitem__```: recipe for retrieving the *i*-th datapoint

In [None]:
from torch.utils.data import Dataset

def random_linear_data(m, b, n):
    """Sample ``n`` noisy points along the line ``y = m * x + b``.

    ``x`` is ``10 * np.random.rand(n)`` and the noise added to ``y`` is a
    second ``np.random.rand(n)`` draw.

    Returns the pair ``(x, y)`` of length-``n`` NumPy arrays.
    """
    # draw the inputs first, then the noise: the random stream is consumed in this order
    inputs = np.random.rand(n) * 10
    line = m * inputs + b
    noise = np.random.rand(n)
    return inputs, line + noise

# create a dataset class
class LinearDataset(Dataset):
    """Dataset of noisy points along a line, generated once at construction."""

    def __init__(self, m, b, n):
        # things I need to initialize: generate the data up front, keep it as tensors
        x_np, y_np = random_linear_data(m, b, n)
        self.x = torch.from_numpy(x_np)
        self.y = torch.from_numpy(y_np)
        self.n = n

    def __len__(self):
        """Length of the dataset (the ``n`` given at construction)."""
        return self.n

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(x, y)`` pair.

        Any transformations you want to do on-the-fly would go here.
        """
        x_i = self.x[idx]
        y_i = self.y[idx]
        return x_i, y_i
    
# slope m=1.5, intercept b=50, n=100 points
linear_ds = LinearDataset(1.5, 50, 100)

In [None]:
# get first datapoint: start iterating over the dataset and take the first (x, y) pair
next(iter(linear_ds))

In [None]:
# collect the dataset's x and y values as plain Python numbers
X = []
Y = []
# iterate through the dataset
for x, y in linear_ds:
    # .item() pulls the Python number out of each single-element tensor
    X.append(x.item())
    Y.append(y.item())

# scatter plot of the collected points
import matplotlib.pyplot as plt
plt.scatter(X, Y)
plt.show()

In [None]:
# turn iris data into a Dataset
import seaborn as sns
import pandas as pd

# load seaborn's 'iris' example dataset
iris = sns.load_dataset('iris')
# drop the 'virginica' rows
iris = iris[iris.species != 'virginica']
# preview the first rows
iris.head()

In [None]:
class IrisDataset(Dataset):
    """Dataset over an iris DataFrame: two sepal features and a numeric species label."""

    def __init__(self, df):
        # lookup table from species name to numeric label
        self.species_val = dict(setosa=0, versicolor=1)
        self.df = df

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Build the ``idx``-th (features, label) pair on-the-fly.

        Returns a float tensor ``[sepal_length, sepal_width]`` and a float
        scalar tensor holding the species label.
        """
        record = self.df.iloc[idx]

        features = [record['sepal_length'], record['sepal_width']]
        label = self.species_val[record['species']]

        x = torch.tensor(features).float()
        y = torch.tensor(label).float()
        return x, y
    
# wrap the iris DataFrame in the Dataset and look at the first (x, y) pair
iris_ds = IrisDataset(iris)
next(iter(iris_ds))

In general a good rule of thumb for what to do on-the-fly vs. preprocessing:
- If it is a random alteration (e.g. data augmentation): on-the-fly
- If it is a time-consuming step that is also the same each time: preprocessing

The ```DataLoader``` helps us iterate over a Dataset
- can choose batch size
- can shuffle
- can be retrieved in parallel
- automatically collates tensors

In [None]:
from torch.utils.data import DataLoader

# batches of 10 datapoints, with shuffling turned on
iris_dl = DataLoader(iris_ds, batch_size=10, shuffle=True)

In [None]:
# grab one batch and inspect its shapes and contents
x, y = next(iter(iris_dl))
print(x.shape, y.shape)
print(x, y)

## Defining a Model

Let's define a simple Feed Forward neural network for the iris dataset

In [None]:
import torch.nn as nn

class TwoLayerNN(nn.Module):
    """Simple feed-forward network: Linear -> ReLU -> Linear.

    Args:
        input_dim: number of input features.
        hidden_dim: width of the hidden layer.
        output_dim: number of outputs per datapoint.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()

        # initialize the layers with random weights
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the raw outputs (no final activation) for ``x``.

        ``x`` has ``input_dim`` entries along its last dimension. Every
        leading dimension of ``x`` is kept; the trailing output dimension is
        dropped only when ``output_dim == 1``.
        """
        # define the actual function
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)

        # don't worry about the last activation function for now
        #
        # Squeeze ONLY the last dimension: a bare torch.squeeze(x) would also
        # drop a batch dimension of size 1, so a single-row batch would come
        # out 0-d and no longer match the shape of its targets.
        return x.squeeze(-1)
        
# 2 input features, 5 hidden units, 1 output
model = TwoLayerNN(2, 5, 1)

In [None]:
# show the model's printed representation
print(model)

Note the attached gradient function below. PyTorch autograd is keeping track of the computational graph for computing partial derivatives with respect to the various parameters/weights.

In [None]:
# run one batch through the model and show its outputs next to the labels
x, y = next(iter(iris_dl))
model(x), y

Some very useful tools for looking at models

In [None]:
from torchsummary import summary
# summarize the model for inputs with 2 features, on the CPU
summary(model, input_size = (2,), device='cpu')

In [None]:
# uh oh: input_size has 3 features here, but the model was created with input_dim=2
summary(model, input_size = (3,), device='cpu')

In [None]:
from torchviz import make_dot
# draw the graph behind model(x), passing the model's named parameters as a dict
make_dot(model(x), params=dict(list(model.named_parameters())))

In [None]:
# if the function is straightforward we can just use Sequential
#model = nn.Sequential(nn.Linear(2, 5),
#                      nn.ReLU(),
#                      nn.Linear(5, 1))

## Train the Model
We need the following ingredients
- A loss function for our model
- An optimization algorithm

In [None]:
import torch.optim as optim

# feeds outputs through a sigmoid before computing BCE Loss
lossFun = nn.BCEWithLogitsLoss()
# Adam over all of the model's parameters, learning rate 0.001
optimizer = optim.Adam(model.parameters(), lr = 0.001)

Below we adjust the weights of the model according to one batch

In [None]:
# adjust the weights according to one batch

# some layers will do different things during training/prediction (e.g. dropout)
model.train()

# clear any old gradients up front (otherwise they will accumulate)
optimizer.zero_grad()

# forward pass on a single batch, then the loss
x, y = next(iter(iris_dl))
y_pred = model(x)
loss = lossFun(y_pred, y)
print(loss.item())

# backward pass: compute the gradients w.r.t. loss function
loss.backward()

# adjust weights!
optimizer.step()

An *epoch* is one pass through the training set

In [None]:
# very crude training loop (you'll make a fancier one in your first lab)

# switch to training mode once; nothing inside the loop changes the mode
model.train()

for epoch in range(100):
    # one pass over every batch the DataLoader yields
    for x, y in iris_dl:
        # forward pass and loss for this batch
        y_pred = model(x)
        loss = lossFun(y_pred, y)
        print(loss.item())

        # clear old gradients, backpropagate, then update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

## After Training
Let's use our model to make some predictions

In [None]:
x,y = next(iter(iris_dl))

# some layers will do different things during training/prediction (e.g. dropout)
model.eval()

# don't compute gradients
with torch.no_grad():
    # the model returns raw outputs; sigmoid maps them into (0, 1)
    outputs = torch.sigmoid(model(x))

# threshold at 0.5; derived from `outputs` so the result always has the
# batch's own length instead of a hard-coded 10
y_pred = (outputs > .5).float()

y_pred, y

In [None]:
# save your model parameters and optimizer checkpoint
checkpoint = {'model_state_dict': model.state_dict(),
              'optimizer_state_dict' :optimizer.state_dict()}
# write both state dicts to a single file
torch.save(checkpoint, 'model_checkpoint.pt')

In [None]:
# now load them up!
# NOTE(review): torch.load presumably unpickles the file — only load checkpoints you trust (confirm against the torch.load docs)
checkpoint = torch.load('model_checkpoint.pt')
# restore the model weights and the optimizer state from the saved dicts
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

You can save other things in the checkpoint such as the loss history, epoch number, etc. if you really want to save every aspect of your progress.

### Tip: Custom Loss

In [None]:
class some_loss(nn.Module):
    """Template for a custom loss: norm of the error divided by its number of entries.

    Args:
        hyperparam: stored on the instance as ``self.hyperparam``; ``forward``
            does not use it in this template.
    """

    def __init__(self, hyperparam):
        super().__init__()
        self.hyperparam = hyperparam

    def forward(self, y_pred, y):
        """Return ``torch.norm(y_pred - y) / torch.numel(y_pred - y)`` as a scalar tensor."""
        diff = y_pred - y

        # average over each entry and batch size
        # (the value must be returned, otherwise there is nothing to call .backward() on)
        return torch.norm(diff) / torch.numel(diff)