In [2]:
"""
https://pytorch.org/tutorials/beginner/nn_tutorial.html#refactor-using-dataset
"""

In [96]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from torch.utils.data import DataLoader


In [70]:
# Number of samples per training mini-batch; the validation DataLoader further down uses twice this value.
BATCH_SIZE = 10

## Prepare the initial dataset

In [89]:
# Load the raw Iris CSV. The file has no header row, so pandas labels the
# columns 0..4; column 4 holds the species name.
iris_raw = pd.read_csv("data/iris.data", header=None)
# One-hot encode the species. The dtype is pinned to uint8 because the default
# of get_dummies changed from uint8 to bool in pandas 2.0, which would print
# True/False here and produce bool tensors further down.
species_onehot = pd.get_dummies(iris_raw[4], dtype="uint8")
# Replace the text label column with the three indicator columns.
df = pd.concat([iris_raw.drop(columns=[4]), species_onehot], axis=1)
print(df.head())

     0    1    2    3  Iris-setosa  Iris-versicolor  Iris-virginica
0  5.1  3.5  1.4  0.2            1                0               0
1  4.9  3.0  1.4  0.2            1                0               0
2  4.7  3.2  1.3  0.2            1                0               0
3  4.6  3.1  1.5  0.2            1                0               0
4  5.0  3.6  1.4  0.2            1                0               0


In [90]:
# Feature matrix: every column except the trailing three one-hot species columns.
n_feature_cols = df.shape[1] - 3
X = df.iloc[:, :n_feature_cols]
print(X.head(5))

     0    1    2    3
0  5.1  3.5  1.4  0.2
1  4.9  3.0  1.4  0.2
2  4.7  3.2  1.3  0.2
3  4.6  3.1  1.5  0.2
4  5.0  3.6  1.4  0.2


In [91]:
# Targets: the trailing three one-hot species columns.
first_target_col = df.shape[1] - 3
target = df.iloc[:, first_target_col:]
print(target.head(5))

   Iris-setosa  Iris-versicolor  Iris-virginica
0            1                0               0
1            1                0               0
2            1                0               0
3            1                0               0
4            1                0               0


In [92]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    target,
    test_size=0.20,
    random_state=42,
)
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(120, 4)
(120, 3)
(30, 4)
(30, 3)


## Transform Pandas DF to Tensors

In [93]:
# First view every split as a NumPy array, then copy it into a float32 tensor.
#
# float32 is requested explicitly: torch.tensor would otherwise keep NumPy's
# float64 for the features and the integer/bool dtype of the one-hot targets,
# neither of which matches the float32 parameters torch.nn layers use by default.
#
# np.asarray accepts DataFrames, arrays and CPU tensors alike, so this cell can
# be re-run; the previous .to_numpy() calls raised AttributeError on a second
# run because the names had already been rebound to tensors.
X_train, y_train, X_test, y_test = (
    torch.tensor(np.asarray(split), dtype=torch.float32)
    for split in (X_train, y_train, X_test, y_test)
)
# n = number of training rows, c = number of feature columns.
n, c = X_train.shape
print("Rows", n)
print("cols:", c)

Rows 120
cols: 4


## Prepare the Torch Dataset

In [94]:
# Pair each training feature row with its target row, so that indexing or
# slicing the dataset returns an (inputs, targets) tuple.
train_ds = TensorDataset(X_train, y_train)


## Get the batches without DataLoader

In [95]:
# Walk over the training set in consecutive slices of BATCH_SIZE rows; the last
# slice is simply shorter when n is not a multiple of BATCH_SIZE.
for epoch in range(1):
    for start in range(0, n, BATCH_SIZE):
        stop = start + BATCH_SIZE
        xb, yb = train_ds[start:stop]
        print(xb.shape, yb.shape)

torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])


## Use a Torch DataLoader

In [103]:
# The DataLoader takes over the batching done by hand above; shuffle=True makes
# it draw the training rows in a new random order on every pass.
train_dl = DataLoader(dataset=train_ds, shuffle=True, batch_size=BATCH_SIZE)

for epoch in range(1):
    for batch in train_dl:
        xb, yb = batch
        print(xb.shape, yb.shape)

torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])
torch.Size([10, 4]) torch.Size([10, 3])


## Manage Validation data

In [None]:
# Wrap the held-out split in a dataset of (inputs, targets) pairs as well.
test_ds = TensorDataset(X_test, y_test)
# Evaluation runs without backpropagation, so no gradients have to be stored and
# less memory is needed; that leaves room for batches twice the size of the
# training batches. No shuffling is requested for this loader.
test_dl = DataLoader(dataset=test_ds, batch_size=BATCH_SIZE * 2)

In [None]:
# Train on train_dl for `epochs` passes and print the validation loss after each pass.
# NOTE(review): `epochs`, `model`, `loss_func` and `opt` are not defined in any
# cell visible in this notebook; they must exist before this cell is run.
for epoch in range(epochs):
    model.train()
    for xb, yb in train_dl:
        pred = model(xb)
        loss = loss_func(pred, yb)

        loss.backward()
        opt.step()
        # Clear the gradients so they do not accumulate into the next batch.
        opt.zero_grad()

    model.eval()
    # Weight every batch loss by its batch size and divide by the number of
    # samples. The 30-row test split with a batch size of 20 yields batches of
    # 20 and 10 rows, so a plain mean over batches would count the rows of the
    # short batch twice as heavily.
    # assumes loss_func returns the mean loss over the batch — TODO confirm
    valid_loss = 0.0
    n_valid = 0
    with torch.no_grad():  # evaluation only, no gradients required
        for xb, yb in test_dl:
            valid_loss += loss_func(model(xb), yb) * len(xb)
            n_valid += len(xb)

    print(epoch, valid_loss / n_valid)

## Wrapping DataLoader

Our CNN is fairly concise, but it only works with MNIST, because:
1. It assumes the input is a 28*28 long vector
2. It assumes that the final CNN grid size is 4*4 (since that’s the average
pooling kernel size we used)

Let’s get rid of these two assumptions, so our model works with any 2d single channel image. First, we can remove the initial Lambda layer by moving the data preprocessing into a generator:

In [None]:
def get_data(train_ds, valid_ds, bs):
    """
    Build the DataLoaders for the training and validation sets.

    :param train_ds: dataset served by the training loader (shuffled).
    :param valid_ds: dataset served by the validation loader (not shuffled).
    :param bs: training batch size; the validation loader uses ``bs * 2``.
    :return: tuple ``(train_loader, valid_loader)``.
    """
    train_loader = DataLoader(train_ds, shuffle=True, batch_size=bs)
    valid_loader = DataLoader(valid_ds, batch_size=bs * 2)
    return train_loader, valid_loader

def preprocess(x, y):
    """
    View the input batch ``x`` with shape ``(-1, 1, 28, 28)`` and pass ``y`` through unchanged.

    :param x: tensor whose element count is a multiple of 28 * 28.
    :param y: targets, returned as-is.
    :return: tuple ``(reshaped_x, y)``.
    """
    images = x.view(-1, 1, 28, 28)
    return images, y

class WrappedDataLoader:
    """Iterable wrapper that passes every batch of ``dl`` through ``func``."""

    def __init__(self, dl, func):
        """
        :param dl: the underlying iterable of batches; must support ``len()``.
        :param func: callable invoked as ``func(*batch)`` for every batch.
        """
        self.dl = dl
        self.func = func

    def __len__(self):
        """Return the length reported by the wrapped loader."""
        return len(self.dl)

    def __iter__(self):
        """Yield ``func(*batch)`` for each batch, in the order the wrapped loader produces them."""
        for batch in self.dl:
            yield self.func(*batch)

def to_float32(x, y):
    """Cast a batch of inputs and targets to float32, leaving their shapes untouched."""
    return x.float(), y.float()


# train_ds / test_ds hold 4-feature Iris rows, so the MNIST-specific `preprocess`
# (x.view(-1, 1, 28, 28)) cannot be applied to their batches: a 10 x 4 batch has
# 40 elements, which is not a multiple of 28 * 28, and the view raises a
# RuntimeError on the first iteration. Wrap the loaders with a shape-preserving
# transform instead.
train_dl, valid_dl = get_data(train_ds, test_ds, BATCH_SIZE)
train_dl = WrappedDataLoader(train_dl, to_float32)
valid_dl = WrappedDataLoader(valid_dl, to_float32)