In [29]:
import torch
import torchvision

from torchvision import transforms, datasets

In [99]:
train = datasets.MNIST("", train = True, download = True, 
                       transform = transforms.Compose([transforms.ToTensor()]))
test = datasets.MNIST("", train = False, download = True, 
                       transform = transforms.Compose([transforms.ToTensor()]))


  0%|          | 0/9912422 [00:00<?, ?it/s]

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to MNIST/raw/train-images-idx3-ubyte.gz


9920512it [00:03, 3067763.44it/s]                             


Extracting MNIST/raw/train-images-idx3-ubyte.gz to MNIST/raw
Using downloaded and verified file: MNIST/raw/train-labels-idx1-ubyte.gz
Extracting MNIST/raw/train-labels-idx1-ubyte.gz to MNIST/raw
Using downloaded and verified file: MNIST/raw/t10k-images-idx3-ubyte.gz
Extracting MNIST/raw/t10k-images-idx3-ubyte.gz to MNIST/raw
Using downloaded and verified file: MNIST/raw/t10k-labels-idx1-ubyte.gz
Extracting MNIST/raw/t10k-labels-idx1-ubyte.gz to MNIST/raw
Processing...
Done!


In [103]:
# train.

In [100]:
trainset = torch.utils.data.DataLoader(train, batch_size = 50, shuffle = True)
testset = torch.utils.data.DataLoader(test, batch_size = 50, shuffle = True)

In [107]:
trainset.dataset.targets

tensor([5, 0, 4,  ..., 5, 6, 8])

### Initialize the model

In [4]:
X_1  = trainset.dataset.data[0]
y_1 = trainset.dataset.targets[0]

In [52]:
X_tensor = torch.Tensor(X)
y_tensor = torch.Tensor(y.astype('int64'))

In [58]:
net(X_tensor[:1])

tensor([[0.1399, 0.1296, 0.0474, 0.1319, 0.0643, 0.1262, 0.0992, 0.1222, 0.0865,
         0.0528]], grad_fn=<SoftmaxBackward>)

In [56]:
# y_tensor.dtype

In [43]:
# type(X_tensor[0])

In [30]:
import torch.nn as nn
import torch.nn.functional as F
def build_model():
    W1 = nn.Linear(28*28, 64)
    W2 = nn.Linear(64, 10)
    return {'W1': W1, 'W2': W2} 

In [6]:
model = build_model()

model

{'W1': Linear(in_features=784, out_features=64, bias=True),
 'W2': Linear(in_features=64, out_features=10, bias=True)}

In [7]:
def forward(X, model):
    W1, W2 = tuple(model.values())
    Z1 = W1(X)
    A1 = F.sigmoid(Z1)
    Z2 = W2(A1)
    A2 = F.softmax(Z2, dim = 1)
    return (Z1, A1, Z2, A2)

### Combine Into Class

In [31]:
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28*28, 64)
        self.fc2 = nn.Linear(64, 10)
        
    def forward(self, x):
        A1 = torch.sigmoid(self.fc1(x))
        return F.softmax(self.fc2(A1), dim = 1)

In [33]:
net = Net()

In [53]:
net(X_1.float().view(-1,784)).shape

torch.Size([1, 10])

In [64]:
for param in net.parameters():
    print(param.shape)

torch.Size([64, 784])
torch.Size([64])
torch.Size([10, 64])
torch.Size([10])


In [65]:
net.parameters()

<generator object Module.parameters at 0x185f05ed0>

### Training A Network

In [None]:
from sklearn.datasets import fetch_openml
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
X.shape

In [91]:
X_tensor = torch.Tensor(X)
y_tensor = torch.Tensor(y.astype('long'))

In [92]:
X_data = torch.utils.data.DataLoader(X_tensor,  batch_size = 50)
y_data = torch.utils.data.DataLoader(y_tensor,  batch_size = 50)

In [93]:
output = net(X_tensor[:1].view(1, 784))
output

tensor([[3.6040e-12, 2.8281e-14, 7.8524e-16, 1.1378e-06, 2.0242e-21, 1.0000e+00,
         5.2465e-18, 2.2632e-11, 5.8933e-15, 7.0108e-17]],
       grad_fn=<SoftmaxBackward>)

In [94]:
import torch.optim as optim

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.0005)

In [108]:
for epoch in range(10): # 3 full passes over the data
    for data in trainset:  # `data` is a batch of data
        X, y = data  # X is the batch of features, y is the batch of targets.
        net.zero_grad()  # sets gradients to 0 before loss calc. You will do this likely every step.
        output = net(X.view(-1,28*28))  # pass in the reshaped batch (recall they are 28x28 atm)
        loss = F.nll_loss(output, y)  # calc and grab the loss value
        loss.backward()  # apply this loss backwards thru the network's parameters
        optimizer.step()  # attempt to optimize weights to account for loss/gradients
    print(loss)  # print loss. We hope loss (a measure of wrong-ness) declines! 

tensor(-0.9686, grad_fn=<NllLossBackward>)
tensor(-0.9338, grad_fn=<NllLossBackward>)
tensor(-0.9363, grad_fn=<NllLossBackward>)
tensor(-0.9966, grad_fn=<NllLossBackward>)
tensor(-0.9321, grad_fn=<NllLossBackward>)
tensor(-0.9400, grad_fn=<NllLossBackward>)
tensor(-0.9517, grad_fn=<NllLossBackward>)
tensor(-0.9379, grad_fn=<NllLossBackward>)
tensor(-0.9381, grad_fn=<NllLossBackward>)
tensor(-0.9770, grad_fn=<NllLossBackward>)


In [110]:
torch.argmax(net(trainset.dataset.data.view(-1, 784).float()), axis = 1)[:100]

tensor([5, 0, 4, 1, 9, 2, 1, 3, 1, 4, 3, 5, 3, 6, 1, 7, 2, 8, 6, 9, 4, 0, 9, 1,
        1, 2, 4, 3, 2, 1, 3, 8, 6, 9, 0, 5, 6, 0, 7, 6, 1, 8, 7, 9, 3, 9, 8, 5,
        8, 3, 3, 0, 7, 4, 9, 8, 0, 9, 4, 1, 4, 4, 6, 0, 4, 5, 6, 1, 0, 0, 1, 7,
        1, 6, 3, 0, 2, 1, 1, 7, 3, 0, 2, 6, 7, 8, 3, 9, 0, 4, 6, 7, 4, 6, 8, 0,
        7, 8, 3, 1])

In [111]:
trainset.dataset.targets[:100]

tensor([5, 0, 4, 1, 9, 2, 1, 3, 1, 4, 3, 5, 3, 6, 1, 7, 2, 8, 6, 9, 4, 0, 9, 1,
        1, 2, 4, 3, 2, 7, 3, 8, 6, 9, 0, 5, 6, 0, 7, 6, 1, 8, 7, 9, 3, 9, 8, 5,
        9, 3, 3, 0, 7, 4, 9, 8, 0, 9, 4, 1, 4, 4, 6, 0, 4, 5, 6, 1, 0, 0, 1, 7,
        1, 6, 3, 0, 2, 1, 1, 7, 9, 0, 2, 6, 7, 8, 3, 9, 0, 4, 6, 7, 4, 6, 8, 0,
        7, 8, 3, 1])

In [112]:
torch.argmax(net(testset.dataset.data.view(-1, 784).float()), axis = 1)[:100]

tensor([7, 2, 1, 0, 4, 1, 4, 9, 5, 9, 0, 6, 9, 0, 1, 5, 9, 7, 3, 4, 9, 6, 6, 5,
        4, 0, 7, 4, 0, 1, 3, 1, 3, 5, 7, 2, 7, 1, 2, 1, 1, 7, 4, 2, 3, 5, 1, 2,
        4, 4, 6, 3, 5, 5, 6, 0, 4, 1, 9, 5, 7, 8, 9, 3, 7, 4, 6, 4, 3, 0, 7, 0,
        2, 9, 1, 7, 3, 2, 9, 7, 7, 6, 2, 7, 8, 4, 7, 3, 6, 1, 3, 6, 9, 3, 1, 4,
        9, 7, 6, 9])

In [113]:
testset.dataset.targets[:100]

tensor([7, 2, 1, 0, 4, 1, 4, 9, 5, 9, 0, 6, 9, 0, 1, 5, 9, 7, 3, 4, 9, 6, 6, 5,
        4, 0, 7, 4, 0, 1, 3, 1, 3, 4, 7, 2, 7, 1, 2, 1, 1, 7, 4, 2, 3, 5, 1, 2,
        4, 4, 6, 3, 5, 5, 6, 0, 4, 1, 9, 5, 7, 8, 9, 3, 7, 4, 6, 4, 3, 0, 7, 0,
        2, 9, 1, 7, 3, 2, 9, 7, 7, 6, 2, 7, 8, 4, 7, 3, 6, 1, 3, 6, 9, 3, 1, 4,
        1, 7, 6, 9])

### Resources

[Towards data science Pytorch Gradients](https://towardsdatascience.com/understanding-pytorch-with-an-example-a-step-by-step-tutorial-81fc5f8c4e8e)

[Pytorch viz](https://github.com/szagoruyko/pytorchviz)