In [1]:
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
from torch.autograd import Function

# Hyper Parameters 
input_size = 784
num_classes = 10
num_epochs = 5
batch_size = 100
learning_rate = 0.001

# MNIST Dataset (Images and Labels)
train_dataset = dsets.MNIST(root='.', 
                            train=True, 
                            transform=transforms.ToTensor(),
                            download=True)

test_dataset = dsets.MNIST(root='.', 
                           train=False, 
                           transform=transforms.ToTensor())

# Dataset Loader (Input Pipline)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)


In [None]:
class LogisticRegression(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LogisticRegression, self).__init__()
        self.fc1 = nn.Linear(input_size, num_classes)
        self.fc2 = nn.Linear(500,100)
        self.fc3 = nn.Linear(input_size,num_classes)
    
    def forward(self, x):
#         x = self.fc1(x).clamp(min=0)
#         x = self.fc2(x).clamp(min=0)
        return self.fc3(x)


In [None]:
model = LogisticRegression(input_size, num_classes)

# Loss and Optimizer
# Softmax is internally computed.
# Set parameters to be updated.
criterion = nn.CrossEntropyLoss()  
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)  

# Training the Model
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = Variable(images.view(-1, 28*28))
        labels = Variable(labels)
        
        # Forward + Backward + Optimize
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f' 
                   % (epoch+1, num_epochs, i+1, len(train_dataset)//batch_size, loss.item()))

# Test the Model
correct = 0
total = 0
for images, labels in test_loader:
    images = Variable(images.view(-1, 28*28))
    outputs = model(images)
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum()
    
print('Accuracy of the model on the 10000 test images: %d %%' % (100 * correct / total))


In [28]:
class LinearFunction(Function):

    # Note that both forward and backward are @staticmethods
    @staticmethod
    # bias is an optional argument
    def forward(ctx, input, weight, bias=None):
        ctx.save_for_backward(input, weight, bias)
        output = input.mm(weight)
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        return output

    # This function has only a single output, so it gets only one gradient
    @staticmethod
    def backward(ctx, grad_output):
        # This is a pattern that is very convenient - at the top of backward
        # unpack saved_tensors and initialize all gradients w.r.t. inputs to
        # None. Thanks to the fact that additional trailing Nones are
        # ignored, the return statement is simple even when the function has
        # optional inputs.
        input, weight, bias = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None

        # These needs_input_grad checks are optional and there only to
        # improve efficiency. If you want to make your code simpler, you can
        # skip them. Returning gradients for inputs that don't require it is
        # not an error.
        if ctx.needs_input_grad[0]:
            grad_input = grad_output.mm(weight)
        if ctx.needs_input_grad[1]:
            grad_weight = grad_output.t().mm(input)
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum(0).squeeze(0)

        return grad_input, grad_weight.t(), grad_bias


In [31]:
linear = LinearFunction.apply

w1 = torch.randn(784,10, requires_grad=True)
b1 = torch.randn(10, requires_grad=True)
w1.shape, b1.shape
criterion = nn.CrossEntropyLoss()  
learning_rate = 1e-6
# w1.shape, w1.t().shape

In [32]:
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = Variable(images.view(-1, 28*28))
        labels = Variable(labels)
        
        # Forward + Backward + Optimize
#         optimizer.zero_grad()
        outputs = linear(images, w1, b1)
        loss = criterion(outputs, labels)
        loss.backward()
        
        with torch.no_grad():
            w1 -= learning_rate * w1.grad
            b1 -= learning_rate * b1.grad
            w1.grad.zero_()
            b1.grad.zero_()
        
        if (i+1) % 100 == 0:
            print ('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f' 
                   % (epoch+1, num_epochs, i+1, len(train_dataset)//batch_size, loss.item()))



Epoch: [1/5], Step: [100/600], Loss: 15.0241
Epoch: [1/5], Step: [200/600], Loss: 12.6359
Epoch: [1/5], Step: [300/600], Loss: 11.3570
Epoch: [1/5], Step: [400/600], Loss: 12.7539
Epoch: [1/5], Step: [500/600], Loss: 12.4810
Epoch: [1/5], Step: [600/600], Loss: 13.1052
Epoch: [2/5], Step: [100/600], Loss: 13.1589
Epoch: [2/5], Step: [200/600], Loss: 12.7618
Epoch: [2/5], Step: [300/600], Loss: 12.6086
Epoch: [2/5], Step: [400/600], Loss: 13.7224
Epoch: [2/5], Step: [500/600], Loss: 12.8392
Epoch: [2/5], Step: [600/600], Loss: 12.3166
Epoch: [3/5], Step: [100/600], Loss: 13.1667
Epoch: [3/5], Step: [200/600], Loss: 13.2436
Epoch: [3/5], Step: [300/600], Loss: 14.2716
Epoch: [3/5], Step: [400/600], Loss: 12.8490
Epoch: [3/5], Step: [500/600], Loss: 13.1255
Epoch: [3/5], Step: [600/600], Loss: 12.2093
Epoch: [4/5], Step: [100/600], Loss: 13.4401
Epoch: [4/5], Step: [200/600], Loss: 12.6585
Epoch: [4/5], Step: [300/600], Loss: 13.8559
Epoch: [4/5], Step: [400/600], Loss: 13.3821
Epoch: [4/

In [34]:
# Test the Model
correct = 0
total = 0
for images, labels in test_loader:
    images = Variable(images.view(-1, 28*28))
    outputs = linear(images,w1,b1)
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum()
    
print('Accuracy of the model on the 10000 test images: %d %%' % (100 * correct / total))


Accuracy of the model on the 10000 test images: 11 %


In [18]:
for i, (images, labels) in enumerate(train_loader):
    images = Variable(images.view(-1, 28*28))
    labels = Variable(labels)

out = linear(images, w1, b1)
loss = criterion(out, labels)


In [None]:
w1.grad.zero_()
b1.grad.zero_()

In [23]:
w1.grad.shape., b1.grad.shape

(torch.Size([10, 784]), torch.Size([10]))

In [19]:
loss.backward()