# **2 Layer Neural Network**

1. Weights initialized using contrastive divergence.
2. Fine-tuning using backpropagation.

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd gdrive/My\ Drive/Colab\ Notebooks/


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive
/content/gdrive/My Drive/Colab Notebooks


In [0]:
#import libraries
import numpy as np
from sklearn.linear_model import LogisticRegression
import torch
import torchvision.datasets
import torchvision.models
import torchvision.transforms
# Global GPU switch: True when PyTorch can see a CUDA device. Later cells read
# this flag when constructing the RBMs and when moving batches to the GPU.
CUDA = torch.cuda.is_available()
# Index of the CUDA device selected as the current device when a GPU is present.
CUDA_DEVICE = 0
if CUDA:
    torch.cuda.set_device(CUDA_DEVICE)

## **Model**

In [0]:
import torch
class RBM():
    """Restricted Boltzmann Machine trained with k-step contrastive divergence (CD-k).

    Parameters are plain tensors (no autograd); they are updated in place by
    `contrastive_divergence` using momentum and multiplicative L2 weight decay.

    Args:
        num_visible: number of visible units (input width).
        num_hidden: number of hidden units.
        k: number of Gibbs steps per update; must be >= 1 when training.
        learning_rate: step size applied to the momentum buffers.
        momentum_coefficient: decay factor applied to the momentum buffers
            before each new gradient estimate is added.
        weight_decay: fraction of the weights subtracted after every update.
        use_cuda: when True, all parameters and random draws live on the
            current CUDA device.
    """

    def __init__(self, num_visible, num_hidden, k, learning_rate=1e-3, momentum_coefficient=0.5, weight_decay=1e-4, use_cuda=True):
        self.num_visible = num_visible
        self.num_hidden = num_hidden
        self.k = k
        self.learning_rate = learning_rate
        self.momentum_coefficient = momentum_coefficient
        self.weight_decay = weight_decay
        self.use_cuda = use_cuda

        # Weights have shape (num_visible, num_hidden).
        self.weights = torch.randn(num_visible, num_hidden) * 0.1
        self.visible_bias = torch.ones(num_visible) * 0.5
        self.hidden_bias = torch.zeros(num_hidden)

        self.weights_momentum = torch.zeros(num_visible, num_hidden)
        self.visible_bias_momentum = torch.zeros(num_visible)
        self.hidden_bias_momentum = torch.zeros(num_hidden)

        if self.use_cuda:
            self.weights = self.weights.cuda()
            self.visible_bias = self.visible_bias.cuda()
            self.hidden_bias = self.hidden_bias.cuda()

            self.weights_momentum = self.weights_momentum.cuda()
            self.visible_bias_momentum = self.visible_bias_momentum.cuda()
            self.hidden_bias_momentum = self.hidden_bias_momentum.cuda()

    def sample_hidden(self, visible_probabilities):
        """Return the hidden-unit activation probabilities for a batch of visible rows.

        Despite the name, no sampling happens here: the result is
        sigmoid(visible @ weights + hidden_bias), one row per input row.
        """
        hidden_activations = torch.matmul(visible_probabilities, self.weights) + self.hidden_bias
        hidden_probabilities = self._sigmoid(hidden_activations)
        return hidden_probabilities

    def sample_visible(self, hidden_probabilities):
        """Return the visible-unit activation probabilities for a batch of hidden rows.

        Computes sigmoid(hidden @ weights.T + visible_bias); no sampling is done.
        """
        visible_activations = torch.matmul(hidden_probabilities, self.weights.t()) + self.visible_bias
        visible_probabilities = self._sigmoid(visible_activations)
        return visible_probabilities

    def contrastive_divergence(self, input_data):
        """Run one CD-k parameter update on a mini-batch and return its reconstruction error.

        Args:
            input_data: 2-D tensor of shape (batch_size, num_visible) on the
                same device as the parameters.

        Returns:
            0-dim tensor: summed squared difference between `input_data` and
            the visible probabilities after the last Gibbs step.

        Raises:
            ValueError: if `self.k` is smaller than 1 (no Gibbs step would run,
                so there would be no negative phase to learn from).
        """
        if self.k < 1:
            raise ValueError('contrastive divergence needs k >= 1 Gibbs steps, got k=%r' % (self.k,))

        # Positive phase
        positive_hidden_probabilities = self.sample_hidden(input_data)
        # Draw one threshold per (sample, hidden unit). A single vector of
        # num_hidden thresholds would be broadcast over the batch and make every
        # sample in the mini-batch share the same random draw.
        positive_hidden_activations = (positive_hidden_probabilities >= self._random_probabilities(positive_hidden_probabilities.shape)).float()
        positive_associations = torch.matmul(input_data.t(), positive_hidden_activations)

        # Negative phase
        hidden_activations = positive_hidden_activations

        for step in range(self.k):
            visible_probabilities = self.sample_visible(hidden_activations)
            hidden_probabilities = self.sample_hidden(visible_probabilities)
            hidden_activations = (hidden_probabilities >= self._random_probabilities(hidden_probabilities.shape)).float()

        negative_visible_probabilities = visible_probabilities
        negative_hidden_probabilities = hidden_probabilities

        negative_associations = torch.matmul(negative_visible_probabilities.t(), negative_hidden_probabilities)

        # Update parameters
        self.weights_momentum *= self.momentum_coefficient
        self.weights_momentum += (positive_associations - negative_associations)

        self.visible_bias_momentum *= self.momentum_coefficient
        self.visible_bias_momentum += torch.sum(input_data - negative_visible_probabilities, dim=0)

        self.hidden_bias_momentum *= self.momentum_coefficient
        self.hidden_bias_momentum += torch.sum(positive_hidden_probabilities - negative_hidden_probabilities, dim=0)

        batch_size = input_data.size(0)

        self.weights += self.weights_momentum * self.learning_rate / batch_size
        self.visible_bias += self.visible_bias_momentum * self.learning_rate / batch_size
        self.hidden_bias += self.hidden_bias_momentum * self.learning_rate / batch_size

        self.weights -= self.weights * self.weight_decay  # L2 weight decay

        # Compute reconstruction error
        error = torch.sum((input_data - negative_visible_probabilities)**2)

        return error

    def _sigmoid(self, x):
        """Element-wise logistic function (delegates to the built-in implementation)."""
        return torch.sigmoid(x)

    def _random_probabilities(self, num):
        """Return uniform [0, 1) random numbers on the RBM's device.

        Args:
            num: either an int (1-D result of that length) or a full shape
                (tuple / torch.Size) for per-element thresholds.
        """
        random_probabilities = torch.rand(num)

        if self.use_cuda:
            random_probabilities = random_probabilities.cuda()

        return random_probabilities


In [0]:
def train_rbm(EPOCHS,train_loader,rbm,VISIBLE_UNITS):
    """Train `rbm` with contrastive divergence on image batches and print the error per epoch.

    Args:
        EPOCHS: number of full passes over `train_loader`.
        train_loader: iterable yielding (batch, labels) pairs; labels are ignored.
        rbm: object exposing `use_cuda` and `contrastive_divergence(batch)`.
        VISIBLE_UNITS: width each batch is flattened to before training.

    Returns:
        None. One 'Epoch Error' line is printed per epoch with the sum of the
        per-batch reconstruction errors.
    """
    # Follow the RBM's own device flag rather than a notebook-level global, so
    # batches always end up on the same device as the parameters they meet.
    move_to_gpu = rbm.use_cuda

    for epoch in range(EPOCHS):
        epoch_error = 0.0
        for batch, _ in train_loader:
            batch = batch.view(len(batch), VISIBLE_UNITS)  # flatten input data
            if move_to_gpu:
                batch = batch.cuda()
            batch_error = rbm.contrastive_divergence(batch)
            epoch_error += batch_error

        print('Epoch Error (epoch=%d): %.4f' % (epoch, epoch_error))

## **Load dataset**

In [0]:
########## LOADING DATASET ##########
# from rbm import RBM
# MNIST is stored under DATA_FOLDER (download=True is passed to torchvision);
# the path is relative to the current working directory.
DATA_FOLDER = 'data/mnist'
print('Loading dataset...')
# Mini-batch size shared by the train and test loaders created below.
BATCH_SIZE = 300
# Training split; ToTensor converts each image to a tensor.
train_dataset = torchvision.datasets.MNIST(root=DATA_FOLDER, train=True, transform=torchvision.transforms.ToTensor(), download=True)
# No `shuffle` argument is passed, so the DataLoader's default ordering applies.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE)

# Held-out split, used only by the evaluation code further down.
test_dataset = torchvision.datasets.MNIST(root=DATA_FOLDER, train=False, transform=torchvision.transforms.ToTensor(), download=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)


Loading dataset...


## **Train**


### RBM1

In [0]:
#RBM 1
# First RBM: trained directly on flattened MNIST images.
# NOTE(review): BATCH_SIZE repeats the value set in the dataset cell; train_loader
# was already built there, so this assignment does not affect its batching.
BATCH_SIZE = 300
VISIBLE_UNITS = 784  # 28 x 28 images
HIDDEN_UNITS = 500
CD_K = 2  # Gibbs steps per contrastive-divergence update
EPOCHS = 10
rbm = RBM(VISIBLE_UNITS, HIDDEN_UNITS, CD_K, use_cuda=CUDA)
########## TRAINING RBM ##########
print('Training RBM...')
train_rbm(EPOCHS,train_loader,rbm,VISIBLE_UNITS)

Training RBM...
Epoch Error (epoch=0): 3658332.7500
Epoch Error (epoch=1): 1998634.8750
Epoch Error (epoch=2): 1850227.7500
Epoch Error (epoch=3): 1802715.2500
Epoch Error (epoch=4): 1814810.6250
Epoch Error (epoch=5): 1835410.8750
Epoch Error (epoch=6): 1869499.2500
Epoch Error (epoch=7): 1902424.1250
Epoch Error (epoch=8): 1943025.7500
Epoch Error (epoch=9): 1993747.1250


### RBM2

In [0]:
########## EXTRACT FEATURES OF PREVIOUS RBM ##########
print('Extracting features...')

# Buffer sizes come from the trained RBM and the dataset rather than from the
# notebook-level HIDDEN_UNITS / VISIBLE_UNITS / BATCH_SIZE constants: those are
# reassigned by later cells, so relying on them makes this cell give wrong
# results (or fail) when it is re-run out of order.
train_features = np.zeros((len(train_dataset), rbm.num_hidden))
train_labels = np.zeros(len(train_dataset))

# Running write position; robust to any loader batch size and to a short last batch.
offset = 0
for batch, labels in train_loader:
    batch = batch.view(len(batch), -1)  # flatten input data

    if rbm.use_cuda:
        batch = batch.cuda()

    end = offset + len(batch)
    # Hidden-unit probabilities of the first RBM are the features for the second one.
    train_features[offset:end] = rbm.sample_hidden(batch).cpu().numpy()
    train_labels[offset:end] = labels.numpy()
    offset = end

# Prep data for RBM2
tensor_y = torch.from_numpy(train_labels)
# Single vectorised float32 conversion (replaces stacking one tensor per row).
tensor_x = torch.from_numpy(train_features).float()
my_dataset = torch.utils.data.TensorDataset(tensor_x,tensor_y)
my_dataloader = torch.utils.data.DataLoader(my_dataset, batch_size=BATCH_SIZE)

Extracting features...


In [0]:
def train_rbm_2(EPOCHS,train_loader,rbm,VISIBLE_UNITS):
    """Train `rbm` with contrastive divergence on already-flat feature batches.

    Same loop as `train_rbm`, except batches are used as delivered (no
    flattening step).

    Args:
        EPOCHS: number of full passes over `train_loader`.
        train_loader: iterable yielding (batch, labels) pairs; labels are ignored.
        rbm: object exposing `use_cuda` and `contrastive_divergence(batch)`.
        VISIBLE_UNITS: unused; kept so the signature matches `train_rbm`.

    Returns:
        None. One 'Epoch Error' line is printed per epoch with the sum of the
        per-batch reconstruction errors.
    """
    # Follow the RBM's own device flag rather than a notebook-level global, so
    # batches always end up on the same device as the parameters they meet.
    move_to_gpu = rbm.use_cuda

    for epoch in range(EPOCHS):
        epoch_error = 0.0
        for batch, _ in train_loader:
            if move_to_gpu:
                batch = batch.cuda()
            batch_error = rbm.contrastive_divergence(batch)
            epoch_error += batch_error

        print('Epoch Error (epoch=%d): %.4f' % (epoch, epoch_error))
      
# Second RBM: trained on the hidden-unit features extracted from the first RBM.
# NOTE(review): my_dataloader was already constructed in the feature-extraction
# cell, so this assignment does not change the batch size used for training here.
# NOTE(review): BATCH_SIZE, VISIBLE_UNITS and HIDDEN_UNITS overwrite the globals
# that earlier cells read; re-running those cells afterwards sees these values.
BATCH_SIZE = 50
VISIBLE_UNITS = 500  # width of the feature vectors produced by the first RBM (not image pixels)
HIDDEN_UNITS = 500
CD_K = 2  # Gibbs steps per contrastive-divergence update
EPOCHS = 10
rbm2 = RBM(VISIBLE_UNITS, HIDDEN_UNITS, CD_K, use_cuda=CUDA)
########## TRAINING RBM ##########
print('Training RBM...')
train_rbm_2(EPOCHS,my_dataloader,rbm2,VISIBLE_UNITS)

Training RBM...
Epoch Error (epoch=0): 3373277.5000
Epoch Error (epoch=1): 2694158.7500
Epoch Error (epoch=2): 2201942.7500
Epoch Error (epoch=3): 1893600.1250
Epoch Error (epoch=4): 1703827.6250
Epoch Error (epoch=5): 1574179.2500
Epoch Error (epoch=6): 1475032.3750
Epoch Error (epoch=7): 1405237.8750
Epoch Error (epoch=8): 1344130.0000
Epoch Error (epoch=9): 1288003.6250


### Fine-tune using backprop

In [0]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms


# Device configuration: use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters for the supervised fine-tuning stage
input_size = 784  # flattened 28 x 28 image
hidden_size = 500  # must equal the first RBM's hidden width so its weights fit fc1
num_classes = 10
num_epochs = 50
# NOTE(review): batch_size is not referenced by the training code below, which
# iterates over the train_loader built earlier with its own batch size.
batch_size = 500
learning_rate = 0.001

# Fully connected neural network with one hidden layer
# Fully connected neural network with one hidden layer
class NeuralNet(nn.Module):
    """Input -> Linear -> ReLU -> Linear classifier whose first layer starts from RBM weights.

    Args:
        input_size: width of the flattened input.
        hidden_size: width of the hidden layer.
        num_classes: number of output logits.
        pretrained_rbm: object with a `weights` tensor of shape
            (input_size, hidden_size). When omitted, the notebook-level `rbm`
            is used, which matches the previous behaviour.

    Raises:
        ValueError: if the RBM weights do not match the shape of `fc1.weight`.

    NOTE(review): only the RBM weight matrix is transferred; `fc1.bias` and
    `fc2` keep their default initialisation.
    """

    def __init__(self, input_size, hidden_size, num_classes, pretrained_rbm=None):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

        # Initialise with rbm weights
        if pretrained_rbm is None:
            pretrained_rbm = rbm  # fall back to the RBM trained earlier in the notebook

        # nn.Linear stores its weight as (out_features, in_features), i.e. the
        # transpose of the RBM's (visible, hidden) layout.
        pretrained_weights = pretrained_rbm.weights.t()
        if pretrained_weights.shape != self.fc1.weight.shape:
            raise ValueError(
                'RBM weights of shape %s cannot initialise a layer of shape %s'
                % (tuple(pretrained_weights.shape), tuple(self.fc1.weight.shape))
            )
        # Copy the values instead of wrapping the transposed view in a Parameter:
        # the view shares storage with the RBM, so fine-tuning would otherwise
        # overwrite the pretrained RBM weights in place.
        with torch.no_grad():
            self.fc1.weight.copy_(pretrained_weights)

    def forward(self, x):
        """Return the class logits for a batch `x` of flattened inputs."""
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# Build the classifier (first layer initialised from the RBM) and move it to `device`.
model = NeuralNet(input_size, hidden_size, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  

# Train the model
# Number of mini-batches per epoch; only used in the progress message.
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):  
        # Move tensors to the configured device
        # Each image is flattened to a 784-wide row first.
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        # Gradients accumulate across backward() calls, so clear them each step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Report the current mini-batch loss every 100 steps.
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

# Test the model
# In test phase, we don't need to compute gradients (for memory efficiency)
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)
        outputs = model(images)
        # Predicted class = index of the largest logit in each row.
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # NOTE(review): the "10000" in the message is hard-coded; the percentage itself uses `total`.
    print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))

# Save the model checkpoint
# Written to the current working directory.
torch.save(model.state_dict(), 'model.ckpt')

Epoch [1/50], Step [100/200], Loss: 0.1297
Epoch [1/50], Step [200/200], Loss: 0.1098
Epoch [2/50], Step [100/200], Loss: 0.0463
Epoch [2/50], Step [200/200], Loss: 0.0641
Epoch [3/50], Step [100/200], Loss: 0.0260
Epoch [3/50], Step [200/200], Loss: 0.0424
Epoch [4/50], Step [100/200], Loss: 0.0167
Epoch [4/50], Step [200/200], Loss: 0.0276
Epoch [5/50], Step [100/200], Loss: 0.0119
Epoch [5/50], Step [200/200], Loss: 0.0168
Epoch [6/50], Step [100/200], Loss: 0.0093
Epoch [6/50], Step [200/200], Loss: 0.0103
Epoch [7/50], Step [100/200], Loss: 0.0073
Epoch [7/50], Step [200/200], Loss: 0.0069
Epoch [8/50], Step [100/200], Loss: 0.0059
Epoch [8/50], Step [200/200], Loss: 0.0049
Epoch [9/50], Step [100/200], Loss: 0.0048
Epoch [9/50], Step [200/200], Loss: 0.0037
Epoch [10/50], Step [100/200], Loss: 0.0039
Epoch [10/50], Step [200/200], Loss: 0.0029
Epoch [11/50], Step [100/200], Loss: 0.0032
Epoch [11/50], Step [200/200], Loss: 0.0024
Epoch [12/50], Step [100/200], Loss: 0.0026
Epoch 