# 1 introduction

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.optim as optim
from torch.utils.data import DataLoader

import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.nn.functional import conv2d, max_pool2d, cross_entropy

plt.rc("figure", dpi=100)

batch_size = 100

In [2]:
# transform images into normalized tensors
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,))
])

train_dataset = datasets.MNIST(
    "./",
    download=True,
    train=True,
    transform=transform,
)

test_dataset = datasets.MNIST(
    "./",
    download=True,
    train=False,
    transform=transform,
)

train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
    pin_memory=True,
)

test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=1,
    pin_memory=True,
)

In [3]:
def init_weights(shape):
    # Kaiming He initialization (a good initialization is important)
    # https://arxiv.org/abs/1502.01852
    std = np.sqrt(2. / shape[0])
    w = torch.randn(size=shape) * std
    w.requires_grad = True
    return w


def rectify(x):
    # Rectified Linear Unit (ReLU)
    return torch.max(torch.zeros_like(x), x)

In [4]:
class RMSprop(optim.Optimizer):
    """
    This is a reduced version of the PyTorch internal RMSprop optimizer
    It serves here as an example
    """
    def __init__(self, params, lr=1e-3, alpha=0.5, eps=1e-8):
        defaults = dict(lr=lr, alpha=alpha, eps=eps)
        super(RMSprop, self).__init__(params, defaults)

    def step(self):
        for group in self.param_groups:
            for p in group['params']:
                grad = p.grad.data
                state = self.state[p]

                # state initialization
                if len(state) == 0:
                    state['square_avg'] = torch.zeros_like(p.data)

                square_avg = state['square_avg']
                alpha = group['alpha']

                # update running averages
                square_avg.mul_(alpha).addcmul_(grad, grad, value=1 - alpha)
                avg = square_avg.sqrt().add_(group['eps'])

                # gradient update
                p.data.addcdiv_(grad, avg, value=-group['lr'])

In [5]:
# define the neural network
def model(x, w_h, w_h2, w_o):
    h = rectify(x @ w_h)
    h2 = rectify(h @ w_h2)
    pre_softmax = h2 @ w_o
    return pre_softmax

In [None]:
# initialize weights

# input shape is (B, 784)
w_h = init_weights((784, 625))
# hidden layer with 625 neurons
w_h2 = init_weights((625, 625))
# hidden layer with 625 neurons
w_o = init_weights((625, 10))
# output shape is (B, 10)

optimizer = RMSprop(params=[w_h, w_h2, w_o])


n_epochs = 100

train_loss = []
test_loss = []

# put this into a training loop over 100 epochs
for epoch in range(n_epochs + 1):
    train_loss_this_epoch = []
    for idx, batch in enumerate(train_dataloader):
        x, y = batch

        # our model requires flattened input
        x = x.reshape(batch_size, 784)
        # feed input through model
        noise_py_x = model(x, w_h, w_h2, w_o)

        # reset the gradient
        optimizer.zero_grad()

        # the cross-entropy loss function already contains the softmax
        loss = cross_entropy(noise_py_x, y, reduction="mean")

        train_loss_this_epoch.append(float(loss))

        # compute the gradient
        loss.backward()
        # update weights
        optimizer.step()

    train_loss.append(np.mean(train_loss_this_epoch))

    # test periodically
    if epoch % 10 == 0:
        print(f"Epoch: {epoch}")
        print(f"Mean Train Loss: {train_loss[-1]:.2e}")
        test_loss_this_epoch = []

        # no need to compute gradients for validation
        with torch.no_grad():
            for idx, batch in enumerate(test_dataloader):
                x, y = batch
                x = x.reshape(batch_size, 784)
                noise_py_x = model(x, w_h, w_h2, w_o)

                loss = cross_entropy(noise_py_x, y, reduction="mean")
                test_loss_this_epoch.append(float(loss))

        test_loss.append(np.mean(test_loss_this_epoch))

        print(f"Mean Test Loss:  {test_loss[-1]:.2e}")

plt.plot(np.arange(n_epochs + 1), train_loss, label="Train")
plt.plot(np.arange(1, n_epochs + 2, 10), test_loss, label="Test")
plt.title("Train and Test Loss over Training")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

Epoch: 0
Mean Train Loss: 3.96e-01
Mean Test Loss:  1.97e-01
Epoch: 10
Mean Train Loss: 1.42e-01
Mean Test Loss:  2.58e-01
Epoch: 20
Mean Train Loss: 1.01e-01
Mean Test Loss:  3.08e-01
Epoch: 30
Mean Train Loss: 7.46e-02
Mean Test Loss:  6.31e-01
Epoch: 40
Mean Train Loss: 4.82e-02
Mean Test Loss:  6.37e-01
Epoch: 50
Mean Train Loss: 3.92e-02
Mean Test Loss:  6.22e-01
Epoch: 60
Mean Train Loss: 2.67e-02
Mean Test Loss:  8.37e-01
Epoch: 70
Mean Train Loss: 2.16e-02
Mean Test Loss:  7.67e-01
Epoch: 80
Mean Train Loss: 2.41e-02
Mean Test Loss:  8.72e-01
Epoch: 90
Mean Train Loss: 7.05e-03
Mean Test Loss:  8.99e-01
Epoch: 100
Mean Train Loss: 4.29e-03
Mean Test Loss:  9.54e-01


<matplotlib.legend.Legend at 0x29558a3eb50>

# 2 Dropout

In [12]:
def dropout (X, p_drop =0.5) :
    if 0 < p_drop < 1:
        # randomly mask elements of X based on binomial distribution
        mask = np.random.binomial(1, p=p_drop, size=X.shape).astype(bool)
        # copy of X is created using X.clone() to avoid modifying the original tensor
        X_drop = X.clone()
        X_drop[mask] = 0
        X_drop /= (1 - p_drop)
        return X_drop
    else:
        return X

In [9]:
def dropout_model (x, w_h , w_h2 , w_o , p_drop_input , p_drop_hidden ):

    
   # apply dropout to the input layer
    x = dropout(x, p_drop_input)

    # first hidden layer
    h = rectify(x @ w_h)
    # apply dropout to the first hidden layer
    h = dropout(h, p_drop_hidden)

    # second hidden layer
    h2 = rectify(h @ w_h2)
    # apply dropout to the second hidden layer
    h2 = dropout(h2, p_drop_hidden)

    # output layer
    pre_softmax = h2 @ w_o
    return pre_softmax

In [None]:
# Set dropout probabilities
p_drop_input = 0.2
p_drop_hidden = 0.5

# initialize weights

# input shape is (B, 784)
w_h = init_weights((784, 625))
# hidden layer with 625 neurons
w_h2 = init_weights((625, 625))
# hidden layer with 625 neurons
w_o = init_weights((625, 10))
# output shape is (B, 10)

optimizer = RMSprop(params=[w_h, w_h2, w_o])


n_epochs = 100

train_loss = []
test_loss = []

# put this into a training loop over 100 epochs
for epoch in range(n_epochs + 1):
    train_loss_this_epoch = []
    for idx, batch in enumerate(train_dataloader):
        x, y = batch

        # our model requires flattened input
        x = x.reshape(batch_size, 784)
        # feed input through model
        noise_py_x = dropout_model(x, w_h, w_h2, w_o,p_drop_input,p_drop_hidden)

        # reset the gradient
        optimizer.zero_grad()

        # the cross-entropy loss function already contains the softmax
        loss = cross_entropy(noise_py_x, y, reduction="mean")

        train_loss_this_epoch.append(float(loss))

        # compute the gradient
        loss.backward()
        # update weights
        optimizer.step()

    train_loss.append(np.mean(train_loss_this_epoch))

    # test periodically
    if epoch % 10 == 0:
        print(f"Epoch: {epoch}")
        print(f"Mean Train Loss: {train_loss[-1]:.2e}")
        test_loss_this_epoch = []

        # no need to compute gradients for validation
        with torch.no_grad():
            for idx, batch in enumerate(test_dataloader):
                x, y = batch
                x = x.reshape(batch_size, 784)
                noise_py_x = dropout_model(x, w_h, w_h2, w_o,p_drop_input,p_drop_hidden)

                loss = cross_entropy(noise_py_x, y, reduction="mean")
                test_loss_this_epoch.append(float(loss))

        test_loss.append(np.mean(test_loss_this_epoch))

        print(f"Mean Test Loss:  {test_loss[-1]:.2e}")

plt.plot(np.arange(n_epochs + 1), train_loss, label="Train")
plt.plot(np.arange(1, n_epochs + 2, 10), test_loss, label="Test")
plt.title("Train and Test Loss over Training")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

Epoch: 0
Mean Train Loss: 7.40e-01
Mean Test Loss:  4.36e-01
Epoch: 10
Mean Train Loss: 6.95e-01
Mean Test Loss:  7.75e-01
Epoch: 20
Mean Train Loss: 8.49e-01
Mean Test Loss:  1.06e+00
Epoch: 30
Mean Train Loss: 9.61e-01
Mean Test Loss:  1.08e+00
Epoch: 40
Mean Train Loss: 1.06e+00
Mean Test Loss:  1.42e+00
Epoch: 50
Mean Train Loss: 1.11e+00
Mean Test Loss:  1.54e+00
Epoch: 60
Mean Train Loss: 1.18e+00
Mean Test Loss:  1.40e+00
Epoch: 70
Mean Train Loss: 1.28e+00
Mean Test Loss:  1.56e+00
Epoch: 80
Mean Train Loss: 1.35e+00
Mean Test Loss:  2.01e+00
Epoch: 90
Mean Train Loss: 1.39e+00
Mean Test Loss:  1.58e+00
Epoch: 100
Mean Train Loss: 1.39e+00
Mean Test Loss:  1.85e+00


<matplotlib.legend.Legend at 0x24d17ea5d10>

# 3Parametric Relu 

In [7]:
def PRelu (X,a):
    return np.maximum(0,X)+np.minimum(0,a*X)

In [6]:
#test PRelu function 
x=np.array([1, -1])
a=0.5
x_new=PRelu(x,a)
print(x_new)

[ 1.  -0.5]


In [None]:
# Set dropout probabilities
p_drop_input = 0.2
p_drop_hidden = 0.5

#set a
a=0.5

# initialize weights

# input shape is (B, 784)
w_h = init_weights((784, 625))
# hidden layer with 625 neurons
w_h2 = init_weights((625, 625))
# hidden layer with 625 neurons
w_o = init_weights((625, 10))
# output shape is (B, 10)

optimizer = RMSprop(params=[w_h, w_h2, w_o])


n_epochs = 100

train_loss = []
test_loss = []

# put this into a training loop over 100 epochs
for epoch in range(n_epochs + 1):
    train_loss_this_epoch = []
    for idx, batch in enumerate(train_dataloader):
        x, y = batch

        # our model requires flattened input
        x = x.reshape(batch_size, 784)
        # feed input through model
        
        #PRelu
        x=PRelu(x,a)
        
        noise_py_x = dropout_model(x, w_h, w_h2, w_o,p_drop_input,p_drop_hidden)

        # reset the gradient
        optimizer.zero_grad()

        # the cross-entropy loss function already contains the softmax
        loss = cross_entropy(noise_py_x, y, reduction="mean")

        train_loss_this_epoch.append(float(loss))

        # compute the gradient
        loss.backward()
        # update weights
        optimizer.step()

    train_loss.append(np.mean(train_loss_this_epoch))

    # test periodically
    if epoch % 10 == 0:
        print(f"Epoch: {epoch}")
        print(f"Mean Train Loss: {train_loss[-1]:.2e}")
        test_loss_this_epoch = []

        # no need to compute gradients for validation
        with torch.no_grad():
            for idx, batch in enumerate(test_dataloader):
                x, y = batch
                x = x.reshape(batch_size, 784)
                
                #PRelu
                x=PRelu(x,a)
                
                noise_py_x = dropout_model(x, w_h, w_h2, w_o,p_drop_input,p_drop_hidden)

                loss = cross_entropy(noise_py_x, y, reduction="mean")
                test_loss_this_epoch.append(float(loss))

        test_loss.append(np.mean(test_loss_this_epoch))

        print(f"Mean Test Loss:  {test_loss[-1]:.2e}")

plt.plot(np.arange(n_epochs + 1), train_loss, label="Train")
plt.plot(np.arange(1, n_epochs + 2, 10), test_loss, label="Test")
plt.title("Train and Test Loss over Training")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

Epoch: 0
Mean Train Loss: 5.76e-01
Mean Test Loss:  3.80e-01
Epoch: 10
Mean Train Loss: 5.26e-01
Mean Test Loss:  5.61e-01
Epoch: 20
Mean Train Loss: 6.36e-01
Mean Test Loss:  8.54e-01
Epoch: 30
Mean Train Loss: 7.48e-01
Mean Test Loss:  8.64e-01
Epoch: 40
Mean Train Loss: 7.85e-01
Mean Test Loss:  1.44e+00
Epoch: 50
Mean Train Loss: 8.31e-01
Mean Test Loss:  1.36e+00
Epoch: 60
Mean Train Loss: 8.92e-01
Mean Test Loss:  1.52e+00
Epoch: 70
Mean Train Loss: 9.19e-01
Mean Test Loss:  1.29e+00
Epoch: 80
Mean Train Loss: 9.92e-01
Mean Test Loss:  1.49e+00
Epoch: 90
Mean Train Loss: 9.98e-01
Mean Test Loss:  1.70e+00
Epoch: 100
Mean Train Loss: 9.96e-01
Mean Test Loss:  1.76e+00


<matplotlib.legend.Legend at 0x21b7757e810>

# 4 Convolutional layers

In [None]:
#Defining the convolutional neural network
class LeNet5(nn.Module):
    def __init__(self, num_classes):
        super(ConvNeuralNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0),
            nn.BatchNorm2d(6),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.fc = nn.Linear(400, 120)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(120, 84)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(84, num_classes)
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        out = self.relu(out)
        out = self.fc1(out)
        out = self.relu1(out)
        out = self.fc2(out)
        return out

In [None]:
#Setting Hyperparameters


model = LeNet5(num_classes).to(device)

#Setting the loss function
cost = nn.CrossEntropyLoss()

#Setting the optimizer with the model parameters and learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

#this is defined to print how many steps are remaining when training
total_step = len(train_loader)

In [None]:
#train
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):  
        images = images.to(device)
        labels = labels.to(device)
        
        #Forward pass
        outputs = model(images)
        loss = cost(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 400 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

In [None]:
# Test the model
# In test phase, we don't need to compute gradients (for memory efficiency)
  
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))


In [None]:
from torch .nn. functional import conv2d , max_pool2d

convolutional_layer = rectify ( conv2d ( previous_layer , weightvector ))
# reduces (2 ,2) window to 1 pixel
subsampling_layer = max_pool_2d ( convolutional_layer , (2 , 2) )
out_layer = dropout ( subsample_layer , p_drop_input )

In [22]:
# define the neural network
def LeNetmodel(x, w_h, w_h2, w_o,p_drop_input , p_drop_hidden):
    x=x.reshape ( -1 , 1, 28 , 28) 
    
    #h
    convolutional_layer = rectify ( conv2d ( x , w_h ))
    subsampling_layer = max_pool2d ( convolutional_layer , (2 , 2) )
    out_layer = dropout ( subsampling_layer , p_drop_input )
    
    convolutional_layer = rectify ( conv2d ( out_layer , w_h2 ))
    subsampling_layer = max_pool2d ( convolutional_layer , (2 , 2) )
    out_layer = dropout ( subsampling_layer , p_drop_input )
    
    convolutional_layer = rectify ( conv2d ( out_layer , w_o ))
    subsampling_layer = max_pool2d ( convolutional_layer , (2 , 2) )
    out_layer = dropout ( subsampling_layer , p_drop_input )
    
    #h2
    #print(out_layer.shape)#[100,128,1,1]
    h=out_layer.reshape(100,128)
    old_w_h2 = init_weights (( 128 , 625) )
    old_w_o = init_weights((625, 10))
    
    
    #h = rectify(x @ w_h)
    h2 = rectify(h @ old_w_h2)
    pre_softmax = h2 @ old_w_o
    return pre_softmax

In [None]:


# Set dropout probabilities
p_drop_input = 0.2
p_drop_hidden = 0.5

#set a
a=0.5

# initialize weights

# input shape is (B, 784)
#w_h = init_weights((784, 625))
# hidden layer with 625 neurons
#w_h2 = init_weights((625, 625))
# hidden layer with 625 neurons
#w_o = init_weights((625, 10))
# output shape is (B, 10)
#init_weights ((f, pic_in , k_x , k_y ))
w1=init_weights ((32, 1 , 5 , 5 ))
w2=init_weights ((64, 32 , 5 , 5 ))
w3=init_weights ((128, 64 , 2 , 2 ))

optimizer = RMSprop(params=[w1, w2, w3])


n_epochs = 100

train_loss = []
test_loss = []

# put this into a training loop over 100 epochs
for epoch in range(n_epochs + 1):
    train_loss_this_epoch = []
    for idx, batch in enumerate(train_dataloader):
        x, y = batch
        
        # our model requires flattened input
        x = x.reshape(batch_size, 784)
        # feed input through model
        
        #PRelu
        x=PRelu(x,a)
        
        noise_py_x = LeNetmodel(x, w1, w2, w3,p_drop_input,p_drop_hidden)

        # reset the gradient
        optimizer.zero_grad()

        # the cross-entropy loss function already contains the softmax
        loss = cross_entropy(noise_py_x, y, reduction="mean")

        train_loss_this_epoch.append(float(loss))

        # compute the gradient
        loss.backward()
        # update weights
        optimizer.step()

    train_loss.append(np.mean(train_loss_this_epoch))

    # test periodically
    if epoch % 10 == 0:
        print(f"Epoch: {epoch}")
        print(f"Mean Train Loss: {train_loss[-1]:.2e}")
        test_loss_this_epoch = []

        # no need to compute gradients for validation
        with torch.no_grad():
            for idx, batch in enumerate(test_dataloader):
                x, y = batch
                x = x.reshape(batch_size, 784)
                
                #PRelu
                x=PRelu(x,a)
                
                noise_py_x = LeNetmodel(x, w1, w2, w3,p_drop_input,p_drop_hidden)

                loss = cross_entropy(noise_py_x, y, reduction="mean")
                test_loss_this_epoch.append(float(loss))

        test_loss.append(np.mean(test_loss_this_epoch))

        print(f"Mean Test Loss:  {test_loss[-1]:.2e}")

plt.plot(np.arange(n_epochs + 1), train_loss, label="Train")
plt.plot(np.arange(1, n_epochs + 2, 10), test_loss, label="Test")
plt.title("Train and Test Loss over Training")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

Epoch: 0
Mean Train Loss: 3.15e+00
Mean Test Loss:  2.30e+00
Epoch: 10
Mean Train Loss: 2.30e+00
Mean Test Loss:  2.30e+00
Epoch: 20
Mean Train Loss: 2.30e+00
Mean Test Loss:  2.30e+00
Epoch: 30
Mean Train Loss: 2.30e+00
Mean Test Loss:  2.30e+00
Epoch: 40
Mean Train Loss: 2.30e+00
Mean Test Loss:  2.30e+00
