![Screenshot%202023-08-24%20024733.png](attachment:Screenshot%202023-08-24%20024733.png)

## How gradient descent can solve the problem of optimizing neural network

* Saddle points:
    saddle points are basically a flat point in the surface, the value of gradient is zero and flat surfaces. we will not be able to update the weights, we will get stuck on the saddle point. gradient descent with momentum can help to overcome this.
    
    momentum (hyper parameter) helps to achieve a non-zero velocity at the flat surface, this allows to continously update the parameter and we will not stop at saddle points. 
    
    if we pick the momentum value too small, we will get stuck at saddle point. we can perform cross-validation to check a momentum term such that we won't get stuck in the saddle point.

* Local Minima:
    local minima is another problem that gradient descent with momentum helps us overcome. if momentum is too small, the ball will get caught at local minimum and if the momentum is too large, we may over shoot the global minimum. 

In [1]:
pip show torch

Name: torch
Version: 1.13.1
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: c:\programdata\anaconda3\lib\site-packages
Requires: typing_extensions
Required-by: torchvision, torchaudio
Note: you may need to restart the kernel to use updated packages.




In [2]:
# Import the libraries for this lab

import matplotlib.pyplot as plt 
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from matplotlib.colors import ListedColormap
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms

print("CUDA available:", torch.cuda.is_available())
print("Number of GPUs:", torch.cuda.device_count())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

torch.manual_seed(1)
np.random.seed(1)


# Create dataset object
class Net(nn.Module):
    
    # Constructor
    def __init__(self, Layers):
        super(Net, self).__init__()
        self.hidden = nn.ModuleList()
        for input_size, output_size in zip(Layers, Layers[1:]):
            self.hidden.append(nn.Linear(input_size, output_size))
    
    # Prediction
    def forward(self, x):
        L = len(self.hidden)
        for (l, linear_transform) in zip(range(L), self.hidden):
            if l < L - 1:
                x = F.relu(linear_transform(x))    
            else:
                x = linear_transform(x)
        return x

    
# Define the function for training the model

def train(data_set, model, criterion, train_loader, optimizer, epochs=100):
    LOSS = []
    ACC = []
    for epoch in range(epochs):
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            yhat = model(x)
            loss = criterion(yhat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        max_memory_allocated = torch.cuda.max_memory_allocated(device=device) / (1024 ** 3)  # Convert to GB
        
        epoch_loss= loss.item()
        epoch_accuracy= accuracy(model,x,y)
        print(f"Epoch {epoch+1}/{epochs}  - "
              f"loss: {epoch_loss:.4f}  - "
              f"accuracy: {epoch_accuracy:.4f}  - "
              f"GPU max memory allocated: {max_memory_allocated:.2f} GB  - "
              f"GPU memory cached: {gpu_cached:.2f} GB")
        LOSS.append(epoch_loss)
        ACC.append(epoch_accuracy)
        

        
    results ={"Loss":LOSS, "Accuracy":ACC}
    fig, ax1 = plt.subplots()
    color = 'tab:red'
    ax1.plot(LOSS,color=color)
    ax1.set_xlabel('epoch', color=color)
    ax1.set_ylabel('total loss', color=color)
    ax1.tick_params(axis = 'y', color=color)
    
    ax2 = ax1.twinx()  
    color = 'tab:blue'
    ax2.set_ylabel('accuracy', color=color)  # we already handled the x-label with ax1
    ax2.plot(ACC, color=color)
    ax2.tick_params(axis='y', color=color)
    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    
    plt.show()
    return results


# Define a function for calculating accuracy
def accuracy(model, inputs, targets):
    correct_predictions = 0
    total_samples = len(inputs)

    _, yhat = torch.max(model(inputs), 1)

    correct_predictions = (yhat == targets).sum().item()
    
    return correct_predictions / total_samples


# Initialize a dictionary to contain the cost and accuracy
Results = {"momentum 0": {"Loss": 0, "Accuracy:": 0}, "momentum 0.1": {"Loss": 0, "Accuracy:": 0}}


# Define data transformations
transform = transforms.Compose([
    transforms.ToTensor(),         # Convert images to tensors
    transforms.Lambda(lambda x: x.view(-1))  # Flatten each image to a row
])

# Download and create MNIST datasets
train_data_set = datasets.MNIST(root='./data', train=True, transform=transform, download=True)
test_data_set = datasets.MNIST(root='./data', train=False, transform=transform, download=True)



CUDA available: True
Number of GPUs: 1
Using device: cuda


In [None]:
# Train a model with 2 hidden layers and 50 neurons in each layer
Layers = [784, 50, 50, 10] 
model = Net(Layers).to(device)
learning_rate = 0.10
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
train_loader = DataLoader(dataset=train_data_set, batch_size=20)
criterion = nn.CrossEntropyLoss().to(device)
Results["momentum 0"] = train(train_data_set, model, criterion, train_loader, optimizer, epochs=10)




Epoch 1/10  - loss: 0.0444  - accuracy: 1.0000  - GPU memory allocated: 0.00 GB  - GPU memory cached: 0.00 GB
OrderedDict([('active.all.allocated', 69016), ('active.all.current', 16), ('active.all.freed', 69000), ('active.all.peak', 23), ('active.large_pool.allocated', 0), ('active.large_pool.current', 0), ('active.large_pool.freed', 0), ('active.large_pool.peak', 0), ('active.small_pool.allocated', 69016), ('active.small_pool.current', 16), ('active.small_pool.freed', 69000), ('active.small_pool.peak', 23), ('active_bytes.all.allocated', 818878976), ('active_bytes.all.current', 407040), ('active_bytes.all.freed', 818471936), ('active_bytes.all.peak', 569344), ('active_bytes.large_pool.allocated', 0), ('active_bytes.large_pool.current', 0), ('active_bytes.large_pool.freed', 0), ('active_bytes.large_pool.peak', 0), ('active_bytes.small_pool.allocated', 818878976), ('active_bytes.small_pool.current', 407040), ('active_bytes.small_pool.freed', 818471936), ('active_bytes.small_pool.peak', 

Epoch 3/10  - loss: 0.0153  - accuracy: 1.0000  - GPU memory allocated: 0.00 GB  - GPU memory cached: 0.00 GB
OrderedDict([('active.all.allocated', 207036), ('active.all.current', 16), ('active.all.freed', 207020), ('active.all.peak', 23), ('active.large_pool.allocated', 0), ('active.large_pool.current', 0), ('active.large_pool.freed', 0), ('active.large_pool.peak', 0), ('active.small_pool.allocated', 207036), ('active.small_pool.current', 16), ('active.small_pool.freed', 207020), ('active.small_pool.peak', 23), ('active_bytes.all.allocated', 2456294912), ('active_bytes.all.current', 407040), ('active_bytes.all.freed', 2455887872), ('active_bytes.all.peak', 569344), ('active_bytes.large_pool.allocated', 0), ('active_bytes.large_pool.current', 0), ('active_bytes.large_pool.freed', 0), ('active_bytes.large_pool.peak', 0), ('active_bytes.small_pool.allocated', 2456294912), ('active_bytes.small_pool.current', 407040), ('active_bytes.small_pool.freed', 2455887872), ('active_bytes.small_pool

Epoch 5/10  - loss: 0.0047  - accuracy: 1.0000  - GPU memory allocated: 0.00 GB  - GPU memory cached: 0.00 GB
OrderedDict([('active.all.allocated', 345056), ('active.all.current', 16), ('active.all.freed', 345040), ('active.all.peak', 23), ('active.large_pool.allocated', 0), ('active.large_pool.current', 0), ('active.large_pool.freed', 0), ('active.large_pool.peak', 0), ('active.small_pool.allocated', 345056), ('active.small_pool.current', 16), ('active.small_pool.freed', 345040), ('active.small_pool.peak', 23), ('active_bytes.all.allocated', 4093710848), ('active_bytes.all.current', 407040), ('active_bytes.all.freed', 4093303808), ('active_bytes.all.peak', 569344), ('active_bytes.large_pool.allocated', 0), ('active_bytes.large_pool.current', 0), ('active_bytes.large_pool.freed', 0), ('active_bytes.large_pool.peak', 0), ('active_bytes.small_pool.allocated', 4093710848), ('active_bytes.small_pool.current', 407040), ('active_bytes.small_pool.freed', 4093303808), ('active_bytes.small_pool

Epoch 7/10  - loss: 0.0033  - accuracy: 1.0000  - GPU memory allocated: 0.00 GB  - GPU memory cached: 0.00 GB
OrderedDict([('active.all.allocated', 483076), ('active.all.current', 16), ('active.all.freed', 483060), ('active.all.peak', 23), ('active.large_pool.allocated', 0), ('active.large_pool.current', 0), ('active.large_pool.freed', 0), ('active.large_pool.peak', 0), ('active.small_pool.allocated', 483076), ('active.small_pool.current', 16), ('active.small_pool.freed', 483060), ('active.small_pool.peak', 23), ('active_bytes.all.allocated', 5731126784), ('active_bytes.all.current', 407040), ('active_bytes.all.freed', 5730719744), ('active_bytes.all.peak', 569344), ('active_bytes.large_pool.allocated', 0), ('active_bytes.large_pool.current', 0), ('active_bytes.large_pool.freed', 0), ('active_bytes.large_pool.peak', 0), ('active_bytes.small_pool.allocated', 5731126784), ('active_bytes.small_pool.current', 407040), ('active_bytes.small_pool.freed', 5730719744), ('active_bytes.small_pool

Epoch 9/10  - loss: 0.0023  - accuracy: 1.0000  - GPU memory allocated: 0.00 GB  - GPU memory cached: 0.00 GB
OrderedDict([('active.all.allocated', 621096), ('active.all.current', 16), ('active.all.freed', 621080), ('active.all.peak', 23), ('active.large_pool.allocated', 0), ('active.large_pool.current', 0), ('active.large_pool.freed', 0), ('active.large_pool.peak', 0), ('active.small_pool.allocated', 621096), ('active.small_pool.current', 16), ('active.small_pool.freed', 621080), ('active.small_pool.peak', 23), ('active_bytes.all.allocated', 7368542720), ('active_bytes.all.current', 407040), ('active_bytes.all.freed', 7368135680), ('active_bytes.all.peak', 569344), ('active_bytes.large_pool.allocated', 0), ('active_bytes.large_pool.current', 0), ('active_bytes.large_pool.freed', 0), ('active_bytes.large_pool.peak', 0), ('active_bytes.small_pool.allocated', 7368542720), ('active_bytes.small_pool.current', 407040), ('active_bytes.small_pool.freed', 7368135680), ('active_bytes.small_pool

In [None]:
# Train a model with 2 hidden layer and 50 neurons in each layer with 0.1 momentum
Layers = [784, 50, 50, 10] 
model = Net(Layers)
learning_rate = 0.10
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.1)
train_loader = DataLoader(dataset=train_data_set, batch_size=20)
criterion = nn.CrossEntropyLoss()
Results["momentum 0.1"] = train(train_data_set, model, criterion, train_loader, optimizer, epochs=10)


In [None]:
# Train a model with 2 hidden layer and 50 neurons in each layer with 0.2 momentum
Layers = [784, 50, 50, 10] 
model = Net(Layers)
learning_rate = 0.10
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.2)
train_loader = DataLoader(dataset=train_data_set, batch_size=20)
criterion = nn.CrossEntropyLoss()
Results["momentum 0.2"] = train(train_data_set, model, criterion, train_loader, optimizer, epochs=100)


In [None]:
# Train a model with 2 hidden layer and 50 neurons in each layer with 0.4 momentum
Layers = [784, 50, 50, 10] 
model = Net(Layers)
learning_rate = 0.10
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.4)
train_loader = DataLoader(dataset=train_data_set, batch_size=20)
criterion = nn.CrossEntropyLoss()
Results["momentum 0.4"] = train(train_data_set, model, criterion, train_loader, optimizer, epochs=100)


In [None]:
# Train a model with 2 hidden layer and 50 neurons in each layer with 0.5 momentum

Layers = [784, 50, 50, 10] 
model = Net(Layers)
learning_rate = 0.10
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.5)
train_loader = DataLoader(dataset=train_data_set, batch_size=20)
criterion = nn.CrossEntropyLoss()
Results["momentum 0.5"] = train(train_data_set, model, criterion, train_loader, optimizer, epochs=100)
