<a href="https://colab.research.google.com/github/kashish049/Deep-Learning-Assignments/blob/main/Assignment_4(21dcs024).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn.functional as F
import wandb
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

In [None]:
def load_data(dataset, batch_size=64):
    """Build the train and test ``DataLoader`` objects for a supported dataset.

    The images are converted to tensors and normalized with mean 0.5 and
    standard deviation 0.5. Data is stored under ``./data`` and downloaded
    when it is not already present.

    Args:
        dataset: Name of the dataset, either ``"MNIST"`` or ``"CIFAR10"``.
        batch_size: Number of samples per batch for both loaders
            (defaults to 64, the value that used to be hard-coded).

    Returns:
        A ``(train_loader, test_loader)`` tuple. The training loader shuffles
        its samples; the test loader keeps them in order.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
    """
    # Validate first so an unsupported name fails before any object is built
    # or any download is attempted.
    if dataset not in ("MNIST", "CIFAR10"):
        raise ValueError("Dataset not supported")

    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
    dataset_cls = datasets.MNIST if dataset == "MNIST" else datasets.CIFAR10

    train_dataset = dataset_cls(root="./data", train=True, transform=transform, download=True)
    test_dataset = dataset_cls(root="./data", train=False, transform=transform, download=True)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, test_loader

In [None]:
def get_input_size(dataset):
    """Return the flattened input length used for the given dataset.

    ``"MNIST"`` maps to ``28 * 28`` and ``"CIFAR10"`` to ``32 * 32 * 3``.

    Raises:
        ValueError: If ``dataset`` is neither of the two supported names.
    """
    flat_sizes = (
        ("MNIST", 28 * 28),
        ("CIFAR10", 32 * 32 * 3),
    )
    for known_name, flat_size in flat_sizes:
        if dataset == known_name:
            return flat_size
    raise ValueError("Dataset not supported")

def get_output_size(dataset):
    """Return the number of output units used for the given dataset.

    Both supported datasets (``"MNIST"`` and ``"CIFAR10"``) map to 10.

    Raises:
        ValueError: If ``dataset`` is neither of the two supported names.
    """
    if dataset != "MNIST" and dataset != "CIFAR10":
        raise ValueError("Dataset not supported")
    return 10

# Dropout function
def dropout_layer(X, dropout):
    """Apply inverted dropout to ``X``.

    Each element of ``X`` is zeroed independently with probability
    ``dropout``; the elements that are kept are divided by ``1 - dropout``.

    Args:
        X: Input tensor.
        dropout: Probability of dropping an element, in the range [0, 1].

    Returns:
        A tensor with the same shape as ``X``. For ``dropout == 0`` the input
        is returned unchanged; for ``dropout == 1`` the result is all zeros.

    Raises:
        ValueError: If ``dropout`` lies outside [0, 1].
    """
    if not 0.0 <= dropout <= 1.0:
        raise ValueError(f"dropout must be in [0, 1], got {dropout}")
    # Everything is dropped: return zeros directly, because the general
    # formula below would divide by zero and produce NaN.
    if dropout == 1.0:
        return torch.zeros_like(X)
    # Nothing is dropped: skip the random mask entirely (rand_like can return
    # exactly 0, which the strict comparison below would still drop).
    if dropout == 0.0:
        return X
    mask = (torch.rand_like(X) > dropout).float()
    return (X * mask) / (1.0 - dropout)

# Sanity check: apply dropout_layer to a 5x5 tensor of ones with drop probability 0.3
# and print the result (entries are either 0 or 1 / (1 - 0.3)).
X = torch.ones(5, 5)
print(dropout_layer(X, dropout=0.3))

tensor([[0.0000, 0.0000, 1.4286, 1.4286, 1.4286],
        [1.4286, 1.4286, 0.0000, 1.4286, 0.0000],
        [1.4286, 1.4286, 0.0000, 1.4286, 0.0000],
        [0.0000, 1.4286, 1.4286, 1.4286, 0.0000],
        [1.4286, 1.4286, 1.4286, 1.4286, 1.4286]])


In [None]:
import torch
import torch.nn.functional as F
import wandb

# Define a simple neural network with dropout
def train_network(dataset, model_type, activation, hidden_units, dropout_rate, learning_rate, momentum, epochs=10):
    """Train a small fully connected network and log per-epoch metrics to wandb.

    The network multiplies the flattened input by ``W1``, applies the
    activation, optionally passes through a dropout + second hidden layer
    (``W2``), and projects to the outputs with ``W3`` followed by
    ``log_softmax``. No bias terms are used.

    Args:
        dataset: Dataset name understood by ``load_data``, ``get_input_size``
            and ``get_output_size``.
        model_type: ``'DropoutNN'`` adds the dropout + second hidden layer;
            any other value builds the network with a single hidden layer.
        activation: ``'ReLU'`` selects ReLU; any other value selects sigmoid.
        hidden_units: Width of the hidden layer(s).
        dropout_rate: Drop probability passed to ``dropout_layer``
            (only used when ``model_type`` is ``'DropoutNN'``).
        learning_rate: SGD learning rate.
        momentum: SGD momentum.
        epochs: Number of passes over the training set (defaults to 10, the
            value that used to be hard-coded).

    Returns:
        None. Each epoch logs ``loss`` and ``accuracy`` (training set, averaged
        over the epoch) plus ``test_loss`` and ``test_accuracy`` (test set,
        dropout disabled) to wandb.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_loader, test_loader = load_data(dataset)

    input_size = get_input_size(dataset)
    output_size = get_output_size(dataset)

    torch.manual_seed(42)

    # Initialize weights as trainable parameters directly on the target device.
    # The creation order (W1, W2, W3) is kept so the seeded values are unchanged.
    W1 = torch.nn.Parameter(torch.randn(input_size, hidden_units, device=device) * 0.01, requires_grad=True)
    W2 = torch.nn.Parameter(torch.randn(hidden_units, hidden_units, device=device) * 0.01, requires_grad=True) if model_type == 'DropoutNN' else None
    W3 = torch.nn.Parameter(torch.randn(hidden_units, output_size, device=device) * 0.01, requires_grad=True)

    def activate(Z):
        return F.relu(Z) if activation == 'ReLU' else torch.sigmoid(Z)

    def forward(X, training=True):
        # Dropout is only applied while training; evaluation uses every unit.
        H = activate(X.to(device) @ W1)
        if W2 is not None:
            if training:
                H = dropout_layer(H, dropout_rate)
            H = activate(H @ W2)
        return F.log_softmax(H @ W3, dim=1)

    def evaluate(loader):
        # Mean loss and accuracy (in percent) over a loader, without dropout
        # and without tracking gradients.
        loss_sum, correct, total = 0.0, 0, 0
        with torch.no_grad():
            for batch_X, batch_y in loader:
                batch_X = batch_X.view(batch_X.shape[0], -1).to(device)
                batch_y = batch_y.to(device)
                output = forward(batch_X, training=False)
                loss_sum += F.nll_loss(output, batch_y, reduction='sum').item()
                correct += (output.argmax(dim=1) == batch_y).sum().item()
                total += batch_y.size(0)
        return loss_sum / max(total, 1), 100. * correct / max(total, 1)

    optimizer = torch.optim.SGD([W1, W3] if W2 is None else [W1, W2, W3], lr=learning_rate, momentum=momentum)

    wandb.init(project="dropout_experiments", config={
        "dataset": dataset,
        "model_type": model_type,
        "activation": activation,
        "hidden_units": hidden_units,
        "dropout_rate": dropout_rate,
        "learning_rate": learning_rate,
        "momentum": momentum,
        "epochs": epochs
    })

    # try/finally so the wandb run is closed even when training raises.
    try:
        for _ in range(epochs):
            loss_sum, correct, total = 0.0, 0, 0
            for batch_X, batch_y in train_loader:
                batch_X = batch_X.view(batch_X.shape[0], -1).to(device)  # Flatten and move to device
                batch_y = batch_y.to(device)  # Move labels to device

                optimizer.zero_grad()
                output = forward(batch_X, training=True)
                loss = F.nll_loss(output, batch_y)
                loss.backward()
                optimizer.step()

                batch_size = batch_y.size(0)
                # Weight by batch size so the epoch mean is exact even when
                # the last batch is smaller.
                loss_sum += loss.item() * batch_size
                correct += (output.argmax(dim=1) == batch_y).sum().item()
                total += batch_size

            test_loss, test_accuracy = evaluate(test_loader)
            # Log the epoch-average training loss rather than the loss of
            # whichever mini-batch happened to come last.
            wandb.log({
                "loss": loss_sum / max(total, 1),
                "accuracy": 100. * correct / max(total, 1),
                "test_loss": test_loss,
                "test_accuracy": test_accuracy
            })
    finally:
        wandb.finish()


In [None]:
# Run experiments: train every configuration on each dataset, one wandb run per call.
# Each tuple is (model_type, activation, layers, units); `layers` is unpacked
# but not passed on to train_network.
for dataset in ["MNIST", "CIFAR10"]:
    for model_type, activation, layers, units in [
        ("StandardNeuralNet", "Logistic", 2, 100),
        ("StandardNeuralNet", "Logistic", 2, 800),
        ("DropoutNN", "Logistic", 3, 1024),
        ("DropoutNN", "ReLU", 3, 1024),
    ]:
        train_network(
            dataset=dataset,
            model_type=model_type,
            activation=activation,
            hidden_units=units,
            dropout_rate=0.5,
            learning_rate=0.01,
            momentum=0.9,
        )

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:11<00:00, 864kB/s] 


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 135kB/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:03<00:00, 543kB/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 4.11MB/s]


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m21dcs024[0m ([33m21dcs024-national-institute-of-technology-hamirpur[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


0,1
accuracy,▁▆▇▇▇█████
loss,█▇▃▃▂▄▂▃▁▁

0,1
accuracy,96.30333
loss,0.07203


0,1
accuracy,▁▆▆▇▇▇▇███
loss,██▄▃▂▄▂▃▁▁

0,1
accuracy,95.51167
loss,0.09584


0,1
accuracy,▁▆▇▇██████
loss,█▅▃▂▂▅▁▃▁▁

0,1
accuracy,93.535
loss,0.09978


0,1
accuracy,▁▆▇▇▇█████
loss,█▅▂▁▂▂▂▃▁▁

0,1
accuracy,97.29833
loss,0.04818


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:13<00:00, 13.1MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


0,1
accuracy,▁▄▅▆▆▇▇▇██
loss,▇▅▃▅▅▇█▃▄▁

0,1
accuracy,50.79
loss,1.12691


Files already downloaded and verified
Files already downloaded and verified


0,1
accuracy,▁▃▄▅▆▆▇▇██
loss,▄▃▁▄▆▆█▂▅▁

0,1
accuracy,48.916
loss,1.25781


Files already downloaded and verified
Files already downloaded and verified


0,1
accuracy,▁▄▅▆▆▇▇▇██
loss,█▂▄▄▂▅▇▁▅▁

0,1
accuracy,40.442
loss,1.5026


Files already downloaded and verified
Files already downloaded and verified


0,1
accuracy,▁▄▅▅▆▆▇▇██
loss,▆▃█▆▆▆▆▁▄▂

0,1
accuracy,57.942
loss,1.18524


**MNIST:**

* **Best Run:** earnest-snow-5
* **Accuracy:** 97.35%
* **Activation:** ReLU
* **Model Type:** DropoutNN
* **Hidden Units:** 1024
* **Dropout Rate:** 0.5
* **Learning Rate:** 0.01
* **Momentum:** 0.9

**Analysis:** This run achieves the highest accuracy on the MNIST dataset.  The use of ReLU activation, a larger number of hidden units (1024), and the DropoutNN architecture likely contributed to its strong performance. Dropout helps prevent overfitting, which is crucial for good generalization performance.


**CIFAR10:**

* **Best Run:** distinctive-lion-9
* **Accuracy:** 57.91%
* **Activation:** ReLU
* **Model Type:** DropoutNN
* **Hidden Units:** 1024
* **Dropout Rate:** 0.5
* **Learning Rate:** 0.01
* **Momentum:** 0.9

**Analysis:**  While this run has the highest accuracy for CIFAR10 within the provided data, 57.91% is relatively low. CIFAR10 is a more complex dataset than MNIST, and these models may be underfitting or require further tuning.  The fact that the ReLU activation and DropoutNN model also performed best here suggests these are good starting points.


#### 1. **What happens if you change the dropout probabilities for different layers?**
   - Dropout at higher rates (e.g., 0.5) reduces overfitting and forces the network to generalize better, leading to lower training accuracy but possibly higher test accuracy.
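   - For example, **"distinctive-lion-9"** and **"easy-frost-8"** both use a dropout rate of 0.5 and reach 57.91% and 40.51% accuracy respectively on CIFAR10. The 40.51% run is lower than the CIFAR10 models without dropout (roughly 49–51%), whereas the 57.91% ReLU run is higher, so the effect of dropout here depends on the activation function.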
   - **Dropout Rate 0.5** seems effective in controlling overfitting on datasets like CIFAR10, with reduced variance across epochs.

#### 2. **Increase the number of epochs and compare dropout with no dropout:**
   - Training with dropout (e.g., **"distinctive-lion-9"**) prevents the model from overfitting, even over an increased number of epochs.
   - Models with no dropout (e.g., **"rich-fire-2"**, with 96.32% accuracy on MNIST) might show accuracy comparable to or higher than some models with dropout, but they risk overfitting as the number of epochs increases.

#### 3. **Variance of activations with and without dropout:**
   - The variance of activations in hidden layers is generally reduced when dropout is applied because it forces the model to rely on multiple paths.
   - Models with dropout (e.g., **"distinctive-lion-9"**) would show lower variance of activations, aiding in regularization. However, this variance would be higher in models without dropout (e.g., **"rich-fire-2"**) due to potential overfitting.
   - The variance likely evolves less erratically when dropout is applied, indicating smoother convergence.

#### 4. **Why is dropout not used at test time?**
   - Dropout is a regularization technique, and its stochastic nature can cause fluctuating test results, leading to inaccurate predictions.
   - At test time, the model uses all its parameters to provide deterministic outputs, thus eliminating the noise introduced by dropout during training.
   - The given data does not show test-time dropout usage, aligning with this principle.

#### 5. **Compare dropout with weight decay:**
   - **Dropout**: Reduces overfitting by randomly "turning off" units, forcing the network to rely on different subsets of neurons.
   - **Weight Decay**: Penalizes large weights, constraining the model to simpler solutions and avoiding overfitting.
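   - Combining both could result in diminishing returns if both are applied too aggressively. Note that the runs in this notebook (e.g., **"distinctive-lion-9"**) use dropout only: the SGD optimizer is configured without weight decay, so this combination was not tested here.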
   - However, in some cases, using both techniques simultaneously helps balance the need for reducing model complexity and avoiding overfitting.

#### 6. **Dropout on individual weights instead of activations:**
   - Dropout on activations is more effective in preventing overfitting by forcing the model to learn more robust features. Applying dropout to weights might disrupt the network's learning process, as weights define the strength of the connections between neurons.
   - The results of the current experiments do not directly involve dropout on weights, but based on theory and practice, dropout on activations will yield better generalization.
