# CSE 151A - PA4
By: Jonathan Lo<br>
Date: 8/3/24

## Questions

### Q1a

**Original Version**:
![Original Version](https://cdn.discordapp.com/attachments/942218891952783421/1136702566848270437/image.png)

**Normalized Version**:
![Normalized Version](https://cdn.discordapp.com/attachments/942218891952783421/1136702584833441853/HdhRMpgMBgMXYsRKYPBYDB0LUakDAaDwdC1GJEyGAwGQ9diRMpgMBgMXYsRKYPBYDB0LUakDAaDwdC1GJEyGAwGQ9diRMpgMBgMXYsRKYPBYDB0Lf8HMHqoZe3IpicAAAAASUVORK5CYII.png)

### Q1b

![](https://cdn.discordapp.com/attachments/942218891952783421/1136705746759864410/image.png)

### Q2a

In [None]:
def train(train_loader, model, criterion, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader``.

    Relies on names defined elsewhere in the notebook: ``AverageMeter``,
    ``accuracy``, ``device``, ``print_freq`` and ``num_epochs``.

    Args:
        train_loader: Iterable yielding ``(inputs, targets)`` batches.
        model: Network to optimise; it is switched to training mode here.
        criterion: Loss, called as ``criterion(output, targets.long())``.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        epoch: Index of the current epoch; only used in the progress log.

    Returns:
        ``losses.avg``: the running average of the per-batch loss values
        recorded during this epoch.
    """
    # batch_time / data_time are tracked but not included in the log line.
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (inputs, targets) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # Plain tensors take part in autograd directly, so the deprecated
        # torch.autograd.Variable wrapper is not needed; each tensor is
        # moved to the device exactly once.
        inputs = inputs.to(device)
        targets = targets.to(device)

        # compute output and loss
        output = model(inputs)
        loss = criterion(output, targets.long())

        # measure accuracy and record loss; detach() keeps the metric
        # computation out of the autograd graph
        prec1 = accuracy(output.detach(), targets)
        batch_size = inputs.size(0)
        losses.update(loss.item(), batch_size)
        top1.update(prec1[0][0], batch_size)

        # compute gradient and take one optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0:
            curr_lr = optimizer.param_groups[0]['lr']
            print('Epoch: [{0}/{1}][{2}/{3}]\t'
                  'LR: {4}\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Train Acc {top1.val:.3f} ({top1.avg:.3f})'.format(
                   epoch, num_epochs, i, len(train_loader), curr_lr,
                   loss=losses, top1=top1))

    # Return the average training loss for the current epoch
    return losses.avg

# Main training run: train for `num_epochs` epochs, validate after every
# epoch, save a checkpoint each time, then plot the per-epoch training loss.
# best_prec1 tracks the highest value returned by validate() so far;
# total_loss collects the value returned by train() for each epoch
# (the average training loss of that epoch).
best_prec1 = 0
total_loss = []
for epoch in range(num_epochs):
    # Step decay: at every epoch listed in `lr_step`, scale the learning
    # rate of each parameter group by 0.1.
    if epoch in lr_step:
        for param_group in optimizer.param_groups:
            param_group['lr'] *= 0.1

    # train for one epoch
    train_loss = train(train_loader, model, criterion, optimizer, epoch)
    total_loss.append(train_loss)

    # evaluate on validation set
    # prec1 = 0
    # NOTE(review): validate() is defined elsewhere; presumably it returns
    # the validation accuracy (prec@1) -- confirm.
    prec1 = validate(val_loader, model, criterion)

    # remember best prec@1 and save checkpoint
    # is_best must be computed before best_prec1 is updated, otherwise a new
    # best could never compare as strictly greater.
    is_best = prec1 > best_prec1
    best_prec1 = max(prec1, best_prec1)
    # 'epoch' is stored as epoch + 1, i.e. the number of completed epochs.
    # NOTE(review): save_checkpoint() is defined elsewhere; is_best is
    # presumably used to keep a copy of the best model -- confirm.
    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'best_prec1': best_prec1,
        'optimizer': optimizer.state_dict(),
    }, is_best,filename="checkpoint.pth.tar")
    print("-----------------------------------------------")
    
    # if epoch%print_freq==0:
    #     plot_decision_boundary(model)

# plot_decision_boundary(model)
# Plot the average training loss against 1-based epoch numbers.
plt.plot(range(1, num_epochs + 1), total_loss)
plt.xlabel('Epoch')
plt.ylabel('Training Loss')
plt.title('Training Loss vs. Epoch')
plt.show()

![](https://cdn.discordapp.com/attachments/942218891952783421/1136711194988773477/wftWjtosO5yxAAAAABJRU5ErkJggg.png)

The training loss shows occasional large spikes, but it follows a general downward trend as the number of epochs increases.

### Q2b

In [None]:
# Learning-rate sweep: train the model once per candidate learning rate and
# plot training and validation accuracy per epoch for each run.
learning_rates = [1, 0.1, 0.01, 0.001, 0.0001]
# Step-decay factor applied at the epochs in `lr_step`; the same 0.1 factor
# the main training loop uses. The schedule must not depend on the learning
# rate being tested, otherwise the runs are not comparable.
lr_decay = 0.1
train_accuracies = []
val_accuracies = []

# Snapshot the current weights so every learning rate starts from the same
# point. Without this, each run would continue from the weights left behind
# by the previous learning rate.
initial_state = {name: tensor.clone() for name, tensor in model.state_dict().items()}

# The loop variable is deliberately not called `lr`, so the notebook-level
# `lr` setting is not overwritten by the sweep.
for candidate_lr in learning_rates:
    model.load_state_dict(initial_state)
    optimizer = torch.optim.Adam(model.parameters(), lr=candidate_lr, weight_decay=weight_decay)
    for epoch in range(num_epochs):
        if epoch in lr_step:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= lr_decay

        # Train for one epoch. train() returns the average *loss*, which is
        # not what this plot shows, so its return value is not stored.
        train(train_loader, model, criterion, optimizer, epoch)

        # Store the accuracy for each epoch, measuring both splits with the
        # same routine so the two curves are directly comparable.
        # NOTE(review): assumes validate() returns an accuracy and accepts
        # any data loader -- confirm against its definition.
        train_accuracies.append(validate(train_loader, model, criterion))
        val_accuracies.append(validate(val_loader, model, criterion))

plt.figure(figsize=(10, 6))
epochs = range(1, num_epochs + 1)
for i, candidate_lr in enumerate(learning_rates):
    # The accuracy lists are flat: run i occupies one num_epochs-long slice.
    run = slice(i * num_epochs, (i + 1) * num_epochs)
    plt.plot(epochs, train_accuracies[run], label=f"Train LR={candidate_lr}")
    plt.plot(epochs, val_accuracies[run], label=f"Val LR={candidate_lr}", linestyle='dashed')

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy for Different Learning Rates')
plt.legend()
plt.grid(True)
plt.show()

![](https://cdn.discordapp.com/attachments/942218891952783421/1136714751741796373/0gERH1SnxqHxER9VmKomDHjh149NFHzQ6FiIj6GH5HioiIiIiIyEcspIiIiIiIiHzE70gREVGfxbvXiYjIKLwiRURERERE5CMWUkRERERERD5iIUVEREREROQjFlJEREREREQYiFFRERERETkIxZSREREREREPmIhRURERERE5CMWUkRERERERD76P8nTw5I8NWGRAAAAAElFTkSuQmCC.png)

The best learning rate is `0.1` because the accuracy continues to increase through each epoch. The others remain constant or dive down wildly.

### Q2c

In [None]:
# Compare a deep and a shallow network: train each for `num_epochs` epochs,
# then report final accuracies and parameter counts.
# Define models
models = [
    linear_nn([2, 20, 10, 10, 2], activations).to(device),
    linear_nn([2, 100, 2], activations).to(device),
]
# Step-decay factor applied at the epochs in `lr_step` (the same 0.1 factor
# the main training loop uses), independent of the learning rate itself.
lr_decay = 0.1
train_accuracies = []
val_accuracies = []
for model in models:
    # NOTE(review): `lr` is whatever the notebook last bound to that name;
    # an earlier loop that uses `lr` as its loop variable leaves its final
    # value here. Confirm it holds the intended learning rate.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    for epoch in range(num_epochs):
        if epoch in lr_step:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= lr_decay

        # Training must happen inside the epoch loop; otherwise each model
        # is trained for a single epoch only.
        train(train_loader, model, criterion, optimizer, epoch)

    # Get final accuracies. train() returns the average loss, so accuracy on
    # the training split is measured with validate() instead.
    # NOTE(review): assumes validate() returns an accuracy and accepts any
    # data loader -- confirm against its definition.
    train_accuracies.append(validate(train_loader, model, criterion))
    val_accuracies.append(validate(val_loader, model, criterion))

print("-----------------------")
print("Deep Model:")
print("Final Training Accuracy:", train_accuracies[0])
print("Final Testing Accuracy:", val_accuracies[0])
print("Num Params:", sum(p.numel() for p in models[0].parameters()))

print("Shallow Model:")
print("Final Training Accuracy:", train_accuracies[1])
print("Final Testing Accuracy:", val_accuracies[1])
print("Num Params:", sum(p.numel() for p in models[1].parameters()))

```
Deep Model:
Final Training Accuracy: 0.700875997543335
Final Testing Accuracy: 0.501231669087068
Num Params: 402

Shallow Model:
Final Training Accuracy: 0.6991315722465515
Final Testing Accuracy: 0.7566677862757319
Num Params: 502
```

Although the deep model has a better training accuracy, the shallow model is the better choice because its final testing accuracy is higher (0.757 vs. 0.501), even though the two models have a comparable number of parameters (502 vs. 402).