<a href="https://colab.research.google.com/github/jwang44/upgraded-octo-chainsaw/blob/main/submit/parameter_select.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ECSE 551 3rd Mini-Project
Group 10: Junhao Wang, Yinan Zhou, Ruilin Ji

This notebook contains all code related to **hyperparameter tuning**.

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the project folder on Drive the working directory (the data paths used
# later are relative to it) and list its contents.
%cd '/content/drive/MyDrive/imageunderstanding'
!ls

/content/drive/MyDrive/imageunderstanding
ExampleSubmissionRandom.csv  Test.pkl	       VAL_ACCU_RES34_ROTATE.csv
Load_data.ipynb		     Train_labels.csv
PRED_RESULT.csv		     Train.pkl


In [None]:
# File names, relative to the current working directory.
TRAIN_DATA_PATH = "Train.pkl"           # pickled training images (read by MyDataset)
TRAIN_LABEL_PATH = "Train_labels.csv"   # CSV of training labels (read by MyDataset)
TEST_DATA_PATH = "Test.pkl"             # pickled test images (read by MyTestSet)
CSV_OUTPUT_PATH = "PRED_RESULT.csv"     # presumably the prediction output file; not used in the cells shown here

In [None]:
import pickle
import matplotlib.pyplot as plt
import numpy as np
from torchvision import transforms
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, TensorDataset
from PIL import Image
import torch
import pandas as pd

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cuda')

In [None]:
# Fix PyTorch's random seeds (CPU and CUDA) so that runs are repeatable.
torch.manual_seed(0)
torch.cuda.manual_seed(0)

## Dataset Class / Data Loaders

In [None]:
# Size handed to transforms.Resize in the image pipeline below.
IMG_SIZE = (224, 224)

In [None]:
# Image pipeline shared by the train, validation and test datasets.
# Transforms are common image transformations; Compose applies them in order.
img_transform = transforms.Compose([
    # Convert the PIL image to a tensor first; Resize then operates on the tensor.
    transforms.ToTensor(),
    # Disabled: normalisation.
    # transforms.Normalize([0.5], [0.5]),
    transforms.Resize(IMG_SIZE),
    # Disabled augmentations, kept for reference:
    # transforms.RandomRotation(10)
    # transforms.ColorJitter(brightness=0, contrast=0, saturation=0, hue=0)
    # transforms.RandomAffine(degrees, translate=None, scale=None, shear=None, interpolation=<InterpolationMode.NEAREST: 'nearest'>, fill=0, fillcolor=None, resample=None)
])

In [None]:
class MyDataset(Dataset):
    """Labelled image dataset backed by a pickled image collection and a CSV label file.

    Args:
        img_file: Path to the pickle file holding the images.
        label_file: Path to a comma-separated file with one header row; the
            label is read from its second column.
        transform: Optional callable applied to each PIL image in ``__getitem__``.
        idx: Optional collection of sample indices selecting a subset;
            ``None`` keeps every sample.
    """

    def __init__(self, img_file, label_file, transform=None, idx = None):
        # NOTE(security): unpickling can execute arbitrary code; only load trusted files.
        # The context manager closes the file handle as soon as loading is done.
        with open(img_file, 'rb') as img_handle:
            self.data = pickle.load(img_handle, encoding='bytes')
        # Skip the header row and keep only the second column (the label).
        self.targets = np.genfromtxt(label_file, delimiter=',', skip_header=1, usecols=1)
        if idx is not None:
            self.targets = self.targets[idx]
            # Assumes the unpickled object supports index-list selection
            # (e.g. a NumPy array) — TODO confirm.
            self.data = self.data[idx]
        self.transform = transform

    def __len__(self):
        """Return the number of (selected) samples."""
        return len(self.targets)

    def __getitem__(self, index):
        """Return ``(image, label)`` for one sample; the label is cast to ``int``."""
        img, target = self.data[index].squeeze(), int(self.targets[index])
        # Scale by 255 and build an 8-bit greyscale PIL image.
        # Assumes pixel values are stored in [0, 1] — TODO confirm.
        img = Image.fromarray((img*255).astype('uint8'), mode='L')
        if self.transform is not None:
            img = self.transform(img)
        return img, target

Get loader for all train data

In [None]:
# Mini-batch size shared by every DataLoader in this notebook.
BATCH_SIZE = 128
# idx=None keeps every training sample.
dataset = MyDataset(TRAIN_DATA_PATH, TRAIN_LABEL_PATH,transform=img_transform, idx=None)
# dataloader for all data (reshuffled on every pass)
data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

Get loaders for train/val data

In [None]:
# Fraction of the labelled data held out for validation.
VAL_SPLIT = 0.15
# Whether to permute the sample indices before splitting.
shuffle = True

# Creating indices for train and val split:
dataset_size = len(dataset)
indices = list(range(dataset_size))
# Number of validation samples (rounded down).
split = int(np.floor(VAL_SPLIT * dataset_size))
if shuffle:
  # set random seed so that we get the same split everytime
  # (this reseeds NumPy's global RNG)
  np.random.seed(0)
  np.random.shuffle(indices)
# The first `split` indices form the validation set, the rest the training set.
train_indices, val_indices = indices[split:], indices[:split]

# Each MyDataset call below re-reads the pickle/CSV files and keeps only its subset.
train_dataset = MyDataset(TRAIN_DATA_PATH, TRAIN_LABEL_PATH,transform=img_transform, idx=train_indices)
val_dataset = MyDataset(TRAIN_DATA_PATH, TRAIN_LABEL_PATH,transform=img_transform, idx=val_indices)

# separate loaders for train and val data
# NOTE(review): the training loader is not reshuffled between epochs
# (shuffle=False), unlike data_loader — confirm this is intended.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Test Dataset / Loader

In [None]:
class MyTestSet(Dataset):
  """Unlabelled image dataset backed by a pickled image collection.

  Args:
    img_file: Path to the pickle file holding the images.
    transform: Optional callable applied to each PIL image in ``__getitem__``.
  """

  def __init__(self, img_file, transform=None):
    # NOTE(security): unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the file handle as soon as loading is done.
    with open(img_file, 'rb') as img_handle:
      self.data = pickle.load(img_handle, encoding='bytes')
    self.transform = transform

  def __len__(self):
    """Return the number of images."""
    return len(self.data)

  def __getitem__(self, index):
    """Return the (optionally transformed) image at ``index``."""
    img = self.data[index].squeeze()
    # Scale by 255 and build an 8-bit greyscale PIL image.
    # Assumes pixel values are stored in [0, 1] — TODO confirm.
    img = Image.fromarray((img*255).astype('uint8'), mode='L')
    if self.transform is not None:
      img = self.transform(img)
    return img

In [None]:
# The test images go through the same transform pipeline as the training images.
test_dataset = MyTestSet(TEST_DATA_PATH,transform=img_transform)
# dataloader for test data (shuffle=False keeps the original sample order)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## CNN models

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch
import torchvision.models as models

### Simple CNN

In [None]:
class Net(nn.Module):
    """Small two-convolution CNN for single-channel images with 10 output classes.

    ``forward`` returns per-class log-probabilities of shape ``(batch, 10)``.
    """

    # This part defines the layers
    def __init__(self):
        super(Net, self).__init__()
        # At first there is only 1 channel (greyscale). The next channel size will be 10.
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        # Then, going from channel size (or feature size) 10 to 20.
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        # Feed-forward layers at the end: 20 channels * 4 * 4 = 320 features down to 50.
        self.fc1 = nn.Linear(320, 50)
        # The last layer has one output per class.
        self.fc2 = nn.Linear(50, 10)

    # And this part defines the way they are connected to each other
    # (this is the forward pass)
    def forward(self, x):
        """Map a ``(batch, 1, H, W)`` tensor to ``(batch, 10)`` log-probabilities."""
        # Shapes below are for a 1x28x28 input.
        # conv1: 10x24x24 (24 = 28 - 5 + 1); 2x2 max pooling: 10x12x12.
        # ReLU doesn't change the size.
        x = F.relu(F.max_pool2d(self.conv1(x), 2))

        # conv2: 20x8x8 (8 = 12 - 5 + 1). The adaptive pool always produces
        # 20x4x4; on an 8x8 map it is exactly a 2x2 max pool, so 28x28 inputs
        # behave as before, while larger images (e.g. 224x224) still yield the
        # 320 features that fc1 expects.
        x = F.relu(F.adaptive_max_pool2d(self.conv2(x), 4))

        # Flatten everything except the batch axis (20*4*4 = 320 features per
        # sample). Keeping the batch axis fixed means a shape mismatch can
        # never silently mix features from different samples.
        x = torch.flatten(x, 1)

        # Feed-forward layers: fc1 goes from 320 to 50 neurons.
        x = F.relu(self.fc1(x))

        # Output layer
        x = self.fc2(x)

        # Log-probabilities over the class axis; dim is given explicitly
        # because the implicit-dim form is deprecated.
        return F.log_softmax(x, dim=1)

### LeNet-5

In [None]:
class LeNet5(nn.Module):
    """LeNet-5 style network for single-channel images with 10 output classes.

    Args:
        kernel_size: Kernel size shared by the three convolutions.
        ActFunc: Currently unused; the activations are fixed to tanh.

    ``forward`` returns per-class log-probabilities of shape ``(batch, 10)``,
    the form expected by ``F.nll_loss`` and consistent with ``Net``.
    """

    def __init__(self, kernel_size=5,ActFunc=['Tanh']):
        super(LeNet5, self).__init__()
        # With the default kernel_size=5 the layer sizes work out for a
        # 1x32x32 input (see the shape notes in forward).
        self.conv1 = nn.Conv2d(1, 6, kernel_size)
        self.conv2 = nn.Conv2d(6, 16, kernel_size)
        self.conv3 = nn.Conv2d(16, 120, kernel_size)
        self.fc1 = nn.Linear(120, 84)
        self.fc2 = nn.Linear(84, 10)

    def forward(self, x):
        """Map a ``(batch, 1, 32, 32)`` tensor to ``(batch, 10)`` log-probabilities."""
        # conv1: 6x28x28, average pooling: 6x14x14
        x = F.avg_pool2d(torch.tanh(self.conv1(x)), kernel_size = (2,2),stride = 2)
        # conv2: 16x10x10, average pooling: 16x5x5
        x = F.avg_pool2d(torch.tanh(self.conv2(x)), kernel_size = (2,2),stride = 2)
        # conv3: 120x1x1
        x = torch.tanh(self.conv3(x))
        # Flatten everything except the batch axis so that a wrong input size
        # fails loudly in fc1 instead of silently changing the batch size.
        x = torch.flatten(x, 1)
        x = torch.tanh(self.fc1(x))
        x = self.fc2(x)
        # Log-probabilities over the class axis: the notebook's train/val loops
        # feed model outputs to F.nll_loss, which expects log-probabilities,
        # not the plain softmax probabilities returned previously.
        return F.log_softmax(x, dim=1)

### VGG-11

In [None]:
# VGG-11 with randomly initialised weights (pretrained=False).
# Note: every model cell in this section rebinds the same global `model`.
model = models.vgg11(pretrained=False)
# Replace the first convolution so the network accepts 1-channel input.
model.features[0] = nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
# Replace the last classifier layer so the network outputs 10 classes.
model.classifier[6] = nn.Linear(in_features=4096, out_features=10, bias=True)
model = model.to(DEVICE)

### AlexNet

In [None]:
# AlexNet with randomly initialised weights (pretrained=False).
# Note: every model cell in this section rebinds the same global `model`.
model = models.alexnet(pretrained=False)
# Replace the first convolution so the network accepts 1-channel input.
model.features[0] = nn.Conv2d(1, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
# Replace the last classifier layer so the network outputs 10 classes.
model.classifier[6] = nn.Linear(4096, 10)
model = model.to(DEVICE)

### Resnet-18

In [None]:
# ResNet-18 with randomly initialised weights (pretrained=False).
# Note: every model cell in this section rebinds the same global `model`.
model = models.resnet18(pretrained=False)
# Replace the stem convolution so the network accepts 1-channel input.
model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
# Replace the final fully connected layer so the network outputs 10 classes.
model.fc = nn.Linear(in_features=512, out_features=10, bias=True)
model = model.to(DEVICE)

### Resnet-34

In [None]:
# ResNet-34 with randomly initialised weights (pretrained=False).
# Note: every model cell in this section rebinds the same global `model`.
model = models.resnet34(pretrained=False)
# Replace the stem convolution so the network accepts 1-channel input.
model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
# Replace the final fully connected layer so the network outputs 10 classes.
model.fc = nn.Linear(in_features=512, out_features=10, bias=True)
model = model.to(DEVICE)

### Optimizer & initialization

In [None]:
# Optimizer for whichever `model` was bound by the most recently executed model cell.
# Alternatives that were tried are kept below, commented out.
# optimizer = optim.SGD(tutor_model.parameters(), lr=0.01, momentum=0.5)
# optimizer = optim.SGD(tutor_model.parameters(), lr=1, momentum=0.5)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
# optimizer = optim.RMSprop(model.parameters())

# Empty containers for training/validation history; they are not filled by
# the cells shown in this notebook.
train_losses = []
train_counter = []
val_accus = []
epochs = []
val_losses = []
# Multiples (0, 1, 2) of the training-set size — presumably x-positions for
# plotting validation results against samples seen; TODO confirm.
val_counter = [i*len(train_loader.dataset) for i in range(3)]

## Hyperparameter tuning

In [None]:
! pip install optuna
! pip install mlflow

In [None]:
import optuna
import mlflow
from pprint import pformat
from torch.optim.lr_scheduler import StepLR

In [None]:
# for hyperparameter
def train(model, device, train_loader, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Network to train; it is switched to training mode.
        device: Device the batches are moved to before the forward pass.
        train_loader: DataLoader yielding ``(data, target)`` batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        epoch: Epoch number, used only in the progress output.

    Returns:
        The mean of the per-batch losses over the epoch.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model.train()
    train_set_size = len(train_loader.dataset)
    num_batches = len(train_loader)
    if num_batches == 0:
        raise ValueError("train_loader yields no batches; cannot compute an average loss")

    train_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        # cross_entropy applies log_softmax itself, so it is correct for models
        # that return raw logits (e.g. the torchvision ResNets). Because
        # log_softmax is idempotent, it gives the same value as nll_loss for
        # models that already return log-probabilities.
        loss = F.cross_entropy(output, target)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        # Report progress every 10th batch.
        if batch_idx % 10 == 0:
            batch_size = len(data)
            print(f"Train Epoch: {epoch} [{batch_idx * batch_size}/{train_set_size} "
                  f"({100. * batch_idx / num_batches:.0f}%)]\tLoss: {loss.item():.6f}")
    avg_train_loss = train_loss / num_batches
    return avg_train_loss

# Testing loop
def val(model, device, val_loader):
    """Evaluate ``model`` on ``val_loader`` and print loss and accuracy.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        device: Device the batches are moved to before the forward pass.
        val_loader: DataLoader yielding ``(data, target)`` batches.

    Returns:
        The loss averaged over all samples of ``val_loader.dataset``.

    Raises:
        ValueError: If the validation dataset is empty.
    """
    model.eval()
    val_set_size = len(val_loader.dataset)
    if val_set_size == 0:
        raise ValueError("validation dataset is empty; cannot compute an average loss")

    val_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in val_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # Sum (not mean) per batch so that the final division by the
            # dataset size is a true per-sample average. cross_entropy is
            # correct for raw logits and, because log_softmax is idempotent,
            # equals nll_loss for log-probability outputs.
            val_loss += F.cross_entropy(output, target, reduction='sum').item()
            # Predicted class = index of the largest score.
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    val_loss /= val_set_size

    print(f"Test set: Average loss: {val_loss:.4f}, Accuracy: {correct}/{val_set_size} "
          f"({100. * correct / val_set_size:.0f}%)\n")
    return val_loss

In [None]:
def suggest_hyperparameters(trial):
    """Ask the Optuna ``trial`` for this run's hyperparameters.

    Two values are sampled: the learning rate ``"lr"`` (log scale between
    1e-5 and 1e-1) and the optimizer ``"optimizer_name"`` (``"Adam"`` or
    ``"RMSprop"``). Dropout and momentum are not tuned at the moment.

    Returns:
        A ``(lr, optimizer_name)`` tuple.
    """
    # Learning rate, sampled on a logarithmic scale.
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    # Which optimizer to build, selected by name.
    chosen_optimizer = trial.suggest_categorical("optimizer_name", ["Adam", "RMSprop"])

    # Echo everything the trial has suggested so far.
    summary = pformat(trial.params)
    print(f"Suggested hyperparameters: \n{summary}")
    return learning_rate, chosen_optimizer

In [None]:
# for simple CNN
def objective(trial):
    """Optuna objective for the simple CNN (``Net``).

    Samples a learning rate and an optimizer, trains a fresh ``Net`` for 10
    epochs on ``train_dataset`` and validates on ``val_dataset`` after each
    epoch. Parameters and per-epoch losses are logged to a new mlflow run.

    Note: later cells in this notebook define ``objective`` again; the study
    uses whichever definition was executed last.

    Args:
        trial: The Optuna trial that supplies the hyperparameters.

    Returns:
        The lowest per-epoch average validation loss seen during the run.

    Raises:
        ValueError: If the suggested optimizer name is not supported.
    """
    print("\n********************************\n")
    best_val_loss = float('Inf')

    # Start a new mlflow run
    with mlflow.start_run():
        # Get hyperparameter suggestions from optuna and log them with mlflow
        lr, optimizer_name = suggest_hyperparameters(trial)
        mlflow.log_params(trial.params)

        # Use CUDA if a GPU is available and log the device with mlflow
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        mlflow.log_param("device", device)

        # Fresh model for every trial
        model = Net().to(device)

        # Pick the optimizer suggested by optuna; fail loudly on an unknown
        # name instead of leaving `optimizer` unbound.
        if optimizer_name == "Adam":
            optimizer = optim.Adam(model.parameters(), lr=lr)
        elif optimizer_name == "RMSprop":
            optimizer = optim.RMSprop(model.parameters(), lr=lr)
        else:
            raise ValueError(f"Unsupported optimizer: {optimizer_name!r}")
        # Multiply the learning rate by 0.7 after every epoch
        scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

        # Loaders over the notebook's train/validation split
        # NOTE(review): the training loader is not reshuffled between epochs — confirm intended.
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

        # Network training & validation loop
        for epoch in range(0, 10):
            avg_train_loss = train(model, device, train_loader, optimizer, epoch)
            avg_val_loss = val(model, device, val_loader)

            best_val_loss = min(best_val_loss, avg_val_loss)

            # Log the average train and validation loss for this epoch
            mlflow.log_metric("avg_train_losses", avg_train_loss, step=epoch)
            mlflow.log_metric("avg_val_loss", avg_val_loss, step=epoch)

            scheduler.step()

    # Optuna minimises this value, so return the best validation loss achieved.
    return best_val_loss

In [None]:
# for Resnet-18
def objective(trial):
    """Optuna objective for a single-channel, 10-class ResNet-18.

    Samples a learning rate and an optimizer, trains a freshly initialised
    ResNet-18 for 10 epochs on ``train_dataset`` and validates on
    ``val_dataset`` after each epoch. Parameters and per-epoch losses are
    logged to a new mlflow run.

    Note: other cells in this notebook also define ``objective``; the study
    uses whichever definition was executed last.

    Args:
        trial: The Optuna trial that supplies the hyperparameters.

    Returns:
        The lowest per-epoch average validation loss seen during the run.

    Raises:
        ValueError: If the suggested optimizer name is not supported.
    """
    print("\n********************************\n")
    best_val_loss = float('Inf')

    # Start a new mlflow run
    with mlflow.start_run():
        # Get hyperparameter suggestions from optuna and log them with mlflow
        lr, optimizer_name = suggest_hyperparameters(trial)
        mlflow.log_params(trial.params)

        # Use CUDA if a GPU is available and log the device with mlflow
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        mlflow.log_param("device", device)

        # Fresh model for every trial: 1 input channel, 10 output classes
        model = models.resnet18(pretrained=False)
        model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        model.fc = nn.Linear(in_features=512, out_features=10, bias=True)
        # Use the same `device` the batches are moved to (and that is logged),
        # rather than the separate global DEVICE.
        model = model.to(device)

        # Pick the optimizer suggested by optuna; fail loudly on an unknown
        # name instead of leaving `optimizer` unbound.
        if optimizer_name == "Adam":
            optimizer = optim.Adam(model.parameters(), lr=lr)
        elif optimizer_name == "RMSprop":
            optimizer = optim.RMSprop(model.parameters(), lr=lr)
        else:
            raise ValueError(f"Unsupported optimizer: {optimizer_name!r}")
        # Multiply the learning rate by 0.7 after every epoch
        scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

        # Loaders over the notebook's train/validation split
        # NOTE(review): the training loader is not reshuffled between epochs — confirm intended.
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

        # Network training & validation loop
        for epoch in range(0, 10):
            avg_train_loss = train(model, device, train_loader, optimizer, epoch)
            avg_val_loss = val(model, device, val_loader)

            best_val_loss = min(best_val_loss, avg_val_loss)

            # Log the average train and validation loss for this epoch
            mlflow.log_metric("avg_train_losses", avg_train_loss, step=epoch)
            mlflow.log_metric("avg_val_loss", avg_val_loss, step=epoch)

            scheduler.step()

    # Optuna minimises this value, so return the best validation loss achieved.
    return best_val_loss

In [None]:
# for Resnet-34
def objective(trial):
    """Optuna objective for a single-channel, 10-class ResNet-34.

    Samples a learning rate and an optimizer, trains a freshly initialised
    ResNet-34 for 10 epochs on ``train_dataset`` and validates on
    ``val_dataset`` after each epoch. Parameters and per-epoch losses are
    logged to a new mlflow run.

    Note: earlier cells in this notebook also define ``objective``; the study
    uses whichever definition was executed last.

    Args:
        trial: The Optuna trial that supplies the hyperparameters.

    Returns:
        The lowest per-epoch average validation loss seen during the run.

    Raises:
        ValueError: If the suggested optimizer name is not supported.
    """
    print("\n********************************\n")
    best_val_loss = float('Inf')

    # Start a new mlflow run
    with mlflow.start_run():
        # Get hyperparameter suggestions from optuna and log them with mlflow
        lr, optimizer_name = suggest_hyperparameters(trial)
        mlflow.log_params(trial.params)

        # Use CUDA if a GPU is available and log the device with mlflow
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        mlflow.log_param("device", device)

        # Fresh model for every trial: 1 input channel, 10 output classes
        model = models.resnet34(pretrained=False)
        model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        model.fc = nn.Linear(in_features=512, out_features=10, bias=True)
        # Use the same `device` the batches are moved to (and that is logged),
        # rather than the separate global DEVICE.
        model = model.to(device)

        # Pick the optimizer suggested by optuna; fail loudly on an unknown
        # name instead of leaving `optimizer` unbound.
        if optimizer_name == "Adam":
            optimizer = optim.Adam(model.parameters(), lr=lr)
        elif optimizer_name == "RMSprop":
            optimizer = optim.RMSprop(model.parameters(), lr=lr)
        else:
            raise ValueError(f"Unsupported optimizer: {optimizer_name!r}")
        # Multiply the learning rate by 0.7 after every epoch
        scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

        # Loaders over the notebook's train/validation split
        # NOTE(review): the training loader is not reshuffled between epochs — confirm intended.
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

        # Network training & validation loop
        for epoch in range(0, 10):
            avg_train_loss = train(model, device, train_loader, optimizer, epoch)
            avg_val_loss = val(model, device, val_loader)

            best_val_loss = min(best_val_loss, avg_val_loss)

            # Log the average train and validation loss for this epoch
            mlflow.log_metric("avg_train_losses", avg_train_loss, step=epoch)
            mlflow.log_metric("avg_val_loss", avg_val_loss, step=epoch)

            scheduler.step()

    # Optuna minimises this value, so return the best validation loss achieved.
    return best_val_loss

In [None]:
    # Create the optuna study and run 10 trials of whichever `objective` was
    # defined last. The sampler is seeded so that a re-run proposes the same
    # sequence of hyperparameters, in line with the fixed torch/numpy seeds
    # used elsewhere in this notebook.
    study = optuna.create_study(
        study_name="pytorch-mlflow-optuna",
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=0),
    )
    study.optimize(objective, n_trials=10)

    # Print optuna study statistics
    print("\n++++++++++++++++++++++++++++++++++\n")
    print("Study statistics: ")
    print("  Number of finished trials: ", len(study.trials))

    # The best trial is the one with the lowest returned validation loss.
    print("Best trial:")
    trial = study.best_trial

    print("  Trial number: ", trial.number)
    print("  Loss (trial value): ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")