

































































































































# Project Introduction



# Part 1. Data Processing
The following procedures demonstrate how the data for this project is collected, cleaned, and processed.

*   The data is downloaded from Kaggle.com, with the data separated into 44 folders from the age range of 16 - 60.

*   The data is then sorted according to the purpose of our project.
For the age range 6-16, the data is organized into the folder called "below_16", while for the age range 17-60, the data is organized into the folder named "under_16".

*   The data is then retrieved from these folders and split into training, validation, and test sets.




In [None]:
import numpy as np
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data.sampler import SubsetRandomSampler
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch.optim import lr_scheduler
import copy

# Global switch read by the training and evaluation helpers below: when True,
# batches are moved to the GPU if torch.cuda.is_available() also reports one.
use_cuda=True

In [None]:
# Mount Google Drive (Google Colab only) so that the dataset stored under
# /content/gdrive can be read by the cells below.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### Part (b) Splitting data to training validation and testing

Then the data will be randomized and split into 3 sets: 75% into the training set, 12.5% into the validation set, and 12.5% into the test set. The cleaned data is read from the folder called "cleaned_dataset".

In [None]:
# Root folder of the preprocessed samples. DatasetFolder treats every
# sub-folder as one class and every matching file inside it as one sample.
cleaned_dataset_path = "/content/gdrive/MyDrive/APS360 group project/cleaned_dataset"

# NOTE(review): torch.load unpickles the file it reads, so only point this at
# .tensor files that were produced by this project, never at untrusted data.
cleaned_dataset = torchvision.datasets.DatasetFolder(
    cleaned_dataset_path,
    loader=torch.load,
    # The trailing comma matters: ('.tensor') is just the string '.tensor',
    # while the API documents a tuple of allowed extensions.
    extensions=('.tensor',),
)

In [None]:
# Split sizes: 75% training, 12.5% validation, and whatever is left for the
# test set, so the three sizes always add up to the full dataset length.
n_total = len(cleaned_dataset)
train_size = int(0.75 * n_total)
val_size = int(0.125 * n_total)
test_size = n_total - (train_size + val_size)

# A fixed seed keeps the random partition identical from run to run.
split_rng = torch.Generator().manual_seed(42)
train_set, val_set, test_set = torch.utils.data.random_split(
    cleaned_dataset,
    [train_size, val_size, test_size],
    generator=split_rng,
)

In [None]:
# Batch size shared by the three module-level loaders.
b_s = 16

# All three loaders are built with identical settings, so keep them in one place.
_loader_options = dict(batch_size=b_s, num_workers=1, shuffle=True)

train_loader = torch.utils.data.DataLoader(train_set, **_loader_options)
val_loader = torch.utils.data.DataLoader(val_set, **_loader_options)
test_loader = torch.utils.data.DataLoader(test_set, **_loader_options)

# Part 2. Baseline Model

### Part (a) Building baseline model


(Description of model, paste from progress report)

This is another CNN baseline model with a more advanced feature-extraction ability.

(Description, paste from progress report)

In [None]:

class primary_CNN_features_1(nn.Module):
    """Baseline CNN: three conv + max-pool stages followed by three linear layers.

    The network takes 3-channel image batches and returns 2 raw scores
    (logits) per sample; no softmax is applied.

    Args:
        flat_features: Number of values per sample after the last pooling
            stage, i.e. the input width of the first linear layer. The
            default (123904) is the value this model has always used; it must
            match the feature-map size produced by the dataset's image
            resolution (NOTE(review): confirm against the actual input size).
    """

    def __init__(self, flat_features=123904):
        # nn.Module has to be initialised before attributes are assigned to it.
        super(primary_CNN_features_1, self).__init__()
        self.name = 'CNN'  # used by the training helpers to name checkpoints
        self.flat_features = flat_features
        # Conv2d(in_channels, out_channels, kernel_size[, stride]);
        # stride defaults to 1 and padding to 0.
        self.conv1 = nn.Conv2d(3, 8, 7)
        self.conv2 = nn.Conv2d(8, 16, 5)
        self.conv3 = nn.Conv2d(16, 32, 4, 2)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(flat_features, 1280)
        self.fc2 = nn.Linear(1280, 320)
        self.fc3 = nn.Linear(320, 2)

    def forward(self, x):
        """Return the (batch, 2) logits for a batch of 3-channel images."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        # Flatten everything except the batch dimension. Reshaping with
        # view(-1, N) would silently change the batch size whenever the
        # feature map does not contain exactly N values per sample; keeping
        # the batch axis fixed makes such a mismatch fail loudly in fc1.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


### Part (b) Useful functions

This section provides several useful functions for training and reporting accuracy.

In [None]:
def train_CNN(model, train_loader, val_loader, batch_size=32, l_r=0.005, num_epochs=5, momentum=0.4):
    """Train ``model`` with SGD and cross-entropy loss, then plot the curves.

    After every epoch the function records the average training and
    validation loss per sample, the training and validation accuracy, saves a
    checkpoint of ``model.state_dict()`` in the working directory and prints
    the accuracies.

    Args:
        model: Network to train; must expose a ``name`` attribute, which is
            used in the checkpoint file name.
        train_loader: Iterable of ``(imgs, labels)`` training batches.
        val_loader: Iterable of ``(imgs, labels)`` validation batches.
        batch_size: Only used in the checkpoint file name; the real batch
            size is whatever the loaders produce.
        l_r: SGD learning rate.
        num_epochs: Number of passes over ``train_loader``.
        momentum: SGD momentum.

    Returns:
        None.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=l_r, momentum=momentum)
    on_gpu = use_cuda and torch.cuda.is_available()

    iters, train_losses, val_losses, train_acc, val_acc = [], [], [], [], []

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        loss_sum, sample_count = 0.0, 0
        for imgs, labels in train_loader:
            if on_gpu:
                imgs = imgs.cuda()
                labels = labels.cuda()

            optimizer.zero_grad()
            out = model(imgs)
            train_loss = criterion(out, labels)
            train_loss.backward()
            optimizer.step()

            # CrossEntropyLoss already returns the mean over the batch, so
            # weight it by the batch size to get a true per-sample average
            # over the whole epoch (the last batch may be smaller).
            loss_sum += train_loss.item() * labels.size(0)
            sample_count += labels.size(0)
        train_losses.append(loss_sum / max(sample_count, 1))

        # ---- validation pass (no gradients needed) ----
        model.eval()
        loss_sum, sample_count = 0.0, 0
        with torch.no_grad():
            for imgs, labels in val_loader:
                if on_gpu:
                    imgs = imgs.cuda()
                    labels = labels.cuda()

                out = model(imgs)
                val_loss = criterion(out, labels)
                loss_sum += val_loss.item() * labels.size(0)
                sample_count += labels.size(0)
        val_losses.append(loss_sum / max(sample_count, 1))

        iters.append(epoch)
        train_acc.append(get_accuracy(model, train_loader, val_loader, train=True))
        val_acc.append(get_accuracy(model, train_loader, val_loader, train=False))

        model_path = "model_{0}_bs{1}_lr{2}_epoch{3}".format(model.name, batch_size, l_r, epoch)
        torch.save(model.state_dict(), model_path)

        print(("Epoch: {} | Training Accuracy: {} | Validation Accuracy: {}").format(epoch+1, train_acc[epoch], val_acc[epoch]))

    plot(iters, train_losses, val_losses, train_acc, val_acc)

    print("Final Training Accuracy: {}".format(train_acc[-1]))
    print("Final Validation Accuracy: {}".format(val_acc[-1]))

In [None]:
def train_CNN_2(model, train_set, val_set, batch_size=32, l_r=0.005, num_epochs=5, momentum=0.4):
    """Train ``model`` on ``train_set`` with SGD and cross-entropy loss.

    Unlike ``train_CNN`` this variant builds its own shuffled data loaders
    from the given datasets using ``batch_size``. After every epoch it records
    the per-sample average training and validation loss and both accuracies,
    saves a checkpoint of ``model.state_dict()`` in the working directory and
    prints a summary line. When training finishes the curves are plotted.

    Args:
        model: Network to train; must expose a ``name`` attribute, which is
            used in the checkpoint file name.
        train_set: Dataset of ``(img, label)`` training samples.
        val_set: Dataset of ``(img, label)`` validation samples.
        batch_size: Batch size of both loaders.
        l_r: SGD learning rate.
        num_epochs: Number of passes over the training set.
        momentum: SGD momentum.

    Returns:
        None.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=l_r, momentum=momentum)
    on_gpu = use_cuda and torch.cuda.is_available()

    iters, train_losses, val_losses, train_acc, val_acc = [], [], [], [], []
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size,
                                               num_workers=1, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size,
                                             num_workers=1, shuffle=True)

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        total_train_loss, seen = 0.0, 0
        for imgs, labels in train_loader:
            if on_gpu:
                imgs = imgs.cuda()
                labels = labels.cuda()

            optimizer.zero_grad()
            out = model(imgs)
            train_loss = criterion(out, labels)
            train_loss.backward()
            optimizer.step()

            # The criterion returns the batch mean; multiply by the batch
            # size so that dividing by the sample count below yields a real
            # per-sample average instead of a value shrunk by the batch size.
            total_train_loss += train_loss.item() * labels.size(0)
            seen += labels.size(0)

        avg_train_loss = total_train_loss / max(seen, 1)
        train_losses.append(avg_train_loss)

        # ---- validation pass (no gradients needed) ----
        model.eval()
        total_val_loss, seen = 0.0, 0
        with torch.no_grad():
            for imgs, labels in val_loader:
                if on_gpu:
                    imgs = imgs.cuda()
                    labels = labels.cuda()

                out = model(imgs)
                val_loss = criterion(out, labels)
                total_val_loss += val_loss.item() * labels.size(0)
                seen += labels.size(0)

        avg_val_loss = total_val_loss / max(seen, 1)
        val_losses.append(avg_val_loss)

        # get_accuracy takes both loaders plus a flag selecting which to use.
        train_acc_epoch = get_accuracy(model, train_loader, val_loader, train=True)
        val_acc_epoch = get_accuracy(model, train_loader, val_loader, train=False)
        train_acc.append(train_acc_epoch)
        val_acc.append(val_acc_epoch)

        # One x-axis entry per epoch; without it plot() receives an empty
        # x-axis against non-empty curves and matplotlib raises ValueError.
        iters.append(epoch)

        model_path = "model_{0}_bs{1}_lr{2}_epoch{3}".format(model.name, batch_size, l_r, epoch)
        torch.save(model.state_dict(), model_path)

        print(("Epoch: {} | Training Loss: {:.4f} | Validation Loss: {:.4f} | "
               "Training Accuracy: {:.4f} | Validation Accuracy: {:.4f}").format(
            epoch + 1, avg_train_loss, avg_val_loss, train_acc_epoch, val_acc_epoch
        ))

    plot(iters, train_losses, val_losses, train_acc, val_acc)

    print("Final Training Accuracy: {:.4f}".format(train_acc[-1]))
    print("Final Validation Accuracy: {}".format(val_acc[-1]))

In [None]:
def plot(iters, train_losses, val_losses, train_acc, val_acc):
    """Show two figures: loss curves and accuracy curves.

    Args:
        iters: x-axis values, one per recorded point.
        train_losses: Training loss at each point in ``iters``.
        val_losses: Validation loss at each point in ``iters``.
        train_acc: Training accuracy at each point in ``iters``.
        val_acc: Validation accuracy at each point in ``iters``.

    Returns:
        None. Both figures are displayed with ``plt.show()``.
    """
    plt.title("Training Curve")
    plt.plot(iters, train_losses, label="Train")
    plt.plot(iters, val_losses, label="Validation")
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    # Without a legend the two labelled curves cannot be told apart.
    plt.legend(loc='best')
    plt.show()

    plt.title("Training Curve")
    plt.plot(iters, train_acc, label="Train")
    plt.plot(iters, val_acc, label="Validation")
    plt.xlabel("Iterations")
    # The figure shows training and validation accuracy, so label the axis neutrally.
    plt.ylabel("Accuracy")
    plt.legend(loc='best')
    plt.show()

In [None]:
def get_accuracy(model, train_loader, val_loader=None, train=False):
    """Return the fraction of correctly classified samples.

    Args:
        model: ``nn.Module`` producing one row of class scores per sample.
        train_loader: Iterable of ``(imgs, labels)`` batches. Used when
            ``train`` is True, or when no ``val_loader`` is given.
        val_loader: Optional iterable of ``(imgs, labels)`` batches, used
            when ``train`` is False.
        train: Select ``train_loader`` (True) or ``val_loader`` (False).

    Returns:
        Accuracy in the range [0, 1]; 0.0 when the selected loader is empty.
    """
    if train or val_loader is None:
        data = train_loader
    else:
        data = val_loader

    # Send each batch to wherever the model's weights live, so the inputs and
    # the model can never end up on different devices.
    device = next((p.device for p in model.parameters()), torch.device("cpu"))

    # Evaluate in eval mode, then put the model back in the mode it was in so
    # calling this helper in the middle of training has no side effect.
    was_training = model.training
    model.eval()

    correct = 0
    total = 0
    try:
        with torch.no_grad():  # no gradients are needed for scoring
            for imgs, labels in data:
                imgs = imgs.to(device)
                labels = labels.to(device)

                output = model(imgs)

                # select index with maximum prediction score
                pred = output.max(1, keepdim=True)[1]
                correct += pred.eq(labels.view_as(pred)).sum().item()
                total += imgs.shape[0]
    finally:
        model.train(was_training)

    return correct / total if total else 0.0


In [None]:
def get_test_accuracy(model, data, batch_size):
    """Return the fraction of samples in ``data`` that ``model`` classifies correctly.

    Args:
        model: ``nn.Module`` producing one row of class scores per sample.
        data: Dataset of ``(img, label)`` samples to evaluate on. The
            argument is honoured as passed; it is no longer replaced by the
            global ``test_set``.
        batch_size: Batch size used to iterate over ``data``.

    Returns:
        Accuracy in the range [0, 1]; 0.0 when ``data`` is empty.
    """
    # Send each batch to wherever the model's weights live, so the inputs and
    # the model can never end up on different devices.
    device = next((p.device for p in model.parameters()), torch.device("cpu"))

    # Evaluate in eval mode and restore the previous mode afterwards.
    was_training = model.training
    model.eval()

    correct = 0
    total = 0
    try:
        with torch.no_grad():  # no gradients are needed for scoring
            for imgs, labels in torch.utils.data.DataLoader(data, batch_size=batch_size):
                imgs = imgs.to(device)
                labels = labels.to(device)

                output = model(imgs)

                # select index with maximum prediction score
                pred = output.max(1, keepdim=True)[1]
                correct += pred.eq(labels.view_as(pred)).sum().item()
                total += imgs.shape[0]
    finally:
        model.train(was_training)

    return correct / total if total else 0.0

### Part (c) Training baseline model

Next, we will train our second CNN model, named `primary_CNN_features_1`, with a learning rate of 2e-4, number of epochs = 10, momentum = 0.4, and the default batch_size = 32.



In [None]:
# Build the baseline network with its default layer sizes.
model_2 = primary_CNN_features_1()

# The model is moved to the GPU only when GPU use was requested through
# use_cuda and a CUDA device is actually present.
gpu_ready = use_cuda and torch.cuda.is_available()
if not gpu_ready:
    print('CUDA is not available.  Training on CPU ...')
else:
    model_2.cuda()
    print('CUDA is available!  Training on GPU ...')

CUDA is available!  Training on GPU ...


In [None]:
# Train the baseline; batch_size is left at the function's default.
train_CNN_2(
    model_2,
    train_set,
    val_set,
    l_r=2e-4,
    num_epochs=10,
    momentum=0.4,
)

ValueError: ignored