In [None]:
from google.colab import drive

# Mount Google Drive at /content/gdrive so that its contents appear under
# /content/gdrive/MyDrive/, which is the layout the paths below rely on.
drive.mount("/content/gdrive")


Mounted at /content/gdrive
/content/gdrive/MyDrive/STAT-946/TestingCode


In [None]:
%matplotlib inline
# %config InlineBackend.figure_format = "retina"
from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
from torch.utils.data import WeightedRandomSampler
import matplotlib.pyplot as plt
import time
import os
import copy
import helper
import shutil
import pandas as pd
import random
import os




> In this project I use VGG16 (the batch-normalized variant) to predict COVID-19 infection from chest X-ray images.




# Preparing the Dataset

> **Create a new folder and then place stat946winter2021.zip file in it. Go to the new folder that stat946winter2021.zip is in.**



In [None]:
def DownloadData():
  """Download the competition archive from Kaggle as stat946winter2021.zip.

  The download is done with wget using the cookie file kaggle.com_cookies.txt,
  which is looked up relative to the current working directory.
  """
  # load your cookies from kaggle.com to download the dataset
  !wget -x --load-cookies kaggle.com_cookies.txt "https://www.kaggle.com/c/25588/download-all" -O stat946winter2021.zip

def PrepairFolders():

  # Creating the data root folder
  os.makedirs(data_root)

  %cd $data_root

  # Moving the stat946winter2021.zip to data root folder
  ! mv ../stat946winter2021.zip ./

  
  !unzip stat946winter2021.zip

  # ! rm -rf train/0
  # ! rm -rf train/1
  # ! rm -rf validation/1
  # ! rm -rf validation/0
  try:
    os.makedirs(dest_validation_neg)
    os.makedirs(dest_validation_pos)
    os.makedirs(dest_train_neg)
    os.makedirs(dest_train_pos)
  except:
    print("file already exists")


def PrepairData():

  %cd $data_root
  df = pd.read_csv('train_labels.csv')

  # Split Train to Validation and Train
  Test_Train_ratio = .1
  ValidationIndex = random.sample(range(0, len(df)), int(len(df) * Test_Train_ratio))

  ImagesDir = os.path.join(data_root, "train/train")

  for index, filename in enumerate(df['File']):
    label = df.iloc[index][1]
    print("image name : {}, label :{}".format(filename, label))
    source = os.path.join(ImagesDir, filename)
    
    # Positive Case
    if label: 

      # Image in validation set
      if index in ValidationIndex: 
        shutil.move(source, dest_validation_pos)

      # Image in training set
      else:
        shutil.move(source, dest_train_pos)
    
    # Negative Case
    else: 
      
      # Image in validation set
      if index in ValidationIndex: 
        shutil.move(source, dest_validation_neg)

      # Image in training set
      else: 
        shutil.move(source, dest_train_neg)

  ! rm -rf train/train
  

> Run the following code cell to prepare the data and create the needed folders.

> Run this cell just once

In [None]:
# Working directory of the notebook at the time this cell runs; every other
# path is derived from it.
current_floder = %pwd
# Root folder that holds the unpacked dataset and the train/val split.
data_root = os.path.join(current_floder, "data/")


# Destination folders of the split: one sub-folder per class ("0" and "1")
# under val/ and train/.
dest_validation_neg = os.path.join(data_root, "val/0")
dest_validation_pos = os.path.join(data_root, "val/1")
dest_train_neg = os.path.join(data_root, "train/0")
dest_train_pos = os.path.join(data_root, "train/1")

# Architecture key understood by initialize_model, plus training hyper-parameters.
model_name = 'vgg16'
num_classes = 2
batch_size = 8
num_epochs = 15

# Flag for feature extracting. When False, we finetune the whole model,
#   when True we only update the reshaped layer params
feature_extract = True


print(data_root)

# Here we download and prepare the data so it can be used with the ImageFolder module in PyTorch
# DownloadData() # If you have put the stat946winter2021.zip in the current folder do not use this function.
PrepairFolders()
PrepairData()

/content/gdrive/MyDrive/STAT-946/TestingCode/data/


After running this cell, the data is split into the **train** and **val** folders. Within each of these folders, the images are further split into positive and negative cases.

# Helper Functions

## Training Function

In [None]:
def train_model(model, dataloaders, criterion, optimizer, model_name, num_epochs=25):
    """Run the train/validation loop and keep the weights with the best validation accuracy.

    Relies on the notebook-level `device` and `save_model` being defined.

    Args:
        model: network to optimise; it is updated in place.
        dataloaders: mapping with the keys 'train' and 'val', each a DataLoader.
        criterion: loss function called as criterion(outputs, labels).
        optimizer: optimizer stepped once per training batch.
        model_name: architecture name forwarded to save_model.
        num_epochs: number of passes over both phases.

    Returns:
        (model, val_acc_history): the model with the best validation weights
        loaded, and the validation accuracy recorded after every epoch.
    """
    start_time = time.time()

    val_acc_history = []
    best_acc = 0.0
    best_model_wts = copy.deepcopy(model.state_dict())

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Every epoch runs a training pass followed by a validation pass
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            # train(True) selects training mode, train(False) is what eval() does
            model.train(is_training)

            loader = dataloaders[phase]
            loss_sum = 0.0
            correct_count = 0

            for inputs, labels in loader:
                inputs, labels = inputs.to(device), labels.to(device)

                # Clear the gradients left over from the previous batch
                optimizer.zero_grad()

                # Gradients are only tracked during the training pass
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    # Index of the largest output per sample is the predicted class
                    preds = torch.max(outputs, 1)[1]

                    if is_training:
                        loss.backward()
                        optimizer.step()

                # loss is a batch mean, so weight it by the batch size
                loss_sum += loss.item() * inputs.size(0)
                correct_count += torch.sum(preds == labels.data)

            sample_count = len(loader.dataset)
            epoch_loss = loss_sum / sample_count
            epoch_acc = correct_count.double() / sample_count

            print('{} Loss: {:.5f} Acc: {:.5f}'.format(phase, epoch_loss, epoch_acc))

            if not is_training:
                if epoch_acc > best_acc:
                    # New best validation accuracy: snapshot the weights and
                    # hand them to save_model for persisting
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())
                    save_model(model, float(best_acc), model_name)
                val_acc_history.append(epoch_acc)

        print()

    time_elapsed = time.time() - start_time
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # Restore the best snapshot before handing the model back
    model.load_state_dict(best_model_wts)
    return model, val_acc_history

## Setting Parameters requires_grad

In [None]:
# Set requires_grad to False when the model is in feature-extracting mode;
# otherwise the model is being fine-tuned and requires_grad is set to True.

def set_parameter_requires_grad(model, feature_extracting):
    """Freeze or unfreeze every parameter of `model`.

    Args:
        model: object exposing parameters(); each yielded parameter gets its
            requires_grad attribute overwritten.
        feature_extracting: when truthy all parameters are frozen
            (requires_grad = False), otherwise all are made trainable.
    """
    trainable = not feature_extracting
    for param in model.parameters():
        param.requires_grad = trainable

## Initializing The Model

In [None]:
# We only initialize the last layer of the model. Other weights are used as the pretrained model
# Based on feature_extract we set requires_grad of weights.

def initialize_model(model_name, num_classes, feature_extract, use_pretrained=True):
    """Build a torchvision classifier whose last layer is resized to `num_classes`.

    Args:
        model_name: one of "vgg11", "vgg16" (both batch-norm variants) or "alexnet".
        num_classes: number of outputs of the new final linear layer.
        feature_extract: forwarded to set_parameter_requires_grad before the
            final layer is replaced, so the replacement layer is always trainable.
        use_pretrained: forwarded as `pretrained=` to the torchvision constructor.

    Returns:
        (model_ft, input_size): the model and the input image size (224).

    Raises:
        ValueError: if `model_name` is not one of the supported architectures.
    """
    # torchvision constructor name per supported architecture. All of them are
    # handled identically: the final classification layer is classifier[6].
    constructor_names = {
        "vgg11": "vgg11_bn",
        "vgg16": "vgg16_bn",
        "alexnet": "alexnet",
    }
    if model_name not in constructor_names:
        # Fail with a clear message instead of reaching `return` with unbound locals.
        raise ValueError(
            "Unsupported model_name {!r}; expected one of {}".format(
                model_name, sorted(constructor_names)))

    model_ft = getattr(models, constructor_names[model_name])(pretrained=use_pretrained)
    set_parameter_requires_grad(model_ft, feature_extract)
    num_ftrs = model_ft.classifier[6].in_features
    model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
    input_size = 224

    return model_ft, input_size

## Saving The Best Model Weights

In [None]:
# Saving the model weights
# We save the model that has the best accuracy.
# Other saved weights will be removed if current model has better accuracy.

def save_model(model, acc, model_name):
  """Persist the model weights if they beat the stored checkpoint(s) of the same architecture.

  Checkpoints live in a BestModel/ folder next to the data root and are named
  model_acc_<acc>_<model_name>. Stored checkpoints of the same architecture with
  a lower accuracy are deleted; if one with an equal or higher accuracy exists,
  nothing is written. Files that do not follow the naming scheme are ignored.

  Unlike the earlier %cd-based version, this does not change the working
  directory.

  Args:
    model: model whose state_dict is saved.
    acc: accuracy of `model` as a float; becomes part of the file name.
    model_name: architecture name; becomes the file name suffix.
  """
  # BestModel/ sits beside the data root (reads the notebook global data_root).
  main_path = os.path.dirname(os.path.abspath(data_root))
  model_path = os.path.join(main_path, 'BestModel')
  os.makedirs(model_path, exist_ok=True)

  prefix = 'model_acc_'
  suffix = '_' + model_name

  # Check if the current model is better than saved models or not
  is_best = True

  # Search for the best saved model with the same architecture
  for m in os.listdir(model_path):
    # Matching on prefix/suffix (instead of splitting on '_') also works for
    # architecture names that contain an underscore.
    if not (m.startswith(prefix) and m.endswith(suffix)):
      continue
    try:
      saved_acc = float(m[len(prefix):len(m) - len(suffix)])
    except ValueError:
      # Not a checkpoint written by this function; leave it alone.
      continue

    if saved_acc < acc:
      os.remove(os.path.join(model_path, m))
    elif saved_acc >= acc:
      is_best = False

  if is_best:
    torch.save(model.state_dict(), os.path.join(model_path, prefix + str(acc) + suffix))



## Loading The Best Model Weights

In [None]:
def load_model(model_ft):

  # Loading the best model from BestModel folder

  %cd $data_root
  %cd ..
  main_path = %pwd

  model_path = os.path.join(main_path, 'BestModel/')
  
  if os.path.isdir(model_path) == False:
    os.makedirs(model_path)
  
  %cd $model_path

  # Downloading the weight that has an 98.3 % accuracy
  ! wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1-mRMr_MB1S71WwDgIBaM6JYCpSmTPn5N' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1-mRMr_MB1S71WwDgIBaM6JYCpSmTPn5N" -O model_acc_0.9836173001310_vgg16 && rm -rf /tmp/cookies.txt

  # Get a list of model names
  models = os.listdir("./")
  best_acc = 0
  best_model = ""

  # Search for the best saved model
  for m in models:
    acc = m.split('_')[2]
    if float(acc) > best_acc:
      best_model = m

  
  print("model {} is loaded". format(best_model))
  model_ft.load_state_dict(torch.load(best_model))
  model_ft.eval()
  
  return model_ft

## Weighted Sampler

In [None]:
# Creating a Weighted Sampler object
def weight_sampler(image_dataset):
  """Build a WeightedRandomSampler that balances the classes of `image_dataset`.

  Every sample is weighted by the inverse of its class frequency, so each class
  gets the same total sampling weight. The per-class sample counts are printed.

  Args:
    image_dataset: object with a `targets` sequence holding one class label per
      sample. Labels do not have to be contiguous or start at 0.

  Returns:
    A WeightedRandomSampler that draws len(targets) samples per epoch.
  """
  targets = np.asarray(image_dataset.targets).ravel()
  # `inverse` maps every sample to the position of its class in the sorted
  # unique labels, so the lookup below is correct for any label values.
  _, inverse, class_sample_count = np.unique(targets, return_inverse=True, return_counts=True)
  print(class_sample_count)

  samples_weight = torch.from_numpy(1. / class_sample_count[inverse]).double()
  sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

  return sampler

# Initializing the model

In [None]:
# Build the network selected in the configuration cell; input_size is the image
# size used by the data transforms further down.
model_ft, input_size = initialize_model(model_name, num_classes, feature_extract, use_pretrained=True)

# Printing the altered pretrained model
print(model_ft)

Downloading: "https://download.pytorch.org/models/vgg16_bn-6c64b313.pth" to /root/.cache/torch/hub/checkpoints/vgg16_bn-6c64b313.pth


HBox(children=(FloatProgress(value=0.0, max=553507836.0), HTML(value='')))


VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace=True)
    (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU(inplace=True)
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): Conv2d(128, 25

# Loading Data

In [None]:
# Data augmentation and normalization for training
# Just normalization for validation
# The same normalization is applied in both phases, so it is defined once.
normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(input_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ]),
    'val': transforms.Compose([
        transforms.Resize(input_size),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        normalize,
    ]),
}

print("Initializing Datasets and Dataloaders...")

# Create training and validation datasets
image_datasets = {x: datasets.ImageFolder(os.path.join(data_root, x), data_transforms[x]) for x in ['train', 'val']}

# Trying to handle the unbalanced data using pytorch samplers
# Generating a weighted sampler
w_sampler = weight_sampler(image_datasets['train'])

# Create training and validation dataloaders
dataloaders_dict = {
    # The sampler decides the order of the training samples, so `shuffle` stays off.
    'train': torch.utils.data.DataLoader(image_datasets['train'], batch_size=batch_size, shuffle=False, sampler=w_sampler, num_workers=4),
    # Shuffling gains nothing for evaluation; a fixed order keeps validation runs comparable.
    'val': torch.utils.data.DataLoader(image_datasets['val'], batch_size=batch_size, shuffle=False, num_workers=4),
}

# Detect if we have a GPU available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Initializing Datasets and Dataloaders...
[12335  1403]


# Creating Optimizer

In [None]:
def set_optimizer(model_ft):
    """Move the model to `device` and build the SGD optimizer for it.

    Prints the name of every parameter that currently has requires_grad set.
    When the notebook-level `feature_extract` flag is set, only those
    parameters are handed to the optimizer; otherwise the optimizer receives
    model_ft.parameters().

    Returns:
        optim.SGD configured with lr=0.001 and momentum=0.9.
    """
    # Send the model to the selected device (GPU when available)
    model_ft = model_ft.to(device)

    print("Params to learn:")
    trainable_params = []
    for name, param in model_ft.named_parameters():
        if param.requires_grad:
            trainable_params.append(param)
            print("\t",name)

    # Feature extraction optimises only the unfrozen parameters; fine-tuning
    # passes the model's full parameter iterator.
    params_to_update = trainable_params if feature_extract else model_ft.parameters()

    return optim.SGD(params_to_update, lr=0.001, momentum=0.9)

# Training

## Setting Optimizer

In [None]:
# Set up the loss function
criterion = nn.CrossEntropyLoss()

# set_optimizer also moves the model to `device`
optimizer_ft = set_optimizer(model_ft)
# Prints the parameters that are going to be updated during gradient descent

Params to learn:
	 classifier.6.weight
	 classifier.6.bias


## Training The Model

In [None]:

# Stage 1: train only the last layer of the model - the other weights are frozen
model_ft, hist = train_model(model_ft, dataloaders_dict, criterion, optimizer_ft, model_name=model_name, num_epochs=num_epochs)

# Stage 2: fine-tune the whole model after the last layer's weights have been updated for some epochs
# feature_extract is a notebook global that set_optimizer reads, so it has to be
# switched before the optimizer is rebuilt.
feature_extract = False
set_parameter_requires_grad(model_ft, feature_extracting=feature_extract)
optimizer_ft = set_optimizer(model_ft)

# NOTE: this replaces the configured num_epochs, and `hist` from stage 1 is overwritten below.
num_epochs = 35
model_ft, hist = train_model(model_ft, dataloaders_dict, criterion, optimizer_ft, model_name=model_name, num_epochs=num_epochs)

## Loading Best Model

Instead of training, you can load the weights.
The weights that reproduce the CSV results submitted to Kaggle will be downloaded and loaded automatically.

The following cell automatically downloads the weights into the BestModel folder (the folder is created if it does not exist). It then loads and returns the best model found in this folder.

You can also have access to this weight using this link:
[Model Accuracy 98.3](https://drive.google.com/file/d/1-mRMr_MB1S71WwDgIBaM6JYCpSmTPn5N/view?usp=sharing)

In [None]:
# Loads the best model found in the BestModel folder (the folder is created if it does not exist).
# load_model first downloads a weight file with 98.3% accuracy into that folder.
model_ft = load_model(model_ft)


/content/gdrive/MyDrive/STAT-946/TestingCode/data
/content/gdrive/MyDrive/STAT-946/TestingCode
/content/gdrive/MyDrive/STAT-946/TestingCode/BestModel
--2021-02-14 20:29:20--  https://docs.google.com/uc?export=download&confirm=CzaI&id=1-mRMr_MB1S71WwDgIBaM6JYCpSmTPn5N
Resolving docs.google.com (docs.google.com)... 172.217.12.238, 2607:f8b0:4004:82a::200e
Connecting to docs.google.com (docs.google.com)|172.217.12.238|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-10-as-docs.googleusercontent.com/docs/securesc/jdcenaolhaq3vunredsataip4de1047o/6f4dc917310505ibnv4o298d5q45gifp/1613334525000/13043136208596042681/04847398216668824266Z/1-mRMr_MB1S71WwDgIBaM6JYCpSmTPn5N?e=download [following]
--2021-02-14 20:29:20--  https://doc-10-as-docs.googleusercontent.com/docs/securesc/jdcenaolhaq3vunredsataip4de1047o/6f4dc917310505ibnv4o298d5q45gifp/1613334525000/13043136208596042681/04847398216668824266Z/1-mRMr_MB1S71WwDgIBaM6JYCpSmTPn5N?e=download

# Testing

## Loading The Test Data

In [None]:
# The CSV below is read with a relative path, so switch to the data root first.
%cd $data_root

# Submission template; its second column is filled with the predictions in a later cell.
tdf = pd.read_csv('sample_submission.csv')
model_ft.eval()

# The test images go through the validation transforms (no random augmentation).
test_dataset = datasets.ImageFolder(os.path.join(data_root, 'test'), data_transforms['val'])
# One image per batch and no shuffling, so predictions come out in dataset order.
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=4)

/content/gdrive/MyDrive/STAT-946/TestingCode/data


## Testing on Validation

In [None]:
# Testing the model on Validation dataset

model_ft.eval()
val_labels = [] # Predicted Labels
corr_labels = [] # Correct Label
# Inference only: disabling autograd avoids building a computation graph for
# every batch, which saves memory and time.
with torch.no_grad():
  for batch_img, labels in dataloaders_dict['val']:
    output = model_ft(batch_img.to(device))

    # The predicted class is the index of the largest output per sample
    val_labels.extend(output.argmax(dim=1).cpu().numpy())
    corr_labels.extend(labels.numpy())

In [None]:
# Finding the correct predictions: True wherever the predicted label equals the true label
tar = np.array(val_labels) == np.array(corr_labels)

# Accuracy on validation: number of correct predictions divided by the number of samples
print("the model's accuracy on validaiton is: {}".format(tar.sum()/len(corr_labels)))

the model's accuracy on validaiton is: 0.9888597640891219


## Testing on Test Dataset

In [None]:
model_ft.eval()
test_lables = []

# Inference only: no autograd bookkeeping is needed.
with torch.no_grad():
  for batch_img, _ in test_dataloader:
    output = model_ft(batch_img.to(device))

    # Add the predictions to the list of predictions. extend() keeps one entry
    # per image even if the loader's batch size is raised above 1.
    test_lables.extend(output.argmax(dim=1).cpu().numpy())

In [None]:
# Obtaining image names: test_dataset.imgs holds (path, class_index) pairs in
# dataset order; only the file name part of each path is kept.
test_imgs_names = [os.path.basename(img_path) for img_path, _ in test_dataset.imgs]

In [None]:
%cd $data_root
# Training labels and the submission template, read from the data root.
# NOTE(review): df and test_df are not referenced by the cells visible after this one.
df = pd.read_csv('train_labels.csv')
test_df = pd.read_csv('sample_submission.csv')

/content/gdrive/MyDrive/STAT-946/TestingCode/data


In [None]:
# Updating the data frame file of test data
# Each row's prediction is looked up through one dict instead of scanning the
# whole frame once per image, and integers are stored rather than strings.
label_col = tdf.columns[1]
predicted = tdf['File'].map(dict(zip(test_imgs_names, test_lables)))
has_prediction = predicted.notna()

if has_prediction.all():
  # Every row received a prediction: replace the label column as a whole
  tdf[label_col] = predicted.astype(int)
else:
  # Rows whose file has no prediction keep their original value
  tdf.loc[has_prediction, label_col] = predicted[has_prediction].astype(int)

In [None]:
# Saving the submission frame, now holding the predicted labels, as a new CSV file (without the row index)
tdf.to_csv('precidted_lables.csv', index=False)



---



---

