### This notebook is designed to investigate the effects of Image Quality on Classification Performance

In [None]:
# Install Python packages
!pip install numpy torch torchvision pytorch-ignite tensorboardX tensorboard opendatasets efficientnet-pytorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-ignite
  Downloading pytorch_ignite-0.4.9-py3-none-any.whl (259 kB)
[K     |████████████████████████████████| 259 kB 31.0 MB/s 
[?25hCollecting tensorboardX
  Downloading tensorboardX-2.5-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 73.8 MB/s 
Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Collecting efficientnet-pytorch
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
Building wheels for collected packages: efficientnet-pytorch
  Building wheel for efficientnet-pytorch (setup.py) ... [?25l[?25hdone
  Created wheel for efficientnet-pytorch: filename=efficientnet_pytorch-0.7.1-py3-none-any.whl size=16446 sha256=40945eb0d8d49488a5611d7ae6d235bf957e8f7145d98a4c8257f24cb8efd74a
  Stored in directory: /root/.cache/pip/wheels/0e/cc/b2/49e74588263573ff778da58cc99b9c6349b496636a7e165be6
Successful

In [None]:
# Import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime as dt

import torch
from torch import optim, nn
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torchvision.utils import make_grid
from torchvision import models, datasets
from torchvision import transforms as T
from PIL import Image
from google.colab import files
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss, Precision, Recall
from ignite.handlers import LRScheduler, ModelCheckpoint, global_step_from_engine
from ignite.contrib.handlers import ProgressBar, TensorboardLogger
import ignite.contrib.engines.common as common

import opendatasets as od
import os
from random import randint
import urllib
import zipfile
import glob

# Pick the compute device: the GPU (through CUDA) when PyTorch can see one,
# otherwise the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

### Tiny ImageNet Dataset

In [None]:
# Download data directly from Stanford data source and unzip it
!wget http://cs231n.stanford.edu/tiny-imagenet-200.zip
!unzip -qq 'tiny-imagenet-200.zip'
DATA_DIR = 'tiny-imagenet-200' # Original images come in shapes of [3,64,64]
# Define training and validation data paths
TRAIN_DIR = os.path.join(DATA_DIR, 'train') 
TEST_DIR = os.path.join(DATA_DIR, 'val')

--2022-06-01 16:10:57--  http://cs231n.stanford.edu/tiny-imagenet-200.zip
Resolving cs231n.stanford.edu (cs231n.stanford.edu)... 171.64.68.10
Connecting to cs231n.stanford.edu (cs231n.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 248100043 (237M) [application/zip]
Saving to: ‘tiny-imagenet-200.zip’


2022-06-01 16:11:16 (13.1 MB/s) - ‘tiny-imagenet-200.zip’ saved [248100043/248100043]



### Helper functions

In [None]:
# Append one named series of results to the results .txt file
def write_results(file, name, result_list):
    """Write ``result_list`` to ``file`` as one ``name:v0,v1,...`` line.

    The line is introduced by a newline character (rather than terminated by
    one), and nothing at all is written when ``result_list`` is empty.

    Args:
        file: Writable text file object.
        name: Label placed in front of the values.
        result_list: Sequence of values; each is formatted with an f-string.
    """
    for position, value in enumerate(result_list):
        # The first value opens the line with its label, the rest follow a comma
        prefix = f'\n{name}:' if position == 0 else ','
        file.write(f'{prefix}{value}')

# This function generates dataloaders for image datasets
def generate_dataloader(data, transform, val_fraction=0.1):
    """Build a shuffled training loader and an ordered validation loader.

    Args:
        data: Root directory of an image dataset laid out as
            ``root/label/filename`` (read with ``datasets.ImageFolder``,
            see https://pytorch.org/vision/stable/datasets.html).
        transform: Transform handed to ``ImageFolder``; both splits are drawn
            from that one dataset and therefore share it.
        val_fraction: Fraction of the images held out for validation. The
            default of 0.1 gives the previous fixed 90000/10000 split on a
            100000-image dataset, and also works for any other dataset size.

    Returns:
        ``(train_loader, val_loader)``, or ``None`` when ``data`` is ``None``.

    Raises:
        ValueError: If ``val_fraction`` is outside ``[0, 1]``.

    Note:
        The batch size is read from the module-level ``batch_size`` variable,
        which must be defined before this function is called.
    """
    if data is None:
        return None

    if not 0 <= val_fraction <= 1:
        raise ValueError(f'val_fraction must be within [0, 1], got {val_fraction}')

    dataset = datasets.ImageFolder(data, transform=transform)

    # Derive the split sizes from the dataset itself so that they always add up
    # to len(dataset), which random_split requires.
    n_total = len(dataset)
    n_val = int(round(n_total * val_fraction))
    n_train = n_total - n_val
    data_train, data_val = torch.utils.data.random_split(dataset, [n_train, n_val])

    # Wrap both splits in dataloaders; only the training data is shuffled
    train_loader = DataLoader(data_train, batch_size=batch_size,
                              shuffle=True, pin_memory=True)

    val_loader = DataLoader(data_val, batch_size=batch_size,
                            shuffle=False, pin_memory=True)

    return train_loader, val_loader

# This function modifies the image quality
def modify_quality(train_dir, test_dir, quality):
    """Re-save every training and test image in place at the given quality.

    Args:
        train_dir: Training root; images are looked up under
            ``<train_dir>/<class>/images/``.
        test_dir: Test root; images are looked up under
            ``<test_dir>/images/<class>/``.
        quality: Value forwarded as ``quality`` to PIL's ``Image.save``. A
            value of 100 leaves all files untouched.
    """
    if quality == 100:
        return

    # Training images: <train_dir>/<class>/images/<file>
    for class_dir in glob.glob(os.path.join(train_dir, '*')):
        _recompress_images(glob.glob(os.path.join(class_dir, 'images/*')), quality)

    # Test images: <test_dir>/images/<class>/<file>. Iterate over the test
    # folders themselves; bounding this loop by the number of *training*
    # folders raises IndexError or skips folders when the two counts differ.
    for class_dir in glob.glob(os.path.join(test_dir, 'images/*')):
        _recompress_images(glob.glob(os.path.join(class_dir, '*')), quality)


def _recompress_images(image_paths, quality):
    """Overwrite each image file in ``image_paths`` with a copy saved at ``quality``."""
    for path in image_paths:
        # The context manager releases the file handle once the image is saved
        with Image.open(path) as image:
            image.save(path, quality=quality, optimize=True)

___
### Organize Test Set

In [None]:
# In the training folder, images are already arranged in sub folders based 
# on labels, but images in test folder (which is the dataset's validation set) are all inside a single folder. 
# Test folder comes with images folder and val_annotations txt file. 

In [None]:
# Create separate test subfolders for the test images based on
# their labels which are indicated in the val_annotations txt file
test_img_dir = os.path.join(TEST_DIR, 'images')

# Build a dictionary mapping each image filename (column 0) to its label
# (column 1) from the tab-separated val annotations file. The file is read
# inside a context manager so the handle is closed even if parsing fails.
test_img_dict = {}
with open(os.path.join(TEST_DIR, 'val_annotations.txt'), 'r') as fp:
    for line in fp:
        words = line.rstrip('\n').split('\t')
        if len(words) < 2:
            # Blank or incomplete line: there is no label to record
            continue
        test_img_dict[words[0]] = words[1]

In [None]:
# Sort the test images into one subfolder per label: the label folder is
# created the first time it is needed, and an image is moved only while it is
# still in the flat images folder (so already-moved images are skipped)
for image_name, label in test_img_dict.items():
    label_dir = os.path.join(test_img_dir, label)
    source_file = os.path.join(test_img_dir, image_name)
    if not os.path.exists(label_dir):
        os.makedirs(label_dir)
    if os.path.exists(source_file):
        os.rename(source_file, os.path.join(label_dir, image_name))

In [None]:
# Save class names as dict from words.txt file: each tab-separated line holds a
# class id (column 0) and a comma-separated list of names (column 1), of which
# only the first name is kept. The file is read inside a context manager so the
# handle is closed even if parsing fails.
class_to_name_dict = dict()
with open(os.path.join(DATA_DIR, 'words.txt'), 'r') as fp:
    for line in fp:
        words = line.strip('\n').split('\t')
        if len(words) < 2:
            # Blank or incomplete line: there is no name to record
            continue
        class_to_name_dict[words[0]] = words[1].split(',')[0]

___
### Image Pre-processing Transformations
Using PyTorch, we convert every image in the dataset into a standardized tensor format that the pre-trained models can read.

In [None]:
# Image pre-processing pipeline; the steps are applied in the order listed
preprocess_steps = [
    T.RandomHorizontalFlip(),  # random left-right flip
    T.ToTensor(),  # convert the image to a tensor
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]),  # per-channel normalization
]
preprocess_transform = T.Compose(preprocess_steps)

___
### Create dataloaders

#### Trainingset, Validationset and Testset dataloader

In [None]:
# Modify image quality and create train, validation and test sets
batch_size = 64
quality = 70
# NOTE: modify_quality re-saves the image files in place, so running this cell
# again on already-modified files compresses them a second time.
modify_quality(TRAIN_DIR, TEST_DIR, quality)
train_loader, val_loader = generate_dataloader(TRAIN_DIR, transform=preprocess_transform)

# The test set gets its own transform without the random horizontal flip, so
# that evaluating the same model twice yields the same test metrics.
test_transform = T.Compose([
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
])
testset = datasets.ImageFolder(test_img_dir, transform=test_transform)

___
### Image Classification

In [None]:
#### Define model architecture and relative Learning Rate

# EfficientNet-B6
model = models.efficientnet_b6(pretrained=True)
model.name = 'efficientnet_b6'
lr = 0.003  # Learning rate

# SqueezeNet
#model = models.squeezenet1_0(pretrained=True)
#model.name = 'squeezenet1_0'
#lr = 0.0003  # Learning rate

#VGG-16
#model = models.vgg16(pretrained=True)
#model.name = 'vgg16'
#lr = 0.0004  # Learning rate

# Adjust the final layer based on the number of classes.
# Assigning a new `model.fc` does not change the network when the model's head
# lives in `model.classifier`: it only adds an attribute that the forward pass
# never uses, so the pretrained output layer would stay in place. Instead,
# replace the last learnable layer of `model.classifier`, keeping its input
# size. (Assumes `model.classifier` is an indexable nn.Sequential, as for the
# torchvision models selectable above - TODO confirm when adding other models.)
num_classes = 200
for position in reversed(range(len(model.classifier))):
    head_layer = model.classifier[position]
    if isinstance(head_layer, nn.Linear):
        model.classifier[position] = nn.Linear(head_layer.in_features, num_classes)
        break
    if isinstance(head_layer, nn.Conv2d):
        # Convolutional output layer (presumably the SqueezeNet option - TODO confirm)
        model.classifier[position] = nn.Conv2d(head_layer.in_channels, num_classes,
                                               kernel_size=head_layer.kernel_size)
        model.num_classes = num_classes
        break
else:
    raise RuntimeError(f'No output layer found to replace in {model.name}.classifier')

# Add Dropout layers: a Dropout is inserted after every direct child of
# model.features that is an nn.Conv2d.
# NOTE(review): convolutions nested inside composite blocks are not direct
# children, so for models built from such blocks this may add nothing - confirm.
feats_list = list(model.features)
new_feats_list = []
for feat in feats_list:
    new_feats_list.append(feat)
    if isinstance(feat, nn.Conv2d):
        new_feats_list.append(nn.Dropout(p=0.5, inplace=True))

# Modify convolution layers based on the Dropouts
model.features = nn.Sequential(*new_feats_list)

# Move model to device
model = model.to(device)

# Create a .txt file for writing the results.
# The file is opened once in 'w' mode: this creates it when missing and starts
# it afresh when the cell is re-run. (Opening it with 'x' first and then again
# with 'a+' left the first handle unclosed and raised FileExistsError on every
# re-run.) The handle stays open on purpose: later cells keep writing to `f`
# and close it before the download.
f = open(f'{model.name}_{quality}.txt', 'w')
f.write(model.name)
f.write(f'\nquality:{quality}')

# Training hyperparameters and logging cadence
num_epochs = 10  # Number of epochs
log_interval = 300  # Number of iterations before logging

# Loss function and optimizer
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# pytorch-ignite engine that runs the supervised training loop
trainer = create_supervised_trainer(model, optimizer, loss_func, device=device)

# Progress bar over the training iterations, labelled with the batch loss
progress_bar = ProgressBar(persist=True)
progress_bar.attach(trainer, output_transform=lambda batch_loss: {"Batch Loss": batch_loss})

# Evaluation metrics handed to both evaluators
metrics = dict(accuracy=Accuracy(), loss=Loss(loss_func))

# Two evaluators are created because their roles differ:
# `train_evaluator` only reports metrics on the training set, whereas
# `evaluator` scores the validation set and is the one the best-model
# checkpoint is attached to.
train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)

# Announce the beginning of training
@trainer.on(Events.STARTED)
def start_message():
    """Print a one-line banner when the trainer starts."""
    print("Begin training")

# Report the loss of the current batch every `log_interval` iterations
@trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
def log_batch(trainer):
    """Print the epoch, the batch position within the epoch and the batch loss."""
    state = trainer.state
    # state.iteration counts across epochs; convert it to a 1-based batch index
    batch = (state.iteration - 1) % state.epoch_length + 1
    print(
        f"Epoch {state.epoch} / {num_epochs}, "
        f"Batch {batch} / {state.epoch_length}: "
        f"Loss: {state.output:.3f}"
    )

# Per-epoch training-set results, filled in by the handler below
train_loss = []
train_accuracy = []

# After every epoch: evaluate on the training set, store and print the metrics
@trainer.on(Events.EPOCH_COMPLETED)
def log_training_loss(trainer):
    """Run `train_evaluator` on the training loader and record loss/accuracy."""
    print(f"Epoch [{trainer.state.epoch}] - Loss: {trainer.state.output:.2f}")
    train_evaluator.run(train_loader)
    results = train_evaluator.state.metrics
    epoch_loss, epoch_accuracy = results['loss'], results['accuracy']
    train_loss.append(epoch_loss)
    train_accuracy.append(epoch_accuracy)
    print(f"Train - Loss: {epoch_loss:.3f}, "
          f"Accuracy: {epoch_accuracy:.3f} "
          )

# Per-epoch validation-set results, filled in by the handler below
val_loss = []
val_accuracy = []

# After every epoch: evaluate on the validation set, store and print the metrics
@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_loss(trainer):
    """Run `evaluator` on the validation loader and record loss/accuracy."""
    evaluator.run(val_loader)
    results = evaluator.state.metrics
    epoch_loss, epoch_accuracy = results['loss'], results['accuracy']
    val_loss.append(epoch_loss)
    val_accuracy.append(epoch_accuracy)
    print(f"Validation - Loss: {epoch_loss:.3f}, "
          f"Accuracy: {epoch_accuracy:.3f}"
          )
    # Visual separator between two epochs in the cell output
    print()
    print("-" * 60)
    print()

# Sets up checkpoint handler to save best n model(s) based on validation accuracy metric
common.save_best_model_by_val_score(
          output_path="best_models",
          evaluator=evaluator,
          model=model,
          metric_name="accuracy",
          n_saved=1,
          trainer=trainer,
          tag="val"
)

# Define a Tensorboard logger
tb_logger = TensorboardLogger(log_dir="logs")

# Attach handler to plot trainer's loss every n iterations
tb_logger.attach_output_handler(
    trainer,
    event_name=Events.ITERATION_COMPLETED(every=log_interval),
    tag="training",
    output_transform=lambda loss: {"Batch Loss": loss},
)

# Attach handler to dump evaluator's metrics every epoch completed.
# The loop variable is named `engine` so that it does not rebind the
# module-level `evaluator`, which later cells still rely on being the
# validation evaluator.
for tag, engine in [("training", train_evaluator), ("validation", evaluator)]:
    tb_logger.attach_output_handler(
        engine,
        event_name=Events.EPOCH_COMPLETED,
        tag=tag,
        metric_names="all",
        global_step_transform=global_step_from_engine(trainer),
    )


# Train for `num_epochs` passes over the training loader
trainer.run(train_loader, max_epochs=num_epochs)

# Training is over: close the Tensorboard logger
tb_logger.close()

# Write the per-epoch result series to the results file, one line per series
for series_name, series_values in (
    ('training loss', train_loss),
    ('training accuracy', train_accuracy),
    ('validation loss', val_loss),
    ('validation accuracy', val_accuracy),
):
    write_results(f, series_name, series_values)

Downloading: "https://download.pytorch.org/models/efficientnet_b6_lukemelas-c76e70fd.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b6_lukemelas-c76e70fd.pth


  0%|          | 0.00/165M [00:00<?, ?B/s]

Begin training


[1/1407]   0%|           [00:00<?]

Epoch 1 / 10, Batch 300 / 1407: Loss: 4.162
Epoch 1 / 10, Batch 600 / 1407: Loss: 3.478
Epoch 1 / 10, Batch 900 / 1407: Loss: 3.651
Epoch 1 / 10, Batch 1200 / 1407: Loss: 3.410
Epoch [1] - Loss: 4.08
Train - Loss: 3.385, Accuracy: 0.226 
Validation - Loss: 3.527, Accuracy: 0.203

------------------------------------------------------------



[1/1407]   0%|           [00:00<?]

Epoch 2 / 10, Batch 93 / 1407: Loss: 3.122
Epoch 2 / 10, Batch 393 / 1407: Loss: 3.131
Epoch 2 / 10, Batch 693 / 1407: Loss: 3.625
Epoch 2 / 10, Batch 993 / 1407: Loss: 2.913
Epoch 2 / 10, Batch 1293 / 1407: Loss: 2.723
Epoch [2] - Loss: 3.13
Train - Loss: 3.038, Accuracy: 0.288 
Validation - Loss: 3.264, Accuracy: 0.262

------------------------------------------------------------



[1/1407]   0%|           [00:00<?]

Epoch 3 / 10, Batch 186 / 1407: Loss: 2.637
Epoch 3 / 10, Batch 486 / 1407: Loss: 2.562
Epoch 3 / 10, Batch 786 / 1407: Loss: 2.833
Epoch 3 / 10, Batch 1086 / 1407: Loss: 2.684
Epoch 3 / 10, Batch 1386 / 1407: Loss: 2.536
Epoch [3] - Loss: 2.88
Train - Loss: 4.815, Accuracy: 0.324 
Validation - Loss: 5.058, Accuracy: 0.286

------------------------------------------------------------



[1/1407]   0%|           [00:00<?]

Epoch 4 / 10, Batch 279 / 1407: Loss: 2.697
Epoch 4 / 10, Batch 579 / 1407: Loss: 2.668
Epoch 4 / 10, Batch 879 / 1407: Loss: 2.880
Epoch 4 / 10, Batch 1179 / 1407: Loss: 2.165
Epoch [4] - Loss: 2.30
Train - Loss: 2.300, Accuracy: 0.444 
Validation - Loss: 2.693, Accuracy: 0.381

------------------------------------------------------------



[1/1407]   0%|           [00:00<?]

Epoch 5 / 10, Batch 72 / 1407: Loss: 1.936
Epoch 5 / 10, Batch 372 / 1407: Loss: 2.125
Epoch 5 / 10, Batch 672 / 1407: Loss: 2.102
Epoch 5 / 10, Batch 972 / 1407: Loss: 2.301
Epoch 5 / 10, Batch 1272 / 1407: Loss: 2.579
Epoch [5] - Loss: 2.84
Train - Loss: 2.144, Accuracy: 0.462 
Validation - Loss: 2.565, Accuracy: 0.386

------------------------------------------------------------



[1/1407]   0%|           [00:00<?]

Epoch 6 / 10, Batch 165 / 1407: Loss: 2.318
Epoch 6 / 10, Batch 465 / 1407: Loss: 2.338
Epoch 6 / 10, Batch 765 / 1407: Loss: 2.990
Epoch 6 / 10, Batch 1065 / 1407: Loss: 2.194
Epoch 6 / 10, Batch 1365 / 1407: Loss: 1.867
Epoch [6] - Loss: 2.98
Train - Loss: 1.863, Accuracy: 0.523 
Validation - Loss: 2.400, Accuracy: 0.422

------------------------------------------------------------



[1/1407]   0%|           [00:00<?]

Epoch 7 / 10, Batch 258 / 1407: Loss: 1.705
Epoch 7 / 10, Batch 558 / 1407: Loss: 2.169
Epoch 7 / 10, Batch 858 / 1407: Loss: 2.262
Epoch 7 / 10, Batch 1158 / 1407: Loss: 1.916
Epoch [7] - Loss: 2.65
Train - Loss: 1.853, Accuracy: 0.530 
Validation - Loss: 2.479, Accuracy: 0.420

------------------------------------------------------------



[1/1407]   0%|           [00:00<?]

Epoch 8 / 10, Batch 51 / 1407: Loss: 2.173
Epoch 8 / 10, Batch 351 / 1407: Loss: 1.913
Epoch 8 / 10, Batch 651 / 1407: Loss: 2.031
Epoch 8 / 10, Batch 951 / 1407: Loss: 2.037
Epoch 8 / 10, Batch 1251 / 1407: Loss: 1.895
Epoch [8] - Loss: 2.40
Train - Loss: 1.654, Accuracy: 0.572 
Validation - Loss: 2.372, Accuracy: 0.446

------------------------------------------------------------



[1/1407]   0%|           [00:00<?]

Epoch 9 / 10, Batch 144 / 1407: Loss: 1.403
Epoch 9 / 10, Batch 444 / 1407: Loss: 1.662
Epoch 9 / 10, Batch 744 / 1407: Loss: 1.793
Epoch 9 / 10, Batch 1044 / 1407: Loss: 2.111
Epoch 9 / 10, Batch 1344 / 1407: Loss: 1.773
Epoch [9] - Loss: 3.33
Train - Loss: 1.455, Accuracy: 0.617 
Validation - Loss: 2.269, Accuracy: 0.458

------------------------------------------------------------



[1/1407]   0%|           [00:00<?]

Epoch 10 / 10, Batch 237 / 1407: Loss: 1.786
Epoch 10 / 10, Batch 537 / 1407: Loss: 2.356
Epoch 10 / 10, Batch 837 / 1407: Loss: 1.461
Epoch 10 / 10, Batch 1137 / 1407: Loss: 1.760
Epoch [10] - Loss: 2.28
Train - Loss: 1.500, Accuracy: 0.606 
Validation - Loss: 2.406, Accuracy: 0.442

------------------------------------------------------------



In [None]:
# Final evaluation metrics (on validation set): these are the metrics stored on
# `evaluator` by its most recent run, i.e. the values of the last epoch, which
# are not necessarily the best ones reached during training.
print(evaluator.state.metrics)

{'accuracy': 0.4415, 'loss': 2.406295703125}


In [None]:
# Set the model to evaluation mode and test it using the test set.
# Inference runs under torch.no_grad() so that no autograd graph is built, and
# in batches through a DataLoader so that every test image is loaded and
# transformed exactly once. shuffle=False keeps the dataset order, so `labels`
# and `predictions` line up with the indices of `testset`.
model.eval()
labels = []
predictions = []
test_loader = DataLoader(testset, batch_size=batch_size, shuffle=False)
with torch.no_grad():
    for images, targets in test_loader:
        out = model(images.to(device))
        # Index of the highest score along the class dimension
        _, batch_predictions = torch.max(out, 1)
        labels.extend(targets.tolist())
        predictions.extend(batch_predictions.tolist())

In [None]:
# Compute the test metrics (precision, recall and F1 are macro-averaged) and
# append each of them to the results file as a one-value line
test_accuracy = accuracy_score(labels, predictions)
test_precision, test_recall, test_f1 = (
    score(labels, predictions, average='macro')
    for score in (precision_score, recall_score, f1_score)
)
# NOTE(review): 'test recal' looks like a typo for 'test recall'; it is kept
# unchanged because it is the key written to the results file.
for metric_name, metric_value in (
    ('test accuracy', test_accuracy),
    ('test precision', test_precision),
    ('test recal', test_recall),
    ('test f1', test_f1),
):
    write_results(f, metric_name, [metric_value])

In [None]:
# Close the results file so that all pending writes reach it, then download it
f.close()
results_txt_path = f'{model.name}_{quality}.txt'
files.download(results_txt_path)

In [None]:
# Create a dataframe from the labels and predictions, save it as csv and download it
predictions_csv_path = f'{model.name}_{quality}.csv'
df = pd.DataFrame({'Labels' : labels, 'Predictions' : predictions})
df.to_csv(predictions_csv_path)
files.download(predictions_csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>