## Planet four image classification

In [3]:
!tar -xvf  '/content/gdrive/MyDrive/EE514(Data-Analysis-&-Machine-Learning)/data/PlanetFour/planetfour.tar' -C '/content/gdrive/MyDrive/EE514(Data-Analysis-&-Machine-Learning)/data/PlanetFour/split/'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
planetfour/test/APF0000st7.jpg
planetfour/test/APF0000pvc.jpg
planetfour/test/APF0000k67.jpg
planetfour/test/APF00006kf.jpg
planetfour/test/APF0000kb3.jpg
planetfour/test/APF0000vc8.jpg
planetfour/test/APF0000tif.jpg
planetfour/test/APF0000dmm.jpg
planetfour/test/APF00000pz.jpg
planetfour/test/APF0000jde.jpg
planetfour/test/APF0000frq.jpg
planetfour/test/APF000056k.jpg
planetfour/test/APF00007rd.jpg
planetfour/test/APF0000jy6.jpg
planetfour/test/APF00009e6.jpg
planetfour/test/APF0000fnw.jpg
planetfour/test/APF00000bw.jpg
planetfour/test/APF0000cox.jpg
planetfour/test/APF00005iu.jpg
planetfour/test/APF0000i4k.jpg
planetfour/test/APF00004b1.jpg
planetfour/test/APF0000qkg.jpg
planetfour/test/APF000080p.jpg
planetfour/test/APF0000kxl.jpg
planetfour/test/APF0000kd1.jpg
planetfour/test/APF0000ivh.jpg
planetfour/test/APF00006mz.jpg
planetfour/test/APF0000ffs.jpg
planetfour/test/APF0000wpl.jpg
planetfour/test/APF00002vv.jpg
plane

In [4]:
!ls '/content/gdrive/MyDrive/EE514(Data-Analysis-&-Machine-Learning)/data/PlanetFour/split/planetfour'

test  test.csv	train  train.csv  valid  valid.csv


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
import sklearn.metrics as metrics
import tqdm

from torch.utils.data import DataLoader
from torchvision.datasets.folder import pil_loader
from pathlib import Path
from PIL import Image

from google.colab import drive, files

Change the device to "cpu" if you want to train on a CPU instead of a GPU.

In [None]:
device = 'cuda'

In [2]:
drive.mount("/content/gdrive", force_remount=True)
data_dir = '/content/gdrive/MyDrive/EE514(Data-Analysis-&-Machine-Learning)/data/PlanetFour/split/planetfour'

Mounted at /content/gdrive


In [None]:
# Print num samples in each set
train_dir = f"{data_dir}/train"
valid_dir = f"{data_dir}/valid"
test_dir = f"{data_dir}/test"

print(f"Training samples: {len(os.listdir(train_dir))}")
print(f"Validation samples: {len(os.listdir(valid_dir))}")
print(f"Test samples: {len(os.listdir(test_dir))}")

Training samples: 23299
Validation samples: 0
Test samples: 270


## Dataset

Here we define a custom Dataset object for the Planet Four data. You can read more about this in the PyTorch documentation: https://pytorch.org/tutorials/beginner/basics/data_tutorial.html

In [None]:
class PlanetFourDataset(object):
    def __init__(self, split='train', transform=None, loader=pil_loader):
        super().__init__()
        self.split = split
        self.base_dir = Path(data_dir)
        self.image_dir = f"{self.base_dir}/{self.split}"
        self.labels_file = f"{self.base_dir}/{(split + '.csv')}"
        self.labels_df = pd.read_csv(self.labels_file)
        self.transform = transform
        self.loader = loader
        
    def __getitem__(self, index):
        row = self.labels_df.iloc[index]
        filename = f"{self.image_dir}/{(row.tile_id + '.jpg')}"
        fans = int(row.fans)
        blotches = int(row.blotches)
        image = self.loader(str(filename))
        if self.transform is not None:
            image = self.transform(image)
        return image, torch.tensor([fans, blotches], dtype=torch.float32)
    
    def __len__(self):
        return len(self.labels_df)

## Data augmentation

It is standard practice in deep learning to augment the training examples to prevent the network from overfitting. Here I use some standard augmentations such as randomly mirroring the images.

In [None]:
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))                  
])

valid_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) 
])

## Data loaders

In PyTorch, the data loaders take care of spinning up threads to load batches of data into memory from the dataset object.

In [None]:
train_set = PlanetFourDataset('train', transform=train_transform)
valid_set = PlanetFourDataset('valid', transform=train_transform)

train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=64, shuffle=False)

## Load a pretrained model

Here we'll use ResNet50 model that has been pretrained on ImageNet and replace the final layer with a new one suited to our problem.

In [None]:
model = models.resnet50(pretrained=True)
model.fc = nn.Linear(2048, 2)
model.to(device);

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

## Loss

Images can contain fans, blotches, both, or neither. You could treat this as a four class softmax problem, or two binary classification problems. Here I take the latter approach and use a binary cross entropy loss. 

In [None]:
criterion = nn.BCEWithLogitsLoss()

## Optimizer

Stochastic gradient descent with momentum

In [None]:
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-4)

## Training and validation functions

In [None]:
avg_train_losses = []
avg_valid_losses = []
valid_accuracies = []


def train_for_epoch(optimizer):
    model.train()

    train_losses = []

    for batch, target in tqdm.tqdm(train_loader):

        # data to GPU
        batch = batch.to(device)
        target = target.to(device)

        # reset optimizer
        optimizer.zero_grad()

        # forward pass
        predictions = model(batch)
        #breakpoint()

        # calculate loss
        loss = criterion(predictions, target)

        # backward pass
        loss.backward()

        # parameter update
        optimizer.step()

        # track loss
        train_losses.append(float(loss.item()))

    train_losses = np.array(train_losses)
    return train_losses


def validate():
    model.eval()

    valid_losses = []
    y_true, y_prob = [], []

    with torch.no_grad():
        for batch, target in valid_loader:

            # move data to the device
            batch = batch.to(device)
            target = target.to(device)

            # make predictions
            predictions = model(batch)

            # calculate loss
            loss = criterion(predictions, target)
            
            # logits -> probabilities
            torch.sigmoid_(predictions)

            # track losses and predictions
            valid_losses.append(float(loss.item()))
            y_true.extend(target.cpu().numpy())
            y_prob.extend(predictions.cpu().numpy())
            
    y_true = np.array(y_true)
    y_prob = np.array(y_prob)
    y_pred = y_prob > 0.5
    valid_losses = np.array(valid_losses)

    # calculate validation accuracy from y_true and y_pred
    fan_accuracy = metrics.accuracy_score(y_true[:,0], y_pred[:,0])
    blotch_accuracy = metrics.accuracy_score(y_true[:,1], y_pred[:,1])
    exact_accuracy = np.all(y_true == y_pred, axis=1).mean()

    # calculate the mean validation loss
    valid_loss = valid_losses.mean()

    return valid_loss, fan_accuracy, blotch_accuracy, exact_accuracy


def train(epochs, first_epoch=1):
    for epoch in range(first_epoch, epochs+first_epoch):

        # train
        train_loss = train_for_epoch(optimizer)

        # validation
        valid_loss, fan_accuracy, blotch_accuracy, both_accuracy = validate()
        print(f'[{epoch:02d}] train loss: {train_loss.mean():0.04f}  '
              f'valid loss: {valid_loss:0.04f}  ',
              f'fan acc: {fan_accuracy:0.04f}  ',
              f'blotch acc: {blotch_accuracy:0.04f}  ',
              f'both acc: {both_accuracy:0.04f}'
        )
        
        # update learning curves
        avg_train_losses.append(train_loss.mean())
        avg_valid_losses.append(valid_loss)
        valid_accuracies.append((fan_accuracy, blotch_accuracy, both_accuracy))
        
        # save checkpoint
        #torch.save(model, f'checkpoints/baseline_{epoch:03d}.pkl')

## Constant classifier accuracy

Evaluate how accurate would a $f(x) = \text{"most common class"}$ classifier be? 

In [None]:
def constant_clf_accuracy():
    y_true, y_pred = [], []
    with torch.no_grad():
        for _, target in valid_loader:
            y_true.extend(target.cpu().numpy())
            y_pred.extend(np.ones((target.shape[0], 2), dtype=np.float32))
            
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
        
    # calculate validation accuracy from y_true and y_pred
    f = metrics.accuracy_score(y_true[:,0], y_pred[:,0])
    b = metrics.accuracy_score(y_true[:,1], y_pred[:,1])
    t = np.all(y_true == y_pred, axis=1).mean()
    print(f'fan: {f}  blotch: {b}  both: {t}')

In [None]:
constant_clf_accuracy()

FileNotFoundError: ignored

## Train the model
Call the ``train(n)`` function to train for ``n`` epochs.

In [None]:
train(5)