In [1]:
### let's get some of the necessary libraries in here

import sys
import os
import torch
from torch import nn
import torch.nn.functional as F
import torchvision.transforms as transforms  
import torchvision
import torch.nn as nn
import os
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import time

import cv2
import pandas as pd
import torchvision.transforms as transform 
from torchvision.transforms import ToTensor, Normalize, Resize
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torch.autograd import Variable
from torchvision.datasets import ImageFolder
from torch.utils.data import random_split

In [2]:
dataTrain = "/Users/John/Documents/allProjects/intelData/seg_train/seg_train" 
dataTest = "/Users/John/Documents/allProjects/intelData/seg_test/seg_test"
dataPred = "/Users/John/Documents/allProjects/intelData/seg_pred/seg_pred"

## the folders in the training data give us an easy list of labels
labels = os.listdir(dataTrain)

In [3]:
labels

['forest', 'buildings', 'glacier', 'street', 'mountain', 'sea']

### Normalizing image tensors
This process is important because it helps minimize the impact of image brightness and contrast amongst all images. Unless the images are taken school yearbook style - same location, lighting, camera, etc. - then there will inevitably be some differences among the images.

In [4]:
trainTemp = ImageFolder(dataTrain, transform = transforms.Compose([
    transforms.Resize(64),
    transforms.RandomCrop(64),
    transforms.ToTensor(),
]))
trainDL = DataLoader(trainTemp, 64, shuffle=True)

In [5]:
## do this to extract a single image
for (image, label) in list(enumerate(trainDL))[:1]:
    print(image)

0
0


Our dimensions are [64, 3, 64, 64] so we only need the means and sds for positions 0, 2, 3. We can discared 1 via the dim argument

In [6]:
label[0].size()

torch.Size([64, 3, 64, 64])

In [7]:
def calculateMeanSD(DL):
    """
    This function will calculate the mean and sum of squared mean for the data in a
    DataLoader. I adjusted it specifically to this dataset via the dim argument, skipping
    index 1 because that was not the data required. This function may require other adjustments
    for other datasets.
    """

    ## initialize the three variables as zero
    runningSum, sumSquared, batches = 0,0,0
    
    ## extract the data from the DataLoader and calculate the sums and sum squared
    for data, label in DL:
        runningSum += torch.mean(data, dim = ([0,2,3]))
        sumSquared += torch.mean(data**2, dim = ([0,2,3]))
        batches += 1

    ## simple calcs of     
    mean = runningSum/batches
    std = (sumSquared/batches - mean**2)**0.5
    return mean,std

In [8]:
mean, sd  = calculateMeanSD(trainDL)

Our means and standard deviations are below. This will inform the normalize step of any transformations within the data.

### Augmenting training data
The key here is to add in some randomness so that the CNN detects changes in the image. CNNs are shift invariant, meaning they will detect key details regardless of the position, yet they would struggle if I flipped, cropped, stretched, etc. the images. As such, we'll add in some of that randomness to help the CNN perform better on images with unique traits.

To do this I, resize, add a random crop, a random color jitter, and a random horizontal flip. This is quite a bit of randomness, which should introduce the CNN to a lot of different varieties. I'll likely want to crank the epochs to make sure the CNN "sees" everything.

In [9]:
## we'll do some augmentation on the training data
trainTransform = transform.Compose([
    transform.Resize((64,64)),
    transform.RandomCrop((64,64)),
    transforms.ColorJitter(0.3,0.4,0.4,0.2),
    transform.RandomHorizontalFlip(), ## default is p = 0.5
    transform.ToTensor(),
    transform.Normalize((mean[0],mean[1],mean[2]), (sd[0], sd[1], sd[2]))
])

We do not augment test data because we are evaluating the model's ability to correctly identify the images as opposed to preparing it to identify key attributes anywhere.

In [10]:
testTransform = transform.Compose([
    transform.Resize((64,64)),
    transform.ToTensor(),
    transform.Normalize((mean[0],mean[1],mean[2]), (sd[0], sd[1], sd[2]))
])

In [11]:
train = ImageFolder(dataTrain, transform = trainTransform)
test = ImageFolder(dataTest, transform = testTransform)

## Modeling

In [12]:
seedA = 33
torch.manual_seed(seedA);

In [13]:
## set the sizes and train test split the data
train_size = int(0.8*len(train))
val_size = int(len(train) - train_size)
print(train_size, val_size)

trainData, valData = random_split(train, [train_size, val_size])

11227 2807
11227 2807


Set parameters and the device. We'd like to use my gpu if it is available. If it is not, we can use the CPU.

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 64
classes = len(labels)
learning_rate = 0.001
epochs = 50

In [15]:
## define the dataloaders
train_dl = DataLoader(trainData, batch_size, shuffle=True, num_workers=2, pin_memory=True)
valid_dl = DataLoader(valData, batch_size*2, num_workers=2, pin_memory=True)
test_dl = DataLoader(test, batch_size*2, num_workers=2, pin_memory=True)

### Definine the Neural Network

I'm going to do one bespoke CNN and one pretrained model for comparison purposes. To build a bespoke NN with pytorch we need to define the class with nn.Module. 

In [16]:
class BespokeCNN(nn.Module):
    """
    This is an extremely basic CNN, I doubt it will perform as well as the
    pretrained models.
    """
	## Decide upon some layers within the network
    def __init__(self, classes):
        super(BespokeCNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3)
        self.max_pool1 = nn.MaxPool2d(kernel_size = 2, stride = 2)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
        self.max_pool2 = nn.MaxPool2d(kernel_size = 2, stride = 2)

        ## fully connected layers with Softmax as the activation function
        ## Using softmax because this is a multi-classification problem
        self.fc1 = nn.Linear(12544, 128)
        self.fc2 = nn.Linear(128, classes)
        self.softmax = nn.Softmax(dim = -1)
        
    
    ## defines how the data moves across the layers
    def forward(self,x):
        forw = self.conv1(x)
        forw = self.max_pool1(forw)
        
        forw = self.conv2(forw)
        forw = self.max_pool2(forw)
                
        forw = forw.reshape(forw.size(0), -1)
        
        forw = self.fc1(forw)
        forw = self.fc2(forw)
        forw = self.softmax(forw)
        

        return forw

In [28]:
## define the model from the class we built
model = BespokeCNN(classes)

In [22]:
def trainModel(model, epochs, train_dl, valid_dl, learning_rate):
    """
    This function will train the provided model printing the accuracy for the validation data.
    It takes the model, number of epochs, and two data loaders and then runs the model through the 
    training process. If we want to change the loss function or optimizer, we can in the code below
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 0.001, momentum = 0.9)  
    #total_step = len(train_dl)


    for epoch in range(epochs):
        start = time.time()
        #Load in the data in batches using the train_loader object
        for i, (images, labels) in enumerate(train_dl):  
            # Move tensors to the configured device
            images = images.to(device)
            labels = labels.to(device)
            
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        ## let's monitor the prediction accuracy on the validation data for each epoch
        ## this may indicate a good stopping point for the particular neural network
        with torch.no_grad():
            correct = 0
            total = 0
            for images, labels in valid_dl:
                images = images.to(device)
                labels = labels.to(device)
                outputs = model(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
            print('Accuracy on {} validation images: {:.4f} %'.format(val_size, 100 * correct / total))

        end = time.time()
        print('Epoch [{}/{}], Loss: {:.4f}, Time in Seconds: {:.4f}'.format(epoch+1, epochs, loss.item(), end - start))

    return model

In [24]:
modelOut = trainModel(model, 25, train_dl, valid_dl, learning_rate) 

Accuracy on 2807 validation images: 52.7253 %
Epoch [1/25], Loss: 1.5926, Time in Seconds: 24.4088
Accuracy on 2807 validation images: 52.7253 %
Epoch [1/25], Loss: 1.5926, Time in Seconds: 24.4088


Accuracy on 2807 validation images: 54.9341 %
Epoch [2/25], Loss: 1.5307, Time in Seconds: 23.9522
Accuracy on 2807 validation images: 54.9341 %
Epoch [2/25], Loss: 1.5307, Time in Seconds: 23.9522


Accuracy on 2807 validation images: 54.9341 %
Epoch [3/25], Loss: 1.5617, Time in Seconds: 23.7382
Accuracy on 2807 validation images: 54.9341 %
Epoch [3/25], Loss: 1.5617, Time in Seconds: 23.7382


Accuracy on 2807 validation images: 55.8603 %
Epoch [4/25], Loss: 1.4687, Time in Seconds: 23.8442
Accuracy on 2807 validation images: 55.8603 %
Epoch [4/25], Loss: 1.4687, Time in Seconds: 23.8442


Accuracy on 2807 validation images: 56.5372 %
Epoch [5/25], Loss: 1.3823, Time in Seconds: 23.3743
Accuracy on 2807 validation images: 56.5372 %
Epoch [5/25], Loss: 1.3823, Time in Seconds: 23.3743


Accuracy on 2807 validation images: 56.1454 %
Epoch [6/25], Loss: 1.3027, Time in Seconds: 23.8263
Accuracy on 2807 validation images: 56.1454 %
Epoch [6/25], Loss: 1.3027, Time in Seconds: 23.8263


Accuracy on 2807 validation images: 57.3922 %
Epoch [7/25], Loss: 1.4306, Time in Seconds: 23.6338
Accuracy on 2807 validation images: 57.3922 %
Epoch [7/25], Loss: 1.4306, Time in Seconds: 23.6338


Accuracy on 2807 validation images: 59.1022 %
Epoch [8/25], Loss: 1.5030, Time in Seconds: 23.8514
Accuracy on 2807 validation images: 59.1022 %
Epoch [8/25], Loss: 1.5030, Time in Seconds: 23.8514


Accuracy on 2807 validation images: 58.4966 %
Epoch [9/25], Loss: 1.2865, Time in Seconds: 23.8525
Accuracy on 2807 validation images: 58.4966 %
Epoch [9/25], Loss: 1.2865, Time in Seconds: 23.8525


Accuracy on 2807 validation images: 58.3185 %
Epoch [10/25], Loss: 1.4515, Time in Seconds: 23.5675
Accuracy on 2807 validation images: 58.3185 %
Epoch [10/25], Loss: 1.4515, Time in Seconds: 23.5675


Accuracy on 2807 validation images: 60.5629 %
Epoch [11/25], Loss: 1.2663, Time in Seconds: 23.7524
Accuracy on 2807 validation images: 60.5629 %
Epoch [11/25], Loss: 1.2663, Time in Seconds: 23.7524


Accuracy on 2807 validation images: 59.6366 %
Epoch [12/25], Loss: 1.4705, Time in Seconds: 23.5645
Accuracy on 2807 validation images: 59.6366 %
Epoch [12/25], Loss: 1.4705, Time in Seconds: 23.5645


Accuracy on 2807 validation images: 60.7054 %
Epoch [13/25], Loss: 1.4525, Time in Seconds: 23.9887
Accuracy on 2807 validation images: 60.7054 %
Epoch [13/25], Loss: 1.4525, Time in Seconds: 23.9887


Accuracy on 2807 validation images: 62.0235 %
Epoch [14/25], Loss: 1.4016, Time in Seconds: 23.7018
Accuracy on 2807 validation images: 62.0235 %
Epoch [14/25], Loss: 1.4016, Time in Seconds: 23.7018


Accuracy on 2807 validation images: 60.4560 %
Epoch [15/25], Loss: 1.4029, Time in Seconds: 23.4981
Accuracy on 2807 validation images: 60.4560 %
Epoch [15/25], Loss: 1.4029, Time in Seconds: 23.4981


Accuracy on 2807 validation images: 62.5579 %
Epoch [16/25], Loss: 1.4704, Time in Seconds: 23.9585
Accuracy on 2807 validation images: 62.5579 %
Epoch [16/25], Loss: 1.4704, Time in Seconds: 23.9585


Accuracy on 2807 validation images: 63.2348 %
Epoch [17/25], Loss: 1.3993, Time in Seconds: 23.6773
Accuracy on 2807 validation images: 63.2348 %
Epoch [17/25], Loss: 1.3993, Time in Seconds: 23.6773


Accuracy on 2807 validation images: 62.7360 %
Epoch [18/25], Loss: 1.4288, Time in Seconds: 23.7531
Accuracy on 2807 validation images: 62.7360 %
Epoch [18/25], Loss: 1.4288, Time in Seconds: 23.7531


Accuracy on 2807 validation images: 64.8735 %
Epoch [19/25], Loss: 1.3159, Time in Seconds: 23.6075
Accuracy on 2807 validation images: 64.8735 %
Epoch [19/25], Loss: 1.3159, Time in Seconds: 23.6075


Accuracy on 2807 validation images: 63.3060 %
Epoch [20/25], Loss: 1.3714, Time in Seconds: 23.4025
Accuracy on 2807 validation images: 63.3060 %
Epoch [20/25], Loss: 1.3714, Time in Seconds: 23.4025


Accuracy on 2807 validation images: 64.0542 %
Epoch [21/25], Loss: 1.4479, Time in Seconds: 24.8049
Accuracy on 2807 validation images: 64.0542 %
Epoch [21/25], Loss: 1.4479, Time in Seconds: 24.8049


Accuracy on 2807 validation images: 63.1635 %
Epoch [22/25], Loss: 1.3894, Time in Seconds: 24.2381
Accuracy on 2807 validation images: 63.1635 %
Epoch [22/25], Loss: 1.3894, Time in Seconds: 24.2381


Accuracy on 2807 validation images: 64.7310 %
Epoch [23/25], Loss: 1.4283, Time in Seconds: 23.6714
Accuracy on 2807 validation images: 64.7310 %
Epoch [23/25], Loss: 1.4283, Time in Seconds: 23.6714


Accuracy on 2807 validation images: 63.4129 %
Epoch [24/25], Loss: 1.3788, Time in Seconds: 24.1553
Accuracy on 2807 validation images: 63.4129 %
Epoch [24/25], Loss: 1.3788, Time in Seconds: 24.1553


Accuracy on 2807 validation images: 65.3723 %
Epoch [25/25], Loss: 1.3588, Time in Seconds: 24.7434
Accuracy on 2807 validation images: 65.3723 %
Epoch [25/25], Loss: 1.3588, Time in Seconds: 24.7434


At 25 epochs the loss function is headed downwards and the prediction accuracy is increasing. This suggests more training cycles could be helpful. I'm going to re-run with 50 to see what I get.

In [29]:
modelOut = trainModel(model, 50, train_dl, valid_dl, learning_rate) 

Accuracy on 2807 validation images: 42.7146 %
Epoch [1/50], Loss: 1.6391, Time in Seconds: 25.3216
Accuracy on 2807 validation images: 42.7146 %
Epoch [1/50], Loss: 1.6391, Time in Seconds: 25.3216


Accuracy on 2807 validation images: 48.8422 %
Epoch [2/50], Loss: 1.5251, Time in Seconds: 24.1999
Accuracy on 2807 validation images: 48.8422 %
Epoch [2/50], Loss: 1.5251, Time in Seconds: 24.1999


Accuracy on 2807 validation images: 52.6185 %
Epoch [3/50], Loss: 1.3587, Time in Seconds: 23.9948
Accuracy on 2807 validation images: 52.6185 %
Epoch [3/50], Loss: 1.3587, Time in Seconds: 23.9948


Accuracy on 2807 validation images: 52.9035 %
Epoch [4/50], Loss: 1.6053, Time in Seconds: 24.3778
Accuracy on 2807 validation images: 52.9035 %
Epoch [4/50], Loss: 1.6053, Time in Seconds: 24.3778


Accuracy on 2807 validation images: 54.0791 %
Epoch [5/50], Loss: 1.5078, Time in Seconds: 23.8178
Accuracy on 2807 validation images: 54.0791 %
Epoch [5/50], Loss: 1.5078, Time in Seconds: 23.8178


Accuracy on 2807 validation images: 54.9341 %
Epoch [6/50], Loss: 1.5962, Time in Seconds: 23.9399
Accuracy on 2807 validation images: 54.9341 %
Epoch [6/50], Loss: 1.5962, Time in Seconds: 23.9399


Accuracy on 2807 validation images: 54.6847 %
Epoch [7/50], Loss: 1.5428, Time in Seconds: 23.8549
Accuracy on 2807 validation images: 54.6847 %
Epoch [7/50], Loss: 1.5428, Time in Seconds: 23.8549


Accuracy on 2807 validation images: 56.6797 %
Epoch [8/50], Loss: 1.5144, Time in Seconds: 23.5247
Accuracy on 2807 validation images: 56.6797 %
Epoch [8/50], Loss: 1.5144, Time in Seconds: 23.5247


Accuracy on 2807 validation images: 56.1454 %
Epoch [9/50], Loss: 1.5776, Time in Seconds: 23.7885
Accuracy on 2807 validation images: 56.1454 %
Epoch [9/50], Loss: 1.5776, Time in Seconds: 23.7885


Accuracy on 2807 validation images: 56.7510 %
Epoch [10/50], Loss: 1.3387, Time in Seconds: 24.4587
Accuracy on 2807 validation images: 56.7510 %
Epoch [10/50], Loss: 1.3387, Time in Seconds: 24.4587


Accuracy on 2807 validation images: 57.8197 %
Epoch [11/50], Loss: 1.3349, Time in Seconds: 24.1277
Accuracy on 2807 validation images: 57.8197 %
Epoch [11/50], Loss: 1.3349, Time in Seconds: 24.1277


This model stinks, as expected. The network is very shallow and skips some key steps. It is now time to move on to work that is reputable and researched.

Unless you are conducting research in the neural network realm to improve performance, building a bespoke model probably isn't a great idea. Sticking to tried and true methods will almost always be the best way to go. Due to the No Free Lunch Theorem, it makes the most sense to try many models, some of which are built into pytorch.

Next, I'm going to implement ResNet9. The literature on the model showed a 92% accuracy on the dataset I'm using - that's pretty good. There are some complex elements to it, so there will be quite a bit of handwritten coding to implement this, which is fine. The idea is that this code will work for any image dataset that I choose even though I'm using the one that this model was initially trained on.

## ResNet9

Now that our exceptionally poor bespoke CNN has run, we are going to mix things up and go with a well-researched approach: ResNet9. I chose this because the researchers applied this model to the dataset I'm working with, as such, I'm replicating the paper itself. It's important to note that I need to implement the training and validation procedure because they dynamically adjust the learning rate, as well as the architecture of the neural net itself.

In [None]:
### This block will define a class that allows me to train and validate the images as we make predictions through our epochs

class ImageNet(nn.Module):
    def training(self, batch):
        """
        Takes the images and self, trains the model, calculates the loss and returns it
        """
        
        imgs, labs = batch
        output = self(imgs)
        loss = nn.CrossEntropyLoss(output, labs)
        return loss

    def validation(self, batch):
        """
        Similar to training except we all calculate the accuracy.
        """
        imgs, labs = batch
        output = self(imgs)
        loss = nn.CrossEntropyLoss(output,labs)
        ### position 1 tells us the index for the largest predicted value
        preds = torch.max(outputs, dim = 1)
        acc = torch.tensor(torch.sum(preds[1] == labels).item() / len(preds))
    return loss, acc
