# Page Turner

### Load the necessary packages

In [1]:
import os, sys
from os import listdir
from os.path import isfile, join
import seaborn as sns
from collections import defaultdict
import shutil

# PyTorch
import torchvision
from torchvision import transforms, datasets, models
import torch
from torch import optim, cuda
from torch.utils.data import DataLoader, sampler
import torch.nn as nn

from torch_lr_finder import LRFinder

# Data science tools
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, roc_auc_score
from sklearn.preprocessing import LabelBinarizer

# Image manipulations
import PIL.Image
# Timing utility
from timeit import default_timer as timer

# Visualizations
import matplotlib.pyplot as plt
from IPython.display import display, Image
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

import warnings
warnings.filterwarnings('ignore')

import itertools
from itertools import cycle

  pip install torch-lr-finder -v --global-option="amp"


In [94]:
class VideoRecord(object):
    """
    Helper class for class VideoFrameDataset. This class
    represents a video sample's metadata.
    Args:
        root_datapath: the system path to the root folder
                       of the videos.
        row: A list with four or more elements where 1) The first
             element is the path to the video sample's frames excluding
             the root_datapath prefix 2) The  second element is the starting frame id of the video
             3) The third element is the inclusive ending frame id of the video
             4) The fourth element is the label index.
             5) any following elements are labels in the case of multi-label classification
    """
    def __init__(self, row, root_datapath):
        self._data = row
        self._path = os.path.join(root_datapath, row[0])


    @property
    def path(self):
        return self._path

    @property
    def num_frames(self):
        return self.end_frame - self.start_frame + 1  # +1 because end frame is inclusive
    @property
    def start_frame(self):
        return int(self._data[1])

    @property
    def end_frame(self):
        return int(self._data[2])

    @property
    def label(self):
        # just one label_id
        if len(self._data) == 4:
            return int(self._data[3])
        # sample associated with multiple labels
        else:
            return [int(label_id) for label_id in self._data[3:]]

class VideoFrameDataset(torch.utils.data.Dataset):
    r"""
    A highly efficient and adaptable dataset class for videos.
    Instead of loading every frame of a video,
    loads x RGB frames of a video (sparse temporal sampling) and evenly
    chooses those frames from start to end of the video, returning
    a list of x PIL images or ``FRAMES x CHANNELS x HEIGHT x WIDTH``
    tensors where FRAMES=x if the ``ImglistToTensor()``
    transform is used.
    More specifically, the frame range [START_FRAME, END_FRAME] is divided into NUM_SEGMENTS
    segments and FRAMES_PER_SEGMENT consecutive frames are taken from each segment.
    Note:
        A demonstration of using this class can be seen
        in ``demo.py``
        https://github.com/RaivoKoot/Video-Dataset-Loading-Pytorch
    Note:
        This dataset broadly corresponds to the frame sampling technique
        introduced in ``Temporal Segment Networks`` at ECCV2016
        https://arxiv.org/abs/1608.00859.
    Note:
        This class relies on receiving video data in a structure where
        inside a ``ROOT_DATA`` folder, each video lies in its own folder,
        where each video folder contains the frames of the video as
        individual files with a naming convention such as
        img_001.jpg ... img_059.jpg.
        For enumeration and annotations, this class expects to receive
        the path to a .txt file where each video sample has a row with four
        (or more in the case of multi-label, see README on Github)
        space separated values:
        ``VIDEO_FOLDER_PATH     START_FRAME      END_FRAME      LABEL_INDEX``.
        ``VIDEO_FOLDER_PATH`` is expected to be the path of a video folder
        excluding the ``ROOT_DATA`` prefix. For example, ``ROOT_DATA`` might
        be ``home\data\datasetxyz\videos\``, inside of which a ``VIDEO_FOLDER_PATH``
        might be ``jumping\0052\`` or ``sample1\`` or ``00053\``.
    Args:
        root_path: The root path in which video folders lie.
                   this is ROOT_DATA from the description above.
        annotationfile_path: The .txt annotation file containing
                             one row per video sample as described above.
        num_segments: The number of segments the video should
                      be divided into to sample frames from.
        frames_per_segment: The number of frames that should
                            be loaded per segment. For each segment's
                            frame-range, a random start index or the
                            center is chosen, from which frames_per_segment
                            consecutive frames are loaded.
        imagefile_template: The image filename template that video frame files
                            have inside of their video folders as described above.
        transform: Transform pipeline that receives a list of PIL images/frames.
        random_shift: Whether the frames from each segment should be taken
                      consecutively starting from the center of the segment, or
                      consecutively starting from a random location inside the
                      segment range.
        test_mode: Whether this is a test dataset. If so, chooses
                   frames from segments with random_shift=False.
    """
    def __init__(self,
                 root_path: str,
                 annotationfile_path: str,
                 num_segments: int = 3,
                 frames_per_segment: int = 1,
                 imagefile_template: str='img_{:05d}.jpg',
                 transform = None,
                 random_shift: bool = True,
                 test_mode: bool = False):
        super(VideoFrameDataset, self).__init__()

        self.root_path = root_path
        self.annotationfile_path = annotationfile_path
        self.num_segments = num_segments
        self.frames_per_segment = frames_per_segment
        self.imagefile_template = imagefile_template
        self.transform = transform
        self.random_shift = random_shift
        self.test_mode = test_mode

        self._parse_list()

    def _load_image(self, directory, idx):
        return [PIL.Image.open(os.path.join(directory, self.imagefile_template.format(idx))).convert('RGB')]

    def _parse_list(self):
        self.video_list = [VideoRecord(x.strip().split(' '), self.root_path) for x in open(self.annotationfile_path)]

    def _sample_indices(self, record):
        """
        For each segment, chooses an index from where frames
        are to be loaded from.
        Args:
            record: VideoRecord denoting a video sample.
        Returns:
            List of indices of where the frames of each
            segment are to be loaded from.
        """

        segment_duration = (record.num_frames - self.frames_per_segment + 1) // self.num_segments
        if segment_duration > 0:
            offsets = np.multiply(list(range(self.num_segments)), segment_duration) + np.random.randint(segment_duration, size=self.num_segments)

        # edge cases for when a video has approximately less than (num_frames*frames_per_segment) frames.
        # random sampling in that case, which will lead to repeated frames.
        else:
            offsets = np.sort(np.random.randint(record.num_frames, size=self.num_segments))

        return offsets

    def _get_val_indices(self, record):
        """
        For each segment, finds the center frame index.
        Args:
            record: VideoRecord denoting a video sample.
        Returns:
             List of indices of segment center frames.
        """
        if record.num_frames > self.num_segments + self.frames_per_segment - 1:
            offsets = self._get_test_indices(record)

        # edge case for when a video does not have enough frames
        else:
            offsets = np.sort(np.random.randint(record.num_frames, size=self.num_segments))

        return offsets

    def _get_test_indices(self, record):
        """
        For each segment, finds the center frame index.
        Args:
            record: VideoRecord denoting a video sample
        Returns:
            List of indices of segment center frames.
        """

        tick = (record.num_frames - self.frames_per_segment + 1) / float(self.num_segments)

        offsets = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segments)])

        return offsets

    def __getitem__(self, index):
        """
        For video with id index, loads self.NUM_SEGMENTS * self.FRAMES_PER_SEGMENT
        frames from evenly chosen locations.
        Args:
            index: Video sample index.
        Returns:
            a list of PIL images or the result
            of applying self.transform on this list if
            self.transform is not None.
        """
        record = self.video_list[index]

        if not self.test_mode:
            segment_indices = self._sample_indices(record) if self.random_shift else self._get_val_indices(record)
        else:
            segment_indices = self._get_test_indices(record)

        return self._get(record, segment_indices)

    def _get(self, record, indices):
        """
        Loads the frames of a video at the corresponding
        indices.
        Args:
            record: VideoRecord denoting a video sample.
            indices: Indices at which to load video frames from.
        Returns:
            1) A list of PIL images or the result
            of applying self.transform on this list if
            self.transform is not None.
            2) An integer denoting the video label.
        """

        indices = indices + record.start_frame
        images = list()
        image_indices = list()
        for seg_ind in indices:
            frame_index = int(seg_ind)
            for i in range(self.frames_per_segment):
                seg_img = self._load_image(record.path, frame_index)
                images.extend(seg_img)
                image_indices.append(frame_index)
                if frame_index < record.end_frame:
                    frame_index += 1

        # sort images by index in case of edge cases where segments overlap each other because the overall
        # video is too short for num_segments*frames_per_segment indices.
        # _, images = (list(sorted_list) for sorted_list in zip(*sorted(zip(image_indices, images))))

        if self.transform is not None:
            images = self.transform(images)


        images = torch.transpose(images, 0, 1)
        print(images.shape)
        return images, record.label

    def __len__(self):
        return len(self.video_list)

class ImglistToTensor(torch.nn.Module):
    """
    Converts a list of PIL images in the range [0,255] to a torch.FloatTensor
    of shape (NUM_IMAGES x CHANNELS x HEIGHT x WIDTH) in the range [0,1].
    Can be used as first transform for ``VideoFrameDataset``.
    """
    def forward(self, img_list):
        """
        Converts each PIL image in a list to
        a torch Tensor and stacks them into
        a single tensor.
        Args:
            img_list: list of PIL images.
        Returns:
            tensor of size ``NUM_IMAGES x CHANNELS x HEIGHT x WIDTH``
        """
        tensor = torch.stack([transforms.functional.to_tensor(pic) for pic in img_list])
        
        print('first transofrm', tensor.shape)
        return tensor

In [95]:
train_dir = '../data/images/training'
test_dir = '../data/images/testing'
valid_dir = '../data/images/validation'
train_annotations = '../data/images/training/annotations.txt'
test_annotations = '../data/images/testing/annotations.txt'
valid_annotations = '../data/images/testing/annotations.txt'
batch_size = 2

num_segments = 3
frames_per_segment = 3

# Datasets from each folder
image_data = {
    'train': VideoFrameDataset(
                root_path=train_dir,
                annotationfile_path=train_annotations,
                num_segments=num_segments,
                frames_per_segment=frames_per_segment,
                imagefile_template='{:01d}.jpg',
                transform=image_transforms['train'],
                random_shift=True,
                test_mode=False
            ),
    'val': VideoFrameDataset(
                root_path=valid_dir,
                annotationfile_path=valid_annotations,
                num_segments=num_segments,
                frames_per_segment=frames_per_segment,
                imagefile_template='{:01d}.jpg',
                transform=image_transforms['val'],
                random_shift=True,
                test_mode=False
            ),
    'test': VideoFrameDataset(
                root_path=test_dir,
                annotationfile_path=test_annotations,
                num_segments=num_segments,
                frames_per_segment=frames_per_segment,
                imagefile_template='{:01d}.jpg',
                transform=image_transforms['test'],
                random_shift=True,
                test_mode=False
            )
}

# Dataloader iterators
dataloaders = {
    'train': DataLoader(image_data['train'], shuffle=True),
    'val': DataLoader(image_data['val'], shuffle=True),
    'test': DataLoader(image_data['test'], shuffle=False)
}

image_transforms = {
    # Train uses data augmentation
    'train':
    transforms.Compose([
        ImglistToTensor(),
        transforms.Resize(128),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]),
    # Validation does not use augmentation
    'val':
    transforms.Compose([
        ImglistToTensor(),        
        transforms.Resize(128),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]),
    # Test does not use augmentation
    'test':
    transforms.Compose([
        ImglistToTensor(),        
        transforms.Resize(128),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),

    ]),
}

In [96]:
model, history = train_model(
    model,
    criterion,
    optimizer,
    dataloaders['train'],
    dataloaders['test'],
    save_file_name=save_file_name,
    checkpoint_file_name=checkpoint_file_name,
    early_stopping_patience=50,
    overfit_patience=10,
    n_epochs=200
    )

Model has been trained for: 0 epochs.

first transofrm torch.Size([9, 3, 1920, 1080])
torch.Size([3, 9, 227, 128])


RuntimeError: Given groups=1, weight of size [64, 9, 3, 3, 3], expected input[1, 3, 9, 227, 128] to have 9 channels, but got 3 channels instead

## Preliminary Data Wrangling and Exploratory Data Analysis

Description of data:
This dataset contains video frame images for pages being turned and not being turned. There are:
1. 65 training videos for flipping
2. 58 training videos for not flipping
3. 65 testing videos for flipping
4. 58 testing videos for flipping


### Random Samples Per Label

In [3]:
def gather_image_data(root_dir, image_dir):
    # root_dir = root directory of the image data
    # food_dir is the specific food directory you wish to gather the data from
    files_in_folder = os.listdir(os.path.join(root_dir,image_dir))
    random_image = np.random.choice(files_in_folder)
    return plt.imread(os.path.join(root_dir,image_dir,random_image))

In [4]:
# # setup root directory and grid size
# root_dir = '../data/images/training'
# row = 5
# col = 5

# # initiate subplot and configure title
# fig, ax = plt.subplots(row,col,figsize=(20,10))
# fig.suptitle("Random Images for Flipping", y=.95, fontsize=24)
# plt.setp(ax, xticks=[],yticks=[])
# plt.subplots_adjust(hspace=0.5)

# # iterate through each category of food and assign a random image to a spot on the grid
# for i in range(row):
#     for j in range(col):
#         try:
#             image_dir = 'flip'
#         except:
#             break
#         img = gather_image_data(root_dir,image_dir)
#         ax[i][j].imshow(img)

# plt.show();

In [5]:
# # setup root directory and grid size
# root_dir = '../data/images/training'
# row = 5
# col = 5

# # initiate subplot and configure title
# fig, ax = plt.subplots(row,col,figsize=(20,10))
# fig.suptitle("Random Images for Not Flipping", y=.95, fontsize=24)
# plt.setp(ax, xticks=[],yticks=[])
# plt.subplots_adjust(hspace=0.5)

# # iterate through each category of food and assign a random image to a spot on the grid
# for i in range(row):
#     for j in range(col):
#         try:
#             image_dir = 'notflip'
#         except:
#             break
#         img = gather_image_data(root_dir,image_dir)
#         ax[i][j].imshow(img)

# plt.show();

### Copy Train and Test images into respective folders

In [3]:
# create functions for copying files and ignoring files

def copytree(src, dst, ignored_ids = None):
    # src = source directory
    # dst = destination directory of copy
    # ignore = ignore function that provides list of id's to ignore based on testing or training set
    
    # if destination directory does not exist, create directory
    if not os.path.exists(dst):
        os.makedirs(dst)
        shutil.copystat(src, dst)
    
    # get list of directories in current directory
    directory_items = os.listdir(src)
    # filter out items to be ignored
    directory_items = [x for x in directory_items if x not in ignored_ids]
    # for each item in directory, copy into destination
    for item in directory_items:
        source = os.path.join(src, item)
        destination = os.path.join(dst, item)
        # if item is a directory, recurisvely call this function 
        if os.path.isdir(source):
            print(source)
            copytree(source, destination, ignored_ids)
        # copy item to destination
        else:
            shutil.copy2(source, destination)
            
def sort_videos(src, action="flip"):
    src = os.path.join(src, action)
        
    # get list of files in folder
    directory_items = os.listdir(src)
    # iterate through files
    last_video_id = 99999
    count = 0
    for video in directory_items:
#         print(directory_items)
        video_id = video.split('_')[0]
#         frame = video.split('_')[1]
        if video_id != last_video_id:
            count = 0
            last_video_id = video_id
        destination = os.path.join(src, video_id)
        # on new video
        if not os.path.exists(destination):
            # create new folder
            os.makedirs(destination)
            shutil.copystat(src, destination)
        # move images to folder
        shutil.copy2(os.path.join(src,video), os.path.join(destination, "{}.jpg".format(str(count))))
        count += 1


def sort_images():
    train_dir = '../data/images/training'
    test_dir = '../data/images/testing'
    valid_dir = '../data/valid'
    
    if not os.path.isdir('../data/train'):
        sort_videos(train_dir, 'flip')
        sort_videos(train_dir, 'notflip')
    else:
        print('Train files already copied into separate folders.')

#     if not os.path.isdir('../data/test'):
#         sort_videos(test_dir, 'flip')
#         sort_videos(test_dir, 'notflip')
#     else:
#         print('Test files already copied into separate folders.')
        
def annotate_videos(src):
#     jumping/0001 1 17 0
    annotations = []
    
    for action in os.listdir(src):
        action_id = 0 if action == 'flip' else 1
        for video in os.listdir(os.path.join(src, action)):
            print(video)
            images = os.listdir(os.path.join(src,action, video))
            print(images)
            first = images[0].split('.')[0].split('.')[0]
            last = 0
            for item in images:
                num = item.split('.')[0]
                if int(num) > last:
                    last = int(num)
            annotation = "{}/{} {} {} {}".format(action, video, first, last, action_id)
            annotations.append(annotation)
    with open(os.path.join(src, 'annotations.txt'), 'w') as f:
        for annotation in annotations:
            f.write("%s\n" % annotation)

In [7]:
# sort_images()

In [4]:
train_dir = '../data/images/training'
test_dir = '../data/images/testing'
valid_dir = '../data/images/validation'

annotate_videos(train_dir)
annotate_videos(test_dir)
annotate_videos(valid_dir)

0003
['0.jpg', '1.jpg', '10.jpg', '11.jpg', '12.jpg', '13.jpg', '14.jpg', '15.jpg', '16.jpg', '17.jpg', '18.jpg', '19.jpg', '2.jpg', '20.jpg', '21.jpg', '22.jpg', '23.jpg', '24.jpg', '25.jpg', '3.jpg', '4.jpg', '5.jpg', '6.jpg', '7.jpg', '8.jpg', '9.jpg']
0004
['0.jpg', '1.jpg', '10.jpg', '11.jpg', '12.jpg', '13.jpg', '14.jpg', '15.jpg', '16.jpg', '17.jpg', '18.jpg', '19.jpg', '2.jpg', '20.jpg', '21.jpg', '22.jpg', '23.jpg', '3.jpg', '4.jpg', '5.jpg', '6.jpg', '7.jpg', '8.jpg', '9.jpg']
0005
['0.jpg', '1.jpg', '10.jpg', '11.jpg', '12.jpg', '13.jpg', '14.jpg', '15.jpg', '16.jpg', '17.jpg', '18.jpg', '19.jpg', '2.jpg', '20.jpg', '21.jpg', '22.jpg', '23.jpg', '24.jpg', '25.jpg', '26.jpg', '27.jpg', '28.jpg', '29.jpg', '3.jpg', '4.jpg', '5.jpg', '6.jpg', '7.jpg', '8.jpg', '9.jpg']
0006
['0.jpg', '1.jpg', '10.jpg', '11.jpg', '12.jpg', '13.jpg', '14.jpg', '15.jpg', '16.jpg', '17.jpg', '18.jpg', '19.jpg', '2.jpg', '20.jpg', '21.jpg', '22.jpg', '23.jpg', '24.jpg', '25.jpg', '26.jpg', '27.jpg',

## Training the Model
### Initalize parameters

In [9]:
# Location of data
data_dir = '../data/images/'
train_dir = data_dir + 'train/'
# valid_dir = data_dir + 'valid/'
test_dir = data_dir + 'test/'

save_file_name = 'page-turner.pt'
checkpoint_file_name = 'page-turner-check.pt'

# Change to fit hardware
batch_size = 1

image_size = 224

# Whether to train on a gpu
train_on_gpu = cuda.is_available()
print(f'Train on gpu: {train_on_gpu}')

# Number of gpus
if train_on_gpu:
    gpu_count = cuda.device_count()
    print(f'{gpu_count} gpus detected.')
    if gpu_count > 1:
        multi_gpu = True
    else:
        multi_gpu = False

Train on gpu: True
1 gpus detected.


In [10]:
# set random seeds
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(42)

### Image Augmentation


In [32]:
image_transforms = {
    # Train uses data augmentation
    'train':
    transforms.Compose([
        ImglistToTensor(),
        transforms.Resize(128),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]),
    # Validation does not use augmentation
    'val':
    transforms.Compose([
        ImglistToTensor(),        
        transforms.Resize(128),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]),
    # Test does not use augmentation
    'test':
    transforms.Compose([
        ImglistToTensor(),        
        transforms.Resize(128),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),

    ]),
}

### Data Iterators

In [23]:
train_dir = '../data/images/training'
test_dir = '../data/images/testing'
valid_dir = '../data/images/validation'
train_annotations = '../data/images/training/annotations.txt'
test_annotations = '../data/images/testing/annotations.txt'
valid_annotations = '../data/images/testing/annotations.txt'
batch_size = 2

num_segments = 3
frames_per_segment = 3

# Datasets from each folder
image_data = {
    'train': VideoFrameDataset(
                root_path=train_dir,
                annotationfile_path=train_annotations,
                num_segments=num_segments,
                frames_per_segment=frames_per_segment,
                imagefile_template='{:01d}.jpg',
                transform=image_transforms['train'],
                random_shift=True,
                test_mode=False
            ),
    'val': VideoFrameDataset(
                root_path=valid_dir,
                annotationfile_path=valid_annotations,
                num_segments=num_segments,
                frames_per_segment=frames_per_segment,
                imagefile_template='{:01d}.jpg',
                transform=image_transforms['val'],
                random_shift=True,
                test_mode=False
            ),
    'test': VideoFrameDataset(
                root_path=test_dir,
                annotationfile_path=test_annotations,
                num_segments=num_segments,
                frames_per_segment=frames_per_segment,
                imagefile_template='{:01d}.jpg',
                transform=image_transforms['test'],
                random_shift=True,
                test_mode=False
            )
}

# Dataloader iterators
dataloaders = {
    'train': DataLoader(image_data['train'], shuffle=True),
    'val': DataLoader(image_data['val'], shuffle=True),
    'test': DataLoader(image_data['test'], shuffle=False)
}

### Model Setup


In [28]:
class C3D(nn.Module):
    """
    The C3D network as described in [1].
    """

    def __init__(self):
        super(C3D, self).__init__()

        self.conv1 = nn.Conv3d(9, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))

        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1))

        self.fc6 = nn.Linear(8192, 4096)
        self.fc7 = nn.Linear(4096, 4096)
        self.fc8 = nn.Linear(4096, 2)

        self.dropout = nn.Dropout(p=0.5)

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

    def forward(self, x):

        h = self.relu(self.conv1(x))
        h = self.pool1(h)

        h = self.relu(self.conv2(h))
        h = self.pool2(h)

        h = self.relu(self.conv3a(h))
        h = self.relu(self.conv3b(h))
        h = self.pool3(h)

        h = self.relu(self.conv4a(h))
        h = self.relu(self.conv4b(h))
        h = self.pool4(h)

        h = self.relu(self.conv5a(h))
        h = self.relu(self.conv5b(h))
        h = self.pool5(h)

        h = h.view(-1, 8192)
        h = self.relu(self.fc6(h))
        h = self.dropout(h)
        h = self.relu(self.fc7(h))
        h = self.dropout(h)

        logits = self.fc8(h)
        probs = self.softmax(logits)

        return probs

"""
References
----------
[1] Tran, Du, et al. "Learning spatiotemporal features with 3d convolutional networks." 
Proceedings of the IEEE international conference on computer vision. 2015.
"""

'\nReferences\n----------\n[1] Tran, Du, et al. "Learning spatiotemporal features with 3d convolutional networks." \nProceedings of the IEEE international conference on computer vision. 2015.\n'

In [29]:
model = C3D()
model

C3D(
  (conv1): Conv3d(9, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (pool1): MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv3d(64, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (pool2): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)
  (conv3a): Conv3d(128, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (conv3b): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (pool3): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)
  (conv4a): Conv3d(256, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (conv4b): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (pool4): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)
  (conv5a): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=

#### Replace Last layer with fully connected layer configured for the task at hand

To finish modifying the ResNet50 pre-trained network I replace the last fully connected layer and replace it with two fully connected layers and a LeakyReLU function after the first one. I chose to use two so I could down sample the outgoing features of the pre-trained model and the new layers to first the pixel size of the image, then to the number of classes in the dataset.

If I had more time I would experiment with the configuration of these last layers as I have seen increases and decreases in the accuracy simply based on the layers a the end of the network.

In [15]:
# n_classes = 2

# n_inputs = model.fc8.in_features

# classifier = nn.Sequential(
#     nn.Linear(n_inputs,image_size),
#     nn.LeakyReLU(),
#     nn.Linear(image_size,n_classes)
#     )

# model.fc8 = classifier
# model

#### Move model to GPU

This will allow the model to train quicker by taking advantage of a GPU if the learning environment has one.

In [30]:

if train_on_gpu:
    model = model.to('cuda')

if multi_gpu:
    model = nn.DataParallel(model)

### Training Loss and Optimizer

The loss function I use is Cross Entropy Loss because it is commonly used in classification models. It works by comparing predicted probabilities of the datapoint is and the actual label. The loss increases as the difference predicted probability is from the actual point.

For the optimizer I chose to use Stochastic Gradient Descent (SGD). At first I used the ADAM optimizer to prototype the model, however once I knew everything else was in place and working I switched over to SGD to give me a more fine tuned approach to exploring the gradient of the data. 

Along with using SGD as my optimizer, I pair it with a Cyclical Learning Rate scheduler. This allows the model to get a better lay of the land by cycling between a lower and upper bound learning rate. Doing this allows the model to explore the gradient curve in more than one direction and hopefully avoid getting stuck in a local minima.

In [17]:
# we will be using negative log likelihood as the loss function
criterion = nn.CrossEntropyLoss()
# we will be using the Adam optimizer as our optimizer
optimizer = optim.Adam(model.fc8.parameters())


### Training the Model

The model will train in two phases.
In phase one we train the model while only updating the weights on the new classifier layers that we added to the pre-trained ResNet50 model. This will allow training to focus on the last few layers and ensure that they are trained accordingly with the data. This phase will continue until the training loss becomes greater than the validation loss for a certain amount of training epochs, in this case 10. I use that as a metric to stop training because it is an indicator that the model is becoming overfitted to the training data. The validation loss is naturally smaller than the training loss in this example because of the amount of image transformations we are doing before feeding the images into the model. I chose to do this to allow the model to be able to generalize better to unseen data.

Phase two of training will be almost identical to phase one, with the exception of unfreezing all layers of the model to allow the training process to update the weights of the rest of the layers. This will continue until the training loss becomes smaller than the valid loss for a certain amount of epochs.

In [18]:
def train_model(model,
                criterion,
                optimizer,
                train_loader,
                valid_loader,
                save_file_name,
                checkpoint_file_name,
                early_stopping_patience=100,
                overfit_patience=15,
                n_epochs=25,
                valid_every=2
               ):
    """Train a PyTorch Model

    Params
    --------
        model (PyTorch model): cnn to train
        criterion (PyTorch loss): objective to minimize
        optimizer (PyTorch optimizier): optimizer to compute gradients of model parameters
        train_loader (PyTorch dataloader): training dataloader to iterate through
        valid_loader (PyTorch dataloader): validation dataloader used for early stopping
        save_file_name (str ending in '.pt'): file path to save the model state dict
        max_epochs_stop (int): maximum number of epochs with no improvement in validation loss for early stopping
        n_epochs (int): maximum number of training epochs
        valid_every (int): frequency of epochs to validate model

    Returns
    --------
        model (PyTorch model): trained cnn with best weights
        history (DataFrame): history of train and validation loss and accuracy
    """
    # early stopping initializaiton
    epochs_no_improve = 0
    epochs_overfit = 0
    valid_loss_min = np.Inf
    
    valid_max_acc = 0
    history = []
    
    # number of epochs already trained (if using loaded in model weights)
    try:
        print("Model has been trained for: {} epochs.\n".format(model.epochs))
    except:
        model.epochs = 0
        print("Starting training from scratch.\n")
        
    overall_start = timer()
    
    #Main loop
    for epoch in range(n_epochs):
        
        #keep track of training and validation loss of each epoch
        train_loss = 0.0
        valid_loss = 0.0
        
        train_acc = 0
        valid_acc = 0
        
        #set to training
        model.train()
        start = timer()
        
        # training loop
        for ii, (data, target) in enumerate(train_loader):
            #tensors to gpu
            if train_on_gpu:
                data, target = data.cuda(), target.cuda()
                
            # clear gradients
            optimizer.zero_grad()
            #predicted outpouts are log probabilities
            output = model(data)
            
            # loss and backpropagation of gradients
            loss = criterion(output, target)
            loss.backward()
            
            # update the parameters
            optimizer.step()
            
            # track train loss by multiplying average loss by number of examples in batch
            train_loss += loss.item() * data.size(0)
            
            # calculate accuracy by finding max log probability
            _, pred = torch.max(output, dim=1)
            correct_tensor = pred.eq(target.data.view_as(pred))
            # need to convert correct tensor from int to float to average
            accuracy = torch.mean(correct_tensor.type(torch.FloatTensor))
            # multiply average accuracy times the number of examples in batch
            train_acc += accuracy.item() * data.size(0)
            
            # Track training progress
            print(
                f'Epoch: {epoch}\t{100 * (ii + 1) / len(train_loader):.2f}% complete. {timer() - start:.2f} seconds elapsed in epoch.',
                end='\r')
        # after training loop ends
        else:
            model.epochs += 1
            
            if model.epochs > 1 and (model.epochs % valid_every == 0): 
                # don't need to keep track of gradients
                with torch.no_grad():
                    # set to evaluation mode
                    model.eval()

                    #validation loop
                    for data, target in valid_loader:
                        #tensors to gpu
                        if train_on_gpu:
                            data, target = data.cuda(), target.cuda()

                        # Forward pass
                        output = model(data)

                        # validation loss 
                        loss = criterion(output, target)
                        # multiply average loss times the number of examples in batch
                        valid_loss += loss.item() * data.size(0)

                        # calculate validation accuracy
                        _, pred = torch.max(output, dim=1)
                        correct_tensor = pred.eq(target.data.view_as(pred))
                        accuracy = torch.mean(
                            correct_tensor.type(torch.FloatTensor))
                        # multiply average accuracy times the number of examples
                        valid_acc += accuracy.item() * data.size(0)

                    # calculate average losses
                    train_loss = train_loss / (len(train_loader.dataset))
                    valid_loss = valid_loss / (len(valid_loader.dataset))

                    # calculate average accuracy
                    train_acc = train_acc / (len(train_loader.dataset))
                    valid_acc = valid_acc / (len(valid_loader.dataset))


                    history.append([train_loss, valid_loss, train_acc, valid_acc, model.epochs])

                    print(
                        f'\nEpoch: {epoch} \tTraining Loss: {train_loss:.4f} \tValidation Loss: {valid_loss:.4f}'
                    )
                    print(
                        f'\t\tTraining Accuracy: {100 * train_acc:.2f}%\t Validation Accuracy: {100 * valid_acc:.2f}%'
                    )

                    # save the model if validation loss decreases
                    if valid_loss < valid_loss_min:
                        print("Valid loss decreased ({:.6f} --> {:.6f}). Saving model...".format(valid_loss_min, valid_loss))

                        # save model
                        torch.save(model.state_dict(), save_file_name)

                        checkpoint = {
                            "model": model,
                            "criterion": criterion,
                            "epochs": model.epochs,
                            "optimizer_state": optimizer.state_dict(),
                            "model_state": model.state_dict(),
                            "valid_loss_min": valid_loss
                        }
                        torch.save(checkpoint, checkpoint_file_name)

                        # track improvements
                        epochs_no_improve = 0
                        epochs_overfit = 0
                        valid_loss_min = valid_loss
                        valid_best_acc = valid_acc
                        best_epoch = epoch

                    # otherwise increment count of epochs with no improvement
                    elif train_loss < valid_loss:
                        epochs_overfit += 1
                        if epochs_overfit >= overfit_patience:
                            print(f'\n Valid loss has increased larger than training loss for {epochs_overfit} epochs')
                            print(
                            f'\nEarly Stopping! Total epochs: {epoch}. Best epoch: {best_epoch} with loss: {valid_loss_min:.2f} and acc: {100 * valid_acc:.2f}%'
                            )
                            # load the best state dict
                            model.load_state_dict(torch.load(save_file_name))
                            # attach the optimizer
                            model.optimizer = optimizer

                            # format history
                            history = pd.DataFrame(
                                    history,
                                    columns=[
                                        'train_loss', 'valid_loss', 'train_acc',
                                        'valid_acc', 'epochs'
                                    ])
                            return model, history

                    else:
                        epochs_no_improve += 1
                        #trigger early stopping
                        # this should be not going bad
                        if (epochs_no_improve >= early_stopping_patience):
                            print(
                                f'\nEarly Stopping! Total epochs: {epoch}. Best epoch: {best_epoch} with loss: {valid_loss_min:.2f} and acc: {100 * valid_acc:.2f}%'
                            )
                            total_time = timer() - overall_start
                            print(
                                f'{total_time:.2f} total seconds elapsed. {total_time / (epoch+1):.2f} seconds per epoch.'
                            )

                            # load the best state dict
                            model.load_state_dict(torch.load(save_file_name))
                            # attach the optimizer
                            model.optimizer = optimizer

                            # format history
                            history = pd.DataFrame(
                                    history,
                                    columns=[
                                        'train_loss', 'valid_loss', 'train_acc',
                                        'valid_acc', 'epochs'
                                    ])
                            return model, history
                        
    model.optimizer = optimizer
    total_time = timer() - overall_start
    print(
        f'\nBest epoch: {best_epoch} with loss: {valid_loss_min:.2f} and acc: {100 * valid_acc:.2f}%'
    )
    print(
        f'{total_time:.2f} total seconds elapsed. {total_time / (model.epochs):.2f} seconds per epoch.'
    )
    # Format history
    history = pd.DataFrame(
        history,
        columns=['train_loss', 'valid_loss', 'train_acc', 'valid_acc','epochs'])
    return model, history

In [25]:
model, history = train_model(
    model,
    criterion,
    optimizer,
    dataloaders['train'],
    dataloaders['test'],
    save_file_name=save_file_name,
    checkpoint_file_name=checkpoint_file_name,
    early_stopping_patience=50,
    overfit_patience=10,
    n_epochs=200
    )

Model has been trained for: 0 epochs.



RuntimeError: Given groups=1, weight of size [64, 16, 3, 3, 3], expected input[1, 9, 3, 227, 128] to have 16 channels, but got 9 channels instead

In [None]:
# summarize history for accuracy
plt.plot(history['train_acc'])
plt.plot(history['valid_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.xlim(1,200)
plt.legend(['train', 'valid'], loc='upper left')
plt.savefig('train_valid_accuracy.png')
plt.show();


# summarize history for loss
plt.plot(history['train_loss'])
plt.plot(history['valid_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.xlim(1,200)
plt.legend(['train', 'valid'], loc='upper left')
plt.savefig('train_valid_loss.png')

plt.show();


In [None]:
# unfreeze all layers of the model to allow them to start training
for param in model.parameters():
    param.requires_grad = True

In [None]:
model, history = train_model(
    model,
    criterion,
    optimizer,
    scheduler,
    dataloaders['train'],
    dataloaders['val'],
    save_file_name=save_file_name,
    checkpoint_file_name=checkpoint_file_name,
    early_stopping_patience=50,
    overfit_patience=10,
    n_epochs=200
    )

In [None]:
model, history = train_model(
    model,
    criterion,
    optimizer,
    scheduler,
    dataloaders['train'],
    dataloaders['val'],
    save_file_name=save_file_name,
    checkpoint_file_name=checkpoint_file_name,
    early_stopping_patience=50,
    overfit_patience=10,
    n_epochs=200
    )

### Model Evaluation

In [None]:
def get_labels():
    food_labels = pd.read_csv("../data/meta/labels.txt", header=None)
    food_labels = food_labels[0].tolist()
    return food_labels

In [None]:
if train_on_gpu:
    model.load_state_dict(torch.load('../saved_models/resnet50-transfer10.pt', map_location= torch.device('cuda')))
    model.to('cuda')
else:
    model.load_state_dict(torch.load('../saved_models/resnet50-transfer10.pt', map_location= torch.device('cpu')))
    model.to('cpu')
model.eval();

In [None]:
classes = get_labels()
class_correct = list(0. for i in range(len(classes)))
class_total = list(0. for i in range(len(classes)))
total_correct = 0
total = 12625
y_test = []
y_pred = []

with torch.no_grad():
    for data, target, path in dataloaders['test']:
        if train_on_gpu:
            data, target = data.cuda(), target.cuda()
        outputs = model(data)
        _, predicted = torch.max(outputs, 1)
        y_test.extend(target.cpu().numpy().tolist())
        y_pred.extend(predicted.cpu().numpy().tolist())
        c = (predicted == target).squeeze()
        total_correct += (predicted == target).sum().item()
        for i in range(len(target)):
            label = target[i].item()
            class_correct[label] += c[i].item()
            class_total[label] += 1

print('Accuracy of the network on the 12625 test images: %d %%' % (
    100 * total_correct / total))
for i in range(len(classes)):
    print('Accuracy of %5s : %2d %%' % (
        classes[i], 100 * class_correct[i] / class_total[i]))

The classification report below allows us to get a better idea of how the predictions were made per class. This will be useful for fine-tuning the model down the road.

In [None]:
print('Classification Report')
print(classification_report(y_test, y_pred[:12625], target_names=get_labels()))

#### Plot confusion Matrix

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          cax=None):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    
    im = plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

#     plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

#### Plotting the confusion matrix

The confusion matrix allows us to examine which classes get predicted as other classes and vice-versa. The diagonal line from the top left to the bottom right contains the numbers for the amount of correct identifications.

In [None]:
fig, ax = plt.subplots()
fig = plt.gcf()
fig.set_size_inches(35,35)
# create an axes on the right side of ax. The width of cax will be 5%
# of ax and the padding between cax and ax will be fixed at 0.05 inch.

plot_confusion_matrix(confusion_matrix(y_test, y_pred), classes=get_labels(),
                      title='Confusion matrix, without normalization',
                      cmap=plt.cm.GnBu,cax=ax)

axins = inset_axes(ax,
                   width="5%",  # width = 5% of parent_bbox width
                   height="100%",  # height : 50%
                   loc='lower left',
                   bbox_to_anchor=(1.05, 0., 1, 1),
                   bbox_transform=ax.transAxes,
                   borderpad=0,
                   )

plt.colorbar(cax=axins)

plt.show()

This confusion martrix plots the true identifications on the diagonal and the wrong classificationson every other spot. The Columns represent the predicted class and the rows represent the actual class.

In [None]:
def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return roc_auc_score(y_test, y_pred, average=average)

In [None]:
auc = multiclass_roc_auc_score(y_test, y_pred)
print("Multi-Class AUC Score is: {}".format(auc))

There is a multi-class AUC score of .87 which means the predictions are quite accurate.

In [None]:
def show_images_prediction(food_class,y_test, page=0):
    page_size = 20
    nrows = 4
    ncols = 5
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, 12))
    fig.set_size_inches(12, 8)
    fig.tight_layout()
    labels = get_labels()
    start_i = (labels.index(food_class) * 125) + (page * 20)

    for i, ax in enumerate(axes.flat):
        im = ax.imshow(plt.imread(image_data['test'][i+start_i][2]))
        ax.set_axis_off()
        ax.title.set_visible(False)
        ax.xaxis.set_ticks([])
        ax.yaxis.set_ticks([])
        for spine in ax.spines.values():
            spine.set_visible(False)
        predicted = labels[y_pred[i+start_i]]
        match = predicted ==  labels[y_test[start_i + i]]
        ec = (1, .5, .5)
        fc = (1, .8, .8)
        if match:
            ec = (0, .6, .1)
            fc = (0, .7, .2)
        # predicted label
        ax.text(0, 350, 'P: ' + predicted, size=10, rotation=0,
            ha="left", va="top",
             bbox=dict(boxstyle="round",
                   ec=ec,
                   fc=fc,
                   )
             )
        if not match:
            # true label
            ax.text(0, 440, 'A: ' + labels[y_test[start_i + i]], size=10, rotation=0,
                ha="left", va="top",
                 bbox=dict(boxstyle="round",
                       ec=ec,
                       fc=fc,
                       )
                 )
    plt.subplots_adjust(left=0, wspace=1, hspace=0)
    plt.show()

In [None]:
# change the first parameter to the class label you would like to examine.
# if you would like to check different images of the class increase the page number
show_images_prediction('Grilled cheese sandwich', y_test, page=2)

After observing some different pages above, it is clear that building a food classifier is not easy. With the understanding of how a CNN gets features and uses them to build predictions, it is possible to see why the model made some of the wrong predictions that it did. 

Some other food labels may have very similar shapes and colors, which could trick the model into thinking the image was of a different class.

Or in the case of edamame, the food has very clear and defined features that allows for the very high accuracy score.

In [None]:
# change the first parameter to the class label you would like to examine.
# if you would like to check different images of the class increase the page number
show_images_prediction('Edamame', y_test, page=0)