In [None]:

# Flower Classifier

Labeled image data from Kaggle covering 104 different flower species is used to train a classification model. The goal is to apply transfer learning to several pretrained networks and compare the results.

In [2]:
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as T
import torch
import torch.nn as nn
from torchvision.utils import make_grid
from torchvision.utils import save_image
from IPython.display import Image
import matplotlib.pyplot as plt
import numpy as np
import random
import os
import tensorflow as tf
import math, re, os

# Sentinel that lets tf.data pick the buffer size at runtime; passed to
# every `prefetch` call in the dataset helper functions below.
AUTO = tf.data.experimental.AUTOTUNE
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import kaggle

Import the Kaggle data

In [None]:
# Fetch the "tpu-getting-started" competition archive with the Kaggle CLI.
# The extraction cell that follows expects it at ./tpu-getting-started.zip.
!kaggle competitions download -c tpu-getting-started

In [None]:
import zipfile
# Unpack the downloaded competition archive into ./CapStoneTwo/data.
# NOTE(review): the configuration cell reads TFRecords from '../input/dataset',
# not from this extraction directory — confirm which location is intended.
with zipfile.ZipFile('./tpu-getting-started.zip', 'r') as zip_ref:
     zip_ref.extractall('./CapStoneTwo/data')

In [9]:
# Target image resolution as [height, width]. decode_image reshapes every
# decoded image to this size, and the TFRecord directory below is selected
# from it so the two can never disagree.
IMAGE_SIZE = [224, 224]
BATCH_SIZE = 16  # examples per batch in every dataset pipeline

# Root directory holding one TFRecord sub-directory per image resolution.
DATASET_ROOT = '../input/dataset'

PATH_SELECT = { # available image sizes
    192: DATASET_ROOT + '/tfrecords-jpeg-192x192',
    224: DATASET_ROOT + '/tfrecords-jpeg-224x224',
    331: DATASET_ROOT + '/tfrecords-jpeg-331x331',
    512: DATASET_ROOT + '/tfrecords-jpeg-512x512'
}

# Look the directory up from IMAGE_SIZE rather than repeating the literal 224:
# an unsupported size now fails right here with a KeyError instead of
# surfacing later as a shape mismatch when images are decoded.
PATH = PATH_SELECT[IMAGE_SIZE[0]]

# Shard lists for each split, in whatever order the glob returns them.
TRAINING_FILENAMES = tf.io.gfile.glob(PATH + '/train/*.tfrec')
VALIDATION_FILENAMES = tf.io.gfile.glob(PATH + '/val/*.tfrec')
TEST_FILENAMES = tf.io.gfile.glob(PATH + '/test/*.tfrec')
                                                                       

In [10]:
# Sanity check: show which training shard files the glob matched.
print(TRAINING_FILENAMES)

['../input/dataset/tfrecords-jpeg-224x224/train/13-224x224-798.tfrec', '../input/dataset/tfrecords-jpeg-224x224/train/11-224x224-798.tfrec', '../input/dataset/tfrecords-jpeg-224x224/train/05-224x224-798.tfrec', '../input/dataset/tfrecords-jpeg-224x224/train/00-224x224-798.tfrec', '../input/dataset/tfrecords-jpeg-224x224/train/02-224x224-798.tfrec', '../input/dataset/tfrecords-jpeg-224x224/train/14-224x224-798.tfrec', '../input/dataset/tfrecords-jpeg-224x224/train/06-224x224-798.tfrec', '../input/dataset/tfrecords-jpeg-224x224/train/10-224x224-798.tfrec', '../input/dataset/tfrecords-jpeg-224x224/train/01-224x224-798.tfrec', '../input/dataset/tfrecords-jpeg-224x224/train/03-224x224-798.tfrec', '../input/dataset/tfrecords-jpeg-224x224/train/08-224x224-798.tfrec', '../input/dataset/tfrecords-jpeg-224x224/train/09-224x224-798.tfrec', '../input/dataset/tfrecords-jpeg-224x224/train/15-224x224-783.tfrec', '../input/dataset/tfrecords-jpeg-224x224/train/07-224x224-798.tfrec', '../input/dataset/t

### Some useful functions 

In [13]:
def decode_image(image_data):
    """Decode JPEG bytes into a 3-channel image tensor with a fixed shape.

    Args:
        image_data: encoded JPEG data (scalar string tensor).

    Returns:
        The decoded image reshaped to ``[*IMAGE_SIZE, 3]``.
    """
    # The shape is pinned explicitly (original note: explicit size needed for TPU).
    target_shape = [*IMAGE_SIZE, 3]
    decoded = tf.image.decode_jpeg(image_data, channels=3)  # uint8 pixels in [0, 255]
    return tf.reshape(decoded, target_shape)

def read_labeled_tfrecord(example):
    """Parse one serialized labeled record into an ``(image, label)`` pair.

    Args:
        example: a single serialized example holding an "image" bytestring
            and an int64 "class" feature.

    Returns:
        Tuple of the decoded image (see ``decode_image``) and the class
        label cast to ``tf.int32``.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # encoded JPEG bytes
        "class": tf.io.FixedLenFeature([], tf.int64),   # scalar class id
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), tf.cast(parsed['class'], tf.int32)

def read_unlabeled_tfrecord(example):
    """Parse one serialized unlabeled record into an ``(image, id)`` pair.

    These records carry no "class" feature; per the original note, predicting
    the class for this split is the competition's task.

    Args:
        example: a single serialized example holding an "image" bytestring
            and a string "id" feature.

    Returns:
        Tuple of the decoded image (see ``decode_image``) and the raw id string.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # encoded JPEG bytes
        "id": tf.io.FixedLenFeature([], tf.string),     # scalar identifier
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['id']

def load_dataset(filenames, labeled=True, ordered=False):
    """Create a dataset of parsed examples from TFRecord files.

    Args:
        filenames: TFRecord file paths to read.
        labeled: when True records are parsed with ``read_labeled_tfrecord``,
            otherwise with ``read_unlabeled_tfrecord``.
        ordered: when False, deterministic ordering is switched off in the
            dataset options (order is traded for speed).

    Returns:
        A dataset of ``(image, label)`` pairs if ``labeled`` is True, or
        ``(image, id)`` pairs otherwise.
    """
    # NOTE(review): no `num_parallel_reads` is passed to TFRecordDataset, so the
    # files are presumably read one after another rather than interleaved —
    # confirm against the tf.data documentation if parallel reads are wanted.
    options = tf.data.Options()
    if not ordered:
        options.experimental_deterministic = False  # allow out-of-order elements

    parse_record = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord
    return tf.data.TFRecordDataset(filenames).with_options(options).map(parse_record)

def get_training_dataset():
    """Build the batched, prefetched training pipeline from TRAINING_FILENAMES.

    Returns:
        A dataset yielding batches of ``BATCH_SIZE`` labeled examples.
    """
    records = load_dataset(TRAINING_FILENAMES, labeled=True)
    # Steps currently disabled (kept for reference):
    #   records = records.map(data_augment)
    #   records = records.repeat()       # repeat for several epochs
    #   records = records.shuffle(2048)
    # Prefetch lets the next batch load while the current one is consumed.
    return records.batch(BATCH_SIZE).prefetch(AUTO)

def get_validation_dataset(ordered=False):
    """Build the batched, cached, prefetched validation pipeline.

    Args:
        ordered: forwarded to ``load_dataset``; True keeps deterministic order.

    Returns:
        A dataset yielding batches of ``BATCH_SIZE`` labeled examples.
    """
    batched = load_dataset(
        VALIDATION_FILENAMES, labeled=True, ordered=ordered
    ).batch(BATCH_SIZE)
    # Cache the batched data, then prefetch with an autotuned buffer.
    return batched.cache().prefetch(AUTO)

def get_test_dataset(ordered=False):
    """Build the batched, prefetched pipeline over the unlabeled test files.

    Args:
        ordered: forwarded to ``load_dataset``; True keeps deterministic order.

    Returns:
        A dataset yielding batches of ``BATCH_SIZE`` ``(image, id)`` examples.
    """
    unlabeled = load_dataset(TEST_FILENAMES, labeled=False, ordered=ordered)
    return unlabeled.batch(BATCH_SIZE).prefetch(AUTO)  # autotuned prefetch buffer

def count_data_items(filenames):
    """Sum the per-file item counts encoded in TFRecord file names.

    Each file name is expected to contain a ``-<count>.`` marker, e.g.
    ``flowers00-230.tfrec`` holds 230 data items.

    Args:
        filenames: iterable of file paths.

    Returns:
        The total item count as a NumPy integer (0 for an empty iterable).

    Raises:
        ValueError: if a file name contains no ``-<count>.`` marker.
    """
    count_pattern = re.compile(r"-([0-9]+)\.")  # compiled once, not once per file
    counts = []
    for filename in filenames:
        # Search the file name only, so digits in a directory component
        # (e.g. "data-12.v2/") cannot be mistaken for the item count.
        match = count_pattern.search(os.path.basename(filename))
        if match is None:
            raise ValueError(
                "no item count found in file name: {!r}".format(filename)
            )
        counts.append(int(match.group(1)))
    # Force an integer dtype: np.sum([]) would otherwise return the float 0.0,
    # which leaks into the step counts computed from this total.
    return np.sum(counts, dtype=np.int64)



In [14]:
# Split sizes, derived from the item counts embedded in the shard file names.
NUM_TRAINING_IMAGES = count_data_items(TRAINING_FILENAMES)
NUM_VALIDATION_IMAGES = count_data_items(VALIDATION_FILENAMES)
NUM_TEST_IMAGES = count_data_items(TEST_FILENAMES)
# Floor division: a trailing partial training batch is not counted as a step.
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE
VALIDATION_STEPS = -(-NUM_VALIDATION_IMAGES // BATCH_SIZE) # The "-(-//)" trick rounds up instead of down :-)
TEST_STEPS = -(-NUM_TEST_IMAGES // BATCH_SIZE)             # The "-(-//)" trick rounds up instead of down :-)
# Report the size of each split.
print('Dataset: {} training images, {} validation images, {} unlabeled test images'.format(NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES))

Dataset: 12753 training images, 3712 validation images, 7382 unlabeled test images



## Data Summary

This image data did not require any cleaning. The data set came in four different sizes of the same images. I will use 224x224 for the classification models.
The images were grouped into 12753 training images, 3712 validation images, and 7382 unlabeled test images. The included test images were intended to be used 
for a Kaggle competition and have little use here. So I will use the validation images as test images.