
# Preprocessing

The data is stored in TFRecord (.tfrec) format, which is well suited to use with a TPU. It will be converted to .jpeg files so that it can be used on a GPU.

In [1]:
#Import libraries
import torch 
import torchvision
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch import nn, optim
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import glob
import tensorflow as tf
# Autotune sentinel; passed to Dataset.prefetch() in the dataset builders below
# so that tf.data picks the prefetch buffer size itself.
AUTO = tf.data.experimental.AUTOTUNE
%matplotlib inline

In [2]:
def decode_image(image_data):
    """Decode JPEG bytes into a 3-channel tensor shaped ``[*IMAGE_SIZE, 3]``.

    Args:
        image_data: Encoded JPEG bytestring.

    Returns:
        The decoded image, reshaped to the module-level ``IMAGE_SIZE`` plus
        three colour channels.
    """
    target_shape = [*IMAGE_SIZE, 3]
    pixels = tf.image.decode_jpeg(image_data, channels=3)  # uint8 values in [0, 255]
    # An explicit size is needed for TPU.
    return tf.reshape(pixels, target_shape)

def read_labeled_tfrecord(example):
    """Parse one serialized labeled record into an ``(image, label)`` pair.

    Args:
        example: A single serialized example holding an ``"image"`` bytestring
            and an int64 ``"class"`` feature.

    Returns:
        A tuple of the image decoded by ``decode_image`` and the class cast
        to int32.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # tf.string means bytestring
        "class": tf.io.FixedLenFeature([], tf.int64),  # shape [] means single element
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), tf.cast(parsed['class'], tf.int32)

def read_unlabeled_tfrecord(example):
    """Parse one serialized unlabeled record into an ``(image, id)`` pair.

    Args:
        example: A single serialized example holding an ``"image"`` bytestring
            and a string ``"id"`` feature. There is no ``"class"`` feature:
            predicting it for the test set is the competition's task.

    Returns:
        A tuple of the image decoded by ``decode_image`` and the raw id string.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # tf.string means bytestring
        "id": tf.io.FixedLenFeature([], tf.string),  # shape [] means single element
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['id']

def load_dataset(filenames, labeled=True, ordered=False):
    """Build a ``tf.data`` pipeline of decoded examples from TFRecord files.

    Args:
        filenames: TFRecord path(s) handed to ``tf.data.TFRecordDataset``.
        labeled: When true, records are parsed with ``read_labeled_tfrecord``;
            otherwise with ``read_unlabeled_tfrecord``.
        ordered: When false, deterministic ordering is switched off in the
            dataset options (trading order for speed).

    Returns:
        A dataset of (image, label) pairs if ``labeled`` is true, or of
        (image, id) pairs if it is false.
    """
    options = tf.data.Options()
    if not ordered:
        # Let elements be produced as soon as they are ready rather than in
        # their original order.
        options.experimental_deterministic = False

    parse_record = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord

    # NOTE(review): no num_parallel_reads is passed, so the files are presumably
    # read one after another -- confirm whether parallel reads were intended.
    records = tf.data.TFRecordDataset(filenames).with_options(options)
    return records.map(parse_record)

def data_augment(image, label):
    """Randomly mirror ``image`` left-to-right and pass ``label`` through.

    Args:
        image: Image tensor to augment.
        label: Label belonging to ``image``; returned unchanged.

    Returns:
        The ``(augmented_image, label)`` pair.
    """
    # Saturation jitter (tf.image.random_saturation(image, 0, 2)) is currently
    # disabled; only the horizontal flip is applied.
    flipped = tf.image.random_flip_left_right(image)
    return flipped, label

def get_training_dataset():
    """Return the labeled training set, batched by ``BATCH_SIZE`` and prefetched.

    No augmentation, repetition or shuffling is applied here, so a single pass
    over the returned dataset visits each training record once.
    """
    batched = load_dataset(TRAINING_FILENAMES, labeled=True).batch(BATCH_SIZE)
    # Prefetch the next batch while the current one is consumed (autotuned buffer).
    return batched.prefetch(AUTO)

def get_validation_dataset(ordered=False):
    """Return the labeled validation set, batched, cached and prefetched.

    Args:
        ordered: Forwarded to ``load_dataset``; when true the records keep a
            deterministic order.
    """
    pipeline = load_dataset(VALIDATION_FILENAMES, labeled=True, ordered=ordered)
    # Batch first, then cache the batches, then prefetch with an autotuned buffer.
    return pipeline.batch(BATCH_SIZE).cache().prefetch(AUTO)

def get_test_dataset(ordered=False):
    """Return the unlabeled test set as batched, prefetched (image, id) pairs.

    Args:
        ordered: Forwarded to ``load_dataset``; when true the records keep a
            deterministic order.
    """
    pipeline = load_dataset(TEST_FILENAMES, labeled=False, ordered=ordered)
    # Prefetch the next batch while the current one is consumed (autotuned buffer).
    return pipeline.batch(BATCH_SIZE).prefetch(AUTO)

In [3]:
# Image resolution used by decode_image's reshape, and the number of examples
# per batch used by the dataset builders.
IMAGE_SIZE = [224, 224]
BATCH_SIZE = 16

# Root folder that holds one TFRecord directory per available image size.
DATASET_ROOT = '../input/dataset'

PATH_SELECT = {  # available image sizes
    192: DATASET_ROOT + '/tfrecords-jpeg-192x192',
    224: DATASET_ROOT + '/tfrecords-jpeg-224x224',
    331: DATASET_ROOT + '/tfrecords-jpeg-331x331',
    512: DATASET_ROOT + '/tfrecords-jpeg-512x512',
}

# Pick the directory from IMAGE_SIZE instead of repeating the literal size, so
# the files that are read always match the shape decode_image reshapes to.
PATH = PATH_SELECT[IMAGE_SIZE[0]]

TRAINING_FILENAMES = tf.io.gfile.glob(PATH + '/train/*.tfrec')
VALIDATION_FILENAMES = tf.io.gfile.glob(PATH + '/val/*.tfrec')
TEST_FILENAMES = tf.io.gfile.glob(PATH + '/test/*.tfrec')

In [4]:
# Pull the whole training set into memory: one list entry per batch, with the
# image batches and the label batches kept in matching order.
dataset = get_training_dataset()
images, labels = [], []

for batch_images, batch_labels in dataset:
    images.append(batch_images)
    labels.append(batch_labels)

2022-02-11 10:51:19.869926: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-02-11 10:51:20.124503: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [5]:
# Unbatch the training data: one tensor per image and one plain Python value
# per label, both in the same order as the batches were read.
img_list = [batch[j, :, :, :] for batch in images for j in range(len(batch))]

lab_list = [value for batch in labels for value in batch.numpy().tolist()]

In [6]:
import os
from PIL import Image

In [7]:
# Write every training image to <train_root><label>/<index>.jpeg, i.e. one
# sub-folder per class label with the running index as the file name.
train_root = "../output/kaggle/working/dataset224/train/"

for i, (image, label) in enumerate(zip(img_list, lab_list)):
    class_dir = train_root + str(label)
    # exist_ok=True replaces the separate exists() check and is safe on re-runs.
    os.makedirs(class_dir, exist_ok=True)
    pixels = np.array(image).astype(np.uint8)
    Image.fromarray(pixels).save(class_dir + "/" + str(i) + ".jpeg")

In [8]:
# Pull the whole validation set into memory: one list entry per batch, with the
# image batches and the label batches kept in matching order.
dataset = get_validation_dataset()
val_images, val_labels = [], []

for batch_images, batch_labels in dataset:
    val_images.append(batch_images)
    val_labels.append(batch_labels)

In [9]:
# Unbatch the validation data: one tensor per image and one plain Python value
# per label, in matching order. The images go into val_img_list (not the
# training img_list), so the training list is left untouched.
val_img_list = [
    batch[j, :, :, :] for batch in val_images for j in range(len(batch))
]

val_lab_list = [value for batch in val_labels for value in batch.numpy().tolist()]

In [10]:
# Write every validation image to <val_root><label>/<index>.jpeg, i.e. one
# sub-folder per class label with the running index as the file name.
val_root = "../output/kaggle/working/dataset224/val/"

# Walk the validation batches themselves (val_images / val_labels) so each
# image is paired with the label from its own batch; the training lists
# (img_list / lab_list) must not be used for the validation export.
val_pairs = (
    (image, label)
    for batch_images, batch_labels in zip(val_images, val_labels)
    for image, label in zip(batch_images.numpy(), batch_labels.numpy().tolist())
)

for i, (image, label) in enumerate(val_pairs):
    class_dir = val_root + str(label)
    # exist_ok=True replaces the separate exists() check and is safe on re-runs.
    os.makedirs(class_dir, exist_ok=True)
    pixels = np.array(image).astype(np.uint8)
    Image.fromarray(pixels).save(class_dir + "/" + str(i) + ".jpeg")