# Using TPU to train a NASNetLarge Model

Outline for this execution is as follows:
1. Basics
    1. Importing of relevant modules
    2. Activating the TPU
    3. Setting the required constants
    
2. Preparing the dataset
    1. Pre-preparation
    2. Extraction functions
    3. Actual extraction
    
3. Training of the Model

## 1. Basics

### Importing of relevant modules

In [1]:
import os
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications.nasnet import NASNetLarge
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adadelta, SGD, Adamax, Adam, Nadam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.preprocessing.image import ImageDataGenerator
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


### Activating the TPU

In [2]:
# Connect to the TPU if one is attached to this session; otherwise fall back
# to TensorFlow's default strategy so the rest of the notebook still runs.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError as err:
    # TPUClusterResolver signals "no TPU found" with a ValueError. Only that
    # case triggers the fallback; any other failure (or a KeyboardInterrupt)
    # is surfaced instead of silently training on the wrong device.
    print('TPU not available, using the default strategy:', err)
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

print(tf.__version__)

# tf.data options that permit elements to be produced out of order.
# NOTE(review): these options are not attached to any dataset in the visible
# notebook (no `.with_options(ignore_order)` call) - confirm whether intended.
ignore_order = tf.data.Options()
ignore_order.experimental_deterministic = False

Device: grpc://10.0.0.2:8470
Number of replicas: 8
2.2.0


### Setting the required constants

In [3]:
# defining the constants of the project

# FOR DATA #
# every image is resized to _HEIGHT x _WIDTH and fed to the network at that size
_HEIGHT = 331
_WIDTH = 331
_COLOR_P = 3  # number of colour channels; NOTE(review): not referenced elsewhere in the visible notebook
# number of product categories (size of the final softmax layer)
_NUMCLASSES = 42

# FOR MODEL #
# global batch size: 16 images per replica of the active distribution strategy
_BATCHSIZE = 16 * strategy.num_replicas_in_sync
_EPOCHS = 5

# FOR TPU #
_AUTOTUNE = tf.data.experimental.AUTOTUNE
# GCS location of the training images (globbed as _GCSPATH + '/train/*/*')
# NOTE(review): this bucket path is hard-coded; presumably it has to be
# refreshed with the call below whenever it stops resolving - confirm.
_GCSPATH = 'gs://kds-a7828fa4865e43143e0d685e3dc5209f830f440cb23f32438825ad83'
# run this code to extract for the first time KaggleDatasets().get_gcs_path('product-detection-images-full')

# GENERAL #
data_directory = '/kaggle/input/product-detection-images-full'
# working directory where models and the result CSV are written
cp_directory = '/kaggle/working'
# GCS location of the test images (globbed as _TESTGCSPATH + '/test/*')
_TESTGCSPATH = 'gs://kds-4d41270d4fe6544dfabc7abaadf094000a7a2da041fef71dc7df2ddd'

## 2. Preparing the dataset

### Pre-preparation

Note: We use `_GCSPATH` here because Kaggle stores all of its datasets in Google Cloud Storage (GCS). At this step we are only collecting the file names, using the GCS path that points to our data.

In [4]:
# collect the GCS path of every training image (one sub-folder per class)
# and of every test image
filenames = tf.io.gfile.glob(_GCSPATH + '/train/*/*')
test_filenames = tf.io.gfile.glob(_TESTGCSPATH + '/test/*')

# hold out 10% of the training images for validation.
# The split is seeded so that every run of the notebook produces the same
# partition; with an unseeded split, resuming training from a previously saved
# model could validate on images that model has already been trained on.
train_filenames, val_filenames = train_test_split(
    filenames,
    test_size = 0.1,
    random_state = 42
)

# bare file names of the test images (last component of each path)
test_imagenames = [i.split('/')[-1] for i in test_filenames]

In [5]:
# wrap each list of file paths in a tf.data Dataset
train_dataset_list, val_dataset_list = (
    tf.data.Dataset.from_tensor_slices(names)
    for names in (train_filenames, val_filenames)
)

# number of images in each split (used later to derive the steps per epoch)
train_len = tf.data.experimental.cardinality(train_dataset_list).numpy()
val_len = tf.data.experimental.cardinality(val_dataset_list).numpy()
print('No. of Training Images:', train_len)
print('No. of Validation Images:', val_len)

# the class names are the names of the folders directly under train/
class_dirs = tf.io.gfile.glob(str(_GCSPATH + "/train/*"))
_CLASSNAMES = np.array([folder.split('/')[-1] for folder in class_dirs])
_CLASSNAMES

No. of Training Images: 100276
No. of Validation Images: 11142


array(['00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
       '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21',
       '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32',
       '33', '34', '35', '36', '37', '38', '39', '40', '41'], dtype='<U2')

Note: At this point, we have only collected the file paths and split them into training and validation sets. We still need to load the *actual* image data from each of those paths.

### Extraction functions

In [6]:
def get_class(filepath):
    """
    Extract the class label encoded in an image's file path

    The label is the name of the folder that directly contains the image,
    e.g. '.../train/07/abc.jpg' gives 7.

    Parameters:
        filepath : string (or scalar string Tensor)
            The full path of the image file

    Returns:
        output : scalar int32 Tensor
            The class the image belongs to
    """
    # the images live on GCS, whose paths always use '/', regardless of the
    # separator of the operating system running the notebook
    path_parts = tf.strings.split(filepath, '/')

    # explicit TensorFlow conversion, so it also works on the symbolic string
    # tensors that Dataset.map passes in
    return tf.strings.to_number(path_parts[-2], out_type = tf.int32)

def decode_img(image):
    """
    Turn the raw bytes of a JPEG file into a fixed-size float image

    Parameters:
        image : Tensor string
            File contents as returned by tf.io.read_file(filepath)

    Returns:
        output : float32 image tensor resized to (_HEIGHT, _WIDTH) with 3 channels
    """
    target_size = (_HEIGHT, _WIDTH)

    # JPEG bytes -> 3-channel uint8 tensor; truncated files are tolerated as
    # long as at least half of the lines can be recovered
    decoded = tf.image.decode_jpeg(
        image,
        channels = 3,
        try_recover_truncated = True,
        acceptable_fraction = 0.5,
    )

    # uint8 -> float32, which also rescales the values into the 0 to 1 range
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)

    return tf.image.resize(as_float, target_size)

def process_path(filepath, test = False):
    """
    Main function that processes a file path to return the image and its class

    Parameters:
        filepath : string
            The full path of the image file

        test : bool
            False (default): the path belongs to a labelled image stored as
            '.../<class>/<file>'; the label is parsed from the path and the
            image is randomly flipped left-right as augmentation.
            True: the path belongs to an unlabelled test image; no
            augmentation is applied and a placeholder label of -1 is returned.

    Returns:
        outputs : (image, label)
            The decoded image and its int32 class label (-1 when test = True)
    """
    # retrieve the encoded version of the image and decode it
    image = decode_img(tf.io.read_file(filepath))

    # `test` is a plain Python bool, so it must be tested with `not` / truthiness:
    # the bitwise `~` maps False -> -1 and True -> -2, which are both truthy
    if test:
        # test images have no class folder in their path, so there is nothing
        # to parse; the placeholder keeps the (image, label) structure
        label = tf.constant(-1, dtype = tf.int32)
    else:
        label = get_class(filepath)
        # extra augmentation (flipping) for labelled data only
        image = tf.image.random_flip_left_right(image)

    return image, label

### Actual extraction

In [7]:
# turn every file path into an (image, label) pair, letting tf.data decide
# how many elements to process in parallel
train_dataset, val_dataset = (
    path_list.map(process_path, num_parallel_calls = _AUTOTUNE)
    for path_list in (train_dataset_list, val_dataset_list)
)

In [8]:
# checking the correct shape and label
# pulls the first five (image, label) pairs through the pipeline and prints
# them, as a quick sanity check that decoding and label parsing work
for image, label in train_dataset.take(5):
    print('Image Shape:', image.numpy().shape)
    print('Class:', label.numpy())

Image Shape: (331, 331, 3)
Class: 3
Image Shape: (331, 331, 3)
Class: 33
Image Shape: (331, 331, 3)
Class: 37
Image Shape: (331, 331, 3)
Class: 29
Image Shape: (331, 331, 3)
Class: 19


In [9]:
# cache and batch the training and validation data before feeding it to train the model
def preparation_for_fitting(dataset, cache = True, shuffle_buffer_size = 1000):
    """
    Set the settings of the tensorflow dataset before feeding it to a model

    The steps are applied in this order: cache (optional), shuffle, repeat
    indefinitely, batch into groups of _BATCHSIZE, prefetch.

    Parameters:
        dataset : tensorflow dataset

        cache : bool or string
            False: do not cache.
            True (default): cache with no file name.
            string: cache to the file with this name.

        shuffle_buffer_size : int
            Size of the buffer used by the shuffle step

    Returns:
        output : tensorflow dataset
            With all the settings applied to it
    """
    if cache:
        if isinstance(cache, str):
            # the string itself is the cache file name
            dataset = dataset.cache(cache)
        else:
            dataset = dataset.cache()

    dataset = dataset.shuffle(buffer_size = shuffle_buffer_size)

    # repeats forever, so the consumer has to bound each epoch with a step count
    dataset = dataset.repeat()

    dataset = dataset.batch(_BATCHSIZE)

    dataset = dataset.prefetch(buffer_size = _AUTOTUNE)

    return dataset

# run both splits through the same cache / shuffle / repeat / batch / prefetch
# pipeline, using the default settings
train_dataset, val_dataset = (
    preparation_for_fitting(split) for split in (train_dataset, val_dataset)
)

# creating a generator for the training dataset
# image_batch, label_batch = next(iter(train_dataset))

In [10]:
# test pipeline: file paths -> decoded images (test mode) -> batches
test_dataset_list = tf.data.Dataset.from_tensor_slices(test_filenames)
test_len = tf.data.experimental.cardinality(test_dataset_list).numpy()

test_dataset = (
    test_dataset_list
    .map(lambda path: process_path(path, test = True), num_parallel_calls = _AUTOTUNE)
    .batch(_BATCHSIZE)
)

# display the number of test images
test_len

12186

## 3. Training of the Model

In [11]:
def TransferNasNetLarge():
    """
    Build the transfer-learning classifier on top of NASNetLarge

    The NASNetLarge backbone is created without its top, with weights given as
    'NASNet-large-no-top.h5'. Only its BatchNormalization layers are left
    trainable; every other backbone layer is frozen. A head of global average
    pooling, four Dropout + Dense(relu) stages and a softmax layer with
    _NUMCLASSES units is attached to the backbone output.

    Returns:
        output : keras Model named 'Transfer_NasNet'
    """
    backbone = NASNetLarge(include_top = False, weights = 'NASNet-large-no-top.h5', input_shape = [_HEIGHT, _WIDTH, 3])

    # trainable only if the layer is a BatchNormalization layer
    for backbone_layer in backbone.layers:
        backbone_layer.trainable = isinstance(backbone_layer, layers.BatchNormalization)

    ### Decoder
    features = layers.GlobalAveragePooling2D()(backbone.layers[-1].output)

    # (dropout rate, dense units) of each hidden stage, in order
    hidden_stages = ((0.4, 2048), (0.45, 1024), (0.45, 512), (0.45, 256))
    for drop_rate, units in hidden_stages:
        features = layers.Dropout(drop_rate)(features)
        features = layers.Dense(units, activation = 'relu')(features)

    predictions = layers.Dense(_NUMCLASSES, activation = 'softmax')(features)

    return models.Model(inputs = backbone.input, outputs = predictions, name = 'Transfer_NasNet')

In [17]:
# relative model file names below are resolved against the working directory
os.chdir('/kaggle/working')

# optimizers (only `nadam` is passed to compile below; the others are kept as alternatives)
sgd = SGD(learning_rate = 0.001, momentum = 0.8, nesterov = True)               # the standard is better for shallow networks
adadelta = Adadelta(learning_rate = 1.0, rho = 0.95)                            # a better version of rmsprop but may be slower
adam = Adam(learning_rate = 0.001)
adamax = Adamax(learning_rate = 0.002)
nadam = Nadam(learning_rate = 0.001)

# callbacks
# stop as soon as the training loss fails to improve by 0.001 for one epoch
early_stopping = EarlyStopping(monitor = 'loss',
       patience = 1,
       min_delta = 0.001)
# The model is compiled with the metric 'sparse_categorical_accuracy', so the
# validation accuracy is logged as 'val_sparse_categorical_accuracy'; a
# 'val_acc' entry is never produced and could not be monitored.
model_checkpoint_callback = ModelCheckpoint(
    filepath = '/kaggle/working/model-{epoch:02d}-{val_loss:.4f}.hdf5',
    monitor = 'val_sparse_categorical_accuracy',
    mode = 'max',
    save_best_only = True)
# cut the learning rate by 10x whenever the validation loss does not improve
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.1, patience = 0)

# the model has to be created and compiled inside the strategy scope so that
# its variables are placed on the replicas
with strategy.scope():
    # model = TransferNasNetLarge()
    # resumes from the model file saved by an earlier training run
    model = models.load_model('42class_final_model_vNas1.h5')

    model.compile(optimizer = nadam,
                  loss = 'sparse_categorical_crossentropy',
                  metrics = ['sparse_categorical_accuracy'])

    # previewing the architecture of the resultant model
    model.summary()

Model: "Transfer_NasNet"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 331, 331, 3) 0                                            
__________________________________________________________________________________________________
stem_conv1 (Conv2D)             (None, 165, 165, 96) 2592        input_1[0][0]                    
__________________________________________________________________________________________________
stem_bn1 (BatchNormalization)   (None, 165, 165, 96) 384         stem_conv1[0][0]                 
__________________________________________________________________________________________________
activation (Activation)         (None, 165, 165, 96) 0           stem_bn1[0][0]                   
____________________________________________________________________________________

adjust_bn_10 (BatchNormalizatio (None, 21, 21, 336)  1344        adjust_conv_projection_10[0][0]  
__________________________________________________________________________________________________
normal_bn_1_10 (BatchNormalizat (None, 21, 21, 336)  1344        normal_conv_1_10[0][0]           
__________________________________________________________________________________________________
activation_142 (Activation)     (None, 21, 21, 336)  0           normal_bn_1_10[0][0]             
__________________________________________________________________________________________________
activation_144 (Activation)     (None, 21, 21, 336)  0           adjust_bn_10[0][0]               
__________________________________________________________________________________________________
activation_146 (Activation)     (None, 21, 21, 336)  0           adjust_bn_10[0][0]               
__________________________________________________________________________________________________
activation

Total params: 95,941,500
Trainable params: 11,221,350
Non-trainable params: 84,720,150
__________________________________________________________________________________________________


In [18]:
os.chdir('/kaggle/working')

# both datasets repeat forever, so each epoch is bounded by the number of
# whole batches that fit into the corresponding split
train_steps = train_len // _BATCHSIZE
val_steps = val_len // _BATCHSIZE

# model_checkpoint_callback is deliberately left out of the callbacks;
# class_weights could be considered here as well
history = model.fit(
    x = train_dataset,
    epochs = _EPOCHS,
    steps_per_epoch = train_steps,
    validation_data = val_dataset,
    validation_steps = val_steps,
    callbacks = [early_stopping, reduce_lr],
)

# persist the trained model next to the earlier versions
model.save('42class_final_model_vNas2.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
import matplotlib.pyplot as plt

# The model is compiled with metrics = ['sparse_categorical_accuracy'], so the
# history stores the accuracy curves under that name (plus the 'val_' prefix
# for validation); the keys 'acc' / 'val_acc' do not exist and raise KeyError.

# summarize history for accuracy
plt.plot(history.history['sparse_categorical_accuracy'])
plt.plot(history.history['val_sparse_categorical_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

Testing

In [None]:
# Predict the category of every test image and write the result CSV.

# rescale pixel values to the 0 to 1 range, matching the training pipeline
test_datagen = ImageDataGenerator(rescale = 1./255)

# NOTE(review): `test_directory` is not defined anywhere in the visible
# notebook - it has to be set to the local folder that contains the test
# images (inside a sub-folder) before this cell can run.
test_generator = test_datagen.flow_from_directory(
    test_directory,
    target_size = (_HEIGHT, _WIDTH),
    batch_size = 1,
    interpolation = "bilinear",
    shuffle = False,          # keep predictions aligned with `filenames`
    class_mode = None         # unlabelled data: yield images only
)
test_generator.reset()

# the generator reports paths relative to `test_directory`; keep the file name only
file_names = [os.path.basename(i) for i in test_generator.filenames]

os.chdir(cp_directory)
# NOTE(review): the training cell saves '42class_final_model_vNas2.h5' -
# confirm that this older file name is the model meant to be evaluated.
model = models.load_model('42class_final_model.h5')

# An explicit step count guarantees exactly one pass over the test files, even
# if Keras treats the directory iterator as an endless Python generator.
y_pred = model.predict(test_generator, steps = len(test_generator))

# categories are written as zero-padded two-digit strings ('00' ... '41')
result_pd = pd.DataFrame({'filename': file_names, 'category': [f'{i:02}' for i in np.argmax(y_pred, axis = 1)]})
result_pd.to_csv('result_pd.csv', index = False)