In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import random
import cv2
import os

import resnet

In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

First we need to set up some basic variables such as our batch size, country and data directories/paths 

In [3]:
COUNTRY = "mex"
CSV_PATH = "../../CCI/hmbaier/cci_example.csv"
IMAGERY_DIR = "../../CCI/hmbaier/"
BATCH_SIZE = 64
checkpoint_path = "training/cp-{epoch:04d}.ckpt"

Next, we create a variables called files that contains the name of every image in our base folder, then we subset it to just the iamges for our selected country

In [4]:
files = os.listdir(IMAGERY_DIR)
files = [IMAGERY_DIR + i for i in files if COUNTRY in i]

files[0:5]

['../../CCI/hmbaier/01DTV0015X_mex_2016-08-18_2017-05-17.png',
 '../../CCI/hmbaier/01DTV0005Q_mex_2016-08-18_2017-05-17.png',
 '../../CCI/hmbaier/01DTV0010B_mex_2016-08-18_2017-05-17.png',
 '../../CCI/hmbaier/01DTV0013Z_mex_2016-08-18_2017-05-17.png',
 '../../CCI/hmbaier/01DTV0002T_mex_2016-08-18_2017-05-17.png']

In [5]:
for im in files[0:5]:
    print(cv2.imread(im).shape)

(256, 256, 3)
(256, 256, 3)
(256, 256, 3)
(256, 256, 3)
(256, 256, 3)


We will use a type of Python object called a Generator for our datalaoder. You can read more about them here: https://realpython.com/introduction-to-python-generators/

In [6]:
cc = tf.keras.layers.experimental.preprocessing.CenterCrop(256, 256)


def train_test_split(files, split):

    """ Split data into training and validation sets """

    train_num = int(len(files) * split)

    train = random.sample(files, train_num)
    val = [i for i in files if i not in train]

    return train, val


def get_train():

    """ Training data generator """

    for file in train_files:
        
        # Grab the school_id from the image name
        school_id = file.split("/")[4].split("_")[0]
        
        # Grab the test score for the current school from our scores dataframe
        score = df[df["school_id"] == school_id]["y"]
        
        if len(score) == 0:
            score = 0
        else:
            score = score.squeeze()        
        
        # Read in our image and normalize it by divding it by the maximum value (this normalization is super important!)
        im = cv2.imread(file)
        im = im / np.max(im)
        
#         im = im[0:256, 0:256, :]
        
        # Create a tuple with (image array, test score) and return it
        ret = ( np.array(im), np.reshape(np.array(score), (-1, 1)) )
        
        yield ret


def get_val():

    """ Validation data generator """

    for file in val_files:
        school_id = file.split("/")[4].split("_")[0]
        score = df[df["school_id"] == school_id]["y"]
        
        if len(score) == 0:
            score = 0
        else:
            score = score.squeeze()        
        
        im = cv2.imread(file)
        im = im / np.max(im)
        
#         im = im[0:256, 0:256, :]
        
        ret = ( np.array(im), np.reshape(np.array(score), (-1, 1)) )       
        
        yield ret

Read in our test scores dataframe

In [7]:
df = pd.read_csv(CSV_PATH)
df.head()

Unnamed: 0,country,school_id,test_score,scaled_score,y
0,phl,107022,105.3,35.583573,0
1,phl,107023,137.05,58.458213,1
2,phl,107024,142.39,62.305476,1
3,phl,107025,166.03,79.337176,1
4,phl,107026,152.81,69.81268,1


Create a TensorFlow dataloader using the ```tf.data.Dataset.from_generator``` class

In [8]:
print(f"Number of image files for {COUNTRY}: {str(len(files))}")

train_files, val_files = train_test_split(files, .75)

train_dataset = tf.data.Dataset.from_generator(generator = get_train, output_types = (tf.float32, tf.float32)).batch(BATCH_SIZE)
val_dataset = tf.data.Dataset.from_generator(generator = get_val, output_types = (tf.float32, tf.float32)).batch(BATCH_SIZE)

print("Number of training files: ", len(train_files))
print("Number of validation files: ", len(val_files))

Number of image files for mex: 10173
Number of training files:  7629
Number of validation files:  2544


Create an compile our model using our selected parameters and metrics. In this case, I am using the MeanAbsoluteError Loss and a learning rate of 0.0001 along with the Adam optimizer.

In [9]:
model = resnet.resnet56(img_input = tf.keras.layers.Input((256, 256, 3)), classes = 2)

model.compile(
    optimizer = tf.keras.optimizers.Adam(learning_rate = 0.0001),
    loss = tf.keras.losses.BinaryCrossentropy()
)

In [10]:
# Create a callback that saves the model's weights every 5 epochs
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=1, 
    save_weights_only=True,
    save_freq = 5 * BATCH_SIZE)

# Save the weights using the `checkpoint_path` format
model.save_weights(checkpoint_path.format(epoch=0))

And finally, train your model!

In [None]:
with tf.device('/device:GPU:5'):
    
    model.fit(train_dataset,
               epochs = 15,
               validation_data = val_dataset,
               callbacks = [cp_callback],),

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 00003: saving model to training/cp-0003.ckpt
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 00006: saving model to training/cp-0006.ckpt