This Google Colab notebook is for prototyping and developing our deep learning project. It is linked to the GitHub repo that Yoony made, so updating the code in this repo and running `git push` will propagate the changes to GitHub. Conveniently, we don't have to install any packages (Google handles that for us), so we can start writing code right away. We also don't have to download and upload the training data multiple times, since we can access the training data stored in Google Drive directly. We can also use this notebook to preprocess any data we have.



# Import Required Libraries

In [1]:
import os
import sys
import argparse
import datetime
import numpy as np

import tensorflow
from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import MaxPooling2D

# Mount Drive, Change to Current Directory

In [2]:
# Make files from Google Drive viewable through Colab.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [4]:
# Change to the shared project folder
%cd 'drive/MyDrive/CPSC 452 Deep Learning Final Project/cpsc452-project'

/content/drive/.shortcut-targets-by-id/1vFHGYIR4br7lD84U7pCjnR8Q5HS0X-tu/CPSC 452 Deep Learning Final Project/cpsc452-project


In [7]:
# Note: If it doesn't work for you, remember to add a shortcut to Meili's shared folder to your Drive, as detailed here:
# https://stackoverflow.com/questions/54351852/accessing-shared-with-me-with-colab

In [5]:
# Now we are in the Github repository!
!ls

'Deep Learning Final Project Data Visualization.ipynb'
 get_race_feature
 race_feature.csv
 README.md
 Scaling.ipynb
 train_face_detection.py


# Github Integration Section (Only Need to Do Once!)

In [26]:
# I followed this tutorial to get Github to integrate with Google Drive: 
# https://medium.com/analytics-vidhya/how-to-use-google-colab-with-github-via-google-drive-68efb23a42d

# GITHUB_TOKEN = '<redacted>'  # Never commit a real token: read it at runtime (e.g. getpass.getpass()) and revoke any token that was pasted here.

In [27]:
# Clone the repo into the "CPSC 452 Deep Learning Final Project" shared folder
# ONLY RUN ONCE: git clone https://{GITHUB_TOKEN}@github.com/ykim321/cpsc452-project.git

Cloning into 'cpsc452-project'...
remote: Enumerating objects: 139, done.[K
remote: Counting objects: 100% (139/139), done.[K
remote: Compressing objects: 100% (134/134), done.[K
remote: Total 139 (delta 5), reused 130 (delta 2), pack-reused 0[K
Receiving objects: 100% (139/139), 311.81 MiB | 15.38 MiB/s, done.
Resolving deltas: 100% (5/5), done.


In [11]:
# !git config user.email "j.d.zhao@yale.edu"
# !git config user.name "James Zhao"

# Git Operations for Pushing / Pulling / Etc (Can Do Multiple Times)

In [21]:
# get repo status
!git status

On branch main
Your branch is up to date with 'origin/main'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mDeep_Learning_Training.ipynb[m

nothing added to commit but untracked files present (use "git add" to track)


In [22]:
# pull latest repo data
!git pull

Already up to date.


In [23]:
# stage modifications to already-tracked files (`-u` skips untracked files; use `git add <file>` for new ones)
!git add -u

In [24]:
# commit staged changes to the local repository (they reach the remote only after `git push`)
!git commit -m "Added ipynb notebook"

On branch main
Your branch is up to date with 'origin/main'.

Untracked files:
	[31mDeep_Learning_Training.ipynb[m

nothing added to commit but untracked files present


In [17]:
# push to remote
!git push

Everything up-to-date


In [3]:
# Load the training data (inputs and targets) from an .npz archive
def load_data_from_npz_file(file_path):
    """
    Load data from npz file
    :param file_path: path to npz file with training data; the archive must
                      contain the arrays 'input' and 'target'
    :return: input features and target data as numpy arrays
    :raises KeyError: if either 'input' or 'target' is missing from the archive
    """
    # np.load returns a lazily-read NpzFile that keeps the archive open.
    # Indexing materialises each array in memory, so the handle can be
    # released as soon as both arrays have been read.
    with np.load(file_path) as archive:
        return archive['input'], archive['target']

# Scale pixel values by 1/255 (no mean/variance centering is performed)
def normalize_data_per_row(data):
    """
    Rescale a batch of images by dividing every value by 255.

    :param data: input batch as a numpy array laid out as NxHxWxC
    :return: array with the same dimensions as the input; values that were in
             [0, 255] end up in [0, 1]
    """

    # only 4D batches are accepted
    num_axes = len(data.shape)
    assert num_axes == 4, "Expected the input data to be a 4D matrix"

    scaled = data / 255
    return scaled

# Split into train and test 
def split_data(input, target, train_percent):
    """
    Randomly split the input and target data into a training and a validation set.

    Samples are shuffled with numpy's global random state (call
    ``np.random.seed`` beforehand for a reproducible split). Inputs and targets
    are indexed along their first axis only, so inputs of any rank and targets
    shaped either [N] or [Nx1] are supported.

    :param input: inputs, array with N samples along the first axis
    :param target: targets, array with N samples along the first axis
    :param train_percent: fraction in [0, 1] of the data that should be assigned to training
    :return: train_input, train_target, test_input, test_target
    :raises ValueError: if train_percent is outside [0, 1]
    """
    assert input.shape[0] == target.shape[0], \
        "Number of inputs and targets do not match ({} vs {})".format(input.shape[0], target.shape[0])
    if not 0.0 <= train_percent <= 1.0:
        raise ValueError("train_percent must be within [0, 1], got {}".format(train_percent))

    num_samples = input.shape[0]
    shuffled = np.random.permutation(num_samples)

    num_train = int(num_samples * train_percent)
    train_indices = shuffled[:num_train]
    val_indices = shuffled[num_train:]

    # Index the sample axis only: a trailing ``:`` would fail for 1-D targets.
    return input[train_indices], target[train_indices], input[val_indices], target[val_indices]

# Build the model
def build_nonlinear_model():
    """
    Assemble the convolutional binary classifier with Keras.

    Takes no arguments. The first layer is declared with a 64x64x3 input shape
    and the network ends in a single sigmoid unit. The layer summary is printed
    as a side effect.
    :return: Keras Sequential model (not compiled)
    """

    def conv(filters, **extra):
        # every convolution in the network shares these hyper-parameters
        return Conv2D(filters=filters, kernel_size=2, padding="same", activation="relu", **extra)

    net = Sequential()

    # stem: the only 16-filter convolution, which also declares the input shape
    net.add(conv(16, input_shape=(64, 64, 3)))
    net.add(MaxPooling2D(pool_size=2))

    # pooled stages: N 32-filter convolutions followed by one 2x2 max-pool
    for depth in (1, 4, 4):
        for _ in range(depth):
            net.add(conv(32))
        net.add(MaxPooling2D(pool_size=2))

    # final stage: five convolutions, no pooling afterwards
    for _ in range(5):
        net.add(conv(32))

    # classification head
    net.add(Flatten())
    net.add(Dense(32, activation="relu"))
    net.add(Dense(1, activation="sigmoid"))
    net.summary()

    return net

def train_model(model, train_input, train_target, val_input, val_target,
                epochs=200, learning_rate=0.01, batch_size=16):
    """
    Train the model on the given data.

    Inputs are passed through ``n2`` before training. TensorBoard logs and the
    best checkpoint are written to a timestamped ``logs/log_<date>`` directory
    relative to the current working directory.

    :param model: Keras model
    :param train_input: train inputs
    :param train_target: train targets
    :param val_input: validation inputs
    :param val_target: validation targets
    :param epochs: epochs for gradient descent
    :param learning_rate: learning rate for gradient descent
    :param batch_size: batch size for training with gradient descent
    :return: the History object returned by ``model.fit``
    """
    # The file imports the package as ``tensorflow`` (no ``tf`` alias), so
    # resolve Keras through that name.
    keras = tensorflow.keras

    norm_train_input = n2(train_input)
    norm_val_input = n2(val_input)

    # compile the model: define optimizer, loss, and metrics
    # (``learning_rate`` replaces the deprecated ``lr`` keyword)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['binary_accuracy'])

    # tensorboard callback
    logs_dir = 'logs/log_{}'.format(datetime.datetime.now().strftime("%m-%d-%Y-%H-%M"))
    os.makedirs(logs_dir, exist_ok=True)
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logs_dir, write_graph=True)

    # save checkpoint callback: evaluate once per epoch (an integer save_freq
    # counts batches) and keep the model that scores best on validation data
    checkpoint_callback = keras.callbacks.ModelCheckpoint(os.path.join(logs_dir, 'best_weights.h5'),
                                                          monitor='val_binary_accuracy',
                                                          verbose=0,
                                                          save_best_only=True,
                                                          save_weights_only=False,
                                                          mode='max',
                                                          save_freq='epoch')

    # do training for the specified number of epochs and with the given batch size
    return model.fit(norm_train_input, train_target, epochs=epochs, batch_size=batch_size,
                     validation_data=(norm_val_input, val_target),
                     callbacks=[tensorboard_callback, checkpoint_callback])

In [5]:
def main(npz_data_file, batch_size, epochs, lr, val, logs_dir, build_fn=build_nonlinear_model):
    """
    Main function that performs training and test on a validation set
    :param npz_data_file: npz input file with training data
    :param batch_size: batch size to use at training time
    :param epochs: number of epochs to train for
    :param lr: learning rate
    :param val: fraction of the data to hold out as validation (e.g. 0.1 keeps
                90% for training)
    :param logs_dir: directory where to save logs and trained parameters/weights.
                     Currently unused: train_model writes to its own
                     timestamped 'logs/log_...' directory.
    :param build_fn: zero-argument callable that returns the Keras model to train
    """

    inputs, targets = load_data_from_npz_file(npz_data_file)
    num_examples = inputs.shape[0]
    assert num_examples == targets.shape[0], \
        "The input and target arrays had different amounts of data ({} vs {})".format(num_examples, targets.shape[0]) # sanity check!
    print("Loaded {} training examples.".format(num_examples))

    # split_data expects the TRAINING fraction, while `val` is the validation
    # fraction, so pass the complement.
    train_input, train_target, val_input, val_target = split_data(inputs, targets, 1.0 - val)
    model = build_fn()
    train_model(model, train_input, train_target, val_input, val_target, epochs=epochs, learning_rate=lr, batch_size=batch_size)

def n2(data):
    """
    Normalize a given matrix of data (samples must be organized per row).

    Data whose maximum value exceeds 1 is treated as 8-bit pixel data and
    divided by 255; data whose maximum is at most 1 is treated as already
    normalized and returned unchanged, so the function is safe to apply twice.

    :param data: input data as a numpy array with dimensions NxHxWxC
    :return: normalized data with pixel values in [0,1] (array with same dimensions as input)
    """

    # sanity checks!
    assert len(data.shape) == 4, "Expected the input data to be a 4D matrix"

    # An empty batch has no maximum (np.max would raise); nothing to scale.
    if data.size == 0:
        return data

    # Threshold is 1, not 255: 8-bit images never exceed 255, so comparing
    # against 255 would leave raw pixel data unnormalized.
    if np.max(data) > 1:
        return data / 255
    return data


In [6]:
# Launch training. The data path is relative to the current working directory.
main(
    npz_data_file="./Data/64x64_data.npz",
    batch_size=64,
    epochs=100,
    lr=0.01,
    val=0.1,
    logs_dir="./Data/",
)

FileNotFoundError: ignored