In [1]:
# Thanks to Burak Himmetoglu at
# https://github.com/healthDataScience/deep-learning-HAR
# for providing the framework
import numpy as np
import pandas as pd
import gc
import os, sys
#from sklearn.model_selection import train_test_split
data_path = os.path.join(os.environ['DEEPQSODIR'], 'data')
sys.path.insert(0, data_path)
from data_utils import *
import tensorflow as tf
import time
from sklearn.model_selection import KFold

## Quasar classification using a 1D ConvNet
### Author: Ji Won Park (jiwoncpark)
In this notebook, we perform quasar classification using a fairly shallow 1D ConvNet.

In [2]:
# Data paths
features_path = os.path.join(data_path, 'features.npy')
label_path = os.path.join(data_path, 'labels.npy')

# Load data
X = np.load(features_path)
y = np.load(label_path).reshape(-1).astype(int)

# Directory where model checkpoints (weights) will be stored
weights_dir = 'checkpoints-cnn'
if (os.path.exists(weights_dir) == False):
    os.makedirs(weights_dir)
    
# Directory where logs will be stored
logs_dir = 'logs-cnn'
if (os.path.exists(logs_dir) == False):
    os.makedirs(logs_dir)

In [3]:
# Data constants
NUM_CLASSES = 2
NUM_OBJECTS, NUM_TIMES, NUM_CHANNELS = X.shape

# Training hyperparameters
DEBUG = True
BATCH_SIZE = 500
LEARNING_RATE = 1.e-2
NUM_EPOCHS = 2
KEEP_PROB = 1.0

In [4]:
# Set up k-fold cross validation
kf = KFold(n_splits=3, shuffle=True, random_state=123)

# One-hot encode labels
y = to_onehot(y, num_classes=NUM_CLASSES)

In [5]:
# Set up graph
graph = tf.Graph()

# Construct placeholders
with graph.as_default():
    inputs_ = tf.placeholder(tf.float32, [None, NUM_TIMES, NUM_CHANNELS], name = 'inputs')
    labels_ = tf.placeholder(tf.float32, [None, NUM_CLASSES], name = 'labels')
    keep_prob_ = tf.placeholder(tf.float32, name = 'keep')
    learning_rate_ = tf.placeholder(tf.float32, name = 'learning_rate')

In [6]:
# Define network architecture
with graph.as_default():
    with tf.name_scope('conv0'):
        # (batch, 738, NUM_CHANNELS) --> (batch, 738, NUM_CHANNELS)
        conv0 = tf.layers.conv1d(inputs=inputs_, filters=NUM_CHANNELS, kernel_size=1, strides=1,
                                 padding='same', activation = tf.nn.relu, name='conv0')
        if DEBUG: print("After conv0: ", conv0.shape)

    with tf.name_scope('conv_mp0'):
        # (batch, 738, NUM_CHANNELS) --> (batch, 369, NUM_CHANNELS*2)
        conv00 = tf.layers.conv1d(inputs=conv0, filters=NUM_CHANNELS*2, kernel_size=1, strides=1,
                                  padding='same', activation = tf.nn.relu, name='conv00')
        max_pool_00 = tf.layers.max_pooling1d(inputs=conv00, pool_size=2, strides=2, padding='same', name='maxpool00')
        if DEBUG: print("After conv00, max_pool00: ", max_pool_00.shape)

    with tf.name_scope('conv_mp1'):
        # (batch, 369, NUM_CHANNELS*2) --> (batch, 123, NUM_CHANNELS*2*3)
        conv1 = tf.layers.conv1d(inputs=max_pool_00, filters=NUM_CHANNELS*2*3, kernel_size=2, strides=1, 
                                 padding='same', activation = tf.nn.relu, name='conv1')
        max_pool_1 = tf.layers.max_pooling1d(inputs=conv1, pool_size=2, strides=3, padding='same', name='maxpool1')
        if DEBUG: print("After conv1, max_pool_1: ", max_pool_1.shape)
    
    with tf.name_scope('conv_mp2'):
        # (batch, 123, NUM_CHANNELS*2*3) --> (batch, 41, NU_CHANNELS*2*3*3)
        conv2 = tf.layers.conv1d(inputs=max_pool_1, filters=NUM_CHANNELS*2*3*3, kernel_size=2, strides=1, 
                                 padding='same', activation = tf.nn.relu, name='conv2')
        max_pool_2 = tf.layers.max_pooling1d(inputs=conv2, pool_size=2, strides=3, padding='same', name='maxpool2')
        if DEBUG: print("After conv2, max_pool_2: ", max_pool_2.shape)

After conv0:  (?, 738, 8)
After conv00, max_pool00:  (?, 369, 16)
After conv1, max_pool_1:  (?, 123, 48)
After conv2, max_pool_2:  (?, 41, 144)


In [7]:
# Define cost function and metrics
with graph.as_default():
    with tf.name_scope('fc'):
        flat = tf.reshape(max_pool_2, (-1, 41*NUM_CHANNELS*2*3*3))
    with tf.name_scope('dropout'):
        flat = tf.nn.dropout(flat, keep_prob=keep_prob_)
    
    # Predictions
    logits = tf.layers.dense(flat, NUM_CLASSES)
    
    with tf.name_scope('cost'):
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=labels_))
    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate_).minimize(cost)
    
    with tf.name_scope('accuracy'):
        labels_1d = tf.argmax(labels_, axis=1)
        preds_1d = tf.argmax(logits, axis=1)
        correct_pred = tf.equal(preds_1d, labels_1d)
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name='accuracy')
    
    with tf.name_scope('confusion_matrix'):
        conf_matrix = tf.confusion_matrix(labels=labels_1d, predictions= preds_1d,
                                          num_classes=NUM_CLASSES, name='conf_matrix')

    with tf.name_scope('summary'):
        # Monitor cost
        tf.summary.scalar("loss", cost)
        # Monitor accuracy
        tf.summary.scalar("accuracy", accuracy)
        # Monitor FP
        tf.summary.scalar("false_positives", conf_matrix[0, 1])
        # Monitor FN
        tf.summary.scalar("false_negatives", conf_matrix[1, 0])
        # Merge all summaries into a single op
        merged = tf.summary.merge_all()

In [8]:
with graph.as_default():
    saver = tf.train.Saver()

In [9]:
# Training loop
train_acc, train_loss = [], []
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    summary_writer = tf.summary.FileWriter(logs_dir, graph=sess.graph)
    iteration = 1
    # Loop over epochs
    for e in range(1, NUM_EPOCHS+1):
        cv_valacc, cv_valloss = [], []
        for train_index, val_index in kf.split(X):
            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]            
                
            # Loop over batches
            for x_t, y_t in fetch_batches(X_train, y_train, batch_size=BATCH_SIZE):
                # Feed dictionary
                feed = {inputs_ : x_t,
                        labels_ : y_t, 
                        keep_prob_ : KEEP_PROB, 
                        learning_rate_ : LEARNING_RATE}
                # Run ops
                loss, _ , acc, cm, summary = sess.run([cost, optimizer, accuracy, conf_matrix, merged], 
                                                      feed_dict = feed)
                # Add summary
                summary_writer.add_summary(summary, iteration)
                train_acc.append(acc)
                train_loss.append(loss)
                iteration += 1

            # Compute validation loss at the end of every CV epoch
            val_acc_ = []
            val_loss_ = []

            #for x_v, y_v in fetch_batches(X_val, y_val, BATCH_SIZE):
            feed = {inputs_ : X_val, labels_ : y_val, keep_prob_ : 1.0}  
            loss_v, acc_v, cm_v = sess.run([cost, accuracy, conf_matrix], feed_dict = feed)                    
            val_acc_.append(acc_v)
            val_loss_.append(loss_v)

            print("Epoch: {}/{}".format(e, NUM_EPOCHS),
                  "Learning rate: {:.6f}".format(LEARNING_RATE),
                  "Iteration: {:d}".format(iteration),
                  "Validation loss: {:6f}".format(np.mean(val_loss_)),
                  "Validation acc: {:.6f}".format(np.mean(val_acc_)))
            print(cm_v)
            # Store accuracy and loss
            cv_valacc.append(np.mean(val_acc_))
            cv_valloss.append(np.mean(val_loss_))
        print('CV mean accuracy: {:.6}, CV std: {:.6}'.format(np.mean(cv_valacc), np.std(cv_valacc)))
        print('CV mean loss: {:.6}'.format(np.mean(cv_valloss)))
        saver.save(sess, os.path.join(weights_dir, 'ckpt-cnn'))
summary_writer.close()

Epoch: 1/2 Learning rate: 0.010000 Iteration: 43 Validation loss: 0.462525 Validation acc: 0.997505
[[5285    2]
 [  24 5110]]
Epoch: 1/2 Learning rate: 0.010000 Iteration: 85 Validation loss: 0.011987 Validation acc: 0.999904
[[5147    1]
 [   0 5273]]
Epoch: 1/2 Learning rate: 0.010000 Iteration: 127 Validation loss: 0.003887 Validation acc: 0.999808
[[5194    2]
 [   0 5224]]
CV mean accuracy: 0.999072, CV std: 0.00110899
CV mean loss: 0.159466
Epoch: 2/2 Learning rate: 0.010000 Iteration: 169 Validation loss: 0.002765 Validation acc: 0.999520
[[5282    5]
 [   0 5134]]
Epoch: 2/2 Learning rate: 0.010000 Iteration: 211 Validation loss: 0.002825 Validation acc: 0.999520
[[5143    5]
 [   0 5273]]
Epoch: 2/2 Learning rate: 0.010000 Iteration: 253 Validation loss: 0.001438 Validation acc: 0.999808
[[5194    2]
 [   0 5224]]
CV mean accuracy: 0.999616, CV std: 0.000135713
CV mean loss: 0.00234251
