In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import h5py
import os
import time
from random import sample
import datetime
from collections import Counter
import sys
from tqdm import tqdm, notebook
from tensorflow.keras.datasets import reuters
pd.options.display.max_colwidth = 100

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten, Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import sparse_categorical_crossentropy
from keras.utils import plot_model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import preprocessing
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Conv1D, MaxPooling1D
from keras.preprocessing.text import Tokenizer

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import ParameterGrid

word_index = reuters.get_word_index()
tf.keras.backend.set_floatx('float64')

In [0]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount = True)
root_dir = '/content/gdrive/My Drive/'
base_dir = root_dir + 'Northwestern/Artificial Intelligence and Deep Learning/Assignment 3/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
def import_data(voc_size):

    (train_data_raw, train_labels_raw), (test_data_raw, test_labels_raw) = reuters.load_data(num_words=voc_size)
    # word_index = reuters.get_word_index()

    return train_data_raw, train_labels_raw, test_data_raw, test_labels_raw

In [0]:
def top_label_data(X, y):

    topics_tpl, topics_freq = zip(*Counter(y).most_common(9))
    X_top, y_top = zip(*((x_samp, y_samp) for x_samp, y_samp in zip(X, y) if y_samp in topics_tpl))
    X_top, y_top = np.array(X_top), np.array(y_top)

    keys = Counter(y_top).keys()
    conv = dict(zip(sorted(keys), range(0,9)))
    y_top = [conv[l] for l in y_top]

    return X_top, y_top

In [0]:
def max_len(train_data, train_labels, percentile, top):

    if top == False:
        pass

    elif top == True:
        train_data, train_labels = top_label_data(train_data, train_labels)

    train_data_len = [len(w) for w in train_data]
    maxlen = int(np.percentile(train_data_len, percentile))

    return maxlen

In [0]:
def vectorize_sequences(sequences, dimension = 10000):

    results = np.zeros((len(sequences), dimension))
    for i, sequence, in enumerate(sequences):
        results[i, sequence] = 1.

    return results

In [0]:
def return_max_lens(train_data, train_labels, top):

    max_lengths = []
    for percentile in np.arange(20, 120, 20):
        ml = max_len(train_data, train_labels, percentile, top = top)
        max_lengths.append(ml)

    return max_lengths

In [0]:
def prepare_data(train_data, train_labels, test_data, test_labels, max_len,
                 top, embedding = False):

    if top == False:
        pass

    elif top == True:
        train_data, train_labels = top_label_data(train_data, train_labels)
        test_data, test_labels = top_label_data(test_data, test_labels)

    if embedding == False:
        train_data = vectorize_sequences(train_data)
        test_data = vectorize_sequences(test_data)
        # maxlen = None
        
    elif embedding == True:
        # maxlen = max_len(train_data, train_labels, top)
        train_data = preprocessing.sequence.pad_sequences(train_data, maxlen=max_len)
        test_data = preprocessing.sequence.pad_sequences(test_data, maxlen=max_len)

    train_data, val_data, train_labels, val_labels = train_test_split(
        train_data, train_labels, test_size = 0.15)
    train_labels = to_categorical(train_labels)
    val_labels = to_categorical(val_labels)
    test_labels = to_categorical(test_labels)

    return train_data, train_labels, val_data, val_labels, test_data, test_labels

In [0]:
def tensor_slices(train_data, train_labels, val_data, val_labels, test_data, test_labels):

    train_ds = tf.data.Dataset.from_tensor_slices((train_data, train_labels)).shuffle(100).batch(100)
    val_ds = tf.data.Dataset.from_tensor_slices((val_data, val_labels)).batch(100)
    test_ds = tf.data.Dataset.from_tensor_slices((test_data, test_labels))

    return train_ds, val_ds, test_ds

In [0]:
def embedding_matrix(embedding_file, voc_size):

    embeddings_index = {}
    f = open(os.path.join(base_dir, embedding_file))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype = 'float32')
        embeddings_index[word] = coefs
    f.close()

    embedding_dim = int(embedding_file.split('.')[-2].replace('d', ''))

    embedding_matrix = np.zeros((voc_size, embedding_dim))
    for word, i in word_index.items():
        if i < voc_size:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

    return embedding_matrix, embedding_dim

In [0]:
def model_train(features, labels, model):
    
    with tf.GradientTape() as tape:
        predictions = model(features)
        loss = loss_func(labels, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss.update_state(loss)
    train_acc.update_state(labels, predictions)

    return gradients

In [0]:
def model_validate(features, labels, model):
    
    predictions = model(features)
    v_loss = loss_func(labels, predictions)

    valid_loss.update_state(v_loss)
    valid_acc.update_state(labels, predictions)

In [0]:
train_loss = tf.keras.metrics.Mean(name="train_loss")
valid_loss = tf.keras.metrics.Mean(name="test_loss")
test_loss = tf.keras.metrics.CategoricalCrossentropy(name="test_loss")

train_acc = tf.keras.metrics.CategoricalAccuracy(name="train_acc")
valid_acc = tf.keras.metrics.CategoricalAccuracy(name="valid_acc")
test_acc = tf.keras.metrics.CategoricalAccuracy(name="test_acc")

loss_func = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.RMSprop()

In [0]:
def df_performance(performance, model_desc):

    df_performance = pd.DataFrame(performance).iloc[-1:]
    df_performance.insert(0, 'model', model_desc)
    file_path = os.path.join(base_dir, 'best_model_table.csv')

    if os.path.exists(file_path):
        df_performance_tot = pd.read_csv(file_path)
        df_performance = pd.concat([df_performance_tot, df_performance])
    else:
        pass

    df_performance.to_csv(file_path, index = False)

    return df_performance.round(3)

In [0]:
def model_run(model, max_len, top = False, embedding = False, pretrained_embed = False):
    
    keras.backend.clear_session()

    train_data_raw, train_labels_raw, test_data_raw, test_labels_raw = import_data(10000)
    train_data, train_labels, val_data, val_labels, test_data, test_labels = prepare_data(
        train_data_raw, train_labels_raw, test_data_raw, test_labels_raw, max_len, top = top, embedding = embedding)
    train_data, val_data, test_slice = tensor_slices(train_data, train_labels, 
                                                    val_data, val_labels, 
                                                    test_data, test_labels)
    

    EPOCHS = 20
    train_acc_history = []
    train_loss_history = []
    val_acc_history = []
    val_loss_history = []
    weight_history = []
    elapsed_time_tot = []
    gradients = []

    for epoch in range(EPOCHS):

        start_time = time.time()
        for features, labels in train_data:
            grad = model_train(features, labels, model)
        gradients.append(grad)
        
        for val_features, val_labels in val_data:
            model_validate(val_features, val_labels, model)

        loss, acc = train_loss.result(), train_acc.result()
        val_loss, val_acc = valid_loss.result(), valid_acc.result()

        train_acc_history.append(acc.numpy())
        train_loss_history.append(loss.numpy())
        val_acc_history.append(val_acc.numpy())
        val_loss_history.append(val_loss.numpy())
        weight_history.append([layer.get_weights() for layer in model.layers])

        train_loss.reset_states(), train_acc.reset_states()
        valid_loss.reset_states(), valid_acc.reset_states()

        end_time = time.time()
        elapsed_time = end_time - start_time
        elapsed_time_tot.append(elapsed_time)
        
        template = """Epoch {}, loss: {:.3f}, acc: {:.3f}, val_loss: {:.3f}, val_acc: {:.3f}, elapsed time: {:.3f}"""
        print (template.format(epoch+1,
                            loss,
                            acc,
                            val_loss,
                            val_acc,
                            elapsed_time))
        
    elapsed_time_tot = [sum(elapsed_time_tot[0:x:1]) for x in range(1, EPOCHS+1)]
        
    predictions = model(test_data, training = False)
    t_acc = test_acc(test_labels, predictions)
    t_loss = loss_func(test_labels, predictions)   
    print(f'Test accuracy: {t_acc:.3f}, Test loss: {t_loss:.3f}')    
               
        
    performance = {
        'train_acc': train_acc_history,
        'train_loss': train_loss_history,
        'val_acc': val_acc_history,
        'val_loss': val_loss_history,
        'test_acc': t_acc.numpy(),
        'test_loss': t_loss.numpy(),   
        'elapsed_time': elapsed_time_tot
    }
        
    return model, weight_history, performance, gradients

In [0]:
def build_model(top, pretrained_embed = False):

    if top == False:
        topics = 46
    
    elif top == True:
        topics = 9

    model = Sequential()

    if pretrained_embed == True:
        embedding_file = 'glove.6B.100d.txt'
        embed_matrix, embed_dim = embedding_matrix(embedding_file)
        model.layers[0].set_weights([embed_matrix])
        model.layers[0].trainable = False
        output_dim = embed_dim
    elif pretrained_embed == False:
        output_dim = 128

    model.add(Embedding(input_dim = 10000, output_dim = output_dim, input_length = 280))
    # model.add(Conv1D(filters= 16, kernel_size = 5, activation = 'relu'))
    # model.add(MaxPooling1D(3))
    # model.add(BatchNormalization())
    # model.add(Dropout(0.25))
    # model.add(Dense(16, activation = 'relu', kernel_regularizer='l2'))
    # model.add(layers.GRU(units = 32, activation = 'tanh'))
    # model.add(BatchNormalization())
    # model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(32, activation = 'relu', kernel_regularizer='l2'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(64, activation = 'relu', kernel_regularizer='l2'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(64, activation = 'relu', kernel_regularizer='l2'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    # model.add(Dense(32, activation = 'relu', kernel_regularizer='l2'))
    # model.add(BatchNormalization())
    # model.add(Dropout(0.8))
    # model.add(Flatten())
    # model.add(Dense(128, activation = 'relu', kernel_regularizer='l2'))
    # model.add(BatchNormalization())
    # model.add(Dropout(0.5))
    model.add(Dense(units = topics, activation = 'softmax'))

    model.compile(
                    optimizer = 'rmsprop',
                    loss = 'categorical_crossentropy',
                    metrics = ['accuracy'])

    return model

In [0]:
top = False
embedding = True
pretrained_embed = True
model, weight_history, performance, grads = model_run(build_model(top), 280, top = top, 
    embedding = embedding, pretrained_embed = pretrained_embed)
df_performance(performance, 'dnn_1 pretrained no TOP')

Epoch 1, loss: 2.134, acc: 0.439, val_loss: 1.797, val_acc: 0.567, elapsed time: 6.420
Epoch 2, loss: 1.524, acc: 0.605, val_loss: 1.571, val_acc: 0.623, elapsed time: 6.361
Epoch 3, loss: 1.109, acc: 0.717, val_loss: 1.514, val_acc: 0.640, elapsed time: 6.422
Epoch 4, loss: 0.739, acc: 0.827, val_loss: 1.458, val_acc: 0.682, elapsed time: 6.508
Epoch 5, loss: 0.478, acc: 0.891, val_loss: 1.555, val_acc: 0.665, elapsed time: 6.401
Epoch 6, loss: 0.327, acc: 0.930, val_loss: 1.639, val_acc: 0.684, elapsed time: 6.335
Epoch 7, loss: 0.243, acc: 0.943, val_loss: 1.820, val_acc: 0.673, elapsed time: 6.345
Epoch 8, loss: 0.198, acc: 0.949, val_loss: 2.426, val_acc: 0.588, elapsed time: 6.310
Epoch 9, loss: 0.171, acc: 0.951, val_loss: 2.089, val_acc: 0.655, elapsed time: 6.194
Epoch 10, loss: 0.149, acc: 0.952, val_loss: 2.482, val_acc: 0.654, elapsed time: 6.272
Epoch 11, loss: 0.139, acc: 0.952, val_loss: 2.345, val_acc: 0.657, elapsed time: 6.290
Epoch 12, loss: 0.129, acc: 0.952, val_lo

Unnamed: 0,model,train_acc,train_loss,val_acc,val_loss,test_acc,test_loss,elapsed_time
0,cnn_test_1,0.991,0.037,0.769,2.099,0.75,2.353,233.583
1,cnn_test_2,0.982,0.041,0.769,1.479,0.756,1.651,90.137
2,cnn w/ GRU,0.961,0.138,0.721,1.183,0.743,1.257,100.981
3,dnn_1,0.973,0.049,0.748,1.448,0.747,1.508,89.051
4,dnn_2,0.968,0.054,0.763,1.427,0.75,1.543,86.556
5,dnn_3,0.974,0.047,0.768,1.24,0.753,1.311,83.492
6,cnn_3,0.964,0.065,0.801,1.294,0.757,1.358,82.754
7,cnn_4,0.963,0.09,0.802,1.108,0.762,1.203,84.652
8,cnn_5,0.963,0.096,0.8,1.034,0.767,1.136,54.403
9,cnn_lstm_nn_1,0.964,0.141,0.78,0.968,0.785,0.98,95.094
