# Sentiment Classification on the IMDB Movie Reviews Dataset

In [None]:
#--- Imports ---#

# Setting Random Seed.
import os
import random
import tensorflow as tf

# Data Preprocessing.
import codecs # For reading FastText Embedding file.
import numpy  as np
import pandas as pd
from tqdm import tqdm # For tracking progress of iterable processes.
from sklearn.model_selection  import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# Model Architecture.
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, \
                         BatchNormalization, Dropout, \
                         GlobalMaxPooling1D, GlobalAveragePooling1D, Dense
from keras.utils import plot_model
from IPython.display import Image

# Model Training.
from time import time
from keras.callbacks import Callback, LearningRateScheduler, EarlyStopping

# Model Evaluation.
from tensorflow import keras
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

# Data Visualization.
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#--- Set Random Seed ---#

# Derive a deterministic integer seed from a memorable string by summing the
# Unicode code points of its characters.
WORD = "meow"
seed = sum(map(ord, WORD))

# Apply the same seed to each random number generator seeded by this notebook:
# Python's `random`, NumPy's global generator and TensorFlow's global generator.
for _set_seed in (random.seed, np.random.seed, tf.random.set_seed):
    _set_seed(seed)

In [None]:
#--- Constants ---#

RAND_STATE = seed   # Random State.

# Dataset fractions. They must sum to 1. The splitting cell holds out
# (1 - TRAIN_SIZE) = 0.4 of the data and divides it equally between validation
# and test, i.e. 0.2 each.
TRAIN_SIZE = 0.6    # Size of Train Dataset.
VAL_SIZE   = 0.2    # Size of Validation Dataset.
TEST_SIZE  = 0.2    # Size of Test Dataset.
assert abs(TRAIN_SIZE + VAL_SIZE + TEST_SIZE - 1.0) < 1e-9, \
    "TRAIN_SIZE, VAL_SIZE and TEST_SIZE must sum to 1."

EMBED_DIM  = 100    # Set Embedding Dimensions.
LSTM_CELLS = 128    # Number of LSTM cells for LSTM layer.
DROPOUT    = 0.5    # Dropout Rate for Dropout Layer.
DENSE_DIM  = 128    # Dimensions of Dense Layer.

# NOTE(review): LEARN_RATE and LR_DECAY are not read by the cells in this
# notebook; the learning-rate schedule function uses its own 0.01 / 0.9 values.
LEARN_RATE = 0.001  # Initial Learning Rate.
LR_DECAY   = 0.95   # Learning Rate Decay.
PATIENCE   = 10     # Early Stopping Patience.
EPOCHS     = 10000  # Upper bound on training epochs; early stopping is expected to end training sooner.
BATCH_SIZE = 32     # Training Batch Size.

In [None]:
#--- Load FastText Embeddings ---#

# Create Embedding Dictionary (word -> float32 vector).
dict_fasttext_embedding = {}

# Embedding dimensionality: taken from the header line when there is one,
# otherwise inferred from the first vector line.
embed_dim_fasttext = None

# Open FastText Embedding file. The context manager closes the file even if
# parsing raises part-way through.
with codecs.open('../input/fasttext/wiki.simple.vec', encoding='utf-8') as f:

    # Populate Embedding Dictionary.
    for line_number, line in enumerate(tqdm(f)):
        values = line.rstrip().split(' ')

        # fastText `.vec` files normally begin with a "<vocab_size> <dim>"
        # header line; it must not be stored as if it were a word vector.
        if line_number == 0 and len(values) == 2 and all(v.isdigit() for v in values):
            embed_dim_fasttext = int(values[1])
            continue

        # No header: infer the dimensionality from the first usable line.
        if embed_dim_fasttext is None:
            if len(values) < 2:
                continue
            embed_dim_fasttext = len(values) - 1

        # Skip blank or truncated lines that cannot hold a token plus a full vector.
        if len(values) <= embed_dim_fasttext:
            continue

        # The last `embed_dim_fasttext` fields are the vector; everything before
        # them is the token, which may itself contain spaces.
        word = ' '.join(values[:-embed_dim_fasttext])
        coefs = np.asarray(values[-embed_dim_fasttext:], dtype='float32')
        dict_fasttext_embedding[word] = coefs

if not dict_fasttext_embedding:
    raise ValueError("No word vectors were read from the FastText embedding file.")

# Get FastText Embedding Vocabulary Size.
vocab_size_fasttext = len(dict_fasttext_embedding)
print(f"FastText Embedding Vocabulary Size: {vocab_size_fasttext}")

# Report FastText Embedding Dimensions.
print(f"FastText Embedding Dimensions: {embed_dim_fasttext}")

In [None]:
#--- Load GloVe Embeddings ---#

# Read the GloVe embedding file: each line is a token followed by its
# space-separated vector components.
#   quoting=3 (csv.QUOTE_NONE) treats quote characters as ordinary text.
#   keep_default_na=False stops pandas from turning tokens such as "null",
#   "nan" or "NA" into missing values, which would otherwise become NaN keys.
glove = pd.read_csv('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt',
                    sep=" ", quoting=3, header=None, index_col=0,
                    keep_default_na=False)

# Create Embedding Dictionary (token -> vector) straight from the rows,
# without transposing the whole frame.
dict_glove_embedding = dict(zip(glove.index, glove.to_numpy()))

# Get GloVe Embedding Vocabulary Size.
vocab_size_glove = len(dict_glove_embedding)
print(f"GloVe Embedding Vocabulary Size: {vocab_size_glove}")

# Get GloVe Embedding Dimensions (one column per vector component).
embed_dim_glove = glove.shape[1]
print(f"GloVe Embedding Dimensions: {embed_dim_glove}")

# Data Preprocessing

In [None]:
#--- Read data ---#

df = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

# The following cells read exactly these two columns; fail here with a clear
# message rather than later with a KeyError.
assert {'review', 'sentiment'} <= set(df.columns), \
    "Expected 'review' and 'sentiment' columns in the dataset."

# Preview the first rows instead of displaying the whole frame.
df.head()

In [None]:
#--- Label Encoding ---#

# Encode 'positive' as 1 and every other label as 0.
# The guard makes the cell safe to re-run: once the column is numeric,
# encoding it again would compare integers with 'positive' and silently turn
# every label into 0.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = (df['sentiment'] == 'positive').astype('int64')

df.head()

In [None]:
#--- Tokenization ---#

# Separate the raw review text (features) from the encoded labels.
X_text, y = df['review'], df['sentiment']

# Build the word -> integer id vocabulary from the review text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_text)

# Replace every review with its list of word ids.
X_tokenized = tokenizer.texts_to_sequences(X_text)

# Report (number of reviews, length of the FIRST review). The reviews have not
# been padded yet, so the second number describes only that first sequence.
dims_embedded_data = (len(X_tokenized), len(X_tokenized[0]))
print(f"Dimensions of Embedded Data: {dims_embedded_data}")

# Vocabulary size used for the embedding layer: number of known words plus one
# (presumably for index 0, which the Keras Tokenizer does not assign to a word).
vocab_size_tokenizer = 1 + len(tokenizer.word_index)
print(f"Tokenizer Vocabulary Size: {vocab_size_tokenizer}")

In [None]:
#--- Plot Sequence Lengths ---#

# Number of tokens in every tokenized review.
sequence_lengths = list(map(len, X_tokenized))

# Histogram of the length distribution.
fig_lengths, ax_lengths = plt.subplots()
ax_lengths.hist(sequence_lengths, bins=30)
ax_lengths.set_xlabel('Sequence Length')
ax_lengths.set_ylabel('Count')
plt.show()

# Summary statistics used to choose the padded length in the next cell.
print(f"Mean sequence length: {np.mean(sequence_lengths)}")
print(f"Median sequence length: {np.median(sequence_lengths)}")
print(f"Max sequence length: {max(sequence_lengths)}")
print(f"95th percentile sequence length: {np.percentile(sequence_lengths, 95)}")

In [None]:
#--- Sequence Padding ---#

# Common sequence length: the 95th percentile of the review lengths,
# truncated to an integer.
sequence_length = int(np.percentile(sequence_lengths, 95))

# Bring every tokenized review to that length: shorter reviews are padded,
# longer ones are truncated.
X_padded = pad_sequences(X_tokenized, maxlen=sequence_length)

# float32 copy of the padded id matrix; this is the array that gets split
# into train / validation / test sets.
X_array = np.asarray(X_padded, dtype='float32')

# Report the resulting shape and the chosen length.
print(f"Dimensions of Padded Data: {X_array.shape}")
print(f"Sequence Length: {sequence_length}")

In [None]:
#--- Data Splitting ---#

# First split: keep TRAIN_SIZE of the data for training, hold out the rest.
X_train, X_rest, Y_train, Y_rest = train_test_split(X_array,
                                                    y,
                                                    test_size=(1 - TRAIN_SIZE),
                                                    random_state=RAND_STATE)

# Second split: divide the held-out part between validation and test in the
# ratio VAL_SIZE : TEST_SIZE instead of a hard-coded 0.5.
X_val, X_test, Y_val, Y_test = train_test_split(X_rest,
                                                Y_rest,
                                                test_size=TEST_SIZE / (VAL_SIZE + TEST_SIZE),
                                                random_state=RAND_STATE)

# Every sample must land in exactly one of the three sets.
assert len(X_train) + len(X_val) + len(X_test) == len(X_array)
assert len(Y_train) + len(Y_val) + len(Y_test) == len(y)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of Y_train: {Y_train.shape}")
print(f"Shape of X_val: {X_val.shape}")
print(f"Shape of Y_val: {Y_val.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of Y_test: {Y_test.shape}")

# Model Definition

In [None]:
#--- Function for Creating Embedding Matrix ---#

def create_embedding_matrix(tokenizer, embed_dim, embedding_dict):
    """Build an embedding matrix aligned with the tokenizer's word indices.

    Args:
        tokenizer: object exposing `word_index`, a mapping word -> integer index.
        embed_dim: number of columns of the matrix (embedding dimensionality).
        embedding_dict: mapping word -> embedding vector.

    Returns:
        Array of shape (len(tokenizer.word_index) + 1, embed_dim). Row `i`
        holds the vector of the word whose index is `i`; rows of words that
        are missing from `embedding_dict` (and row 0) stay all-zero.
    """
    word_to_row = tokenizer.word_index

    # One row per word index, plus one extra row for index 0.
    matrix = np.zeros((len(word_to_row) + 1, embed_dim))

    # Copy the vector of every word that has a pretrained embedding.
    known_rows = ((row, embedding_dict[token])
                  for token, row in word_to_row.items()
                  if token in embedding_dict)
    for row, vector in known_rows:
        matrix[row] = vector

    return matrix

In [None]:
#--- Creating Embedding Matrix ---#

# FastText: one row per tokenizer index, filled from the FastText dictionary.
embedding_matrix_fasttext = create_embedding_matrix(
    tokenizer=tokenizer,
    embed_dim=embed_dim_fasttext,
    embedding_dict=dict_fasttext_embedding,
)

# GloVe: same layout, filled from the GloVe dictionary.
embedding_matrix_glove = create_embedding_matrix(
    tokenizer=tokenizer,
    embed_dim=embed_dim_glove,
    embedding_dict=dict_glove_embedding,
)

In [None]:
#--- Model Definition ---#

def create_model(vocab_size_tokenizer, 
                 sequence_length, 
                 embed_dim,
                 embedding_matrix=None,
                 aggregation='max'):
    """Build and compile the bidirectional-LSTM binary classifier.

    Layer stack: Input -> Embedding -> Bidirectional(LSTM) -> Dropout ->
    Dense(relu) -> Dense(1, sigmoid). Layer sizes and the dropout rate come
    from the notebook constants LSTM_CELLS, DENSE_DIM and DROPOUT.

    Args:
        vocab_size_tokenizer: `input_dim` of the embedding layer.
        sequence_length: length of each input sequence.
        embed_dim: `output_dim` of the embedding layer.
        embedding_matrix: optional pretrained weights. When given, the
            embedding layer is initialised with them and frozen; when None,
            the embedding layer is trainable with default initialisation.
        aggregation: accepted for call-site compatibility but not read
            anywhere in this function.

    Returns:
        The model, compiled with the 'adam' optimizer, binary cross-entropy
        loss and the 'accuracy' metric.
    """
    # Arguments shared by both embedding variants.
    embedding_kwargs = dict(input_dim=vocab_size_tokenizer,
                            output_dim=embed_dim,
                            input_length=sequence_length)

    # Pretrained variant: load the given weights and keep them fixed.
    if embedding_matrix is not None:
        embedding_kwargs.update(weights=[embedding_matrix], trainable=False)

    inputs = Input(shape=(sequence_length,))
    x = Embedding(**embedding_kwargs)(inputs)

    # return_sequences=False: only the final output of each direction is kept.
    x = Bidirectional(LSTM(int(LSTM_CELLS), return_sequences=False))(x)
    x = Dropout(rate=DROPOUT)(x)
    x = Dense(int(DENSE_DIM), activation='relu')(x)

    # Single sigmoid unit for the binary label.
    outputs = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model


In [None]:
#--- Create Model without using Pretrained Embedding ---#

# embedding_matrix=None: the embedding layer is trained from scratch with
# EMBED_DIM dimensions.
model_noembedding = create_model(vocab_size_tokenizer, 
                                 sequence_length,
                                 EMBED_DIM,
                                 embedding_matrix=None,
                                 aggregation='max')

# Write the architecture diagram to disk.
plot_model(model_noembedding,
           to_file='model_noembedding.png',
           show_shapes=True,
           show_layer_names=True,
           show_layer_activations=True,
           show_trainable=True)

# summary() prints the table itself and returns None, so it is called
# directly rather than wrapped in print().
model_noembedding.summary()

# Display the diagram written above.
Image(filename='model_noembedding.png')

In [None]:
#--- Create Model using FastText Embedding ---#

# The embedding layer is initialised with the FastText matrix and frozen.
model_fasttext = create_model(vocab_size_tokenizer, 
                              sequence_length,
                              embed_dim_fasttext,
                              embedding_matrix=embedding_matrix_fasttext,
                              aggregation='max')

# Write the architecture diagram to disk.
plot_model(model_fasttext, 
           to_file='model_fasttext.png', 
           show_shapes=True,
           show_layer_names=True,
           show_layer_activations=True,
           show_trainable=True)

# summary() prints the table itself and returns None, so it is called
# directly rather than wrapped in print().
model_fasttext.summary()

# Display the diagram written above.
Image(filename='model_fasttext.png')

In [None]:
#--- Create Model using GloVe Embedding ---#

# The embedding layer is initialised with the GloVe matrix and frozen.
model_glove = create_model(vocab_size_tokenizer, 
                           sequence_length,
                           embed_dim_glove,
                           embedding_matrix=embedding_matrix_glove,
                           aggregation='max')

# Write the architecture diagram to disk.
plot_model(model_glove, 
           to_file='model_glove.png', 
           show_shapes=True,
           show_layer_names=True,
           show_layer_activations=True,
           show_trainable=True)

# summary() prints the table itself and returns None, so it is called
# directly rather than wrapped in print().
model_glove.summary()

# Display the diagram written above.
Image(filename='model_glove.png')


# Model Training

In [None]:
#--- Callbacks ---#

#--- Callback Class for Saving Model Weights ---#

class SaveWeights(Callback):
    """Callback that records the weights of the model's last layer after every epoch."""

    def __init__(self):
        super().__init__()
        # One entry per finished epoch; each entry is whatever
        # `get_weights()` returned for the last layer at that point.
        self.weights_history = []

    def on_epoch_end(self, epoch, logs=None):
        # `layers[-1]` is the LAST layer of the model, not the first.
        final_layer = self.model.layers[-1]
        self.weights_history.append(final_layer.get_weights())
        
#--- Callback Class for Training Time History ---#

class TimeHistory(Callback):
    """Callback that measures wall-clock training time.

    After training, the instance exposes:
      * `times`      - list with the duration of every epoch, in seconds;
      * `time_total` - duration of the whole training run, in seconds.

    The `logs` arguments are never read; they default to None rather than to a
    shared mutable `{}`.
    """

    def on_train_begin(self, logs=None):
        self.time_start = time()        # Start time of the whole training run.
        self.times = []                 # Time taken by each finished epoch.

    def on_epoch_begin(self, batch, logs=None):
        # The first argument keeps its original name `batch` for backward
        # compatibility; this hook runs once per epoch.
        self.epoch_time_start = time()

    def on_epoch_end(self, batch, logs=None):
        self.times.append(time() - self.epoch_time_start)

    def on_train_end(self, logs=None):
        self.time_total = time() - self.time_start
        
#--- Callback Function for Learning Rate Scheduling ---#

def lr_schedule(epoch, *, initial_lr=0.01, decay=0.9):
    """Exponentially decayed learning rate for a given epoch.

    Args:
        epoch: zero-based epoch index.
        initial_lr: learning rate used at epoch 0.
        decay: multiplicative factor applied once per epoch.

    Returns:
        `initial_lr * decay ** epoch`.

    The extra parameters are keyword-only on purpose: Keras'
    LearningRateScheduler may first try calling `schedule(epoch, lr)` before
    falling back to `schedule(epoch)`. A second positional parameter would
    silently receive the current learning rate; keyword-only parameters make
    that first attempt raise TypeError so the one-argument call is used.
    Use `functools.partial(lr_schedule, initial_lr=..., decay=...)` to
    schedule with other values.
    """
    return initial_lr * (decay ** epoch)

In [None]:
#--- Model Training ---#

def train_model(model, X_train, Y_train, X_val, Y_val):
    """Fit `model` with early stopping, LR scheduling, timing and weight logging.

    Training uses the notebook constants EPOCHS, BATCH_SIZE and PATIENCE and
    the `lr_schedule`, `SaveWeights` and `TimeHistory` definitions.

    Args:
        model: compiled Keras model; it is trained in place.
        X_train, Y_train: training inputs and labels.
        X_val, Y_val: validation inputs and labels, monitored for early stopping.

    Returns:
        dict with
          'results'    - DataFrame of the per-epoch metrics from `fit`, plus
                         an 'epoch' column and a 'time' column (seconds per epoch);
          'weights'    - DataFrame built from the last-layer weights recorded
                         after each epoch;
          'total_time' - total training time in seconds.
    """
    # Define Callbacks.
    # restore_best_weights=True: when early stopping triggers, the model is
    # rolled back to the epoch with the best val_accuracy instead of keeping
    # the weights from PATIENCE epochs later.
    cb_earlystop    = EarlyStopping(monitor='val_accuracy',
                                    patience=PATIENCE,
                                    restore_best_weights=True,
                                    verbose=2)
    cb_saveweights  = SaveWeights()
    cb_timehistory  = TimeHistory()
    cb_lr_scheduler = LearningRateScheduler(lr_schedule)

    callbacks = [cb_earlystop, cb_saveweights, cb_timehistory, cb_lr_scheduler]

    history = model.fit(X_train, 
                        Y_train, 
                        validation_data=(X_val, Y_val),
                        epochs=EPOCHS, 
                        batch_size=BATCH_SIZE, 
                        callbacks=callbacks,
                        verbose=2)

    # Per-epoch metrics, extended with the epoch index and epoch duration.
    df_results = pd.DataFrame(history.history)
    df_results['epoch'] = history.epoch
    df_results['time']  = cb_timehistory.times

    df_weights = pd.DataFrame(cb_saveweights.weights_history)
    total_time = cb_timehistory.time_total

    results = {'results'    : df_results,
               'weights'    : df_weights,
               'total_time' : total_time}

    return results

In [None]:
#--- Train Model without using Pretrained Embedding ---#

# `total_time` is the wall-clock duration of the whole `fit` call, in seconds.
results_noembedding = train_model(model_noembedding, X_train, Y_train, X_val, Y_val)
print(f'Training Time: {results_noembedding["total_time"]}')

#--- Evaluate Model without Pretrained Embedding on the Test Set ---#

# The model is compiled with a single 'accuracy' metric, so evaluate() yields
# the pair (loss, accuracy).
loss, accuracy = model_noembedding.evaluate(X_test, Y_test)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

In [None]:
#--- Train Model using FastText Embedding ---#

# `total_time` is the wall-clock duration of the whole `fit` call, in seconds.
results_fasttext = train_model(model_fasttext, X_train, Y_train, X_val, Y_val)
print(f'Training Time: {results_fasttext["total_time"]}')

#--- Evaluate Model using FastText Embedding on the Test Set ---#

# The model is compiled with a single 'accuracy' metric, so evaluate() yields
# the pair (loss, accuracy).
loss, accuracy = model_fasttext.evaluate(X_test, Y_test)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

In [None]:
#--- Train Model using GloVe Embedding ---#

# `total_time` is the wall-clock duration of the whole `fit` call, in seconds.
results_glove = train_model(model_glove, X_train, Y_train, X_val, Y_val)
print(f'Training Time: {results_glove["total_time"]}')

#--- Evaluate Model using GloVe Embedding on the Test Set ---#

# The model is compiled with a single 'accuracy' metric, so evaluate() yields
# the pair (loss, accuracy).
loss, accuracy = model_glove.evaluate(X_test, Y_test)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

In [None]:
#--- Saving Models ---#

# Persist each trained model to its own `.h5` file in the working directory.
for _model, _path in ((model_noembedding, 'model_noembedding.h5'),
                      (model_fasttext,    'model_fasttext.h5'),
                      (model_glove,       'model_glove.h5')):
    _model.save(_path)

# Visualization

In [None]:
#--- Plot Validation Accuracy and Loss ---#

# (legend prefix, training results, accuracy colour, loss colour) per model.
runs = [
    ('No Embedding', results_noembedding, 'darkblue',  'lightskyblue'),
    ('FastText',     results_fasttext,    'firebrick', 'lightcoral'),
    ('GloVe',        results_glove,       'seagreen',  'lightgreen'),
]

# Accuracy goes on the left y-axis, loss on a second y-axis sharing the x-axis.
fig, ax1 = plt.subplots(figsize=(10, 6))
ax2 = ax1.twinx()

# Only the validation curves are drawn, one accuracy and one loss line per model.
for run_label, run, acc_color, loss_color in runs:
    run_history = run['results']
    ax1.plot(run_history['epoch'], run_history['val_accuracy'],
             label=f'{run_label} Accuracy', marker='o', color=acc_color)
    ax2.plot(run_history['epoch'], run_history['val_loss'],
             label=f'{run_label} Loss', marker='o', color=loss_color)

# Labels and legend for the accuracy axis.
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Accuracy', color='tab:blue')
ax1.legend(loc='center left')

# Label and legend for the loss axis. Each axis gets exactly one legend.
ax2.set_ylabel('Loss', color='tab:red')
ax2.legend(loc='center right')

# The title names what is actually plotted: validation metrics only.
ax1.set_title('Validation Accuracy and Loss Over Epochs')

# Show the plot
plt.show()

In [None]:
# Reload the model saved to 'model_noembedding.h5' and evaluate it on the test set.
model_noembedding = keras.models.load_model('model_noembedding.h5')

loss, accuracy = model_noembedding.evaluate(X_test, Y_test)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

# Sigmoid outputs of the final Dense(1) layer, one value per test sample.
Y_pred = model_noembedding.predict(X_test)

# Threshold at 0.5 in one vectorized step and flatten to a 1-D label array
# (1 where the output is strictly greater than 0.5, else 0).
predictions = (Y_pred > 0.5).astype(int).ravel()

# Recompute the metrics from the hard labels with scikit-learn.
accuracy = accuracy_score(Y_test, predictions)
f1 = f1_score(Y_test, predictions)
conf_matrix = confusion_matrix(Y_test, predictions)

print(accuracy)
print(f1)
print(conf_matrix)

In [None]:
# Reload the model saved to 'model_fasttext.h5' and evaluate it on the test set.
model_fasttext = keras.models.load_model('model_fasttext.h5')

loss, accuracy = model_fasttext.evaluate(X_test, Y_test)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

# Sigmoid outputs of the final Dense(1) layer, one value per test sample.
Y_pred = model_fasttext.predict(X_test)

# Threshold at 0.5 in one vectorized step and flatten to a 1-D label array
# (1 where the output is strictly greater than 0.5, else 0).
predictions = (Y_pred > 0.5).astype(int).ravel()

# Recompute the metrics from the hard labels with scikit-learn.
accuracy = accuracy_score(Y_test, predictions)
f1 = f1_score(Y_test, predictions)
conf_matrix = confusion_matrix(Y_test, predictions)

print(accuracy)
print(f1)
print(conf_matrix)

In [None]:
# Reload the model saved to 'model_glove.h5' and evaluate it on the test set.
model_glove = keras.models.load_model('model_glove.h5')

loss, accuracy = model_glove.evaluate(X_test, Y_test)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

# Sigmoid outputs of the final Dense(1) layer, one value per test sample.
Y_pred = model_glove.predict(X_test)

# Threshold at 0.5 in one vectorized step and flatten to a 1-D label array
# (1 where the output is strictly greater than 0.5, else 0).
predictions = (Y_pred > 0.5).astype(int).ravel()

# Recompute the metrics from the hard labels with scikit-learn.
accuracy = accuracy_score(Y_test, predictions)
f1 = f1_score(Y_test, predictions)
conf_matrix = confusion_matrix(Y_test, predictions)

print(accuracy)
print(f1)
print(conf_matrix)

In [None]:
# Raw model outputs of the no-embedding model on the test set.
predictions = model_noembedding.predict(X_test)

# Density-normalised 10-bin histogram of the flattened outputs.
plt.hist(predictions.ravel(),
         bins=10,
         density=True,
         alpha=0.7,
         color='b')