### Benchmark - LSTM model


In [None]:
# reference: https://nzlul.medium.com/the-classification-of-text-messages-using-lstm-bi-lstm-and-gru-f79b207f90ad
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
# Text pre-processing
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
# Modeling
from keras.models import Sequential
from keras.layers import LSTM, GRU, Dense, Embedding, Dropout, GlobalAveragePooling1D, Flatten, SpatialDropout1D, Bidirectional
from keras.utils import to_categorical

In [17]:
# Read the training essays and shift every score down by one
# (presumably so the labels are zero-based for one-hot encoding -- confirm against the raw score range).
df = pd.read_csv('../data/train.csv').assign(score=lambda frame: frame['score'] - 1)
df.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,2
1,000fe60,I am a scientist at NASA that is discussing th...,2
2,001ab80,People always wish they had the same technolog...,3
3,001bdc0,"We all heard about Venus, the planet without a...",3
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",2



#### Train-validation split

In [18]:
# Hold out 20% of the essays for validation; the fixed random_state keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    df['full_text'],
    df['score'],
    test_size=0.2,
    random_state=434,
)

In [26]:
# One-hot encode the labels for the 6-way softmax / categorical cross-entropy model.
# `num_classes` is pinned so both arrays always get 6 columns; when it is omitted the
# width is inferred from the largest label present, so a split that happens to miss
# the top score would produce a narrower array than the model's output layer.
# The ndim guard keeps the cell re-runnable: passing an already one-hot (2-D) array
# back through `to_categorical` would add another axis.
if np.ndim(y_train) == 1:
    y_train = to_categorical(y_train, num_classes=6)
if np.ndim(y_val) == 1:
    y_val = to_categorical(y_val, num_classes=6)

In [19]:
# Text-vectorisation settings shared by the tokenizer and the padding step.
vocab_size = 500        # handed to the Tokenizer as num_words
oov_tok = '<OOV>'       # placeholder token for out-of-vocabulary words
max_len = 50            # sequence length used when padding / truncating
padding_type = 'post'
trunc_type = 'post'

# Word-level tokenizer, fitted on the training texts only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok, char_level=False)
tokenizer.fit_on_texts(X_train)

In [20]:
# Alias the fitted tokenizer's `word_index` attribute under a shorter name.
word_index = tokenizer.word_index

In [21]:
# Both splits go through the same fitted tokenizer and are padded / truncated
# with identical settings, so the options are collected once.
pad_options = dict(maxlen=max_len, padding=padding_type, truncating=trunc_type)

training_sequences = tokenizer.texts_to_sequences(X_train)
testing_sequences = tokenizer.texts_to_sequences(X_val)

training_padded = pad_sequences(training_sequences, **pad_options)
testing_padded = pad_sequences(testing_sequences, **pad_options)

In [27]:
# Hyper-parameters
vocab_size = 500    # must match the num_words given to the Tokenizer
embedding_dim = 16
drop_value = 0.2    # used for both the LSTM input dropout and its recurrent dropout
n_dense = 24        # NOTE(review): not used by the architecture below

# LSTM classifier: embedding -> single LSTM layer -> 6-way softmax output
model = Sequential()
model.add(Embedding(vocab_size,
                    embedding_dim,
                    input_length = max_len))
model.add(LSTM(100, dropout=drop_value, recurrent_dropout=drop_value))
model.add(Dense(6, activation='softmax'))
# Categorical cross-entropy expects one-hot encoded targets.
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam' , metrics = ['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 50, 16)            8000      
                                                                 
 lstm_2 (LSTM)               (None, 100)               46800     
                                                                 
 dense_3 (Dense)             (None, 6)                 606       
                                                                 
Total params: 55406 (216.43 KB)
Trainable params: 55406 (216.43 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
num_epochs = 30
checkpoint_filepath = './checkpoint.model.keras'

# Early stopping on val_loss with a patience of 3 epochs; this is the instance handed to fit().
early_stop = EarlyStopping(monitor='val_loss', patience=3)

# A second early-stopping instance (patience 2). It is created here but is not
# included in the callbacks list passed to fit() below.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

# Save the model to disk only when val_loss reaches a new minimum.
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor="val_loss",
    mode='min',
    save_best_only=True,
)

fit_callbacks = [early_stop, model_checkpoint_callback]
history = model.fit(
    training_padded,
    y_train,
    validation_data=(testing_padded, y_val),
    epochs=num_epochs,
    batch_size=128,
    callbacks=fit_callbacks,
    verbose=2,
)

Epoch 1/30


2024-04-16 02:18:01.539357: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


KeyboardInterrupt: 

In [None]:
# Per-epoch metrics recorded by model.fit, with the epoch number as an extra column.
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch

# Two side-by-side panels: loss on the left, accuracy on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
curve_style = dict(marker='o', linestyle=':')

ax1.set_title('Training and Validation Loss')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss (Categorical Cross Entropy)')
ax1.plot(hist['epoch'], hist['loss'], label='Train Loss', **curve_style)
ax1.plot(hist['epoch'], hist['val_loss'], label='Validation Loss', **curve_style)
ax1.legend()

# The second curve is `val_accuracy`, so the title says "Validation" (not "Testing")
# to agree with the legend and with the loss panel.
ax2.set_title('Training and Validation Accuracy')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy')
ax2.plot(hist['epoch'], hist['accuracy'], label='Train Accuracy', **curve_style)
ax2.plot(hist['epoch'], hist['val_accuracy'], label='Validation Accuracy', **curve_style)
ax2.legend()
plt.show()

In [None]:
# The network was trained on padded integer sequences, so prediction has to use the
# same encoded arrays (`training_padded` / `testing_padded`) rather than the raw essay text.
# argmax over axis 1 converts each softmax row into a predicted class index.
train_pred = model.predict(training_padded, verbose=0).argmax(1)
val_pred = model.predict(testing_padded, verbose=0).argmax(1)

# print confusion matrix for the best model
def plot_cm(ground_truth, pred_data,
            titles=('Confusion Matrix for Train Data',
                    'Confusion Matrix for Validation Data')):
    """Draw one confusion-matrix heatmap per (true labels, predicted labels) pair.

    Parameters
    ----------
    ground_truth : sequence of array-like
        True class labels, one entry per panel.
    pred_data : sequence of array-like
        Predicted class labels, aligned index-by-index with ``ground_truth``.
    titles : sequence of str, optional
        Panel titles, matched to panels by position. Panels beyond the end of
        this sequence are left untitled. The default covers the usual
        train / validation pair.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    n_panels = len(ground_truth)
    if n_panels == 0:
        # Nothing to draw; plt.subplots cannot create a grid with zero columns.
        return

    # squeeze=False keeps `axes` two-dimensional even when there is a single panel.
    fig, axes = plt.subplots(1, n_panels, figsize=(15, 4), squeeze=False)
    for i, panel_ax in enumerate(axes[0]):
        sns.heatmap(confusion_matrix(ground_truth[i], pred_data[i]), cmap='summer', ax=panel_ax)
        panel_ax.set_xlabel("Predicted Label")
        panel_ax.set_ylabel("True Label")
        if i < len(titles):
            panel_ax.set_title(titles[i])

# Compare true vs. predicted class indices for the train and validation splits;
# argmax(1) recovers the class index from each one-hot label row.
true_labels = [y_train.argmax(1), y_val.argmax(1)]
predicted_labels = [train_pred, val_pred]
plot_cm(true_labels, predicted_labels)

In [None]:
# Reload the model saved at `checkpoint_filepath` (written by the ModelCheckpoint
# callback with save_best_only=True, i.e. the lowest val_loss seen during training).
model = keras.models.load_model(checkpoint_filepath)

# Evaluate on the padded integer sequences the model was trained on; the raw text
# Series (`X_train` / `X_val`) is not a valid input for the Embedding layer.
train_loss, train_acc = model.evaluate(training_padded, y_train, batch_size=128, verbose=0)
val_loss, val_acc = model.evaluate(testing_padded, y_val, batch_size=128, verbose=0)