In [1]:
import pandas as pd
from gensim.models import Word2Vec
from keras.api.preprocessing.sequence import pad_sequences
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.preprocessing import LabelEncoder
import numpy as np

train_raw = pd.read_csv("work-data/train.csv")
test_raw = pd.read_csv("work-data/val.csv")

train_raw.head()

Unnamed: 0,id,label,text
0,8901,5,Bennett 's naturalistic performance speaks vol...
1,2506,5,"Shot in rich , shadowy black-and-white , Devil..."
2,2381,5,"More than their unique residences , Home Movie..."
3,1262,3,The movie should be credited with remembering ...
4,2542,4,Audiences are advised to sit near the back and...


In [2]:
def tokenize_text(text):
    return [
        [word.lower() for word in word_tokenize(sentence)] for sentence in sent_tokenize(text)
    ]

In [3]:
train = train_raw.copy()
test = test_raw.copy()

tokenized_train = [sent for item in train['text'].apply(tokenize_text) for sent in item]
tokenized_test = [sent for item in test['text'].apply(tokenize_text) for sent in item]

In [4]:
word2vec = Word2Vec(sentences=tokenized_train, vector_size=100, window=7, min_count=3, workers=4)
word_index = {word: i + 1 for i, word in enumerate(word2vec.wv.index_to_key)}

# word_index

In [5]:
def text_to_seq(text, word_index):
    return [word_index[word] for word in word_tokenize(text.lower()) if word in word_index]

X_train_seq = [text_to_seq(text, word_index) for text in train['text']]
X_val_seq = [text_to_seq(text, word_index) for text in test['text']]

max_len = max([len(seq) for seq in X_train_seq + X_val_seq])

X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_val_padded = pad_sequences(X_val_seq, maxlen=max_len, padding='post')

label_encoder = LabelEncoder()

y_train = label_encoder.fit_transform(train['label'])
y_val = label_encoder.transform(test['label'])


In [6]:
#Embedding Matrix

embedding_matrix = np.zeros((len(word_index) + 1, 100))

for word, i in word_index.items():
    if word in word2vec.wv:
        embedding_matrix[i] = word2vec.wv[word]

embedding_matrix


array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-2.41384864e-01,  8.61281514e-01,  2.59404778e-02, ...,
        -6.78646982e-01,  1.19367391e-01, -2.81263739e-01],
       [-3.61637115e-01,  1.10781515e+00,  2.35326774e-02, ...,
        -6.40042782e-01,  7.08586350e-02, -4.10629272e-01],
       ...,
       [-1.18783815e-02,  9.22933780e-03,  7.44628767e-03, ...,
        -1.42964004e-02,  4.43949457e-03,  4.57164831e-03],
       [-9.99800512e-04,  1.49956141e-02, -1.55561802e-03, ...,
        -4.05468792e-03,  9.78338998e-03, -7.83201866e-03],
       [-1.35418391e-02,  2.01124698e-02, -1.20921899e-03, ...,
        -1.71665810e-02, -6.76015019e-03,  2.71065277e-03]])

In [7]:
from keras import Sequential
from keras.api.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.api.callbacks import EarlyStopping
from keras.api.optimizers import Adam

model = Sequential([
    Embedding(input_dim=len(word_index) + 1, output_dim=100, weights=[embedding_matrix], trainable=True),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(64)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(5, activation='softmax')
])
adam = Adam(learning_rate=0.0002)
model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

2025-03-07 22:39:33.140475: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-03-07 22:39:33.140615: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 24.00 GB
2025-03-07 22:39:33.140620: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 8.00 GB
I0000 00:00:1741361973.140987  140256 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1741361973.141152  140256 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [8]:
early_stopping = EarlyStopping(monitor='val_accuracy', patience=2, restore_best_weights=True)

model.fit(X_train_padded, y_train, validation_data=(X_val_padded, y_val), epochs=100, batch_size=256, callbacks=[early_stopping])

Epoch 1/100


2025-03-07 22:39:34.010425: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 399ms/step - accuracy: 0.2551 - loss: 1.5924 - val_accuracy: 0.2784 - val_loss: 1.5805
Epoch 2/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 391ms/step - accuracy: 0.2684 - loss: 1.5801 - val_accuracy: 0.2829 - val_loss: 1.5713
Epoch 3/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 391ms/step - accuracy: 0.2720 - loss: 1.5715 - val_accuracy: 0.2942 - val_loss: 1.5656
Epoch 4/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 477ms/step - accuracy: 0.2932 - loss: 1.5618 - val_accuracy: 0.3037 - val_loss: 1.5576
Epoch 5/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 519ms/step - accuracy: 0.2977 - loss: 1.5497 - val_accuracy: 0.2998 - val_loss: 1.5656
Epoch 6/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 394ms/step - accuracy: 0.3126 - loss: 1.5339 - val_accuracy: 0.3110 - val_loss: 1.5450
Epoch 7/100
[1m33/33[0m [

<keras.src.callbacks.history.History at 0x33bdaf6b0>

In [9]:
model.evaluate(X_val_padded, y_val)

[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - accuracy: 0.3825 - loss: 1.4058


[1.401124119758606, 0.3976377844810486]

In [12]:
import pickle

model.save("models/lstm-model.keras")

with open("models/label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

# save the word index
with open("models/word_index.pkl", "wb") as f:
    pickle.dump(word_index, f)

# save the max len
with open("models/max_len.pkl", "wb") as f:
    pickle.dump(max_len, f)

In [11]:
# model.predict(X_val_padded)
print(X_val_padded)

[[  35  229   10 ...    0    0    0]
 [ 667   14 1710 ...    0    0    0]
 [5426    3   56 ...    0    0    0]
 ...
 [  88  716 4902 ...    0    0    0]
 [   4  565  185 ...    0    0    0]
 [1085    9    4 ...    0    0    0]]
