In [None]:
import pandas as pd
from gensim.models import Word2Vec
from keras.api.preprocessing.sequence import pad_sequences
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Read the training and validation CSVs in one pass.
# Note: the validation file ("val.csv") is bound to the name `test_raw`.
train_raw, test_raw = map(pd.read_csv, ("work-data/train.csv", "work-data/val.csv"))

# Preview the first rows of the training frame.
train_raw.head()

In [None]:
def tokenize_text(text):
    """Split ``text`` into sentences and each sentence into lowercase tokens.

    Returns a list with one entry per sentence produced by ``sent_tokenize``;
    each entry is the list of that sentence's ``word_tokenize`` tokens,
    lowercased after tokenization.
    """
    sentences = []
    for sentence in sent_tokenize(text):
        tokens = word_tokenize(sentence)
        sentences.append([token.lower() for token in tokens])
    return sentences

In [None]:
# Work on copies so the raw frames stay untouched.
train, test = train_raw.copy(), test_raw.copy()

# Flatten the per-document sentence lists into one corpus-wide list of
# token lists (one entry per sentence).
tokenized_train = []
for doc_sentences in train['text'].apply(tokenize_text):
    tokenized_train.extend(doc_sentences)

tokenized_test = []
for doc_sentences in test['text'].apply(tokenize_text):
    tokenized_test.extend(doc_sentences)

In [None]:
# Train Word2Vec on the tokenized training sentences only.
word2vec = Word2Vec(
    sentences=tokenized_train,
    vector_size=100,
    window=7,
    min_count=3,
    workers=4,
)

# Word ids follow the model's index_to_key order and start at 1,
# which leaves id 0 unassigned.
word_index = {
    word: idx for idx, word in enumerate(word2vec.wv.index_to_key, start=1)
}

# word_index

In [None]:
def text_to_seq(text, word_index):
    """Convert ``text`` into a list of vocabulary ids.

    The text is lowercased, tokenized with ``word_tokenize``, and each token
    found in ``word_index`` is replaced by its id; all other tokens are dropped.

    NOTE(review): this lowercases before tokenizing, whereas ``tokenize_text``
    lowercases after sentence/word tokenization -- confirm the two are meant
    to differ.
    """
    ids = []
    for token in word_tokenize(text.lower()):
        if token in word_index:
            ids.append(word_index[token])
    return ids

# Encode every document as a sequence of vocabulary ids.
X_train_seq = [text_to_seq(doc, word_index) for doc in train['text']]
X_val_seq = [text_to_seq(doc, word_index) for doc in test['text']]

# Length of the longest encoded document across both splits.
max_len = max(map(len, X_train_seq + X_val_seq))

# Pad both splits at the end of each sequence up to the shared max_len.
X_train_padded, X_val_padded = (
    pad_sequences(seqs, maxlen=max_len, padding='post')
    for seqs in (X_train_seq, X_val_seq)
)

# Fit the label encoding on the training labels, then reuse it for validation.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train['label'])
y_val = label_encoder.transform(test['label'])


In [None]:
# Embedding matrix: row i holds the Word2Vec vector of the word whose id is i.
# Row 0 is never written and stays all-zero, because word ids start at 1.
# The width is read from the trained model instead of repeating the literal
# 100, so changing `vector_size` in the Word2Vec cell cannot cause a shape
# mismatch here.
embedding_dim = word2vec.wv.vector_size
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

for word, i in word_index.items():
    if word in word2vec.wv:
        embedding_matrix[i] = word2vec.wv[word]

embedding_matrix


In [None]:
from keras import Sequential
from keras.api.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.api.callbacks import EarlyStopping
from keras.api.optimizers import Adam

# Size the network from the prepared data rather than hard-coded literals, so
# the layers stay in sync with the vocabulary, embedding width and label set.
vocab_size, embedding_dim = embedding_matrix.shape
num_classes = len(label_encoder.classes_)

model = Sequential([
    # Initialised from the Word2Vec vectors and fine-tuned during training.
    # NOTE(review): inputs are post-padded with 0 and mask_zero is left at its
    # default, so the LSTMs also read the padding steps -- consider masking.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=True),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(64)),
    Dense(32, activation='relu'),
    # One output unit per class known to the label encoder.
    Dense(num_classes, activation='softmax')
])
adam = Adam(learning_rate=0.002)
# Sparse loss: the targets are integer class ids produced by LabelEncoder.
model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# Stop once val_loss has not improved for 2 epochs and roll back to the
# best weights seen so far.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

model.fit(
    X_train_padded,
    y_train,
    validation_data=(X_val_padded, y_val),
    epochs=10,
    batch_size=128,
    callbacks=[early_stopping],
)

In [None]:
# Evaluate the trained model on the validation arrays; as the last expression
# of the cell, the returned value is displayed (presumably loss followed by the
# accuracy metric configured in compile -- confirm against the Keras version).
model.evaluate(X_val_padded, y_val)

In [None]:
import os
import pickle

# Create the output directory up front: neither open() nor writing the model
# file creates a missing parent directory, and failing here would discard the
# trained model.
os.makedirs("models", exist_ok=True)

model.save("models/lstm-model.keras")

# Persist the preprocessing state needed to encode new text the same way:
# the label encoder, the word -> id mapping and the padding length.
for artifact_path, artifact in (
    ("models/label_encoder.pkl", label_encoder),
    ("models/word_index.pkl", word_index),
    ("models/max_len.pkl", max_len),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

In [None]:
# model.predict(X_val_padded)
# Debug output: print the padded validation sequences.
print(X_val_padded)