In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import classification_report
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import os

In [2]:
# Step 1: Load Data
train_df = pd.read_csv("../data/imdb_train.csv")
test_df = pd.read_csv("../data/imdb_test.csv")

In [3]:
X_train = train_df['text'].astype(str)
y_train = train_df['label']
X_test = test_df['text'].astype(str)
y_test = test_df['label']

In [4]:
# Step 2: Tokenize and Pad Sequences
vocab_size = 10000
max_length = 100


In [5]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

In [6]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')


In [7]:
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')


In [8]:
# Step 3: Build Model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [9]:
# Step 4: Train Model
early_stop = EarlyStopping(monitor='val_loss', patience=2)
model.fit(X_train_pad, y_train, validation_split=0.2, epochs=5, batch_size=128, callbacks=[early_stop])


Epoch 1/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 222ms/step - accuracy: 0.6662 - loss: 0.6117 - val_accuracy: 0.7548 - val_loss: 0.5041
Epoch 2/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 132ms/step - accuracy: 0.8702 - loss: 0.3288 - val_accuracy: 0.6720 - val_loss: 0.8031
Epoch 3/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 101ms/step - accuracy: 0.9141 - loss: 0.2522 - val_accuracy: 0.6924 - val_loss: 0.7401


<keras.src.callbacks.history.History at 0x197cd6618e0>

In [16]:
# Step 5: Evaluate
y_pred_prob = model.predict(X_test_pad)
y_pred = (y_pred_prob > 0.5).astype(int)

print(classification_report(y_test, y_pred))

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 20ms/step
              precision    recall  f1-score   support

           0       0.76      0.92      0.83     12500
           1       0.89      0.70      0.79     12500

    accuracy                           0.81     25000
   macro avg       0.82      0.81      0.81     25000
weighted avg       0.82      0.81      0.81     25000



In [11]:
# Step 6: Save Model and Tokenizer
model.save("../dl_models/lstm_model.h5")



In [12]:
import pickle
with open("../dl_models/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)