In [1]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, accuracy_score

# --- 1. DATA LOADING & PREP ---
# Read the three pre-split CSV files; the path order matches the names on the left.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './datasets/train.csv',
        './datasets/valid.csv',
        './datasets/test.csv',
    )
)

def simple_clean(text):
    """Normalise one raw text value for tokenisation.

    The value is converted to ``str``, lower-cased, and every character that
    is not ``a-z``, ``0-9`` or whitespace is removed (characters are deleted,
    not replaced by a space, so ``"don't"`` becomes ``"dont"``).

    Args:
        text: Raw cell value. Non-string values are converted with ``str()``.
            Missing scalars (``None``, ``NaN``, ``pd.NA``) are accepted.

    Returns:
        str: The cleaned text, or ``''`` when ``text`` is a missing value.
    """
    # Without this guard str() would turn a missing cell into the literal
    # token 'nan' / 'none', which would then be learned as a real word.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''
    text = str(text).lower()
    return re.sub(r'[^a-z0-9\s]', '', text)

# Attach a normalised 'clean_text' column to every split (each frame is modified in place).
for df in (train_df, valid_df, test_df):
    df['clean_text'] = df['text'].map(simple_clean)

# MAX_WORDS is passed to the tokenizer as num_words and to the Embedding as input_dim;
# MAX_LEN is the fixed length every sequence is padded/truncated to.
MAX_WORDS = 10000
MAX_LEN = 100

# The vocabulary is fitted on the training split only.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(train_df['clean_text'])

def encode(df):
    """Turn one split into model-ready arrays.

    Args:
        df: DataFrame with a 'clean_text' column and a 'label' column.

    Returns:
        tuple: ``(padded, labels)`` where ``padded`` is the result of
        ``pad_sequences`` with ``maxlen=MAX_LEN`` applied to the tokenised
        'clean_text' column, and ``labels`` is ``df['label'].values``.
    """
    token_ids = tokenizer.texts_to_sequences(df['clean_text'])
    padded = pad_sequences(token_ids, maxlen=MAX_LEN)
    labels = df['label'].values
    return padded, labels

# Encode the splits in train -> valid -> test order and unpack the (X, y) pairs.
(X_train, y_train), (X_valid, y_valid), (X_test, y_test) = (
    encode(split_df) for split_df in (train_df, valid_df, test_df)
)

# --- 2. BUILD MODEL WITH BEST HYPERPARAMETERS ---
# Hyperparameters taken from the earlier search:
# embed_dim: 64, units_1: 32, units_2: 64, dropout: 0.3, lr: 0.0001

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation, dropout masks and batch shuffling are repeatable when the
# notebook is re-run from a fresh kernel.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

model = Sequential([
    # Declare the input shape with an explicit Input layer: the Embedding
    # `input_length` argument is deprecated in Keras 3. Keeping an Input layer
    # means the model is still built (and summarisable) before fit().
    tf.keras.Input(shape=(MAX_LEN,), dtype='int32'),
    Embedding(input_dim=MAX_WORDS, output_dim=64),
    # return_sequences=True so the second recurrent layer receives the full sequence.
    Bidirectional(LSTM(32, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(64)),
    Dense(16, activation='relu'),  # Standard hidden layer
    Dense(1, activation='sigmoid')  # Single probability output for binary_crossentropy
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# --- 3. FINAL TRAINING ---
# Watch val_loss with a patience of 3 epochs and restore the best weights on stop.
early_stop = EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=3,
)

print("Training the final optimized model...")
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stop],
    batch_size=64,
    epochs=15,  # Slightly higher epoch cap because the learning rate is low (0.0001)
)

# --- 4. EVALUATE ON TEST SET ---
print("\n--- FINAL EVALUATION ON TEST SET ---")

# Predicted probabilities above 0.5 become class 1, everything else class 0.
test_probs = model.predict(X_test)
test_preds = (test_probs > 0.5).astype(int)

test_accuracy = accuracy_score(y_test, test_preds)
print(f"Test Accuracy: {test_accuracy:.4f}")

print("\nDetailed Classification Report:")
report_text = classification_report(y_test, test_preds)
print(report_text)

# Persist the trained model in the recommended .keras format.
model.save('final_optimized_bilstm.keras')
print("Model saved as 'final_optimized_bilstm.keras'")

  from pandas.core import (


Training the final optimized model...
Epoch 1/15




[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 95ms/step - accuracy: 0.6351 - loss: 0.6123 - val_accuracy: 0.7961 - val_loss: 0.4359
Epoch 2/15
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 109ms/step - accuracy: 0.8484 - loss: 0.3531 - val_accuracy: 0.8073 - val_loss: 0.4047
Epoch 3/15
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 117ms/step - accuracy: 0.8949 - loss: 0.2601 - val_accuracy: 0.8282 - val_loss: 0.3878
Epoch 4/15
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 110ms/step - accuracy: 0.9214 - loss: 0.2070 - val_accuracy: 0.8324 - val_loss: 0.4493
Epoch 5/15
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 108ms/step - accuracy: 0.9395 - loss: 0.1700 - val_accuracy: 0.8352 - val_loss: 0.4544
Epoch 6/15
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 110ms/step - accuracy: 0.9530 - loss: 0.1387 - val_accuracy: 0.8352 - val_loss: 0.5117

--- FINAL EVALUATION 