In [2]:
import pandas as pd
import numpy as np
import json
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt

# --- 1. DATA LOADING & PREPROCESSING ---
def simple_clean(text):
    """Basic cleaning to ensure the model has 'clean_text' to work with."""
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

# Load datasets
train_df = pd.read_csv('./datasets/train.csv')
valid_df = pd.read_csv('./datasets/valid.csv')

# FIX: Create the 'clean_text' column that was missing
print("Preprocessing text data...")
train_df['clean_text'] = train_df['text'].apply(simple_clean)
valid_df['clean_text'] = valid_df['text'].apply(simple_clean)

MAX_WORDS = 10000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(train_df['clean_text'])

X_train = pad_sequences(tokenizer.texts_to_sequences(train_df['clean_text']), maxlen=MAX_LEN)
X_valid = pad_sequences(tokenizer.texts_to_sequences(valid_df['clean_text']), maxlen=MAX_LEN)

y_train = train_df['label'].values
y_valid = valid_df['label'].values

# --- 2. DEFINE THE HYPERMODEL ---
def build_model(hp):
    model = Sequential()
    
    # Embedding: Range from 64 to 256
    model.add(Embedding(
        input_dim=MAX_WORDS, 
        output_dim=hp.Int('embed_dim', 64, 256, step=64), 
        input_length=MAX_LEN
    ))
    
    # First Bi-LSTM Layer
    model.add(Bidirectional(LSTM(
        units=hp.Int('lstm_units_1', 32, 128, step=32),
        return_sequences=True
    )))
    model.add(Dropout(hp.Float('dropout_1', 0.2, 0.5, step=0.1)))
    
    # Second Bi-LSTM Layer
    model.add(Bidirectional(LSTM(
        units=hp.Int('lstm_units_2', 16, 64, step=16)
    )))
    model.add(Dropout(hp.Float('dropout_2', 0.2, 0.5, step=0.1)))
    
    # Dense Hidden Layer
    model.add(Dense(
        units=hp.Int('dense_units', 8, 32, step=8),
        activation='relu'
    ))
    
    model.add(Dense(1, activation='sigmoid'))
    
    # Optimizer Learning Rate
    lr = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

# --- 3. RUN THE TUNER ---
tuner = kt.BayesianOptimization(
    build_model,
    objective='val_accuracy',
    max_trials=5, # Set higher (e.g., 20) for better results if you have time
    directory='tuning_logs',
    project_name='bilstm_search'
)

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

print("\nStarting search for best hyperparameters...")
tuner.search(
    X_train, y_train,
    epochs=5,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stop]
)

# --- 4. EXPORT RESULTS ---
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print("\n" + "="*30)
print("BEST HYPERPARAMETERS FOUND:")
for param in ['embed_dim', 'lstm_units_1', 'lstm_units_2', 'dropout_1', 'learning_rate']:
    print(f"{param}: {best_hps.get(param)}")
print("="*30)

# Save best parameters to a file
with open('best_config.json', 'w') as f:
    json.dump(best_hps.values, f)

# Rebuild and save the final best model
final_model = tuner.hypermodel.build(best_hps)
final_model.save('optimized_bilstm_model.h5')
print("\nBest configuration and model saved to disk.")



Trial 5 Complete [00h 03m 57s]
val_accuracy: 0.8561452627182007

Best val_accuracy So Far: 0.8561452627182007
Total elapsed time: 00h 30m 15s

BEST HYPERPARAMETERS FOUND:
embed_dim: 64
lstm_units_1: 32
lstm_units_2: 64
dropout_1: 0.30000000000000004
learning_rate: 0.0001

Best configuration and model saved to disk.
