## **Hyperparameter Tuning & Cross Validation** (2nd attempt)

In [None]:
import emoji
import matplotlib.pyplot as plt
import numpy as np
import nltk
import optuna
import pandas as pd
import re
import seaborn as sns
import tensorflow as tf
import warnings

from sklearn.model_selection import StratifiedKFold, train_test_split

from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Seed NumPy and TensorFlow once, up front, so weight initialisation and data
# shuffling start from the same state on every Restart & Run All.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# `stopwords.words(...)` and the WordNet lemmatizer raise LookupError when the
# NLTK corpora have never been downloaded on this machine, so fetch them before
# first use.
for _corpus in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_corpus, quiet=True)

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# NOTE(review): this silences *every* warning for the rest of the notebook,
# including deprecation notices from TensorFlow/Keras.
warnings.filterwarnings('ignore')

print(f"TensorFlow version: {tf.__version__}")
print(f"Optuna version: {optuna.__version__}")

Define model creation function

In [None]:
def create_model_optuna(trial, input_dim=10000):
    """Sample one hyperparameter set from ``trial`` and build the matching LSTM classifier.

    Args:
        trial: Object exposing ``suggest_int``, ``suggest_float`` and
            ``suggest_categorical`` (an Optuna trial).
        input_dim: Vocabulary size handed to the ``Embedding`` layer.

    Returns:
        tuple: ``(model, batch_size)`` - the compiled Keras model and the sampled
        batch size. The batch size is returned separately because it is an argument
        of ``fit`` rather than part of the network.
    """
    # The suggest_* calls run in the order written here (dict literals evaluate
    # their values top to bottom).
    hp = {
        'embedding_dim': trial.suggest_int('embedding_dim', 32, 256, step=32),
        'lstm_units': trial.suggest_int('lstm_units', 32, 256, step=32),
        'dropout_rate': trial.suggest_float('dropout_rate', 0.1, 0.8),
        'dense_units': trial.suggest_int('dense_units', 16, 128, step=16),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [16, 32, 64, 128]),
    }

    # Embedding -> LSTM (last state only) -> Dropout -> Dense(relu) -> sigmoid output.
    network = Sequential()
    network.add(Embedding(input_dim=input_dim, output_dim=hp['embedding_dim']))
    network.add(LSTM(hp['lstm_units'], return_sequences=False))
    network.add(Dropout(hp['dropout_rate']))
    network.add(Dense(hp['dense_units'], activation='relu'))
    network.add(Dense(1, activation='sigmoid'))

    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=hp['learning_rate']),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return network, hp['batch_size']

print("Model creation function defined!")

Define objective function for optimization

In [None]:
def objective(trial, n_splits=5, epochs=12):
    """Optuna objective: mean of the best per-fold validation accuracy.

    Runs stratified K-fold cross-validation over the notebook-level ``X_train`` /
    ``y_train`` arrays, training a freshly built model on every fold.

    Args:
        trial: The Optuna trial supplying the hyperparameters.
        n_splits: Number of stratified folds (default 5, as before).
        epochs: Maximum epochs per fold (default 12, as before).

    Returns:
        float: Mean over folds of the highest ``val_accuracy`` seen in that fold.

    Raises:
        optuna.TrialPruned: If the study's pruner decides, from the running mean
            reported after a fold, that the trial should be abandoned.
    """
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_scores = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train, y_train)):
        X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
        y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

        # Build a fresh, untrained model at the top of every fold instead of
        # rebuilding at the bottom behind a hard-coded `fold < 4` check (which
        # silently depended on n_splits == 5). Optuna returns the already-sampled
        # value when the same parameter name is suggested again within a trial,
        # so every fold uses identical hyperparameters.
        model, batch_size = create_model_optuna(trial)

        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=5,
            restore_best_weights=True
        )

        history = model.fit(
            X_train_fold, y_train_fold,
            validation_data=(X_val_fold, y_val_fold),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=[early_stopping],
            verbose=0
        )

        cv_scores.append(max(history.history['val_accuracy']))

        # Release the fold's graph/weights before the next model is built.
        del model
        tf.keras.backend.clear_session()

        # Report the running mean so the study's pruner can act on it. Skip the
        # check after the last fold: the trial is complete by then, so pruning it
        # would only throw away a finished result. Pruning takes effect only when
        # the pruner's warm-up is shorter than the number of folds.
        trial.report(float(np.mean(cv_scores)), step=fold)
        if fold < n_splits - 1 and trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(cv_scores))

print("Objective function defined!")

Load and prepare your data

In [None]:
dataset_path = '../data/twt.csv'
column_names = ['sentiment', 'id', 'date', 'flag', 'user', 'text']

# Read the header-less CSV, keep only the label and the tweet text, and drop
# rows with missing values.
df = (
    pd.read_csv(dataset_path, encoding='latin', delimiter=',', names=column_names)
    .drop(columns=['id', 'date', 'flag', 'user'])
    .dropna()
)

negative_contractions = {
    "don't", "won't", "can't", "shouldn't", "wouldn't", "couldn't",
    "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't",
    "hadn't", "doesn't", "didn't", "shan't", "mustn't",
    "mightn't", "needn't"
}

# Words that `clean_twts` keeps even when they appear in the stop-word list.
sentiment_critical = set().union(
    {'not', 'no', 'never', 'nothing', 'nobody', 'none', 'nowhere', 'neither'},
    {'very', 'really', 'quite', 'rather', 'extremely', 'incredibly', 'absolutely'},
    {'but', 'however', 'although', 'though', 'yet', 'except'},
    {'too', 'so', 'such', 'more', 'most', 'less', 'least'},
    {'only', 'just', 'still', 'even', 'again'},
    negative_contractions,
)

def clean_twts(twt):
    """Normalise one raw tweet into a space-joined string of lemmatised tokens.

    Steps, in order: lower-case; strip URLs, @mentions and '#' symbols; strip
    emojis; drop every character that is not a letter, whitespace or apostrophe;
    collapse whitespace; drop one-character tokens and stop words (unless the word
    is in ``sentiment_critical``); lemmatise what is left.

    Args:
        twt: The tweet text (must support ``.lower()``).

    Returns:
        str: The cleaned tokens joined by single spaces (may be empty).
    """
    text = twt.lower()

    # URLs, mentions, then the bare hashtag symbol (the tag word itself is kept).
    for pattern in (r"http\S+|www\S+|https\S+", r"@\w+", r"#"):
        text = re.sub(pattern, '', text)

    text = emoji.replace_emoji(text, replace='')
    # Apostrophes survive this step, so contractions such as "don't" stay intact.
    text = re.sub(r"[^a-zA-Z\s']", '', text)
    text = re.sub(r"\s+", ' ', text).strip()

    kept = []
    for word in text.split():
        if len(word) <= 1:
            continue
        if word in stop_words and word not in sentiment_critical:
            continue
        kept.append(lemmatizer.lemmatize(word))

    return ' '.join(kept)

cleaned_twts = df['text'].apply(clean_twts)

# NOTE(review): the tokenizer is fit on every tweet, i.e. before the train/test
# split made further down, so its vocabulary also reflects the future test rows.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(cleaned_twts)

# Integer-encode each tweet, then pad/truncate at the end to exactly 20 tokens.
sequences = tokenizer.texts_to_sequences(cleaned_twts)
padded_sequences = pad_sequences(sequences, maxlen=20, padding='post', truncating='post')

# Relabel 4 -> 1 and 0 -> 0; any other label value becomes NaN.
df = df.assign(
    cleaned_text=cleaned_twts,
    padded_text=list(padded_sequences),
    sentiment=df['sentiment'].map({0: 0, 4: 1}),
)

df.head()

In [None]:
X, y = padded_sequences, df['sentiment']

print(f"Data shape: X={X.shape}, y={y.shape}")

# 80/20 split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The cross-validation loop indexes these with integer position arrays, so the
# training labels must be a plain ndarray rather than a pandas Series.
X_train, y_train = np.array(X_train), np.array(y_train)

print(f"Training data: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Test data: X_test={X_test.shape}, y_test={y_test.shape}")
print("Data preparation complete!")

Run Optuna optimization

In [None]:
def run_optuna_optimization(n_trials, timeout=3600):
    """Create an accuracy-maximising Optuna study and run ``objective`` on it.

    Args:
        n_trials: Maximum number of trials to run.
        timeout: Passed through to ``study.optimize`` as its time budget.

    Returns:
        The ``optuna`` study after optimisation has finished.
    """
    sampler = optuna.samplers.TPESampler(seed=42)
    # A pruner only has an effect when the objective reports intermediate values
    # (trial.report) at steps beyond `n_warmup_steps`.
    pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=10)

    study = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)

    print(f"Starting Optuna optimization with {n_trials} trials...")
    print("This may take a while...")

    optimize_kwargs = dict(n_trials=n_trials, timeout=timeout, show_progress_bar=True)
    study.optimize(objective, **optimize_kwargs)

    return study

study = run_optuna_optimization(n_trials=8)

# Banner followed by the winning trial's number, score and hyperparameters.
print("\n" + "="*50, "OPTIMIZATION COMPLETE!", "="*50, sep="\n")
print(f"Best trial: {study.best_trial.number}")
print(f"Best accuracy: {study.best_value:.4f}")
print("Best parameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

Visualize optimization results

In [None]:
def plot_optimization_results(study):
    """Draw a 2x2 summary figure for a finished Optuna study.

    Panels: optimisation history (objective value per trial number), parameter
    importance, best hyperparameter values, and a histogram of objective values.

    Args:
        study: The Optuna study to visualise. It must contain at least one trial
            with a value, because ``study.best_params`` / ``study.best_value``
            are read.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    trials = study.trials
    # Trials without a value are left out. Plot against the real trial number so
    # the x axis stays correct when some trials are skipped.
    scored = [trial for trial in trials if trial.value is not None]
    trial_numbers = [trial.number for trial in scored]
    values = [trial.value for trial in scored]

    axes[0, 0].plot(trial_numbers, values, marker='o')
    axes[0, 0].set_title('Optimization History')
    axes[0, 0].set_xlabel('Trial')
    axes[0, 0].set_ylabel('Accuracy')
    axes[0, 0].grid(True)

    # Parameter importance. Previously this panel stayed completely blank when
    # there were 10 or fewer trials, because the explanatory text was only drawn
    # from the `except` branch; now the panel always shows bars or a message.
    importance_ax = axes[0, 1]
    importance_ax.set_title('Parameter Importance')
    notice = 'Not enough trials\nfor importance analysis'
    if len(trials) > 10:
        try:
            importance = optuna.importance.get_param_importances(study)
            importance_ax.barh(list(importance.keys()), list(importance.values()))
            importance_ax.set_xlabel('Importance')
            notice = None
        except Exception as exc:  # not a bare except: let KeyboardInterrupt through
            notice = f'Importance analysis failed:\n{exc}'
    if notice is not None:
        importance_ax.text(0.5, 0.5, notice,
                           ha='center', va='center', transform=importance_ax.transAxes)

    best_params = study.best_params
    param_names = list(best_params.keys())
    param_values = list(best_params.values())

    axes[1, 0].barh(param_names, param_values)
    axes[1, 0].set_title('Best Parameters')
    axes[1, 0].set_xlabel('Value')
    # The values span several orders of magnitude (learning rate vs. layer sizes),
    # so a linear axis hides the small ones. Use a log axis when every value is a
    # positive number.
    if param_values and all(isinstance(v, (int, float)) and v > 0 for v in param_values):
        axes[1, 0].set_xscale('log')
        axes[1, 0].set_xlabel('Value (log scale)')

    axes[1, 1].hist(values, bins=20, alpha=0.7)
    axes[1, 1].axvline(study.best_value, color='red', linestyle='--',
                      label=f'Best: {study.best_value:.4f}')
    axes[1, 1].set_title('Distribution of Accuracy Scores')
    axes[1, 1].set_xlabel('Accuracy')
    axes[1, 1].set_ylabel('Frequency')
    axes[1, 1].legend()
    axes[1, 1].grid(True)

    plt.tight_layout()
    plt.show()

plot_optimization_results(study)

# One row per trial; the 'value' column holds each trial's objective value.
trials_df = study.trials_dataframe()
trial_scores = trials_df['value']

print("\nSummary Statistics:")
print(f"Total trials: {len(study.trials)}")
print(f"Best accuracy: {study.best_value:.4f}")
print(f"Mean accuracy: {trial_scores.mean():.4f}")
print(f"Std accuracy: {trial_scores.std():.4f}")
print(f"Worst accuracy: {trial_scores.min():.4f}")

Train final model with best parameters

In [None]:
def train_final_model(study, X_train, X_test, y_train, y_test, epochs=12, val_fraction=0.1):
    """Train the final LSTM using the best hyperparameters found by ``study``.

    The callbacks monitor a validation split carved out of the *training* data.
    The test set is deliberately not shown to the model or to the callbacks:
    using it for early stopping / learning-rate scheduling (as was done before)
    tunes the training run to the test data and inflates the test accuracy
    reported afterwards.

    Args:
        study: Finished Optuna study; ``study.best_params`` is read.
        X_train: Training features.
        X_test: Unused; kept so existing calls keep working. Evaluate on it
            after this function returns.
        y_train: Training labels.
        y_test: Unused; see ``X_test``.
        epochs: Maximum number of training epochs (default 12, as before).
        val_fraction: Share of the training data held out for the callbacks.

    Returns:
        tuple: ``(model, history)`` - the trained Keras model and the ``History``
        object returned by ``fit``. Its ``val_*`` entries refer to the internal
        validation split.
    """
    best_params = study.best_params
    print("Training final model with best parameters:")
    for key, value in best_params.items():
        print(f"  {key}: {value}")

    # Rebuild through the same factory that was used during tuning, so the final
    # architecture cannot drift from the tuned one. FixedTrial replays the given
    # parameter values for every suggest_* call.
    model, batch_size = create_model_optuna(optuna.trial.FixedTrial(best_params))

    # Validation data for the callbacks comes from the training set only.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train,
        test_size=val_fraction,
        random_state=42,
        stratify=y_train
    )

    # The patience values have to be smaller than `epochs` to have any effect.
    # The previous early-stopping patience of 15 could never trigger within 12
    # epochs. LR reduction uses a shorter patience so it gets a chance to act
    # before early stopping does.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=5,
        restore_best_weights=True,
        verbose=1
    )

    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_accuracy',
        factor=0.5,
        patience=2,
        min_lr=1e-7,
        verbose=1
    )

    print("\nTraining final model...")
    history = model.fit(
        X_fit, y_fit,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )

    return model, history

final_model, final_history = train_final_model(
    study, X_train, X_test, y_train, y_test
)

# `evaluate` returns the loss followed by the compiled metrics (accuracy here).
test_metrics = final_model.evaluate(X_test, y_test, verbose=0)
test_loss, test_acc = test_metrics
print(f"\nFinal test accuracy: {test_acc:.4f}")
print(f"Final test loss: {test_loss:.4f}")

Plot training history

In [None]:
def plot_training_history(history):
    """Plot training vs. validation accuracy and loss, then print last-epoch values.

    Args:
        history: Object whose ``.history`` dict holds the per-epoch lists
            ``accuracy``, ``val_accuracy``, ``loss`` and ``val_loss``.
    """
    curves = history.history

    # (train key, val key, train label, val label, panel title, y-axis label)
    panel_specs = (
        ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
         'Model Accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
         'Model Loss', 'Loss'),
    )

    fig, panels = plt.subplots(1, 2, figsize=(15, 5))
    for panel, spec in zip(panels, panel_specs):
        train_key, val_key, train_label, val_label, title, y_label = spec
        panel.plot(curves[train_key], label=train_label)
        panel.plot(curves[val_key], label=val_label)
        panel.set_title(title)
        panel.set_xlabel('Epoch')
        panel.set_ylabel(y_label)
        panel.legend()
        panel.grid(True)

    plt.tight_layout()
    plt.show()

    # Values from the last recorded epoch (not necessarily the best epoch).
    print(f"Final Training Accuracy: {curves['accuracy'][-1]:.4f}")
    print(f"Final Validation Accuracy: {curves['val_accuracy'][-1]:.4f}")
    print(f"Final Training Loss: {curves['loss'][-1]:.4f}")
    print(f"Final Validation Loss: {curves['val_loss'][-1]:.4f}")

plot_training_history(final_history)

Save results and model

In [None]:
# import joblib
# import json
# from datetime import datetime

# joblib.dump(study, 'optuna_study.pkl')
# print("Optuna study saved as 'optuna_study.pkl'")

# best_params = study.best_params
# best_params['best_accuracy'] = study.best_value
# best_params['optimization_date'] = datetime.now().isoformat()

# with open('best_hyperparameters.json', 'w') as f:
#     json.dump(best_params, f, indent=2)
# print("Best parameters saved as 'best_hyperparameters.json'")

# final_model.save('optimized_lstm_model.h5')
# print("Final model saved as 'optimized_lstm_model.h5'")

# trials_df = study.trials_dataframe()
# trials_df.to_csv('optuna_trials.csv', index=False)
# print("Trials data saved as 'optuna_trials.csv'")

# print("\n" + "="*60)
# print("HYPERPARAMETER OPTIMIZATION COMPLETE!")
# print("="*60)
# print(f"Best hyperparameters found:")
# for key, value in study.best_params.items():
#     print(f"  {key}: {value}")
# print(f"\nBest cross-validation accuracy: {study.best_value:.4f}")
# print(f"Final test accuracy: {test_acc:.4f}")
# print(f"Total trials run: {len(study.trials)}")
# print("\nFiles saved:")
# print("  - optuna_study.pkl (complete study object)")
# print("  - best_hyperparameters.json (best parameters)")
# print("  - optimized_lstm_model.h5 (trained model)")
# print("  - optuna_trials.csv (all trial results)")
# print("="*60)