# LSTM Model with GloVe Embeddings
## Drug Reviews Classification - Gershom

**Model**: LSTM (Long Short-Term Memory) / BiLSTM
**Embedding**: GloVe (200-dim, medium config)
**Task**: Drug review rating prediction (binary by default: rating >= 6 is positive; multiclass and regression modes are also supported)

---

### Notebook Structure:
1. Setup and Data Loading
2. GloVe Embedding Training
3. LSTM Model Architecture
4. Training & Evaluation
5. Results Analysis

## 1. Setup and Imports

In [None]:
# Standard libraries
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')

# Deep learning libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    LSTM, Bidirectional, Dense, Dropout, Embedding, 
    Input, GlobalMaxPooling1D, GlobalAveragePooling1D
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

# Shared modules
from src.data_utils import DataLoader, create_dataset_from_dataframe
from src.preprocessing import get_preprocessor, TextPreprocessor
from src.eda import EDAAnalyzer
from embeddings.glove_embedding import GloVeEmbedding, get_glove_embedding

# Confirm that every import resolved, then report the TensorFlow build and
# the GPUs TensorFlow can see.
print("✓ All imports successful!")
tf_version = tf.__version__
print(f"TensorFlow version: {tf_version}")
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"GPU Available: {gpu_devices}")

## 2. Configuration

In [None]:
# Paths to the pre-split CSV files (relative to the notebook directory)
TRAIN_PATH = '../data/drug_review_train.csv'
VAL_PATH = '../data/drug_review_validation.csv'
TEST_PATH = '../data/drug_review_test.csv'

# Data columns: free-text review and its numeric rating
TEXT_COLUMN = 'review'
LABEL_COLUMN = 'rating'

# Preprocessing
PREPROCESSING_CONFIG = 'moderate'  # minimal, moderate, or aggressive

# Embedding configuration
EMBEDDING_TYPE = 'glove'
EMBEDDING_CONFIG = 'medium'  # 200-dim, 15 iterations
# Must equal the vector size produced by EMBEDDING_CONFIG; the embedding
# matrix and the Embedding layer are both sized from this value.
EMBEDDING_DIM = 200

# Model architecture
USE_BIDIRECTIONAL = True  # Set to True for BiLSTM, False for LSTM
LSTM_UNITS = 128          # units of the first recurrent layer (second uses half)
DROPOUT_RATE = 0.3
RECURRENT_DROPOUT = 0.2
DENSE_UNITS = 64

# Training
BATCH_SIZE = 32
EPOCHS = 15               # upper bound; early stopping may end training sooner
LEARNING_RATE = 0.001
PATIENCE = 3              # early-stopping patience, in epochs

# Sequence parameters
MAX_SEQUENCE_LENGTH = 200  # Maximum number of words per review
VOCAB_SIZE = 10000  # Maximum vocabulary size

# Random seed for reproducibility. Python's built-in `random` module is
# seeded alongside NumPy and TensorFlow so that any code path relying on it
# is repeatable as well.
# NOTE(review): whether the project helpers use `random` is not visible
# here; seeding it is harmless either way.
import random

RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

print("Configuration set!")
print(f"Model type: {'BiLSTM' if USE_BIDIRECTIONAL else 'LSTM'}")
print(f"Embedding: {EMBEDDING_TYPE} ({EMBEDDING_CONFIG})")
print(f"LSTM units: {LSTM_UNITS}")

## 3. Load and Preprocess Data

In [None]:
# Read the three pre-split CSV files into DataFrames
print("Loading datasets...")
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

# Report split sizes, the available columns and the training label balance
for split_name, split_df in (
    ("Train", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    print(f"{split_name} size: {len(split_df)}")

column_names = list(train_df.columns)
print(f"\nColumns: {column_names}")
print("\nClass distribution (train):")
rating_counts = train_df[LABEL_COLUMN].value_counts().sort_index()
print(rating_counts)

In [None]:
# Build the text preprocessor for the configured cleaning level
preprocessor = get_preprocessor(PREPROCESSING_CONFIG)
print(f"Using '{PREPROCESSING_CONFIG}' preprocessing configuration")

print("\nPreprocessing and tokenizing...")

# Raw review strings per split; missing reviews become empty strings
train_texts, val_texts, test_texts = (
    frame[TEXT_COLUMN].fillna('').tolist()
    for frame in (train_df, val_df, test_df)
)

# GloVe is trained on token lists rather than raw strings, so tokenize
# every split with the same preprocessor
train_tokens, val_tokens, test_tokens = (
    preprocessor.get_tokens_batch(texts)
    for texts in (train_texts, val_texts, test_texts)
)

# Raw rating values, aligned row-for-row with the token lists
train_labels, val_labels, test_labels = (
    frame[LABEL_COLUMN].values for frame in (train_df, val_df, test_df)
)

print("✓ Tokenization complete!")
first_review_preview = train_tokens[0][:20]  # first 20 tokens only
print(f"Example tokenized review: {first_review_preview}...")

## 4. Train GloVe Embeddings

In [None]:
# Create a GloVe model for the configured preset and fit it on the
# training tokens only
print("Training GloVe embeddings...")
glove_model = get_glove_embedding(EMBEDDING_CONFIG)
glove_model.fit(train_tokens)

# Summarize what was learned
print("\n✓ GloVe training complete!")
glove_info = glove_model.get_model_info()
print(f"Model info: {glove_info}")
glove_vocab_size = glove_model.get_vocabulary_size()
print(f"Vocabulary size: {glove_vocab_size}")
glove_dim = glove_model.embedding_dim
print(f"Embedding dimension: {glove_dim}")

In [None]:
# Sanity-check the trained vectors: list the five nearest neighbours of a
# few domain words, reporting any word the model does not know
print("\nTesting GloVe embeddings...")
test_words = ['pain', 'drug', 'effective', 'side', 'effect']

for query in test_words:
    try:
        neighbours = glove_model.most_similar(query, topn=5)
        print(f"\n'{query}' most similar:")
        for neighbour, similarity in neighbours:
            print(f"  {neighbour}: {similarity:.3f}")
    except KeyError:
        print(f"\n'{query}' not in vocabulary")

## 5. Create Sequences and Embedding Matrix

In [None]:
# Build the word -> integer index mapping used to encode reviews.
# Layout: 0 = '<PAD>', 1..N = vocabulary words in the order returned by
# get_vocab(), N+1 = '<UNK>'. At most VOCAB_SIZE - 1 words are kept.
# NOTE(review): the slice keeps the *first* words of get_vocab(); this
# presumably relies on the vocabulary being ordered by usefulness
# (e.g. frequency) - confirm against GloVeEmbedding.
vocab = glove_model.get_vocab()

# Drop duplicates and any word that collides with a reserved token. Either
# would leave gaps in the index range, so that the largest index could
# exceed len(word_to_idx) - 1 and fall outside the embedding matrix.
reserved_tokens = ('<PAD>', '<UNK>')
kept_words = [
    word for word in dict.fromkeys(vocab) if word not in reserved_tokens
][:VOCAB_SIZE - 1]

word_to_idx = {word: idx for idx, word in enumerate(kept_words, start=1)}  # Reserve 0 for padding
word_to_idx['<PAD>'] = 0
word_to_idx['<UNK>'] = len(word_to_idx)

print(f"Vocabulary size (with special tokens): {len(word_to_idx)}")

# Convert tokens to sequences
def tokens_to_sequences(token_lists, word_to_idx):
    """Encode tokenized documents as lists of vocabulary indices.

    Every token is looked up in ``word_to_idx``; a token missing from the
    mapping is encoded with the index stored under ``'<UNK>'``.

    Args:
        token_lists: Iterable of token sequences, one per document.
        word_to_idx: Mapping from token to integer index. It must contain
            the ``'<UNK>'`` key whenever at least one token is encoded.

    Returns:
        A list holding one list of integer indices per input document,
        in the same order as ``token_lists``.
    """
    return [
        [word_to_idx.get(tok, word_to_idx['<UNK>']) for tok in doc]
        for doc in token_lists
    ]

# Encode every split with the shared vocabulary
train_sequences, val_sequences, test_sequences = (
    tokens_to_sequences(split_tokens, word_to_idx)
    for split_tokens in (train_tokens, val_tokens, test_tokens)
)

# Pad short reviews and truncate long ones at the end, so that every row
# holds exactly MAX_SEQUENCE_LENGTH indices
train_padded, val_padded, test_padded = (
    pad_sequences(
        seqs,
        maxlen=MAX_SEQUENCE_LENGTH,
        padding='post',
        truncating='post',
    )
    for seqs in (train_sequences, val_sequences, test_sequences)
)

print("\nSequence shapes:")
for split_label, padded in (
    ("Train", train_padded),
    ("Val", val_padded),
    ("Test", test_padded),
):
    print(f"{split_label}: {padded.shape}")

In [None]:
# Create the embedding matrix: row i holds the GloVe vector of the word
# whose index is i in word_to_idx.
print("\nCreating embedding matrix...")

# Fail early with a clear message when the configured dimension does not
# match the trained model; otherwise the row assignment below would fail
# with a less helpful NumPy shape error.
if glove_model.embedding_dim != EMBEDDING_DIM:
    raise ValueError(
        f"EMBEDDING_DIM ({EMBEDDING_DIM}) does not match the trained GloVe "
        f"dimension ({glove_model.embedding_dim}); update EMBEDDING_DIM or "
        f"EMBEDDING_CONFIG so that they agree."
    )

embedding_matrix = np.zeros((len(word_to_idx), EMBEDDING_DIM))
special_tokens = ('<PAD>', '<UNK>')

for word, idx in word_to_idx.items():
    if word in special_tokens:
        continue
    try:
        embedding_matrix[idx] = glove_model.get_word_vector(word)
    except KeyError:
        # Initialize with small random values
        embedding_matrix[idx] = np.random.randn(EMBEDDING_DIM) * 0.01

# The padding row stays all-zero. The '<UNK>' row gets the mean of all word
# vectors: the embedding layer is frozen, so a zero row would make every
# out-of-vocabulary token look exactly like padding to the network.
word_rows = [idx for word, idx in word_to_idx.items() if word not in special_tokens]
if word_rows:
    embedding_matrix[word_to_idx['<UNK>']] = embedding_matrix[word_rows].mean(axis=0)

print(f"✓ Embedding matrix shape: {embedding_matrix.shape}")
print(f"Non-zero rows: {np.count_nonzero(embedding_matrix.any(axis=1))}")

## 6. Process Labels for Classification

In [None]:
# Check the rating distribution to decide on classification strategy
print("Rating distribution:")
print(train_df[LABEL_COLUMN].value_counts().sort_index())

# For this example, let's use binary classification: ratings >= 6 = positive (1), < 6 = negative (0)
CLASSIFICATION_TYPE = 'binary'  # 'binary', 'multiclass', or 'regression'
THRESHOLD = 6  # For binary classification

# Reject typos up front: later cells treat every value other than 'binary'
# and 'multiclass' as regression, so a misspelling would silently change
# the whole experiment.
if CLASSIFICATION_TYPE not in ('binary', 'multiclass', 'regression'):
    raise ValueError(
        f"Unknown CLASSIFICATION_TYPE {CLASSIFICATION_TYPE!r}; "
        "expected 'binary', 'multiclass' or 'regression'."
    )

if CLASSIFICATION_TYPE == 'binary':
    # 1 when the rating reaches THRESHOLD, otherwise 0
    train_y = (train_labels >= THRESHOLD).astype(int)
    val_y = (val_labels >= THRESHOLD).astype(int)
    test_y = (test_labels >= THRESHOLD).astype(int)
    NUM_CLASSES = 2
    print(f"\nBinary classification: >= {THRESHOLD} = positive")
    print(f"Train class distribution: {np.bincount(train_y)}")
elif CLASSIFICATION_TYPE == 'multiclass':
    # Shift ratings down by one so that class ids start at 0
    train_y = train_labels.astype(int) - 1
    val_y = val_labels.astype(int) - 1
    test_y = test_labels.astype(int) - 1

    all_class_ids = np.concatenate([train_y, val_y, test_y])
    if all_class_ids.min() < 0:
        raise ValueError(
            "Multi-class mode expects ratings >= 1; found a rating that maps "
            "to a negative class id."
        )
    # Size the output layer from the largest class id across all splits,
    # not from the number of distinct training labels: a rating level that
    # is absent from the training split would otherwise yield labels
    # outside the softmax range.
    NUM_CLASSES = int(all_class_ids.max()) + 1
    print(f"\nMulti-class classification: {NUM_CLASSES} classes")
else:  # regression
    train_y = train_labels.astype(float)
    val_y = val_labels.astype(float)
    test_y = test_labels.astype(float)
    NUM_CLASSES = 1
    print("\nRegression: Predicting exact rating")

## 7. Build LSTM Model

In [None]:
def build_lstm_model(
    vocab_size,
    embedding_dim,
    embedding_matrix,
    max_length,
    lstm_units=128,
    dropout_rate=0.3,
    recurrent_dropout=0.2,
    dense_units=64,
    num_classes=2,
    use_bidirectional=True,
    classification_type='binary',
    learning_rate=None,
    mask_zero=True,
):
    """Build and compile a two-layer (Bi)LSTM on frozen pre-trained embeddings.

    Architecture: Input -> frozen Embedding -> two stacked LSTM layers
    (each wrapped in ``Bidirectional`` when requested; the second layer has
    ``lstm_units // 2`` units) -> Dense(relu) -> Dropout -> output head.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Size of each embedding vector.
        embedding_matrix: Array-like of shape ``(vocab_size, embedding_dim)``
            used as the (non-trainable) embedding weights.
        max_length: Length of every input index sequence.
        lstm_units: Units of the first recurrent layer.
        dropout_rate: Input dropout of the LSTM layers and rate of the
            Dropout layer after the dense layer.
        recurrent_dropout: Recurrent dropout of the LSTM layers.
        dense_units: Units of the hidden dense layer.
        num_classes: Width of the softmax head; only used when
            ``classification_type == 'multiclass'``.
        use_bidirectional: Wrap each LSTM in ``Bidirectional`` when true.
        classification_type: ``'binary'`` (sigmoid + binary cross-entropy),
            ``'multiclass'`` (softmax + sparse categorical cross-entropy);
            any other value builds a linear regression head trained with
            MSE and reporting MAE.
        learning_rate: Adam learning rate. ``None`` falls back to the
            notebook-level ``LEARNING_RATE`` constant.
        mask_zero: When true, index 0 is treated as padding and masked so
            the recurrent layers skip padded time steps. Without masking,
            post-padded inputs make the final LSTM state depend mostly on
            the trailing padding.

    Returns:
        The compiled Keras ``Sequential`` model.

    Raises:
        ValueError: If ``embedding_matrix`` does not have shape
            ``(vocab_size, embedding_dim)``.
    """
    # Validate before touching Keras so that a mismatch is reported clearly.
    matrix_shape = tuple(np.shape(embedding_matrix))
    if matrix_shape != (vocab_size, embedding_dim):
        raise ValueError(
            f"embedding_matrix has shape {matrix_shape}, expected "
            f"({vocab_size}, {embedding_dim})"
        )

    if learning_rate is None:
        learning_rate = LEARNING_RATE

    model = Sequential()

    # An explicit Input layer fixes the sequence length. It replaces the
    # Embedding ``input_length`` argument, which newer Keras releases
    # deprecate, and keeps the model built so ``summary()`` can report
    # parameter counts.
    model.add(Input(shape=(max_length,), name='token_ids'))

    # Embedding layer with pre-trained GloVe weights (kept frozen)
    model.add(Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        mask_zero=mask_zero,
        trainable=False,
        name='embedding'
    ))

    # Two stacked recurrent layers: the first returns the full sequence so
    # the second can consume it; the second returns only its final state.
    recurrent_stack = (
        (1, lstm_units, True),
        (2, lstm_units // 2, False),
    )
    for position, units, return_sequences in recurrent_stack:
        lstm_kwargs = dict(
            return_sequences=return_sequences,
            dropout=dropout_rate,
            recurrent_dropout=recurrent_dropout,
        )
        if use_bidirectional:
            model.add(Bidirectional(
                LSTM(units, **lstm_kwargs),
                name=f'bilstm_{position}'
            ))
        else:
            model.add(LSTM(units, name=f'lstm_{position}', **lstm_kwargs))

    # Dense layers
    model.add(Dense(dense_units, activation='relu', name='dense_1'))
    model.add(Dropout(dropout_rate))

    # Output layer, loss and metric depend on the task
    if classification_type == 'binary':
        model.add(Dense(1, activation='sigmoid', name='output'))
        loss = 'binary_crossentropy'
        metrics = ['accuracy']
    elif classification_type == 'multiclass':
        model.add(Dense(num_classes, activation='softmax', name='output'))
        loss = 'sparse_categorical_crossentropy'
        metrics = ['accuracy']
    else:  # regression
        model.add(Dense(1, activation='linear', name='output'))
        loss = 'mse'
        metrics = ['mae']

    # Compile model
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=loss,
        metrics=metrics
    )

    return model

# Instantiate the network from the notebook-level configuration. The four
# required arguments are passed positionally in signature order
# (vocab_size, embedding_dim, embedding_matrix, max_length).
print("Building LSTM model...")
model = build_lstm_model(
    len(word_to_idx),
    EMBEDDING_DIM,
    embedding_matrix,
    MAX_SEQUENCE_LENGTH,
    lstm_units=LSTM_UNITS,
    dropout_rate=DROPOUT_RATE,
    recurrent_dropout=RECURRENT_DROPOUT,
    dense_units=DENSE_UNITS,
    num_classes=NUM_CLASSES,
    use_bidirectional=USE_BIDIRECTIONAL,
    classification_type=CLASSIFICATION_TYPE,
)

print("\n✓ Model built successfully!")
model.summary()

## 8. Setup Callbacks

In [None]:
# Define callbacks; all three monitor the validation loss
callbacks = [
    # Stop once val_loss has not improved for PATIENCE epochs and roll the
    # model back to the best weights seen
    EarlyStopping(
        monitor='val_loss',
        patience=PATIENCE,
        restore_best_weights=True,
        verbose=1
    ),
    # Halve the learning rate after 2 epochs without improvement
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=2,
        min_lr=1e-7,
        verbose=1
    ),
    # Keep only the best model (by val_loss) on disk
    ModelCheckpoint(
        filepath='best_lstm_glove_model.h5',
        monitor='val_loss',
        save_best_only=True,
        verbose=1
    )
]

print("Callbacks configured:")
# Report the configured value rather than a hard-coded number so the
# message stays correct when PATIENCE is changed
print(f"- Early stopping (patience={PATIENCE})")
print("- Learning rate reduction")
print("- Model checkpointing")

## 9. Train the Model

In [None]:
# Announce the run, then fit on the training split while validating on the
# validation split after every epoch
model_label = 'BiLSTM' if USE_BIDIRECTIONAL else 'LSTM'
print("\nStarting training...")
print(f"Model: {model_label}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Max epochs: {EPOCHS}")
print("=" * 50)

fit_options = dict(
    validation_data=(val_padded, val_y),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1,
)
history = model.fit(train_padded, train_y, **fit_options)

print("\n✓ Training complete!")

## 10. Plot Training History

In [None]:
# Plot the training curves side by side: loss on the left, the task metric
# (accuracy for classification, MAE for regression) on the right
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

if CLASSIFICATION_TYPE in ['binary', 'multiclass']:
    metric_key, metric_label = 'accuracy', 'Accuracy'
else:  # regression
    metric_key, metric_label = 'mae', 'MAE'

# (axis, history key, display label) for each panel
panels = (
    (axes[0], 'loss', 'Loss'),
    (axes[1], metric_key, metric_label),
)

for ax, key, label in panels:
    ax.plot(history.history[key], label=f'Train {label}', linewidth=2)
    ax.plot(history.history[f'val_{key}'], label=f'Validation {label}', linewidth=2)
    ax.set_title(f'Model {label}', fontsize=14, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(label)
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 11. Evaluate on Test Set

In [None]:
# Score the trained model on the held-out test split; test_results holds
# the loss followed by the compiled metric
print("Evaluating on test set...")
test_results = model.evaluate(test_padded, test_y, verbose=1)

# Pick the labels that match the task, then print both numbers
is_classification = CLASSIFICATION_TYPE in ['binary', 'multiclass']
loss_label = "Test Loss" if is_classification else "Test Loss (MSE)"
metric_label = "Test Accuracy" if is_classification else "Test MAE"

print(f"\n{loss_label}: {test_results[0]:.4f}")
print(f"{metric_label}: {test_results[1]:.4f}")

In [None]:
# Run the model on the test split and convert raw outputs into final
# predictions for the active task
print("\nGenerating predictions...")
test_predictions = model.predict(test_padded, verbose=1)

if CLASSIFICATION_TYPE == 'multiclass':
    # Most probable class per row
    test_pred_classes = test_predictions.argmax(axis=1)
elif CLASSIFICATION_TYPE == 'binary':
    # Sigmoid output strictly above 0.5 counts as the positive class
    is_positive = test_predictions > 0.5
    test_pred_classes = is_positive.astype(int).flatten()
else:  # regression
    test_pred_classes = test_predictions.flatten()

print("✓ Predictions generated")

## 12. Detailed Classification Report

In [None]:
if CLASSIFICATION_TYPE not in ['binary', 'multiclass']:
    # Regression: error metrics plus a predicted-vs-true scatter plot
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

    mse = mean_squared_error(test_y, test_pred_classes)
    mae = mean_absolute_error(test_y, test_pred_classes)
    r2 = r2_score(test_y, test_pred_classes)

    print("\n" + "=" * 50)
    print("REGRESSION METRICS")
    print("=" * 50)
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"R² Score: {r2:.4f}")

    # Scatter plot with the ideal y = x line drawn across the rating range
    plt.figure(figsize=(8, 6))
    plt.scatter(test_y, test_pred_classes, alpha=0.5)
    rating_range = [test_y.min(), test_y.max()]
    plt.plot(rating_range, rating_range, 'r--', lw=2)
    plt.xlabel('True Rating')
    plt.ylabel('Predicted Rating')
    plt.title('Predicted vs True Ratings')
    plt.tight_layout()
    plt.show()
else:
    # Classification: textual report first
    print("\n" + "=" * 50)
    print("CLASSIFICATION REPORT")
    print("=" * 50)
    report_text = classification_report(test_y, test_pred_classes, digits=4)
    print(report_text)

    # Confusion matrix rendered as an annotated heatmap
    cm = confusion_matrix(test_y, test_pred_classes)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True)
    plt.title('Confusion Matrix', fontsize=14, fontweight='bold')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()

    # Per-class metrics collected into one table (one row per class)
    precision, recall, f1, support = precision_recall_fscore_support(
        test_y, test_pred_classes
    )
    metric_columns = ('Precision', 'Recall', 'F1-Score', 'Support')
    metrics_df = pd.DataFrame(
        dict(zip(metric_columns, (precision, recall, f1, support)))
    )

    print("\nPer-Class Metrics:")
    print(metrics_df.to_string())

## 13. Save Results

In [None]:
# Collect the experiment configuration and the test metrics in one dict,
# print it, and persist it as JSON
import json

results = {
    'model': 'BiLSTM' if USE_BIDIRECTIONAL else 'LSTM',
    'embedding': f'{EMBEDDING_TYPE}_{EMBEDDING_CONFIG}',
    'embedding_dim': EMBEDDING_DIM,
    'lstm_units': LSTM_UNITS,
    'max_sequence_length': MAX_SEQUENCE_LENGTH,
    'vocab_size': len(word_to_idx),
    'preprocessing': PREPROCESSING_CONFIG,
    'batch_size': BATCH_SIZE,
    # Number of epochs actually run (early stopping may end before EPOCHS)
    'epochs_trained': len(history.history['loss']),
    'classification_type': CLASSIFICATION_TYPE,
}

if CLASSIFICATION_TYPE in ['binary', 'multiclass']:
    results['test_accuracy'] = float(test_results[1])
    results['test_loss'] = float(test_results[0])

    # Add precision, recall, f1 (support-weighted averages over classes)
    precision, recall, f1, _ = precision_recall_fscore_support(test_y, test_pred_classes, average='weighted')
    results['precision'] = float(precision)
    results['recall'] = float(recall)
    results['f1_score'] = float(f1)
else:
    # Import here so this cell does not depend on the detailed-report cell
    # having been executed first (that is the only other place r2_score is
    # imported).
    from sklearn.metrics import r2_score

    results['test_mse'] = float(test_results[0])
    results['test_mae'] = float(test_results[1])
    results['r2_score'] = float(r2_score(test_y, test_pred_classes))

print("\n" + "="*50)
print("FINAL RESULTS SUMMARY")
print("="*50)
for key, value in results.items():
    print(f"{key}: {value}")

# Save results to JSON; the encoding is explicit so the file does not depend
# on the platform default
with open('lstm_glove_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print("\n✓ Results saved to 'lstm_glove_results.json'")

## 14. Model Summary and Comparison

In [None]:
# Closing banner: model type, embedding size, parameter count and the
# headline test metrics for the active task
print("\n" + "="*70)
print("LSTM + GLOVE - EXPERIMENT COMPLETE")
print("="*70)
print(f"\nModel: {'BiLSTM' if USE_BIDIRECTIONAL else 'LSTM'}")
# Report the configured dimension instead of a hard-coded "200" so the
# summary stays correct when the embedding configuration changes
print(f"Embedding: GloVe ({EMBEDDING_DIM}-dim)")
print(f"Total Parameters: {model.count_params():,}")

if CLASSIFICATION_TYPE in ['binary', 'multiclass']:
    print(f"\nFinal Test Accuracy: {results['test_accuracy']:.4f}")
    print(f"F1-Score: {results['f1_score']:.4f}")
else:
    print(f"\nFinal Test MAE: {results['test_mae']:.4f}")
    print(f"R² Score: {results['r2_score']:.4f}")

print("\n" + "="*70)
print("NEXT: Create lstm_tfidf.ipynb for TF-IDF embeddings")
print("="*70)