# LSTM Model with TF-IDF Embeddings
## Drug Reviews Classification - Gershom

**Model**: LSTM (Long Short-Term Memory) / BiLSTM
**Embedding**: TF-IDF (5000 features, balanced config)
**Task**: Drug review rating prediction

---

### Important Note:
TF-IDF produces one sparse, order-free vector per document (not a sequence of dense word embeddings), so we use a different architecture:
- No embedding layer is needed
- TF-IDF vectors are fed directly to Dense layers OR reshaped into fixed-size chunks for the LSTM
- This notebook shows an LSTM architecture adapted for TF-IDF features

### Notebook Structure:
1. Setup and Data Loading
2. TF-IDF Vectorization
3. LSTM Model Architecture (TF-IDF adapted)
4. Training & Evaluation
5. Results Analysis

## 1. Setup and Imports

In [2]:
# Standard libraries
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')

# Deep learning libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    LSTM, Bidirectional, Dense, Dropout, Reshape, Input
)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

# Shared modules
from src.data_utils import DataLoader, create_dataset_from_dataframe
from src.preprocessing import get_preprocessor, TextPreprocessor
from src.eda import EDAAnalyzer
from embeddings.tfidf_embedding import TfidfEmbedding, get_tfidf_embedding

print("✓ All imports successful!")
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")

ModuleNotFoundError: No module named 'pandas'

## 2. Configuration

In [None]:
# Paths to the train / validation / test CSV splits (relative to this notebook)
TRAIN_PATH = '../data/drug_review_train.csv'
VAL_PATH = '../data/drug_review_validation.csv'
TEST_PATH = '../data/drug_review_test.csv'

# Names of the text and label columns read from the CSV files
TEXT_COLUMN = 'review'
LABEL_COLUMN = 'rating'

# Preprocessing preset handed to get_preprocessor().
# NOTE(review): this comment used to say TF-IDF works better with minimal
# preprocessing, yet 'moderate' is the selected preset -- confirm which is intended.
PREPROCESSING_CONFIG = 'moderate'  # minimal, moderate, or aggressive

# Embedding configuration
EMBEDDING_TYPE = 'tfidf'
EMBEDDING_CONFIG = 'balanced'  # 5000 features
# Expected TF-IDF vector width.
# NOTE(review): the vectorizer is created from EMBEDDING_CONFIG alone, so this
# value is presumably expected to match that preset -- verify they agree.
TFIDF_MAX_FEATURES = 5000

# Model architecture - Different for TF-IDF
USE_LSTM = True  # Set False to use Dense-only architecture (faster)
USE_BIDIRECTIONAL = True  # Set to True for BiLSTM, False for LSTM
LSTM_UNITS = 64  # Smaller since TF-IDF already has 5000 features
DROPOUT_RATE = 0.4  # Higher dropout for TF-IDF
RECURRENT_DROPOUT = 0.2
DENSE_UNITS = 128

# Training hyperparameters
BATCH_SIZE = 32
EPOCHS = 20  # TF-IDF models may need more epochs
LEARNING_RATE = 0.001
PATIENCE = 3  # epochs without val_loss improvement tolerated by early stopping

# Random seed for reproducibility (seeds NumPy and TensorFlow only)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Echo the effective configuration so it is recorded in the notebook output
print("Configuration set!")
print(f"Model type: {'LSTM' if USE_LSTM else 'Dense-only'}")
if USE_LSTM:
    print(f"LSTM variant: {'BiLSTM' if USE_BIDIRECTIONAL else 'LSTM'}")
print(f"Embedding: {EMBEDDING_TYPE} ({EMBEDDING_CONFIG})")
print(f"TF-IDF features: {TFIDF_MAX_FEATURES}")

## 3. Load and Preprocess Data

In [None]:
# Read the three CSV splits into DataFrames (train, validation, test -- in that order)
print("Loading datasets...")
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

# Report split sizes, the available columns and the label balance of the training split
for split_name, split_df in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{split_name} size: {len(split_df)}")
print(f"\nColumns: {list(train_df.columns)}")
print("\nClass distribution (train):")
rating_counts = train_df[LABEL_COLUMN].value_counts().sort_index()
print(rating_counts)

In [None]:
# Build the text preprocessor for the selected preset
preprocessor = get_preprocessor(PREPROCESSING_CONFIG)
print(f"Using '{PREPROCESSING_CONFIG}' preprocessing configuration")

print("\nPreprocessing texts...")

# Raw review strings per split; missing reviews are replaced by empty strings
train_texts, val_texts, test_texts = (
    frame[TEXT_COLUMN].fillna('').tolist() for frame in (train_df, val_df, test_df)
)

# Run the preprocessor over each split (train, then validation, then test).
# Original note: process_batch is expected to return strings, not token lists.
train_processed, val_processed, test_processed = (
    preprocessor.process_batch(texts)
    for texts in (train_texts, val_texts, test_texts)
)

# Rating column of each split as an array
train_labels, val_labels, test_labels = (
    frame[LABEL_COLUMN].values for frame in (train_df, val_df, test_df)
)

print("✓ Preprocessing complete!")
first_review_preview = train_processed[0][:100]  # first 100 chars of the first processed review
print(f"Example processed review: {first_review_preview}...")

## 4. Create TF-IDF Vectors

In [None]:
# Create the TF-IDF vectorizer for the chosen preset and fit it on the training split only
print("Training TF-IDF vectorizer...")
tfidf_model = get_tfidf_embedding(EMBEDDING_CONFIG)
tfidf_model.fit(train_processed)

# Vectorize every split with the fitted model, requesting dense arrays
X_train, X_val, X_test = (
    tfidf_model.transform_dense(documents)
    for documents in (train_processed, val_processed, test_processed)
)

# Summarize the resulting feature space and matrix shapes
print("\n✓ TF-IDF vectorization complete!")
embedding_dim = tfidf_model.get_embedding_dim()
print(f"Feature dimension: {embedding_dim}")
vocabulary_size = len(tfidf_model.get_vocabulary())
print(f"Vocabulary size: {vocabulary_size}")
print(f"\nTrain shape: {X_train.shape}")
for split_label, split_matrix in (("Val", X_val), ("Test", X_test)):
    print(f"{split_label} shape: {split_matrix.shape}")

In [None]:
# Percentage of entries in the training matrix that are exactly zero
zero_entries = (X_train == 0).sum()
sparsity_train = zero_entries / X_train.size * 100
print(f"\nTF-IDF sparsity: {sparsity_train:.2f}% of values are zero")

# Count the non-zero features of each document (row), then average over documents
nonzero_per_document = (X_train != 0).sum(axis=1)
print(f"Average non-zero features per document: {nonzero_per_document.mean():.1f}")

# List the 20 top terms as reported by the fitted TF-IDF model
print("\nTop 20 TF-IDF features:")
top_terms = tfidf_model.get_top_terms(20)
print(top_terms)

## 5. Process Labels for Classification

In [None]:
# Check the rating distribution
print("Rating distribution:")
print(train_df[LABEL_COLUMN].value_counts().sort_index())

# Target formulation. Any value other than 'binary' or 'multiclass' is treated
# as regression by the branch below.
CLASSIFICATION_TYPE = 'binary'  # 'binary', 'multiclass', or 'regression'
THRESHOLD = 6  # binary only: ratings >= THRESHOLD are positive (1), the rest negative (0)

if CLASSIFICATION_TYPE == 'binary':
    train_y = (train_labels >= THRESHOLD).astype(int)
    val_y = (val_labels >= THRESHOLD).astype(int)
    test_y = (test_labels >= THRESHOLD).astype(int)
    NUM_CLASSES = 2
    print(f"\nBinary classification: >= {THRESHOLD} = positive")
    print(f"Train class distribution: {np.bincount(train_y)}")
elif CLASSIFICATION_TYPE == 'multiclass':
    # Shift ratings down by one so they become zero-based class indices
    # (assumes the lowest rating is 1 -- TODO confirm against the dataset).
    train_y = train_labels.astype(int) - 1
    val_y = val_labels.astype(int) - 1
    test_y = test_labels.astype(int) - 1
    # Size the label space from the largest class index seen in any split rather
    # than from the number of distinct training labels: if a rating is missing
    # from the training split, counting unique values would make the softmax
    # too small for the remaining (higher) class indices.
    NUM_CLASSES = int(max(train_y.max(), val_y.max(), test_y.max())) + 1
    print(f"\nMulti-class classification: {NUM_CLASSES} classes")
else:  # regression
    # Predict the rating itself as a float target
    train_y = train_labels.astype(float)
    val_y = val_labels.astype(float)
    test_y = test_labels.astype(float)
    NUM_CLASSES = 1
    print("\nRegression: Predicting exact rating")

## 6. Prepare Data for LSTM (Reshape TF-IDF)

In [None]:
if USE_LSTM:
    # LSTM expects 3D input: (batch, timesteps, features).
    # The flat TF-IDF vector is cut into consecutive chunks of CHUNK_SIZE
    # columns, and each chunk is presented to the LSTM as one timestep.
    CHUNK_SIZE = 100  # TF-IDF columns per timestep

    # Derive the number of timesteps from the ACTUAL matrix width using ceiling
    # division. Relying on the TFIDF_MAX_FEATURES constant with floor division
    # silently dropped trailing features whenever the real width was larger
    # than, or not a multiple of, the assumed size.
    n_features = X_train.shape[1]
    TIMESTEPS = -(-n_features // CHUNK_SIZE)
    target_size = TIMESTEPS * CHUNK_SIZE

    def _to_sequence(matrix):
        """Zero-pad the columns of `matrix` to target_size and reshape it to 3D."""
        if matrix.shape[1] != n_features:
            # A width mismatch could otherwise be hidden by reshape(-1, ...)
            raise ValueError(
                f"Expected {n_features} TF-IDF features, got {matrix.shape[1]}"
            )
        missing = target_size - n_features
        if missing > 0:
            matrix = np.pad(matrix, ((0, 0), (0, missing)), mode='constant')
        return matrix.reshape(-1, TIMESTEPS, CHUNK_SIZE)

    print(f"\nReshaping TF-IDF for LSTM...")
    print(f"Original shape: {X_train.shape}")

    # Reshape to (samples, timesteps, features_per_timestep)
    X_train_reshaped = _to_sequence(X_train)
    X_val_reshaped = _to_sequence(X_val)
    X_test_reshaped = _to_sequence(X_test)

    print(f"Reshaped for LSTM: {X_train_reshaped.shape}")
    print(f"(samples, timesteps={TIMESTEPS}, features_per_step={CHUNK_SIZE})")

    # Use reshaped data
    X_train_input = X_train_reshaped
    X_val_input = X_val_reshaped
    X_test_input = X_test_reshaped
else:
    # For Dense-only model, use original 2D shape
    X_train_input = X_train
    X_val_input = X_val
    X_test_input = X_test
    print("\nUsing 2D TF-IDF vectors for Dense-only model")

## 7. Build Model (LSTM or Dense)

In [None]:
def build_tfidf_lstm_model(
    input_shape,
    lstm_units=64,
    dropout_rate=0.4,
    recurrent_dropout=0.2,
    dense_units=128,
    num_classes=2,
    use_bidirectional=True,
    classification_type='binary',
    learning_rate=None
):
    """Build and compile an LSTM/BiLSTM model for chunked TF-IDF features.

    Args:
        input_shape: Shape of one sample, (timesteps, features_per_timestep).
        lstm_units: Units of the LSTM layer (per direction when bidirectional).
        dropout_rate: Dropout on the LSTM inputs and after each hidden Dense layer.
        recurrent_dropout: Dropout on the LSTM recurrent connections.
        dense_units: Width of the first Dense layer; the second uses half of it.
        num_classes: Output width; only used when classification_type is 'multiclass'.
        use_bidirectional: Wrap the LSTM in Bidirectional when True.
        classification_type: 'binary' -> one sigmoid unit with binary cross-entropy;
            'multiclass' -> softmax with sparse categorical cross-entropy;
            any other value -> one linear unit trained with MSE (regression).
        learning_rate: Adam learning rate. None falls back to the notebook-level
            LEARNING_RATE constant.

    Returns:
        The compiled Sequential model.
    """
    if learning_rate is None:
        learning_rate = LEARNING_RATE

    model = Sequential()

    # Declare the input shape with an explicit Input layer. An `input_shape`
    # kwarg on an LSTM that is wrapped in Bidirectional is not picked up by
    # Sequential, which leaves the model unbuilt (model.summary() may then fail
    # or show no shapes) until the first batch is seen.
    model.add(Input(shape=input_shape))

    # Recurrent layer (input is already 3D from the reshape step)
    if use_bidirectional:
        model.add(Bidirectional(
            LSTM(
                lstm_units,
                dropout=dropout_rate,
                recurrent_dropout=recurrent_dropout
            ),
            name='bilstm'
        ))
    else:
        model.add(LSTM(
            lstm_units,
            dropout=dropout_rate,
            recurrent_dropout=recurrent_dropout,
            name='lstm'
        ))

    # Dense head: two ReLU layers, each followed by dropout
    model.add(Dense(dense_units, activation='relu', name='dense_1'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(dense_units // 2, activation='relu', name='dense_2'))
    model.add(Dropout(dropout_rate))

    # Output layer, loss and metric depend on the task formulation
    if classification_type == 'binary':
        model.add(Dense(1, activation='sigmoid', name='output'))
        loss = 'binary_crossentropy'
        metrics = ['accuracy']
    elif classification_type == 'multiclass':
        model.add(Dense(num_classes, activation='softmax', name='output'))
        loss = 'sparse_categorical_crossentropy'
        metrics = ['accuracy']
    else:  # regression
        model.add(Dense(1, activation='linear', name='output'))
        loss = 'mse'
        metrics = ['mae']

    # Compile
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=loss,
        metrics=metrics
    )

    return model

def build_tfidf_dense_model(
    input_dim,
    dense_units=256,
    dropout_rate=0.4,
    num_classes=2,
    classification_type='binary',
    learning_rate=None
):
    """Build and compile a Dense-only model for flat TF-IDF features.

    Args:
        input_dim: Number of TF-IDF features per sample.
        dense_units: Width of the first Dense layer; the following two layers
            use one half and one quarter of it.
        dropout_rate: Dropout applied after each hidden Dense layer.
        num_classes: Output width; only used when classification_type is 'multiclass'.
        classification_type: 'binary' -> one sigmoid unit with binary cross-entropy;
            'multiclass' -> softmax with sparse categorical cross-entropy;
            any other value -> one linear unit trained with MSE (regression).
        learning_rate: Adam learning rate. None falls back to the notebook-level
            LEARNING_RATE constant.

    Returns:
        The compiled Sequential model.
    """
    if learning_rate is None:
        learning_rate = LEARNING_RATE

    model = Sequential()

    # Explicit Input layer instead of the deprecated `input_dim` layer kwarg
    model.add(Input(shape=(input_dim,)))

    # Three ReLU layers of shrinking width, each followed by dropout
    model.add(Dense(dense_units, activation='relu', name='dense_1'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(dense_units // 2, activation='relu', name='dense_2'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(dense_units // 4, activation='relu', name='dense_3'))
    model.add(Dropout(dropout_rate))

    # Output layer, loss and metric depend on the task formulation
    if classification_type == 'binary':
        model.add(Dense(1, activation='sigmoid', name='output'))
        loss = 'binary_crossentropy'
        metrics = ['accuracy']
    elif classification_type == 'multiclass':
        model.add(Dense(num_classes, activation='softmax', name='output'))
        loss = 'sparse_categorical_crossentropy'
        metrics = ['accuracy']
    else:  # regression
        model.add(Dense(1, activation='linear', name='output'))
        loss = 'mse'
        metrics = ['mae']

    # Compile
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=loss,
        metrics=metrics
    )

    return model

# Instantiate the network that matches the configured architecture
print("Building model...")

# Arguments that both builders receive unchanged
shared_kwargs = dict(
    dropout_rate=DROPOUT_RATE,
    num_classes=NUM_CLASSES,
    classification_type=CLASSIFICATION_TYPE,
)

if not USE_LSTM:
    # Dense-only network consumes the flat 2D matrix; its first layer is twice DENSE_UNITS wide
    input_dim = X_train_input.shape[1]
    model = build_tfidf_dense_model(
        input_dim=input_dim,
        dense_units=DENSE_UNITS * 2,
        **shared_kwargs,
    )
else:
    # Recurrent network consumes the 3D tensor: per-sample shape is (timesteps, features)
    input_shape = (X_train_input.shape[1], X_train_input.shape[2])
    model = build_tfidf_lstm_model(
        input_shape=input_shape,
        lstm_units=LSTM_UNITS,
        recurrent_dropout=RECURRENT_DROPOUT,
        dense_units=DENSE_UNITS,
        use_bidirectional=USE_BIDIRECTIONAL,
        **shared_kwargs,
    )

print("\n✓ Model built successfully!")
model.summary()

## 8. Setup Callbacks

In [None]:
# All three callbacks watch the validation loss
monitored_quantity = 'val_loss'

# Stop after PATIENCE epochs without improvement and restore the best weights
early_stopping = EarlyStopping(
    monitor=monitored_quantity,
    patience=PATIENCE,
    restore_best_weights=True,
    verbose=1,
)

# Halve the learning rate after 2 epochs without improvement, never below 1e-7
lr_scheduler = ReduceLROnPlateau(
    monitor=monitored_quantity,
    factor=0.5,
    patience=2,
    min_lr=1e-7,
    verbose=1,
)

# Keep only the best model seen so far on disk
checkpoint = ModelCheckpoint(
    filepath='best_lstm_tfidf_model.h5',
    monitor=monitored_quantity,
    save_best_only=True,
    verbose=1,
)

callbacks = [early_stopping, lr_scheduler, checkpoint]

print("Callbacks configured")

## 9. Train the Model

In [None]:
# Announce the run configuration before fitting
print("\nStarting training...")
model_label = 'LSTM with TF-IDF' if USE_LSTM else 'Dense with TF-IDF'
print(f"Model: {model_label}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Max epochs: {EPOCHS}")
print("=" * 50)

# Fit on the training split, validating on the validation split after every epoch
fit_options = dict(
    validation_data=(X_val_input, val_y),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1,
)
history = model.fit(X_train_input, train_y, **fit_options)

print("\n✓ Training complete!")

## 10. Plot Training History

In [None]:
# Two side-by-side panels: loss on the left, the task metric on the right
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Classification runs track accuracy; regression runs track mean absolute error
if CLASSIFICATION_TYPE in ['binary', 'multiclass']:
    metric_key, metric_name = 'accuracy', 'Accuracy'
else:  # regression
    metric_key, metric_name = 'mae', 'MAE'

# (axis, history key, display name) for each panel
panels = (
    (axes[0], 'loss', 'Loss'),
    (axes[1], metric_key, metric_name),
)

for panel_ax, history_key, display_name in panels:
    # Training curve first, then the matching validation curve
    panel_ax.plot(history.history[history_key], label=f'Train {display_name}', linewidth=2)
    panel_ax.plot(history.history[f'val_{history_key}'], label=f'Validation {display_name}', linewidth=2)
    panel_ax.set_title(f'Model {display_name}', fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(display_name)
    panel_ax.legend()
    panel_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 11. Evaluate on Test Set

In [None]:
# Score the trained model on the held-out test split
print("Evaluating on test set...")
test_results = model.evaluate(X_test_input, test_y, verbose=1)

# evaluate() yields the loss first, then the single compiled metric
test_loss_value, test_metric_value = test_results[0], test_results[1]

if CLASSIFICATION_TYPE not in ['binary', 'multiclass']:
    # Regression: the loss is MSE and the metric is MAE
    print(f"\nTest Loss (MSE): {test_loss_value:.4f}")
    print(f"Test MAE: {test_metric_value:.4f}")
else:
    print(f"\nTest Loss: {test_loss_value:.4f}")
    print(f"Test Accuracy: {test_metric_value:.4f}")

In [None]:
# Raw model outputs for the test split
print("\nGenerating predictions...")
test_predictions = model.predict(X_test_input, verbose=1)

# Convert raw outputs into the final prediction vector for the chosen task
if CLASSIFICATION_TYPE == 'multiclass':
    # Index of the highest-scoring class per sample
    test_pred_classes = np.argmax(test_predictions, axis=1)
elif CLASSIFICATION_TYPE == 'binary':
    # Sigmoid output strictly above 0.5 counts as the positive class
    above_threshold = test_predictions > 0.5
    test_pred_classes = above_threshold.astype(int).flatten()
else:  # regression
    # Keep the continuous outputs, flattened to 1D
    test_pred_classes = test_predictions.flatten()

print("✓ Predictions generated")

## 12. Detailed Classification Report

In [None]:
banner = "=" * 50

if CLASSIFICATION_TYPE not in ['binary', 'multiclass']:
    # Regression: error metrics between the true ratings and the predicted values
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

    mse = mean_squared_error(test_y, test_pred_classes)
    mae = mean_absolute_error(test_y, test_pred_classes)
    r2 = r2_score(test_y, test_pred_classes)

    print("\n" + banner)
    print("REGRESSION METRICS")
    print(banner)
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"R² Score: {r2:.4f}")
else:
    # Text report with precision / recall / F1 per class
    print("\n" + banner)
    print("CLASSIFICATION REPORT")
    print(banner)
    report_text = classification_report(test_y, test_pred_classes, digits=4)
    print(report_text)

    # Confusion matrix rendered as an annotated heatmap
    cm = confusion_matrix(test_y, test_pred_classes)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True)
    plt.title('Confusion Matrix', fontsize=14, fontweight='bold')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()

    # Per-class metrics collected into one table (one row per class)
    precision, recall, f1, support = precision_recall_fscore_support(test_y, test_pred_classes)
    per_class_columns = {
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Support': support,
    }
    metrics_df = pd.DataFrame(per_class_columns)

    print("\nPer-Class Metrics:")
    print(metrics_df.to_string())

## 13. Save Results

In [None]:
# Collect the run configuration and headline test metrics, then persist them as JSON
import json

# Configuration part of the record
results = {
    'model': 'LSTM_TF-IDF' if USE_LSTM else 'Dense_TF-IDF',
    'embedding': f'{EMBEDDING_TYPE}_{EMBEDDING_CONFIG}',
    'tfidf_features': TFIDF_MAX_FEATURES,
    'lstm_units': LSTM_UNITS if USE_LSTM else 'N/A',
    'preprocessing': PREPROCESSING_CONFIG,
    'batch_size': BATCH_SIZE,
    'epochs_trained': len(history.history['loss']),  # may be below EPOCHS if early stopping fired
    'classification_type': CLASSIFICATION_TYPE,
}

# Metric part of the record; values are cast to plain floats so json can serialize them
if CLASSIFICATION_TYPE in ['binary', 'multiclass']:
    results['test_accuracy'] = float(test_results[1])
    results['test_loss'] = float(test_results[0])

    # Support-weighted averages across classes
    precision, recall, f1, _ = precision_recall_fscore_support(test_y, test_pred_classes, average='weighted')
    results['precision'] = float(precision)
    results['recall'] = float(recall)
    results['f1_score'] = float(f1)
else:
    # Import locally so this cell does not depend on another cell having
    # imported r2_score first (it is not part of the notebook-level imports).
    from sklearn.metrics import r2_score

    results['test_mse'] = float(test_results[0])
    results['test_mae'] = float(test_results[1])
    results['r2_score'] = float(r2_score(test_y, test_pred_classes))

print("\n" + "="*50)
print("FINAL RESULTS SUMMARY")
print("="*50)
for key, value in results.items():
    print(f"{key}: {value}")

# Save results to JSON
with open('lstm_tfidf_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print("\n✓ Results saved to 'lstm_tfidf_results.json'")

## 14. Model Summary and Comparison

In [None]:
# Closing banner with the headline numbers of this experiment
print("\n" + "="*70)
print("LSTM + TF-IDF - EXPERIMENT COMPLETE")
print("="*70)
print(f"\nModel: {'LSTM with TF-IDF' if USE_LSTM else 'Dense with TF-IDF'}")
# Report the configured feature count instead of a hard-coded number so the
# summary stays correct when TFIDF_MAX_FEATURES is changed.
print(f"Embedding: TF-IDF ({TFIDF_MAX_FEATURES} features)")
print(f"Total Parameters: {model.count_params():,}")

# Headline metrics come from the results dictionary built in the previous step
if CLASSIFICATION_TYPE in ['binary', 'multiclass']:
    print(f"\nFinal Test Accuracy: {results['test_accuracy']:.4f}")
    print(f"F1-Score: {results['f1_score']:.4f}")
else:
    print(f"\nFinal Test MAE: {results['test_mae']:.4f}")
    print(f"R² Score: {results['r2_score']:.4f}")

# Pointer to the result files of the sibling embedding experiments
print("\n" + "="*70)
print("ALL THREE EMBEDDINGS COMPLETE!")
print("Compare results from:")
print("- lstm_word2vec_results.json")
print("- lstm_glove_results.json")
print("- lstm_tfidf_results.json")
print("="*70)