# Shared Data Preprocessing And EDA Module
## Drug Reviews NLP - Group Project

This notebook is shared by all team members. It shows how to use the shared preprocessing and EDA utilities for the drug reviews dataset, so that everyone prepares the data in the same way before implementing their own model (GRU, RNN, LSTM, Transformer).

**Important**: The functions used here live in the shared `src/` modules, so once you have run through this notebook you can import them directly into your own model notebook instead of rewriting the code.

## 1. Import Required Libraries

First, we import all necessary libraries for data processing, analysis, and visualization.

In [None]:
# Standard library
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from pathlib import Path
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Import shared modules from src/
from src.data_utils import create_dataset_from_dataframe, DrugReviewDataset
from src.preprocessing import TextPreprocessor, get_preprocessor, AdvancedTextPreprocessor
from src.eda import EDAAnalyzer

print("All imports successful!")

## 2. Load and Inspect Drug Reviews Dataset

Before processing, let's load the data and understand its structure. 

In [None]:
# Load the dataset
# Dataset: drugLibTrain_raw.tsv - Tab-separated values file
DATA_PATH = '../data/drugLibTrain_raw.tsv'
TEXT_COLUMN = 'commentsReview'  # Main review text column
LABEL_COLUMN = 'rating'  # Target variable (rating score)
BENEFITS_COLUMN = 'benefitsReview'  # Optional additional text
SIDE_EFFECTS_COLUMN = 'sideEffectsReview'  # Optional additional text
DRUG_COLUMN = 'urlDrugName'  # Drug identifier

# Load TSV file directly with pandas
df = pd.read_csv(DATA_PATH, sep='\t')

# Structural overview: shape, column names, dtypes and per-column null counts
print(f"Dataset loaded successfully from {DATA_PATH}!")
print(f"File format: Tab-separated values (TSV)")
print(f"\nDataset Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nData Types:")
print(df.dtypes)
print(f"\nMissing Values:")
print(df.isnull().sum())

print(f"\nFirst 3 rows:")
# Left as the last expression so the notebook renders the preview table
df.head(3)

## 3. Exploratory Data Analysis

Use the EDAAnalyzer to understand the dataset before preprocessing.

In [None]:
# Build the shared EDA helper on the raw dataframe and tell it which
# columns hold the review text and the target label
analyzer = EDAAnalyzer(df)
analyzer.set_columns(text_column=TEXT_COLUMN, label_column=LABEL_COLUMN)

# Produce the full report first, then display it
eda_report = analyzer.generate_report()
print(eda_report)

In [None]:
# Visualize class distribution
# NOTE(review): `fig` and `axes` are never referenced again in this cell. Whether
# plot_label_distribution() draws onto this figure or opens its own is not
# visible here - confirm; if it opens its own, this call only leaves an empty
# 1x2 figure in the output.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Class distribution bar plot
analyzer.plot_label_distribution()

# Text length distribution
# A separate figure is opened for the histogram
plt.figure(figsize=(10, 5))
# Rows without a review are dropped before measuring length
texts = df[TEXT_COLUMN].dropna()
# Length = number of whitespace-separated tokens in each review
text_lengths = texts.str.split().str.len()
plt.hist(text_lengths, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
plt.title('Distribution of Review Lengths (Word Count)', fontsize=14, fontweight='bold')
plt.xlabel('Number of Words')
plt.ylabel('Frequency')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

# Summary statistics over the same word counts shown in the histogram
print(f"Average review length: {text_lengths.mean():.1f} words")
print(f"Median review length: {text_lengths.median():.1f} words")

## 4. Text Preprocessing

Different models may benefit from different levels of preprocessing. Our team has agreed on three levels:
- **minimal**: For Word2Vec embeddings (preserves raw semantic information)
- **moderate**: General-purpose (RECOMMENDED for most cases)
- **aggressive**: For TF-IDF embeddings (maximizes statistical features)

In [None]:
# Preprocessing level applied to every review in this notebook
PREPROCESSING_CONFIG = 'moderate'  # balances cleaning without over-preprocessing

# Shared preprocessor for the chosen level; safe_preprocess() below calls its process() method
preprocessor = get_preprocessor(PREPROCESSING_CONFIG)

def safe_preprocess(text):
    """Clean a single review with the shared preprocessor, tolerating gaps.

    Returns an empty string when ``text`` is missing according to
    ``pd.isna`` (``None``/``NaN``). Any other value is converted to ``str``
    and handed to ``preprocessor.process``, whose result is returned as-is.
    """
    return "" if pd.isna(text) else preprocessor.process(str(text))

# Apply preprocessing to all texts
print("\nPreprocessing all texts... (this may take a moment)")
processed_texts = df[TEXT_COLUMN].apply(safe_preprocess)

# Show before/after example
print("\nExample - Before and After:")
sample_idx = 0
# A missing review is NaN (a float), which cannot be sliced; converting to
# str first keeps the preview from raising TypeError on such a row.
print(f"\nOriginal: {str(df[TEXT_COLUMN].iloc[sample_idx])[:100]}")
print(f"\nProcessed: {processed_texts.iloc[sample_idx][:100]}")

## 5. Create Dataset Object and Split Data

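Create a DrugReviewDataset object for the full data, then split it into train, validation, and test sets (70% / 10% / 20%, stratified by rating) and create one dataset object per split.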

In [None]:
# Resolve the optional drug column once instead of repeating the check per call
drug_column = DRUG_COLUMN if DRUG_COLUMN in df.columns else None


def _build_dataset(split_df):
    """Create a dataset for ``split_df`` and attach its processed texts.

    The processed texts are looked up in ``processed_texts`` by row label, so
    each review is preprocessed only once (in the preprocessing step above)
    instead of again for every split. This relies on ``df`` having unique
    index labels, which holds for the default index produced by
    ``pd.read_csv``.
    """
    split_dataset = create_dataset_from_dataframe(
        df=split_df,
        text_column=TEXT_COLUMN,
        label_column=LABEL_COLUMN,
        drug_column=drug_column,
    )
    split_dataset.set_processed_texts(processed_texts.loc[split_df.index].tolist())
    return split_dataset


# Create dataset object for the full data
dataset = _build_dataset(df)

print(f"Dataset created with {len(dataset)} samples")
print(f"Class distribution: {dataset.class_distribution()}")

# Split data using sklearn
from sklearn.model_selection import train_test_split

# First split to separate test set with 20 percent of the data
train_val_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df[LABEL_COLUMN],
    random_state=42
)

# Second split to separate validation set from training with 10 percent.
# 0.1 / 0.8 expresses "10% of the full data" as a fraction of the remaining 80%.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.1 / 0.8,
    stratify=train_val_df[LABEL_COLUMN],
    random_state=42
)

print(f"\nTrain set: {len(train_df)} samples ({len(train_df)/len(df)*100:.1f}%)")
print(f"Val set: {len(val_df)} samples ({len(val_df)/len(df)*100:.1f}%)")
print(f"Test set: {len(test_df)} samples ({len(test_df)/len(df)*100:.1f}%)")

# Create separate dataset objects for each split
train_dataset = _build_dataset(train_df)
val_dataset = _build_dataset(val_df)
test_dataset = _build_dataset(test_df)


## 6. Using the Processed Data with Embeddings

Once you have the preprocessed data, you can create embeddings. Here's an example with the shared embedding modules.

In [None]:
# NOTE(review): the other shared modules are imported from `src.`; confirm that
# `embeddings` is importable from the '../' entry added to sys.path.
from embeddings import get_embedding

print("=" * 70)
print("TEAM-FINALIZED EMBEDDINGS: Creating all 3 for reference")
print("=" * 70)

# Embedding 1: Word2Vec Skip-gram (PRIMARY - Semantic understanding)
print("\nEmbedding 1: Word2Vec Skip-gram (PRIMARY)")
print("-" * 70)
w2v = get_embedding('word2vec', 'skipgram_medium')
# Separate TextPreprocessor used only to tokenize the already-processed texts
tokenizer = TextPreprocessor(lowercase=True, remove_punctuation=True, remove_stopwords=True)
train_tokens = [tokenizer.tokenize(text) for text in train_dataset.get_processed()]
# The embedding is fitted on the training tokens only; validation is only encoded
w2v.fit(train_tokens)
w2v_train_vectors = w2v.encode_texts(train_tokens)
w2v_val_tokens = [tokenizer.tokenize(text) for text in val_dataset.get_processed()]
w2v_val_vectors = w2v.encode_texts(w2v_val_tokens)
print(f"Word2Vec vectors shape: {w2v_train_vectors.shape}")
print(f"Vocabulary size: {w2v.get_vocabulary_size()}")
print(f"Use for: GRU, LSTM, RNN, Transformer models (best semantic understanding)")

# Embedding 2: GloVe (SECONDARY - Global context)
print("\nEmbedding 2: GloVe (SECONDARY)")
print("-" * 70)
glove = get_embedding('glove', 'medium')
glove.fit(train_tokens)
glove_train_vectors = glove.encode_texts(train_tokens)
# Despite the `w2v_` prefix, these are just the validation tokens built above; GloVe reuses them
glove_val_vectors = glove.encode_texts(w2v_val_tokens)
print(f"GloVe vectors shape: {glove_train_vectors.shape}")
print(f"Use for: Deep learning models (captures global patterns)")

# Embedding 3: TF-IDF (BASELINE - Statistical)
print("\nEmbedding 3: TF-IDF (BASELINE)")
print("-" * 70)
tfidf = get_embedding('tfidf', 'balanced')
# TF-IDF takes the processed text strings directly rather than token lists
# NOTE(review): training uses fit_transform while validation uses transform_dense;
# whether fit_transform also returns a dense array is not visible here - confirm.
tfidf_train_vectors = tfidf.fit_transform(train_dataset.get_processed())
tfidf_val_vectors = tfidf.transform_dense(val_dataset.get_processed())
print(f"TF-IDF vectors shape: {tfidf_train_vectors.shape}")
print(f"Use for: Traditional ML and baseline comparisons (interpretable)")

print("\n" + "=" * 70)
print("SUMMARY: All 3 team embeddings reference created!")
print("=" * 70)
print("\nNext Steps:")
print("1. Create gru_model.ipynb (or your_model.ipynb)")
print("2. In that ONE notebook, implement your model with all 3 embeddings")
print("3. Compare results across all 3 embedding types in one place!")
print("\nThis shared notebook shows you how to create and use each embedding.")

## 7. How to Use This in Your Own Model Notebook

Create ONE notebook per team member where you implement your model with ALL 3 embeddings. For example:

**Essie (GRU)**: Create `gru_model.ipynb` with:
- Section 1: Load and preprocess data (like this notebook)
- Section 2: Build GRU model with Word2Vec Skip-gram embeddings + train + evaluate
- Section 3: Build GRU model with GloVe embeddings + train + evaluate  
- Section 4: Build GRU model with TF-IDF embeddings + train + evaluate
- Section 5: Compare all 3 embedding results

**Structure for your model notebook**:

```python
# At the top of your model notebook (e.g., gru_model.ipynb)
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models

# Import shared utilities
from src.preprocessing import get_preprocessor, TextPreprocessor
from src.eda import EDAAnalyzer
from src.data_utils import create_dataset_from_dataframe
from embeddings import get_embedding

# ====================================================================
# SECTION 1: Load and Preprocess Data (same for all models)
# ====================================================================
DATA_PATH = '../data/drugLibTrain_raw.tsv'
TEXT_COLUMN = 'commentsReview'
LABEL_COLUMN = 'rating'

df = pd.read_csv(DATA_PATH, sep='\t')

# Preprocess
preprocessor = get_preprocessor('moderate')
def safe_preprocess(text):
    """Return the preprocessed text, or "" when the review is missing."""
    if pd.isna(text):
        return ""
    return preprocessor.process(str(text))

processed_texts = df[TEXT_COLUMN].apply(safe_preprocess)

# Split data: 20% test first, then 10% of the full data as validation.
# train_test_split returns exactly two subsets per input array, so the
# three-way split needs two calls; unpacking one call into three names
# raises ValueError.
train_val_df, test_df = train_test_split(df, test_size=0.2,
                                         stratify=df[LABEL_COLUMN],
                                         random_state=42)
# 0.1/0.8 expresses "10% of the full data" as a fraction of the remaining 80%
train_df, val_df = train_test_split(train_val_df, test_size=0.1/0.8,
                                    stratify=train_val_df[LABEL_COLUMN],
                                    random_state=42)

# ====================================================================
# Shared setup for Sections 2-4
# ====================================================================
# The label column holds rating scores rather than 0/1 flags (presumably 1-10
# in this dataset - confirm), so a sigmoid output with binary_crossentropy does
# not fit it. Each distinct rating is mapped to a 0-based class id, which is
# what sparse_categorical_crossentropy expects, and the models end in a softmax
# over those classes.
class_values = sorted(df[LABEL_COLUMN].unique())
class_ids = {value: idx for idx, value in enumerate(class_values)}
num_classes = len(class_values)

y_train = train_df[LABEL_COLUMN].map(class_ids).values
y_val = val_df[LABEL_COLUMN].map(class_ids).values
y_test = test_df[LABEL_COLUMN].map(class_ids).values

# Preprocess and tokenize every split once; all three embeddings reuse these
train_texts = train_df[TEXT_COLUMN].apply(safe_preprocess)
val_texts = val_df[TEXT_COLUMN].apply(safe_preprocess)
test_texts = test_df[TEXT_COLUMN].apply(safe_preprocess)

tokenizer = TextPreprocessor(lowercase=True, remove_punctuation=True)
train_tokens = [tokenizer.tokenize(text) for text in train_texts]
val_tokens = [tokenizer.tokenize(text) for text in val_texts]
test_tokens = [tokenizer.tokenize(text) for text in test_texts]


def train_and_evaluate_gru(input_dim, train_x, val_x, test_x):
    """Build, train and evaluate one GRU classifier on per-review vectors.

    Args:
        input_dim: Length of each review vector.
        train_x, val_x, test_x: Feature matrices for the three splits; each
            row is assumed to be one review vector of length ``input_dim``.

    Returns:
        A ``(model, results)`` tuple where ``results`` is the list returned
        by ``model.evaluate`` on the test split: ``[loss, accuracy]``.
    """
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        # Each review is a single vector, so it is fed as a length-1 sequence
        layers.Reshape((1, input_dim)),
        layers.GRU(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.fit(train_x, y_train,
              validation_data=(val_x, y_val),
              epochs=10, batch_size=32, verbose=1)
    return model, model.evaluate(test_x, y_test)


# ====================================================================
# SECTION 2: GRU Model with Word2Vec Skip-gram (PRIMARY)
# ====================================================================
print("Building GRU with Word2Vec Skip-gram embeddings...")

w2v = get_embedding('word2vec', 'skipgram_medium')
# Fit on training tokens only; validation and test are only encoded
w2v.fit(train_tokens)
train_vectors = w2v.encode_texts(train_tokens)
val_vectors = w2v.encode_texts(val_tokens)
test_vectors = w2v.encode_texts(test_tokens)

# Build and train GRU model with Word2Vec
gru_w2v, w2v_results = train_and_evaluate_gru(
    w2v.embedding_dim, train_vectors, val_vectors, test_vectors)

# ====================================================================
# SECTION 3: GRU Model with GloVe (SECONDARY)
# ====================================================================
print("Building GRU with GloVe embeddings...")

glove = get_embedding('glove', 'medium')
glove.fit(train_tokens)
train_vectors = glove.encode_texts(train_tokens)
val_vectors = glove.encode_texts(val_tokens)
test_vectors = glove.encode_texts(test_tokens)

# Build and train GRU model with GloVe
gru_glove, glove_results = train_and_evaluate_gru(
    glove.embedding_dim, train_vectors, val_vectors, test_vectors)

# ====================================================================
# SECTION 4: GRU Model with TF-IDF (BASELINE)
# ====================================================================
print("Building GRU with TF-IDF embeddings...")

tfidf = get_embedding('tfidf', 'balanced')
# TF-IDF works on the processed text strings, not on token lists
# NOTE(review): confirm fit_transform returns a dense array like transform_dense
train_vectors = tfidf.fit_transform(train_texts)
val_vectors = tfidf.transform_dense(val_texts)
test_vectors = tfidf.transform_dense(test_texts)

# Build and train GRU model with TF-IDF
gru_tfidf, tfidf_results = train_and_evaluate_gru(
    tfidf.get_embedding_dim(), train_vectors, val_vectors, test_vectors)

# ====================================================================
# SECTION 5: Compare Results
# ====================================================================
import pandas as pd

# One (display name, evaluate() output) pair per embedding, in display order;
# index 0 of each output is reported as loss and index 1 as accuracy
evaluations = [
    ('Word2Vec Skip-gram', w2v_results),
    ('GloVe', glove_results),
    ('TF-IDF', tfidf_results),
]
results_df = pd.DataFrame(
    [(label, scores[0], scores[1]) for label, scores in evaluations],
    columns=['Embedding', 'Loss', 'Accuracy'],
)
print("\nGRU Model Results Comparison:")
print(results_df)
```

**Key Points for Your Implementation**:
1. Load and preprocess data once at the beginning
2. For each of 3 embeddings: create embedding, train GRU, evaluate
3. Store results and compare at the end
4. This single notebook contains everything for direct comparison

**Expected notebook filename**: `gru_model.ipynb` (or similar for your model type)