# Multi-Class Review Ranker: Final Model Analysis

This notebook presents the complete pipeline for the Multi-Class Review Ranker project. 
It covers:
1.  **Data Preparation**: Loading, Cleaning, and EDA.
2.  **Model Training**: Implementing and Tuning the 3 strategies.
    *   **Naive Bayes (Tuned)**
    *   **SVM (Tuned)**
    *   **Word2Vec with POS-Lemmatization**
3.  **Final Comparison**: Selecting the best model.

## 1. Imports and Setup
Import necessary libraries and download NLTK resources.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import nltk
from nltk.corpus import stopwords, words, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from collections import Counter

sns.set_style('darkgrid')
%matplotlib inline

# Fetch the NLTK data packages used by the preprocessing cells below.
# quiet=True suppresses the downloader's progress output.
for nltk_resource in ('stopwords', 'wordnet', 'words', 'punkt_tab', 'omw-1.4'):
    nltk.download(nltk_resource, quiet=True)

In [None]:
import os

# Configuration: directory (relative to the working directory) for serialized models.
models_dir = '../saved_models'

# exist_ok=True makes this cell safe to re-run once the directory exists.
os.makedirs(name=models_dir, exist_ok=True)
print(f"Models directory set to: {models_dir}")

## 2. Load Dataset
Load the raw Amazon Reviews dataset.

In [None]:
# Raw CSV location, relative to the working directory.
file_path = '../data/raw/Amazon_Reviews.csv'

# on_bad_lines='skip' drops rows the parser cannot handle instead of aborting the load.
df = pd.read_csv(file_path, on_bad_lines='skip', engine='python')

print(f"Dataset Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
df.head()

## 3. Missing Values Analysis
Visualize missing data using a heatmap.

In [None]:
# Compute the null mask once and reuse it for the heatmap and the per-column totals.
null_mask = df.isnull()

sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(null_mask, cbar=True, yticklabels=False, cmap='viridis', ax=ax)
ax.set_title('Missing Values Heatmap')
fig.tight_layout()
plt.show()

print("Missing values per column:")
print(null_mask.sum())

## 4. Data Cleaning & Rating Extraction
Extract numeric ratings and drop rows with missing values.

In [None]:
# Ratings are stored as text such as "Rated 4 out of 5 stars"; capture the single digit.
# expand=False returns a Series (not a one-column DataFrame); rows that do not match become NaN.
df['rating_numeric'] = df['Rating'].str.extract(r'Rated (\d) out of 5 stars', expand=False).astype(float)

# Drop rows with missing review text or rating
df_clean = df.dropna(subset=['Review Text', 'rating_numeric']).copy()
df_clean['rating_numeric'] = df_clean['rating_numeric'].astype(int)
print(f"Shape after dropping nulls: {df_clean.shape}")

# Rating Distribution
# hue mirrors x (legend suppressed) because seaborn deprecates `palette` without `hue`;
# this matches the other plotting cells in this notebook.
plt.figure(figsize=(12, 6))
sns.countplot(x='rating_numeric', data=df_clean, hue='rating_numeric',
              palette='coolwarm', legend=False)
plt.title('Rating Distribution')
plt.xlabel('Rating')
plt.show()

# Rating Proportions
rating_counts = df_clean['rating_numeric'].value_counts().sort_index()
sns.set_style('darkgrid')
plt.figure(figsize=(12, 6))
plt.pie(rating_counts, labels=rating_counts.index, autopct='%1.1f%%', colors=sns.color_palette('coolwarm', 5))
plt.title('Rating Proportions')
plt.show()

print("\nRating value counts:")
print(rating_counts)

## 5. Feature Engineering
Extract reviewer experience, combine text fields, and create sentiment labels.

In [None]:
# Reviewer experience: first run of digits in 'Review Count'; missing or unparsable values default to 1.
df_clean['reviewer_experience'] = df_clean['Review Count'].str.extract(r'(\d+)').astype(float).fillna(1)

# Merge title and body into one field so the title's wording is available to the models.
title_part = df_clean['Review Title'].fillna('')
body_part = df_clean['Review Text'].fillna('')
df_clean['full_text'] = title_part + ' ' + body_part
df_clean['full_text'] = df_clean['full_text'].str.strip()

# Human-readable label for each star rating.
sentiment_map = {1: 'Very Bad', 2: 'Bad', 3: 'Neutral', 4: 'Good', 5: 'Very Good'}
df_clean['sentiment_label'] = df_clean['rating_numeric'].map(sentiment_map)

# Reviewer experience distribution (x axis clipped to 0-100)
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df_clean['reviewer_experience'], bins=50, kde=True, color='teal', ax=ax)
ax.set_title('Reviewer Experience (# of Reviews)')
ax.set_xlabel('Number of Reviews')
ax.set_xlim(0, 100)
plt.show()

# Reviewer experience by rating (y axis clipped to 0-50)
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df_clean, x='rating_numeric', y='reviewer_experience',
            hue='rating_numeric', palette='coolwarm', legend=False, ax=ax)
ax.set_title('Reviewer Experience by Rating')
ax.set_ylim(0, 50)
plt.show()

# Sentiment label distribution, ordered from worst to best
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df_clean, x='sentiment_label', hue='sentiment_label',
              order=['Very Bad', 'Bad', 'Neutral', 'Good', 'Very Good'],
              palette='RdYlGn', legend=False, ax=ax)
ax.set_title('Sentiment Label Distribution')
plt.xticks(rotation=45)
plt.show()

# Summary of the columns added in this cell
experience = df_clean['reviewer_experience']
print("New Features Added:")
print(f"  - reviewer_experience: min={experience.min():.0f}, max={experience.max():.0f}, mean={experience.mean():.1f}")
print("  - full_text: Review Title + Review Text combined")
print(f"  - sentiment_label: {list(sentiment_map.values())}")

## 6. Text Length Analysis (Before Preprocessing)
Analyze the distribution of character and word counts in the raw text.

In [None]:
# Character and whitespace-token counts of the raw review body ('Review Text' only, title excluded).
df_clean['text_length'] = df_clean['Review Text'].str.len()
df_clean['word_count'] = df_clean['Review Text'].str.split().str.len()

# Review Length Distribution
plt.figure(figsize=(12, 6))
sns.histplot(df_clean['text_length'], bins=50, kde=True, color='steelblue')
plt.title('Review Length Distribution (Characters)')
plt.xlabel('Character Count')
plt.show()

# Word Count by Rating
# hue mirrors x (legend suppressed) because seaborn deprecates `palette` without `hue`;
# this matches the other box plots in this notebook.
plt.figure(figsize=(12, 6))
sns.boxplot(x='rating_numeric', y='word_count', data=df_clean, hue='rating_numeric',
            palette='coolwarm', legend=False)
plt.title('Word Count by Rating')
plt.xlabel('Rating')
plt.show()

print("Text Length Statistics:")
print(df_clean[['text_length', 'word_count']].describe().round(1))

## 7. Most Frequent Words (Before Preprocessing)
Identify the most frequently occurring words before any cleaning.

In [None]:
# Lower-cased whitespace tokens of the combined title + body text, before any cleaning.
all_words_before = ' '.join(df_clean['full_text'].astype(str)).lower().split()
word_freq_before = Counter(all_words_before).most_common(25)

words_df = pd.DataFrame(word_freq_before, columns=['word', 'count'])

# hue mirrors y (legend suppressed) because seaborn deprecates `palette` without `hue`;
# this matches the bar plots in the "after preprocessing" cell.
plt.figure(figsize=(12, 6))
sns.barplot(x='count', y='word', data=words_df, hue='word', palette='Blues_d', legend=False)
plt.title('Top 25 Words BEFORE Preprocessing')
plt.xlabel('Frequency')
plt.tight_layout()
plt.show()

## 8. Define Preprocessing Functions
Setup NLTK tools and define the text cleaning function (lemmatization, stopword removal).

In [None]:
# Dictionary used to filter tokens: the NLTK 'words' corpus plus every WordNet lemma name
# (lower-cased, underscores replaced by spaces).
english_words = set(words.words())
english_words.update(
    lemma.name().lower().replace('_', ' ')
    for synset in wordnet.all_synsets()
    for lemma in synset.lemmas()
)

# Shared by the preprocessing functions defined in this notebook.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize one review into a space-joined string of lemmatized English tokens.

    Steps: lower-case, strip URLs, strip HTML tags, drop every character that is
    not a-z or whitespace, remove stop words, lemmatize with the default POS,
    then keep only lemmas found in `english_words` that are longer than 2 characters.

    Returns "" when `text` is missing (None/NaN).
    """
    if pd.isna(text):
        return ""

    cleaned = text.lower()
    # Order matters: URLs and tags are removed before the letters-only filter strips their punctuation.
    for pattern in (r'http\S+|www\S+', r'<.*?>', r'[^a-z\s]'):
        cleaned = re.sub(pattern, '', cleaned)

    kept = []
    for raw_token in cleaned.split():
        if raw_token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(raw_token)
        # English words only, minimum length 3
        if lemma in english_words and len(lemma) > 2:
            kept.append(lemma)
    return ' '.join(kept)

# Sanity check: report how many entries the English-word filter set contains.
print("Preprocessing function ready. English vocabulary size:", len(english_words))

## 9. Apply Preprocessing
Run the preprocessing function on the full dataset.

In [None]:
from tqdm import tqdm
tqdm.pandas()

# progress_apply is the tqdm-enabled counterpart of Series.apply (registered by tqdm.pandas()).
df_clean['clean_text'] = df_clean['full_text'].progress_apply(preprocess_text)

# Show the first row before and after cleaning, truncated to 250 characters.
first_raw = df_clean['full_text'].iloc[0]
first_clean = df_clean['clean_text'].iloc[0]
print("Example transformation (Title + Review Text):")
print(f"BEFORE: {first_raw[:250]}...")
print(f"\nAFTER: {first_clean[:250]}...")

## 10. Most Frequent Words (After Preprocessing)
Visualize the top words after cleaning to verify noise removal.

In [None]:
# Tokens of the cleaned text; clean_text is already lower-cased and space-joined.
all_words_after = ' '.join(df_clean['clean_text'].astype(str)).split()
word_freq_after = Counter(all_words_after).most_common(25)

# Before
words_before_df = pd.DataFrame(word_freq_before, columns=['word', 'count'])
sns.set_style('darkgrid')
plt.figure(figsize=(12, 6))
sns.barplot(x='count', y='word', data=words_before_df, hue='word', palette='Reds_d', legend=False)
plt.title('Top 25 Words BEFORE Preprocessing')
plt.show()

# After
# words_after_df must be built here: the plot below reads it and no earlier cell defines it.
words_after_df = pd.DataFrame(word_freq_after, columns=['word', 'count'])
sns.set_style('darkgrid')
plt.figure(figsize=(12, 6))
sns.barplot(x='count', y='word', data=words_after_df, hue='word', palette='Greens_d', legend=False)
plt.title('Top 25 Words AFTER Preprocessing')
plt.show()

print(f"\nVocabulary size before: {len(set(all_words_before))}")
print(f"Vocabulary size after: {len(set(all_words_after))}")

## 11. Text Length Comparison
Compare text length statistics before and after cleaning.

In [None]:
# Character and token counts of the cleaned text.
df_clean['clean_text_length'] = df_clean['clean_text'].str.len()
df_clean['clean_word_count'] = df_clean['clean_text'].str.split().str.len()

# Long-format frame (Stage, Length) so both distributions share one histogram.
# Note: 'Before' is measured on 'Review Text' only, while clean_text is derived from title + body.
length_wide = pd.DataFrame({
    'Before': df_clean['text_length'],
    'After': df_clean['clean_text_length'],
})
length_data = pd.melt(length_wide, var_name='Stage', value_name='Length')

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=length_data, x='Length', hue='Stage', kde=True, alpha=0.5, ax=ax)
ax.set_title('Character Count: Before vs After')
ax.set_xlim(0, 2000)
plt.show()

# Word count by rating (after)
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df_clean, x='rating_numeric', y='clean_word_count',
            hue='rating_numeric', palette='coolwarm', legend=False, ax=ax)
ax.set_title('Word Count by Rating (After Cleaning)')
plt.show()

# Side-by-side describe() output for the four length columns.
stat_sources = {
    'Before (chars)': 'text_length',
    'After (chars)': 'clean_text_length',
    'Before (words)': 'word_count',
    'After (words)': 'clean_word_count',
}
print("Text Length Statistics Comparison:")
comparison = pd.DataFrame(
    {label: df_clean[column].describe() for label, column in stat_sources.items()}
).round(1)
print(comparison)

## 12. Top Words by Rating Class
Analyze which words are most distinctive for each star rating.

In [None]:
# One bar colour per star rating, red (1) through green (5).
colors = ['#d73027', '#fc8d59', '#fee08b', '#91cf60', '#1a9850']

for rating, bar_color in zip([1, 2, 3, 4, 5], colors):
    # Concatenate all cleaned reviews of this rating and count tokens.
    is_rating = df_clean['rating_numeric'] == rating
    rating_text = ' '.join(df_clean.loc[is_rating, 'clean_text'])
    top_words = Counter(rating_text.split()).most_common(10)
    words_df = pd.DataFrame(top_words, columns=['word', 'count'])

    sns.set_style('darkgrid')
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(data=words_df, x='count', y='word', color=bar_color, ax=ax)
    ax.set_title(f'Top 10 Words - Rating {rating}')
    ax.set_xlabel('Frequency')
    plt.show()

## 13. Advanced Feature Engineering
Create features for punctuation usage, capitalization, and word length.

In [None]:
# Punctuation counts are taken from the raw review body, since the cleaned text has no punctuation.
df_clean['exclamation_count'] = df_clean['Review Text'].str.count('!')
# str.count takes a regex; a raw string is required so that \? is passed through as an
# escaped literal question mark rather than an invalid Python escape sequence.
df_clean['question_count'] = df_clean['Review Text'].str.count(r'\?')
# Share of characters that are upper-case; max(..., 1) avoids division by zero on empty text.
df_clean['uppercase_ratio'] = df_clean['Review Text'].apply(
    lambda x: sum(1 for c in str(x) if c.isupper()) / max(len(str(x)), 1)
)
# Mean token length of the cleaned text; 0 when no tokens survived cleaning.
df_clean['avg_word_length'] = df_clean['clean_text'].apply(
    lambda x: np.mean([len(w) for w in x.split()]) if len(x.split()) > 0 else 0
)

# Feature correlation heatmap
feature_cols = ['clean_word_count', 'exclamation_count', 'question_count', 'uppercase_ratio', 'avg_word_length', 'rating_numeric']

sns.set_style('darkgrid')
plt.figure(figsize=(12, 6))
sns.heatmap(df_clean[feature_cols].corr(), annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## 14. Feature Distributions
Visualize how the newly engineered features vary across ratings.

In [None]:
# Engineered feature columns and the display title used for each plot.
features = ['exclamation_count', 'question_count', 'uppercase_ratio', 'avg_word_length']
titles = ['Exclamation Count', 'Question Count', 'Uppercase Ratio', 'Avg Word Length']

# One box plot per feature, grouped by star rating.
for feat, title in zip(features, titles):
    sns.set_style('darkgrid')
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(data=df_clean, x='rating_numeric', y=feat,
                hue='rating_numeric', palette='coolwarm', legend=False, ax=ax)
    ax.set_title(f'{title} by Rating')
    ax.set_xlabel('Rating')
    plt.show()

# Stats per rating
per_rating_means = df_clean.groupby('rating_numeric')[features].mean().round(3)
print("Feature Statistics by Rating:")
print(per_rating_means)

## 15. Final Data Cleaning
Remove empty texts and duplicates generated during preprocessing.

In [None]:
print(f"Shape before cleaning: {df_clean.shape}")

# Remove rows with empty clean_text (every token was filtered out by preprocessing)
df_clean = df_clean[df_clean['clean_text'].str.len() > 0].copy()
print(f"After removing empty texts: {df_clean.shape}")

# Remove duplicates (rows whose cleaned text is identical; the first occurrence is kept)
n_duplicates = df_clean.duplicated(subset=['clean_text']).sum()
df_clean = df_clean.drop_duplicates(subset=['clean_text'])
print(f"After removing {n_duplicates} duplicates: {df_clean.shape}")

# Final rating distribution
sns.set_style('darkgrid')
plt.figure(figsize=(12, 6))
ax = sns.countplot(x='rating_numeric', data=df_clean, hue='rating_numeric', palette='coolwarm', legend=False)
plt.title('Final Rating Distribution')

# Add explicit labels to bars.
# bar_label positions each count relative to its own bar, so placement does not depend on
# a fixed data-unit offset; fmt='%d' prints the count as an integer.
for container in ax.containers:
    ax.bar_label(container, fmt='%d', padding=3, fontsize=10)

plt.tight_layout()
plt.show()

## 16. Prepare Final Dataset
Select final columns and rename for consistency.

In [None]:
# Mapping from the working column names in df_clean to the names used in the final dataset.
column_renames = {
    'clean_text': 'text',
    'rating_numeric': 'rating',
    'sentiment_label': 'sentiment',
    'reviewer_experience': 'reviewer_experience',
    'clean_word_count': 'word_count',
    'exclamation_count': 'exclamation_count',
    'question_count': 'question_count',
    'uppercase_ratio': 'uppercase_ratio',
    'avg_word_length': 'avg_word_length',
}
final_cols = list(column_renames)

# Column selection yields a new frame, so df_clean itself is left untouched by the rename.
df_final = df_clean[final_cols].copy().rename(columns=column_renames)

sentiment_order = ['Very Bad', 'Bad', 'Neutral', 'Good', 'Very Good']
print("Final Dataset Summary:")
print(f"Shape: {df_final.shape}")
print("\nSentiment Class Distribution:")
print(df_final['sentiment'].value_counts().reindex(sentiment_order))
print("\nSample:")
df_final.head()

## 17. Save Processed Data
Save the clean dataset for future use.

In [None]:
# Write the processed dataset, creating the target directory when it does not exist yet.
output_path = '../data/processed/amazon_reviews_processed.csv'
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)
df_final.to_csv(output_path, index=False)
print(f"Saved processed data to: {output_path}")

# Final Summary Table
rating_freq = df_final['rating'].value_counts()
vocabulary = set(' '.join(df_final['text']).split())
summary_rows = [
    ('Total Samples', len(df_final)),
    ('Features', len(df_final.columns)),
    ('Vocabulary Size', len(vocabulary)),
    ('Avg Text Length', f"{df_final['word_count'].mean():.1f} words"),
    # Ratio of the most frequent rating's count to the least frequent rating's count.
    ('Class Imbalance Ratio', f"{rating_freq.max() / rating_freq.min():.1f}x"),
]
summary = pd.DataFrame({
    'Metric': [metric for metric, _ in summary_rows],
    'Value': [value for _, value in summary_rows],
})

separator = "=" * 50
print("\n" + separator)
print("PREPROCESSING COMPLETE")
print(separator)
print(summary.to_string(index=False))

## 18. Model Training Setup
Import sklearn libraries and prepare for modeling.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import joblib

print("Model training libraries loaded.")

## 19. Train-Test Split
Split data into training and testing sets.

In [None]:
# Model inputs: cleaned text, the numeric side features, and the 1-5 star target.
numeric_feature_cols = ['reviewer_experience', 'word_count', 'exclamation_count',
                        'question_count', 'uppercase_ratio', 'avg_word_length']
X_text = df_final['text']
X_features = df_final[numeric_feature_cols]
y = df_final['rating']

# A single call splits all three objects with the same row assignment;
# stratify=y keeps the star-rating proportions equal in both partitions.
(X_text_train, X_text_test,
 X_feat_train, X_feat_test,
 y_train, y_test) = train_test_split(
    X_text, X_features, y,
    test_size=0.2, random_state=42, stratify=y,
)

print(f"Training set: {len(X_text_train)} samples")
print(f"Test set: {len(X_text_test)} samples")
print("\nClass distribution in training set:")
print(y_train.value_counts().sort_index())

## 20. Hyperparameter Tuning Setup
Prepare data for GridSearchCV.

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from scipy.stats import uniform, randint

banner = "=" * 60
print(banner)
print("HYPERPARAMETER TUNING")
print(banner)

# Define helper function for 3-class mapping
def map_to_3_classes(rating):
    """Collapse a star rating into 'Negative' (<= 2), 'Neutral' (== 3) or 'Positive' (anything else)."""
    if rating == 3:
        return 'Neutral'
    return 'Negative' if rating <= 2 else 'Positive'

# Tuning targets the 3-class problem and uses the cleaned text only, which keeps the
# pipelines simple (vectorizer + classifier).

# Full cleaned corpus and its 3-class labels; no train/test split is made in this cell.
X_tune = df_clean['clean_text']
y_tune = df_clean['rating_numeric'].map(map_to_3_classes)

n_tune_samples = len(X_tune)
print(f"Data shape: {n_tune_samples} samples")
print(f"Class distribution:\n{y_tune.value_counts()}")

## 2. Models 1 & 2: Tuned Naive Bayes & SVM

We will now perform Hyperparameter Tuning on our two baseline models to maximize their performance.

In [None]:
from sklearn.model_selection import StratifiedKFold

# Create pipeline: TF-IDF + Naive Bayes
nb_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Define parameter grid (keys use the pipeline's <step>__<param> naming)
nb_param_grid = {
    'tfidf__max_features': [3000, 5000, 8000],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__min_df': [1, 2, 3],
    'tfidf__max_df': [0.9, 0.95, 1.0],
    'clf__alpha': [0.01, 0.1, 0.5, 1.0]
}

# Use stratified k-fold for imbalanced data
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Derive the number of candidates from the grid itself so the message cannot drift
# from the grid definition (the previous f-string printed "324 = 324").
n_nb_combinations = int(np.prod([len(values) for values in nb_param_grid.values()]))

print("Tuning Naive Bayes... (this may take a few minutes)")
print(f"Total combinations: {n_nb_combinations}")

nb_grid_search = GridSearchCV(
    nb_pipeline,
    nb_param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# NOTE(review): the search, and the refit best_estimator_, use all of X_tune. Any test split
# carved out of X_tune afterwards is therefore not unseen data for best_estimator_.
nb_grid_search.fit(X_tune, y_tune)

print("\n" + "=" * 60)
print("NAIVE BAYES - BEST PARAMETERS:")
print("=" * 60)
for param, value in nb_grid_search.best_params_.items():
    print(f"  {param}: {value}")
print(f"\nBest CV Accuracy: {nb_grid_search.best_score_:.4f} ({nb_grid_search.best_score_*100:.2f}%)")

### 20.1 Naive Bayes Hyperparameter Tuning
Use GridSearchCV to find optimal parameters for Naive Bayes.

In [None]:
# Create pipeline: TF-IDF + SVM
# random_state pins LinearSVC's internal data shuffling so repeated runs give the same model.
svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(max_iter=2000, random_state=42))
])

# Define parameter grid for SVM (keys use the pipeline's <step>__<param> naming)
svm_param_grid = {
    'tfidf__max_features': [3000, 5000, 8000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__min_df': [1, 2],
    'clf__C': [0.1, 0.5, 1.0, 2.0],
    'clf__class_weight': [None, 'balanced']
}

# Derive the number of candidates from the grid itself so the message cannot drift
# from the grid definition (the previous f-string printed "96 = 96").
n_svm_combinations = int(np.prod([len(values) for values in svm_param_grid.values()]))

print("Tuning SVM... (this may take several minutes)")
print(f"Total combinations: {n_svm_combinations}")

svm_grid_search = GridSearchCV(
    svm_pipeline,
    svm_param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# NOTE(review): the search, and the refit best_estimator_, use all of X_tune. Any test split
# carved out of X_tune afterwards is therefore not unseen data for best_estimator_.
svm_grid_search.fit(X_tune, y_tune)

print("\n" + "=" * 60)
print("SVM - BEST PARAMETERS:")
print("=" * 60)
for param, value in svm_grid_search.best_params_.items():
    print(f"  {param}: {value}")
print(f"\nBest CV Accuracy: {svm_grid_search.best_score_:.4f} ({svm_grid_search.best_score_*100:.2f}%)")

### 20.2 SVM Hyperparameter Tuning
Use GridSearchCV to find optimal parameters for LinearSVC.

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.base import clone

# Split data for final evaluation
X_train_tune, X_test_tune, y_train_tune, y_test_tune = train_test_split(
    X_tune, y_tune, test_size=0.2, random_state=42, stratify=y_tune
)

# Get best models.
# The grid searches were fit on all of X_tune, so their best_estimator_ objects have already
# been trained on the rows in X_test_tune. Clone the winning configurations and refit them on
# the training partition only, so the scores below are measured on rows the fitted models have
# not seen. (The hyperparameters themselves were still selected with cross-validation over the
# full data, so a small optimistic bias from model selection remains.)
best_nb = clone(nb_grid_search.best_estimator_).fit(X_train_tune, y_train_tune)
best_svm = clone(svm_grid_search.best_estimator_).fit(X_train_tune, y_train_tune)

# Evaluate on test set
y_pred_nb_tuned = best_nb.predict(X_test_tune)
y_pred_svm_tuned = best_svm.predict(X_test_tune)

nb_tuned_acc = accuracy_score(y_test_tune, y_pred_nb_tuned)
svm_tuned_acc = accuracy_score(y_test_tune, y_pred_svm_tuned)

print("=" * 60)
print("TUNED MODELS - TEST SET PERFORMANCE")
print("=" * 60)

print("\nNAIVE BAYES (Tuned):")
print(f"   Accuracy: {nb_tuned_acc:.4f} ({nb_tuned_acc*100:.2f}%)")
print(classification_report(y_test_tune, y_pred_nb_tuned))

print("\nSVM (Tuned):")
print(f"   Accuracy: {svm_tuned_acc:.4f} ({svm_tuned_acc*100:.2f}%)")
print(classification_report(y_test_tune, y_pred_svm_tuned))

### 20.3 Evaluate Tuned Models
Test the best performing models on the hold-out test set.

In [None]:
# Saving the tuned models is currently disabled. To enable it, uncomment:
# joblib.dump(best_nb, f'{models_dir}/naive_bayes_tuned.pkl')
# joblib.dump(best_svm, f'{models_dir}/svm_tuned.pkl')
# print("Tuned models saved:")
# print(f"   - {models_dir}/naive_bayes_tuned.pkl")
# print(f"   - {models_dir}/svm_tuned.pkl")

# Print final best parameters for reference
print("\n" + "=" * 60)
print("FINAL BEST HYPERPARAMETERS")
print("=" * 60)

for header, search in (("\nNaive Bayes:", nb_grid_search), ("\nSVM:", svm_grid_search)):
    print(header)
    for param, value in search.best_params_.items():
        print(f"  {param}: {value}")

### 20.4 Save Best Tuned Models
Serialize the best models to disk.

## 3. Model 3: Word2Vec (POS-Lemma Approach)

We will explore semantic embeddings using Word2Vec, with a focus on improving text preprocessing using Part-of-Speech (POS) Lemmatization.

In [None]:
import gensim
from gensim.models import Word2Vec
import numpy as np
import pandas as pd

# Prepare data for Word2Vec (needs list of list of tokens)
# We use the text data split created earlier
X_train_clean_text = X_text_train.astype(str)
X_test_clean_text = X_text_test.astype(str)

def tokenize_corpus(text_series):
    """Split each document on whitespace and return a list of token lists."""
    return [text.split() for text in text_series]

X_train_tokens = tokenize_corpus(X_train_clean_text)
X_test_tokens = tokenize_corpus(X_test_clean_text)

# Combined token lists, kept as a notebook-level name. They are NOT used to fit the
# embeddings: training Word2Vec on the test documents would let held-out text shape the
# features that the classifier is later evaluated on.
full_corpus_tokens = X_train_tokens + X_test_tokens

print(f"Training Word2Vec on {len(X_train_tokens)} sentences...")

# Train Word2Vec Model (training documents only)
# vector_size=100: Dimension of the dense vector
# window=5: Context window size
# min_count=2: Ignore rare words
# NOTE(review): gensim documents that a fully reproducible run also needs workers=1
# (and a fixed PYTHONHASHSEED); with workers=4 the seed alone does not guarantee identical vectors.
w2v_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=2, workers=4, seed=42)

print(f"Vocabulary size: {len(w2v_model.wv)}")

# Check similar words to 'good' and 'bad' to verify semantic learning
for probe_word in ('good', 'bad'):
    print(f"\nMost similar to '{probe_word}':")
    try:
        print(w2v_model.wv.most_similar(probe_word, topn=5))
    except KeyError:
        # most_similar raises KeyError when the word is not in the trained vocabulary
        print(f"'{probe_word}' not in vocab")

## 22. Train Word2Vec Model
Train custom embeddings on the review corpus.

In [None]:
def get_mean_vector(word_list, model):
    """Return the element-wise mean of the embeddings of the in-vocabulary words.

    Words missing from `model.wv` are ignored. When none of the words is known
    (or `word_list` is empty) a zero vector of length `model.vector_size` is returned.
    """
    keyed_vectors = model.wv
    known_vectors = [keyed_vectors[token] for token in word_list if token in keyed_vectors]

    if not known_vectors:
        return np.zeros(model.vector_size)

    return np.mean(known_vectors, axis=0)

# Turn every tokenized document into one fixed-length vector (mean of its word embeddings).
train_doc_vectors = [get_mean_vector(doc_tokens, w2v_model) for doc_tokens in X_train_tokens]
test_doc_vectors = [get_mean_vector(doc_tokens, w2v_model) for doc_tokens in X_test_tokens]

X_train_w2v = np.array(train_doc_vectors)
X_test_w2v = np.array(test_doc_vectors)

print(f"Train features shape: {X_train_w2v.shape}")
print(f"Test features shape: {X_test_w2v.shape}")

## 23. Create Document Vectors
Average word vectors to create a single vector representation for each document.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Determine target variable.
# The embeddings are classified on the same 3-class target as the tuned models. The mapping is
# derived directly from y_train / y_test rather than probing locals() for optional variables,
# so the result does not depend on leftover kernel state and the train and test targets
# always come from the same mapping.
unique_classes = np.unique(y_train)
print(f"Current target classes: {unique_classes}")

if len(unique_classes) > 3:
    # 5-star labels: collapse to Negative / Neutral / Positive
    target_train = y_train.apply(map_to_3_classes)
    target_test = y_test.apply(map_to_3_classes)
    print("Mapped to 3 classes successfully.")
else:
    target_train, target_test = y_train, y_test

# Targets must line up row-for-row with the document vectors.
assert len(target_train) == len(X_train_w2v), "train targets and features differ in length"
assert len(target_test) == len(X_test_w2v), "test targets and features differ in length"

# Logistic Regression is a strong baseline for dense embeddings
lr_w2v = LogisticRegression(max_iter=2000, random_state=42)
lr_w2v.fit(X_train_w2v, target_train)

# Evaluate
y_pred_w2v = lr_w2v.predict(X_test_w2v)
w2v_acc = accuracy_score(target_test, y_pred_w2v)

print(f"Word2Vec + Logistic Regression Accuracy: {w2v_acc:.4f}")
print("\nClassification Report:\n")
print(classification_report(target_test, y_pred_w2v))

## 24. Train Classifier on Embeddings
Train a Logistic Regression model using the document vectors.

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Visualization: t-SNE of Word Vectors
# We visualize the most frequent words (150 in the call below) to see semantic clusters

def plot_word_embeddings(model, topn=100):
    """Project the first `topn` vocabulary entries of `model.wv` to 2-D with t-SNE and plot them.

    Parameters
    ----------
    model : trained Word2Vec-style model exposing `wv.index_to_key` and `wv[word]`.
    topn : int, maximum number of words to plot (taken from the front of `index_to_key`).

    When the vocabulary has fewer than 3 words nothing is plotted and a message is printed.
    """
    # Get vocabulary and vectors
    vocab_keys = list(model.wv.index_to_key)[:topn]
    n_words = len(vocab_keys)
    if n_words < 3:
        print(f"Need at least 3 words for t-SNE; vocabulary has {n_words}.")
        return
    vectors = np.array([model.wv[word] for word in vocab_keys])

    # Reduce dimensions to 2D.
    # scikit-learn requires perplexity < number of samples, so cap it for small vocabularies.
    perplexity = min(30, n_words - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, init='pca', learning_rate='auto')
    vectors_2d = tsne.fit_transform(vectors)

    # Plot
    sns.set_style('darkgrid')
    plt.figure(figsize=(12, 6))
    sns.scatterplot(x=vectors_2d[:, 0], y=vectors_2d[:, 1], s=100, alpha=0.7)

    # Add labels
    for i, word in enumerate(vocab_keys):
        plt.annotate(word, xy=(vectors_2d[i, 0], vectors_2d[i, 1]),
                     xytext=(5, 2), textcoords='offset points',
                     ha='right', va='bottom', fontsize=9)

    # Report the number of words actually plotted (equals topn unless the vocabulary is smaller).
    plt.title(f't-SNE Visualization of Top {n_words} Word Embeddings', fontsize=16)
    plt.xlabel('t-SNE dimension 1')
    plt.ylabel('t-SNE dimension 2')
    plt.grid(True, alpha=0.3)
    plt.show()

plot_word_embeddings(w2v_model, topn=150)

## 25. Visualize Word Embeddings
Use t-SNE to visualize semantic clusters of words.

In [None]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Ensure we have the tagger.
# The resource name depends on the NLTK release: newer releases (the ones that also ship
# 'punkt_tab', which this notebook downloads) load 'averaged_perceptron_tagger_eng', while
# older releases load 'averaged_perceptron_tagger'. Check for both and fetch what is missing.
for tagger_resource in ('averaged_perceptron_tagger_eng', 'averaged_perceptron_tagger'):
    try:
        nltk.data.find(f'taggers/{tagger_resource}')
    except LookupError:
        nltk.download(tagger_resource, quiet=True)

def get_wordnet_pos(treebank_tag):
    """Translate a tag from `pos_tag` into the WordNet POS constant expected by the lemmatizer.

    Tags starting with J, V, N, R map to ADJ, VERB, NOUN, ADV respectively;
    every other tag falls back to NOUN.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # 'N...' tags and anything unrecognized are treated as nouns.
    return wordnet.NOUN

def improved_preprocess(text):
    """Clean one document and lemmatize each kept token using its part-of-speech tag.

    Steps: lower-case, strip URLs, strip HTML tags, drop non-letter characters,
    POS-tag the full token sequence, then keep tokens that are not stop words, are longer
    than 2 characters and appear in `english_words`, lemmatizing each with its mapped POS.

    Returns "" for missing input (None/NaN) or when no tokens remain after stripping.
    """
    if pd.isna(text):
        return ""

    stripped = text.lower()
    for pattern in (r'http\S+|www\S+', r'<.*?>', r'[^a-z\s]'):
        stripped = re.sub(pattern, '', stripped)

    tokens = stripped.split()
    if len(tokens) == 0:
        return ""

    # Tag before filtering so the tagger sees every token of the stripped text.
    tagged_tokens = pos_tag(tokens)

    return ' '.join(
        lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
        for word, tag in tagged_tokens
        if word not in stop_words and len(word) > 2 and word in english_words
    )

# Demonstration of the fix
test_samples = [
    "I received the package yesterday and I am receiving another one today.",
    "This is the best product I have ever got.",
    "She was saying that it is bad and worse."
]

print("--- Discrepancy Check ---")
for sample in test_samples:
    print(f"\nOriginal: {sample}")
    # Old approach: run the actual preprocess_text pipeline (default-POS lemmatization).
    # Both columns then apply the same URL/HTML/punctuation stripping, stop-word removal
    # and dictionary filter, so the only difference shown is the lemmatization strategy.
    old_text = preprocess_text(sample)
    print(f"Old (Naive): {old_text}")

    # New approach
    new_text = improved_preprocess(sample)
    print(f"New (POS):   {new_text}")

## 26. Improve Lemmatization (POS-Aware)
Implement smarter preprocessing that respects Part-of-Speech tags.

In [None]:
# Apply directly to the split data.
# pd.Series(...) passes an existing Series through and wraps a NumPy array, so one code
# path covers both cases.
# NOTE(review): X_text_train holds text already produced by preprocess_text (stop words removed,
# default-POS lemmatized), so the POS tagger sees reduced context here; consider feeding it
# the raw text instead.
X_train_clean_v2 = pd.Series(X_text_train).progress_apply(improved_preprocess)
X_test_clean_v2 = pd.Series(X_text_test).progress_apply(improved_preprocess)

X_train_tokens_v2 = [text.split() for text in X_train_clean_v2]
X_test_tokens_v2 = [text.split() for text in X_test_clean_v2]
# Combined token lists, kept as a notebook-level name. They are NOT used to fit the
# embeddings: training on the test documents would let held-out text shape the features
# the classifier is evaluated on.
full_corpus_tokens_v2 = X_train_tokens_v2 + X_test_tokens_v2

# Re-train Word2Vec (training documents only)
print("Training Word2Vec V2...")
w2v_model_v2 = Word2Vec(sentences=X_train_tokens_v2, vector_size=100, window=5, min_count=2, workers=4, seed=42)

print("New Vocabulary size:", len(w2v_model_v2.wv))

# Re-check relationships
print("\nMost similar to 'good' (V2):")
try:
    print(w2v_model_v2.wv.most_similar('good', topn=5))
except KeyError:
    # most_similar raises KeyError when the word is not in the trained vocabulary
    print("'good' not in vocab")

# Generate vectors for Classifier using V2
X_train_w2v_v2 = np.array([get_mean_vector(tokens, w2v_model_v2) for tokens in X_train_tokens_v2])
X_test_w2v_v2 = np.array([get_mean_vector(tokens, w2v_model_v2) for tokens in X_test_tokens_v2])

# Train LR V2
lr_w2v_v2 = LogisticRegression(max_iter=2000, random_state=42)
lr_w2v_v2.fit(X_train_w2v_v2, target_train) # target_train defined in previous steps

y_pred_w2v_v2 = lr_w2v_v2.predict(X_test_w2v_v2)
w2v_acc_v2 = accuracy_score(target_test, y_pred_w2v_v2)

print(f"\nWord2Vec (POS Lemmatized) Accuracy: {w2v_acc_v2:.4f}")

# Save Word2Vec Logistic Regression Model (currently disabled)
# joblib.dump(lr_w2v_v2, f'{models_dir}/word2vec_lr.pkl')
# print(f"Word2Vec Logistic Regression model saved to: {models_dir}/word2vec_lr.pkl")

## 27. Re-train Word2Vec with Improved Processing
Apply the new preprocessing and re-train the model.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, balanced_accuracy_score, roc_curve, auc
from sklearn.preprocessing import label_binarize

# 1. Detailed Metric Calculation
def get_metrics(name, y_true, y_pred):
    """Build one row of the comparison table: the model name followed by six classification scores."""
    row = {'Model': name}
    row['Accuracy'] = accuracy_score(y_true, y_pred)
    row['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    # Weighted averages weight each class by its support; macro treats all classes equally.
    row['F1 (Weighted)'] = f1_score(y_true, y_pred, average='weighted')
    row['F1 (Macro)'] = f1_score(y_true, y_pred, average='macro')
    row['Precision (Weighted)'] = precision_score(y_true, y_pred, average='weighted')
    row['Recall (Weighted)'] = recall_score(y_true, y_pred, average='weighted')
    return row

# Gather metrics for all models: (display name, true labels, predicted labels).
# NOTE(review): the NB/SVM predictions are scored on the X_tune split (stratified on the
# 3-class target) while Word2Vec is scored on the X_text split (stratified on the 5-star
# target), so the test rows are not guaranteed to be the same for all three models.
model_predictions = [
    ('Naive Bayes (Tuned)', y_test_tune, y_pred_nb_tuned),
    ('SVM (Tuned)', y_test_tune, y_pred_svm_tuned),
    ('Word2Vec (POS-Lemma)', target_test, y_pred_w2v_v2),
]
metrics_data = [get_metrics(label, truth, preds) for label, truth, preds in model_predictions]

final_results = pd.DataFrame(metrics_data).sort_values('Accuracy', ascending=False)

separator = "=" * 60
print(separator)
print("FINAL MODEL PERFORMANCE REPORT")
print(separator)
print(final_results.round(4).to_string(index=False))

# ------------------------------------------------------------------
# 2. Visualizations
# ------------------------------------------------------------------

# A. Accuracy & Metrics Comparison (grouped bars: one group per model, one bar per metric)
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(12, 6))
long_results = pd.melt(final_results, id_vars='Model', value_vars=['Accuracy', 'F1 (Weighted)'],
                       var_name='Metric', value_name='Score')
sns.barplot(data=long_results, x='Model', y='Score', hue='Metric', palette='viridis', ax=ax)
ax.set_title('Model Comparison: Key Metrics', fontsize=16)
ax.set_ylim(0.5, 1.0)
ax.legend(bbox_to_anchor=(1.01, 1), loc='upper left')

# Add value labels to bars
for bar_group in ax.containers:
    ax.bar_label(bar_group, fmt='%.2f', padding=3)

fig.tight_layout()
plt.show()

# B. Confusion Matrices
def plot_cm(y_true, y_pred, title):
    """Plot an annotated confusion-matrix heatmap for one model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        title: Figure title.
    """
    # Use the union of true and predicted classes and hand it to confusion_matrix
    # explicitly, so the matrix rows/columns and the tick labels always line up —
    # even when the model predicts a class that is absent from y_true.
    labels = sorted(set(y_true) | set(y_pred))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    sns.set_style('darkgrid')
    plt.figure(figsize=(12, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=labels, yticklabels=labels)
    plt.title(title, fontsize=14)
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

# One heatmap per model, each prediction vector paired with its own ground truth
for cm_truth, cm_preds, cm_title in (
    (y_test_tune, y_pred_nb_tuned, 'Naive Bayes (Tuned) - Confusion Matrix'),
    (y_test_tune, y_pred_svm_tuned, 'SVM (Tuned) - Confusion Matrix'),
    (target_test, y_pred_w2v_v2, 'Word2Vec (POS-Lemma) - Confusion Matrix'),
):
    plot_cm(cm_truth, cm_preds, cm_title)

# ------------------------------------------------------------------
# 3. Feature Importance (SVM)
# ------------------------------------------------------------------
section_rule = "="*60
print("\n" + section_rule)
print("FEATURE IMPORTANCE (Predicted by SVM)")
print(section_rule)

# Pull the vocabulary and the fitted linear classifier out of the pipeline
svm_clf = best_svm.named_steps['clf']
feature_names = best_svm.named_steps['tfidf'].get_feature_names_out()
svm_coefs = svm_clf.coef_
classes = svm_clf.classes_

for i in range(len(classes)):
    class_label = classes[i]
    class_coefs = svm_coefs[i]
    # argsort is ascending, so the last ten positions hold the largest coefficients
    top10_indices = np.argsort(class_coefs)[-10:]
    top10_words = feature_names[top10_indices]
    top10_weights = class_coefs[top10_indices]

    sns.set_style('darkgrid')
    coef_fig, coef_ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=top10_weights, y=top10_words, hue=top10_words,
                legend=False, palette='magma', ax=coef_ax)
    coef_ax.set_title(f'Top 10 Indicative Words for Class: {class_label} (SVM)', fontsize=14)
    coef_ax.set_xlabel('Coefficient Magnitude')
    plt.show()

# ------------------------------------------------------------------
# 4. ROC / AUC Curves (Micro-Average)
# ------------------------------------------------------------------
print("\n" + "="*60)
print("ROC Curves Comparison")
print("="*60)

# Binarize labels (one indicator column per class).
# Naive Bayes and SVM are both scored on X_test_tune, so they share y_test_tune.
# NOTE(review): this assumes best_nb orders its probability columns like `classes`
# (taken from the SVM step) — confirm both were fitted on the same label set.
y_test_bin = label_binarize(y_test_tune, classes=classes)
# Word2Vec is scored on X_test_w2v_v2; the metrics table and confusion matrix pair its
# predictions with target_test, so binarize that ground truth against the classifier's
# own class order instead of silently reusing the TF-IDF split's labels.
y_test_bin_w2v = label_binarize(target_test, classes=lr_w2v_v2.classes_)

# Get Scores
y_score_nb = best_nb.predict_proba(X_test_tune)
y_score_svm = best_svm.decision_function(X_test_tune) # SVM LinearSVC uses decision_function
y_score_w2v = lr_w2v_v2.predict_proba(X_test_w2v_v2) 

def get_micro_curve(y_true, y_score):
    """Compute a micro-averaged ROC curve and its AUC.

    Both inputs are flattened so that every (sample, class) cell is treated as one
    binary decision.

    Args:
        y_true: Binarized ground truth, one indicator column per class.
        y_score: Per-class scores with the same shape and column order as ``y_true``.

    Returns:
        tuple: ``(fpr, tpr, roc_auc)``.

    Raises:
        ValueError: If ``y_true`` and ``y_score`` do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)
    # Flattening mismatched shapes would pair labels with the wrong scores (or fail
    # with an opaque length error), so fail early with an explicit message.
    if y_true.shape != y_score.shape:
        raise ValueError(
            f"y_true shape {y_true.shape} does not match y_score shape {y_score.shape}"
        )
    fpr, tpr, _ = roc_curve(y_true.ravel(), y_score.ravel())
    roc_auc = auc(fpr, tpr)
    return fpr, tpr, roc_auc

fpr_nb, tpr_nb, auc_nb = get_micro_curve(y_test_bin, y_score_nb)
fpr_svm, tpr_svm, auc_svm = get_micro_curve(y_test_bin, y_score_svm)
fpr_w2v, tpr_w2v, auc_w2v = get_micro_curve(y_test_bin_w2v, y_score_w2v)
sns.set_style('darkgrid')
plt.figure(figsize=(12, 6))
plt.plot(fpr_nb, tpr_nb, label=f'Naive Bayes (Micro AUC = {auc_nb:.3f})', linewidth=2)
plt.plot(fpr_svm, tpr_svm, label=f'SVM (Micro AUC = {auc_svm:.3f})', linewidth=2)
plt.plot(fpr_w2v, tpr_w2v, label=f'Word2Vec (Micro AUC = {auc_w2v:.3f})', linewidth=2, linestyle='--')
# Diagonal reference line (TPR == FPR)
plt.plot([0, 1], [0, 1], 'k--', alpha=0.5)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Micro-Average ROC Curve Comparison')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)
plt.show()

# ------------------------------------------------------------------
# 5. Misclassification Analysis (Error Analysis)
# ------------------------------------------------------------------
print("\n" + "="*60)
print("MISCLASSIFICATION ANALYSIS")
print("="*60)

# Analyze the best model (final_results is sorted by accuracy, best first)
best_model = final_results.iloc[0]
best_model_name = best_model['Model']
print(f"Top Model Identified: {best_model_name}")

# Pick the predictions, input texts and ground truth that belong to the winning model
if 'SVM' in best_model_name:
    y_pred, X_src, y_true = y_pred_svm_tuned, X_test_tune, y_test_tune
elif 'Naive' in best_model_name:
    y_pred, X_src, y_true = y_pred_nb_tuned, X_test_tune, y_test_tune
else:
    y_pred, X_src, y_true = y_pred_w2v_v2, X_test_clean_v2, target_test

# Find errors.
# Compare as plain arrays so the check is strictly positional and never depends on
# pandas index alignment between predictions and labels.
y_pred_arr = np.asarray(y_pred)
y_true_arr = np.asarray(y_true)
error_indices = np.where(y_pred_arr != y_true_arr)[0]
print(f"Total Misclassified Samples: {len(error_indices)} out of {len(y_true)}")

if len(error_indices) > 0:
    # Sample 5 random errors with a dedicated, seeded generator: this draws the same
    # rows as seeding the global NumPy RNG with 99, without reseeding global state.
    error_rng = np.random.RandomState(99)
    sample_errors = error_rng.choice(error_indices, min(5, len(error_indices)), replace=False)
    
    error_records = []
    for i in sample_errors:
        # Access via integer position (iloc) when X_src is a pandas object
        text_val = X_src.iloc[i] if hasattr(X_src, 'iloc') else X_src[i]
        
        error_records.append({
            'Review Text (Processed)': text_val, 
            'True Label': y_true_arr[i], 
            'Predicted': y_pred_arr[i]
        })
    
    error_df = pd.DataFrame(error_records)
    print("\nSample Errors:")
    # Widen text columns only for this printout; the previous setting is restored
    # afterwards, even if printing raises.
    with pd.option_context('display.max_colwidth', 120):
        print(error_df)

print(f"\nCONCLUSION: The best performing model is {best_model['Model']} with {best_model['Accuracy']:.2%} accuracy.")





In [None]:
import joblib
import os

# Output folder for the serialized artifacts (created if it does not exist yet)
models_dir = '../saved_models'
os.makedirs(models_dir, exist_ok=True)

save_rule = "=" * 60
print(save_rule)
print("SAVING FINAL MODELS")
print(save_rule)

# Resolve each artifact path once so the write and its log line cannot drift apart
nb_path = f'{models_dir}/naive_bayes_final.pkl'
svm_path = f'{models_dir}/svm_final.pkl'
w2v_clf_path = f'{models_dir}/word2vec_lr_final.pkl'
w2v_emb_path = f'{models_dir}/word2vec.model'

# 1. Naive Bayes
joblib.dump(best_nb, nb_path)
print(f"Naive Bayes saved to: {nb_path}")

# 2. SVM
joblib.dump(best_svm, svm_path)
print(f"SVM saved to: {svm_path}")

# 3. Word2Vec Classifier + Embedding Model
#    The classifier goes through joblib; the embedding model uses its own .save()
joblib.dump(lr_w2v_v2, w2v_clf_path)
w2v_model_v2.save(w2v_emb_path)
print(f"Word2Vec Classifier saved to: {w2v_clf_path}")
print(f"Word2Vec Embeddings saved to: {w2v_emb_path}")

print("\nAll models have been serialized and are ready for deployment.")