# MovieMind - Model Training & Evaluation

This notebook trains and evaluates ML models for:
1. Sentiment Classification (positive/neutral/negative)
2. Score Prediction (0-10 scale)
3. Clustering Analysis
4. Statistical Analysis

In [None]:
# Imports
import sys
# Add the parent directory to the import path so the `src.*` packages below
# resolve. NOTE(review): this is relative to the kernel's working directory —
# presumably the notebook lives one level below the repository root; confirm.
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from scipy import stats
import warnings
# Suppresses every warning category for the rest of the session. This keeps the
# output tidy but also hides numerical and statistical warnings raised by the
# libraries used further down.
warnings.filterwarnings('ignore')

# Project-local building blocks used throughout the notebook.
from src.utils.db_manager import DatabaseManager
from src.preprocessing.text_processor import TextProcessor
from src.models.sentiment_classifier import SentimentClassifier
from src.models.score_predictor import ScorePredictor
from src.models.clustering import MovieClusterer

# NOTE(review): the 'seaborn-v0_8-*' style names require a reasonably recent
# Matplotlib release — confirm the pinned version provides this style.
plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline

## 1. Load Data

In [None]:
# Load reviews with movie metadata
#
# Each row is one review joined with the metadata of the movie it belongs to.
# The ORDER BY makes the LIMIT deterministic: without it the database is free
# to return any 5000 matching rows, so two runs of this notebook could train on
# different data even though the splits below use a fixed random_state.
with DatabaseManager() as db:
    query = """
    SELECT
        r.review_id,
        r.content,
        r.rating,
        r.text_length,
        r.word_count,
        m.title,
        m.genres,
        m.vote_average,
        m.runtime
    FROM reviews r
    JOIN movies m ON r.movie_id = m.movie_id
    WHERE r.content IS NOT NULL
    ORDER BY r.review_id
    LIMIT 5000
    """

    reviews = db.execute_query(query)

df = pd.DataFrame(reviews)

# Fail here with a clear message instead of letting a later cell crash on
# `.iloc[0]` or on an empty train/test split.
if df.empty:
    raise ValueError("The review query returned no rows; check the database contents and connection.")

print(f"Loaded {len(df)} reviews")
df.head()

## 2. Text Preprocessing

In [None]:
# One shared text processor is used to clean every review body.
processor = TextProcessor()


def _clean_or_blank(raw_text):
    """Return the cleaned review text, or an empty string for a missing value."""
    if pd.notna(raw_text):
        return processor.clean_text(raw_text)
    return ""


# Store the cleaned text next to the original so both stay inspectable.
print("Preprocessing text...")
df['cleaned_content'] = df['content'].apply(_clean_or_blank)

# Show the first review before and after cleaning (first 500 characters each).
for banner, column in (
    ("\n=== Original Review ===", 'content'),
    ("\n=== Cleaned Review ===", 'cleaned_content'),
):
    print(banner)
    print(df[column].iloc[0][:500])

In [None]:
# Target variable: the review's own rating where it exists, otherwise the
# movie's vote_average as a stand-in.
# NOTE(review): the fallback assigns a movie-level aggregate to an individual
# review — confirm this is the intended labelling.
fallback_score = df['vote_average']
df['score'] = df['rating'].fillna(fallback_score)

# Keep only rows that ended up with a score; copy so later column assignments
# do not write back into `df`.
has_score = df['score'].notna()
df_labeled = df.loc[has_score].copy()

print(f"Reviews with scores: {len(df_labeled)}")
print("\nScore distribution:")
print(df_labeled['score'].describe())

## 3. Sentiment Classification

In [None]:
# The classifier object also owns the score -> sentiment bucketing rule, so it
# is created first and reused for training in the cells below.
classifier = SentimentClassifier(model_type='logistic', max_features=5000)

# Derive a sentiment label per review from its numeric score using the two
# thresholds passed here.
sentiment_labels = classifier.prepare_sentiment_labels(
    df_labeled['score'],
    threshold_pos=7.0,
    threshold_neg=5.0,
)
df_labeled['sentiment'] = sentiment_labels

# Class balance, first as a table ...
sentiment_counts = df_labeled['sentiment'].value_counts()
print("Sentiment distribution:")
print(sentiment_counts)

# ... then as a bar chart.
fig, ax = plt.subplots(figsize=(10, 5))
sentiment_counts.plot(kind='bar', ax=ax, rot=0)
ax.set_title('Sentiment Distribution')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Hold out 20% of the labeled reviews for testing. Stratifying on the sentiment
# label keeps the class proportions the same in both parts, and the fixed seed
# makes the partition repeatable.
sentiment_target = df_labeled['sentiment']
split_parts = train_test_split(
    df_labeled['cleaned_content'],
    sentiment_target,
    test_size=0.2,
    random_state=42,
    stratify=sentiment_target,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Train size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

In [None]:
# Fit the sentiment classifier on the training texts; a further 20% of the
# training data is passed as the validation split.
print("Training sentiment classifier...")
train_metrics = classifier.train(X_train, y_train, validation_split=0.2)

# Echo whatever metrics the training call reported, one per line.
print("\nTraining metrics:")
for metric_name in train_metrics:
    metric_value = train_metrics[metric_name]
    print(f"  {metric_name}: {metric_value}")

In [None]:
# Evaluate on test set
# Scores the trained classifier on the held-out split. The returned mapping is
# read by the next cell for its 'confusion_matrix' entry.
# NOTE: the name `test_metrics` is rebound to the regression results in the
# score-prediction section, so these classification metrics are only available
# under this name until then.
test_metrics = classifier.evaluate(X_test, y_test)

In [None]:
# Plot confusion matrix
from sklearn.metrics import ConfusionMatrixDisplay

# ConfusionMatrixDisplay.plot() opens a figure of its own unless it is handed
# an Axes. Creating the sized figure first and passing its Axes in avoids a
# stray blank figure and makes the requested figsize actually apply.
fig, ax = plt.subplots(figsize=(8, 6))
cm_display = ConfusionMatrixDisplay(
    confusion_matrix=test_metrics['confusion_matrix'],
    # Tick labels come from the fitted estimator's class order.
    # NOTE(review): assumes evaluate() builds the matrix in that same order.
    display_labels=classifier.model.classes_
)
cm_display.plot(cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix - Sentiment Classification')
fig.tight_layout()
plt.show()

In [None]:
# Ask the classifier for its 15 strongest features per class; only the first
# 10 of each are printed.
importance = classifier.get_feature_importance(top_n=15)

print("\n=== Top Features Per Sentiment ===")
for class_name in importance:
    print(f"\n{class_name.upper()}:")
    ranked_terms = importance[class_name]
    for term, weight in ranked_terms[:10]:
        print(f"  {term:20s}: {weight:7.4f}")

## 4. Score Prediction (Regression)

In [None]:
# Numeric side-features that accompany the text; missing values become 0.
meta_features = ['text_length', 'word_count']
df_labeled[meta_features] = df_labeled[meta_features].fillna(0)

# Split text, metadata and target in a single call so the three stay
# row-aligned. Unlike the classification split, this one is not stratified.
(
    X_text_train, X_text_test,
    X_meta_train, X_meta_test,
    y_score_train, y_score_test,
) = train_test_split(
    df_labeled['cleaned_content'],
    df_labeled[meta_features],
    df_labeled['score'],
    test_size=0.2,
    random_state=42,
)

print(f"Train size: {len(X_text_train)}")
print(f"Test size: {len(X_text_test)}")

In [None]:
# Ridge-based score regressor over the review text plus the metadata columns.
predictor = ScorePredictor(model_type='ridge', max_features=3000)

print("Training score predictor...")
train_metrics = predictor.train(
    X_text_train, y_score_train, X_meta_train, validation_split=0.2
)

# Echo whatever metrics the training call reported, one per line.
print("\nTraining metrics:")
for metric_name in train_metrics:
    metric_value = train_metrics[metric_name]
    print(f"  {metric_name}: {metric_value}")

In [None]:
# Evaluate
# Scores the trained predictor on the held-out text, metadata and scores. The
# plotting cell that follows reads the 'predictions' and 'residuals' entries
# of the returned mapping.
# NOTE: this rebinds `test_metrics`, which until now held the sentiment
# classifier's test results. Anything that needs the classification metrics
# after this point has to call classifier.evaluate again.
test_metrics = predictor.evaluate(X_text_test, y_score_test, X_meta_test)

In [None]:
# Two diagnostic panels for the regression: residuals against predictions
# (left) and predictions against the true scores (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
ax_resid, ax_fit = axes

predicted_scores = test_metrics['predictions']

# Left panel: residuals should scatter evenly around the dashed zero line.
ax_resid.scatter(predicted_scores, test_metrics['residuals'], alpha=0.5)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set(xlabel='Predicted Score', ylabel='Residuals', title='Residual Plot')
ax_resid.grid(True, alpha=0.3)

# Right panel: points on the dashed diagonal are perfect predictions.
ax_fit.scatter(y_score_test, predicted_scores, alpha=0.5)
ax_fit.plot([0, 10], [0, 10], 'r--', label='Perfect prediction')
ax_fit.set(xlabel='Actual Score', ylabel='Predicted Score', title='Predicted vs Actual')
ax_fit.legend()
ax_fit.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Ask the regressor for its 20 strongest features; the first 15 are printed.
importance_reg = predictor.get_feature_importance(top_n=20)

print("\n=== Top Features for Score Prediction ===")
shown_features = importance_reg[:15]
for term, weight in shown_features:
    print(f"  {term:20s}: {weight:7.4f}")

## 5. Clustering Analysis

In [None]:
# Work on a private copy so the clustering steps leave df_labeled untouched.
df_cluster = df_labeled.copy()

# Initial clusterer; it is used here to build the feature matrix and in the
# next cell for the elbow analysis.
clusterer = MovieClusterer(n_clusters=5, random_state=42)

# Feature matrix built from the cleaned review text plus three numeric columns.
# Note that 'score' itself is one of the clustering inputs.
cluster_numeric_columns = ['text_length', 'word_count', 'score']
features = clusterer.prepare_features(
    df_cluster,
    text_column='cleaned_content',
    numeric_columns=cluster_numeric_columns,
)

print(f"Feature matrix shape: {features.shape}")

In [None]:
# Elbow analysis
# Runs the clusterer's elbow analysis over the prepared feature matrix with
# max_k=10, shows the returned results, then plots them.
# NOTE(review): presumably this fits one clustering per candidate k and can be
# slow on the full feature matrix — confirm in MovieClusterer.
print("Performing elbow analysis...")
elbow_results = clusterer.elbow_analysis(features, max_k=10)

print("\nElbow Analysis Results:")
# `display` is the rich-display helper that IPython provides in notebook kernels.
display(elbow_results)

# Plot
clusterer.plot_elbow(elbow_results)

In [None]:
# Final clustering run. Set optimal_k to the value suggested by the elbow plot.
optimal_k = 5  # Adjust this based on elbow plot

# A fresh clusterer is created for the chosen k and fitted on the feature
# matrix that the previous clusterer instance prepared.
# NOTE(review): if MovieClusterer keeps fitted preprocessing state from
# prepare_features, this new instance will not have it — confirm that the
# later visualisation and summary calls do not rely on it.
clusterer = MovieClusterer(n_clusters=optimal_k, random_state=42)

clustering_result = clusterer.fit_predict(features)
labels, metrics = clustering_result

# Quality metrics reported by the fit.
print("\nClustering Metrics:")
silhouette = metrics['silhouette_score']
print(f"  Silhouette Score: {silhouette:.4f}")
davies_bouldin = metrics['davies_bouldin_score']
print(f"  Davies-Bouldin Score: {davies_bouldin:.4f}")
inertia = metrics['inertia']
print(f"  Inertia: {inertia:.2f}")

In [None]:
# Visualize clusters
# Hands the feature matrix and the cluster assignment of each row to the
# clusterer's plotting helper.
# NOTE(review): the method name suggests a 2-D projection of the features; the
# reduction technique is not visible from this notebook — confirm in
# MovieClusterer.
clusterer.visualize_clusters_2d(features, labels)

In [None]:
# Number of reviews assigned to each cluster, ordered by cluster id.
cluster_counts = pd.Series(labels).value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 5))
cluster_counts.plot(kind='bar', ax=ax, rot=0)
ax.set_title('Reviews per Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Count')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Per-cluster summary of the numeric columns, as computed by the clusterer.
summary_columns = ['text_length', 'word_count', 'score']
summary = clusterer.get_cluster_summary(
    df_cluster, labels, numeric_columns=summary_columns
)

print("\n=== Cluster Summary ===")
display(summary)

## 6. Statistical Tests

In [None]:
# ANOVA: Score differences across clusters
#
# One sample of scores is built per cluster id that actually occurs in
# `labels`, instead of assuming every id in range(optimal_k) is populated. An
# empty group makes f_oneway return NaN, and because warnings are silenced at
# the top of the notebook that NaN would be reported as "not significant".
#
# NOTE(review): 'score' is one of the numeric columns the clusters were built
# from, so a significant difference in score between clusters is expected by
# construction — interpret this test accordingly.
label_array = np.asarray(labels)

cluster_groups = []
for cluster_id in np.unique(label_array):
    # Boolean mask is positional; .loc avoids chained indexing.
    cluster_scores = df_cluster.loc[label_array == cluster_id, 'score'].dropna()
    if len(cluster_scores) > 0:
        cluster_groups.append(cluster_scores)

if len(cluster_groups) < 2:
    raise ValueError("ANOVA needs at least two non-empty clusters with scores.")

f_stat, p_value = stats.f_oneway(*cluster_groups)

print("\n=== ANOVA: Score across Clusters ===")
print(f"F-statistic: {f_stat:.4f}")
print(f"p-value: {p_value:.4f}")
if np.isnan(p_value):
    # Degenerate input (for example constant scores) — do not report a verdict.
    print("Significant: undetermined (test statistic is NaN)")
else:
    print(f"Significant: {'Yes (p < 0.05)' if p_value < 0.05 else 'No (p >= 0.05)'}")

In [None]:
# Chi-squared: Sentiment vs Cluster
#
# Cross-tabulates the cluster assignment of each review against its sentiment
# label and tests the two for independence.
contingency = pd.crosstab(labels, df_cluster['sentiment'])

chi2, p_value, dof, expected = stats.chi2_contingency(contingency)

print("\n=== Chi-squared: Sentiment vs Cluster ===")
# The superscript two was mis-encoded in the original output string.
print(f"Chi² statistic: {chi2:.4f}")
print(f"p-value: {p_value:.4f}")
print(f"Degrees of freedom: {dof}")
print(f"Significant: {'Yes (p < 0.05)' if p_value < 0.05 else 'No (p >= 0.05)'}")

# The chi-squared approximation is usually considered reliable only when the
# expected frequency of every cell is at least 5; report when it is not.
low_expected_cells = int((expected < 5).sum())
if low_expected_cells:
    print(
        f"Warning: {low_expected_cells} of {expected.size} cells have an expected "
        "count below 5; the chi-squared approximation may be unreliable."
    )

print("\nContingency Table:")
display(contingency)

In [None]:
# Pearson correlation between review length and score, computed over the rows
# where both values are present.
length_and_score = df_cluster[['text_length', 'score']].dropna()
corr, p_value = stats.pearsonr(
    length_and_score['text_length'],
    length_and_score['score'],
)

significance = 'Yes (p < 0.05)' if p_value < 0.05 else 'No (p >= 0.05)'

print("\n=== Correlation: Text Length vs Score ===")
print(f"Pearson correlation: {corr:.4f}")
print(f"p-value: {p_value:.4f}")
print(f"Significant: {significance}")

## 7. Save Models

In [None]:
# Save trained models
#
# Persists both fitted models under ../models (created if it does not exist).
# The check marks in the status messages were mis-encoded in the original
# strings and are restored here.
import os

os.makedirs('../models', exist_ok=True)

# Save sentiment classifier
classifier.save_model('../models/sentiment_classifier')
print("✓ Sentiment classifier saved")

# Save score predictor
predictor.save_model('../models/score_predictor')
print("✓ Score predictor saved")

## 8. Summary

In [None]:
# Final summary of the whole run.
#
# `test_metrics` was rebound to the regression results in the score-prediction
# section, so the classification metrics are re-computed here under their own
# name instead of being read from it (which would raise KeyError on
# 'accuracy'). Both evaluations run before the banner so that anything they
# print does not end up in the middle of the summary.
clf_test_metrics = classifier.evaluate(X_test, y_test)
test_reg_metrics = predictor.evaluate(X_text_test, y_score_test, X_meta_test)

print("\n" + "="*60)
print("MODEL TRAINING SUMMARY")
print("="*60)

# Sizes refer to the classification split.
print("\n📊 Dataset:")
print(f"  Total reviews: {len(df_labeled)}")
print(f"  Train samples: {len(X_train)}")
print(f"  Test samples: {len(X_test)}")

print("\n🎯 Sentiment Classification:")
print(f"  Accuracy: {clf_test_metrics['accuracy']:.4f}")
print(f"  Precision: {clf_test_metrics['precision']:.4f}")
print(f"  Recall: {clf_test_metrics['recall']:.4f}")
print(f"  F1-Score: {clf_test_metrics['f1_score']:.4f}")

print("\n📈 Score Prediction:")
print(f"  R² Score: {test_reg_metrics['r2']:.4f}")
print(f"  RMSE: {test_reg_metrics['rmse']:.4f}")
print(f"  MAE: {test_reg_metrics['mae']:.4f}")

print("\n🔍 Clustering:")
print(f"  Number of clusters: {optimal_k}")
print(f"  Silhouette Score: {metrics['silhouette_score']:.4f}")

print("\n✅ Models saved to: ../models/")
print("="*60)