# MovieMind - Clustering Analysis

This notebook performs k-means clustering to find similar movies and audience patterns.

## Contents:
1. Data Loading and Preparation
2. Feature Engineering
3. Elbow Method (Optimal k)
4. K-means Clustering
5. Silhouette Analysis
6. Cluster Interpretation
7. Visualization (PCA)
8. Summary & Insights

In [None]:
# Imports
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

from src.utils.db_manager import DatabaseManager
from src.preprocessing.text_processor import TextProcessor
from src.models.clustering import MovieClusterer

# Settings
# Plot styling: a dark-grid matplotlib style sheet plus seaborn's 'husl' colour palette.
# NOTE(review): the 'seaborn-v0_8-*' style names only exist in newer matplotlib releases - confirm the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# Render figures inline in the notebook output.
%matplotlib inline
# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

## 1. Data Loading and Preparation

In [None]:
# Load data from database
# One row per movie with its review count and mean review sentiment. The HAVING
# clause keeps only movies that have at least one review.
# ORDER BY makes the LIMIT deterministic: without it the database may return an
# arbitrary 1000-movie subset, so the clustering below could change between runs
# even though the models themselves are seeded.
with DatabaseManager() as db:
    query = """
    SELECT
        m.movie_id,
        m.title,
        m.genres,
        m.vote_average,
        m.vote_count,
        m.popularity,
        m.runtime,
        m.budget,
        m.revenue,
        m.overview,
        m.release_date,
        COUNT(r.review_id) as review_count,
        AVG(r.sentiment_score) as avg_sentiment
    FROM movies m
    LEFT JOIN reviews r ON m.movie_id = r.movie_id
    GROUP BY m.movie_id, m.title, m.genres, m.vote_average, m.vote_count,
             m.popularity, m.runtime, m.budget, m.revenue, m.overview, m.release_date
    HAVING COUNT(r.review_id) > 0
    ORDER BY m.movie_id
    LIMIT 1000
    """
    # NOTE(review): presumably execute_query returns a list of row mappings that
    # pd.DataFrame can consume directly - confirm against DatabaseManager.
    df = pd.DataFrame(db.execute_query(query))

print(f"Loaded {len(df)} movies with reviews")
df.head()

In [None]:
# Report how many nulls each column holds before anything is imputed
null_counts = df.isnull().sum()
print("Missing values:")
print(null_counts)

# Imputation rules: median for runtime, zero for the remaining columns
fill_values = {
    'runtime': df['runtime'].median(),
    'budget': 0,
    'revenue': 0,
    'avg_sentiment': 0,
}
for column, value in fill_values.items():
    df[column] = df[column].fillna(value)

## 2. Feature Engineering

In [None]:
# Release year: parse the dates (unparseable values become NaT), take the year,
# and give movies without a usable date the median year
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
release_year = df['release_date'].dt.year
df['year'] = release_year.fillna(release_year.median())

# Return on investment; movies without a positive budget get an ROI of 0
has_budget = df['budget'] > 0
df['roi'] = np.where(has_budget, (df['revenue'] - df['budget']) / df['budget'], 0)

# Binary indicator columns for a fixed set of top genres
top_genres = ['Drama', 'Comedy', 'Action', 'Thriller', 'Romance', 'Adventure', 'Horror', 'Crime']


def _has_genre(genres, name):
    """Return 1 when `genres` is a list that contains `name`, otherwise 0."""
    # NOTE(review): any non-list value (e.g. a JSON or comma-separated string)
    # yields 0 for every genre - confirm the database returns real lists.
    return int(isinstance(genres, list) and name in genres)


for genre in top_genres:
    df[f'genre_{genre}'] = df['genres'].apply(_has_genre, args=(genre,))

print("Features created:")
print(df.columns.tolist())

In [None]:
# Select numerical features for clustering
feature_cols = [
    'vote_average', 'vote_count', 'popularity', 'runtime',
    'review_count', 'avg_sentiment', 'year', 'roi'
] + [f'genre_{g}' for g in top_genres]

X = df[feature_cols].copy()

# Impute any gaps that are still present. Several feature columns (e.g.
# vote_average, vote_count, popularity) are never filled earlier in the notebook;
# StandardScaler keeps NaN values in its output and KMeans.fit then raises.
missing_per_feature = X.isnull().sum()
if missing_per_feature.any():
    print("Imputing missing feature values with column medians:")
    print(missing_per_feature[missing_per_feature > 0])
    X = X.fillna(X.median(numeric_only=True))

# A column with no usable median cannot be imputed; fail here with a clear
# message instead of failing later inside KMeans.
if X.isnull().values.any():
    still_missing = X.columns[X.isnull().any()].tolist()
    raise ValueError(f"Feature columns still contain missing values after imputation: {still_missing}")

# Scale features to zero mean / unit variance so no single feature dominates the distances
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"Feature matrix shape: {X_scaled.shape}")

## 3. Elbow Method (Finding Optimal k)

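For each k from 2 to 10 we fit k-means and record both the inertia and the silhouette score, then compare the two curves to choose k.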

In [None]:
# Fit k-means for every candidate k, recording inertia and silhouette score
K_range = range(2, 11)
inertias, silhouette_scores = [], []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    score = silhouette_score(X_scaled, kmeans.labels_)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(score)
    print(f"k={k}: Inertia={kmeans.inertia_:.2f}, Silhouette={score:.3f}")

In [None]:
# Side-by-side curves: inertia (elbow) on the left, silhouette score on the right
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# (values, line format, y-axis label, title) for each panel, left to right
panels = [
    (inertias, 'bo-', 'Inertia (Within-cluster sum of squares)', 'Elbow Method'),
    (silhouette_scores, 'ro-', 'Silhouette Score', 'Silhouette Score by k'),
]

for panel_ax, (values, line_format, y_label, panel_title) in zip(axes, panels):
    panel_ax.plot(K_range, values, line_format, linewidth=2, markersize=8)
    panel_ax.set_xlabel('Number of Clusters (k)', fontsize=12)
    panel_ax.set_ylabel(y_label, fontsize=12)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# The k with the highest silhouette score is taken as the optimum
best_index = int(np.argmax(silhouette_scores))
optimal_k = K_range[best_index]
print(f"\nOptimal k based on Silhouette Score: {optimal_k}")

## 4. K-means Clustering with Optimal k

In [None]:
# Number of clusters for the final model; defaults to the silhouette optimum
k_final = optimal_k  # You can change this manually if needed (e.g., k_final = 5)

# Fit the final model and store each movie's cluster label on the DataFrame
kmeans_final = KMeans(n_clusters=k_final, random_state=42, n_init=10)
kmeans_final.fit(X_scaled)
df['cluster'] = kmeans_final.labels_

cluster_sizes = df['cluster'].value_counts().sort_index()
print(f"Clustering with k={k_final}")
print("\nCluster distribution:")
print(cluster_sizes)

## 5. Silhouette Analysis

In [None]:
# Overall and per-sample silhouette coefficients for the final clustering
cluster_labels = df['cluster']
silhouette_avg = silhouette_score(X_scaled, cluster_labels)
sample_silhouette_values = silhouette_samples(X_scaled, cluster_labels)

print(f"Average Silhouette Score: {silhouette_avg:.3f}")
print("\nSilhouette score by cluster:")
for i in range(k_final):
    # Mean silhouette coefficient over the members of cluster i
    in_cluster = cluster_labels == i
    print(f"  Cluster {i}: {sample_silhouette_values[in_cluster].mean():.3f}")

In [None]:
# Silhouette plot: one horizontal band of sorted coefficients per cluster
fig, ax = plt.subplots(figsize=(10, 8))

band_gap = 10  # vertical spacing between consecutive cluster bands
y_lower = band_gap
for i in range(k_final):
    # Sorted silhouette coefficients of the members of cluster i
    cluster_silhouette_values = np.sort(sample_silhouette_values[df['cluster'] == i])
    size_cluster_i = len(cluster_silhouette_values)
    y_upper = y_lower + size_cluster_i

    color = plt.cm.nipy_spectral(float(i) / k_final)
    ax.fill_betweenx(
        np.arange(y_lower, y_upper),
        0,
        cluster_silhouette_values,
        facecolor=color,
        edgecolor=color,
        alpha=0.7,
    )

    # Label the band with its cluster number, centred vertically
    ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    y_lower = y_upper + band_gap

# Dashed reference line at the overall average silhouette score
ax.axvline(x=silhouette_avg, color="red", linestyle="--", label=f"Avg: {silhouette_avg:.3f}")
ax.set_xlabel("Silhouette Coefficient")
ax.set_ylabel("Cluster")
ax.set_title(f"Silhouette Plot for {k_final} Clusters")
ax.legend()
plt.show()

## 6. Cluster Interpretation

In [None]:
# Mean of every clustering feature within each cluster
divider = "=" * 80
cluster_summary = df.groupby('cluster')[feature_cols].mean()

print("\n" + divider)
print("CLUSTER CHARACTERISTICS (Mean values)")
print(divider)
display(cluster_summary.round(2))

In [None]:
# Sample movies from each cluster: print up to five members per cluster with
# their release year, rating and genres
print("\n" + "="*80)
print("SAMPLE MOVIES FROM EACH CLUSTER")
print("="*80)

for cluster_id in range(k_final):
    print(f"\n--- Cluster {cluster_id} ({(df['cluster'] == cluster_id).sum()} movies) ---")
    sample = df[df['cluster'] == cluster_id][['title', 'vote_average', 'genres', 'year']].head(5)
    for idx, row in sample.iterrows():
        # Genres are only joined when stored as a list; anything else prints as N/A
        genres_str = ', '.join(row['genres']) if isinstance(row['genres'], list) else 'N/A'
        # The bullet is written as the \u2022 escape: the literal character had been
        # saved mis-encoded and printed as mojibake.
        print(f"  \u2022 {row['title']} ({int(row['year'])}) - Rating: {row['vote_average']:.1f} - Genres: {genres_str}")

In [None]:
# Share of movies (in percent) carrying each top genre, per cluster
genre_cluster = pd.DataFrame({
    genre: df.groupby('cluster')[f'genre_{genre}'].mean() * 100
    for genre in top_genres
})

# Transpose so genres run down the rows and clusters across the columns
genre_cluster_T = genre_cluster.T

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    genre_cluster_T,
    annot=True,
    fmt='.1f',
    cmap='YlOrRd',
    cbar_kws={'label': 'Percentage of Movies'},
    ax=ax,
)
ax.set_xlabel('Cluster')
ax.set_ylabel('Genre')
ax.set_title('Genre Distribution by Cluster (%)')
fig.tight_layout()
plt.show()

## 7. Visualization (PCA for 2D)

In [None]:
# Project the scaled feature matrix onto its first two principal components
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Report how much variance each component (and both together) retains
variance_ratio = pca.explained_variance_ratio_
print(f"Explained variance ratio: {variance_ratio}")
print(f"Total variance explained: {variance_ratio.sum():.2%}")

In [None]:
# 2D scatter of the PCA projection, one colour per cluster
fig, ax = plt.subplots(figsize=(14, 10))

for i in range(k_final):
    cluster_points = X_pca[df['cluster'] == i]
    ax.scatter(
        cluster_points[:, 0],
        cluster_points[:, 1],
        label=f'Cluster {i}',
        alpha=0.6,
        s=100,
    )

# Overlay the k-means centroids, projected into the same PCA space
centers_pca = pca.transform(kmeans_final.cluster_centers_)
ax.scatter(
    centers_pca[:, 0],
    centers_pca[:, 1],
    c='black',
    marker='X',
    s=500,
    edgecolors='white',
    linewidths=2,
    label='Centroids',
    zorder=10,
)

pc1_share, pc2_share = pca.explained_variance_ratio_[0], pca.explained_variance_ratio_[1]
ax.set_xlabel(f'PC1 ({pc1_share:.1%} variance)', fontsize=12)
ax.set_ylabel(f'PC2 ({pc2_share:.1%} variance)', fontsize=12)
ax.set_title('Movie Clusters (PCA Visualization)', fontsize=14, fontweight='bold')
ax.legend(loc='best')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## Summary & Insights

In [None]:
# Final recap: cluster count, overall silhouette score and the size of each cluster
divider = "=" * 80
total_movies = len(df)

print("\n" + divider)
print("CLUSTERING SUMMARY")
print(divider)
print(f"Number of clusters: {k_final}")
print(f"Average Silhouette Score: {silhouette_avg:.3f}")
print(f"Total movies clustered: {total_movies}")
print("\nCluster sizes:")
for i in range(k_final):
    count = (df['cluster'] == i).sum()
    percentage = count / total_movies * 100
    print(f"  Cluster {i}: {count} movies ({percentage:.1f}%)")

print("\n" + divider)