# Letterboxd Data Exploration

Explore rating patterns, genre preferences, director tendencies, and decade distributions from the enriched Letterboxd watch history.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Plot styling shared by every figure in the notebook: seaborn's whitegrid
# theme and a 10x5 default figure size.
sns.set_theme(style="whitegrid")
plt.rc("figure", figsize=(10, 5))

# Load the processed watch history and print a quick summary of its size and
# columns before previewing the first rows.
RATED_MOVIES_PATH = "../data/processed/rated_movies.parquet"
df = pd.read_parquet(RATED_MOVIES_PATH)
print(f"Total rated movies: {df.shape[0]}")
print(f"Columns: {df.columns.tolist()}")
df.head()

## Rating Distribution

In [None]:
# Histogram of my ratings, one bar per rating value.
# The ticks below run 0.5, 1.0, ..., 5.0, so centre one bin on each of those
# values (edges at 0.25, 0.75, ..., 5.25). With edges placed on the rating
# values themselves, every bar would start at its tick instead of sitting on
# it, and the 5.0 bar would spill past the last tick.
rating_values = [i / 2 for i in range(1, 11)]  # 0.5 to 5.0
bins = [v - 0.25 for v in rating_values] + [rating_values[-1] + 0.25]

fig, ax = plt.subplots()
ax.hist(df["memberRating"], bins=bins, edgecolor="black", alpha=0.7, color="#4A90D9")
ax.set_xlabel("Rating")
ax.set_ylabel("Count")
ax.set_title("Rating Distribution")
ax.set_xticks(rating_values)
print(f"Mean: {df['memberRating'].mean():.2f}, Median: {df['memberRating'].median():.1f}")
plt.tight_layout()
plt.show()

## Genre Breakdown

In [None]:
# Watch frequency and average rating per genre.
# explode() gives one row per (film, genre) pair, so a single groupby yields
# both statistics without a Python-level row loop. Films whose genre cell is
# missing or empty explode to NaN and are dropped instead of raising.
genre_rows = (
    df[["genres", "memberRating"]]
    .explode("genres")
    .dropna(subset=["genres"])
)
genre_df = (
    genre_rows.groupby("genres", sort=False)
    .agg(count=("memberRating", "size"), avg_rating=("memberRating", "mean"))
    .reset_index()
    .rename(columns={"genres": "genre"})
    .sort_values("count", ascending=False)
)

# Left: the 15 most-watched genres. Right: the 15 best-rated genres.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
sns.barplot(data=genre_df.head(15), x="count", y="genre", ax=ax1, color="#4A90D9")
ax1.set_title("Genre Watch Frequency")
sns.barplot(data=genre_df.sort_values("avg_rating", ascending=False).head(15), x="avg_rating", y="genre", ax=ax2, color="#E8A838")
ax2.set_title("Average Rating by Genre")
ax2.set_xlim(0, 5)
plt.tight_layout()
plt.show()

## Director Analysis

In [None]:
# Film count and average rating per director.
# explode() gives one row per (film, director) pair and accepts list- or
# array-valued cells alike; `cell or []` raises on a multi-element array
# because its truth value is ambiguous. Missing or empty director cells
# explode to NaN and are dropped.
director_rows = (
    df[["director", "memberRating"]]
    .explode("director")
    .dropna(subset=["director"])
)
dir_df = (
    director_rows.groupby("director", sort=False)
    .agg(films=("memberRating", "size"), avg_rating=("memberRating", "mean"))
    .reset_index()
    .sort_values("films", ascending=False)
)

print("Most-watched directors:")
print(dir_df.head(15).to_string(index=False))

# Directors with 2+ films, ranked by avg rating
repeat_dirs = dir_df[dir_df["films"] >= 2].sort_values("avg_rating", ascending=False)
print("\nHighest-rated directors (2+ films):")
print(repeat_dirs.head(15).to_string(index=False))

## Decade Distribution

In [None]:
# Bucket each release year into its decade (e.g. 1994 -> 1990) and store the
# result on df as a new "decade" column.
df["decade"] = df["year"].floordiv(10).mul(10)

# Per-decade film count and mean rating, with "decade" as a regular column.
ratings_by_decade = df.groupby("decade")["memberRating"]
decade_stats = pd.DataFrame(
    {"count": ratings_by_decade.size(), "avg_rating": ratings_by_decade.mean()}
).reset_index()

# String labels make the bars categorical, so decades are evenly spaced.
decade_labels = decade_stats["decade"].astype(str)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
panels = [
    (ax1, "count", "Films Watched by Decade", "#4A90D9"),
    (ax2, "avg_rating", "Average Rating by Decade", "#E8A838"),
]
for axis, column, title, colour in panels:
    axis.bar(decade_labels, decade_stats[column], color=colour)
    axis.set_title(title)
    axis.set_xlabel("Decade")
ax2.set_ylim(0, 5)
plt.tight_layout()
plt.show()

## My Ratings vs TMDB Average

In [None]:
from scipy.stats import pearsonr

# Keep only films that have both scores and a positive TMDB average
# (a TMDB average of 0 presumably means "no votes yet" - TODO confirm).
has_both_scores = df["vote_average"].notna() & df["memberRating"].notna()
valid = df[has_both_scores & (df["vote_average"] > 0)]

tmdb_scores = valid["vote_average"]
my_scores = valid["memberRating"]

# Scatter of my rating against the crowd average, one point per film.
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(tmdb_scores, my_scores, alpha=0.4, s=20, color="#4A90D9")
ax.set(
    xlabel="TMDB Average (0-10)",
    ylabel="My Rating (0.5-5.0)",
    title="My Ratings vs TMDB Crowd Average",
)

# Linear correlation between the two scores, shown in the top-left corner.
r, p = pearsonr(tmdb_scores, my_scores)
ax.annotate(
    f"Pearson r = {r:.3f} (p = {p:.2e})",
    xy=(0.05, 0.95),
    xycoords="axes fraction",
    fontsize=11,
)
plt.tight_layout()
plt.show()

## Taste Clusters

Which genre + decade combos do I rate highest?

In [None]:
# Mean rating for every genre x decade combination.
# Relies on the "decade" column already being present on df.
# Explode genres for per-genre-decade analysis
exploded = df.explode("genres")
# Combinations with no films stay NaN rather than being filled with 0, so the
# heatmap leaves them blank instead of showing a fake "0.0" average that falls
# below the 1-5 colour scale.
pivot = exploded.groupby(["genres", "decade"])["memberRating"].mean().unstack()

# Keep only genres with enough data (at least 5 films)
genre_film_counts = exploded["genres"].value_counts()
top_genres = genre_film_counts[genre_film_counts >= 5].index
pivot = pivot.loc[pivot.index.isin(top_genres)]

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(pivot, annot=True, fmt=".1f", cmap="YlOrRd", ax=ax, vmin=1, vmax=5)
ax.set_title("Average Rating: Genre x Decade")
plt.tight_layout()
plt.show()