In [None]:

def safe_parse_crew(x):
    """Parse a raw ``crew`` cell into a list of crew records.

    Parameters
    ----------
    x : object
        An already-parsed list, a string holding a Python-literal list, or
        any other value (for example NaN for a missing cell).

    Returns
    -------
    list
        ``x`` itself when it is already a list, the parsed list when ``x`` is
        a string that evaluates to a list, and ``[]`` in every other case
        (missing value, malformed literal, or a literal that is not a list).
    """
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            parsed = ast.literal_eval(x)
        except Exception:
            # Deliberate best-effort: a malformed cell is treated as "no crew".
            return []
        # "{'a': 1}" or "5" are valid literals but not lists; handing them to
        # the caller would break the list-based explode/.get logic downstream.
        if isinstance(parsed, list):
            return parsed
    return []

def safe_parse_list(x):
    """Parse a raw cell into a list, never raising and never returning a non-list.

    Parameters
    ----------
    x : object
        An already-parsed list, a string holding a Python-literal list, or
        any other value (for example NaN for a missing cell).

    Returns
    -------
    list
        ``x`` itself when it is already a list (so re-applying the function
        is harmless), the parsed list when ``x`` is a string that evaluates
        to a list, and ``[]`` otherwise.
    """
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            parsed = ast.literal_eval(x)
        except Exception:
            # Deliberate best-effort: malformed cells become an empty list.
            return []
        # Guard against valid literals of the wrong type (dict, number, ...).
        if isinstance(parsed, list):
            return parsed
    return []


def safe_parse_dict(x):
    """Parse a raw cell into a dict, never raising and never returning a non-dict.

    Parameters
    ----------
    x : object
        An already-parsed dict, a string holding a Python-literal dict, or
        any other value (for example NaN for a missing cell).

    Returns
    -------
    dict
        ``x`` itself when it is already a dict, the parsed dict when ``x`` is
        a string that evaluates to a dict, and ``{}`` otherwise.
    """
    if isinstance(x, dict):
        return x
    if isinstance(x, str):
        try:
            parsed = ast.literal_eval(x)
        except Exception:
            # Deliberate best-effort: malformed cells become an empty dict.
            return {}
        # Guard against valid literals of the wrong type (list, number, ...).
        if isinstance(parsed, dict):
            return parsed
    return {}


In [6]:
import pandas as pd
import ast
import itertools
import numpy as np
from collections import Counter
from statistics import median

# Load the five cleaned CSV extracts; the paths are listed in the same order
# as the names they are bound to.
movies_metadata, keywords, credits, ratings, links = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../cleaned_data/movies_metadata_cleaned.csv",
        "../cleaned_data/keywords_cleaned.csv",
        "../cleaned_data/credits_cleaned.csv",
        "../cleaned_data/ratings_merged_cleaned.csv",
        "../cleaned_data/links_merged_cleaned.csv",
    )
)

In [7]:
# Task 1 — top 10 directors (at least 5 movies) ranked by median revenue


def _crew_value(key):
    """Build a getter that reads ``key`` from a crew dict (None for non-dicts)."""
    return lambda member: member.get(key) if isinstance(member, dict) else None


# Turn the raw crew column into lists of dicts.
credits["crew"] = credits["crew"].apply(safe_parse_crew)

# One row per (movie, crew member). An empty crew list explodes to NaN, which
# is dropped before the job and name are pulled out of each dict.
crew_expanded = (
    credits
    .explode("crew", ignore_index=True)
    .dropna(subset=["crew"])
    .assign(
        job=lambda frame: frame["crew"].apply(_crew_value("job")),
        name=lambda frame: frame["crew"].apply(_crew_value("name")),
    )
)

# Only rows whose job is exactly "Director" are kept.
directors = crew_expanded.loc[crew_expanded["job"] == "Director", ["id", "name"]]

# Attach revenue and rating; a left join keeps directors whose movie has no
# metadata row (their revenue / vote_average are then NaN).
_movie_facts = movies_metadata[["id", "revenue", "vote_average"]]
merged = directors.merge(_movie_facts, how="left", on="id")

# Per-director summary, keyed by the director's name.
agg = (
    merged
    .groupby("name", dropna=True)
    .agg(
        movie_count=("id", "count"),
        median_revenue=("revenue", "median"),
        mean_vote_average=("vote_average", "mean"),
    )
    .reset_index()
)

# Directors with at least 5 rows, best median revenue first, top 10 only.
_enough_movies = agg["movie_count"] >= 5
top_directors = (
    agg.loc[_enough_movies]
    .sort_values("median_revenue", ascending=False)
    .head(10)
)

print("Top 10 directors (≥ 5 movies) by median revenue:\n")
print(top_directors.to_string(index=False))

Top 10 directors (≥ 5 movies) by median revenue:

            name  movie_count  median_revenue  mean_vote_average
    George Lucas            7     649398328.0           6.814286
Francis Lawrence            6     619388635.5           6.800000
     David Yates           10     583042696.5           6.030000
    Chris Renaud            5     543513985.0           6.700000
     Tom McGrath            5     532680671.0           6.400000
    Eric Darnell            5     532680671.0           6.340000
 Carlos Saldanha            7     484635760.0           6.042857
  Andrew Adamson            6     452030315.5           6.516667
     Michael Bay           13     449220945.0           6.392308
       Brad Bird            6     416438570.0           7.150000


In [14]:
# Task 2 — actor pairs with ≥ 3 co-appearances and the mean vote_average of
# the movies they share.

# Coerce vote_average to numeric. Unparseable values become NaN and are kept
# on purpose: an unrated movie must still count as a co-appearance.
movies_metadata["vote_average"] = pd.to_numeric(movies_metadata["vote_average"], errors="coerce")

# Parse the cast column (safe_parse_list returns already-parsed lists unchanged).
credits["cast"] = credits["cast"].apply(safe_parse_list)

# One row per (movie, cast member); an empty cast explodes to NaN and is dropped.
cast_expanded = credits[["id", "cast"]].explode("cast", ignore_index=True)
cast_expanded = cast_expanded.dropna(subset=["cast"])

# The actor id is the pairing key; the name is only used for display.
cast_expanded["actor_id"] = cast_expanded["cast"].apply(lambda d: d.get("id") if isinstance(d, dict) else None)
cast_expanded["actor_name"] = cast_expanded["cast"].apply(lambda d: d.get("name") if isinstance(d, dict) else None)

# Keep only entries that carry an id, then make the id an integer.
cast_expanded = cast_expanded.dropna(subset=["actor_id"]).copy()
cast_expanded["actor_id"] = cast_expanded["actor_id"].astype(int)

# Movie ratings keyed by an integer id; rows whose id is not numeric are dropped.
mcols = movies_metadata[["id", "vote_average"]].copy()
mcols["id"] = pd.to_numeric(mcols["id"], errors="coerce")
mcols = mcols.dropna(subset=["id"])
mcols["id"] = mcols["id"].astype(int)

merged = cast_expanded.merge(mcols, on="id", how="inner")

# Unique, sorted actor ids per movie.
# dropna=False is essential here: vote_average is one of the grouping keys,
# and the default dropna=True would silently discard every movie whose
# vote_average is NaN, so those co-appearances would never be counted.
movie_actors = (
    merged.groupby(["id", "vote_average"], dropna=False)["actor_id"]
          .apply(lambda s: sorted(set(s.dropna())))
          .reset_index(name="actor_ids")
)

# Every unordered pair per movie. The id lists are sorted and de-duplicated,
# so itertools.combinations already yields (smaller_id, larger_id); movies
# with fewer than two actors contribute nothing.
pair_rows = [
    (a_id, b_id, vote)
    for ids, vote in zip(movie_actors["actor_ids"], movie_actors["vote_average"])
    for a_id, b_id in itertools.combinations(ids, 2)
]

pairs_df = pd.DataFrame(pair_rows, columns=["actor1_id", "actor2_id", "vote_average"])

# co_appearances counts rows (NaN ratings included); the mean ignores NaN.
result = (
    pairs_df.groupby(["actor1_id", "actor2_id"], as_index=False)
            .agg(co_appearances=("vote_average", "size"),
                 avg_vote_average=("vote_average", "mean"))
)

# Keep pairs with ≥ 3 co-appearances.
result = result[result["co_appearances"] >= 3].copy()

# Display name per actor id: the most frequent spelling seen in the cast data.
name_map = (
    cast_expanded.groupby("actor_id")["actor_name"]
                 .apply(lambda s: Counter(s.dropna()).most_common(1)[0][0] if len(s.dropna()) else None)
                 .to_dict()
)
result["actor1"] = result["actor1_id"].map(name_map)
result["actor2"] = result["actor2_id"].map(name_map)

# Most frequent pairs first; ties broken alphabetically by the two names.
result = result.sort_values(
    by=["co_appearances", "actor1", "actor2"],
    ascending=[False, True, True]
).reset_index(drop=True)

print("Top actor pairs with ≥ 3 co-appearances:\n")
print(result[["actor1", "actor2", "co_appearances", "avg_vote_average"]].head(20).to_string(index=False))


Top actor pairs with ≥ 3 co-appearances:

         actor1          actor2  co_appearances  avg_vote_average
     Huntz Hall      Leo Gorcey              35          3.631429
Charlie Chaplin  Edna Purviance              33          6.460606
   Oliver Hardy     Stan Laurel              30          6.376667
   Jeff Bennett     Rob Paulsen              27          6.225926
   Lou Costello      Bud Abbott              27          6.470370
   Grey Griffin    Frank Welker              25          6.708000
   Raymond Burr    Barbara Hale              25          4.980000
     John Wayne        Paul Fix              24          5.845833
     Frank Mayo      Jack Mower              21          5.385714
   Frank Welker    Jeff Bennett              21          6.233333
   Jim Cummings    Frank Welker              21          6.547619
   Jim Cummings    Jeff Bennett              21          6.271429
  Peter Cushing Christopher Lee              21          5.985714
 Bernard Gorcey      Leo Gorcey   

In [15]:
# Task 3 — actors (≥ 10 movies) ranked by how many distinct genres they appear in


def _entry_name(entry):
    """``name`` of a cast / genre dict, or None when the entry is not a dict."""
    if isinstance(entry, dict):
        return entry.get("name")
    return None


# Both nested columns must be lists of dicts before they can be exploded.
credits["cast"] = credits["cast"].apply(safe_parse_list)
movies_metadata["genres"] = movies_metadata["genres"].apply(safe_parse_list)

# One row per (movie, actor name); entries without a usable name are dropped.
cast_expanded = (
    credits[["id", "cast"]]
    .explode("cast", ignore_index=True)
    .dropna(subset=["cast"])
    .assign(actor_name=lambda frame: frame["cast"].apply(_entry_name))
    .dropna(subset=["actor_name"])
)

# One row per (movie, genre name); movies with no genres drop out here.
genres_expanded = (
    movies_metadata[["id", "genres"]]
    .explode("genres", ignore_index=True)
    .assign(genre_name=lambda frame: frame["genres"].apply(_entry_name))
    .dropna(subset=["genre_name"])
)

# Pair every actor with every genre of every movie they appear in.
cast_genres = cast_expanded.merge(genres_expanded[["id", "genre_name"]], how="inner", on="id")

# Per actor name: alphabetically sorted distinct genres and distinct movie ids.
actor_stats = (
    cast_genres
    .groupby("actor_name")
    .agg(
        genres_set=("genre_name", lambda names: sorted(set(names))),
        movie_count=("id", lambda movie_ids: movie_ids.nunique()),
    )
    .reset_index()
)

# Breadth is the number of distinct genres; the examples are the first five
# of the alphabetically sorted list.
actor_stats["genre_count"] = actor_stats["genres_set"].apply(len)
actor_stats["example_genres"] = actor_stats["genres_set"].apply(lambda names: names[:5])

# Only actors with at least 10 movies qualify.
actor_stats = actor_stats.loc[actor_stats["movie_count"] >= 10]

# Widest breadth first, then most movies, then name; keep the top 10.
actor_stats = actor_stats.sort_values(
    by=["genre_count", "movie_count", "actor_name"],
    ascending=[False, False, True],
).head(10)

print("Top 10 actors (≥10 movies) with the widest genre breadth:\n")
for actor, n_genres, n_movies, examples in zip(
    actor_stats["actor_name"],
    actor_stats["genre_count"],
    actor_stats["movie_count"],
    actor_stats["example_genres"],
):
    print(f"{actor:<25} | Genres: {n_genres:>2d} | "
          f"Movies: {n_movies:>2d} | Examples: {', '.join(examples)}")


Top 10 actors (≥10 movies) with the widest genre breadth:

Christopher Lee           | Genres: 20 | Movies: 144 | Examples: Action, Adventure, Animation, Comedy, Crime
Donald Sutherland         | Genres: 20 | Movies: 106 | Examples: Action, Adventure, Animation, Comedy, Crime
Christopher Walken        | Genres: 20 | Movies: 96 | Examples: Action, Adventure, Animation, Comedy, Crime
Dennis Hopper             | Genres: 20 | Movies: 90 | Examples: Action, Adventure, Animation, Comedy, Crime
Keith David               | Genres: 20 | Movies: 87 | Examples: Action, Adventure, Animation, Comedy, Crime
Liam Neeson               | Genres: 20 | Movies: 82 | Examples: Action, Adventure, Animation, Comedy, Crime
James Earl Jones          | Genres: 20 | Movies: 77 | Examples: Action, Adventure, Animation, Comedy, Crime
Jim Broadbent             | Genres: 20 | Movies: 77 | Examples: Action, Adventure, Animation, Comedy, Crime
Martin Sheen              | Genres: 20 | Movies: 75 | Examples: Action, Adv

In [17]:
# Task 4 — top 10 film collections (≥ 3 movies) by total revenue

# `belongs_to_collection` comes out of read_csv as text, so it has to be
# parsed before the collection name can be read. Without this step the cell
# only works if some other cell already parsed the column (hidden state), and
# on a fresh kernel every collection_name would be None. safe_parse_dict
# returns dicts unchanged, so running this more than once is harmless.
movies_metadata["collection_name"] = (
    movies_metadata["belongs_to_collection"]
    .apply(safe_parse_dict)
    .apply(lambda d: d.get("name") if isinstance(d, dict) else None)
)

# --- Ensure proper types (invalid values become NaN / NaT) ---
movies_metadata["revenue"] = pd.to_numeric(movies_metadata["revenue"], errors="coerce")
movies_metadata["vote_average"] = pd.to_numeric(movies_metadata["vote_average"], errors="coerce")
movies_metadata["release_date"] = pd.to_datetime(movies_metadata["release_date"], errors="coerce")

# --- Keep only movies that belong to a named collection and have a revenue ---
df = movies_metadata.dropna(subset=["collection_name", "revenue"])

# --- Group by collection and aggregate ---
# The built-in "median" skips NaN votes and yields NaN (not None) when a
# collection has no votes at all, so the formatted print below cannot fail.
agg = (
    df.groupby("collection_name", dropna=True)
      .agg(
          movie_count=("id", "count"),
          total_revenue=("revenue", "sum"),
          earliest=("release_date", "min"),
          latest=("release_date", "max"),
          median_vote_average=("vote_average", "median"),
      )
      .reset_index()
)

# --- Keep collections with ≥ 3 movies ---
agg = agg[agg["movie_count"] >= 3]

# --- Sort by total revenue (descending) and take top 10 ---
agg = agg.sort_values("total_revenue", ascending=False).head(10)

# --- Render the date range as ISO strings for display ---
agg["earliest"] = agg["earliest"].dt.strftime("%Y-%m-%d")
agg["latest"] = agg["latest"].dt.strftime("%Y-%m-%d")

print("Top 10 film collections (≥3 movies) by total revenue:\n")
for _, row in agg.iterrows():
    print(
        f"{row['collection_name']:<40} | "
        f"Movies: {row['movie_count']:>2d} | "
        f"Revenue: ${row['total_revenue']:,.0f} | "
        f"Median vote: {row['median_vote_average']:.2f} | "
        f"{row['earliest']} → {row['latest']}"
    )


Top 10 film collections (≥3 movies) by total revenue:

Harry Potter Collection                  | Movies:  8 | Revenue: $7,707,367,425 | Median vote: 7.50 | 2001-11-16 → 2011-07-07
Star Wars Collection                     | Movies:  8 | Revenue: $7,434,494,790 | Median vote: 7.45 | 1977-05-25 → 2016-12-14
James Bond Collection                    | Movies: 26 | Revenue: $7,106,970,239 | Median vote: 6.30 | 1962-10-04 → 2015-10-26
The Fast and the Furious Collection      | Movies:  8 | Revenue: $5,125,098,793 | Median vote: 6.65 | 2001-06-22 → 2017-04-12
Pirates of the Caribbean Collection      | Movies:  5 | Revenue: $4,521,576,826 | Median vote: 6.90 | 2003-07-09 → 2017-05-23
Transformers Collection                  | Movies:  5 | Revenue: $4,366,101,244 | Median vote: 6.10 | 2007-06-27 → 2017-06-21
Despicable Me Collection                 | Movies:  6 | Revenue: $3,691,070,216 | Median vote: 6.90 | 2010-07-08 → 2017-06-15
The Twilight Collection                  | Movies:  5 | Revenue

In [19]:
# Task 5 — median runtime and movie count for each decade × primary genre


def _first_genre_name(genre_list):
    """Name of the first genre entry, or None when there is no usable first entry."""
    if not isinstance(genre_list, list) or len(genre_list) == 0:
        return None
    head = genre_list[0]
    if not isinstance(head, dict):
        return None
    return head["name"]


def _median_of_present(values):
    """Median of the non-missing values; None when the list itself is empty."""
    if not len(values):
        return None
    return median([value for value in values if pd.notna(value)])


# The primary genre is the first entry of the parsed genre list.
movies_metadata["genres"] = movies_metadata["genres"].apply(safe_parse_list)
movies_metadata["primary_genre"] = movies_metadata["genres"].apply(_first_genre_name)

# Numeric runtime and a real datetime release date (invalid values → NaN / NaT).
movies_metadata["runtime"] = pd.to_numeric(movies_metadata["runtime"], errors="coerce")
movies_metadata["release_date"] = pd.to_datetime(movies_metadata["release_date"], errors="coerce")

# Decade = release year rounded down to a multiple of 10.
movies_metadata["year"] = movies_metadata["release_date"].dt.year
movies_metadata["decade"] = movies_metadata["year"].floordiv(10).mul(10)

# Rows missing any of the three analysis fields cannot be grouped.
df = movies_metadata.dropna(subset=["decade", "primary_genre", "runtime"])

# Collect runtimes per (decade, genre), reduce them to a median, then order by
# decade ascending and median runtime descending.
grouped = (
    df.groupby(["decade", "primary_genre"], dropna=True)
      .agg(
          runtimes=("runtime", list),
          movie_count=("id", "count"),
      )
      .reset_index()
      .assign(median_runtime=lambda frame: frame["runtimes"].apply(_median_of_present))
      .drop(columns=["runtimes"])
      .sort_values(by=["decade", "median_runtime"], ascending=[True, False])
      .reset_index(drop=True)
)

# Show the first 20 rows for inspection.
print("Median runtime and movie count by decade × primary genre:\n")
_preview = grouped.head(20)
for decade, genre, runtime, n_movies in zip(
    _preview["decade"],
    _preview["primary_genre"],
    _preview["median_runtime"],
    _preview["movie_count"],
):
    print(
        f"{int(decade)}s – {genre:<20} | "
        f"Median runtime: {runtime:>6.1f} min | "
        f"Movies: {n_movies}"
    )


Median runtime and movie count by decade × primary genre:

1870s – Documentary          | Median runtime:    1.0 min | Movies: 2
1880s – Documentary          | Median runtime:    1.0 min | Movies: 4
1890s – Fantasy              | Median runtime:    1.5 min | Movies: 8
1890s – Action               | Median runtime:    1.0 min | Movies: 1
1890s – Comedy               | Median runtime:    1.0 min | Movies: 9
1890s – Documentary          | Median runtime:    1.0 min | Movies: 27
1890s – Drama                | Median runtime:    1.0 min | Movies: 2
1890s – Family               | Median runtime:    1.0 min | Movies: 1
1890s – History              | Median runtime:    1.0 min | Movies: 1
1890s – Horror               | Median runtime:    1.0 min | Movies: 3
1900s – History              | Median runtime:   60.0 min | Movies: 1
1900s – Adventure            | Median runtime:   15.0 min | Movies: 4
1900s – Action               | Median runtime:    9.0 min | Movies: 2
1900s – Drama                |

In [20]:
# Task 6 — average share of women among the top-billed cast, per decade

credits["cast"] = credits["cast"].apply(safe_parse_list)

# --- Parse dates and extract decade (release year rounded down to a multiple of 10) ---
movies_metadata["release_date"] = pd.to_datetime(movies_metadata["release_date"], errors="coerce")
movies_metadata["year"] = movies_metadata["release_date"].dt.year
movies_metadata["decade"] = (movies_metadata["year"] // 10) * 10

# --- Attach the decade to each credits row; movies without a decade are dropped ---
merged = credits.merge(
    movies_metadata[["id", "decade"]],
    on="id",
    how="left"
)
merged = merged.dropna(subset=["decade"])

# --- Explode cast into individual rows ---
cast_expanded = merged[["id", "decade", "cast"]].explode("cast", ignore_index=True)
cast_expanded = cast_expanded.dropna(subset=["cast"])

# --- Extract gender and billing order ---
cast_expanded["gender"] = cast_expanded["cast"].apply(lambda c: c.get("gender") if isinstance(c, dict) else None)
cast_expanded["order"] = cast_expanded["cast"].apply(lambda c: c.get("order") if isinstance(c, dict) else None)

# --- Keep entries with a known gender code (1=female, 2=male) and a billing order ---
cast_expanded = cast_expanded[cast_expanded["gender"].isin([1, 2])]
# .copy() so the column assignment below writes to an independent frame
# rather than to a filtered view (avoids the chained-assignment warning).
cast_expanded = cast_expanded.dropna(subset=["order"]).copy()
cast_expanded["order"] = cast_expanded["order"].astype(int)

# --- Keep the entries billed in positions 0-4 ---
top5 = cast_expanded[cast_expanded["order"] < 5]

# --- Female proportion per movie ---
female_prop = (
    top5.groupby(["id", "decade"])
        .agg(
            female_count=("gender", lambda g: (g == 1).sum()),
            total_count=("gender", "count")
        )
        .reset_index()
)
female_prop["female_ratio"] = female_prop["female_count"] / female_prop["total_count"]

# --- Aggregate by decade: average female proportion & movie count ---
result = (
    female_prop.groupby("decade")
               .agg(
                   avg_female_prop=("female_ratio", "mean"),
                   movie_count=("id", "count")
               )
               .reset_index()
)

# --- Sort by avg_female_prop descending ---
result = result.sort_values("avg_female_prop", ascending=False).reset_index(drop=True)

# --- Display ---
# itertuples keeps each column's own dtype. iterrows would upcast the
# all-numeric row to float and print the integer count as e.g. "10229.0".
print("Average female proportion (top-5 cast) per decade:\n")
for row in result.itertuples(index=False):
    print(
        f"{int(row.decade)}s  |  Avg female proportion: {row.avg_female_prop:.3f}  "
        f"|  Movies: {int(row.movie_count)}"
    )


Average female proportion (top-5 cast) per decade:

1870s  |  Avg female proportion: 1.000  |  Movies: 1.0
1890s  |  Avg female proportion: 0.500  |  Movies: 2.0
1900s  |  Avg female proportion: 0.500  |  Movies: 11.0
2010s  |  Avg female proportion: 0.377  |  Movies: 10229.0
1930s  |  Avg female proportion: 0.369  |  Movies: 1239.0
2000s  |  Avg female proportion: 0.363  |  Movies: 9411.0
1940s  |  Avg female proportion: 0.358  |  Movies: 1425.0
1910s  |  Avg female proportion: 0.341  |  Movies: 142.0
1990s  |  Avg female proportion: 0.339  |  Movies: 5076.0
1950s  |  Avg female proportion: 0.337  |  Movies: 1985.0
1920s  |  Avg female proportion: 0.328  |  Movies: 368.0
1960s  |  Avg female proportion: 0.325  |  Movies: 2363.0
1980s  |  Avg female proportion: 0.320  |  Movies: 3511.0
1970s  |  Avg female proportion: 0.311  |  Movies: 3101.0


In [22]:
# Task 7 — replicate Mongo query 7 in pandas


def _keyword_names(entries):
    """``name`` of every dict entry in a parsed keyword list ("" when the key is absent)."""
    return [entry.get("name", "") for entry in entries if isinstance(entry, dict)]


# Coerce the columns used below; invalid values become NaN / NaT.
for _numeric_col in ("vote_average", "vote_count"):
    movies_metadata[_numeric_col] = pd.to_numeric(movies_metadata[_numeric_col], errors="coerce")
movies_metadata["release_date"] = pd.to_datetime(movies_metadata["release_date"], errors="coerce")

# Case-insensitive, whole-word match for "noir" and for "neo" + optional
# hyphen or space + "noir". The group is non-capturing.
pattern = r"(?i)\b(?:neo[- ]?noir|noir)\b"

# Keyword dicts → list of names → one space-joined string per movie, so the
# same regex can be applied to it.
keywords["keywords"] = keywords["keywords"].apply(safe_parse_list)
keywords["keyword_names"] = keywords["keywords"].apply(_keyword_names)
keywords["keywords_str"] = keywords["keyword_names"].apply(lambda names: " ".join(names))

# Only movies with at least 50 votes are eligible.
filtered_movies = movies_metadata.loc[movies_metadata["vote_count"] >= 50].copy()

# Branch A: the pattern appears in the overview or the tagline.
_overview_hit = filtered_movies["overview"].fillna("").str.contains(pattern, regex=True)
_tagline_hit = filtered_movies["tagline"].fillna("").str.contains(pattern, regex=True)
mask_text = _overview_hit | _tagline_hit
branch_text = filtered_movies.loc[mask_text].copy()
branch_text["source"] = "text"

# Branch B: the pattern appears in the keyword string; join back to the
# eligible movies to pick up their columns.
mask_keywords = keywords["keywords_str"].str.contains(pattern, regex=True)
branch_keywords = keywords.loc[mask_keywords, ["id"]].merge(
    filtered_movies, how="inner", on="id"
)
branch_keywords["source"] = "keywords"

# Union of both branches; a movie found by both keeps its first (text) row.
combined = (
    pd.concat([branch_text, branch_keywords], ignore_index=True)
    .drop_duplicates(subset="id", keep="first")
)
combined["year"] = combined["release_date"].dt.year

# Columns needed for the report.
result = combined[["title", "year", "vote_average", "vote_count"]].copy()

# Best rating first, vote count as tie-breaker, top 20.
top20 = (
    result
    .sort_values(by=["vote_average", "vote_count"], ascending=[False, False])
    .head(20)
    .reset_index(drop=True)
)

print("Top 20 noir / neo-noir movies (vote_count ≥ 50):\n")
for rank, (title, year, rating, votes) in enumerate(
    zip(top20["title"], top20["year"], top20["vote_average"], top20["vote_count"]),
    start=1,
):
    print(
        f"{rank:2d}. {title:<40} "
        f"({int(year) if pd.notna(year) else '----'})  "
        f"Rating {rating:.2f}  "
        f"Votes {int(votes)}"
    )


Top 20 noir / neo-noir movies (vote_count ≥ 50):

 1. Leon: The Professional                   (1994)  Rating 8.20  Votes 4293
 2. Sunset Boulevard                         (1950)  Rating 8.20  Votes 533
 3. High and Low                             (1963)  Rating 8.20  Votes 123
 4. Le Trou                                  (1960)  Rating 8.20  Votes 73
 5. Memento                                  (2000)  Rating 8.10  Votes 4168
 6. Vertigo                                  (1958)  Rating 8.00  Votes 1162
 7. Double Indemnity                         (1944)  Rating 8.00  Votes 425
 8. Blade Runner                             (1982)  Rating 7.90  Votes 3833
 9. Casablanca                               (1942)  Rating 7.90  Votes 1462
10. Chinatown                                (1974)  Rating 7.90  Votes 939
11. The Third Man                            (1949)  Rating 7.90  Votes 431
12. The Night of the Hunter                  (1955)  Rating 7.90  Votes 335
13. Diabolique                    

In [27]:
# Task 8 — Top 20 director–actor pairs (≥3 collaborations; vote_count ≥ 100)


def _person_value(key):
    """Build a getter that reads ``key`` from a cast/crew dict (None for non-dicts)."""
    return lambda person: person.get(key) if isinstance(person, dict) else None


def _directors_of(crew):
    """Crew dicts whose ``job`` is exactly "Director"."""
    return [member for member in crew if isinstance(member, dict) and member.get("job") == "Director"]


# Both nested columns are needed as lists of dicts.
for _nested_col in ("cast", "crew"):
    credits[_nested_col] = credits[_nested_col].apply(safe_parse_list)

# Numeric ids on both sides of the join (invalid ids become NaN).
movies_metadata["id"] = pd.to_numeric(movies_metadata["id"], errors="coerce")
credits["id"] = pd.to_numeric(credits["id"], errors="coerce")

merged = credits.merge(
    movies_metadata[["id", "vote_average", "vote_count", "revenue"]],
    how="inner",
    on="id",
)

# Only movies with at least 100 votes take part.
merged = merged.loc[merged["vote_count"] >= 100].copy()

# Directors are picked out of the crew list for each movie.
merged["directors"] = merged["crew"].apply(_directors_of)

# Exploding directors and then cast yields one row per (movie, director, cast
# member). Empty lists explode to NaN and are dropped before ids and names are
# extracted; rows missing any of the four fields are dropped afterwards.
merged = (
    merged
    .explode("directors", ignore_index=True)
    .explode("cast", ignore_index=True)
    .dropna(subset=["directors", "cast"])
    .assign(
        director_id=lambda frame: frame["directors"].apply(_person_value("id")),
        director=lambda frame: frame["directors"].apply(_person_value("name")),
        actor_id=lambda frame: frame["cast"].apply(_person_value("id")),
        actor=lambda frame: frame["cast"].apply(_person_value("name")),
    )
    .dropna(subset=["director_id", "actor_id", "director", "actor"])
)

# Per pair: distinct movies, plus row-wise means of rating and revenue.
_pair_keys = ["director_id", "director", "actor_id", "actor"]
pair_stats = (
    merged
    .groupby(_pair_keys, dropna=True)
    .agg(
        films=("id", "nunique"),
        mean_vote_average=("vote_average", "mean"),
        mean_revenue=("revenue", "mean"),
    )
    .reset_index()
)

# Pairs with at least 3 films.
pair_stats = pair_stats.loc[pair_stats["films"] >= 3]

# Highest mean rating first, more films as tie-breaker, top 20.
pair_stats = (
    pair_stats
    .sort_values(by=["mean_vote_average", "films"], ascending=[False, False])
    .head(20)
    .reset_index(drop=True)
)

print("Top 20 Director–Actor Pairs (≥3 collaborations; vote_count ≥100):\n")
for rank, (director, actor, films, avg_vote, avg_revenue) in enumerate(
    zip(
        pair_stats["director"],
        pair_stats["actor"],
        pair_stats["films"],
        pair_stats["mean_vote_average"],
        pair_stats["mean_revenue"],
    ),
    start=1,
):
    print(
        f"{rank:2d}.  Director: {director:<25}  "
        f"Actor: {actor:<25}  "
        f"Films: {int(films):2d}  "
        f"Avg Vote: {avg_vote:.2f}  "
        f"Avg Revenue: ${avg_revenue:.0f}"
    )


Top 20 Director–Actor Pairs (≥3 collaborations; vote_count ≥100):

 1.  Director: Akira Kurosawa             Actor: Eijirô Tôno                Films:  3  Avg Vote: 8.13  Avg Revenue: $90614
 2.  Director: Akira Kurosawa             Actor: Minoru Itô                 Films:  3  Avg Vote: 8.13  Avg Revenue: $90614
 3.  Director: Akira Kurosawa             Actor: Haruo Suzuki               Films:  3  Avg Vote: 8.13  Avg Revenue: $90614
 4.  Director: Charlie Chaplin            Actor: Hank Mann                  Films:  3  Avg Vote: 8.13  Avg Revenue: $6506394
 5.  Director: Quentin Tarantino          Actor: Harvey Keitel              Films:  3  Avg Vote: 8.10  Avg Revenue: $182573606
 6.  Director: Francis Ford Coppola       Actor: John Cazale                Films:  3  Avg Vote: 8.10  Avg Revenue: $99009751
 7.  Director: Akira Kurosawa             Actor: Atsushi Watanabe           Films:  3  Avg Vote: 8.10  Avg Revenue: $109027
 8.  Director: Akira Kurosawa             Actor: Toranosuke Og

In [28]:
# Task 9 — non-English-language movies with US involvement, counted per language

# Both production columns must be lists of dicts before they are inspected.
for _production_col in ("production_countries", "production_companies"):
    movies_metadata[_production_col] = movies_metadata[_production_col].apply(safe_parse_list)

# --- Filter movies ---
def has_us_involvement(row):
    """Tell whether a movie row names "United States of America" as a producer.

    Parameters
    ----------
    row : mapping
        Must provide ``production_countries`` and ``production_companies``,
        each an iterable whose dict entries may carry a ``name``.

    Returns
    -------
    bool
        True when any dict entry in either column has the name
        "United States of America". Non-dict entries are ignored.
    """
    def _names(entries):
        return [entry.get("name") for entry in entries if isinstance(entry, dict)]

    # NOTE(review): the company list is compared against a *country* name —
    # confirm that this is the intended test for company-side US involvement.
    credited = _names(row["production_countries"]) + _names(row["production_companies"])
    return "United States of America" in credited

# Movies whose original language is not English and that pass the US check.
_is_non_english = movies_metadata["original_language"] != "en"
_has_us = movies_metadata.apply(has_us_involvement, axis=1)
filtered = movies_metadata.loc[_is_non_english & _has_us].copy()

# Per language: number of movies and the first available title as an example.
result = (
    filtered
    .groupby("original_language", dropna=True)
    .agg(
        count=("id", "count"),
        example_title=("title", "first"),
    )
    .reset_index()
)

# Ten most frequent languages.
result = result.sort_values(by="count", ascending=False).head(10)

print("Top 10 non-English original languages with US involvement:\n")
print(result.to_string(index=False))


Top 10 non-English original languages with US involvement:

original_language  count       example_title
               fr    111    Wings of Courage
               es     71        Bitter Sugar
               it     55   Frankie Starlight
               de     51          Cold Fever
               ja     30       Godzilla 1985
               xx     14      Quest for Fire
               pt     14           Senseless
               nl     12    Come On, Rangers
               ru     11           Dark Eyes
               zh     11 Eat Drink Man Woman


In [30]:
# Task 10 — per-user rating statistics: genre diversity and rating variance

movies_metadata["genres"] = movies_metadata["genres"].apply(safe_parse_list)

# Every join key must be numeric; values that cannot be parsed become NaN.
for _frame, _key in (
    (ratings, "movieId"),
    (links, "movieId"),
    (links, "tmdbId"),
    (movies_metadata, "id"),
):
    _frame[_key] = pd.to_numeric(_frame[_key], errors="coerce")

# ratings → links (on movieId) → movie metadata (tmdbId = id). Left joins keep
# every rating even when no link or metadata row matches.
merged = (
    ratings
    .merge(links, how="left", on="movieId")
    .merge(movies_metadata, how="left", left_on="tmdbId", right_on="id")
)

# --- Extract genre names safely ---
def extract_genre_names(val):
    """Return the non-empty genre names found in ``val``.

    ``val`` may be a list of genre dicts or a string holding a Python-literal
    list of such dicts. Anything else (including a string that fails to parse
    or that parses to a non-list) yields ``[]``. Entries that are not dicts,
    or whose ``name`` is missing or falsy, are skipped.
    """
    if isinstance(val, str):
        try:
            val = ast.literal_eval(val)
        except Exception:
            return []
    if not isinstance(val, list):
        return []
    return [entry.get("name") for entry in val if isinstance(entry, dict) and entry.get("name")]

def _flatten(list_of_lists):
    """Concatenate an iterable of lists into one flat list."""
    return [item for inner in list_of_lists for item in inner]


# Flat list of genre names for every rating row.
merged["genre_names"] = merged["genres"].apply(extract_genre_names)

# Rows without a rating value are excluded from the statistics.
merged = merged.dropna(subset=["rating"])

# Per user: how many ratings, their mean, the raw ratings, and every genre
# name seen across the rated movies (duplicates kept at this stage).
user_stats = (
    merged
    .groupby("userId", dropna=True)
    .agg(
        ratings_count=("rating", "count"),
        mean_rating=("rating", "mean"),
        ratings_list=("rating", list),
        genre_lists=("genre_names", _flatten),
    )
    .reset_index()
)

# --- Compute population variance ---
def pop_variance(ratings):
    """Population variance (divisor N) of ``ratings`` as a Python float.

    Returns NaN when ``ratings`` is empty.
    """
    samples = np.array(ratings, dtype=float)
    if len(samples) == 0:
        return np.nan
    return float(samples.var())

# Derived per-user metrics: rating variance and number of distinct genres.
user_stats["variance"] = user_stats["ratings_list"].apply(pop_variance)
user_stats["distinct_genres"] = user_stats["genre_lists"].apply(lambda names: len(set(names)))

# Only users with at least 20 ratings are ranked.
user_stats = user_stats.loc[user_stats["ratings_count"] >= 20].copy()

# Ranking 1: most distinct genres, more ratings as tie-breaker.
top_genre_diverse = (
    user_stats
    .sort_values(["distinct_genres", "ratings_count"], ascending=[False, False])
    .head(10)
    .reset_index(drop=True)
)

# Ranking 2: highest rating variance.
top_high_variance = (
    user_stats
    .sort_values(["variance"], ascending=[False])
    .head(10)
    .reset_index(drop=True)
)

# Both rankings are printed with the same four columns.
_report_columns = ["userId", "ratings_count", "distinct_genres", "variance"]
for _heading, _table in (
    ("\nTop 10 Most Genre-Diverse Users:\n", top_genre_diverse),
    ("\nTop 10 Highest-Variance Users:\n", top_high_variance),
):
    print(_heading)
    print(_table[_report_columns].to_string(index=False))



Top 10 Most Genre-Diverse Users:

 userId  ratings_count  distinct_genres  variance
  59908           2391               20  1.151955
 208730           1868               20  1.372515
 186783           1735               20  1.054376
 173274           1340               20  1.033055
 137664           1291               20  0.584818
 208501            726               20  0.712015
 266969            604               20  0.873284
  25872            323               20  0.583141
 126867            296               20  0.710270
 224621           1700               19  1.584291

Top 10 Highest-Variance Users:

 userId  ratings_count  distinct_genres  variance
  59142             36               14  3.479167
 179170             21               13  3.354875
 257799             53               14  3.060164
  52261             32               12  2.968750
 180286            140               19  2.871224
 183695             46               17  2.820416
  85200             20          