In [None]:
# Collapse repeated (Film, Year) rows so each movie contributes a single row
# to the per-movie charts below; the first occurrence of each pair is kept.
dedup_keys = ["Film", "Year"]
movies_deduped = best_picture_all_clean.drop_duplicates(subset=dedup_keys)
# Report how many rows the de-duplication removed.
print(f"Original rows: {len(best_picture_all_clean)}, Deduplicated: {len(movies_deduped)}")


In [None]:
import matplotlib.pyplot as plt

# Only movies that have both a rating and a box-office figure can be placed
# on the chart, so rows missing either value are dropped first.
scatter_data = movies_deduped.dropna(subset=["imdbRating", "BoxOffice"])

# One point per movie: rating on the x-axis, revenue on the y-axis.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(scatter_data["imdbRating"], scatter_data["BoxOffice"], alpha=0.6)
ax.set(
    title="Box Office Revenue vs IMDb Rating (Per Movie)",
    xlabel="IMDb Rating",
    ylabel="Box Office Revenue (USD)",
)
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Genre indicator columns that are summed per year for the stacked area chart.
genre_columns = [
    "Action", "Adventure", "Animation", "Comedy", "Crime", "Documentary",
    "Drama", "Family", "Fantasy", "History", "Horror", "Music", "Mystery",
    "Romance", "Science Fiction", "TV Movie", "Thriller", "War", "Western"
]

# Sum every genre column within each year and order the result chronologically.
# NOTE(review): this aggregates best_picture_all_clean, not the de-duplicated
# frame — confirm that repeated (Film, Year) rows are meant to be counted here.
genre_by_year = best_picture_all_clean.groupby("Year")[genre_columns].sum().sort_index()

# DataFrame.plot opens its own figure when no `ax` is supplied, so calling
# plt.figure() beforehand left a blank extra figure in the output. Create the
# axes once and hand them to pandas instead.
fig, ax = plt.subplots(figsize=(14, 7))
genre_by_year.plot(kind="area", stacked=True, alpha=0.85, ax=ax)
ax.set_title("Genre Distribution of Best Picture Films Over Time")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Films (Genre-Normalized)")
# Anchor the legend just outside the right edge so it does not cover the data.
ax.legend(loc="upper left", bbox_to_anchor=(1.0, 1.0))
fig.tight_layout()
plt.show()


In [None]:
# Mean rating and mean box office per Status group; a group whose mean is NaN
# in either column is dropped.
winner_group = movies_deduped.groupby("Status")[["imdbRating", "BoxOffice"]].mean().dropna()

# The two metrics are on very different scales (presumably ratings around 10
# versus revenue in USD), so on a shared y-axis the rating bars are invisible.
# Draw each metric on its own panel with its own y-axis instead.
fig, (ax_rating, ax_revenue) = plt.subplots(1, 2, figsize=(10, 6))

winner_group["imdbRating"].plot(kind="bar", ax=ax_rating, rot=0)
ax_rating.set_title("Average IMDb Rating")
ax_rating.set_ylabel("Average IMDb Rating")
ax_rating.grid(axis="y")

winner_group["BoxOffice"].plot(kind="bar", ax=ax_revenue, rot=0, color="tab:orange")
ax_revenue.set_title("Average Box Office")
ax_revenue.set_ylabel("Average Box Office Revenue (USD)")
ax_revenue.grid(axis="y")

fig.suptitle("Average IMDb Rating and Box Office: Winner vs Nominee (Per Movie)")
# Reserve a strip at the top so the figure-level title does not overlap the panels.
fig.tight_layout(rect=(0, 0, 1, 0.95))
plt.show()


In [None]:
# Keep one row per (Film, Year) pair so that no movie is counted twice in the
# per-movie charts that follow.
dedup_keys = ["Film", "Year"]
movies_deduped = best_picture_all_clean.drop_duplicates(subset=dedup_keys)
# Show the row counts before and after de-duplication.
print(f"Original rows: {len(best_picture_all_clean)}, Deduplicated: {len(movies_deduped)}")


In [None]:
import matplotlib.pyplot as plt

# Drop movies missing either value; they cannot be positioned on the chart.
scatter_data = movies_deduped.dropna(subset=["imdbRating", "BoxOffice"])

# Rating on the x-axis, revenue on the y-axis, one point per movie.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(scatter_data["imdbRating"], scatter_data["BoxOffice"], alpha=0.6)
ax.set(
    title="Box Office Revenue vs IMDb Rating (Per Movie)",
    xlabel="IMDb Rating",
    ylabel="Box Office Revenue (USD)",
)
ax.grid(True)
fig.tight_layout()
# Write the figure to disk before showing it.
fig.savefig("box_office_vs_rating.png")
plt.show()


In [None]:
# Genre indicator columns that are summed per year for the stacked area chart.
genre_columns = [
    "Action", "Adventure", "Animation", "Comedy", "Crime", "Documentary",
    "Drama", "Family", "Fantasy", "History", "Horror", "Music", "Mystery",
    "Romance", "Science Fiction", "TV Movie", "Thriller", "War", "Western"
]

# Sum every genre column within each year and order the result chronologically.
# NOTE(review): this aggregates best_picture_all_clean, not the de-duplicated
# frame — confirm that repeated (Film, Year) rows are meant to be counted here.
genre_by_year = best_picture_all_clean.groupby("Year")[genre_columns].sum().sort_index()

# DataFrame.plot opens its own figure when no `ax` is supplied, so calling
# plt.figure() beforehand left a blank extra figure in the output. Create the
# axes once and hand them to pandas instead.
fig, ax = plt.subplots(figsize=(14, 7))
genre_by_year.plot(kind="area", stacked=True, alpha=0.85, ax=ax)
ax.set_title("Genre Distribution of Best Picture Films Over Time")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Films (Genre-Normalized)")
# Anchor the legend just outside the right edge so it does not cover the data.
ax.legend(loc="upper left", bbox_to_anchor=(1.0, 1.0))
fig.tight_layout()
# Save through the figure handle so the chart itself is what gets written.
fig.savefig("genre_distribution_over_time.png")
plt.show()


In [None]:
# Mean rating and mean box office per Status group; a group whose mean is NaN
# in either column is dropped.
winner_group = movies_deduped.groupby("Status")[["imdbRating", "BoxOffice"]].mean().dropna()

# The two metrics are on very different scales (presumably ratings around 10
# versus revenue in USD), so on a shared y-axis the rating bars are invisible.
# Draw each metric on its own panel with its own y-axis instead.
fig, (ax_rating, ax_revenue) = plt.subplots(1, 2, figsize=(10, 6))

winner_group["imdbRating"].plot(kind="bar", ax=ax_rating, rot=0)
ax_rating.set_title("Average IMDb Rating")
ax_rating.set_ylabel("Average IMDb Rating")
ax_rating.grid(axis="y")

winner_group["BoxOffice"].plot(kind="bar", ax=ax_revenue, rot=0, color="tab:orange")
ax_revenue.set_title("Average Box Office")
ax_revenue.set_ylabel("Average Box Office Revenue (USD)")
ax_revenue.grid(axis="y")

fig.suptitle("Average IMDb Rating and Box Office: Winner vs Nominee (Per Movie)")
# Reserve a strip at the top so the figure-level title does not overlap the panels.
fig.tight_layout(rect=(0, 0, 1, 0.95))
fig.savefig("winner_vs_nominee.png")
plt.show()


In [None]:
import seaborn as sns

# Reshape to long form: one row per (original row, genre column) carrying that
# row's rating, then keep only the pairs where the genre flag equals 1.
# NOTE(review): the == 1 filter assumes the genre columns hold 0/1 indicators —
# confirm; fractional values would be excluded.
genre_ratings = (
    best_picture_all_clean
    .melt(
        id_vars=["imdbRating"],
        value_vars=genre_columns,
        var_name="Genre",
        value_name="IsPresent",
    )
    .loc[lambda long_df: long_df["IsPresent"] == 1]
)

# One box per genre showing the spread of IMDb ratings.
fig, ax = plt.subplots(figsize=(16, 6))
sns.boxplot(data=genre_ratings, x="Genre", y="imdbRating", ax=ax)
ax.set_title("IMDb Ratings by Genre (Genre-Normalized)")
# Tilt the genre names so neighbouring labels do not collide.
plt.xticks(rotation=45)
fig.tight_layout()
fig.savefig("genre_vs_rating_boxplot.png")
plt.show()


In [None]:
# Rows whose Status is "Winner" and that have both a year and a rating.
# A line plot joins points in row order, so sort chronologically first;
# otherwise the line doubles back on itself whenever rows are out of order.
winners = (
    movies_deduped[movies_deduped["Status"] == "Winner"]
    .dropna(subset=["Year", "imdbRating"])
    .sort_values("Year")
)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(winners["Year"], winners["imdbRating"], marker="o", linestyle="-")
ax.set_title("IMDb Rating of Best Picture Winners Over Time")
ax.set_xlabel("Year")
ax.set_ylabel("IMDb Rating")
ax.grid(True)
fig.tight_layout()
fig.savefig("winner_timeline.png")
plt.show()
