In [16]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import scipy.stats as stats

# Load data
df = pd.read_csv("movies_after_2000_final.csv")

# === Data Preprocessing ===
# Shorten the verbose CSV headers to the names used throughout the analysis.
# 'Genre' already has the desired name, so it needs no entry here.
df = df.rename(columns={
    'Budget (USD)': 'Budget',
    'Revenue (USD)': 'Revenue',
    'IMDb Rating': 'Rating',
})

# Keep only rows that have every field the analysis relies on.
df = df.dropna(subset=['Budget', 'Revenue', 'Rating', 'Genre'])

# Exclude rows whose Genre is "western" (case-insensitive). The explicit
# .copy() makes the result an independent frame, so the ROI column assignment
# below never writes into a slice of the previous frame.
df = df[df['Genre'].str.lower() != 'western'].copy()

# Create ROI = (Revenue - Budget) / Budget.
# A zero budget would otherwise yield +/-inf (or NaN for 0/0) and distort any
# later aggregate over ROI; mapping zero budgets to NaN keeps those rows in
# the frame while leaving their ROI undefined.
nonzero_budget = df['Budget'].replace(0, np.nan)
df['ROI'] = (df['Revenue'] - df['Budget']) / nonzero_budget

# ==========================
# 📊 1. Interactive Scatter Plots with Trendlines
# ==========================

def plot_scatter(x, y, title):
    """Display an interactive scatter plot of two columns of the global `df`.

    Args:
        x: Name of the `df` column plotted on the horizontal axis.
        y: Name of the `df` column plotted on the vertical axis.
        title: Figure title.

    The figure includes an OLS trendline and uses the "Title" column as the
    hover label. Nothing is returned; the figure is shown immediately.
    """
    marker_style = dict(size=7, opacity=0.6)
    px.scatter(
        df,
        x=x,
        y=y,
        title=title,
        hover_name="Title",
        trendline="ols",
        template='plotly',
    ).update_traces(marker=marker_style).show()

# --- Budget vs. rating: Pearson correlation test at the 5% level ---
plot_scatter("Budget", "Rating", "Budget vs IMDb Rating")
print("Alternative Hypothesis: Budget affects IMDb rating.")
corr, pval = stats.pearsonr(df["Budget"], df["Rating"])
print(f"Correlation: r = {corr:.2f}, p = {pval:.2e}")
if pval < 0.05:
    # The direction of the conclusion must follow the sign of r rather than
    # being assumed in advance.
    if corr < 0:
        print("✅ Conclusion: IMDb rating decreases as budget increases to huge numbers")
    else:
        print("✅ Conclusion: IMDb rating increases as budget increases")
else:
    print("❌ No significant evidence found.")
print(" ")

# --- Revenue vs. rating: Pearson correlation test at the 5% level ---
plot_scatter("Revenue", "Rating", "Revenue vs IMDb Rating")
corr, pval = stats.pearsonr(df["Revenue"], df["Rating"])
print(f"Alternative Hypothesis: Revenue affects IMDb rating.\nCorrelation: r = {corr:.2f}, p = {pval:.2e}")
if pval < 0.05:
    # Report an actual conclusion (with its direction) for a significant result.
    direction = "increases" if corr > 0 else "decreases"
    print(f"✅ Conclusion: IMDb rating {direction} as revenue increases.")
else:
    print("❌ We fail to reject the null hypothesis, revenue has no significant effect on rating.")
print(" ")

# ==========================
# 2. Genre-Based Bar Charts
# ==========================

def genre_bar(metric, ylabel, title):
    """Plot the per-genre mean of `metric` and run a one-way ANOVA across genres.

    Args:
        metric: Name of the numeric column of the global `df` to analyse.
        ylabel: Label for the bar chart's value axis.
        title: Figure title.

    Shows a bar chart of genre means sorted in descending order, then prints
    the ANOVA F statistic, its p-value, and a verdict at the 5% level.
    Nothing is returned.
    """
    # Group once and reuse the grouping for both the chart and the test.
    by_genre = df.groupby("Genre", observed=False)[metric]

    genre_means = by_genre.mean().sort_values(ascending=False)
    fig = px.bar(genre_means, x=genre_means.index, y=genre_means.values,
                 title=title, labels={'x': 'Genre', 'y': ylabel}, template='plotly')
    fig.show()

    # ANOVA Test
    # Drop missing values per group so the test sees the same observations
    # that .mean() (which skips NaN) used for the chart.
    groups = [values.dropna().values for _, values in by_genre]
    f_stat, p_val = stats.f_oneway(*groups)
    print(f"ANOVA for {metric} by Genre — F = {f_stat:.2f}, p = {p_val:.2e}")
    # Branch on the whole message: a conditional expression inside the print
    # call would only cover the last argument and leave "✅ <metric>" printed
    # in front of the negative verdict.
    if p_val < 0.05:
        print("✅", metric, "among genres differs significantly.")
    else:
        print("❌ No significant genre-based differences.")
    print(" ")

# Run the genre comparison for each metric: (column, axis label, chart title).
for _metric, _ylabel, _title in (
    ("Rating", "Average Rating", "Average IMDb Rating by Genre"),
    ("Budget", "Average Budget", "Average Budget by Genre"),
    ("Revenue", "Average Revenue", "Average Revenue by Genre"),
):
    genre_bar(_metric, _ylabel, _title)


Alternative Hypothesis: Budget affects IMDb rating.
Correlation: r = -0.21, p = 4.03e-11
✅ Conclusion: IMDb rating decreases as budget increases to huge numbers
 


Alternative Hypothesis: Revenue affects IMDb rating.
Correlation: r = 0.03, p = 3.27e-01
❌ We fail to reject the null hypothesis, revenue has no significant effect on rating.
 


ANOVA for Rating by Genre — F = 13.57, p = 1.38e-22
✅ Rating among genres differs significantly.
 


ANOVA for Budget by Genre — F = 49.59, p = 1.74e-80
✅ Budget among genres differs significantly.
 


ANOVA for Revenue by Genre — F = 22.42, p = 4.08e-38
✅ Revenue among genres differs significantly.
 
