# Pixar Films Exploration

## Imports and Settings

In [None]:
# Helper libraries
import warnings

# Scientific and visual libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import missingno as msno

%load_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Notebook-wide settings
warnings.filterwarnings("ignore")  # silences every warning for the whole session
np.set_printoptions(precision=4)
sns.set_theme()

# pandas display options, applied one by one
_pandas_display_options = {
    "display.max_rows": 120,
    "display.max_colwidth": 40,
    "display.precision": 4,
    "display.max_columns": None,
}
for _option, _value in _pandas_display_options.items():
    pd.set_option(_option, _value)

## Loading Datasets

We will leave aside the Pixar people dataset because including it complicates the preprocessing and analysis tasks.

In [None]:
from pixarfilms import load_dataset

In [None]:
# Tables to load from the pixarfilms package
names = (
    "academy",
    "genres",
    "films",
    "public_response",
    "box_office",
)

# One table per name, keyed by table name
# NOTE(review): `cache=True` presumably reuses a previously loaded copy — confirm in pixarfilms
pixar_datasets = {table: load_dataset(table, cache=True) for table in names}

## Merging and Wrangling

In [None]:
# Rename `plot` to `scenario` on the films table (in place, so the entry stored
# in `pixar_datasets` carries the new column name as well).
pixar_datasets["films"].rename(columns={"plot": "scenario"}, inplace=True)

# Attach public-response and box-office columns to every film (left joins keep
# all rows of the films table).
merged_df = pixar_datasets["films"].merge(
    pixar_datasets["public_response"], on="film", how="left"
)
merged_df = merged_df.merge(pixar_datasets["box_office"], on="film", how="left")

# Collapse the long genres table (one row per film/genre) into a single
# comma-separated `genres` string per film.
genres_aggregated = (
    pixar_datasets["genres"]
    .groupby("film")["value"]
    .agg(", ".join)
    .rename("genres")
    .reset_index()
)
merged_df = merged_df.merge(genres_aggregated, on="film", how="left")

# Count academy entries per film and status, keeping only nominations and wins.
# `reindex` guarantees that the three status columns exist (filled with 0) even
# if one of those statuses never occurs in the academy table.
status_counts = (
    pixar_datasets["academy"]
    .groupby(["film", "status"])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=["Nominated", "Won", "Won Special Achievement"], fill_value=0)
)

# Build the final award table in one chain instead of assigning onto a sliced
# frame: special-achievement awards are counted as wins.
award_counts = (
    status_counts.assign(
        n_wins=status_counts["Won"] + status_counts["Won Special Achievement"]
    )
    .drop(columns=["Won", "Won Special Achievement"])
    .rename(columns={"Nominated": "n_nominations"})
    .rename_axis(columns=None)
    .reset_index()
)

pixar = merged_df.merge(award_counts, on="film", how="left")

# A film with no row in the academy table has no recorded nomination or win:
# store 0 instead of the NaN produced by the left join.
award_columns = ["n_nominations", "n_wins"]
pixar[award_columns] = pixar[award_columns].fillna(0).astype(int)

In [None]:
# Preview the first rows of the merged table
pixar.head()

## Missing Data Imputation

In [None]:
# Draw the missingno matrix of the merged table to spot missing values
msno.matrix(pixar, fontsize=20, sparkline=False, figsize=(8, 4))
plt.show()

There are two columns with missing values: `cinema_score` and `budget`.

In [None]:
# Films whose `cinema_score` is missing
pixar.loc[pixar.cinema_score.isna(), ["film"]]

In [None]:
# Films whose `budget` is missing
pixar.loc[pixar.budget.isna(), ["film"]]

It would be easy to ignore them, but imputing these values is straightforward.

**Budget**:<br>
For Luca, a budget estimate of 120–160 million USD is reasonable, based on IMDb and expert estimates from various forums. We use the lower bound of the range (120 million USD) for now.

**CinemaScore**:<br>
Luca, Soul, and Turning Red did not receive official CinemaScore grades because these films were released directly to Disney+ during the COVID-19 pandemic. While we don't have their CinemaScore data, we can note that these films were generally well received by audiences:
- _Turning Red_ has an IMDb rating of 6.9 out of 10.
- _Luca_ has an IMDb rating of 7.4 out of 10.
- _Soul_ has an IMDb rating of 8.0 out of 10.

Thus, it is possible to impute them using IMDb audience scores as a proxy for audience reception.

In [None]:
# Impute on a copy so that `pixar` keeps the values as merged
imputed_pixar = pixar.copy()

In [None]:
# Budget for "Luca": lower bound of the estimated range
luca_rows = imputed_pixar["film"].eq("Luca")
imputed_pixar.loc[luca_rows, "budget"] = 120_000_000

# CinemaScore grades chosen from the IMDb ratings:
#   Turning Red (6.9) -> A-
#   Luca        (7.4) -> A-
#   Soul        (8.0) -> A
cinema_score_imputations = {"Turning Red": "A-", "Luca": "A-", "Soul": "A"}

# Write each grade into the row(s) of the matching film
for film, score in cinema_score_imputations.items():
    film_rows = imputed_pixar["film"].eq(film)
    imputed_pixar.loc[film_rows, "cinema_score"] = score

## Pixar Data Analysis

In [None]:
# The analysis works on its own copy of the imputed data
df = imputed_pixar.copy()

### Performance Over Time

#### Two-panel: Box Office & Market Trend

The figure consists of two panels:
- The top panel shows stacked bar charts for US/Canada and international box office revenues (in millions USD) along with a trend line for the total revenue.
- The bottom panel displays an area chart representing the evolution of domestic and international market share percentages.

In [None]:
def plot_pixar_films_performance(df: pd.DataFrame) -> go.Figure:
    """
    Generate an interactive Plotly figure that visualizes Pixar films' box office performance and
    market split evolution over time.

    The top panel stacks US/Canada and international revenue bars (in millions USD) and overlays
    a 3-film rolling mean of the worldwide revenue. The bottom panel shows the domestic and
    international shares of the worldwide revenue as filled areas.

    Parameters
    ----------
    df : pd.DataFrame
        One row per film. The columns read are ``film``, ``release_date``,
        ``box_office_us_canada``, ``box_office_other``, ``box_office_worldwide`` and
        ``rotten_tomatoes_score``. The frame is not modified.

    Returns
    -------
    go.Figure
        The two-row figure (1200 x 900 px).
    """
    # Sort chronologically (on a new frame, so the caller's data is left untouched): the rolling
    # trend and the area lines are only meaningful when rows follow the release order.
    films = df.sort_values("release_date", key=pd.to_datetime, kind="stable")

    release_dates = films["release_date"]
    worldwide = films["box_office_worldwide"]

    # Revenues in millions USD and percentage split of the worldwide revenue, kept as local
    # Series instead of being written back as new columns.
    domestic_musd = films["box_office_us_canada"] / 1e6
    international_musd = films["box_office_other"] / 1e6
    total_musd = worldwide / 1e6
    domestic_pct = films["box_office_us_canada"] / worldwide * 100
    international_pct = films["box_office_other"] / worldwide * 100

    # Create a subplot with two rows: Box Office and Market Split Evolution
    fig = make_subplots(
        rows=2,
        cols=1,
        row_heights=[0.7, 0.3],
        vertical_spacing=0.1,
        subplot_titles=("Box Office Performance", "Market Split Evolution"),
    )

    # Top Panel: Box Office Performance
    # Domestic revenue bar chart
    fig.add_trace(
        go.Bar(
            name="US/Canada",
            x=release_dates,
            y=domestic_musd,
            marker_color="#FF9999",
            hovertemplate=(
                "<b>%{text}</b><br>"
                "US/Canada: $%{y:.0f}M<br>"
                "Release: %{x|%B %Y}<br>"
                "<extra></extra>"
            ),
            text=films["film"],
        ),
        row=1,
        col=1,
    )

    # International revenue bar chart (its hover also carries the Rotten Tomatoes score)
    fig.add_trace(
        go.Bar(
            name="International",
            x=release_dates,
            y=international_musd,
            marker_color="#66B2FF",
            hovertemplate=(
                "<b>%{text}</b><br>"
                "International: $%{y:.0f}M<br>"
                "Release: %{x|%B %Y}<br>"
                "Rotten Tomatoes: %{customdata}%<extra></extra>"
            ),
            text=films["film"],
            customdata=films["rotten_tomatoes_score"],
        ),
        row=1,
        col=1,
    )

    # Trend line for total revenue (3-film rolling average; min_periods=1 so the first two
    # films still get a value)
    fig.add_trace(
        go.Scatter(
            name="Total Revenue Trend",
            x=release_dates,
            y=total_musd.rolling(window=3, min_periods=1).mean(),
            line=dict(color="#2E4053", dash="dot"),
            hovertemplate="Trend: $%{y:.0f}M<extra></extra>",
        ),
        row=1,
        col=1,
    )

    # Bottom Panel: Market Split Evolution
    # International market share area chart
    fig.add_trace(
        go.Scatter(
            name="International %",
            x=release_dates,
            y=international_pct,
            mode="lines",
            fill="tonexty",
            line=dict(width=0.5),
            fillcolor="rgba(102, 178, 255, 0.5)",
            hovertemplate="International: %{y:.1f}%<extra></extra>",
        ),
        row=2,
        col=1,
    )

    # Domestic market share area chart
    fig.add_trace(
        go.Scatter(
            name="Domestic %",
            x=release_dates,
            y=domestic_pct,
            mode="lines",
            fill="tozeroy",
            line=dict(width=0.5),
            fillcolor="rgba(255, 153, 153, 0.5)",
            hovertemplate="Domestic: %{y:.1f}%<extra></extra>",
        ),
        row=2,
        col=1,
    )

    fig.update_layout(
        title={
            "text": "Pixar Films: Box Office Performance and Market Distribution (1995-2024)",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
            "font": {"size": 24},
        },
        barmode="stack",
        plot_bgcolor="white",
        width=1200,
        height=900,
        showlegend=True,
        legend={"yanchor": "top", "y": 0.99, "xanchor": "left", "x": 0.01},
    )

    # Light frame around every panel
    fig.update_xaxes(showline=True, linewidth=1, linecolor="lightgray", mirror=True)
    fig.update_yaxes(showline=True, linewidth=1, linecolor="lightgray", mirror=True)

    fig.update_yaxes(title_text="Box Office Revenue (Millions USD)", row=1, col=1)
    fig.update_yaxes(title_text="Market Share (%)", row=2, col=1)

    return fig

In [None]:
# Build the two-panel box-office figure; as the last expression of the cell it is displayed
plot_pixar_films_performance(df)

#### Two-Panel: Rating Over Time & Score Distribution

In [None]:
def plot_critical_reception(df: pd.DataFrame) -> go.Figure:
    """
    Generate an interactive Plotly figure that analyzes the critical reception of Pixar films over time.

    The top panel plots the Rotten Tomatoes, IMDB (rescaled to 0-100) and Metacritic scores per
    film, with marker sizes proportional to the number of reviews, plus a centered 3-film rolling
    mean of the per-film average score. The bottom panel shows one box plot per rating system.
    The films with the highest and lowest average score are annotated.

    Parameters
    ----------
    df : pd.DataFrame
        One row per film. The columns read are ``film``, ``release_date``,
        ``rotten_tomatoes_score``, ``rotten_tomatoes_counts``, ``imdb_score``, ``imdb_counts``,
        ``metacritic_score`` and ``metacritic_counts``. The frame is not modified.

    Returns
    -------
    go.Figure
        The two-row figure (1200 x 900 px).
    """
    # Sort chronologically on a new frame: the lines and the rolling mean assume release order,
    # and the caller's DataFrame must not receive the helper columns added below.
    films = df.sort_values("release_date", key=pd.to_datetime, kind="stable")

    # Scale the IMDB score (0-10) to the 0-100 range used by the other two systems
    films = films.assign(imdb_score_100=films["imdb_score"] * 10)

    # Average score across the three rating systems
    films = films.assign(
        avg_score=films[
            ["rotten_tomatoes_score", "imdb_score_100", "metacritic_score"]
        ].mean(axis=1)
    )

    # Only the single best and single worst film are annotated
    best_film = films.nlargest(1, "avg_score")
    worst_film = films.nsmallest(1, "avg_score")

    # Create subplots
    fig = make_subplots(
        rows=2,
        cols=1,
        row_heights=[0.7, 0.3],
        vertical_spacing=0.15,
        subplot_titles=("Critical Reception Over Time", "Rating Distribution"),
    )

    # Define properties for each rating system
    rating_systems = {
        "Rotten Tomatoes": {
            "color": "#FF2020",
            "scores": "rotten_tomatoes_score",
            "counts": "rotten_tomatoes_counts",
        },
        "IMDB": {
            "color": "#F5C518",
            "scores": "imdb_score_100",
            "counts": "imdb_counts",
        },
        "Metacritic": {
            "color": "#001D3D",
            "scores": "metacritic_score",
            "counts": "metacritic_counts",
        },
    }

    # Add one line+marker trace per rating system
    for name, info in rating_systems.items():
        review_counts = films[info["counts"]]
        fig.add_trace(
            go.Scatter(
                name=name,
                x=films["release_date"],
                y=films[info["scores"]],
                mode="lines+markers",
                line=dict(color=info["color"], width=2),
                marker=dict(
                    # Marker size between 5 and 25, relative to the most-reviewed film
                    size=(review_counts / review_counts.max() * 20) + 5,
                    opacity=0.7,
                ),
                hovertemplate=(
                    "<b>%{text}</b><br>"
                    f"{name}: %{{y:.1f}}/100<br>"
                    "Reviews: %{customdata:,}<br>"
                    "Release: %{x|%B %Y}<extra></extra>"
                ),
                text=films["film"],
                customdata=review_counts,
            ),
            row=1,
            col=1,
        )

    # Add a centered 3-film rolling average trend line (undefined for the first and last film)
    fig.add_trace(
        go.Scatter(
            name="3-Film Rolling Average",
            x=films["release_date"],
            y=films["avg_score"].rolling(window=3, center=True).mean(),
            mode="lines",
            line=dict(color="#2ECC71", width=2, dash="dash"),
            hovertemplate="Average Rating: %{y:.1f}/100<extra></extra>",
        ),
        row=1,
        col=1,
    )

    # Add box plots for each rating system
    for name, info in rating_systems.items():
        fig.add_trace(
            go.Box(
                name=name,
                y=films[info["scores"]],
                boxpoints="all",
                jitter=0.3,
                pointpos=-1.8,
                marker_color=info["color"],
                showlegend=False,
            ),
            row=2,
            col=1,
        )

    fig.update_layout(
        title={
            "text": "Pixar Films: Critical Reception Analysis (1995-2024)",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
            "font": {"size": 24},
        },
        plot_bgcolor="white",
        width=1200,
        height=900,
        showlegend=True,
        legend={"yanchor": "top", "y": 0.95, "xanchor": "right", "x": 0.99},
    )

    # Light frame around every panel
    fig.update_xaxes(showline=True, linewidth=1, linecolor="lightgray", mirror=True)
    fig.update_yaxes(showline=True, linewidth=1, linecolor="lightgray", mirror=True)

    fig.update_yaxes(title_text="Rating (0-100 Scale)", range=[50, 100], row=1, col=1)
    fig.update_yaxes(title_text="Score Distribution", range=[50, 100], row=2, col=1)

    # Annotate the best and worst film at their average score
    for film in pd.concat([best_film, worst_film]).itertuples():
        fig.add_annotation(
            x=film.release_date,
            y=film.avg_score,
            text=f"{film.film}<br>({film.avg_score:.1f}/100)",
            showarrow=True,
            arrowhead=1,
            ax=0,
            ay=-40,
        )

    return fig

In [None]:
# Build the two-panel critical-reception figure; as the last expression of the cell it is displayed
plot_critical_reception(df)

### IMDB Rating Prediction

We can check whether information about a Pixar movie is associated with its IMDB rating. To do so, we frame the problem as a regression task.

In [None]:
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

#### Regression Analysis of IMDB Score

In [None]:
# Derive the release year from the release date
df["year_release"] = pd.to_datetime(df["release_date"]).dt.year

# Feature groups
numeric_features = ["run_time", "box_office_worldwide", "budget", "n_nominations", "n_wins"]
categorical_features = ["year_release"]

# Design matrix and target
X = df[numeric_features + categorical_features]
y = df["imdb_score"]

# Numeric columns are standardized
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# The release year is one-hot encoded, i.e. one indicator column per distinct year
categorical_transformer = Pipeline(
    steps=[
        (
            "onehot",
            OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown="ignore"),
        )
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Preprocessing + ordinary least squares, fitted on the full table
model = Pipeline([("preprocessor", preprocessor), ("regressor", LinearRegression())])
model.fit(X, y)

# Coefficient table: numeric names first, then the one-hot column names, in the
# same order as the ColumnTransformer outputs them
fitted_onehot = (
    model.named_steps["preprocessor"].named_transformers_["cat"].named_steps["onehot"]
)
onehot_names = [str(column) for column in fitted_onehot.get_feature_names_out()]
feature_names = numeric_features + onehot_names

coefficients = pd.DataFrame(
    {"Feature": feature_names, "Coefficient": model.named_steps["regressor"].coef_}
)

# Largest coefficients first
result = coefficients.sort_values(by="Coefficient", ascending=False)
result = result.reset_index(drop=True)

# In-sample regression metrics (predictions are made on the data used for fitting)
y_pred = model.predict(X)
r2 = r2_score(y, y_pred)
rmse = root_mean_squared_error(y, y_pred)

#### Model Results

In [None]:
# Report the in-sample fit quality, then display the sorted coefficient table
print("Model Results:\n")
print("R-squared:", round(r2, 3))
# RMSE is in IMDB score points (the unit of the target), not dollars, so no "$" prefix
print("RMSE:", round(rmse, 3))
print("\nCoefficients:")
result

In [None]:
# Error distribution
errors = (y_pred - y)
sns.kdeplot(errors);