
## Installs



## Imports


In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# ==========================================
# Loading the dataset
# ==========================================
df = pd.read_csv("movie_metadata.csv")
df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [3]:
# ==========================================
# 1. Preprocessing
# ==========================================
print("Preprocessing data...")

# Drop unique/text identifiers
drop_cols = ['movie_title', 'plot_keywords', 'movie_imdb_link', 'color', 'aspect_ratio']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# Fill missing values for ALL numeric columns (vectorised)
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Fill categorical missing values
cat_cols = ['genres', 'director_name', 'actor_1_name', 'actor_2_name',
            'actor_3_name', 'content_rating', 'country', 'language']
for col in cat_cols:
    if col in df.columns:
        df[col] = df[col].fillna('Unknown')

# Process Genres (avoid Python loops per row)
if 'genres' in df.columns:
    df['genres'] = df['genres'].astype(str)
    # get_dummies directly on split series
    genres_expanded = df['genres'].str.get_dummies(sep='|')
    genres_expanded = genres_expanded.add_prefix('genre_')
    df = pd.concat([df.drop(columns=['genres']), genres_expanded], axis=1)

# One-hot encode others (Top 10) – use vectorised masking instead of apply
for col in ['content_rating', 'country', 'language']:
    if col in df.columns:
        vc = df[col].value_counts()
        top_cats = vc.nlargest(10).index
        df.loc[~df[col].isin(top_cats), col] = 'Other'
        df = pd.get_dummies(df, columns=[col], drop_first=True)

# Encode Top Directors (vectorised)
if 'director_name' in df.columns:
    top_directors = df['director_name'].value_counts().nlargest(10).index
    director_dummies = pd.get_dummies(df['director_name'])
    director_dummies = director_dummies.loc[:, director_dummies.columns.isin(top_directors)]
    director_dummies = director_dummies.add_prefix('director_')
    df = pd.concat([df.drop(columns=['director_name']), director_dummies], axis=1)

# Encode Top Actors (avoid df.apply inside loop)
actor_cols = ['actor_1_name', 'actor_2_name', 'actor_3_name']
existing_actor_cols = [c for c in actor_cols if c in df.columns]
if existing_actor_cols:
    all_actors = pd.concat([df[c] for c in existing_actor_cols])
    top_actors = all_actors.value_counts().nlargest(10).index

    for actor in top_actors:
        mask = False
        for c in existing_actor_cols:
            mask = mask | (df[c] == actor)
        df[f'actor_{actor}'] = mask.astype(np.int8)

    df = df.drop(columns=existing_actor_cols)

print(f"Preprocessing complete. Final df shape: {df.shape}")

Preprocessing data...
Preprocessing complete. Final df shape: (5043, 91)


In [4]:
# --- Target Definitions ---
y_cols = ["imdb_score", "gross"]
X_df = df.drop(columns=y_cols)
y_df = df[y_cols]

feature_names = X_df.columns.tolist()
budget_idx = feature_names.index("budget")

print(f"Features: {feature_names}")
print(f"Budget is at index: {budget_idx}")

Features: ['num_critic_for_reviews', 'duration', 'director_facebook_likes', 'actor_3_facebook_likes', 'actor_1_facebook_likes', 'num_voted_users', 'cast_total_facebook_likes', 'facenumber_in_poster', 'num_user_for_reviews', 'budget', 'title_year', 'actor_2_facebook_likes', 'movie_facebook_likes', 'genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Biography', 'genre_Comedy', 'genre_Crime', 'genre_Documentary', 'genre_Drama', 'genre_Family', 'genre_Fantasy', 'genre_Film-Noir', 'genre_Game-Show', 'genre_History', 'genre_Horror', 'genre_Music', 'genre_Musical', 'genre_Mystery', 'genre_News', 'genre_Reality-TV', 'genre_Romance', 'genre_Sci-Fi', 'genre_Short', 'genre_Sport', 'genre_Thriller', 'genre_War', 'genre_Western', 'content_rating_G', 'content_rating_Not Rated', 'content_rating_Other', 'content_rating_PG', 'content_rating_PG-13', 'content_rating_R', 'content_rating_TV-14', 'content_rating_TV-MA', 'content_rating_Unknown', 'content_rating_Unrated', 'country_Canada', 'country_

In [5]:
X_np = X_df.to_numpy(dtype=np.float32)
y_np = y_df.to_numpy(dtype=np.float32)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X_tensor = torch.from_numpy(X_np).to(device)
y_tensor = torch.from_numpy(y_np).to(device)

print(f"Shape of X_tensor: {X_tensor.shape}")
print(f"Shape of y_tensor: {y_tensor.shape}")

Shape of X_tensor: torch.Size([5043, 89])
Shape of y_tensor: torch.Size([5043, 2])


In [6]:
x_means = X_tensor.mean(dim=0, keepdim=True)
x_deviations = X_tensor.std(dim=0, keepdim=True)
x_deviations = torch.where(
    x_deviations < 1e-5, torch.ones_like(x_deviations), x_deviations
)

y_mean = y_tensor.mean(dim=0, keepdim=True)
y_std = y_tensor.std(dim=0, keepdim=True)
y_std = torch.where(y_std < 1e-5, torch.ones_like(y_std), y_std)

In [7]:
class ResidualNet(nn.Module):
    def __init__(self, n_inputs, n_outputs, x_means, x_stds, y_mean, y_std):
        super().__init__()
        self.register_buffer("x_means", x_means)
        self.register_buffer("x_stds", x_stds)
        self.register_buffer("y_mean", y_mean)
        self.register_buffer("y_std", y_std)

        self.fc1 = nn.Linear(n_inputs, 128)
        self.act1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.1)

        self.fc2 = nn.Linear(128, 128)
        self.act2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.1)

        self.fc3 = nn.Linear(128, n_outputs)
        self.input_proj = nn.Linear(n_inputs, 128) if n_inputs != 128 else nn.Identity()

    def forward(self, x):
        x_norm = (x - self.x_means) / self.x_stds
        x0 = self.input_proj(x_norm)
        x1 = self.dropout1(self.act1(self.fc1(x_norm)))
        x2 = self.dropout2(self.act2(x1))
        x_res = x2 + x0
        y_scaled = self.fc3(x_res)
        y_descaled = y_scaled * self.y_std + self.y_mean
        return y_descaled, y_scaled

In [8]:
model = ResidualNet(X_tensor.shape[1], 2, x_means, x_deviations, y_mean, y_std).to(
    device
)
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

In [9]:
print("\nTraining Neural Network...")
dataset = TensorDataset(X_tensor, y_tensor)
loader = DataLoader(
    dataset, batch_size=512, shuffle=True
)

model.train()
n_epochs = 100

for epoch in range(n_epochs):
    epoch_loss = 0.0
    for xb, yb in loader:
        optimizer.zero_grad(set_to_none=True)
        _, pred_scaled = model(xb)
        yb_scaled = (yb - y_mean) / y_std
        loss = loss_fn(pred_scaled, yb_scaled)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * xb.size(0)

    if epoch % 5 == 0:
        print(f"Epoch {epoch} | Loss: {epoch_loss / len(dataset):.4f}")


Training Neural Network...
Epoch 0 | Loss: 0.8518
Epoch 5 | Loss: 0.4572
Epoch 10 | Loss: 0.4228
Epoch 15 | Loss: 0.3929
Epoch 20 | Loss: 0.3746
Epoch 25 | Loss: 0.3638
Epoch 30 | Loss: 0.3544
Epoch 35 | Loss: 0.3431
Epoch 40 | Loss: 0.3353
Epoch 45 | Loss: 0.3252
Epoch 50 | Loss: 0.3230
Epoch 55 | Loss: 0.3250
Epoch 60 | Loss: 0.3120
Epoch 65 | Loss: 0.3024
Epoch 70 | Loss: 0.2998
Epoch 75 | Loss: 0.2999
Epoch 80 | Loss: 0.3008
Epoch 85 | Loss: 0.2908
Epoch 90 | Loss: 0.2974
Epoch 95 | Loss: 0.2872


In [10]:
def get_x_from_z(z):
    return clamp_min + (clamp_max - clamp_min) * torch.sigmoid(z)

In [11]:
target_vals = torch.tensor([[9.0, 1_000_000_000.0]], device=device)
target_scaled = (target_vals - y_mean) / y_std

w_score = 10.0
w_gross = 10.0
w_profit = 20.0

with torch.no_grad():
    clamp_min = X_tensor.min(dim=0, keepdim=True)[0]
    clamp_max = X_tensor.max(dim=0, keepdim=True)[0]

if budget_idx is not None:
    clamp_min[0, budget_idx] = 20_000_000.0
    clamp_max[0, budget_idx] = 200_000_000.0

z_init = torch.rand((1, X_tensor.shape[1]), device=device) * 0.4 + 0.3
z = torch.nn.Parameter(torch.logit(z_init.clamp(1e-6, 1 - 1e-6)))
optimizer_infer = torch.optim.Adam([z], lr=0.05)

In [None]:
model.eval()
n_steps = 10000
scale_factor = 1e8

for step in range(n_steps):
    optimizer_infer.zero_grad(set_to_none=True)

    x_guess = get_x_from_z(z)
    y_pred_real, y_pred_scaled = model(x_guess)

    # 1. Maximization Objectives (Score & Gross)
    diff_score = target_scaled[0, 0] - y_pred_scaled[0, 0]
    diff_gross = target_scaled[0, 1] - y_pred_scaled[0, 1]
    loss_score = w_score * torch.relu(diff_score) ** 2
    loss_gross = w_gross * torch.relu(diff_gross) ** 2

    # 2. Profitability Constraint: Gross >= 1.3 * Budget
    budget_val = x_guess[0, budget_idx]
    gross_val = y_pred_real[0, 1]
    loss_profit = (
        w_profit * torch.relu((1.3 * budget_val - gross_val) / scale_factor) ** 2
    )

    # 3. Regularize z
    loss_reg = 0.001 * torch.sum(z * z)

    loss = loss_score + loss_gross + loss_profit + loss_reg
    loss.backward()
    optimizer_infer.step()

    if step % 500 == 0 or step == n_steps - 1:
        s = y_pred_real[0, 0].item()
        g = y_pred_real[0, 1].item()
        b = budget_val.item()
        print(
            f"Step {step} | Score: {s:.2f}, Gross: ${g/1e6:.1f}M, Budget: ${b/1e6:.1f}M"
        )

Step 0 | Score: 10.38, Gross: $520.1M, Budget: $126.8M
Step 500 | Score: 9.95, Gross: $1441.5M, Budget: $153.8M
Step 1000 | Score: 10.05, Gross: $1421.7M, Budget: $152.6M
Step 1500 | Score: 10.14, Gross: $1395.2M, Budget: $150.7M
Step 2000 | Score: 10.17, Gross: $1363.2M, Budget: $148.2M
Step 2500 | Score: 10.16, Gross: $1326.1M, Budget: $145.0M
Step 3000 | Score: 10.10, Gross: $1284.6M, Budget: $141.1M
Step 3500 | Score: 10.03, Gross: $1239.7M, Budget: $136.6M
Step 4000 | Score: 9.91, Gross: $1193.3M, Budget: $131.6M
Step 4500 | Score: 9.76, Gross: $1146.3M, Budget: $126.5M
Step 5000 | Score: 9.60, Gross: $1098.8M, Budget: $121.5M
Step 5500 | Score: 9.43, Gross: $1051.6M, Budget: $117.2M
Step 6000 | Score: 9.26, Gross: $1004.8M, Budget: $114.0M
Step 6500 | Score: 9.09, Gross: $1001.6M, Budget: $113.2M
Step 7000 | Score: 9.00, Gross: $1000.9M, Budget: $112.7M
Step 7500 | Score: 9.01, Gross: $1010.1M, Budget: $112.6M
Step 8000 | Score: 9.06, Gross: $1008.8M, Budget: $112.1M
Step 8500 | 

In [15]:
with torch.no_grad():
    final_x = get_x_from_z(z)[0].cpu().numpy()
    final_out, _ = model(get_x_from_z(z))
    final_score = final_out[0, 0].item()
    final_gross = final_out[0, 1].item()
    final_budget = final_x[budget_idx]

print("\n" + "=" * 40)
print("OPTIMIZATION COMPLETE")
print(f"Predicted IMDB Score: {final_score:.2f}")

# print billion if > 1 billion
if final_gross > 1e9:
    print(f"Predicted Gross:      ${final_gross/1e9:.1f}B")
else:
    print(f"Predicted Gross:      ${final_gross/1e6:.1f}M")
if final_budget > 1e9:
    print(f"Required Budget:      ${final_budget/1e9:.1f}B")
else:
    print(f"Required Budget:      ${final_budget/1e6:.1f}M")
print(f"Gross/Budget Ratio:   {final_gross/final_budget:.2f}x")
print("=" * 40)

if 20_000_000.0 <= final_budget <= 200_000_000.0:
    print("✅ Budget Constraint Satisfied (20M-200M).")
else:
    print(f"⚠️ Budget Constraint Violated: ${final_budget:,.0f}")


if 9 <= final_score <= 10:
    print("✅ IMDB score Constraint Satisfied (9-10).")
else:
    print(f"⚠️ IMDB score Constraint Violated: ${final_score}")

print("\nKey Features for this Outcome:")
cats = [
    (feature_names[i], val)
    for i, val in enumerate(final_x)
    if i != budget_idx and val > 0.5
]
cats.sort(key=lambda x: x[1], reverse=True)

for name, val in cats[:20]:
    print(f"- {name}: {val:.2f}")


OPTIMIZATION COMPLETE
Predicted IMDB Score: 9.04
Predicted Gross:      $1.0B
Required Budget:      $110.9M
Gross/Budget Ratio:   9.09x
✅ Budget Constraint Satisfied (20M-200M).
✅ IMDB score Constraint Satisfied (9-10).

Key Features for this Outcome:
- num_voted_users: 1097348.75
- cast_total_facebook_likes: 428051.62
- movie_facebook_likes: 173733.78
- actor_1_facebook_likes: 153314.42
- actor_2_facebook_likes: 69303.48
- actor_3_facebook_likes: 11338.59
- director_facebook_likes: 11146.63
- num_user_for_reviews: 2551.37
- title_year: 1962.93
- num_critic_for_reviews: 424.98
- duration: 300.77
- facenumber_in_poster: 20.60
- genre_Reality-TV: 0.55
- director_Clint Eastwood: 0.52
- language_Unknown: 0.52
- director_Unknown: 0.51
- genre_News: 0.51
- director_Steven Soderbergh: 0.51
- genre_Family: 0.51
- content_rating_TV-MA: 0.51
