In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
plt.style.use("../assets/plot_styles.mplstyle")
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import ast
import sys
import json
import redis

from dateutil.relativedelta import relativedelta
import shap

sys.path.append('../library')
from core import flattenWithGenerator
from plotting import loadPalette, loadTableStyles, createBoxplotWithTTests

from IPython.display import display, Markdown
from matplotlib.ticker import FuncFormatter
import string

import cpi
from adjustText import adjust_text
from tqdm.notebook import tqdm
import requests

# Project colour palette returned by loadPalette(); later cells index it by
# colour name (e.g. 'lime', 'canvas').
color_palette = loadPalette()

In [119]:
# Directory that the figures in this notebook are saved into.
save_image_path = '../assets/savedImages/howToMakeMoneyWithAHorror'

# exist_ok=True makes the cell safe to re-run and removes the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(save_image_path, exist_ok=True)

# TMDB bearer token, read from the environment so it never lands in the notebook.
# os.getenv returns None when the variable is unset; the header below would then
# read "Bearer None".
TMDB_AUTH_TOKEN = os.getenv('TMDB_AUTH_TOKEN')

# Headers sent with each TMDB API request made by later cells.
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {TMDB_AUTH_TOKEN}"
}

### Load TMDB Df

In [120]:
# Load the TMDB export and reduce it to rows usable for the revenue/budget
# analysis: one row per IMDb id (last occurrence wins), not flagged adult, no
# 'Animation' in the genre string, strictly positive revenue and budget.
# `.eq(False)` also discards rows whose genre string is missing, because NaN is
# not equal to False.
tmdb_df_raw = (
    pd.read_csv('../data/tmdbDetails.csv')
    .drop_duplicates('imdb_id', keep='last')
    .loc[lambda d: d['adult'].eq(False)]
    .loc[lambda d: d['genres'].str.contains('Animation').eq(False)]
    .assign(release_date=lambda d: pd.to_datetime(d['release_date']))
    .assign(year=lambda d: d['release_date'].dt.year)
    .loc[lambda d: d['revenue'] > 0]
    .loc[lambda d: d['budget'] > 0]
    # RBR = revenue divided by budget
    .assign(RBR=lambda d: d['revenue'] / d['budget'])
)

In [121]:
# Name of the first genre listed for each film: the text that follows the first
# "'name': '" marker in the stringified genre list, up to the next single quote.
# Rows without that marker end up as NaN.
_after_first_name_key = tmdb_df_raw['genres'].str.split("'name': '").str[1]
tmdb_df_raw['first_genre'] = _after_first_name_key.str.split("'").str[0]

In [None]:
# Films whose first genre is 'TV Movie' are left out of the genre comparison.
tmdb_df_raw = tmdb_df_raw[tmdb_df_raw['first_genre'] != 'TV Movie']
grouped = tmdb_df_raw.groupby('first_genre')

# One array of RBR (revenue / budget) values per genre, in groupby order.
data = [group['RBR'].values for name, group in grouped]

# Box plot with one box per primary genre.
fig, ax = plt.subplots(figsize=(10, 6))
medianprops = dict(color='k', linewidth=3)
bp = ax.boxplot(data, whis=.75, patch_artist=True, medianprops=medianprops)

# NOTE: despite their names, `means` / `max_mean_idx` hold the per-genre MEDIAN
# RBR and the position of the largest median. The names are kept because they
# are notebook-level variables.
means = [group['RBR'].median() for name, group in grouped]
max_mean_idx = means.index(max(means))

# Highlight the genre with the highest median; every other box gets the
# neutral colour. Done in a single pass over the boxes.
for i, box in enumerate(bp['boxes']):
    box.set_facecolor(color_palette['lime' if i == max_mean_idx else 'canvas'])

ax.set_xlabel('Primary Genre', fontsize=14)
ax.set_ylabel('RBR', fontsize=14)

# The y-axis is capped at 10, so anything above that is not shown.
ax.set_ylim(0, 10)

# Tick labels are set once (the earlier unrotated call was immediately overwritten).
ax.set_xticklabels(list(grouped.groups.keys()), rotation=45, ha='right')
fig.subplots_adjust(left=0.1, right=0.95, top=0.95, bottom=0.1, hspace=0.2, wspace=0.15)

# Save image
image_file_path = os.path.join(save_image_path, 'genreMedians.png')
plt.savefig(image_file_path, dpi=300)

# Display the plot
plt.show()

### Filter for horror & adjust for inflation

In [9]:
from datetime import datetime

In [124]:
# Release year as its own column.
tmdb_df_raw['release_year'] = tmdb_df_raw['release_date'].dt.year

# Keep every film whose genre string mentions Horror. `.copy()` gives horror_df
# its own data, so the column assignments below (and in later cells) never write
# through a filtered slice of tmdb_df_raw. `na=False` treats a missing genre
# string as "not horror" instead of producing an unusable NaN mask.
horror_df = tmdb_df_raw[tmdb_df_raw['genres'].str.contains('Horror', na=False)].copy()

# Year used as the inflation base: the release year moved
# (2024 - release_year) / 1.75 years towards 2024.
horror_df['adjust_to'] = horror_df['release_year'] + ((2024 - horror_df['release_year']) / 1.75)

# Rows without a release year have no adjustment year; drop them.
horror_df = horror_df.dropna(subset=['adjust_to'])

def adj_for_inflation(column, years, to=2023):
    """Inflate every value of `column` from its paired year to the year `to`.

    Parameters
    ----------
    column : iterable of numbers
        Amounts to adjust.
    years : iterable of int
        Base year of each amount, paired positionally with `column`.
    to : int, default 2023
        Target year handed to ``cpi.inflate``.

    Returns
    -------
    list
        One entry per (value, year) pair. Values whose year is not earlier
        than `to` are returned unchanged; all others go through
        ``cpi.inflate(value, year, to=to)``.
    """
    return [
        cpi.inflate(value, year, to=to) if year < to else value
        for value, year in zip(column, years)
    ]

# Inflation-adjusted copies of revenue and budget, stored as `<column>_adj`.
# `adjust_to` is truncated to an integer year before it is used as the base year.
for _money_col in ('revenue', 'budget'):
    horror_df[f'{_money_col}_adj'] = adj_for_inflation(
        horror_df[_money_col],
        horror_df['adjust_to'].astype(int),
    )

In [125]:
# Persist every distinct IMDb id of the filtered TMDB frame as a JSON list.
all_ids = tmdb_df_raw['imdb_id'].unique().tolist()

with open('../data/all_ids.json', 'w') as f:
    json.dump(all_ids, f)

In [None]:
# Parse each stringified genre list, flatten the per-film lists with the project
# helper, and collect the distinct genre names.
all_genres = list(
    flattenWithGenerator([ast.literal_eval(raw_genres) for raw_genres in tmdb_df_raw['genres']])
)
unique_genres = {genre['name'] for genre in all_genres}
print(unique_genres)

In [None]:
# Revenue-to-budget ratio, on raw figures and on the inflation-adjusted ones.
horror_df['RBR'] = horror_df['revenue'].div(horror_df['budget'])
horror_df['RBR_adj'] = horror_df['revenue_adj'].div(horror_df['budget_adj'])

print("WE HAVE {} HORROR MOVIES".format(len(horror_df)))

### Budget vs. Revenue Scatter

In [None]:
# Tick formatters for matplotlib's FuncFormatter, which calls them as
# f(tick_value, tick_position).
def millions(x, pos):
    """Format `x` in millions with one decimal and an 'M' suffix; `pos` is unused."""
    scaled = x * 1e-6
    return '{:.1f}M'.format(scaled)
def billions(x, pos):
    """Format `x` in billions with one decimal and a 'B' suffix; `pos` is unused."""
    scaled = x * 1e-9
    return '{:.1f}B'.format(scaled)

fig, ax = plt.subplots()

# Every horror film, positioned by inflation-adjusted budget and revenue.
ax.scatter(horror_df['budget_adj'], horror_df['revenue_adj'], c=color_palette['blue_grey_dark'], zorder=1)

# Break-even reference line (slope one) spanning the x-range of the scatter.
x_vals = np.array(ax.get_xlim())
y_vals = x_vals

ax.plot(x_vals, y_vals, '--', color=color_palette['cherry'], label='Revenue = Budget')

# Budget ticks are shown in millions, revenue ticks in billions.
ax.xaxis.set_major_formatter(FuncFormatter(millions))
ax.yaxis.set_major_formatter(FuncFormatter(billions))
ax.set_xlabel('Adjusted Budget', fontsize=14)
ax.set_ylabel('Adjusted Revenue', fontsize=14)

# Label outliers: films with a raw RBR above 450 or adjusted revenue above 500M.
# Labels are always placed at the ADJUSTED coordinates, because that is what the
# axes show; the previous version anchored the RBR outliers at raw
# budget/revenue, away from their points. A single mask also means a film that
# meets both criteria is labelled only once.
is_outlier = (horror_df['RBR'] > 450) | (horror_df['revenue_adj'] > 500_000_000)
texts = [
    ax.text(
        row['budget_adj'], row['revenue_adj'], row['title'],
        fontsize=8, fontfamily='monospace', alpha=1, zorder=2
    )
    for _, row in horror_df[is_outlier].iterrows()
]

# Nudge labels apart and connect them to their points with thin grey lines.
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='gray', lw=0.25))

ax.legend()
plt.tight_layout(pad=2.0, w_pad=0.5, h_pad=1.0)

# Save image
image_file_path = os.path.join(save_image_path, 'xyScatterAll.png')
plt.savefig(image_file_path, dpi=300)

plt.show()

# Load Film Classifications

In [129]:
# Per-film classification results, loaded from JSON. The encoding is given
# explicitly so the file is not decoded with the platform default.
with open('../data/horror_classifications.json', encoding='utf-8') as f:
    classifications = json.load(f)

### Raw boxplots

In [130]:
# Transpose so that each key of `classifications` becomes a row label; the
# per-film entries end up in integer-labelled columns (0, 1, 2 are used below).
class_df = pd.DataFrame.from_dict(classifications).T

# Column 2 is expanded into one column per key. json_normalize returns a fresh
# 0..n-1 index, so the original row labels are re-attached.
scores_df = pd.json_normalize(class_df[2])
scores_df.index = class_df.index

# Swap the raw column 2 for its expanded form, joining on the row labels.
class_df = class_df.drop(columns=2).merge(scores_df, left_index=True, right_index=True)

In [131]:
# Lookup from IMDb id to raw RBR for the horror films (last row wins on duplicate ids).
rbr_dict = dict(zip(horror_df['imdb_id'], horror_df['RBR']))

In [132]:
# Attach RBR by looking up each row label in rbr_dict; labels without an entry get NaN.
class_df = class_df.assign(RBR=class_df.index.map(rbr_dict))

In [133]:
# Give the two remaining positional columns descriptive names.
class_df = class_df.rename(columns={0: 'classification', 1: 'confidence'})

In [134]:
# Keep only the classified films for which an RBR was found.
class_df = class_df.dropna(subset=['RBR'])

In [None]:
grouped = class_df.groupby('classification')

# One array of RBR values per classification, in groupby order.
data = [group['RBR'].values for name, group in grouped]

# Create the box plot FIRST. The previous version recoloured `bp` before this
# figure existed, i.e. it touched the boxes of an earlier figure (or raised if
# the index was out of range there) and then painted every new box the same colour.
fig, ax = plt.subplots(figsize=(10, 6))
medianprops = dict(color='k', linewidth=3)
bp = ax.boxplot(data, whis=.75, patch_artist=True, medianprops=medianprops)

# NOTE: despite their names, `means` / `maxMeanIdx` hold the per-class MEDIAN
# RBR and the position of the largest median.
means = [group['RBR'].median() for name, group in grouped]
maxMeanIdx = means.index(max(means))

# Highlight the classification with the highest median; the rest stay neutral.
for i, box in enumerate(bp['boxes']):
    box.set_facecolor(color_palette['lime' if i == maxMeanIdx else 'canvas'])

ax.set_xlabel('Primary Classification', fontsize=14)
ax.set_ylabel('RBR', fontsize=14)

# The y-axis is capped at 15, so anything above that is not shown.
ax.set_ylim(0, 15)
ax.set_xticklabels(list(grouped.groups.keys()), rotation=45, ha='right')
fig.subplots_adjust(left=0.1, right=0.95, top=0.95, bottom=0.1, hspace=0.2, wspace=0.15)

# Save image
image_file_path = os.path.join(save_image_path, 'classification_medians.png')
plt.savefig(image_file_path, dpi=300)

# Display the plot
plt.show()

### XGBoost
- We're going to fit an XGBoost model on several possibly meaningful features
- Then we'll look at the SHAP (Shapley) values computed for those features

In [137]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, root_mean_squared_error  # For classification tasks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer

### Feature Engineering

#### Get Ratings

In [141]:
# Local redis database 5 holds the ratings; the keys are looked up against
# `imdb_id` below. `encoding` replaces the deprecated `charset` keyword of redis-py.
r5 = redis.Redis(
    host='127.0.0.1',
    port=6379,
    encoding="utf-8",
    decode_responses=True,
    db=5
)

redis_keys = r5.keys('*')
# MGET needs at least one key, so an empty database is handled explicitly.
raw_ratings = r5.mget(redis_keys) if redis_keys else []

# Pair each key with ITS OWN value before filtering. The previous version
# filtered the value list alone, so after the first 'None' entry every later key
# was zipped with some other film's rating. Missing values (None) and the
# literal string 'None' are both skipped.
ratings_dict = {
    key: float(value)
    for key, value in zip(redis_keys, raw_ratings)
    if value is not None and value != 'None'
}
redis_values = list(ratings_dict.values())

horror_df['imdb_rating'] = horror_df['imdb_id'].map(ratings_dict)

#### Get Cast

In [151]:
# Local redis database 6 holds one JSON document per key; later cells read its
# 'cast' and 'crew' entries. `encoding` replaces the deprecated `charset` keyword.
r6 = redis.Redis(
    host='127.0.0.1',
    port=6379,
    encoding="utf-8",
    decode_responses=True,
    db=6
)

redis_keys = r6.keys('*')
# MGET needs at least one key, so an empty database is handled explicitly.
raw_credits = r6.mget(redis_keys) if redis_keys else []

# Keys whose value is None (e.g. removed between KEYS and MGET) are skipped
# instead of crashing json.loads; keys and values stay paired.
cast_dict = {
    key: json.loads(value)
    for key, value in zip(redis_keys, raw_credits)
    if value is not None
}
redis_values = list(cast_dict.values())

In [157]:
# Restrict the credits to films that are present in horror_df.
horror_ids = set(horror_df['imdb_id'].unique())
cast_dict = {
    imdb_id: cast_dict[imdb_id]
    for imdb_id in cast_dict
    if imdb_id in horror_ids
}

In [193]:
def _name_id_gender(person):
    """Return only the 'name', 'id' and 'gender' entries of a credit dict."""
    return {key: person[key] for key in ('name', 'id', 'gender')}


# Per film: cast entries whose 'order' is below 6, and crew entries whose job is
# exactly 'Director', each reduced to name / id / gender.
filtered_credits_dict = {
    imdb_id: {
        'top_credits': [_name_id_gender(a) for a in people['cast'] if a['order'] < 6],
        'directors': [_name_id_gender(d) for d in people['crew'] if d['job'] == 'Director'],
    }
    for imdb_id, people in cast_dict.items()
}

In [194]:
# Lookup from IMDb id to release date, read by the age computation in the next cell.
release_dates_dict = horror_df.set_index('imdb_id')['release_date'].to_dict()

In [None]:
# person id -> birthday, so a person credited on several films is fetched once.
_birthday_cache = {}
# (imdb_id, person_id, error) for every person whose age could not be computed.
age_lookup_failures = []


def _fetch_birthday(person_id):
    """Return the TMDB birthday of `person_id` parsed as a UTC timestamp.

    The result is None/NaT when TMDB stores no birthday. Raises on network
    errors, non-2xx responses, and responses without a 'birthday' field.
    """
    if person_id not in _birthday_cache:
        url = f"https://api.themoviedb.org/3/person/{person_id}?language=en-US"
        # timeout: a hung connection must not stall the whole loop
        details = requests.get(url, headers=headers, timeout=10)
        details.raise_for_status()
        _birthday_cache[person_id] = pd.to_datetime(details.json()['birthday'], utc=True)
    return _birthday_cache[person_id]


def _with_ages(persons, release_date, imdb_id):
    """Return copies of `persons` extended with 'age' (full years at `release_date`).

    Best effort: a person whose birthday cannot be fetched or is missing is left
    out of the result and recorded in `age_lookup_failures`.
    """
    aged = []
    for person in persons:
        try:
            birthday = _fetch_birthday(person['id'])
            if pd.isna(birthday):
                # Without this check relativedelta(release_date, None) returns
                # an empty delta, i.e. an age of 0.
                raise ValueError('no birthday on record')
            age = relativedelta(release_date, birthday).years
        except Exception as exc:  # not a bare except: Ctrl-C must still stop the loop
            age_lookup_failures.append((imdb_id, person.get('id'), repr(exc)))
            continue
        aged.append({**person, 'age': age})
    return aged


for imdb_id, people in tqdm(filtered_credits_dict.items()):
    release_date = pd.to_datetime(release_dates_dict.get(imdb_id), utc=True)
    if pd.isna(release_date):
        # No usable release date means no ages; empty lists make the feature
        # cell skip this film instead of failing on a missing 'age' key.
        filtered_credits_dict[imdb_id] = {'directors': [], 'top_credits': []}
        continue

    filtered_credits_dict[imdb_id] = {
        'directors': _with_ages(people['directors'], release_date, imdb_id),
        'top_credits': _with_ages(people['top_credits'], release_date, imdb_id),
    }

print(f"{len(age_lookup_failures)} credited people skipped because no age could be computed")

In [243]:
# imdb_id -> numeric features derived from the credits
imdb_features_dict = {}
# imdb_id -> raw person ids (cast list, first director) for the embedding model
people_ids_dict = {}

for imdb_id, people in filtered_credits_dict.items():
    directors = people['directors']
    top_credits = people['top_credits']

    # A film needs at least one director and one cast member to be used.
    if not directors or not top_credits:
        continue

    imdb_features_dict[imdb_id] = dict(
        director_age=np.mean([d['age'] for d in directors]),
        # gender is taken from the first listed director / cast member only
        director_gender=directors[0]['gender'],
        cast_mean_age=np.mean([c['age'] for c in top_credits]),
        lead_gender=top_credits[0]['gender'],
    )

    people_ids_dict[imdb_id] = dict(
        cast_encoded=[c['id'] for c in top_credits],
        director_encoded=directors[0]['id'],
    )

In [None]:
# One row per film, labelled by IMDb id, with columns 'cast_encoded' and
# 'director_encoded'. The ids stay in the index (no 'index' column exists to
# rename), which is what the later index-based merges rely on.
data = pd.DataFrame.from_dict(people_ids_dict).T

# Sorted distinct person ids, later enumerated into 0-based embedding indices.
unique_cast_ids = sorted(set(flattenWithGenerator(data['cast_encoded'])))
unique_director_ids = sorted(set(flattenWithGenerator(data['director_encoded'])))
data.head()

In [277]:
# Add the calendar month of release, then key the frame by IMDb id so it can be
# merged on the index. After this, 'imdb_id' is no longer a column of horror_df.
horror_df = (
    horror_df
    .assign(release_month=horror_df['release_date'].dt.month)
    .set_index('imdb_id')
)

In [285]:
# One row per film (IMDb id), one column per credit-derived feature.
imdb_features = pd.DataFrame(imdb_features_dict).transpose()

In [None]:
# `model_df` is first created by the merge cell that follows this one, so on a
# fresh kernel (Restart & Run All) a bare `model_df` here raises NameError.
# Preview it only when it already exists, and only its first rows.
if 'model_df' in globals():
    display(model_df.head())

In [292]:
# Modelling table, built with two inner joins on the IMDb-id index:
# person ids + film-level columns first, then the credit-derived features in front.
_film_columns = horror_df[['imdb_rating', 'RBR_adj', 'runtime', 'release_month']]
model_df = data.merge(_film_columns, left_index=True, right_index=True)

model_df = imdb_features.merge(model_df, left_index=True, right_index=True)

In [299]:
# Map each raw person id to its 0-based position in the sorted id list; these
# positions are used as embedding row indices.
cast_sequential_map = dict(zip(unique_cast_ids, range(len(unique_cast_ids))))
director_sequential_map = dict(zip(unique_director_ids, range(len(unique_director_ids))))

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Prepare the id tensors for the embedding model.
# CAUTION: this cell rewrites `cast_encoded` / `director_encoded` in place, from
# raw person ids to sequential indices. Running it twice would look up
# already-mapped values, so rebuild model_df (the merge cell) before re-running.

# Director ids -> numeric; anything unparseable becomes NaN.
model_df['director_encoded'] = pd.to_numeric(model_df['director_encoded'], errors='coerce')

# Validate BEFORE the lookup. A NaN would make the dictionary lookup below fail
# with a bare KeyError, so a check placed after it could never report anything.
if model_df['director_encoded'].isnull().any():
    raise ValueError("director_encoded contains values that are not numeric ids.")

# Raw person ids -> sequential embedding indices.
model_df['cast_encoded'] = model_df['cast_encoded'].apply(
    lambda ids: [cast_sequential_map[int(i)] for i in ids]
)
model_df['director_encoded'] = model_df['director_encoded'].apply(
    lambda director_id: director_sequential_map[int(director_id)]
)

max_cast_id = max(max(cast) for cast in model_df['cast_encoded'])
num_casts = len({item for sublist in model_df['cast_encoded'] for item in sublist})  # distinct cast indices in model_df
num_directors = model_df['director_encoded'].nunique()  # distinct director indices in model_df

# Cast lists have different lengths; pad_sequence right-pads them to a rectangle.
# NOTE(review): the pad value is 0, which is also the index given to the first
# cast id by cast_sequential_map, so padded slots are embedded as that person.
cast_tensors = torch.nn.utils.rnn.pad_sequence(
    [torch.tensor(cast, dtype=torch.long) for cast in model_df['cast_encoded']],
    batch_first=True
)

# One director index per film; int64 is requested explicitly.
director_tensors = torch.tensor(model_df['director_encoded'].to_numpy(dtype='int64'))
# Regression target: inflation-adjusted revenue / budget.
targets = torch.tensor(model_df['RBR_adj'].to_numpy(dtype='float32'))

# Check tensors
print("Cast Tensor Shape:", cast_tensors.shape)
print("Director Tensor Shape:", director_tensors.shape)
print("Targets Shape:", targets.shape)

In [None]:
# Largest embedding indices in use; the embedding tables need at least this many rows plus one.
max_cast_id = max(max(cast) for cast in model_df['cast_encoded'])
max_director_id = max(model_df['director_encoded'])
print(f"Max cast ID: {max_cast_id}")
print(f"Max director ID: {max_director_id}")

In [None]:
class MovieEmbeddingModel(nn.Module):
    """Represent a film as mean(cast embeddings) + director embedding.

    Args:
        num_actors: number of rows in the actor embedding table.
        num_directors: number of rows in the director embedding table.
        embedding_dim: width shared by both tables.
    """

    def __init__(self, num_actors, num_directors, embedding_dim):
        super().__init__()
        self.actor_embedding = nn.Embedding(num_actors, embedding_dim)
        self.director_embedding = nn.Embedding(num_directors, embedding_dim)

    def forward(self, actor_ids, director_ids):
        """Return one combined embedding per film.

        `actor_ids` holds several actor indices per film (averaged over dim 1);
        `director_ids` holds one director index per film.
        """
        # (batch, actors_per_film, dim) -> (batch, dim)
        pooled_cast = self.actor_embedding(actor_ids).mean(dim=1)
        # the (batch, dim) director embedding is added element-wise
        return pooled_cast + self.director_embedding(director_ids)

# Training parameters
# The embedding width is a modelling choice, independent of the number of films.
# It used to be set to cast_tensors.shape[0] (the film count), which only made
# MSELoss run because an (N, N) output broadcasts against (N,) targets, so every
# embedding component was compared with an unrelated film's target.
embedding_dim = 16
torch.manual_seed(42)  # reproducible initial weights

# The tables must cover the largest index actually present, which can exceed
# the number of distinct indices in model_df.
num_actor_rows = max(num_casts, int(cast_tensors.max())) + 1
num_director_rows = max(num_directors, int(director_tensors.max())) + 1
model = MovieEmbeddingModel(num_actors=num_actor_rows, num_directors=num_director_rows, embedding_dim=embedding_dim)

# Linear head turning the combined embedding into one predicted RBR per film,
# so predictions and targets have the same shape.
regression_head = nn.Linear(embedding_dim, 1)

# Training setup
criterion = nn.MSELoss()
optimizer = optim.Adam(
    list(model.parameters()) + list(regression_head.parameters()),
    lr=0.001,
)

# Train the model (full-batch gradient descent)
model.train()
num_epochs = 100  # Adjust based on your needs
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = regression_head(model(cast_tensors, director_tensors)).squeeze(-1)  # (N,)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:  # Print every 10 epochs
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Extract the learned tables as NumPy arrays (one row per embedding index).
actor_embeddings = model.actor_embedding.weight.detach().cpu().numpy()
director_embeddings = model.director_embedding.weight.detach().cpu().numpy()

### Modeling

In [333]:
# Per-film embedding features, taken from the embedding tables learned above
# instead of the unseeded random placeholders that used to overwrite them.
# Cast: mean of the film's cast rows. Director: the film's director row.
# NOTE(review): the tables were fitted on RBR_adj of ALL films, including those
# that land in the test split later, so these columns leak target information.
film_actor_features = np.vstack([
    actor_embeddings[np.asarray(cast_ids, dtype=int)].mean(axis=0)
    for cast_ids in model_df['cast_encoded']
])
film_director_features = director_embeddings[model_df['director_encoded'].to_numpy(dtype=int)]

feature_cols = ['director_age', 'director_gender', 'cast_mean_age', 'lead_gender', 'imdb_rating', 'runtime', 'release_month']

# Missing values in the tabular features are replaced by the column mean.
means = model_df[feature_cols].mean()

model_df = model_df.fillna(means)

# Combine features: tabular columns first, then cast and director embeddings.
features = np.hstack((
    model_df[feature_cols].values,
    film_actor_features,
    film_director_features
))

In [335]:
# Normalizer rescales each ROW (sample) to unit norm.
# NOTE(review): `features_normed` is computed but not used below; X is built
# from the un-normalised matrix.
scaler = Normalizer()
features_normed = scaler.fit_transform(features)

X, y = features, model_df['RBR_adj'].values

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# DMatrix versions of both splits for xgboost's native API.
dtrain, dtest = (
    xgb.DMatrix(part, label=labels)
    for part, labels in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# Hyper-parameter grid. Every combination is fitted once per CV fold.
param_grid = {
    'n_estimators': [100, 300, 500],     # Number of trees
    'max_depth': [3, 5, 7],              # Depth of trees
    'learning_rate': [0.01, 0.05, 0.1],  # Learning rate
    'min_child_weight': [1, 3, 5],       # Minimum child weight
    'gamma': [0, 0.1, 0.2],              # Minimum loss reduction to split
    'subsample': [0.8, 1.0],             # Fraction of data for boosting
    'colsample_bytree': [0.8, 1.0],      # Fraction of features for each tree
    'reg_alpha': [0.1, 0.5],             # L1 regularization term
    'reg_lambda': [1.0, 2.0],            # L2 regularization term
}

# Initialize the model
xgb_reg = xgb.XGBRegressor(random_state=42)

# 3-fold grid search, scored by negated MSE, using all available cores.
grid_search = GridSearchCV(estimator=xgb_reg, param_grid=param_grid, scoring='neg_mean_squared_error',
                        cv=3, verbose=1, n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Output the best parameters
print("Best parameters found: ", grid_search.best_params_)

# Evaluate the refitted best model on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
# root_mean_squared_error already returns the RMSE; wrapping it in np.sqrt took
# the root a second time.
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE after tuning: {rmse}")

In [None]:
# Explain the tuned XGBoost regressor. `model` is the PyTorch embedding module
# at this point, which TreeExplainer cannot handle.
explainer = shap.TreeExplainer(best_model)

# SHAP values for the held-out test set, computed on the same array that is
# passed to the plot so that rows and columns line up.
shap_values = explainer.shap_values(X_test)

# Readable names: the tabular features come first in `features`; the remaining
# columns are embedding components.
n_extra = X_test.shape[1] - len(feature_cols)
shap_feature_names = list(feature_cols) + [f'embedding_{i}' for i in range(n_extra)]

# Summary of feature importance with the direction of each effect.
fig = plt.figure()

shap.summary_plot(shap_values, X_test, feature_names=shap_feature_names, show=False)
image_file_path = os.path.join(save_image_path, 'shapValues.png')
plt.savefig(image_file_path, dpi=300)

plt.show()

In [None]:
# Dependence plot for feature index 5, i.e. the sixth column of the feature
# matrix ('runtime' in the order given by feature_cols).
# shap_values must be paired with the rows they were computed on, so pick the
# split whose row count matches.
shap_features = X_test if len(shap_values) == len(X_test) else X_train
shap.dependence_plot(5, shap_values, shap_features)