### Imports

In [None]:
import pandas as pd
import statsmodels as sm
import os
from tabulate import tabulate
import matplotlib.pyplot as plt
plt.style.use("../assets/plot_styles.mplstyle")
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from collections import Counter
import ast
import sys
import json
import redis

sys.path.append('../library')
from core import flattenWithGenerator, createSlidingWindows
from midStats import linearModelGeneral
from plotting import loadPalette, loadTableStyles

from IPython.display import display, Markdown
from datetime import timedelta
from dateutil.relativedelta import relativedelta

from scipy.interpolate import interp1d
from matplotlib.ticker import FuncFormatter
from matplotlib.animation import FuncAnimation
from matplotlib.patches import Ellipse
from adjustText import adjust_text
from tabulate import tabulate
from tqdm import tqdm
import requests

colorPalette = loadPalette()

### Overall Script Flow
- Look at the composition of weekly top tens as a function of time
- Examine points where this trend changed
- Look at the RBR (revenue-to-budget ratio, i.e. lifetime revenue divided by budget) of different movies as a function of some of their other attributes

### Load Data
- Using data from tmdb and then two other sources of box office data

In [None]:
# TMDB metadata: keep the last record per IMDb id, then build an id -> row-dict lookup
tmdbDf = pd.read_csv(os.path.join('../data/', 'tmdbDetails.csv')).drop_duplicates(
    subset='imdb_id', keep='last'
)
tmdbDict = tmdbDf.set_index('imdb_id').to_dict('index')

# First box office source: de-duplicated on (imdbId, dayNumber); rows without a date are discarded
bomDf = (
    pd.read_csv(os.path.join('../data/', 'allBoxOffice.csv'))
    .drop_duplicates(subset=['imdbId', 'dayNumber'], keep='last')
    .dropna(subset=['date'])
)

# Second box office source, loaded untouched
tnDf = pd.read_csv(os.path.join('../data/', 'numbersBoxOffice.csv'))

### Global Vars

In [None]:
# Folder that every figure and animation produced by this notebook is written to
saveImagePath = '../assets/savedImages/whatHappenedToTheComedy'

# exist_ok=True creates the folder (and any missing parents), does nothing when it is
# already there, and avoids the check-then-create race of a separate exists() test
os.makedirs(saveImagePath, exist_ok=True)

### Do some basic cleaning

In [None]:
# Use the same id column name as the box office tables
tmdbDf.rename(columns={'imdb_id': 'imdbId'}, inplace=True)

# TMDB rows for movies that appear in tnDf, reduced to the fields needed below,
# one row per movie (last duplicate wins), indexed by IMDb id
tmdbWithBoxOfficeDf = (
    tmdbDf.loc[
        tmdbDf['imdbId'].isin(tnDf['imdbId'].unique()),
        ['budget', 'imdbId', 'revenue', 'genres'],
    ]
    .drop_duplicates(subset='imdbId', keep='last')
    .set_index('imdbId', drop=True)
)

# imdbId -> {'budget': ..., 'revenue': ..., 'genres': ...}
tmdbBODict = tmdbWithBoxOfficeDf.to_dict('index')

In [None]:
# Attach each movie's TMDB record (a dict holding budget / revenue / genres) by IMDb id;
# ids that are not keys of tmdbBODict map to NaN
tnDf['tmdbData'] = tnDf['imdbId'].map(tmdbBODict)
# Expand those dicts into three flat columns.
# NOTE(review): json_normalize returns a fresh 0..n-1 index and its column order follows
# the dict key order — this presumably relies on tnDf still having its default index and
# on every row having a TMDB match; confirm.
tnDf[['budget','revenue','genres']] = pd.json_normalize(tnDf['tmdbData'])

In [None]:
def extractGenres(g):
    """Return the genre names stored in a stringified TMDB ``genres`` field.

    Parameters
    ----------
    g : str
        Python-literal text of a list of dicts that each carry a ``'name'`` key,
        e.g. ``"[{'id': 35, 'name': 'Comedy'}]"``.

    Returns
    -------
    list
        The ``'name'`` of every entry, in order. An empty list is returned when
        ``g`` is not a string (e.g. NaN for a movie without genre data) or is
        blank, instead of letting ``ast.literal_eval`` raise.
    """
    if not isinstance(g, str) or not g.strip():
        return []

    # literal_eval only parses Python literals; the text is never executed as code
    return [entry['name'] for entry in ast.literal_eval(g)]

# Parse the stringified genre list of every row into a plain list of genre names
tnDf['genresExtracted'] = tnDf['genres'].apply(extractGenres)

### Calculate weekly rankings -- loaded ranks don't seem to work

In [None]:
# Newest week first and, inside each week, highest-grossing movie first
tnDf = tnDf.sort_values(by=['dateDt', 'weekGross'], ascending=[False, False])

# Position of each row inside its week after that sort, starting at 1
tnDf['rank'] = tnDf.groupby('dateDt').cumcount().add(1)

### Get percentage of total weekly box office each genre accounts for
- This is more robust than just top ten
- We're doing some mapping here of the more obscure genres to more standardized ones. This could have an impact on results, but is a useful step

In [None]:
# Raw genres to leave out; empty, and not referenced by any code visible in this notebook
omitGenres = []

# Standardized genre group -> raw TMDB genre names folded into that group.
# NOTE(review): the 'Action/Thriller' group holds Action/Western/War/Adventure while the
# raw 'Thriller' genre is filed under 'Suspense' — confirm the group label is intentional.
genreMap = {
    'Action/Thriller': ['Action','Western','War', 'Adventure'],
    'Suspense':['Thriller','Crime','Mystery'],
    'Romance': ['Romance'],
    'Animated/Family': ['Animation','Family'],
    'Sci-Fi/Fantasy': ['Science Fiction','Fantasy'],
    'Comedy': ['Comedy'],
    'Drama': ['Drama'],
    'Documentary':['Documentary'],
    'Misc':['TV Movie','Music','History','']
}

# Group labels, in genreMap order
allGenres = list(genreMap.keys())

def calculateRatios(row: pd.Series):
    """Share one movie-week's gross across the genre groups the movie maps to.

    Every raw genre in ``row['genresExtracted']`` is translated to the ``genreMap``
    group(s) whose member list contains it, and ``row['weekGross']`` is split in
    proportion to how many times each group was hit.

    Returns a dict with one entry per ``genreMap`` key; groups that were not hit
    hold 0.
    """
    rawGenres = row['genresExtracted']
    gross = row['weekGross']

    # One inner list of matching groups per raw genre, flattened by the project helper
    groupHits = list(flattenWithGenerator(
        [[group for group, members in genreMap.items() if raw in members] for raw in rawGenres]
    ))
    nHits = len(groupHits)

    shares = {group: (hits / nHits) * gross for group, hits in Counter(groupHits).items()}

    # Give every row the same key set by zero-filling the groups this movie never hit
    for group in genreMap:
        shares.setdefault(group, 0)

    return shares

# Per-row dict of genre group -> share of that movie's weekly gross
tnDf['proportions'] = tnDf.apply(calculateRatios, axis=1)

### Calculate proportions of weekly totals

In [None]:
# Total gross of all listed movies for each week
weeklyTotals = tnDf.groupby('dateDt')['weekGross'].sum()

In [None]:
def calculateOverallContribution(group, ratio=True, genres=None):
    """Aggregate the per-movie genre shares of one week into per-genre totals.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of a single week. Must have a ``'proportions'`` column (dicts of
        genre -> gross share) and a ``'weekGross'`` column.
    ratio : bool, default True
        When True each genre total is divided by the week's total gross, giving
        a fraction; when False the summed gross is returned as-is. The original
        body read an undefined global ``ratio``; it is now an explicit parameter.
    genres : iterable of str, optional
        Genre keys to aggregate. Defaults to the keys of the module-level
        ``genreMap``.

    Returns
    -------
    dict
        genre -> fraction of the week's gross (``ratio=True``) or summed gross.
        With ``ratio=True`` and a zero weekly total every value is NaN, since
        no proportion is defined.
    """
    genreKeys = list(genreMap.keys()) if genres is None else list(genres)

    allDicts = group['proportions']
    totalGross = group['weekGross'].sum()

    overallContributions = {}
    for k in genreKeys:
        kSum = sum(d[k] for d in allDicts)

        if not ratio:
            overallContributions[k] = kSum
        elif totalGross:
            overallContributions[k] = kSum / totalGross
        else:
            # Explicit NaN instead of a divide-by-zero warning or error
            overallContributions[k] = float('nan')

    return overallContributions

# One dict of per-genre contributions for every week
overallContributions = tnDf.groupby('dateDt').apply(calculateOverallContribution)

In [None]:
# week -> {genre: contribution}
conDict = overallContributions.to_dict()

# One row per week, one column per genre
conDf = pd.DataFrame.from_dict(conDict).T

# Parse the week labels into a datetime index carrying the name 'index'
conDf.index = pd.to_datetime(conDf.index).rename('index')

In [None]:
# Keep weeks after 1900-01-01; the UTC-aware bound implies conDf's index is tz-aware too
plotDf = conDf[conDf.index > pd.to_datetime('1900-01-01', utc=True )]

# Rolling window length in rows: 5 * 52 (five years if there is one row per week)
yearRoll = 5*52

# Smooth each genre with a rolling mean, then rescale every row to sum to 1 so the
# stacked areas fill the full height of the plot
rollingDf = plotDf.rolling(window=(yearRoll)).mean()
normRollingDf = rollingDf.div(rollingDf.sum(axis=1), axis=0)


# Create the stack plot
fig, ax = plt.subplots(figsize=(8, 4))

# Plotting the stack plot
ax.stackplot(normRollingDf.index, normRollingDf.T, labels=normRollingDf.columns)

# Adding labels and title
ax.set_title(f'{int(yearRoll/52)} Year Rolling Average')
ax.set_xlabel('Date')
ax.set_ylabel('Box Office Proportion')
# Reverse the order of the legend labels
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels[::-1], loc='upper left')

# Save image
imageFilePath = os.path.join(saveImagePath, 'distributionsOverTime.png')
plt.savefig(imageFilePath, dpi=300)

# Display the plot
plt.show()

### Turn above image into an animation

In [None]:
# Rendering is slow, so ask before starting
confirmation = input("Are you sure you want to proceed (This takes a longggg time)? (yes/no): ")

if confirmation.lower() == 'yes':
    plotDf = conDf[conDf.index > pd.to_datetime('1900-01-01', utc=True)]

    # Same smoothing and row-normalisation as the static plot, except that rows whose
    # rolling mean is NaN for every genre are dropped first
    rollingDf = plotDf.rolling(window=(yearRoll)).mean()
    rollingDf.dropna(how='all', axis=0, inplace=True)
    normRollingDf = rollingDf.div(rollingDf.sum(axis=1), axis=0)

    # Create the stack plot
    fig, ax = plt.subplots(figsize=(8, 4))

    def update(i):
        """Redraw the stack plot using rows 0..i of normRollingDf (one call per frame)."""
        ax.clear()
        # Local name only; this does not rebind the outer plotDf
        plotDf = normRollingDf.iloc[0:i+1]
        ax.stackplot(plotDf.index, plotDf.T, labels=plotDf.columns)

        ax.set_title(f'{int(yearRoll / 52)} Year Rolling Average')
        ax.set_xlabel('Date')
        ax.set_ylabel('Box Office Proportion')
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[::-1], labels[::-1], loc='upper left')

    # Playback speed of the saved video
    fps = 50

    # One frame per row of normRollingDf
    ani = FuncAnimation(fig, update, frames=range(len(normRollingDf)))

    # Save the animation to an mp4 file
    ani.save(os.path.join(saveImagePath, 'genreDistributions.mp4'), writer='ffmpeg', fps=fps, dpi=200)

### Generate line plots

In [None]:
# Weeks where every genre value is missing add nothing to the averages
conDf.dropna(how='all', inplace=True)

rollingDf = conDf.rolling(window=yearRoll).mean()

fig, ax = plt.subplots(figsize=(8, 4))

# Headline genres are drawn bold and opaque; all others are thin, faded context lines
for c in rollingDf.columns:
    lw, opacity = (
        (3, 1)
        if c in ('Romance', 'Comedy', 'Suspense', 'Sci-Fi/Fantasy', 'Action/Thriller')
        else (.5, .5)
    )
    ax.plot(rollingDf[c], label=f"{c}", lw=lw, alpha=opacity)

ax.set_xlabel('Date')
ax.set_ylabel('Box Office Proportion')
ax.set_title(f'{int(yearRoll/52)} Year Rolling Average')
ax.legend(loc='upper left')

# Write the figure to disk before showing it
imageFilePath = os.path.join(saveImagePath, 'linePlotsOverTime.png')
plt.savefig(imageFilePath, dpi=300)

plt.show()

## Statistics

### Basic Regressions
- Using date as the x variable will tell us if there is a meaningful relationship between time and proportion

In [None]:
tests = ['Comedy','Sci-Fi/Fantasy','Action/Thriller', 'Romance', 'Suspense']
outputTable = []

# Regress each genre's proportion on its row position (0, 1, 2, ...), which stands in for time
for test in tests:
    y = conDf[test]
    X = np.arange(len(y))

    model = linearModelGeneral(X, y, [1])

    outputTable += [[test, model['params']['x1'], model['F'], model['p']]]

print(tabulate(outputTable, headers=['Genre', 'x1', 'F', 'p']))

### Inflection point analysis
- We're now going to look at whether there are corners in the line that are sharper than expected
- To do this, we'll compute something close to a smoothed second derivative: fit a slope to each sliding window of the curve, then measure how much that slope changes across a run of consecutive windows

In [None]:
def bowtieAnalysis(slopeWindow):
    """
        - Score a run of consecutive slopes by how far the slope has moved between its
        first and its last entry; the largest scores mark the sharpest X's on a graph

    """
    lastSlope = slopeWindow[-1]
    firstSlope = slopeWindow[0]

    return abs(lastSlope - firstSlope)

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(12, 8))  # Create a 2x2 grid of subplots
# scaler = MinMaxScaler()

# One panel per genre: fit a straight line to every sliding window of the rolling-mean
# curve, then highlight the windows where the fitted slope changes the most
for idx, test in enumerate(['Comedy', 'Sci-Fi/Fantasy', 'Action/Thriller', 'Suspense']):  # Limit to the first 4 tests for the 2x2 grid
    ax = axs[idx // 2, idx % 2]
    plotDf = rollingDf.dropna(subset=test)
    # plotDf = pd.DataFrame(scaler.fit_transform(rollingDf), columns=rollingDf.columns, index=rollingDf.index)
    values = np.array(plotDf[test])

    # 150-point windows; overlap = int(150 / 1.05) = 142, so per the index arithmetic
    # below consecutive windows are taken to start 8 points apart
    windowSize = 150
    overlap = int(windowSize / 1.05)

    windows = createSlidingWindows(l=values, windowSize=windowSize, overlap=overlap)

    windowPreds = []
    slopes = []
    for i, window in enumerate(windows):
        y = window
        X = np.arange(0, len(y))
        model = linearModelGeneral(X, y, [1])
        # NOTE(review): this value is overwritten by the polyfit slope just below, so only
        # model['ypred'] is actually used from the fitted model
        slope = model['params'][-1]

        coefficients = np.polyfit(X,y,1)
        slope = coefficients[0]

        # Position of each window point on the full curve: window i is assumed to start
        # at i * (windowSize - overlap)
        overallIndices = [((windowSize * i) - (overlap * i)) + j for j, k in enumerate(X)]
        preds = model['ypred']

        windowPreds.append(list(zip(overallIndices, preds))) # This is for plotting
        slopes.append(slope) # This is for determining most turbulent windows

    # Score every run of 10 consecutive slopes by |last slope - first slope| (bowtieAnalysis)
    slopeWindowSize = 10
    slopeStep = slopeWindowSize - 1
    slopeWindows = createSlidingWindows(l=slopes, windowSize = slopeWindowSize, overlap = slopeStep) # the reason that we're sliding here is because we want to consider all possible windows

    # Despite the name, these are absolute slope differences, not standard deviations
    windowStds = [bowtieAnalysis(window) for window in slopeWindows]

    # Slope-window indices ordered from largest to smallest score.
    # NOTE(review): the `s == s` filter drops NaNs before argsort, which would shift the
    # returned indices relative to slopeWindows if any NaN were present — confirm none occur.
    maxIdxs = np.argsort([s for s in windowStds if s == s])[::-1]

    # Keep at most three top-scoring slope windows, each at least slopeWindowSize*3
    # positions away from every window already kept
    maxIdxsFiltered = []
    for j,i in enumerate(maxIdxs):
        if j==0:
            maxIdxsFiltered.append(i)
        else:
            if any(np.abs(np.array(maxIdxsFiltered) - i) < slopeWindowSize*3):
                continue

            if len(maxIdxsFiltered) >= 3:
                break

            maxIdxsFiltered.append(i)

    # Expand each kept slope window into the indices of the fit windows it spans.
    # This inner loop reuses the name `idx`; the outer enumerate rebinds it on the next pass.
    maxIdxsFull = []
    for idx in maxIdxsFiltered:
        startIdx = (idx * slopeWindowSize) - (slopeStep * idx)
        newIdxs = [i for i in range(startIdx, startIdx + slopeWindowSize)]
        maxIdxsFull += newIdxs


    # The smoothed curve itself, drawn in black against integer positions
    ax.plot(np.arange(len(plotDf)), values, c='k', alpha=1, zorder=2)
    
    # Every window's fitted line; windows inside a highlighted run are drawn in 'cherry'
    for i, pred in enumerate(windowPreds):
        Xplot, yplot = zip(*pred)
        
        color = ('cherry',.75,2) if i in maxIdxsFull else ('blue_grey_dark',.5,1)
        ax.plot(Xplot, yplot, c=loadPalette()[color[0]], alpha=color[1], lw=4, zorder=color[2])

    # Convert index to datetime and extract year
    plotDf.index = pd.to_datetime(plotDf.index)
    # A tick every 256 rows, labelled with the year of the row at that position
    custom_ticks = np.arange(0, len(plotDf), step=256)
    custom_labels = plotDf.index[::256].year

    # Set custom ticks and labels
    ax.set_xticks(custom_ticks)
    ax.set_xticklabels(custom_labels)

    ax.set_title(f'{test} - Rolling Window Predictions')
    ax.set_xlabel('Year')
    ax.set_ylabel('Box Office Proportion Normed')

# fig.subplots_adjust(hspace=0.35, wspace=0.15)
fig.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, hspace=0.35, wspace=0.15)

# Save image
imageFilePath = os.path.join(saveImagePath, 'windowedAverages.png')
plt.savefig(imageFilePath, dpi=300)

plt.show()

### Studio prevalence
- Essentially repeating genre analysis, but substituting studio for genre

In [None]:
# Independent copy of the TMDB rows whose production company list is not the empty '[]'
prodDf = tmdbDf.loc[tmdbDf['production_companies'] != '[]'].copy(deep=True)

def getCompanies(s):
    """Return the company names stored in a stringified ``production_companies`` field.

    Parameters
    ----------
    s : str
        Python-literal text of a list of dicts that each carry a ``'name'`` key.

    Returns
    -------
    list
        The ``'name'`` of every entry, in order. An empty list is returned when
        ``s`` is not a string (e.g. NaN, which the ``!= '[]'`` filter applied to
        the frame lets through) or is blank, instead of letting
        ``ast.literal_eval`` raise.
    """
    if not isinstance(s, str) or not s.strip():
        return []

    # literal_eval only parses Python literals; the text is never executed as code
    return [company['name'] for company in ast.literal_eval(s)]

# Parse each stringified company list into a plain list of company names
prodDf['productionCompanies'] = prodDf['production_companies'].apply(getCompanies)

In [None]:
# imdbId -> list of production companies, leaving out movies whose only company is 'Private'
companyDict = {
    movieId: companies
    for movieId, companies in prodDf.set_index('imdbId')['productionCompanies'].to_dict().items()
    if companies != ['Private']
}

In [None]:
# Attach each movie's list of production companies (NaN when the id is not in companyDict)
tnDf['productionCompany'] = tnDf['imdbId'].map(companyDict)
# One row per (movie-week, company): a movie with several companies is repeated once per
# company, each copy carrying the movie's full weekGross
expDf = tnDf.explode('productionCompany')

In [None]:
# Weekly gross per production company, then each company's share of that week's summed gross
distDf = pd.DataFrame(expDf.groupby(['dateDt','productionCompany'])['weekGross'].sum()).reset_index(drop=False)
distDf['proportions'] = distDf.groupby('dateDt')['weekGross'].transform(lambda g: g / g.sum())

distDf = distDf[['dateDt','productionCompany','proportions']]

# Wide layout: one row per week, one column per company; a company absent from a week gets 0
pivotDf = distDf.pivot(index='dateDt', columns='productionCompany', values='proportions')
pivotDf.fillna(0, inplace=True)

# Only keep companies whose rolling-mean share reaches at least .04 at some point

# Rebinds the global window length: 2 * 52 rows from here on
yearRoll = 2*52

rollingDf = pivotDf.rolling(window=(yearRoll)).mean()
rollingDf = rollingDf.loc[:, (rollingDf >= .04).any()]
print(f"NUMBER OF STUDIOS AFTER FILTER: {len(rollingDf.columns)}")

In [None]:
# Create the stack plot
fig, ax = plt.subplots(figsize=(8, 4))

# Rescale each row so the retained studios sum to 1
normRollingDf = rollingDf.div(rollingDf.sum(axis=1), axis=0)


# Plotting the stack plot
ax.stackplot(normRollingDf.index, normRollingDf.T, labels=normRollingDf.columns)

# Adding labels and title
ax.set_title(f'Box Office by Studio') 
ax.set_xlabel('Date')
ax.set_ylabel('Box Office Proportion')
# Reverse the order of the legend labels
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels[::-1], loc='upper left')

# Select 8 evenly spaced ticks
num_ticks = 8
tick_positions = np.linspace(0, len(normRollingDf.index) - 1, num_ticks, dtype=int)
custom_ticks = normRollingDf.index[tick_positions]

# Format the ticks to display only the year
custom_labels = [pd.to_datetime(tick).year for tick in custom_ticks]

# Set custom ticks and labels
# NOTE(review): ticks are placed at integer row positions, which presumably matches the
# x-axis only while the index holds date strings (plotted as categories) rather than
# datetimes — confirm.
ax.set_xticks(tick_positions)
ax.set_xticklabels(custom_labels)

# Save image
imageFilePath = os.path.join(saveImagePath, 'studioDistributions.png')
plt.savefig(imageFilePath, dpi=300)

# Display the plot
plt.show()

### Examination of the early 90s

In [None]:
# Two adjacent three-year windows on either side of 1990, shown one table each
minDate1 = pd.to_datetime('1987-01-01', utc=True)
maxDate1 = pd.to_datetime('1990-01-01', utc=True)
minDate2 = pd.to_datetime('1990-01-01', utc=True)
maxDate2 = pd.to_datetime('1993-01-01', utc=True)

dateRanges = [(minDate1, maxDate1), (minDate2, maxDate2)]


# Formatting helpers are defined once, outside the loop: they do not depend on the
# loop variables, and formatMillions is reused by later cells
def formatMillions(x):
    """Format a dollar amount in millions with two decimals, e.g. 1500000 -> '$1.50M'."""
    return f"${x / 1_000_000:.2f}M"


def formatRelease(x):
    """Format a datetime-like value as 'year-month-day' (no zero padding)."""
    return f"{x.year}-{x.month}-{x.day}"


# Parse release dates once instead of on every pass of the loop
tmdbDf['release_date'] = pd.to_datetime(tmdbDf['release_date'], utc=True)

for minDate, maxDate in dateRanges:
    inRange = (tmdbDf['release_date'] >= minDate) & (tmdbDf['release_date'] < maxDate)

    # .copy() so the in-place edits below act on an independent frame, not a slice of tmdbDf
    df1992 = tmdbDf[inRange].copy()

    df1992.drop_duplicates(subset='imdbId', inplace=True, keep='last')

    df1992.sort_values(by='revenue', inplace=True, ascending=False)
    # Rows without genre text get an empty list rather than crashing the parser
    df1992['genresExtracted'] = df1992['genres'].apply(
        lambda x: extractGenres(x) if isinstance(x, str) else []
    )

    # Ten highest-revenue releases of the window
    dfDisplay = df1992[['title', 'revenue', 'release_date', 'genresExtracted']].head(10).copy()
    dfDisplay.columns = ['Title', 'Lifetime Revenue', 'Release Date', 'Genres']
    dfDisplay.reset_index(drop=True, inplace=True)

    # Apply formatting to the revenue and release date columns
    dfDisplay['Lifetime Revenue'] = dfDisplay['Lifetime Revenue'].apply(formatMillions)
    dfDisplay['Release Date'] = dfDisplay['Release Date'].apply(formatRelease)

    styled_df = dfDisplay.style.set_table_styles(loadTableStyles())
    display(Markdown(f"<h3 style='font-family:monospace; background-color:{loadPalette()['canvas_dark']}; color:black; padding:10px; margin-bottom:0;'>Movies Released Between {minDate.date()} and {maxDate.date()}</h3>"))
    display(styled_df)

### Examination of comedy RBRs

In [None]:
# Rows without genre text cannot be searched, so remove them from tmdbDf itself
tmdbDf.dropna(subset='genres', inplace=True)

# Every movie whose genre text mentions 'Comedy'
comedyDf = tmdbDf.loc[tmdbDf['genres'].str.contains('Comedy')]

In [None]:
# RBR = lifetime revenue divided by budget (budget 0 gives inf/NaN; handled by later cells)
comedyDf = comedyDf.assign(RBR=comedyDf['revenue'] / comedyDf['budget'])

# Keep only movies with a positive reported revenue
comedyDf = comedyDf.loc[comedyDf['revenue'] > 0]

In [None]:
# Tick formatter used for the budget / revenue axes
def millions(x, pos):
    """Render a raw amount in millions with one decimal, e.g. 1500000 -> '1.5M'.

    ``pos`` is part of the (value, position) formatter signature and is not used.
    """
    inMillions = x * 1e-6
    return f'{inMillions:.1f}M'

fix, ax = plt.subplots()

# Every comedy as one point: budget on x, lifetime revenue on y
ax.scatter(comedyDf['budget'], comedyDf['revenue'], c=loadPalette()['blue_grey_dark'])

# Break-even reference line (slope of one) spanning the x-range the scatter produced
x_vals = np.array(ax.get_xlim())
y_vals = x_vals
ax.plot(x_vals, y_vals, '--', color=loadPalette()['cherry'], label='Revenue == Budget')

# Both axes are shown in millions
formatter = FuncFormatter(millions)
ax.xaxis.set_major_formatter(formatter)
ax.yaxis.set_major_formatter(formatter)
ax.set_xlabel('Budget')
ax.set_ylabel('Revenue')

# Label only titles with a budget of at least 100M that earned more than five times that budget
texts = [
    ax.text(
        row['budget'], row['revenue'], row['title'],
        fontsize=8, fontfamily='monospace', alpha=0.75
    )
    for _, row in comedyDf[(comedyDf['RBR'] > 5) & (comedyDf['budget'] >= 100_000_000)].iterrows()
]

# Nudge the labels apart and connect each one to its point with a thin line
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='gray', lw=0.25))

plt.legend()

# Write the figure to disk before showing it
imageFilePath = os.path.join(saveImagePath, 'xyScatterAll.png')
plt.savefig(imageFilePath, dpi=300)

plt.show()

In [None]:
# Mid-budget comedies: budget strictly between 19M and 75M
midBudgetDf = comedyDf[comedyDf['budget'].between(19_000_000, 75_000_000, inclusive='neither')]

# Animated titles are left out of the mid-budget comparison
midBudgetDf = midBudgetDf[~midBudgetDf['genres'].str.contains('Animation')]

In [None]:
# Assuming midBudgetDf and loadPalette() are already defined

fix, ax = plt.subplots()

# Replace infinite values with NaN and drop rows with NaN in 'RBR'
midBudgetDf.replace([np.inf, -np.inf], np.nan, inplace=True)
midBudgetDf.dropna(subset=['RBR'], inplace=True)

# Outlier fences. Despite the Q1/Q3/IQR names, the cut points are the 15th and 85th
# percentiles of RBR, not the quartiles
Q1 = midBudgetDf['RBR'].quantile(0.15)
Q3 = midBudgetDf['RBR'].quantile(0.85)
IQR = Q3 - Q1

# Define outliers: RBR more than 1.5 * (Q3 - Q1) below Q1 or above Q3
outliers = midBudgetDf[(midBudgetDf['RBR'] < Q1 - 1.5 * IQR) | (midBudgetDf['RBR'] > Q3 + 1.5 * IQR)]
outliers.drop_duplicates(subset='imdbId', keep='last', inplace=True)
# Plot regular points
ax.scatter(midBudgetDf['budget'], midBudgetDf['revenue'], c=loadPalette()['blue_grey_dark'], zorder=1)

# Plot outliers (smaller markers drawn on top of the regular points)
ax.scatter(outliers['budget'], outliers['revenue'], c=loadPalette()['cherry'], s=10, zorder=2)

# Apply formatter to x and y axes
formatter = FuncFormatter(millions)
ax.xaxis.set_major_formatter(formatter)
ax.yaxis.set_major_formatter(formatter)
ax.set_xlabel('Budget')
ax.set_ylabel('Revenue')

# Annotate outliers with their titles
texts = []
for i, row in outliers.iterrows():
    texts.append(ax.text(
        row['budget'], row['revenue'], row['title'],
        fontsize=8, fontfamily='monospace', alpha=0.75
    ))

# Adjust text to avoid overlap
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='gray', lw=0.25))

# Define line points: pin the x-range to the budget band, then draw y = x across it
ax.set_xlim(19_000_000, 75_000_000)
x_vals = np.array(ax.get_xlim())
y_vals = x_vals  # Slope of one

# Plot line
ax.plot(x_vals, y_vals, '--', color=loadPalette()['cherry'], label='Revenue == Budget')
plt.legend()

# Save image
imageFilePath = os.path.join(saveImagePath, 'xyScatterOutliers.png')
plt.savefig(imageFilePath, dpi=300)

plt.show()

### Get cast list

In [None]:
# TMDB credentials come from the environment; os.getenv yields None when a variable is unset.
# Only TMDB_AUTH_TOKEN (sent as a Bearer token) is used by the code visible in this notebook.
TMDB_API_KEY = os.getenv("TMDB_API_KEY")
TMDB_AUTH_TOKEN = os.getenv("TMDB_AUTH_TOKEN")

In [None]:
def getCast(imdbId, justNames: bool = True):
    """Fetch a movie's cast from the TMDB credits endpoint.

    Parameters
    ----------
    imdbId : str
        Movie identifier interpolated into ``/3/movie/{imdbId}/credits``.
    justNames : bool, default True
        When True, return a display string of up to five names; when False,
        return the raw cast list from the response.

    Returns
    -------
    str or list
        ``justNames=True``: comma-separated names of the first five cast entries
        whose ``known_for_department`` is ``'Acting'``.
        ``justNames=False``: the response's ``'cast'`` list (empty when the
        response has no such field).

    Raises
    ------
    requests.HTTPError
        When TMDB answers with an error status, instead of the opaque
        ``KeyError: 'cast'`` an error payload used to trigger.
    requests.Timeout
        When TMDB does not answer within 30 seconds.
    """
    url = f"https://api.themoviedb.org/3/movie/{imdbId}/credits?language=en-US"

    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {TMDB_AUTH_TOKEN}"
    }

    # Without a timeout a stalled connection would hang the notebook indefinitely
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    rJ = response.json().get('cast', [])

    if not justNames:
        return rJ

    # .get(): tolerate cast entries that lack a known_for_department field
    names = [person['name'] for person in rJ if person.get('known_for_department') == 'Acting'][:5]

    return ', '.join(names)

In [None]:
midBudgetDf.drop_duplicates(subset='imdbId', keep='last', inplace=True)

# Twenty highest revenue-to-budget ratios, with the leading cast looked up per title
goodDfDisplay = (
    midBudgetDf[['title', 'revenue', 'budget', 'RBR', 'imdbId']]
    .sort_values(by='RBR', ascending=False)
    .head(20)
)
goodDfDisplay['cast'] = goodDfDisplay['imdbId'].apply(getCast)
goodDfDisplay = goodDfDisplay.drop(columns=['imdbId']).reset_index(drop=True)
goodDfDisplay.columns = ['Title', 'Lifetime Revenue', 'Budget', 'RBR', 'Cast']

# Dollar columns become '$x.xxM' strings; RBR stays numeric so the Styler can format it
goodDfDisplay['Budget'] = goodDfDisplay['Budget'].apply(formatMillions)
goodDfDisplay['Lifetime Revenue'] = goodDfDisplay['Lifetime Revenue'].apply(formatMillions)
goodDfDisplay['RBR'] = goodDfDisplay['RBR'].round(2)

styledDf = goodDfDisplay.style.set_table_styles(loadTableStyles()).format({'RBR': '{:.2f}'})
display(Markdown(f"<h3 style='font-family:monospace; background-color:{loadPalette()['canvas_dark']}; color:black; padding:10px; margin-bottom:0;'>Highest comedy RBRs</h3>"))
display(styledDf)

In [None]:
midBudgetDf.drop_duplicates(subset='imdbId', keep='last', inplace=True)

# Twenty lowest revenue-to-budget ratios, with the leading cast looked up per title
badDfDisplay = (
    midBudgetDf[['title', 'revenue', 'budget', 'RBR', 'imdbId']]
    .sort_values(by='RBR', ascending=True)
    .head(20)
)
badDfDisplay['cast'] = badDfDisplay['imdbId'].apply(getCast)
badDfDisplay = badDfDisplay.drop(columns=['imdbId']).reset_index(drop=True)
badDfDisplay.columns = ['Title', 'Lifetime Revenue', 'Budget', 'RBR', 'Cast']

# Dollar columns become '$x.xxM' strings; RBR keeps four decimals because these ratios are small
badDfDisplay['Budget'] = badDfDisplay['Budget'].apply(formatMillions)
badDfDisplay['Lifetime Revenue'] = badDfDisplay['Lifetime Revenue'].apply(formatMillions)
badDfDisplay['RBR'] = badDfDisplay['RBR'].round(4)

styledDf = badDfDisplay.style.set_table_styles(loadTableStyles()).format({'RBR': '{:.4f}'})
display(Markdown(f"<h3 style='font-family:monospace; background-color:{loadPalette()['canvas_dark']}; color:black; padding:10px; margin-bottom:0;'>Lowest comedy RBRs</h3>"))
display(styledDf)

### Merge Dfs with Classifications

In [None]:
# Fifty best and fifty worst mid-budget comedies by RBR, each tagged with its class
goodDf = midBudgetDf.sort_values(by='RBR', ascending=False).head(50).assign(classification='good')
badDf = midBudgetDf.sort_values(by='RBR', ascending=True).head(50).assign(classification='bad')

mergedDf = pd.concat([goodDf, badDf])

# Row dicts in the same order as mergedDf, so position i lines up with castDict[i]
mergedDict = mergedDf.to_dict('records')

# Despite its name this is a list: the full raw cast list of each movie, in mergedDf order
castDict = [getCast(movieId, False) for movieId in mergedDf['imdbId']]

### Assess the star power
- We're going to loop through all of the cast members of the movies and we are going to get:
    - Their age at the time of movie release
    - The success of the last x of their movies
    - Their billing success...

        - We could potentially run something that is a bit more robust here... but that's a story for a different day
            - Essentially create a simple ML algorithm, and determine Shapley values for each person

### We're doing some really janky API calls here. 
- To get cast details on every movie, this could be set up as an asyncio function, but since it's only 100 movies, I'll just take the dog for a walk while it runs

In [None]:
# imdbId -> summed "star power" of the movie's top-billed cast
totalStarPower = {}
confirmation = input("Are you sure you want to proceed (This takes a longggg time)? (yes/no): ")

if confirmation.lower() == 'yes':

    # Auth headers for the direct TMDB calls below. getCast builds its own copy
    # internally, so no notebook-level `headers` existed before this point.
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {TMDB_AUTH_TOKEN}"
    }

    for i, movie in tqdm(enumerate(castDict), total=len(castDict)):
        # castDict and mergedDict are in the same order, so position i is the same movie
        curMovie = mergedDict[i]
        curMovieRelease = curMovie['release_date']

        movieTotalAttributions = 0

        # Only the five top-billed cast members are scored; k is the billing position (1-based)
        for k, person in enumerate(movie[:5], start=1):
            personId = person['id']

            # The unused person-details request was dropped: only the credits feed the score
            url1 = f"https://api.themoviedb.org/3/person/{personId}/combined_credits?language=en-US"
            creditsResponse = requests.get(url1, headers=headers)

            if creditsResponse.status_code != 200:
                continue

            creditDetails = creditsResponse.json().get('cast', [])

            previousAttributions = 0

            # Loop through cast members' past credits
            for credit in creditDetails:
                # combined_credits mixes movie and TV credits; a TV id sent to the /movie/
                # endpoint would resolve to an unrelated film, so only movie credits are followed
                if credit.get('media_type', 'movie') != 'movie':
                    continue

                creditId = credit['id']

                imdbUrl = f"https://api.themoviedb.org/3/movie/{creditId}/external_ids"

                imdbDetails = requests.get(imdbUrl, headers=headers)

                if imdbDetails.status_code != 200:
                    continue

                imdbId = imdbDetails.json().get('imdb_id')

                # Skip credits without an IMDb id or without a row in the local TMDB table
                if not imdbId or imdbId not in tmdbDict:
                    continue

                movieDetails = tmdbDict[imdbId]

                # An RBR needs both figures to be present and non-zero
                if not movieDetails['revenue'] or pd.isna(movieDetails['revenue']):
                    continue
                if not movieDetails['budget'] or pd.isna(movieDetails['budget']):
                    continue

                releaseDelta = (curMovieRelease - pd.to_datetime(movieDetails['release_date'], utc=True)).days

                # Only movies released before the current movie count as track record
                if releaseDelta <= 0:
                    continue

                # RBR of the earlier movie, to be discounted by billing below
                RBR = movieDetails['revenue'] / movieDetails['budget']

                movieCast = getCast(imdbId, False)

                # The person's billing position (1-based) in that earlier movie
                billing = [j for j, e in enumerate(movieCast, start=1) if e['id'] == personId]

                if len(billing) == 0:
                    continue

                # Discount by billing in the earlier movie and by billing in the current one
                attribution = (RBR / billing[0]) / k

                # Finally, weight the attribution based on how long ago the movie occurred
                # NOTE(review): this weighted value is computed but the unweighted
                # `attribution` is what gets accumulated below. Behavior is left unchanged
                # so results stay comparable with any cached starPower.json — confirm
                # which of the two was intended.
                attributionWeighted = attribution / (releaseDelta / 10)

                previousAttributions += attribution

            movieTotalAttributions += previousAttributions

        totalStarPower[curMovie['imdbId']] = movieTotalAttributions

    # Cache the result so later runs can answer "no" to the prompt
    with open('../data/starPower.json', 'w') as f:
        json.dump(totalStarPower, f)
else:

    print('Phew, let\'s continue')
    with open('../data/starPower.json') as f:
        totalStarPower = json.load(f)

mergedDf['starPower'] = mergedDf['imdbId'].map(totalStarPower)

### Get mean age of stars at time of release

In [None]:
# imdbId -> mean age (in whole years) of the top-billed cast on the movie's release date
ageDict = {}

# Auth headers for the direct TMDB person lookups below. getCast builds its own copy
# internally, so this cell defines the headers it needs itself.
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {TMDB_AUTH_TOKEN}"
}

for i, movie in tqdm(enumerate(castDict), total=len(castDict)):
    # castDict and mergedDict are in the same order, so position i is the same movie
    curMovie = mergedDict[i]
    curMovieRelease = curMovie['release_date']

    ages = []

    # Only the five top-billed cast members are considered
    for person in movie[:5]:
        personId = person['id']

        url0 = f"https://api.themoviedb.org/3/person/{personId}?language=en-US"
        details = requests.get(url0, headers=headers)

        if details.status_code != 200:
            continue

        # People without a recorded birthday are skipped instead of feeding a
        # missing date into relativedelta
        birthdayRaw = details.json().get('birthday')
        if not birthdayRaw:
            continue

        birthday = pd.to_datetime(birthdayRaw, utc=True)

        # Completed years between birth and release
        age = relativedelta(curMovieRelease, birthday).years

        ages.append(age)

    # np.mean([]) would warn and return nan; make the "no usable birthdays" case explicit
    ageDict[curMovie['imdbId']] = np.mean(ages) if ages else np.nan

In [None]:
# Attach the mean cast age per movie; ids missing from ageDict become NaN
mergedDf['starAges'] = mergedDf['imdbId'].map(ageDict)

### Retrieve ratings from redis
- I have a separate redis db that maps each IMDb id to its IMDb rating

In [None]:
# Local redis instance, database 5; decode_responses=True makes keys and values come back as str
r5 = redis.Redis(
    host='127.0.0.1',
    port=6379,
    charset="utf-8",
    decode_responses=True,
    db=5
)

redisKeys = r5.keys('*')

# MGET needs at least one key, so skip the round trip when the database is empty
rawValues = r5.mget(redisKeys) if redisKeys else []

# mget yields None for a key that vanished between keys() and mget(); leave those pairs
# out instead of letting float(None) raise
redisDict = {key: float(value) for key, value in zip(redisKeys, rawValues) if value is not None}
redisValues = list(redisDict.values())

In [None]:
# Attach the rating per movie; ids that are not keys of redisDict become NaN
mergedDf['imdbRating'] = mergedDf['imdbId'].map(redisDict)

### Final analyses

In [None]:
from scipy import stats
# Function to create a boxplot and overlay t-test results
def create_boxplot_with_ttest(ax, df, column):
    """Draw side-by-side boxplots of ``column`` for the two classes and annotate a t-test.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    df : pd.DataFrame
        Must have a ``'classification'`` column holding ``'good'`` / ``'bad'``
        and the numeric column named by ``column``.
    column : str
        Column to compare; also used as the subplot title.

    Notes
    -----
    Missing values are dropped from each group first. Several compared columns are
    filled through ``.map`` lookups that leave NaN for unmatched movies, and a single
    NaN would otherwise turn the t-test result into NaN and distort the boxes.
    """
    group_a = df.loc[df['classification'] == 'good', column].dropna()
    group_b = df.loc[df['classification'] == 'bad', column].dropna()

    # Independent two-sample t-test between the high-RBR and low-RBR groups
    t_stat, p_value = stats.ttest_ind(group_a, group_b)

    # Create boxplot
    ax.boxplot([group_a, group_b], labels=['High RBR', 'Low RBR'])
    ax.set_title(f'{column}')
    ax.set_xlabel('Group')
    ax.set_ylabel('Value')

    # Overlay t-test results between the two boxes, anchored at the largest observed value
    ax.text(1.5, max(group_a.max(), group_b.max()), f't-stat: {t_stat:.2f}\np-value: {p_value:.3f}',
            horizontalalignment='center', verticalalignment='top', fontsize=10, bbox=dict(facecolor='white', alpha=0.5))

# Create a 3x2 grid of plots (six panels, one per compared column)
fig, axes = plt.subplots(3, 2, figsize=(8, 8))


# Raw column name -> display name; the display names become the subplot titles
renameCols = {
    'budget':'Budget', 
    'revenue':'Revenue', 
    'starAges':'Star Ages',
    'starPower':'Star Power',
    'runtime':'Runtime', 
    'imdbRating':'IMDB Rating', 
}

# Renames mergedDf's columns in place, so later code must use the display names
mergedDf.rename(columns=renameCols, inplace=True)

# One panel per renamed column, filled row by row
for ax, column in zip(axes.flatten(), renameCols.values()):
    create_boxplot_with_ttest(ax, mergedDf, column)

plt.tight_layout()

# Save image
imageFilePath = os.path.join(saveImagePath, 'boxplots.png')
plt.savefig(imageFilePath, dpi=300)

plt.show()

### Comedies since The Hangover with highest RBRs

In [None]:
# Quick look at the first two rows of comedyDf (rendered as the cell output)
comedyDf.head(2)

In [None]:
# Comedies released from 2010 onward with a budget above 10M, excluding any title
# that is also tagged Animation, Action or Drama
recentDf = comedyDf[
    (comedyDf['release_date'] >= pd.to_datetime('2010-01-01', utc=True))
    & ~comedyDf['genres'].str.contains('Animation')
    & ~comedyDf['genres'].str.contains('Action')
    & ~comedyDf['genres'].str.contains('Drama')
    & (comedyDf['budget'] > 10_000_000)
]

In [None]:
# Best ratio first; infinite ratios become NaN and rows without a usable RBR are removed
recentDf = (
    recentDf.sort_values(by='RBR', ascending=False)
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=['RBR'])
)

In [None]:
# Budget vs. revenue for the recent comedies, with RBR outliers highlighted and labelled
fix, ax = plt.subplots()

ax.scatter(recentDf['budget'], recentDf['revenue'], c = loadPalette()['blue_grey_dark'])

# Define line points
x_vals = np.array(ax.get_xlim())
y_vals = x_vals  # Slope of one

# Plot line
ax.plot(x_vals, y_vals, '--', color=loadPalette()['cherry'], label='Revenue == Budget')

# Apply formatter to x and y axes
formatter = FuncFormatter(millions)
ax.xaxis.set_major_formatter(formatter)
ax.yaxis.set_major_formatter(formatter)
ax.set_xlabel('Budget')
ax.set_ylabel('Revenue')


# Calculate Q1, Q3, and IQR (true quartiles here: 25th and 75th percentiles of RBR)
Q1 = recentDf['RBR'].quantile(0.25)
Q3 = recentDf['RBR'].quantile(0.75)
IQR = Q3 - Q1

# Define outliers: RBR more than 1.5 * IQR below Q1 or above Q3, highest RBR first
outliers = recentDf[(recentDf['RBR'] < Q1 - 1.5 * IQR) | (recentDf['RBR'] > Q3 + 1.5 * IQR)]
outliers.drop_duplicates(subset='imdbId', keep='last', inplace=True)
outliers.sort_values(by='RBR', ascending=False, inplace=True)
# Plot outliers
ax.scatter(outliers['budget'], outliers['revenue'], c=loadPalette()['cherry'], s=10, zorder=2)

# Annotate outliers
texts = []
for i, row in outliers.iterrows():

    # Stops once more than 15 labels exist, i.e. at most 16 titles are labelled
    if len(texts)>15:
        break
    texts.append(ax.text(
        row['budget'], row['revenue'], row['title'],
        fontsize=8, fontfamily='monospace', alpha=0.75
    ))

# Adjust text to avoid overlap
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='gray', lw=0.25))
        
plt.legend()


# Save image
imageFilePath = os.path.join(saveImagePath, 'recentOutliers.png')
plt.savefig(imageFilePath, dpi=300)

plt.show()