In [1]:
import pandas as pd
import statsmodels as sm
import os
from tabulate import tabulate
import matplotlib.pyplot as plt
plt.style.use("../assets/plot_styles.mplstyle")
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from collections import Counter
import ast
import sys
import json
import redis
from PIL import Image, ImageDraw, ImageOps
from io import BytesIO
import cv2
from bs4 import BeautifulSoup
from matplotlib.animation import FuncAnimation

sys.path.append('../library')
from core import flattenWithGenerator, createSlidingWindows, exceptionOutput
from midStats import linearModelGeneral
from plotting import loadPalette, loadTableStyles, createBoxplotWithTTests
from imageProcessing import *

from IPython.display import display, Markdown
from datetime import timedelta
import time
from matplotlib.ticker import FuncFormatter
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import matplotlib.colors as mcolors

from adjustText import adjust_text
from tabulate import tabulate
from tqdm.notebook import tqdm
import requests

colorPalette = loadPalette()

In [2]:
# Directory that receives every figure / animation written by this notebook.
saveImagePath = '../assets/savedImages/whoIsTheBiggestMovieStarInTheWorld'

# exist_ok makes this idempotent and avoids the check-then-create race.
os.makedirs(saveImagePath, exist_ok=True)

# The TMDB bearer token comes from the environment so it never lands in the notebook.
TMDB_AUTH_TOKEN = os.getenv('TMDB_AUTH_TOKEN')
if not TMDB_AUTH_TOKEN:
    # Without a token the header below becomes "Bearer None"; surface that early.
    print('WARNING: TMDB_AUTH_TOKEN is not set - requests that use `headers` will likely fail')

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {TMDB_AUTH_TOKEN}"
}

### Load credits from redis

In [None]:
# Connect to the local Redis db that holds the cached credits payloads.
# NOTE(review): keys are treated as IMDb ids and values as JSON documents with a
# 'cast' list - inferred from how they are used below; confirm against the loader.
r = redis.Redis(
    host='127.0.0.1',
    port=6379,
    encoding="utf-8",  # `charset` is the deprecated spelling of this option
    decode_responses=True,
    db=6
)

keys = r.keys('*')
# MGET rejects an empty key list, so skip the round trip when the db is empty.
values = r.mget(keys) if keys else []

# Flat list of cast records (one dict per credit), each tagged with its film's key.
creditsDict = []

for imdbKey, v in tqdm(zip(keys, values), total=len(values)):
    if v is None:
        # The key disappeared between KEYS and MGET; there is nothing to parse.
        continue
    vJ = json.loads(v)
    creditsDict += [{**c, 'imdbId': imdbKey} for c in vJ['cast']]
    

In [4]:
# One row per cast record collected in `creditsDict`.
creditsDf = pd.DataFrame(creditsDict)

### Load TMDB Df

In [5]:
tmdbDfRaw = (
    pd.read_csv('../data/tmdbDetails.csv')
    # one row per film: when an imdb_id repeats, the last occurrence wins
    .drop_duplicates('imdb_id', keep='last')
    # keep only rows whose `adult` flag compares equal to False
    .loc[lambda frame: frame['adult'] == False]
    # keep rows whose genres text does not mention 'Animation'
    # (rows with missing genres compare as NaN == False -> False and are dropped too)
    .loc[lambda frame: frame['genres'].str.contains('Animation') == False]
)

In [6]:
# Snapshot the distinct IMDb ids left in tmdbDfRaw and persist them as JSON.
allIds = tmdbDfRaw['imdb_id'].unique().tolist()

with open('../data/allIds.json', 'w') as f:
    f.write(json.dumps(allIds))

### Filter TMDB for what we have credits on

In [7]:
def preprocessData(tmdbDf, cutoff=20):
    """
    Aggregate per-person revenue totals from a TMDB frame and prepare headshots for the top earners.

    Turning this into a fn as we are going to be repeating these steps on some different versions of the df.

    Steps:
      1. Keep films that have credits in the global `creditsDf`, de-duplicated on `imdb_id`
         (last row wins), with positive `revenue` and `budget`.
      2. Attach release_date / revenue / budget to every credit row and keep people whose
         `known_for_department` is 'Acting'.
      3. Per person `id`, sum `revenue` and `contribution` (= revenue / (cast_id + 1)) and
         count credits.
      4. Take the top 50 people by each sum, then fetch and face-crop headshots for the
         first `cutoff` people of each of the two lists.

    Reads the globals `creditsDf` and `headers`; `creditsDf` is not modified.

    Parameters
    ----------
    tmdbDf : pd.DataFrame
        Must provide the columns imdb_id, release_date, revenue and budget.
    cutoff : int, default 20
        Number of people taken from the top of each outlier list for headshots.

    Returns
    -------
    totalSums : pd.DataFrame
        Indexed by person id with columns revenue_sum, size, contribution_sum and name,
        sorted by revenue_sum descending.
    annotations : list of list
        [top-50 ids by revenue_sum, top-50 ids by contribution_sum].
    annotationsFlat : list
        `annotations` passed through `flattenWithGenerator`.
    idNameMap : dict
        Person id -> name.
    """
    # --- films: one row per imdb_id we hold credits for, with usable financials ---
    tmdbDf = tmdbDf[tmdbDf['imdb_id'].isin(creditsDf['imdbId'].unique())]
    # Non-inplace on purpose: the frame above is a filtered slice of the caller's data.
    tmdbDf = tmdbDf.drop_duplicates(subset=['imdb_id'], keep='last')
    tmdbDf = tmdbDf[(tmdbDf['revenue'] > 0) & (tmdbDf['budget'] > 0)]
    tmdbDict = (
        tmdbDf[['imdb_id','release_date','revenue','budget']]
        .set_index('imdb_id')
        .to_dict('index')
    )

    # --- credits: attach film financials without mutating the shared creditsDf ---
    expDf = pd.json_normalize(creditsDf['imdbId'].map(tmdbDict))
    # errors='ignore': a 'tmdbInfo' helper column may exist if another cell added it.
    resDf = creditsDf.drop(columns=['tmdbInfo'], errors='ignore').join(expDf)
    resDf = resDf[resDf['known_for_department'] == 'Acting'].copy()
    # NOTE(review): cast_id is treated as a 0-based billing position here - confirm it is
    # not an arbitrary credit identifier.
    resDf['contribution'] = resDf['revenue'] / (resDf['cast_id'] + 1) # indexing starts at 0

    idNameMap = resDf[['id','name']].set_index('id').to_dict()['name']

    # A list (rather than a set) keeps the aggregated column order deterministic.
    totalSums = resDf.groupby('id')[['revenue','contribution']].agg(['sum', 'size'])
    totalSums.columns = ['_'.join(col).strip() for col in totalSums.columns.values]
    totalSums = (
        totalSums
        .sort_values(by='revenue_sum', ascending=False)
        .drop(columns='contribution_size')  # identical to revenue_size, keep one copy
        .rename(columns={'revenue_size':'size'})
    )
    totalSums['name'] = totalSums.index.map(idNameMap)

    # get headshots for outliers - going to just select top 50
    outliersY1 = totalSums.sort_values(by='revenue_sum', ascending=False).head(50)
    outliersY2 = totalSums.sort_values(by='contribution_sum', ascending=False).head(50)

    annotations = [list(outliersY1.index), list(outliersY2.index)]
    annotationsFlat = list(flattenWithGenerator(annotations))

    print(f"NUMBER OF OUTLIERS = {len (annotationsFlat)}")
    print(f"ONLY PLOTTING {cutoff}")

    # The plots annotate the first `cutoff` people of EACH list, so fetch for both;
    # dict.fromkeys removes people present in both while keeping order.
    headshotIds = list(dict.fromkeys(annotations[0][:cutoff] + annotations[1][:cutoff]))

    for annotation in tqdm(headshotIds):
        try:
            getHeadshot(annotation, headers)
            if len(os.listdir(f'../data/headshots/{annotation}')) == 0:
                continue
            filePath = f'../data/headshots/{annotation}/{annotation}_0.jpg'
            extractFaces(filePath, makePretty=True)
        except Exception as e:
            # Best effort: one failed download / crop must not abort the whole run.
            print(exceptionOutput(e))
            print(annotation)

    return totalSums, annotations, annotationsFlat, idNameMap

### Pulling headshots for people in our outliers
- We'll move this to asyncio when pulling headshots for all actors

In [None]:
# Aggregates and headshots for the full catalogue in tmdbDfRaw.
(
    totalSums,
    annotations,
    annotationsFlat,
    idNameMap,
) = preprocessData(tmdbDfRaw)

In [None]:
# Define formatter function
def billions(x, pos):
    """Tick formatter: render a raw value in billions with one decimal, e.g. 2.5e9 -> '2.5B'.

    `pos` is the tick position supplied by matplotlib; it is not used.
    """
    return f'{x * 1e-9:.1f}B'

fig, axs = plt.subplots(2,1, figsize=(8,8))

# Loop-invariant objects are built once.
scatterColor = loadPalette()['blue_grey_dark']
formatter = FuncFormatter(billions)

for idx, targetCol in enumerate(['revenue_sum','contribution_sum']):
    ax = axs[idx]

    ax.scatter(totalSums['size'], totalSums[targetCol], c = scatterColor)

    # Apply formatter to the y axis
    ax.yaxis.set_major_formatter(formatter)
    ax.set_xlabel('Number of Credits (non-animated)')
    ax.set_ylabel('Cumulative Revenue' if idx == 0 else 'Adjusted Cumulative Revenue')

    # Largest credit count among the annotated people; used as the right-hand x limit.
    xLim = 0
    for annotation in annotations[idx][:20]:
        x = totalSums.loc[annotation, 'size']
        y = totalSums.loc[annotation, targetCol]
        xLim = max(xLim, x)

        imgPath = f'../data/headshots/{annotation}/00_faceExtracted.png'
        try:
            with Image.open(imgPath) as rawImg:
                img = rawImg.resize((45, 45), Image.LANCZOS)  # Resize image with high-quality interpolation
        except OSError:
            # No usable headshot (missing or unreadable file): leave the plain marker.
            continue

        imagebox = OffsetImage(img, zoom=0.5)  # Adjust zoom to fit the resized image
        ax.add_artist(AnnotationBbox(imagebox, (x, y), frameon=False))

    # Skip when nothing was annotated; (0, 0) would collapse the axis.
    if xLim > 0:
        ax.set_xlim(0, xLim)

fig.subplots_adjust(left=0.1, right=0.95, top=0.95, bottom=0.05, hspace=0.2, wspace=0.15)


# Save image
imageFilePath = os.path.join(saveImagePath, 'outlierScatter.png')
plt.savefig(imageFilePath, dpi=300)

plt.show()

### Nice df showing rankings across four columns

In [10]:
# Work on a copy so `totalSums` keeps its 'size' column: this cell (and the scatter
# cell that reads totalSums['size']) can then be re-run without errors.
# NOTE(review): `> 20` keeps people with at least 21 credits, while the table heading
# further down says "minimum 20 credits" - confirm which is intended.
rankedTotals = totalSums[totalSums['size'] > 20].copy()

rankedTotals['perMovieRevAvg'] = rankedTotals['revenue_sum'] / rankedTotals['size']
rankedTotals['perMovieContAvg'] = rankedTotals['contribution_sum'] / rankedTotals['size']

rankedTotals = rankedTotals.drop(columns='size')

# For every metric column: the names of the 20 highest-valued people, best first.
rankDict = {}
for column in [c for c in rankedTotals.columns if c != 'name']:
    ranks = rankedTotals[['name',column]].sort_values(by=column, ascending = False)
    rankDict[column] = list(ranks['name'])[:20]

In [11]:
# Explicit metric -> heading mapping: selecting by key fails loudly if a metric is
# missing, instead of silently mislabelling columns by position.
rankColumnLabels = {
    'revenue_sum': 'Total Revenue',
    'contribution_sum': 'Total Revenue Adjusted',
    'perMovieRevAvg': 'Avg. Revenue',
    'perMovieContAvg': 'Avg. Revenue Adjusted',
}
rankDf = pd.DataFrame(rankDict)[list(rankColumnLabels)].rename(columns=rankColumnLabels)

# Ranks are shown to readers, so number them from 1 rather than 0.
rankDf.index = rankDf.index + 1
rankDf.index.name = 'Rank'

In [None]:
# Map Colors
# Count how many of the ranking columns each name appears in.
allNames = rankDf.values.ravel('K')
counts = Counter(allNames)
min3 = [k for k,v in counts.items() if v > 2]  # names present in three or more columns
# zip() below stops at the shorter sequence, so names beyond the palette size stay uncoloured.
colors = list(loadPalette().values())[:len(min3)]
color_map = {val: f'background-color: {color}' for val, color in zip(min3, colors)}

# Apply colors
def highlightCells(val):
    """Return the CSS for a cell: a background colour for highlighted names, '' otherwise."""
    return color_map.get(val, '')

styler = rankDf.style
# Styler.applymap was renamed to Styler.map in newer pandas; use whichever exists.
applyElementwise = getattr(styler, 'map', None) or styler.applymap
styledDf = applyElementwise(highlightCells).set_table_styles(loadTableStyles())
display(Markdown(f"<h3 style='font-family:monospace; background-color:{loadPalette()['canvas_dark']}; color:black; padding:10px; margin-bottom:0;font-weight:bold;'>Revenue Rankings (minimum 20 credits)</h3>"))
display(styledDf)

### Filter Out Franchises
- Note that the belongs_to_collection column is not perfect (e.g., "The Eternals" is missed)
- We'll do a secondary filter on production company perhaps...

In [13]:
def getLowestLevelDict(d):
    """
    Collect every non-dict leaf of an arbitrarily nested dict into one flat list.

    Leaves are returned depth-first, in the dicts' iteration order. Only dicts are
    descended into: any other value (including a list) counts as a single leaf.
    A non-dict argument yields a one-element list containing it.
    """
    if isinstance(d, dict):
        return [leaf for child in d.values() for leaf in getLowestLevelDict(child)]
    return [d]

In [14]:
# Load the hand-maintained franchise mapping and flatten it to its leaf values.
with open('../data/franchiseMappings2.json') as f:
    franchiseData = json.loads(f.read())

# NOTE(review): the leaves are used below as IMDb ids (matched against imdb_id) - confirm the file's schema.
franchiseIds = getLowestLevelDict(franchiseData)

In [None]:
# f1..f5 record the row count after each successive filter, for the summary print.
f1 = len(tmdbDfRaw)

# Standalone film = no collection recorded (isna() replaces the `x != x` NaN trick).
standaloneDf = tmdbDfRaw[tmdbDfRaw['belongs_to_collection'].isna()]
f2 = len(standaloneDf)

# Drop films listed in the franchise mapping.
standaloneDf = standaloneDf[~standaloneDf['imdb_id'].isin(franchiseIds)]
f3 = len(standaloneDf)

# Filter marvel
# (rows with a missing production_companies value compare as NaN == False -> False,
#  so they are dropped here as well)
standaloneDf = standaloneDf[standaloneDf['production_companies'].str.contains('Marvel') == False]
f4 = len(standaloneDf)

# Filter Star Wars
standaloneDf = standaloneDf[standaloneDf['production_companies'].str.contains('Lucasfilm') == False]
f5 = len(standaloneDf)

# Non-inplace sort: standaloneDf is a filtered slice of tmdbDfRaw.
standaloneDf = standaloneDf.sort_values(by='revenue', ascending=False)
print([f1,f2,f3,f4,f5])

In [None]:
# Same aggregation as before, restricted to the non-franchise films in standaloneDf.
(
    nonIPTotalSums,
    nonIPAnnotations,
    nonIPAnnotationsFlat,
    nonIPIdNameMap,
) = preprocessData(standaloneDf)

In [None]:
# Scatter of credit count vs. (adjusted) cumulative revenue for non-franchise films,
# with headshots over the top 20 people of each metric.
fig, axs = plt.subplots(2,1, figsize=(8,8))

# Loop-invariant objects are built once.
scatterColor = loadPalette()['blue_grey_dark']
formatter = FuncFormatter(billions)

for idx, targetCol in enumerate(['revenue_sum','contribution_sum']):
    ax = axs[idx]

    ax.scatter(nonIPTotalSums['size'], nonIPTotalSums[targetCol], c = scatterColor)

    # Apply formatter to the y axis
    ax.yaxis.set_major_formatter(formatter)
    ax.set_xlabel('Number of Credits (non-animated)')
    ax.set_ylabel('Cumulative Revenue' if idx == 0 else 'Adjusted Cumulative Revenue')

    # Largest credit count among the annotated people; used as the right-hand x limit.
    xLim = 0
    for annotation in nonIPAnnotations[idx][:20]:
        x = nonIPTotalSums.loc[annotation, 'size']
        y = nonIPTotalSums.loc[annotation, targetCol]
        xLim = max(xLim, x)

        imgPath = f'../data/headshots/{annotation}/00_faceExtracted.png'
        try:
            with Image.open(imgPath) as rawImg:
                img = rawImg.resize((45, 45), Image.LANCZOS)  # Resize image with high-quality interpolation
        except OSError:
            # No usable headshot (missing or unreadable file): leave the plain marker.
            continue

        imagebox = OffsetImage(img, zoom=0.5)  # Adjust zoom to fit the resized image
        ax.add_artist(AnnotationBbox(imagebox, (x, y), frameon=False))

    # Skip when nothing was annotated; (0, 0) would collapse the axis.
    if xLim > 0:
        ax.set_xlim(0, xLim)

fig.subplots_adjust(left=0.1, right=0.95, top=0.95, bottom=0.05, hspace=0.2, wspace=0.15)


# Save image
imageFilePath = os.path.join(saveImagePath, 'outlierScatterNonFranchise.png')
plt.savefig(imageFilePath, dpi=300)

plt.show()

### Compare awards across the two groups of outliers

In [18]:
# Read the saved awards page from disk ...
with open('../assets/actingAwards.html', mode='r', encoding='utf-8') as file:
    htmlContent = file.read()

# ... and parse it with BeautifulSoup's built-in HTML parser.
soup = BeautifulSoup(htmlContent, features='html.parser')

In [19]:
subgroups = soup.find_all(class_='result-subgroup')

# Text of every nominations link, de-duplicated within its subgroup (nested divs
# reach the same link more than once).
nomineeTexts = []
for subgroup in subgroups:
    seenInSubgroup = set()

    for div in subgroup.find_all('div'):
        nominationLink = div.find(class_='nominations-link')
        if not nominationLink:
            continue
        linkText = nominationLink.get_text(strip=True)
        if linkText not in seenInSubgroup:
            seenInSubgroup.add(linkText)
            nomineeTexts.append(linkText)

# Newline-separated so a later substring count cannot match across two adjacent entries.
allTextBlob = "\n".join(nomineeTexts)

In [20]:
# For each top-50 person by contribution_sum: how many times their name occurs in the
# nominations text (plain substring count).
IPAwards = {
    personId: allTextBlob.count(idNameMap[personId])
    for personId in annotations[1]
}

nonIPAwards = {
    personId: allTextBlob.count(nonIPIdNameMap[personId])
    for personId in nonIPAnnotations[1]
}

In [21]:
# One frame per group (index = person id, column 'count'), tagged with its group label ...
ipDf = pd.DataFrame({'count': IPAwards}).assign(group='IP')
nonIpDf = pd.DataFrame({'count': nonIPAwards}).assign(group='Non-IP')

# ... stacked into a single long frame for the comparison.
tDf = pd.concat([ipDf, nonIpDf], axis=0)

In [None]:
from scipy import stats
fig, ax = plt.subplots(figsize=(5,5))
group_a = tDf.loc[tDf['group'] == 'IP', 'count']
group_b = tDf.loc[tDf['group'] == 'Non-IP', 'count']

# Perform t-test
# NOTE(review): ttest_ind defaults to equal variances; consider equal_var=False for count data.
t_stat, p_value = stats.ttest_ind(group_a, group_b)

# Create boxplot
ax.boxplot([group_a, group_b], labels=['IP','Non-IP'])
# Fixed title: the previous one read `column`, a loop variable left over from the ranking cell.
ax.set_title('Oscar Nominations by Group')
ax.set_xlabel('Group')
ax.set_ylabel('Number of Oscar Nominations')

# Overlay t-test results
ax.text(1.5, max(group_a.max(), group_b.max()), f't-stat: {t_stat:.2f}\np-value: {p_value:.3f}', 
    horizontalalignment='center', verticalalignment='top', fontsize=10, bbox=dict(facecolor='white', alpha=0.5))

### Who is current biggest movie star in the world?

In [54]:
def topNColumns(row, n):
    """Return the labels of the `n` largest entries of the Series `row`, largest first."""
    largest = row.nlargest(n)
    return list(largest.index)

def inTopN(df, n):
    """
    Return the columns of `df` that rank among the `n` largest values of at least one row.

    The result is a slice of `df.columns`, so the original column order is kept.
    """
    values = df.to_numpy()

    # argsort is ascending, so the last n positions of each row are its n largest entries
    winners = np.argsort(values, axis=1)[:, -n:]

    # flag every column position that shows up in any row's winners
    mask = np.zeros(df.shape[1], dtype=bool)
    mask[np.unique(winners)] = True

    return df.columns[mask]

def prepDfForHorseRace(
        df: pd.DataFrame = pd.DataFrame(),
        yearRoll: int = 10,
        targetCol: str = 'contribution',
        department: str = 'Acting',
        topN: int = 10,
        cutoffDate: int = 1940
):
    """
    Build the monthly, exponentially smoothed per-person frame used by the horse-race animation.

    Pipeline:
      1. Keep films that have credits in the global `creditsDf`, one row per imdb_id,
         released after 1 Jan of `cutoffDate`, with positive revenue and budget.
      2. Attach release_date / revenue / budget to every credit row and keep people whose
         `known_for_department` equals `department`.
      3. Derive contribution (revenue / (cast_id + 1)), RBR (revenue / budget) and
         starPower (RBR * contribution).
      4. Sum `targetCol` per (release year, person id), pivot to year x person, fill gaps
         with 0, resample to month end with 2nd-order polynomial interpolation and clip
         negatives to 0.
      5. Apply an exponentially weighted mean with span `yearRoll * 12`, drop the last
         row, and keep the people who are among the `topN` values of at least one row.

    Reads the global `creditsDf`; it is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        TMDB frame with imdb_id, release_date, revenue and budget columns.
    yearRoll : int
        Smoothing horizon in years (converted to a span in months).
    targetCol : str
        Column to aggregate, e.g. 'contribution', 'RBR' or 'starPower'.
    department : str
        Value of `known_for_department` to keep.
    topN : int
        A person is kept if they are among the top `topN` of any row.
    cutoffDate : int
        Year; only films released after 1 January of that year are used.

    Returns
    -------
    pd.DataFrame
        Month-end DatetimeIndex x person id, holding the smoothed `targetCol`.

    Raises
    ------
    KeyError
        If `targetCol` is not a column of the merged credits frame.
    """
    def _resample(frame, alias, legacyAlias):
        # Newer pandas spells the year-end / month-end aliases 'YE' / 'ME'; older
        # releases only know 'Y' / 'M'. Try the current name first, then fall back.
        try:
            return frame.resample(alias)
        except ValueError:
            return frame.resample(legacyAlias)

    timeDf = df.copy(deep=True)

    print('PREPPING TIMEDF')
    timeDf = timeDf[timeDf['imdb_id'].isin(creditsDf['imdbId'].unique())]
    timeDf = timeDf.drop_duplicates(subset=['imdb_id'], keep='last')
    timeDf['release_date'] = pd.to_datetime(timeDf['release_date'])
    timeDf = timeDf[timeDf['release_date'] > pd.to_datetime(str(cutoffDate), format='%Y')]
    timeDf = timeDf[(timeDf['revenue'] > 0) & (timeDf['budget'] > 0)]
    tmdbDict = (
        timeDf[['imdb_id','release_date','revenue','budget']]
        .set_index('imdb_id')
        .to_dict('index')
    )

    print('MERGING IN CREDITS')
    # Map onto a temporary Series instead of writing a helper column into the shared creditsDf.
    expDf = pd.json_normalize(creditsDf['imdbId'].map(tmdbDict))
    # errors='ignore': a 'tmdbInfo' helper column may exist if another cell added it.
    resDf = creditsDf.drop(columns=['tmdbInfo'], errors='ignore').join(expDf)
    resDf = resDf[resDf['known_for_department'] == department].copy()

    print('CALCULATING COLS')
    # Define good target cols
    # NOTE(review): cast_id is treated as a 0-based billing position - confirm.
    resDf['contribution'] = resDf['revenue'] / (resDf['cast_id'] + 1) # indexing starts at 0
    resDf['RBR'] = resDf['revenue'] / resDf['budget'] # revenue-to-budget ratio

    resDf['starPower'] = resDf['RBR'] * resDf['contribution']

    if targetCol not in resDf.columns:
        raise KeyError(f"targetCol '{targetCol}' is not a column of the merged credits frame")

    # Credits whose film did not survive the filters carry no release_date.
    resDf = resDf.dropna(subset=['release_date'])
    resDf['year'] = pd.to_datetime(resDf['release_date']).dt.year

    print('GROUPING COLS')
    gDf = pd.DataFrame(resDf.groupby(['year','id'])[targetCol].sum())
    gDf = gDf.reset_index(drop=False)

    print('PIVOT!')
    pivDf = gDf.pivot(index='year', columns='id', values=targetCol).fillna(0)
    pivDf.index = pd.to_datetime(pivDf.index.astype(str), format='%Y')

    # Fill in missing years with 0
    resampledDf = _resample(pivDf, 'YE', 'Y').sum().fillna(0)

    # Resample to monthly frequency and apply polynomial interpolation
    resampledDf = _resample(resampledDf, 'ME', 'M').interpolate(method='polynomial', order=2)
    # The polynomial can undershoot below zero between points; clamp it.
    resampledDf[resampledDf < 0] = 0

    pivRollDf = resampledDf.ewm(span=(yearRoll*12), adjust=False).mean()
    pivRollDf = pivRollDf.dropna(how='all')

    print(f"NUMBER OF ROWS: {len(pivRollDf)}")

    # Drop the final row (the original note says this removes future years - TODO confirm),
    # then keep only people who were in the top `topN` of at least one remaining row.
    pivRollDf = pivRollDf.iloc[:-1]
    keepCols = inTopN(pivRollDf, topN)
    resampledRollDfFiltered = pivRollDf[list(keepCols)]

    return resampledRollDfFiltered

In [55]:
# Decade watermark windows for the animation: label -> [start, end].
# createHorseRace shows a label when start < date <= end. The frame dates are month
# ends, so each window runs from 31 Dec of the previous decade's last year to 31 Dec
# of this decade's last year (e.g. all of 1929 is "1920's", January 1930 is "1930's").
# NOTE(review): this re-uses the name `annotations`, which earlier cells use for the
# outlier id lists - re-run those cells before going back to the scatter plots.
annotations = {
    f"{decade}'s": [
        pd.to_datetime(f'{decade - 1}-12-31', format='%Y-%m-%d'),
        pd.to_datetime(f'{decade + 9}-12-31', format='%Y-%m-%d'),
    ]
    for decade in range(1910, 2030, 10)
}

def createHorseRace(
        df: pd.DataFrame = pd.DataFrame(),
        figSize: tuple = (12,6),
        annotations: dict = {},
        windowSize: int = 72,
        fps: int = 17,
        dpi: int = 300, 
        xLabel: str = "Date",
        yLabel: str = "Box Office Power",
        filename: str = '',
        saveImagePath: str = '',
        yearRoll: int = 10
    ):
    """
    Render a "horse race" animation of `df` and save it as `<saveImagePath>/<filename>.mp4`.

    Each frame plots a sliding window of rows ending at the current row and places the
    headshots of the 10 highest-valued people of that row at the right edge of their
    lines. The last frame is held for 10 seconds.

    Parameters
    ----------
    df : pd.DataFrame
        DatetimeIndex x integer person ids (one line per column).
    figSize : tuple
        Figure size passed to matplotlib.
    annotations : dict
        Optional {label: [start, end]} of pd.Timestamp; the label whose window satisfies
        start < current date <= end is drawn as a large watermark.
    windowSize : int
        Nominal window length in rows; 90% of it is shown behind the current row.
    fps, dpi : int
        Frame rate and resolution of the saved video.
    xLabel, yLabel : str
        Axis labels.
    filename : str
        Output file name without extension (required).
    saveImagePath : str
        Output directory (required).
    yearRoll : int
        Only used in the title text.

    Raises
    ------
    AssertionError
        If a required argument is missing or `df` / `annotations` have the wrong shape.

    Reads the global `headers` when fetching missing headshots.
    """
    from matplotlib.font_manager import FontProperties

    # Assertions
    assert filename != '', 'ERROR: SPECIFY A FILE NAME!'
    assert saveImagePath != '', 'ERROR: SPECIFY YOUR GLOBAL IMAGE PATH!'
    assert all([isinstance(c, (int, np.integer)) for c in df.columns]), 'ERROR: COLUMNS OF DF ARE NOT OF CORRECT TYPE!'
    assert isinstance(df.index, pd.DatetimeIndex), 'ERROR: INDEX OF DF IS NOT A DATETIME INDEX!'
    assert len(df) > 0, 'ERROR: DF HAS NO ROWS TO ANIMATE!'
    if annotations != {}:
        assert all(isinstance(v, list) and all(isinstance(date, pd.Timestamp) for date in v) for v in annotations.values()), 'ERROR: ANNOTATIONS VALUES ARE NOT LISTS OF pd.Timestamp!'

    fontProp = FontProperties(fname='../assets/fonts/Lora/Lora-VariableFont_wght.ttf')

    # We'll make sure that we have everyone's faces
    for personId in tqdm(df.columns):
        try:
            headshotDir = f'../data/headshots/{personId}'
            # Already downloaded and cropped: nothing to do for this person.
            if os.path.isdir(headshotDir) and '00_faceExtracted.png' in os.listdir(headshotDir):
                continue

            getHeadshot(personId, headers=headers)

            filePath = f'{headshotDir}/{personId}_0.jpg'

            if os.path.exists(filePath):
                extractFaces(filePath, makePretty=True)
        except Exception as e:
            # Best effort: a person without a headshot is simply drawn without a face.
            print(exceptionOutput(e))

    fig, ax = plt.subplots(figsize=figSize)

    # Frame-invariant values are computed once, not on every frame.
    index = list(df.index)
    watermarkColor = loadPalette()['canvas_dark']
    faceCache = {}  # personId -> resized PIL image, or None when no usable file exists

    def _loadFace(personId):
        # Load and resize a headshot once; later frames re-use the cached image.
        if personId not in faceCache:
            try:
                with Image.open(f'../data/headshots/{personId}/00_faceExtracted.png') as rawImg:
                    faceCache[personId] = rawImg.resize((80, 80), Image.LANCZOS)  # high-quality interpolation
            except OSError:
                faceCache[personId] = None
        return faceCache[personId]

    def update(i):
        ax.clear()

        # Set window boundaries (upper bound is exclusive, so row i is always included)
        curMinRef = max(0, int(i-(windowSize*.9)))
        curMaxRef = min(i+1, len(df))

        # Use boundaries to define plotted df
        plotDf = df.iloc[curMinRef:curMaxRef]
        ax.plot(plotDf, zorder=2)

        lastDate = plotDf.index.max()
        lastRow = plotDf.iloc[-1]

        # Draw faces :) 
        visibleCols = topNColumns(lastRow, 10)
        for personId in reversed(visibleCols):
            face = _loadFace(personId)
            if face is None:
                continue
            imagebox = OffsetImage(face, zoom=0.5)  # Adjust zoom to fit the resized image
            ax.add_artist(AnnotationBbox(imagebox, (index[i], lastRow[personId]), frameon=False))

        # Decade watermark: first label whose window contains the current date, if any.
        decadeLabel = next((k for k, v in annotations.items() if (lastDate > v[0]) and (lastDate <= v[1])), None)
        if decadeLabel is not None:
            ax.text(0.5, 0.5, decadeLabel, transform=ax.transAxes, fontsize=170, verticalalignment='center', horizontalalignment='center', alpha=0.25, color = watermarkColor, fontproperties=fontProp, zorder=1)

        ax.set_title(f'{yearRoll} Year Weighted Rolling Average (EWM)', fontsize=20)
        ax.set_xlabel(xLabel, fontsize=14)
        ax.set_ylabel(yLabel, fontsize=14)
        ax.set_xlim(index[curMinRef], index[min(curMaxRef + 20, len(index)-1)])
        # Floor follows the lowest of the visible top-10 values, whether or not their
        # headshots loaded (the previous min over loaded faces failed when none loaded).
        ax.set_ylim(lastRow[visibleCols].min()*.85, np.max(plotDf.values)*1.15)

    # Number of frames to hold the last frame (10 seconds at `fps`)
    hold_frames = 10 * fps

    # Create the animation
    ani = FuncAnimation(fig, update, frames=list(range(len(df))) + [len(df)-1]*hold_frames)

    plt.tight_layout(pad=2.0, w_pad=0.5, h_pad=1.0)

    # Save the animation to an mp4 file
    ani.save(os.path.join(saveImagePath, f'{filename}.mp4'), writer='ffmpeg', fps=fps, dpi=dpi)

    # Release the figure so repeated renders do not accumulate open figures.
    plt.close(fig)

### Raw Horserace

In [None]:
# NOTE: rendering this animation takes a long time.
allActingDf = prepDfForHorseRace(df = tmdbDfRaw)
print(f"WE'RE KEEPING {len(allActingDf.columns)} COLS")

createHorseRace(
    df = allActingDf,
    annotations = annotations,
    filename = 'actorHorseRaceBoxOfficePowerRaw',
    saveImagePath=saveImagePath,
    fps=24,
    dpi=300
)

### Remove IP

In [None]:
# NOTE: rendering this animation takes a long time.
nonIPDf = prepDfForHorseRace(df = standaloneDf, topN=5)
print(f"WE'RE KEEPING {len(nonIPDf.columns)} COLS")

# Persist the person ids that made it into the non-franchise race.
with open('../assets/nonIPIds.json', 'w') as f:
    json.dump(list(nonIPDf.columns), f)

createHorseRace(
    df = nonIPDf,
    annotations = annotations,
    filename = 'actorHorseRaceBoxOfficePowerNonIP',
    saveImagePath=saveImagePath,
    fps=24
)

### Use RBR (revenue-to-budget ratio)

In [None]:
# Interactive guard: this render is slow, so ask before starting it.
confirmation = input("Are you sure you want to proceed (This takes a longggg time)? (yes/no): ")

if confirmation.strip().lower() == 'yes':
    # Eliminate micro budget movies
    tmdbDfFiltered = tmdbDfRaw[tmdbDfRaw['budget'] > 5_000_000]
    tmdbDfFiltered = tmdbDfFiltered[tmdbDfFiltered['revenue'] > 5_000_000]

    # Own variable name: this frame comes from the full catalogue, so it must not
    # overwrite `nonIPDf` from the previous cell.
    starPowerDf = prepDfForHorseRace(df = tmdbDfFiltered, targetCol = 'starPower')
    print(f"WE'RE KEEPING {len(starPowerDf.columns)} COLS")

    createHorseRace(
        df = starPowerDf,
        annotations = annotations,
        filename = 'actorHorseRaceStarPower',
        saveImagePath=saveImagePath,
        fps=32,
        yLabel = 'Star Power'
    )
else:
    print("PHEW! ON TO THE NEXT")