In [48]:
import json
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from collections import defaultdict
import seaborn as sns
import csv
import sqlite3
import pandasql

In [65]:
#Final dataframe init
final_df = pd.read_csv('../data/final.csv')

In [66]:
#Fill null values in Director(primary_name) and Genre with 'Unknown'
final_df.loc[:, ['primary_name', 'genre']] = final_df.loc[:, ['primary_name', 'genre']].fillna(value = 'Unknown', axis = 1)

#Creating new column for Director Experience (# of movies in curated df)

dir_counts = final_df['primary_name'].value_counts()
final_df['director_experience'] = final_df['primary_name'].map(dir_counts)

#Getting rid of 'Unknown' values

unk_dir_mask = final_df['primary_name'].isin(['Unknown'])
final_df = final_df.loc[~unk_dir_mask]

#Curating df for director analysis

dir_df = final_df[['primary_name', 'director_experience', 'title', 'production_budget', 'genre', 'roi']].copy()
dir_df.head()

Unnamed: 0,primary_name,director_experience,title,production_budget,genre,roi
0,Lawrence Kasanoff,1,Foodfight!,45000000.0,Family Animation Comedy Action,-0.998362
1,Simon McQuoid,1,Mortal Kombat,20000000.0,Fantasy Adventure Action,5.106661
2,Jed I. Goodman,1,The Overnight,200000.0,Mystery Comedy,4.82998
3,Walter Salles,1,On the Road,25000000.0,Romance Drama Adventure,-0.627468
4,Ben Stiller,2,The Secret Life of Walter Mitty,91000000.0,Fantasy Comedy Drama Adventure,1.064409


In [67]:
#Initial Director Experience filter
experience_filter = [1, 2, 3]

exp_dir_mask = dir_df['director_experience'].isin(experience_filter)

experienced_dir_df = dir_df.loc[~exp_dir_mask]

experienced_dir_df.head()

Unnamed: 0,primary_name,director_experience,title,production_budget,genre,roi
13,Shawn Levy,4,Real Steel,110000000.0,Drama Family Sci-Fi Action,1.398912
17,Steven Spielberg,8,Lincoln,65000000.0,Biography Drama History,3.205327
26,Antoine Fuqua,5,The Equalizer,55000000.0,Crime Thriller Action,2.507339
33,Simon West,5,The Mechanic,42500000.0,Crime Thriller Action,0.796409
40,Martin Scorsese,4,Silence,46500000.0,History Drama Adventure,-0.48975


In [68]:
#Getting rid of duplicate values

no_dup_exp_dir_df = experienced_dir_df.drop_duplicates(
  subset = ['primary_name', 'title'],
  keep = 'last').reset_index(drop = True)

In [69]:
#Fixing Director Experience
dir_counts = no_dup_exp_dir_df['primary_name'].value_counts()

no_dup_exp_dir_df['director_experience'] = no_dup_exp_dir_df['primary_name'].map(dir_counts)

In [70]:
#Curating Experience Filter to include directors with 3 titles in df

experience_filter = [1, 2]

exp_dir_mask = no_dup_exp_dir_df['director_experience'].isin(experience_filter)

no_dup_exp_dir_df_actual = no_dup_exp_dir_df.loc[~exp_dir_mask]

In [71]:
#Creation of 'Golden List' aka 'Top 20 Directors by Avg. ROI'

golden_list = no_dup_exp_dir_df_actual.groupby('primary_name').roi.mean().reset_index().sort_values(by='roi', ascending = False).head(20)

In [72]:
#Creation of alphabetically ordered list of directors for manual calculations

ordered_exp_dir_list = no_dup_exp_dir_df_actual.sort_values(by =['director_experience', 'roi'], ascending = [False, False]).groupby('primary_name')

In [73]:
for key, item in ordered_exp_dir_list:
    print(ordered_exp_dir_list.get_group(key), "\n\n")

      primary_name  director_experience                  title  \
2    Antoine Fuqua                    5          The Equalizer   
92   Antoine Fuqua                    5               Southpaw   
157  Antoine Fuqua                    5        The Equalizer 2   
122  Antoine Fuqua                    5     Olympus Has Fallen   
127  Antoine Fuqua                    5  The Magnificent Seven   

     production_budget                     genre       roi  
2           55000000.0     Crime Thriller Action  2.507339  
92          30000000.0        Sport Drama Action  2.138549  
157         77000000.0     Crime Thriller Action  1.473417  
122         70000000.0           Thriller Action  1.469699  
127         90000000.0  Western Adventure Action  0.805835   


    primary_name  director_experience  \
113  Brad Peyton                    5   
52   Brad Peyton                    5   
119  Brad Peyton                    5   
145  Brad Peyton                    5   
40   Brad Peyton             

In [74]:
#Manual Entry and Analysis of Golden List Top Performing Genres
top_perf_genre = ['Outlier', 'Outlier', 'Thriller-Horror+Mystery / Action', 'Thriller-Horror/Sci-fi', 'Thriller-Horror/Mystery', 'Thriller-Horror+Mystery / Action-Fantasy+Adventure', 'Musical-Drama', 'Outlier', 'Comedy-Romance/Adventure', 'Drama / Romance', 'Adventure-Family+Comedy / Adventure - Crime', 'Comedy-Flex', 'Drama', 'Drama-Action/Comedy', 'Sci-fi-Action/Thriller', 'Outlier', 'Drama-Sport/Crime', 'X-Men / Music', 'Adventure-Family/Drama / Animation', 'Comedy-Romance/Adventure / Animation']
golden_list['top_performing_genre'] = top_perf_genre

In [75]:
#Creation of Hardcoded Avg Budget column

golden_list['average budget'] = [18887500, 66975000, 92875000, 62000000, 8500000, 66000000, 83300000, 79300000, 49666667, 4333333, 109333333, 31375000, 12000000, 20116667, 108500000, 14500000, 31600000, 157000000, 92100000, 39500000]

In [76]:
#Hardcoding values for 'Chance of Positive ROI' Calculation (couldnt pull data from prev. dataframes for some reason)
golden_list['num_pos_mov'] =         [2, 4, 4, 5, 4, 3, 3, 3, 3, 3, 2, 4, 3, 5, 3, 3, 4, 4, 3, 4]
golden_list['director_experience'] = [6, 4, 4, 5, 4, 3, 3, 3, 3, 3, 3, 4, 3, 6, 3, 4, 5, 4, 3, 4]


#Attempt at code for columns

#golden_list['num_pos_mov'] = no_dup_exp_dir_df_actual['primary_name'][['roi'] if ['roi'].index() > 0].sum()
#golden_list['director_experience'] = no_dup_exp_dir_df_actual['director_experience']

In [77]:
#Chance of success based on genre, values from Heath's table
golden_list['genre_values'] = [0.59, 0.59, 0.59, 0.59, 0.59, 0.59, 0.6, 0.68, 0.67, 0.58, 0.76, 0.67, 0.58, 0.58, 0.68, 0.52, 0.58, 0.60, 0.83, 0.83]

In [78]:
#Calculation and Creation of 'Chance of Pos ROI' Column
golden_list['Chance of Pos ROI (%)'] = ((golden_list.genre_values)*((golden_list.num_pos_mov)/(golden_list.director_experience)))*100

In [79]:
# look at directors that match our top genre recommendations
golden_list

Unnamed: 0,primary_name,roi,top_performing_genre,average budget,num_pos_mov,director_experience,genre_values,Chance of Pos ROI (%)
12,James Wan,23.410492,Outlier,18887500,2,6,0.59,19.666667
3,Christopher Landon,11.837767,Outlier,66975000,4,4,0.59,59.0
9,David O. Russell,6.365257,Thriller-Horror+Mystery / Action,92875000,4,4,0.59,59.0
31,Steven Soderbergh,5.914333,Thriller-Horror/Sci-fi,62000000,5,5,0.59,59.0
15,John Madden,5.31537,Thriller-Horror/Mystery,8500000,4,4,0.59,59.0
2,Bryan Singer,5.01838,Thriller-Horror+Mystery / Action-Fantasy+Adven...,66000000,3,3,0.59,59.0
36,Will Gluck,4.701875,Musical-Drama,83300000,3,3,0.6,60.0
22,Nicholas Stoller,4.625088,Outlier,79300000,3,3,0.68,68.0
23,Paul Feig,4.181509,Comedy-Romance/Adventure,49666667,3,3,0.67,67.0
34,Tim Story,3.731528,Drama / Romance,4333333,3,3,0.58,58.0
