# Determinants of Box Office Success

## Imports

In [1]:
import pandas as pd
import numpy as np
from functools import reduce
from scipy.stats import iqr

# visualization tools
%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns

In [2]:
movies = pd.read_csv('movie_data.csv')
#print(movies.columns)
#print(movies.info())
#movies

**Numeric columns:** num_critic_reviews, duration, director_facebook_likes, actor_1_facebook_likes, actor_2_facebook_likes, actor_3_facebook_likes, movie_facebook_likes, gross, num_users_voted (int), cast_total_facebook_likes  (int), num_user_reviews, budget, title_year , movie_score, aspect_ratio (int)                                                          

## Explore and Clean Dataset

In [3]:
# remove missing values
movies.dropna(inplace=True)

# remove duplicate rows based on the 'movie_title' column
movies.drop_duplicates(subset=['movie_title'], keep=False, inplace=True)

# reset index
movies.reset_index(drop=True, inplace=True)

In [4]:
# transform budget and gross metrics
movies['budget_millions'] = round(movies['budget'] / 1000000, 2)
movies['gross_millions'] = round(movies['gross'] / 1000000, 2)

# add calcuated columns
movies['revenue_millions'] = movies['gross_millions'] - movies['budget_millions']
movies['roi'] = (movies['revenue_millions'] / movies['budget_millions']) * 100
movies['profit_margin'] = movies['revenue_millions'] / movies['gross_millions']

### Remove irrelevant movies

In [5]:
# remove all movies from before the year 1996 in order to focus on the most recent data points
movies = movies[movies['title_year']>=1996]

#keeping only the English speaking movies
movies = movies[movies['language'] == 'English']

movies.reset_index(drop=True, inplace=True)

## Explore Numeric Variables

In [6]:
movies.sort_values(by=['director_facebook_likes'], ascending=False).head()

Unnamed: 0,color,director_name,num_critic_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,title_year,actor_2_facebook_likes,movie_score,aspect_ratio,movie_facebook_likes,budget_millions,gross_millions,revenue_millions,roi,profit_margin
2501,Color,Joseph Gordon-Levitt,364.0,90.0,23000.0,694.0,Scarlett Johansson,23000.0,24475193.0,Comedy|Drama|Romance,...,2013.0,19000.0,6.6,2.35,33000,3.0,24.48,21.48,716.0,0.877451
1021,Color,Christopher Nolan,341.0,130.0,22000.0,19000.0,Hugh Jackman,23000.0,53082743.0,Drama|Mystery|Sci-Fi|Thriller,...,2006.0,20000.0,8.5,2.35,49000,40.0,53.08,13.08,32.7,0.24642
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2012.0,23000.0,8.5,2.35,164000,250.0,448.13,198.13,79.252,0.442126
55,Color,Christopher Nolan,645.0,152.0,22000.0,11000.0,Heath Ledger,23000.0,533316061.0,Action|Crime|Drama|Thriller,...,2008.0,13000.0,9.0,2.35,37000,185.0,533.32,348.32,188.281081,0.653116
83,Color,Christopher Nolan,642.0,148.0,22000.0,23000.0,Tom Hardy,29000.0,292568851.0,Action|Adventure|Sci-Fi|Thriller,...,2010.0,27000.0,8.8,2.35,175000,160.0,292.57,132.57,82.85625,0.453122


In [75]:
# separate numerical fields from the categorical
movies_numeric = movies.select_dtypes(include=['float64', 'int64'])

In [76]:
# define movies trimmed to hold trimmed variables
movies_trimmed = movies

movies.sort_values(by=['director_facebook_likes'], ascending=False).head()

Unnamed: 0,color,director_name,num_critic_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,genre_documentary,genre_drama,genre_family,genre_fantasy,genre_music,genre_mystery,genre_thriller,genre_horror,genre_romance,genre_sci-fi
351,Color,Tony Scott,81.0,140.0,527.0,619.0,Jake Busey,10000.0,111544445.0,Action|Crime|Drama|Mystery|Thriller,...,not documentary,drama,not family,not fantasy,not music,mystery,thriller,not horror,not romance,not sci-fi
2005,Color,Albert Brooks,97.0,97.0,527.0,745.0,Bradley Whitford,12000.0,11614236.0,Comedy,...,not documentary,not drama,not family,not fantasy,not music,not mystery,not thriller,not horror,not romance,not sci-fi
1969,Color,Denzel Washington,112.0,126.0,527.0,697.0,Jurnee Smollett-Bell,18000.0,30226144.0,Biography|Drama,...,not documentary,drama,not family,not fantasy,not music,not mystery,not thriller,not horror,not romance,not sci-fi
372,Color,David Fincher,377.0,152.5,527.0,495.0,Jake Gyllenhaal,21000.0,33048353.0,Crime|Drama|History|Mystery|Thriller,...,not documentary,drama,not family,not fantasy,not music,mystery,thriller,not horror,not romance,not sci-fi
1981,Color,Woody Allen,278.0,119.0,527.0,400.0,Mark Gatiss,19000.0,23089926.0,Drama|Romance|Thriller,...,not documentary,drama,not family,not fantasy,not music,not mystery,thriller,not horror,romance,not sci-fi


In [77]:
# replace outliers with upper whisker / lower whisker value
for column in movies_numeric:
    q1 = movies_trimmed[column].quantile(0.25)
    q3 = movies_trimmed[column].quantile(0.75)
    iqr = q3 - q1
    upper_whisker = q3 + (iqr*1.5)
    lower_whisker = q1 - (iqr*1.5)
    movies_trimmed.loc[movies_trimmed[column] > upper_whisker, column] = upper_whisker
    movies_trimmed.loc[movies_trimmed[column] < lower_whisker, column] = lower_whisker

In [78]:
#movies.sort_values(by=['director_facebook_likes'], ascending=False).head()

In [29]:
#movies_trimmed.sort_values(by=['director_facebook_likes'], ascending=False).head()

In [30]:
#boxplot = movies_trimmed.boxplot(column=['duration'])

In [31]:
#boxplot = movies_trimmed.boxplot(column=['num_critic_reviews'])

In [32]:
#boxplot = movies_trimmed.boxplot(column=['director_facebook_likes'])

In [33]:
#boxplot = movies_trimmed.boxplot(column=['actor_1_facebook_likes'])

In [34]:
#boxplot = movies_trimmed.boxplot(column=['actor_2_facebook_likes'])

In [35]:
#boxplot = movies_trimmed.boxplot(column=['actor_3_facebook_likes'])

In [36]:
#boxplot = movies_trimmed.boxplot(column=['movie_facebook_likes'])

In [37]:
#boxplot = movies_trimmed.boxplot(column=['cast_total_facebook_likes'])

In [38]:
#boxplot = movies_trimmed.boxplot(column=['gross_millions'])

In [39]:
#boxplot = movies_trimmed.boxplot(column=['budget_millions'])

In [40]:
#boxplot = movies_trimmed.boxplot(column=['movie_score'])

## Success Metrics
Which of the metrics that we have access to demonstrate the success of a film?

Potential measures of success:
1. num_critic_review
2. gross
3. num_user_reviews
4. movie_score

## Explore Relationships in the Data

In [23]:
#sns.scatterplot(data = movies_gross_filter,x ='movie_score',y='gross_millions')

In [24]:
#sns.scatterplot(data = movies_gross_filter,x ='num_critic_reviews',y='gross_millions')

In [25]:
corr_matrix = movies.corr()
corr_matrix.to_csv('corr_matrix.csv')

corr_matrix.head(5)

Unnamed: 0,num_critic_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_users_voted,cast_total_facebook_likes,num_user_reviews,budget,title_year,actor_2_facebook_likes,movie_score,aspect_ratio,movie_facebook_likes,budget_millions,gross_millions,revenue_millions,roi,profit_margin
num_critic_reviews,1.0,0.342007,0.26054,0.203297,0.306828,0.468421,0.731484,0.32025,0.641283,0.39029,0.533642,0.255204,0.438234,0.24423,0.711135,0.39029,0.468421,0.239848,0.207001,0.27274
duration,0.342007,1.0,0.279712,0.18084,0.251282,0.284007,0.407486,0.263182,0.430204,0.359339,0.028947,0.210649,0.405586,0.274454,0.265055,0.359339,0.284006,0.03403,-0.038326,0.031714
director_facebook_likes,0.26054,0.279712,1.0,0.080624,0.169594,0.159314,0.275338,0.164894,0.260489,0.191359,0.012941,0.105992,0.24351,0.116658,0.180845,0.191359,0.159314,0.018927,0.012053,0.048956
actor_3_facebook_likes,0.203297,0.18084,0.080624,1.0,0.457013,0.299079,0.260407,0.614249,0.198525,0.285447,0.075708,0.764108,0.05081,0.085667,0.181419,0.285447,0.299077,0.139457,0.03857,0.134213
actor_1_facebook_likes,0.306828,0.251282,0.169594,0.457013,1.0,0.234177,0.356501,0.949536,0.251123,0.270241,0.126885,0.57536,0.220865,0.162111,0.251933,0.270241,0.234178,0.051224,0.002142,0.077901


In [41]:
corr_matrix_trimmed = movies_trimmed.corr()
corr_matrix_trimmed.to_csv('corr_matrix_trimmed.csv')

corr_matrix_trimmed

Unnamed: 0,num_critic_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_users_voted,cast_total_facebook_likes,num_user_reviews,budget,title_year,actor_2_facebook_likes,movie_score,aspect_ratio,movie_facebook_likes,budget_millions,gross_millions,revenue_millions,roi,profit_margin
num_critic_reviews,1.0,0.342007,0.26054,0.203297,0.306828,0.468421,0.731484,0.32025,0.641283,0.39029,0.533642,0.255204,0.438234,0.24423,0.711135,0.39029,0.468421,0.239848,0.207001,0.27274
duration,0.342007,1.0,0.279712,0.18084,0.251282,0.284007,0.407486,0.263182,0.430204,0.359339,0.028947,0.210649,0.405586,0.274454,0.265055,0.359339,0.284006,0.03403,-0.038326,0.031714
director_facebook_likes,0.26054,0.279712,1.0,0.080624,0.169594,0.159314,0.275338,0.164894,0.260489,0.191359,0.012941,0.105992,0.24351,0.116658,0.180845,0.191359,0.159314,0.018927,0.012053,0.048956
actor_3_facebook_likes,0.203297,0.18084,0.080624,1.0,0.457013,0.299079,0.260407,0.614249,0.198525,0.285447,0.075708,0.764108,0.05081,0.085667,0.181419,0.285447,0.299077,0.139457,0.03857,0.134213
actor_1_facebook_likes,0.306828,0.251282,0.169594,0.457013,1.0,0.234177,0.356501,0.949536,0.251123,0.270241,0.126885,0.57536,0.220865,0.162111,0.251933,0.270241,0.234178,0.051224,0.002142,0.077901
gross,0.468421,0.284007,0.159314,0.299079,0.234177,1.0,0.678198,0.275303,0.548891,0.698069,0.106414,0.304119,0.217524,0.117558,0.314723,0.698069,1.0,0.646828,0.425282,0.523856
num_users_voted,0.731484,0.407486,0.275338,0.260407,0.356501,0.678198,1.0,0.37288,0.812196,0.502887,0.147753,0.313899,0.528849,0.178639,0.539092,0.502887,0.678197,0.427372,0.299406,0.369902
cast_total_facebook_likes,0.32025,0.263182,0.164894,0.614249,0.949536,0.275303,0.37288,1.0,0.265089,0.302708,0.137308,0.728906,0.208624,0.154215,0.27054,0.302708,0.275303,0.080909,0.012549,0.09451
num_user_reviews,0.641283,0.430204,0.260489,0.198525,0.251123,0.548891,0.812196,0.265089,1.0,0.435769,-0.028604,0.229037,0.398325,0.170865,0.360015,0.435769,0.548891,0.315838,0.260703,0.333234
budget,0.39029,0.359339,0.191359,0.285447,0.270241,0.698069,0.502887,0.302708,0.435769,1.0,0.089184,0.314703,0.079945,0.226168,0.226584,1.0,0.698068,-0.039633,-0.152278,0.096573


## Analyze director success

Do certain director's tend to make more successful movies?

In [42]:
#group by director; average gross income
directors_gross = movies[['director_name','gross']].groupby(['director_name']).mean().sort_values(by='gross', ascending=False)
directors_gross['gross_millions'] = round(directors_gross['gross'] / 1000000, 2)

#directors_gross.head(10)

In [43]:
#group by director; count number of movies
directors_count = movies[['director_name','color']].groupby(['director_name']).count().sort_values(by='color',ascending = False)
directors_count = directors_count.rename(columns = {'color': 'num_movies'})

#directors_count

In [44]:
#group by director; average movie score
directors_score = movies[['director_name','movie_score']].groupby(['director_name']).mean().sort_values(by='movie_score', ascending = False)

#directors_score.head(10)

In [45]:
#group by director; average budget
directors_budget = movies[['director_name','budget']].groupby(['director_name']).mean().sort_values(by='budget', ascending = False)
directors_budget['budget_millions'] = round(directors_budget['budget'] / 1000000, 2)

#directors_score.head(10)

In [46]:
#group by director; average facebook likes
director_fblikes = movies[['director_name','director_facebook_likes']].groupby(['director_name']).mean().sort_values(by='director_facebook_likes', ascending = False)

#director_fblikes.head(10)

In [47]:
#group by director; average movie facebook likes
movie_fblikes = movies[['director_name','movie_facebook_likes']].groupby(['director_name']).mean().sort_values(by='movie_facebook_likes', ascending = False)

#movie_fblikes.head(10)

In [48]:
# merge director data into one df
directors_df_lst = [directors_gross, directors_budget, directors_count, directors_score, director_fblikes, movie_fblikes]
directors_df = reduce(lambda left,right: pd.merge(left,right,on='director_name'), directors_df_lst)
directors_df = directors_df.drop(columns=['gross', 'budget'])
directors_df = directors_df.rename(columns = {'gross_millions': 'avg_gross_millions',
                                             'budget_millions': 'avg_budget_million',
                                             'movie_score': 'avg_movie_score',
                                             'director_facebook_likes':'avg_director_fb_likes',
                                             'movie_facebook_likes':'avg_movie_fb_likes'})

directors_df['avg_movie_score'] = round(directors_df['avg_movie_score'],2)
directors_df['avg_movie_fb_likes'] = round(directors_df['avg_movie_fb_likes'],2)
directors_df.to_csv('directors_df.csv')

# filter df to focus on most successful directors
directors_df_filter = directors_df[directors_df['avg_gross_millions']>=100]

In [50]:
# scatterplot to show relationship between number of movies directed and avg. gross per movie

#sns.set(rc={'figure.figsize':(8,12)})
#sns.set(font_scale=2)
#sns.scatterplot(data=directors_df, x="avg_gross_millions", y="num_movies", s = 70)
#plt.savefig('director_scatter_nummovies_avggross.png')

In [None]:
# top grossing directors
directors_df_filter.head(10)

In [None]:
# directors with top average movie scores
directors_df_filter.sort_values(by = 'avg_movie_score', ascending = False).head(10)

In [None]:
# directors who directed the highest number of movies
directors_df_filter.sort_values(by = 'num_movies', ascending = False).head(10)

## Analzye Genre Success

In [53]:
movies['genres'].value_counts()

Comedy                                 124
Comedy|Drama|Romance                   123
Comedy|Romance                         122
Comedy|Drama                           119
Drama                                  107
                                      ... 
Comedy|Crime|Musical|Mystery             1
Drama|Fantasy|Music|Romance              1
Adventure|Mystery|Thriller               1
Adventure|Comedy|Romance                 1
Adventure|Animation|Family|Thriller      1
Name: genres, Length: 632, dtype: int64

In [None]:
# create genre dfs
genre_action = movies[movies['genres'].str.contains('Action')]
genre_adventure = movies[movies['genres'].str.contains('Adventure')]
genre_fantasy = movies[movies['genres'].str.contains('Fantasy')]
genre_thriller = movies[movies['genres'].str.contains('Thriller')]
genre_scifi = movies[movies['genres'].str.contains('Sci-Fi')]
genre_crime = movies[movies['genres'].str.contains('Crime')]
genre_comedy = movies[movies['genres'].str.contains('Comedy')]
genre_drama = movies[movies['genres'].str.contains('Drama')]
genre_doc = movies[movies['genres'].str.contains('Documentary')]
genre_romance = movies[movies['genres'].str.contains('Romance')]
genre_horror = movies[movies['genres'].str.contains('Horror')]

In [61]:
# add binary genre variable to movies df
genre_lst = ['Action','Adventure','Animation','Comedy','Crime','Documentary','Drama','Family','Fantasy','Music','Mystery','Thriller','Horror','Romance','Sci-Fi']

# create column for each genre in genre_lst
for x in genre_lst:
    movies['genre_' + x.lower()] = ""

# populate columns with binary field
for x in range(len(movies['genres'])):
    for y in genre_lst:
        if y in movies['genres'][x]:
            movies['genre_' + y.lower()][x] = y.lower()
        else:
            movies['genre_' + y.lower()][x] = 'not ' + y.lower()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


### Create df to analyzing average outcome by genre

In [62]:
genre_avg = pd.DataFrame()
for x in genre_lst:
    x = movies.groupby(['genre_' + x.lower()]).mean()
    x = x.rename_axis('genre', axis= 0)
    genre_avg = genre_avg.append(x)

genre_avg = genre_avg[genre_avg.index.str.contains("not")==False]
genre_avg.to_csv('genre_avg.csv')

#### barplot - average gross by genre

In [69]:
#genre_avg_gross = sns.barplot(x= genre_avg.index, y="gross_millions", data=genre_avg)
#genre_avg_gross.set_xticklabels(genre_avg_gross.get_xticklabels(),rotation=45)
#sns.set(rc={'figure.figsize':(20,20)})
#sns.set(font_scale=2)
#plt.savefig('genre_avg_gross.png')

#### barplot - average budget by genre

In [70]:
#genre_avg_budget = sns.barplot(x= genre_avg.index, y="budget_millions", data=genre_avg)
#genre_avg_budget.set_xticklabels(genre_avg_budget.get_xticklabels(),rotation=45)

#### barplot - average revenue by genre

In [71]:
#genre_avg_revenue = sns.barplot(x= genre_avg.index, y="revenue_millions", data=genre_avg)
#genre_avg_revenue.set_xticklabels(genre_avg_revenue.get_xticklabels(),rotation=45)
#sns.set(rc={'figure.figsize':(20,15)})
#sns.set(font_scale=2)
#plt.savefig('genre_avg_revenue.png')

### Create df to analyzing total outcome (sum) by genre

In [65]:
genre_sum = pd.DataFrame()
for x in genre_lst:
    x = movies.groupby(['genre_' + x.lower()]).sum()
    x = x.rename_axis('genre', axis= 0)
    genre_sum = genre_sum.append(x)

genre_sum = genre_sum[genre_sum.index.str.contains("not")==False]
#genre_sum.to_csv('genre_sum.csv')

#### barplot - total gross by genre

In [72]:
#genre_sum_gross = sns.barplot(x= genre_sum.index, y="gross_millions", data=genre_sum)
#genre_sum_gross.set_xticklabels(genre_sum_gross.get_xticklabels(),rotation=45)
#sns.set(rc={'figure.figsize':(20,15)})
#sns.set(font_scale=2)
#plt.savefig('genre_sum_gross.png')

#### barplot - total budget by genre

In [73]:
#genre_sum_budget = sns.barplot(x= genre_sum.index, y="budget_millions", data=genre_sum)
#genre_sum_budget.set_xticklabels(genre_sum_budget.get_xticklabels(),rotation=45)

#### barplot - total revenue by genre

In [74]:
#genre_sum_revenue = sns.barplot(x= genre_sum.index, y="revenue_millions", data=genre_sum)
#genre_sum_revenue.set_xticklabels(genre_sum_revenue.get_xticklabels(),rotation=45)
#sns.set(rc={'figure.figsize':(20,15)})
#sns.set(font_scale=2)
#plt.savefig('genre_sum_revenue.png')