In [1]:
#Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
%matplotlib inline

#Turn off scientific notation in Pandas
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Import data
bom_gross = pd.read_csv('Data/Zipped_Data/bom.movie_gross.csv.gz', compression = 'gzip')
imbd_name = pd.read_csv('Data/Zipped_Data/imdb.name.basics.csv.gz', compression = 'gzip')
imbd_basics = pd.read_csv('Data/Zipped_Data/imdb.title.basics.csv.gz', compression = 'gzip')
imbd_crews = pd.read_csv('Data/Zipped_Data/imdb.title.crew.csv.gz', compression = 'gzip')
imbd_principals = pd.read_csv('Data/Zipped_Data/imdb.title.principals.csv.gz', compression = 'gzip')
imbd_ratings = pd.read_csv('Data/Zipped_Data/imdb.title.ratings.csv.gz', compression = 'gzip')
rt_info = pd.read_csv('Data/Zipped_Data/rt.movie_info.tsv.gz', delimiter='\t', compression = 'gzip')
rt_reviews = pd.read_csv('Data/Zipped_Data/rt.reviews.tsv.gz', delimiter='\t', compression = 'gzip', encoding='latin-1')
tmbd = pd.read_csv('Data/Zipped_Data/tmdb.movies.csv.gz', compression = 'gzip')
tn_budget = pd.read_csv('Data/Zipped_Data/tn.movie_budgets.csv.gz', compression = 'gzip')

In [2]:
#Remove punctuation from producton and worldwide. Switch variables from string to float.
tn_budget['production_budget'] = tn_budget['production_budget'].apply(lambda x: x.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')))
tn_budget['worldwide_gross'] = tn_budget['worldwide_gross'].apply(lambda x: x.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')))
tn_budget['production_budget'] = pd.to_numeric(tn_budget.production_budget, downcast = 'float', errors = 'coerce')
tn_budget['worldwide_gross'] = pd.to_numeric(tn_budget.worldwide_gross, downcast = 'float', errors = 'coerce')

In [7]:
#Create column for net profit (worldwide gross minus production budget).
tn_budget['net'] = list(tn_budget.apply(lambda x: x.worldwide_gross - x.production_budget, axis=1))

#Create column for ratio (net profit divided by production budget)
tn_budget['ratio'] = list(tn_budget.apply(lambda x: x.net / x.production_budget, axis=1))

# Rename column 'movie' to 'title'
tn_budget = tn_budget.rename(columns = {'movie': 'title'})

#remove whitespace, potential extra words, punctuation, and case from titles
years = ['\(2010\)', "\(2011\)", 
         "\(2012\)", "\(2013\)", "\(2014\)", 
         "\(2015\)", "\(2016\)", "\(2017\)", "\(2018\)"]
tn_budget.title = tn_budget.title.replace(years, value='', regex=True)
tn_budget.title = tn_budget.title.str.strip()
tn_budget.title = tn_budget.title.apply(lambda x: x.lower())
tn_budget.title = tn_budget.title.apply(lambda x: x.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')))
tn_budget.title = tn_budget.title.replace(['the', 'and'], value='', regex=True)

#Remove month and year from date. Drop redundant columns
tn_budget['year'] = list(tn_budget['release_date'].str[-4:])
tn_trimmed = tn_budget.drop(columns = ['id', 'domestic_gross', 'release_date'])

In [9]:
#Remove 'studio', 'domestic_gross', 'foreign_gross' column. Set 'foreign_gross' to float and 'year' to string
bom_gross = bom_gross.drop(columns = ['domestic_gross', 'foreign_gross'])
bom_gross['year'] = bom_gross['year'].astype(str)

In [10]:
#Remove years and right whitespace from titles. Remove potentially extraneous words.
years = ['\(2010\)', "\(2011\)", 
         "\(2012\)", "\(2013\)", "\(2014\)", 
         "\(2015\)", "\(2016\)", "\(2017\)", "\(2018\)"]
bom_gross.title = bom_gross.title.replace(years, value='', regex=True)
bom_gross.title = bom_gross.title.str.strip()
bom_gross.title = bom_gross.title.apply(lambda x: x.lower())
bom_gross.title = bom_gross.title.apply(lambda x: x.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')))
bom_gross.title = bom_gross.title.replace(['the', 'and'], value='', regex=True)

In [11]:
#Merge tn_trimmed and bom_gross
merged_df = pd.merge(tn_trimmed, bom_gross, on = ["title", "year"], how = "left")

In [12]:
#Replace null studio values
merged_df['studio'] = merged_df.studio.fillna('Unknown')

In [13]:
#Create 'year'column
imbd_basics['year'] = imbd_basics['start_year'].astype(str)

In [14]:
imbd_basics = imbd_basics.drop(columns = ['original_title', 'start_year'])

In [15]:
imbd_basics = imbd_basics.rename(columns = {'primary_title': 'title'})

In [17]:
imbd_basics.title = imbd_basics.title.str.strip()
imbd_basics.title = imbd_basics.title.apply(lambda x: x.lower())
imbd_basics.title = imbd_basics.title.apply(lambda x: x.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')))
imbd_basics.title = imbd_basics.title.replace(['the', 'and'], value='', regex=True)

In [18]:
merged_df2 = pd.merge(merged_df, imbd_basics, on = ["title", "year"], how = "left")

In [19]:
merged_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5846 entries, 0 to 5845
Data columns (total 10 columns):
title                5846 non-null object
production_budget    5846 non-null float32
worldwide_gross      5846 non-null float32
net                  5846 non-null float64
ratio                5846 non-null float64
year                 5846 non-null object
studio               5846 non-null object
tconst               1623 non-null object
runtime_minutes      1596 non-null float64
genres               1617 non-null object
dtypes: float32(2), float64(3), object(5)
memory usage: 456.7+ KB


In [23]:
merged_df2.head()

Unnamed: 0,title,production_budget,worldwide_gross,net,ratio,year,studio,tconst,runtime_minutes,genres
0,avatar,425000000.0,2776345344.0,2351345344.0,5.53,2009,Unknown,Unknown,,Unknown
1,pirates of caribbean on stranger tides,410600000.0,1045663872.0,635063872.0,1.55,2011,BV,tt1298650,136.0,"Action,Adventure,Fantasy"
2,dark phoenix,350000000.0,149762352.0,-200237648.0,-0.57,2019,Unknown,tt6565702,113.0,"Action,Adventure,Sci-Fi"
3,avengers age of ultron,330600000.0,1403014016.0,1072414016.0,3.24,2015,BV,tt2395427,141.0,"Action,Adventure,Sci-Fi"
4,star wars ep viii last jedi,317000000.0,1316721792.0,999721792.0,3.15,2017,Unknown,Unknown,,Unknown


In [None]:

bom_20 = bom_gross.loc[(bom_gross.domestic_gross >= bom_gross.domestic_gross.quantile(.8)) 
             | (bom_gross.foreign_gross >= bom_gross.foreign_gross.quantile(.8))]

In [27]:
merged_df2.year.unique()

array(['2009', '2011', '2019', '2015', '2017', '2018', '2007', '2012',
       '2013', '2010', '2016', '2014', '2006', '2008', '2005', '1997',
       '2004', '1999', '1995', '2003', '2001', '2020', '2002', '1998',
       '2000', '1991', '1994', '1996', '1993', '1992', '1988', '1990',
       '1989', '1978', '1981', '1984', '1982', '1985', '1980', '1963',
       '1987', '1986', '1983', '1979', '1977', '1970', '1969', '1976',
       '1965', '1962', '1964', '1959', '1966', '1974', '1956', '1975',
       '1973', '1960', '1967', '1968', '1971', '1951', '1972', '1961',
       '1946', '1944', '1953', '1954', '1957', '1952', '1930', '1939',
       '1925', '1950', '1948', '1958', '1943', '1940', '1945', '1947',
       '1938', '1927', '1949', '1955', '1936', '1937', '1941', '1942',
       '1933', '1935', '1931', '1916', '1929', '1934', '1915', '1920'],
      dtype=object)

In [28]:
merged_df2['year'] = merged_df2['year'].astype(int)

In [32]:
merged_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5846 entries, 0 to 5845
Data columns (total 10 columns):
title                5846 non-null object
production_budget    5846 non-null float32
worldwide_gross      5846 non-null float32
net                  5846 non-null float64
ratio                5846 non-null float64
year                 5846 non-null int32
studio               5846 non-null object
tconst               5846 non-null object
runtime_minutes      1596 non-null float64
genres               5846 non-null object
dtypes: float32(2), float64(3), int32(1), object(4)
memory usage: 433.9+ KB


In [33]:
merged_df2 = merged_df2.loc[merged_df2.year >= 2010]

In [37]:
merged_df2['year'] = merged_df2['year'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [39]:
merged_df2.head()

Unnamed: 0,title,production_budget,worldwide_gross,net,ratio,year,studio,tconst,runtime_minutes,genres
1,pirates of caribbean on stranger tides,410600000.0,1045663872.0,635063872.0,1.55,2011,BV,tt1298650,136.0,"Action,Adventure,Fantasy"
2,dark phoenix,350000000.0,149762352.0,-200237648.0,-0.57,2019,Unknown,tt6565702,113.0,"Action,Adventure,Sci-Fi"
3,avengers age of ultron,330600000.0,1403014016.0,1072414016.0,3.24,2015,BV,tt2395427,141.0,"Action,Adventure,Sci-Fi"
4,star wars ep viii last jedi,317000000.0,1316721792.0,999721792.0,3.15,2017,Unknown,Unknown,,Unknown
5,star wars ep vii force awakens,306000000.0,2053311232.0,1747311232.0,5.71,2015,Unknown,Unknown,,Unknown


In [68]:
studio_ratio_mean = merged_df2.groupby(['studio']).ratio.mean()

In [69]:
studio_ratio_mean.sort_values(ascending=False)[:20]

studio
WB (NL)    15.12
UTV        10.13
FD          8.94
Orch.       8.54
GrtIndia    7.78
BH Tilt     6.03
Uni.        5.54
Par.        5.13
MBox        4.88
A24         4.71
TriS        4.54
ParV        4.48
PNT         4.28
Eros        3.99
Wein.       3.44
FoxS        3.40
LGF         3.20
Affirm      3.04
W/Dim.      2.98
CJ          2.87
Name: ratio, dtype: float64

In [70]:
studio_ratio_median = merged_df2.groupby(['studio']).ratio.median()

In [71]:
studio_ratio_median.sort_values(ascending=False)[:20]

studio
Orch.      8.54
GrtIndia   7.78
UTV        5.66
PNT        5.26
MBox       4.88
TriS       3.46
Eros       3.37
BH Tilt    3.12
Affirm     3.04
CJ         2.87
PFR        2.66
P/DW       2.63
WB (NL)    2.57
W/Dim.     2.49
3D         2.30
BV         2.28
Uni.       2.15
SGem       1.92
Fox        1.81
A24        1.78
Name: ratio, dtype: float64

In [72]:
merged_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2258 entries, 1 to 5844
Data columns (total 10 columns):
title                2258 non-null object
production_budget    2258 non-null float32
worldwide_gross      2258 non-null float32
net                  2258 non-null float64
ratio                2258 non-null float64
year                 2258 non-null object
studio               2258 non-null object
tconst               2258 non-null object
runtime_minutes      1596 non-null float64
genres               2258 non-null object
dtypes: float32(2), float64(3), object(5)
memory usage: 176.4+ KB


In [80]:
imbd_name.head()

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"


In [82]:
imbd_principals.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"


In [83]:
imbd_name_prin = pd.merge(imbd_principals, imbd_name, on = ["nconst"], how = "left")

In [86]:
imbd_name_prin = imbd_name_prin.drop(columns = ['ordering', 'nconst', 
                               'job', 'characters', 'birth_year', 'death_year', 
                               'primary_profession', 'known_for_titles'])

In [90]:
directors = imbd_name_prin.loc[imbd_name_prin.category == 'director']

In [91]:
merged_df2.head()

Unnamed: 0,title,production_budget,worldwide_gross,net,ratio,year,studio,tconst,runtime_minutes,genres
1,pirates of caribbean on stranger tides,410600000.0,1045663872.0,635063872.0,1.55,2011,BV,tt1298650,136.0,"Action,Adventure,Fantasy"
2,dark phoenix,350000000.0,149762352.0,-200237648.0,-0.57,2019,Unknown,tt6565702,113.0,"Action,Adventure,Sci-Fi"
3,avengers age of ultron,330600000.0,1403014016.0,1072414016.0,3.24,2015,BV,tt2395427,141.0,"Action,Adventure,Sci-Fi"
4,star wars ep viii last jedi,317000000.0,1316721792.0,999721792.0,3.15,2017,Unknown,Unknown,,Unknown
5,star wars ep vii force awakens,306000000.0,2053311232.0,1747311232.0,5.71,2015,Unknown,Unknown,,Unknown


In [93]:
merged_df3 = pd.merge(merged_df2, directors, on = ["tconst"], how = "left")

In [95]:
merged_df3 = merged_df3.drop(columns = 'category')
merged_df3 = merged_df3.rename(columns = {'primary_name': 'director'})

In [96]:
merged_df3.head()

Unnamed: 0,title,production_budget,worldwide_gross,net,ratio,year,studio,tconst,runtime_minutes,genres,director
0,pirates of caribbean on stranger tides,410600000.0,1045663872.0,635063872.0,1.55,2011,BV,tt1298650,136.0,"Action,Adventure,Fantasy",Rob Marshall
1,dark phoenix,350000000.0,149762352.0,-200237648.0,-0.57,2019,Unknown,tt6565702,113.0,"Action,Adventure,Sci-Fi",Simon Kinberg
2,avengers age of ultron,330600000.0,1403014016.0,1072414016.0,3.24,2015,BV,tt2395427,141.0,"Action,Adventure,Sci-Fi",Joss Whedon
3,star wars ep viii last jedi,317000000.0,1316721792.0,999721792.0,3.15,2017,Unknown,Unknown,,Unknown,
4,star wars ep vii force awakens,306000000.0,2053311232.0,1747311232.0,5.71,2015,Unknown,Unknown,,Unknown,


In [97]:
merged_df3['director'] = merged_df3.director.fillna('Not listed')

In [99]:
producers = imbd_name_prin.loc[imbd_name_prin.category == 'producer']

In [101]:
producers = producers.rename(columns = {'primary_name': 'producer'})

In [102]:
producers = producers.drop(columns = 'category')

In [104]:
merged_df4 = pd.merge(merged_df3, producers, on = ["tconst"], how = "left")

In [106]:
merged_df4['producer'] = merged_df4.producer.fillna('Not listed')

In [107]:
merged_df4.head()

Unnamed: 0,title,production_budget,worldwide_gross,net,ratio,year,studio,tconst,runtime_minutes,genres,director,producer
0,pirates of caribbean on stranger tides,410600000.0,1045663872.0,635063872.0,1.55,2011,BV,tt1298650,136.0,"Action,Adventure,Fantasy",Rob Marshall,Not listed
1,dark phoenix,350000000.0,149762352.0,-200237648.0,-0.57,2019,Unknown,tt6565702,113.0,"Action,Adventure,Sci-Fi",Simon Kinberg,Not listed
2,avengers age of ultron,330600000.0,1403014016.0,1072414016.0,3.24,2015,BV,tt2395427,141.0,"Action,Adventure,Sci-Fi",Joss Whedon,Kevin Feige
3,star wars ep viii last jedi,317000000.0,1316721792.0,999721792.0,3.15,2017,Unknown,Unknown,,Unknown,Not listed,Not listed
4,star wars ep vii force awakens,306000000.0,2053311232.0,1747311232.0,5.71,2015,Unknown,Unknown,,Unknown,Not listed,Not listed


In [115]:
director_ratio_mean = merged_df4.groupby(['director']).ratio.mean().sort_values(ascending=False)[:20]

In [116]:
director_ratio_mean

director
Chris Lofing         415.56
Travis Cluff         415.56
Tod Williams          58.17
Bradley Parker        41.41
Jordan Peele          30.89
Nitesh Tiwari         30.02
Jose G. Ramos         28.76
Yun Xie               26.18
Junshu Huang          26.18
John R. Leonetti      26.00
William Brent Bell    24.81
Josh Boone            24.60
Alex Kendrick         23.66
Henry Joost           22.69
Ariel Schulman        22.69
Daniel Stamm          22.39
Dan Trachtenberg      20.66
David F. Sandberg     19.90
Stiles White          19.66
Barry Jenkins         18.59
Name: ratio, dtype: float64

In [117]:
producer_ratio_mean = merged_df4.groupby(['producer']).ratio.mean().sort_values(ascending=False)[:20]

In [118]:
producer_ratio_mean

producer
Benjamin Forkner     415.56
Dean Schnider        415.56
Guymon Casady        208.69
Morris Paulson       100.76
Edward H. Hamm Jr.    50.07
Brian Witten          41.41
Jason Blum            34.50
Lawrence Grey         28.76
Adam Donaghey         26.70
Zheng Jun             26.18
Jera Wang             26.18
Jack Zheng            26.18
Malek Akkad           24.49
James Wan             22.69
Thomas A. Bliss       21.67
Adele Romanski        21.58
Steven Schneider      20.68
Lindsey Weber         20.66
Sean McKittrick       20.46
Oren Peli             19.41
Name: ratio, dtype: float64

In [111]:
merged_df4.producer.value_counts()[:21]

Not listed                931
Jason Blum                 42
Tim Bevan                  21
Eric Fellner               19
Scott Rudin                17
Lorenzo di Bonaventura     16
Peter Chernin              15
Broderick Johnson          13
Ryan Kavanaugh             13
Neal H. Moritz             13
Janet Healy                12
Scott Stuber               12
Andrew A. Kosove           12
Wyck Godfrey               12
Michael De Luca            12
John Davis                 11
Beau Flynn                 11
Brian Grazer               11
Nina Jacobson              11
Marty Bowen                11
Michael Bay                11
Name: producer, dtype: int64

In [None]:
merged_df.head()

In [None]:
title_ids = list(merged_df.title_id.unique())

In [None]:
#Drop "birth_year" and "death_year" columns. Fill in null values with 'null'.
imbd_name = imbd_name.drop(columns = ['birth_year', 'death_year', 'primary_profession', 'known_for_titles'])
#imbd_name = imbd_name.fillna('null')

#Turn 'primary_profession' and 'known_for_titles' from string to list
#imbd_name['primary_profession'] = imbd_name['primary_profession'].str.split(',')
#imbd_name['known_for_titles'] = imbd_name['known_for_titles'].str.split(',')

In [None]:
imbd_basics.head()

In [None]:
imbd_basics.primary_title = imbd_basics.primary_title.str.strip()
imbd_basics.primary_title = imbd_basics.primary_title.apply(lambda x: x.lower())
imbd_basics.primary_title = imbd_basics.primary_title.apply(lambda x: x.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')))
imbd_basics.primary_title = imbd_basics.primary_title.replace(['the', 'and'], value='', regex=True)

In [None]:
imbd_basics = imbd_basics.drop(columns = ['original_title', 'runtime_minutes'])

In [None]:
imbd_basics = imbd_basics.rename(columns = {'primary_title': 'title', 'start_year': 'year'})

In [None]:
imbd_basics['year'] = imbd_basics['year'].astype(str)

In [None]:
merged_df = pd.merge(bom_20, imbd_basics, on = ["title", 'year'], how = "inner")

In [None]:
merged_df.head()

In [None]:
imbd_principals = imbd_principals.drop(columns = ['ordering', 'job', 'characters'])

In [None]:
imbd_principals.head()

In [None]:
imbd_merge = pd.merge(imbd_name, imbd_principals, on = ['nconst'], how = "outer")

In [None]:
imbd_merge.head()

In [None]:
imbd_director = imbd_merge.rename(columns = {'nconst': 'directors', 'primary_name': 'director_name'})

In [None]:
imbd_director = imbd_director.drop(columns = 'category')

In [None]:
imbd_director

In [None]:
imbd_crews = pd.merge(imbd_crews, imbd_director, on = ['directors', 'tconst'])

In [None]:
imbd_directors = imbd_crews.drop(columns = "writers")

In [None]:
imbd_basics = pd.merge(imbd_basics, imbd_directors, on = 'tconst')

In [None]:
imbd_bom = pd.merge(bom_20, imbd_basics, on = ['title', 'year'], how = 'left')

In [None]:
imbd_bom.info()

In [None]:
list_dup = list(imbd_bom.loc[imbd_bom.title.duplicated() == True].title.unique())

In [None]:
imbd_bom.loc[imbd_bom.title.isin(list_dup) == True]

In [None]:
original_list = ['David F. Sandberg', 'Robert Zemeckis', 'Tom McCarthy', 'Jason Moore', 'Tim Johnson', 
               'Kenneth Branagh', 'Brad Anderson', 'John Singleton', 'Michel Hazanavicius', 'David Fincher',
              'Steve Antin', 'Ridley Scott', 'Tim Burton']

In [None]:
imbd_bom = imbd_bom.loc[(imbd_bom.title.isin(list_dup) == False) 
             | ((imbd_bom.director_name.isin(original_list) == True) 
               & (imbd_bom.title.isin(list_dup) == True))]

In [None]:
imbd_bom.head()

In [None]:
imbd_bom.loc[(imbd_bom.director_name.duplicated() == True) & ( imbd_bom.director_name.isna() == False)].director_name.nunique()

In [None]:
imbd_principals.head()

In [None]:
imbd_actor = imbd_principals.loc[imbd_principals.category == "actor"]

In [None]:
imbd_name.head()

In [None]:
imbd_actor = pd.merge(imbd_actor, imbd_name, on = 'nconst', how = 'inner')

In [None]:
imbd_actor = imbd_actor.rename(columns = {'primary_name': 'actor_name'})

In [None]:
imbd_actress = imbd_principals.loc[imbd_principals.category == "actress"]

In [None]:
imbd_actress = pd.merge(imbd_actress, imbd_name, on = 'nconst', how = 'inner')

In [None]:
imbd_actress = imbd_actress.rename(columns = {'primary_name': 'actress_name'})

In [None]:
imbd_actress.head()

In [None]:
imbd_actors = pd.merge(imbd_actor, imbd_actress, on = ['tconst', 'nconst'])

In [None]:
imbd_actors = pd.concat[imbd_actor, imbd_actress]