In [16]:
#loading libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# setting plt to inline
%matplotlib inline

In [17]:
# importing sqlite3
import sqlite3

# conneting to db and creating cursor
conn = sqlite3.Connection('data/movies.db')
cur = conn.cursor()

# creating query functions
def fetcha(q):
    return cur.execute(q).fetchall()
def fetcho(q):
    return cur.execute(q).fetchone()

In [18]:
# getting table names
q = """SELECT name FROM sqlite_master 
WHERE type IN ('table','view') 
AND name NOT LIKE 'sqlite_%'
ORDER BY 1"""
fetcha(q)

[('bom_movie_gross',),
 ('imdb_name_basic',),
 ('imdb_name_basics',),
 ('imdb_title_akas',),
 ('imdb_title_basics',),
 ('imdb_title_crew',),
 ('imdb_title_principals',),
 ('imdb_title_ratings',),
 ('rotten_tomatoes_critic_reviews',),
 ('rotten_tomatoes_movies',),
 ('tmdb_movies',),
 ('tn_movie_budgets',)]

In [19]:
q = "PRAGMA table_info('imdb_title_principals')"
fetcha(q)

[(0, 'idx', 'INTEGER', 0, None, 1),
 (1, 'tconst', 'TEXT', 0, None, 0),
 (2, 'ordering', 'INTEGER', 0, None, 0),
 (3, 'nconst', 'TEXT', 0, None, 0),
 (4, 'category', 'TEXT', 0, None, 0),
 (5, 'job', 'TEXT', 0, None, 0),
 (6, 'characters', 'TEXT', 0, None, 0)]

In [20]:
q = "PRAGMA table_info('imdb_name_basics')"
fetcha(q)

[(0, 'idx', 'INTEGER', 0, None, 0),
 (1, 'nconst', 'TEXT', 0, None, 0),
 (2, 'primary_name', 'TEXT', 0, None, 0),
 (3, 'birth_year', 'REAL', 0, None, 0),
 (4, 'death_year', 'REAL', 0, None, 0),
 (5, 'primary_profession', 'TEXT', 0, None, 0),
 (6, 'known_for_titles', 'TEXT', 0, None, 0)]

In [21]:
q = "PRAGMA table_info('imdb_title_basics')"
fetcha(q)

[(0, 'idx', 'INTEGER', 0, None, 1),
 (1, 'tconst', 'TEXT', 0, None, 0),
 (2, 'primary_title', 'TEXT', 0, None, 0),
 (3, 'original_title', 'TEXT', 0, None, 0),
 (4, 'start_year', 'INTEGER', 0, None, 0),
 (5, 'runtime_minutes', 'REAL', 0, None, 0),
 (6, 'genres', 'TEXT', 0, None, 0)]

In [22]:
# getting column names
q = "PRAGMA table_info('tn_movie_budgets')"
fetcha(q)

[(0, 'idx', 'INTEGER', 0, None, 1),
 (1, 'id', 'INTEGER', 0, None, 0),
 (2, 'release_date', 'TEXT', 0, None, 0),
 (3, 'movie', 'TEXT', 0, None, 0),
 (4, 'production_budget', 'TEXT', 0, None, 0),
 (5, 'domestic_gross', 'TEXT', 0, None, 0),
 (6, 'worldwide_gross', 'TEXT', 0, None, 0)]

In [23]:
# creating initial dataframe by joining the four tables and printing first row
q = """SELECT*FROM imdb_name_basics
       JOIN imdb_title_principals 
       USING(nconst)
       JOIN imdb_title_basics AS ib
       USING(tconst)
       JOIN tn_movie_budgets AS tn
       ON ib.primary_title = tn.movie
       
       """
df = pd.DataFrame(fetcha(q))
df.columns = [i[0] for i in cur.description]
df.head(1)     

Unnamed: 0,idx,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles,idx.1,tconst,ordering,...,start_year,runtime_minutes,genres,idx.2,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553",466550,tt2398241,9,...,2017,90.0,"Adventure,Animation,Comedy",904,5,"Apr 7, 2017",Smurfs: The Lost Village,"$60,000,000","$45,020,282","$197,578,586"


In [24]:
# checking dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32406 entries, 0 to 32405
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   idx                 32406 non-null  int64  
 1   nconst              32406 non-null  object 
 2   primary_name        32406 non-null  object 
 3   birth_year          14742 non-null  float64
 4   death_year          591 non-null    float64
 5   primary_profession  31872 non-null  object 
 6   known_for_titles    32012 non-null  object 
 7   idx                 32406 non-null  int64  
 8   tconst              32406 non-null  object 
 9   ordering            32406 non-null  int64  
 10  category            32406 non-null  object 
 11  job                 9556 non-null   object 
 12  characters          13012 non-null  object 
 13  idx                 32406 non-null  int64  
 14  primary_title       32406 non-null  object 
 15  original_title      32406 non-null  object 
 16  star

In [25]:
# dropping extraneous columns
df = df.drop(['idx', 'birth_year', 'death_year', 'nconst', 'primary_profession',\
              'primary_title', 'original_title', 'id',\
              'known_for_titles', 'tconst', 'ordering',\
              'job', 'characters', 'start_year','runtime_minutes'], axis=1)
df.head(1)

Unnamed: 0,primary_name,category,genres,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,Mary Ellen Bauder,producer,"Adventure,Animation,Comedy","Apr 7, 2017",Smurfs: The Lost Village,"$60,000,000","$45,020,282","$197,578,586"


In [26]:
# checking numbers of NaNs
df.isna().sum()

primary_name           0
category               0
genres               260
release_date           0
movie                  0
production_budget      0
domestic_gross         0
worldwide_gross        0
dtype: int64

In [27]:
#cleaning numerical strings and converting to floats
df['worldwide_gross'] = df['worldwide_gross']\
                                           .apply(lambda x: x.strip("$")\
                                           .replace(',', '')).astype('float')
df['domestic_gross'] = df['domestic_gross']\
                                           .apply(lambda x: x.strip("$")\
                                           .replace(',', '')).astype('float')
df['production_budget'] = df['production_budget']\
                                             .apply(lambda x: x.strip("$")\
                                             .replace(',', '')).astype('float')

In [28]:
# change date string to datetime object 
df.release_date = pd.to_datetime(df.release_date)

In [29]:
df['international_gross'] = df['worldwide_gross'].sub(df['domestic_gross'])
df.head(1)

Unnamed: 0,primary_name,category,genres,release_date,movie,production_budget,domestic_gross,worldwide_gross,international_gross
0,Mary Ellen Bauder,producer,"Adventure,Animation,Comedy",2017-04-07,Smurfs: The Lost Village,60000000.0,45020282.0,197578586.0,152558304.0


In [30]:
#adding net profit columns
df['domestic_net'] = df['domestic_gross'].sub(df['production_budget'])
df['worldwide_net'] = df['worldwide_gross'].sub(df['production_budget'])
df['international_net'] = df['worldwide_gross'].sub(df['production_budget'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32406 entries, 0 to 32405
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   primary_name         32406 non-null  object        
 1   category             32406 non-null  object        
 2   genres               32146 non-null  object        
 3   release_date         32406 non-null  datetime64[ns]
 4   movie                32406 non-null  object        
 5   production_budget    32406 non-null  float64       
 6   domestic_gross       32406 non-null  float64       
 7   worldwide_gross      32406 non-null  float64       
 8   international_gross  32406 non-null  float64       
 9   domestic_net         32406 non-null  float64       
 10  worldwide_net        32406 non-null  float64       
 11  international_net    32406 non-null  float64       
dtypes: datetime64[ns](1), float64(7), object(4)
memory usage: 3.0+ MB


In [31]:
#adding return on investment columns and checking results
df['domestic_roi'] = df['domestic_net'].divide(df['production_budget'])
df['worldwide_roi'] = df['worldwide_net'].divide(df['production_budget'])
df['international_roi'] = df['international_net'].divide(df['production_budget'])
df.head(1)

Unnamed: 0,primary_name,category,genres,release_date,movie,production_budget,domestic_gross,worldwide_gross,international_gross,domestic_net,worldwide_net,international_net,domestic_roi,worldwide_roi,international_roi
0,Mary Ellen Bauder,producer,"Adventure,Animation,Comedy",2017-04-07,Smurfs: The Lost Village,60000000.0,45020282.0,197578586.0,152558304.0,-14979718.0,137578586.0,137578586.0,-0.249662,2.292976,2.292976


In [32]:
title_gross = df[['movie', 'worldwide_gross', 'release_date']]
unique_titles = title_gross.groupby('movie').first().reset_index()
top20_grossing_films = unique_titles.sort_values(by='worldwide_gross', ascending=False).head(20)
top20_grossing_films

Unnamed: 0,movie,worldwide_gross,release_date
176,Avatar,2776345000.0,2009-12-18
2109,Titanic,2208208000.0,1997-12-19
178,Avengers: Infinity War,2048134000.0,2018-04-27
932,Jurassic World,1648855000.0,2015-06-12
674,Furious 7,1518723000.0,2015-04-03
1651,The Avengers,1517936000.0,2012-05-04
177,Avengers: Age of Ultron,1403014000.0,2015-05-01
259,Black Panther,1348258000.0,2018-02-16
933,Jurassic World: Fallen Kingdom,1305773000.0,2018-06-22
669,Frozen,1272470000.0,2013-11-22


In [33]:
title_net = df[['movie','production_budget', 'worldwide_gross', 'worldwide_net', 'release_date']]
unique_titles = title_net.groupby('movie').first().reset_index()
top20_net_films = unique_titles.sort_values(by='worldwide_net', ascending=False).head(20)
top20_net_films

Unnamed: 0,movie,production_budget,worldwide_gross,worldwide_net,release_date
176,Avatar,425000000.0,2776345000.0,2351345000.0,2009-12-18
2109,Titanic,200000000.0,2208208000.0,2008208000.0,1997-12-19
178,Avengers: Infinity War,300000000.0,2048134000.0,1748134000.0,2018-04-27
932,Jurassic World,215000000.0,1648855000.0,1433855000.0,2015-06-12
674,Furious 7,190000000.0,1518723000.0,1328723000.0,2015-04-03
1651,The Avengers,225000000.0,1517936000.0,1292936000.0,2012-05-04
259,Black Panther,200000000.0,1348258000.0,1148258000.0,2018-02-16
933,Jurassic World: Fallen Kingdom,170000000.0,1305773000.0,1135773000.0,2018-06-22
669,Frozen,150000000.0,1272470000.0,1122470000.0,2013-11-22
216,Beauty and the Beast,160000000.0,1259200000.0,1099200000.0,2017-03-17


In [34]:
title_roi = df[['movie','production_budget', 'worldwide_gross', 'worldwide_roi', 'release_date']]
unique_titles = title_roi.groupby('movie').first().reset_index()
top20_roi_films = unique_titles.sort_values(by='worldwide_roi', ascending=False).head(20)
top20_roi_films

Unnamed: 0,movie,production_budget,worldwide_gross,worldwide_roi,release_date
1781,The Gallows,100000.0,41656474.0,415.56474,2015-07-10
195,Bambi,858000.0,268000000.0,311.354312,1942-08-13
1398,Rocky,1000000.0,225000000.0,224.0,1976-11-21
1518,Snow White and the Seven Dwarfs,1488000.0,184925486.0,123.27788,1937-12-21
1732,The Devil Inside,1000000.0,101759490.0,100.75949,2012-01-06
724,Graduation Day,250000.0,23894000.0,94.576,1981-05-01
875,Insidious,1500000.0,99870886.0,65.580591,2011-04-01
2167,Unfriended,1000000.0,64364198.0,63.364198,2015-04-17
231,Benji,500000.0,31559560.0,62.11912,1974-11-15
1247,Paranormal Activity 2,3000000.0,177512032.0,58.170677,2010-10-20


In [35]:
df.category.unique()

array(['producer', 'composer', 'actor', 'cinematographer', 'director',
       'writer', 'actress', 'editor', 'production_designer', 'self',
       'archive_footage', 'archive_sound'], dtype=object)

In [91]:
years1 = top20_grossing_films.release_date.unique()
np.sort(years1)

array(['1997-12-19T00:00:00.000000000', '2009-12-18T00:00:00.000000000',
       '2011-06-29T00:00:00.000000000', '2012-05-04T00:00:00.000000000',
       '2012-11-08T00:00:00.000000000', '2013-05-03T00:00:00.000000000',
       '2013-11-22T00:00:00.000000000', '2015-04-03T00:00:00.000000000',
       '2015-05-01T00:00:00.000000000', '2015-06-12T00:00:00.000000000',
       '2015-07-10T00:00:00.000000000', '2016-05-06T00:00:00.000000000',
       '2017-03-17T00:00:00.000000000', '2017-04-14T00:00:00.000000000',
       '2018-02-16T00:00:00.000000000', '2018-04-27T00:00:00.000000000',
       '2018-06-15T00:00:00.000000000', '2018-06-22T00:00:00.000000000',
       '2018-12-21T00:00:00.000000000', '2019-03-08T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [88]:
actor_df = df[df['category'].str.contains('actor')]

In [82]:
top20_grossing_actors = actor_df.groupby(['primary_name']).worldwide_gross.sum().sort_values(ascending=False).head(20) 
top20 = pd.DataFrame(top20_grossing_actors)

In [84]:
movie_count = actor_df.groupby(['primary_name']).movie.count()
count = pd.DataFrame(movie_count)

In [85]:
joined = top20.join(count)
joined.head()

Unnamed: 0_level_0,worldwide_gross,movie
primary_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Robert Downey Jr.,9697976000.0,11
Dwayne Johnson,8042797000.0,19
Chris Evans,7431880000.0,13
Chris Hemsworth,6205525000.0,15
Mark Ruffalo,5466510000.0,10


In [86]:
joined['mean_gross'] = joined.worldwide_gross/joined.movie
top20_actors_mean = joined.sort_values(by='mean_gross', ascending=False).head(20)

Unnamed: 0_level_0,worldwide_gross,movie,mean_gross
primary_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ian McKellen,3670811000.0,4,917702700.0
Robert Downey Jr.,9697976000.0,11,881634200.0
Vin Diesel,4725216000.0,7,675030900.0
Chris Evans,7431880000.0,13,571683100.0
Mark Ruffalo,5466510000.0,10,546651000.0
Josh Gad,3626120000.0,7,518017100.0
Chris Pratt,4984451000.0,10,498445100.0
Dwayne Johnson,8042797000.0,19,423305100.0
Liam Hemsworth,3787563000.0,9,420840400.0
Chris Hemsworth,6205525000.0,15,413701700.0


In [54]:
actress_df = df[df['category'].str.contains('actress')]
actress_df.head()

Unnamed: 0,primary_name,category,genres,release_date,movie,production_budget,domestic_gross,worldwide_gross,international_gross,domestic_net,worldwide_net,international_net,domestic_roi,worldwide_roi,international_roi
27,Agata Buzek,actress,"Action,Crime,Drama",2013-06-28,Redemption,23000000.0,36686.0,13593473.0,13556787.0,-22963314.0,-9406527.0,-0.408979,-0.998405,-0.408979,-1.778171e-08
30,Valentina Carnelutti,actress,"Comedy,Drama",2011-10-28,Like Crazy,250000.0,3395391.0,3728400.0,333009.0,3145391.0,3478400.0,13.9136,12.581564,13.9136,5.56544e-05
33,Cecilia Cheung,actress,"Drama,Mystery,Romance",2012-11-09,Dangerous Liaisons,24200000.0,17000.0,10447579.0,10430579.0,-24183000.0,-13752421.0,-0.568282,-0.999298,-0.568282,-2.348272e-08
34,Cecilia Cheung,actress,"Drama,Mystery,Romance",1988-12-21,Dangerous Liaisons,14000000.0,34700000.0,34700000.0,0.0,20700000.0,20700000.0,1.478571,1.478571,1.478571,1.056122e-07
65,Nazanin Farahani,actress,Drama,1996-07-05,Phenomenon,32000000.0,104636382.0,152036382.0,47400000.0,72636382.0,120036382.0,3.751137,2.269887,3.751137,1.17223e-07


In [55]:
top_actresses_films = actress_df.sort_values(by=['worldwide_gross'], ascending=False).head(50)
top_actresses_films[['primary_name', 'movie', 'worldwide_gross', 'release_date']].head()

Unnamed: 0,primary_name,movie,worldwide_gross,release_date
19181,Ai Hashimoto,Avatar,2776345000.0,2009-12-18
22193,Rikako Sakata,Avatar,2776345000.0,2009-12-18
25135,Yuri Hane,Avatar,2776345000.0,2009-12-18
23039,Mizuki Sashide,Avatar,2776345000.0,2009-12-18
19965,Nako Mizusawa,Avatar,2776345000.0,2009-12-18


In [56]:
years3 = top_actresses_films.release_date.unique()
print(years3)
print(len(years3))
np.sort(years3)

['2009-12-18T00:00:00.000000000' '1997-12-19T00:00:00.000000000'
 '2015-06-12T00:00:00.000000000' '2012-05-04T00:00:00.000000000'
 '2018-02-16T00:00:00.000000000' '2018-06-22T00:00:00.000000000'
 '2013-11-22T00:00:00.000000000' '2017-03-17T00:00:00.000000000'
 '2018-06-15T00:00:00.000000000' '2017-04-14T00:00:00.000000000'
 '2013-05-03T00:00:00.000000000' '2015-07-10T00:00:00.000000000'
 '2018-12-21T00:00:00.000000000' '2016-05-06T00:00:00.000000000'
 '2011-06-29T00:00:00.000000000' '2019-03-08T00:00:00.000000000'
 '2012-11-08T00:00:00.000000000' '2014-06-27T00:00:00.000000000'
 '2012-07-20T00:00:00.000000000' '2010-06-18T00:00:00.000000000'
 '2016-12-16T00:00:00.000000000' '2011-05-20T00:00:00.000000000'
 '2017-06-30T00:00:00.000000000' '2010-03-05T00:00:00.000000000']
24


array(['1997-12-19T00:00:00.000000000', '2009-12-18T00:00:00.000000000',
       '2010-03-05T00:00:00.000000000', '2010-06-18T00:00:00.000000000',
       '2011-05-20T00:00:00.000000000', '2011-06-29T00:00:00.000000000',
       '2012-05-04T00:00:00.000000000', '2012-07-20T00:00:00.000000000',
       '2012-11-08T00:00:00.000000000', '2013-05-03T00:00:00.000000000',
       '2013-11-22T00:00:00.000000000', '2014-06-27T00:00:00.000000000',
       '2015-06-12T00:00:00.000000000', '2015-07-10T00:00:00.000000000',
       '2016-05-06T00:00:00.000000000', '2016-12-16T00:00:00.000000000',
       '2017-03-17T00:00:00.000000000', '2017-04-14T00:00:00.000000000',
       '2017-06-30T00:00:00.000000000', '2018-02-16T00:00:00.000000000',
       '2018-06-15T00:00:00.000000000', '2018-06-22T00:00:00.000000000',
       '2018-12-21T00:00:00.000000000', '2019-03-08T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [57]:
top20_grossing_actresses = actress_df.groupby(['primary_name']).worldwide_gross.sum().sort_values(ascending=False).head(20)
top20_grossing_actresses

primary_name
Jennifer Lawrence       5.113642e+09
Scarlett Johansson      4.943102e+09
Anne Hathaway           4.559581e+09
Cate Blanchett          4.501554e+09
Kristen Wiig            3.879445e+09
Bryce Dallas Howard     3.426385e+09
Emma Stone              3.118735e+09
Mizuki Sashide          2.776345e+09
Rikako Sakata           2.776345e+09
Yuri Hane               2.776345e+09
Nako Mizusawa           2.776345e+09
Ai Hashimoto            2.776345e+09
Léa Seydoux             2.747252e+09
Amy Adams               2.605902e+09
Angelina Jolie          2.514083e+09
Helena Bonham Carter    2.376813e+09
Emma Watson             2.360011e+09
Emily Blunt             2.347987e+09
Faith Tarby             2.208208e+09
Eliza Jones             2.208208e+09
Name: worldwide_gross, dtype: float64

In [59]:
director_df = df[df['category'].str.contains('director')]
director_df.head(1)

Unnamed: 0,primary_name,category,genres,release_date,movie,production_budget,domestic_gross,worldwide_gross,international_gross,domestic_net,worldwide_net,international_net,domestic_roi,worldwide_roi,international_roi
9,David Bowers,director,"Comedy,Family",2011-03-25,Diary of a Wimpy Kid: Rodrick Rules,18000000.0,52698535.0,73695194.0,20996659.0,34698535.0,55695194.0,3.094177,1.927696,3.094177,1.718987e-07


In [60]:
top_directors_films = director_df.sort_values(by=['worldwide_gross'], ascending=False).head(20)
top_directors_films[['primary_name', 'movie', 'worldwide_gross', 'release_date']].head()

Unnamed: 0,primary_name,movie,worldwide_gross,release_date
25514,Atsushi Wada,Avatar,2776345000.0,2009-12-18
4866,Ravi Punj,Titanic,2208208000.0,1997-12-19
6689,Anthony Russo,Avengers: Infinity War,2048134000.0,2018-04-27
485,Joe Russo,Avengers: Infinity War,2048134000.0,2018-04-27
11023,Colin Trevorrow,Jurassic World,1648855000.0,2015-06-12


In [61]:
top20_grossing_directors = director_df.groupby(['primary_name']).worldwide_gross.sum().sort_values(ascending=False).head(20)
top20_grossing_directors

primary_name
Joe Russo            3.902606e+09
Anthony Russo        3.902606e+09
Christopher Nolan    3.086180e+09
James Wan            3.083488e+09
Joss Whedon          2.992085e+09
Peter Jackson        2.922948e+09
Michael Bay          2.911998e+09
Atsushi Wada         2.776345e+09
Jon Favreau          2.614569e+09
Pierre Coffin        2.553409e+09
Francis Lawrence     2.543192e+09
Chris Renaud         2.518783e+09
Zack Snyder          2.420920e+09
Bryan Singer         2.383073e+09
Ravi Punj            2.208208e+09
Kyle Balda           2.195064e+09
Marilyn Barnes       2.174401e+09
Ridley Scott         2.042771e+09
Sam Mendes           1.990148e+09
Steven Spielberg     1.969040e+09
Name: worldwide_gross, dtype: float64

In [62]:
producer_df = df[df['category'].str.contains('producer')]
producer_df.head()

Unnamed: 0,primary_name,category,genres,release_date,movie,production_budget,domestic_gross,worldwide_gross,international_gross,domestic_net,worldwide_net,international_net,domestic_roi,worldwide_roi,international_roi
0,Mary Ellen Bauder,producer,"Adventure,Animation,Comedy",2017-04-07,Smurfs: The Lost Village,60000000.0,45020282.0,197578586.0,152558304.0,-14979718.0,137578586.0,2.292976,-0.249662,2.292976,3.821627e-08
2,Terry Benedict,producer,"Biography,Drama,History",2016-11-04,Hacksaw Ridge,40000000.0,67209615.0,168904682.0,101695067.0,27209615.0,128904682.0,3.222617,0.68024,3.222617,8.056543e-08
3,Sam Bisbee,producer,"Comedy,Crime,Drama",2012-08-17,Robot & Frank,2500000.0,3317468.0,4934356.0,1616888.0,817468.0,2434356.0,0.973742,0.326987,0.973742,3.89497e-07
4,Sam Bisbee,producer,"Crime,Thriller",2015-08-07,Cop Car,5000000.0,134552.0,143658.0,9106.0,-4865448.0,-4856342.0,-0.971268,-0.97309,-0.971268,-1.942537e-07
5,Sam Bisbee,producer,"Comedy,Drama,Music",2018-06-08,Hearts Beat Loud,2000000.0,2386254.0,2420962.0,34708.0,386254.0,420962.0,0.210481,0.193127,0.210481,1.052405e-07


In [63]:
top_producers_films = producer_df.sort_values(by=['worldwide_gross'], ascending=False).head(20)
top_producers_films[['primary_name', 'movie', 'worldwide_gross', 'release_date']].head()

Unnamed: 0,primary_name,movie,worldwide_gross,release_date
26093,Baljinder Singh,Titanic,2208208000.0,1997-12-19
13905,Patrick Crowley,Jurassic World,1648855000.0,2015-06-12
447,Neal H. Moritz,Furious 7,1518723000.0,2015-04-03
1735,Michael Fottrell,Furious 7,1518723000.0,2015-04-03
11595,Kevin Feige,The Avengers,1517936000.0,2012-05-04


In [64]:
top20_grossing_producers = producer_df.groupby(['primary_name']).worldwide_gross.sum().sort_values(ascending=False).head(20)
top20_grossing_producers

primary_name
Kevin Feige               6.487829e+09
Janet Healy               4.804838e+09
Neal H. Moritz            4.443698e+09
Lorenzo di Bonaventura    4.024103e+09
Simon Kinberg             3.778237e+09
Michael Fottrell          3.600927e+09
Jason Blum                3.579377e+09
Peter Chernin             3.550046e+09
Patrick Crowley           3.492568e+09
Nina Jacobson             3.442156e+09
Charles Roven             3.273026e+09
Emma Thomas               3.086180e+09
Christopher Meledandri    3.072017e+09
Jon Kilik                 3.016263e+09
Ian Bryce                 2.937349e+09
Carolynne Cunningham      2.922948e+09
Tom DeSanto               2.693025e+09
Don Murphy                2.693025e+09
Joe Roth                  2.651850e+09
David Heyman              2.436039e+09
Name: worldwide_gross, dtype: float64

In [65]:
writer_df = df[df['category'].str.contains('writer')]
writer_df.head()

Unnamed: 0,primary_name,category,genres,release_date,movie,production_budget,domestic_gross,worldwide_gross,international_gross,domestic_net,worldwide_net,international_net,domestic_roi,worldwide_roi,international_roi
14,Francesco Bruni,writer,"Comedy,Drama,Romance",1998-10-09,Slam,1000000.0,1009819.0,1087521.0,77702.0,9819.0,87521.0,0.087521,0.009819,0.087521,8.7521e-08
15,Ron Burch,writer,"Adventure,Animation,Comedy",2017-12-15,Ferdinand,111000000.0,84410380.0,289847930.0,205437550.0,-26589620.0,178847930.0,1.611243,-0.239546,1.611243,1.45157e-08
22,Jez Butterworth,writer,"Biography,Drama,Thriller",1995-11-03,Fair Game,50000000.0,11497497.0,26097497.0,14600000.0,-38502503.0,-23902503.0,-0.47805,-0.77005,-0.47805,-9.561001e-09
23,Jez Butterworth,writer,"Biography,Crime,Drama",2015-09-18,Black Mass,53000000.0,62575678.0,98837872.0,36262194.0,9575678.0,45837872.0,0.864866,0.180673,0.864866,1.631822e-08
24,Jez Butterworth,writer,"Action,Sci-Fi",2014-06-06,Edge of Tomorrow,178000000.0,100206256.0,370541256.0,270335000.0,-77793744.0,192541256.0,1.081692,-0.437044,1.081692,6.076924e-09


In [69]:
top_writers_films = writer_df.sort_values(by=['worldwide_gross'], ascending=False).head(20)
top_writers_films[['primary_name', 'movie', 'worldwide_gross', 'release_date']].head()

Unnamed: 0,primary_name,movie,worldwide_gross,release_date
16889,Yûsuke Yamada,Avatar,2776345000.0,2009-12-18
24779,Teruo Noguchi,Avatar,2776345000.0,2009-12-18
15650,Christopher Markus,Avengers: Infinity War,2048134000.0,2018-04-27
10425,Stan Lee,Avengers: Infinity War,2048134000.0,2018-04-27
4109,Jack Kirby,Avengers: Infinity War,2048134000.0,2018-04-27


In [70]:
top20_grossing_writers = writer_df.groupby(['primary_name']).worldwide_gross.sum().sort_values(ascending=False).head(20)
top20_grossing_writers

primary_name
Stan Lee                             1.270937e+10
Jack Kirby                           1.080214e+10
Christopher Markus                   5.417240e+09
Stephen McFeely                      5.417240e+09
Linda Woolverton                     4.914802e+09
Jeanne-Marie Leprince de Beaumont    3.735262e+09
Amanda Silver                        3.713271e+09
Rick Jaffa                           3.713271e+09
Cinco Paul                           3.628817e+09
Ken Daurio                           3.628817e+09
Joe Simon                            3.628055e+09
Derek Connolly                       3.581765e+09
Chris Morgan                         3.550379e+09
Gary Scott Thompson                  3.383733e+09
Larry Lieber                         3.202387e+09
Michael Arndt                        3.113020e+09
Philippa Boyens                      3.008235e+09
Fran Walsh                           3.008235e+09
Guillermo del Toro                   2.962074e+09
Suzanne Collins                      