In [1]:
#loading libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import cpi


# setting plt to inline
%matplotlib inline

# updating cpi if flagged out of date
# cpi.update()

In [2]:
# importing sqlite3
import sqlite3

# connecting to db and creating cursor
# sqlite3.connect() is the documented factory for opening a connection
conn = sqlite3.connect('data/movies.db')
cur = conn.cursor()

# creating query functions
def fetcha(q, params=()):
    """Execute `q` on the notebook-level cursor `cur` and return all rows.

    Parameters
    ----------
    q : str
        SQL statement to run.
    params : sequence or dict, optional
        Values bound to placeholders in `q`. The default empty tuple runs
        `q` exactly as written, so existing one-argument calls are unchanged.

    Returns
    -------
    list of tuple
        Every row produced by the statement (empty list when there are none).
    """
    return cur.execute(q, params).fetchall()
def fetcho(q, params=()):
    """Execute `q` on the notebook-level cursor `cur` and return the first row.

    Parameters
    ----------
    q : str
        SQL statement to run.
    params : sequence or dict, optional
        Values bound to placeholders in `q`. The default empty tuple runs
        `q` exactly as written, so existing one-argument calls are unchanged.

    Returns
    -------
    tuple or None
        The first result row, or None when the statement yields no rows.
    """
    return cur.execute(q, params).fetchone()

In [3]:
# getting table names: every table and view registered in sqlite_master,
# skipping objects whose name starts with 'sqlite_', sorted by name
q = """SELECT name FROM sqlite_master 
WHERE type IN ('table','view') 
AND name NOT LIKE 'sqlite_%'
ORDER BY 1"""
fetcha(q)

[('bom_movie_gross',),
 ('imdb_name_basic',),
 ('imdb_name_basics',),
 ('imdb_title_akas',),
 ('imdb_title_basics',),
 ('imdb_title_crew',),
 ('imdb_title_principals',),
 ('imdb_title_ratings',),
 ('rotten_tomatoes_critic_reviews',),
 ('rotten_tomatoes_movies',),
 ('tmdb_movies',),
 ('tn_movie_budgets',)]

In [4]:
# getting column names and declared types for imdb_title_principals
q = "PRAGMA table_info('imdb_title_principals')"
fetcha(q)

[(0, 'idx', 'INTEGER', 0, None, 1),
 (1, 'tconst', 'TEXT', 0, None, 0),
 (2, 'ordering', 'INTEGER', 0, None, 0),
 (3, 'nconst', 'TEXT', 0, None, 0),
 (4, 'category', 'TEXT', 0, None, 0),
 (5, 'job', 'TEXT', 0, None, 0),
 (6, 'characters', 'TEXT', 0, None, 0)]

In [5]:
# getting column names and declared types for imdb_name_basics
q = "PRAGMA table_info('imdb_name_basics')"
fetcha(q)

[(0, 'idx', 'INTEGER', 0, None, 0),
 (1, 'nconst', 'TEXT', 0, None, 0),
 (2, 'primary_name', 'TEXT', 0, None, 0),
 (3, 'birth_year', 'REAL', 0, None, 0),
 (4, 'death_year', 'REAL', 0, None, 0),
 (5, 'primary_profession', 'TEXT', 0, None, 0),
 (6, 'known_for_titles', 'TEXT', 0, None, 0)]

In [6]:
# getting column names and declared types for imdb_title_basics
q = "PRAGMA table_info('imdb_title_basics')"
fetcha(q)

[(0, 'idx', 'INTEGER', 0, None, 1),
 (1, 'tconst', 'TEXT', 0, None, 0),
 (2, 'primary_title', 'TEXT', 0, None, 0),
 (3, 'original_title', 'TEXT', 0, None, 0),
 (4, 'start_year', 'INTEGER', 0, None, 0),
 (5, 'runtime_minutes', 'REAL', 0, None, 0),
 (6, 'genres', 'TEXT', 0, None, 0)]

In [7]:
# getting column names and declared types for tn_movie_budgets
# (the budget and gross columns are declared TEXT, so they need cleaning later)
q = "PRAGMA table_info('tn_movie_budgets')"
fetcha(q)

[(0, 'idx', 'INTEGER', 0, None, 1),
 (1, 'id', 'INTEGER', 0, None, 0),
 (2, 'release_date', 'TEXT', 0, None, 0),
 (3, 'movie', 'TEXT', 0, None, 0),
 (4, 'production_budget', 'TEXT', 0, None, 0),
 (5, 'domestic_gross', 'TEXT', 0, None, 0),
 (6, 'worldwide_gross', 'TEXT', 0, None, 0)]

In [8]:
# creating initial dataframe by joining the four tables and printing first row
# NOTE(review): the final join matches on title text only, so different films
# that share a title are paired with the same budget row (later checks show
# one tconst listed under two release years) -- confirm whether the release
# year should also be part of the join condition
q = """SELECT*FROM imdb_name_basics
       JOIN imdb_title_principals 
       USING(nconst)
       JOIN imdb_title_basics AS ib
       USING(tconst)
       JOIN tn_movie_budgets AS tn
       ON ib.primary_title = tn.movie
       
       """
rows = fetcha(q)
# naming the columns at construction keeps this cell working for an empty
# result, where assigning df.columns afterwards raises a length mismatch
df = pd.DataFrame(rows, columns=[col[0] for col in cur.description])
df.head(1)     

Unnamed: 0,idx,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles,idx.1,tconst,ordering,...,start_year,runtime_minutes,genres,idx.2,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553",466550,tt2398241,9,...,2017,90.0,"Adventure,Animation,Comedy",904,5,"Apr 7, 2017",Smurfs: The Lost Village,"$60,000,000","$45,020,282","$197,578,586"


In [9]:
# listing the joined column names ('idx' is repeated once per source table)
df.columns

Index(['idx', 'nconst', 'primary_name', 'birth_year', 'death_year',
       'primary_profession', 'known_for_titles', 'idx', 'tconst', 'ordering',
       'category', 'job', 'characters', 'idx', 'primary_title',
       'original_title', 'start_year', 'runtime_minutes', 'genres', 'idx',
       'id', 'release_date', 'movie', 'production_budget', 'domestic_gross',
       'worldwide_gross'],
      dtype='object')

In [10]:
# checking dtypes and non-null counts of the joined frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32406 entries, 0 to 32405
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   idx                 32406 non-null  int64  
 1   nconst              32406 non-null  object 
 2   primary_name        32406 non-null  object 
 3   birth_year          14742 non-null  float64
 4   death_year          591 non-null    float64
 5   primary_profession  31872 non-null  object 
 6   known_for_titles    32012 non-null  object 
 7   idx                 32406 non-null  int64  
 8   tconst              32406 non-null  object 
 9   ordering            32406 non-null  int64  
 10  category            32406 non-null  object 
 11  job                 9556 non-null   object 
 12  characters          13012 non-null  object 
 13  idx                 32406 non-null  int64  
 14  primary_title       32406 non-null  object 
 15  original_title      32406 non-null  object 
 16  star

In [11]:
# dropping extraneous columns (all repeated 'idx' columns go together) and
# giving the budget-table fields shorter names
extraneous_cols = [
    'idx', 'birth_year', 'death_year', 'primary_profession',
    'primary_title', 'original_title', 'id',
    'known_for_titles', 'ordering',
    'job', 'characters', 'start_year', 'runtime_minutes',
]
short_names = {'movie': 'title', 'release_date': 'date'}
df = df.drop(columns=extraneous_cols).rename(columns=short_names)
df.head(1)

Unnamed: 0,nconst,primary_name,tconst,category,genres,date,title,production_budget,domestic_gross,worldwide_gross
0,nm0061671,Mary Ellen Bauder,tt2398241,producer,"Adventure,Animation,Comedy","Apr 7, 2017",Smurfs: The Lost Village,"$60,000,000","$45,020,282","$197,578,586"


In [12]:
# checking the number of NaNs in each column
df.isna().sum()

nconst                 0
primary_name           0
tconst                 0
category               0
genres               260
date                   0
title                  0
production_budget      0
domestic_gross         0
worldwide_gross        0
dtype: int64

In [13]:
#cleaning numerical strings and converting to floats
# one pass over the three money columns instead of three copied statements:
# strip the "$", remove the thousands separators, then cast
money_cols = ['worldwide_gross', 'domestic_gross', 'production_budget']
for col in money_cols:
    df[col] = (df[col].str.strip('$')
                      .str.replace(',', '', regex=False)
                      .astype('float'))

In [14]:
# change date string to datetime object, then split out release year and month
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].map(lambda stamp: stamp.year)
df['month'] = df['date'].map(lambda stamp: stamp.month)

In [15]:
# international gross is whatever part of the worldwide gross is not domestic
df['international_gross'] = df['worldwide_gross'] - df['domestic_gross']
df.head(1)

Unnamed: 0,nconst,primary_name,tconst,category,genres,date,title,production_budget,domestic_gross,worldwide_gross,year,month,international_gross
0,nm0061671,Mary Ellen Bauder,tt2398241,producer,"Adventure,Animation,Comedy",2017-04-07,Smurfs: The Lost Village,60000000.0,45020282.0,197578586.0,2017,4,152558304.0


In [16]:
# previewing the first five rows after cleaning
df.head()

Unnamed: 0,nconst,primary_name,tconst,category,genres,date,title,production_budget,domestic_gross,worldwide_gross,year,month,international_gross
0,nm0061671,Mary Ellen Bauder,tt2398241,producer,"Adventure,Animation,Comedy",2017-04-07,Smurfs: The Lost Village,60000000.0,45020282.0,197578586.0,2017,4,152558304.0
1,nm0063618,Jeff Beal,tt1147681,composer,"Drama,Mystery",2012-12-31,After,650000.0,0.0,0.0,2012,12,0.0
2,nm0070822,Terry Benedict,tt2119532,producer,"Biography,Drama,History",2016-11-04,Hacksaw Ridge,40000000.0,67209615.0,168904682.0,2016,11,101695067.0
3,nm0083851,Sam Bisbee,tt1990314,producer,"Comedy,Crime,Drama",2012-08-17,Robot & Frank,2500000.0,3317468.0,4934356.0,2012,8,1616888.0
4,nm0083851,Sam Bisbee,tt3813310,producer,"Crime,Thriller",2015-08-07,Cop Car,5000000.0,134552.0,143658.0,2015,8,9106.0


In [17]:
# defining a function to adjust for inflation; the year 2020 is special-cased
# because it throws errors when applying cpi.inflate

def adjust_for_inf(row, name):
    """Return the inflation-adjusted value of `row[name]`.

    Parameters
    ----------
    row : row-like object exposing `.year` and item access by column name
    name : str
        Column whose value should be adjusted.

    Returns
    -------
    `row[name]` unchanged when `row.year` is 2020, otherwise the result of
    `cpi.inflate(row[name], row.year)`.
    """
    # 2020 rows are passed through untouched (see the note above)
    if row.year == 2020:
        return row[name]
    return cpi.inflate(row[name], row.year)

In [18]:
#adding inflation-adjusted budget and gross columns
# maps each new adjusted column to the raw column it is computed from
adjusted_from = {
    'adj_prod_budg': 'production_budget',
    'adj_dom_gross': 'domestic_gross',
    'adj_ww_gross': 'worldwide_gross',
    'adj_i_gross': 'international_gross',
}
for adj_col, raw_col in adjusted_from.items():
    df[adj_col] = df.apply(adjust_for_inf, axis=1, args=(raw_col,))

df.head(1)

Unnamed: 0,nconst,primary_name,tconst,category,genres,date,title,production_budget,domestic_gross,worldwide_gross,year,month,international_gross,adj_prod_budg,adj_dom_gross,adj_ww_gross,adj_i_gross
0,nm0061671,Mary Ellen Bauder,tt2398241,producer,"Adventure,Animation,Comedy",2017-04-07,Smurfs: The Lost Village,60000000.0,45020282.0,197578586.0,2017,4,152558304.0,62579230.0,46955570.0,206071900.0,159116300.0


In [19]:
#adding net profit columns: each adjusted gross minus the adjusted budget
adj_budget = df['adj_prod_budg']
df['adj_dom_net'] = df['adj_dom_gross'] - adj_budget
df['adj_ww_net'] = df['adj_ww_gross'] - adj_budget
df['adj_i_net'] = df['adj_i_gross'] - adj_budget
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32406 entries, 0 to 32405
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   nconst               32406 non-null  object        
 1   primary_name         32406 non-null  object        
 2   tconst               32406 non-null  object        
 3   category             32406 non-null  object        
 4   genres               32146 non-null  object        
 5   date                 32406 non-null  datetime64[ns]
 6   title                32406 non-null  object        
 7   production_budget    32406 non-null  float64       
 8   domestic_gross       32406 non-null  float64       
 9   worldwide_gross      32406 non-null  float64       
 10  year                 32406 non-null  int64         
 11  month                32406 non-null  int64         
 12  international_gross  32406 non-null  float64       
 13  adj_prod_budg        32406 non-

In [20]:
#adding return on investment columns (net divided by adjusted budget) and checking results
adj_budget = df['adj_prod_budg']
df['adj_dom_roi'] = df['adj_dom_net'] / adj_budget
df['adj_ww_roi'] = df['adj_ww_net'] / adj_budget
df['adj_i_roi'] = df['adj_i_net'] / adj_budget
df.head(1)

Unnamed: 0,nconst,primary_name,tconst,category,genres,date,title,production_budget,domestic_gross,worldwide_gross,...,adj_prod_budg,adj_dom_gross,adj_ww_gross,adj_i_gross,adj_dom_net,adj_ww_net,adj_i_net,adj_dom_roi,adj_ww_roi,adj_i_roi
0,nm0061671,Mary Ellen Bauder,tt2398241,producer,"Adventure,Animation,Comedy",2017-04-07,Smurfs: The Lost Village,60000000.0,45020282.0,197578586.0,...,62579230.0,46955570.0,206071900.0,159116300.0,-15623650.0,143492700.0,96537120.0,-0.249662,2.292976,1.542638


In [21]:
# one row per title (first occurrence kept), ranked by adjusted worldwide gross
title_gross = df[['title', 'adj_ww_gross', 'date']]
unique_titles = title_gross.drop_duplicates(subset='title')
top20_grossing_films = (
    unique_titles
    .sort_values('adj_ww_gross', ascending=False)
    .iloc[:20]
)
top20_grossing_films

Unnamed: 0,title,adj_ww_gross,date
6985,Bambi,4203440000.0,1942-08-13
4866,Titanic,3517408000.0,1997-12-19
16889,Avatar,3308483000.0,2009-12-18
2320,Snow White and the Seven Dwarfs,3283159000.0,1937-12-21
485,Avengers: Infinity War,2085246000.0,2018-04-27
258,Jurassic World,1778528000.0,2015-06-12
575,The Lion King,1701301000.0,1994-06-15
1020,The Avengers,1690249000.0,2012-05-04
447,Furious 7,1638161000.0,2015-04-03
12913,Fantasia,1521524000.0,1940-11-13


In [22]:
# one row per title (first value of each column per group), ranked by adjusted worldwide net
net_cols = ['title', 'adj_prod_budg', 'adj_ww_gross', 'adj_ww_net', 'date']
title_net = df[net_cols]
unique_titles = title_net.groupby('title').first().reset_index()
top20_net_films = (
    unique_titles
    .sort_values('adj_ww_net', ascending=False)
    .iloc[:20]
)
top20_net_films

Unnamed: 0,title,adj_prod_budg,adj_ww_gross,adj_ww_net,date
195,Bambi,13457280.0,4203440000.0,4189983000.0,1942-08-13
1518,Snow White and the Seven Dwarfs,26417890.0,3283159000.0,3256741000.0,1937-12-21
2109,Titanic,318575700.0,3517408000.0,3198832000.0,1997-12-19
176,Avatar,506459100.0,3308483000.0,2802024000.0,2009-12-18
178,Avengers: Infinity War,305435900.0,2085246000.0,1779810000.0,2018-04-27
1889,The Lion King,136798900.0,1701301000.0,1564502000.0,1994-06-15
932,Jurassic World,231908500.0,1778528000.0,1546619000.0,2015-06-12
600,Fantasia,41635570.0,1521524000.0,1479889000.0,1940-11-13
1651,The Avengers,250541500.0,1690249000.0,1439707000.0,2012-05-04
674,Furious 7,204942400.0,1638161000.0,1433219000.0,2015-04-03


In [23]:
# one row per title (first value of each column per group), ranked by adjusted worldwide ROI
roi_cols = ['title', 'adj_prod_budg', 'adj_ww_net', 'adj_ww_roi', 'date']
title_roi = df[roi_cols]
unique_titles = title_roi.groupby('title').first().reset_index()
top20_roi_films = (
    unique_titles
    .sort_values('adj_ww_roi', ascending=False)
    .iloc[:20]
)
top20_roi_films

Unnamed: 0,title,adj_prod_budg,adj_ww_net,adj_ww_roi,date
1781,The Gallows,107864.4,44824650.0,415.56474,2015-07-10
195,Bambi,13457280.0,4189983000.0,311.354312,1942-08-13
1398,Rocky,4493093.0,1006453000.0,224.0,1976-11-21
1518,Snow White and the Seven Dwarfs,26417890.0,3256741000.0,123.27788,1937-12-21
1732,The Devil Inside,1113518.0,112197500.0,100.75949,2012-01-06
724,Graduation Day,703127.1,66498950.0,94.576,1981-05-01
875,Insidious,1704842.0,111804600.0,65.580591,2011-04-01
2167,Unfriended,1078644.0,68347420.0,63.364198,2015-04-17
231,Benji,2592870.0,161066800.0,62.11912,1974-11-15
1247,Paranormal Activity 2,3517312.0,204604400.0,58.170677,2010-10-20


##  Top Creative Talent

In [32]:
# looking up rows for title id tt0120338 (the recorded output has no matching rows)
df[df['tconst'] == 'tt0120338']

Unnamed: 0,nconst,primary_name,tconst,category,genres,date,title,production_budget,domestic_gross,worldwide_gross,...,adj_prod_budg,adj_dom_gross,adj_ww_gross,adj_i_gross,adj_dom_net,adj_ww_net,adj_i_net,adj_dom_roi,adj_ww_roi,adj_i_roi


In [33]:
# listing the distinct credit categories present in the data
df.category.unique()

array(['producer', 'composer', 'actor', 'cinematographer', 'director',
       'writer', 'actress', 'editor', 'production_designer', 'self',
       'archive_footage', 'archive_sound'], dtype=object)

## Top 20 Actors By Adjusted Gross

In [39]:
# removing every row for these four titles with a single membership filter
# NOTE(review): presumably titles shared by several different films -- confirm
excluded_titles = [
    'Titanic',
    'Cinderella',
    'Snow White and the Seven Dwarfs',
    'Beauty and the Beast',
]
df = df[~df['title'].isin(excluded_titles)]

In [40]:
# keeping only the rows whose credit category contains 'actor'
is_actor = df['category'].str.contains('actor')
actor_df = df[is_actor]

In [64]:
# cumulative adjusted worldwide gross per actor, 100 largest totals
top100_grossing_actors = (
    actor_df
    .groupby('primary_name')['adj_ww_gross']
    .sum()
    .sort_values(ascending=False)
    .head(100)
)
top100_actors = top100_grossing_actors.to_frame()
top100_actors

Unnamed: 0_level_0,adj_ww_gross
primary_name,Unnamed: 1_level_1
Robert Downey Jr.,1.049882e+10
Dwayne Johnson,8.735245e+09
Chris Evans,7.979593e+09
Chris Hemsworth,6.594242e+09
Mark Ruffalo,5.775902e+09
...,...
Morgan Freeman,1.794945e+09
Irrfan Khan,1.782363e+09
Marwan Kenzari,1.763729e+09
Neil Patrick Harris,1.763620e+09


In [None]:
# names = list(top50_actors.index)
# for name in names:
#     print(title_checker(actor_df, name))

In [62]:
def title_checker(DF, name):
    """Return the identifying columns for every row of `DF` credited to `name`.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame containing at least the five columns selected below.
    name : str
        Value to match exactly against `primary_name`.

    Returns
    -------
    pandas.DataFrame
        The matching rows (original row labels kept) restricted to
        primary_name, nconst, title, tconst and year.
    """
    wanted_cols = ['primary_name', 'nconst', 'title', 'tconst', 'year']
    credited = DF['primary_name'] == name
    return pd.DataFrame(DF.loc[credited, wanted_cols])

In [45]:
# top50_actors is never defined in this notebook, so a fresh run would stop
# here with a NameError; take the first 50 names of top100_actors instead
# (it is already sorted by cumulative gross)
names = list(top100_actors.index[:50])
# collecting each actor's credited titles, then stacking them into one frame
frames = [title_checker(actor_df, name) for name in names]
result = pd.concat(frames)
result

Unnamed: 0,primary_name,nconst,title,tconst,year
10572,Robert Downey Jr.,nm0000375,The Avengers,tt0848228,2012
10573,Robert Downey Jr.,nm0000375,The Avengers,tt0848228,1998
10574,Robert Downey Jr.,nm0000375,Iron Man 2,tt1228705,2010
10575,Robert Downey Jr.,nm0000375,Due Date,tt1231583,2010
10576,Robert Downey Jr.,nm0000375,Iron Man 3,tt1300854,2013
...,...,...,...,...,...
14111,Ben Affleck,nm0000255,The Accountant,tt2140479,2016
14112,Ben Affleck,nm0000255,Gone Girl,tt2267998,2014
14113,Ben Affleck,nm0000255,Live by Night,tt2361317,2016
14114,Ben Affleck,nm0000255,Runner Runner,tt2364841,2013


In [59]:
# flagging rows whose title already appeared earlier in `result`
# NOTE(review): the check is on title alone, so a second actor credited on an
# already-seen film is flagged too -- confirm whether primary_name should be
# part of the subset
dupes = result.duplicated(subset='title')
# the mask is named `dupes`; `bool_series` was never defined
result[dupes]

Unnamed: 0,primary_name,nconst,title,tconst,year
10573,Robert Downey Jr.,nm0000375,The Avengers,tt0848228,1998
11641,Dwayne Johnson,nm0425005,Snitch,tt0882977,2012
11643,Dwayne Johnson,nm0425005,Hercules,tt1267297,2014
9519,Chris Evans,nm0262635,The Avengers,tt0848228,2012
9520,Chris Evans,nm0262635,The Avengers,tt0848228,1998
...,...,...,...,...,...
3334,Adam Sandler,nm0001191,Hotel Transylvania 2,tt2510894,2015
3336,Adam Sandler,nm0001191,Hotel Transylvania 3: Summer Vacation,tt5220122,2018
14108,Ben Affleck,nm0000255,The Town,tt0840361,2010
14110,Ben Affleck,nm0000255,Argo,tt1024648,2012


In [65]:
# printing the credited titles of every top-100 actor for manual review
names = top100_actors.index.tolist()
for name in names:
    print(title_checker(actor_df, name))

            primary_name     nconst                               title  \
10572  Robert Downey Jr.  nm0000375                        The Avengers   
10573  Robert Downey Jr.  nm0000375                        The Avengers   
10574  Robert Downey Jr.  nm0000375                          Iron Man 2   
10575  Robert Downey Jr.  nm0000375                            Due Date   
10576  Robert Downey Jr.  nm0000375                          Iron Man 3   
10577  Robert Downey Jr.  nm0000375  Sherlock Holmes: A Game of Shadows   
10578  Robert Downey Jr.  nm0000375                           The Judge   
10579  Robert Downey Jr.  nm0000375              Spider-Man: Homecoming   
10580  Robert Downey Jr.  nm0000375             Avengers: Age of Ultron   
10581  Robert Downey Jr.  nm0000375          Captain America: Civil War   
10582  Robert Downey Jr.  nm0000375              Avengers: Infinity War   

          tconst  year  
10572  tt0848228  2012  
10573  tt0848228  1998  
10574  tt1228705  2010  

      primary_name     nconst                               title     tconst  \
14108  Ben Affleck  nm0000255                            The Town  tt0840361   
14109  Ben Affleck  nm0000255                      Justice League  tt0974015   
14110  Ben Affleck  nm0000255                                Argo  tt1024648   
14111  Ben Affleck  nm0000255                      The Accountant  tt2140479   
14112  Ben Affleck  nm0000255                           Gone Girl  tt2267998   
14113  Ben Affleck  nm0000255                       Live by Night  tt2361317   
14114  Ben Affleck  nm0000255                       Runner Runner  tt2364841   
14115  Ben Affleck  nm0000255  Batman v Superman: Dawn of Justice  tt2975590   

       year  
14108  2010  
14109  2017  
14110  2012  
14111  2016  
14112  2014  
14113  2016  
14114  2013  
14115  2016  
        primary_name     nconst                     title     tconst  year
14070  Ralph Fiennes  nm0000146             Hail, Caesar!  tt0475290  2016
140

In [66]:
# moving primary_name out of the index into a regular column
reset_idx = top100_actors.reset_index()

In [67]:
# plain list of the top-100 actor names, in ranking order
name_list = reset_idx['primary_name'].tolist()
name_list

['Robert Downey Jr.',
 'Dwayne Johnson',
 'Chris Evans',
 'Chris Hemsworth',
 'Mark Ruffalo',
 'Johnny Depp',
 'Chris Pratt',
 'Vin Diesel',
 'Jason Statham',
 'Samuel L. Jackson',
 'Mark Wahlberg',
 'Liam Hemsworth',
 'Tom Hardy',
 'Ian McKellen',
 'Josh Hutcherson',
 'Steve Carell',
 'Will Smith',
 'Ryan Reynolds',
 'Michael Keaton',
 'Woody Harrelson',
 'Leonardo DiCaprio',
 'Kevin Hart',
 'Martin Freeman',
 'Liam Neeson',
 'Hugh Jackman',
 'Andy Serkis',
 'Richard Armitage',
 'Bradley Cooper',
 'Jesse Eisenberg',
 'Idris Elba',
 'Channing Tatum',
 'Matt Damon',
 'Chiwetel Ejiofor',
 'Gary Oldman',
 'Daniel Craig',
 'Bryan Cranston',
 'Chris Pine',
 'James McAvoy',
 'Seth Rogen',
 'Owen Wilson',
 'Tom Hiddleston',
 'Jeremy Renner',
 'Kevin James',
 'Jack Black',
 'Christoph Waltz',
 'Tom Hanks',
 'Ken Watanabe',
 'Tom Cruise',
 'Adam Sandler',
 'Ben Affleck',
 'Ralph Fiennes',
 'Ben Kingsley',
 'Jamie Foxx',
 'Michael B. Jordan',
 'Matthew McConaughey',
 'Paul Walker',
 'Javier Bard

In [None]:
# row labels of actor_df to remove in the next cell
# presumably the duplicate / mismatched title rows found in the checks above -- TODO confirm
# NOTE(review): 11642 is listed twice -- confirm whether another label (e.g. 11643) was intended
drop_list = [10573, 11641, 11642, 11642, 9520 ,6042, 9006, 16423, 2345, 6219, 6209, 9045, 9754, 2674]

In [None]:
# first clean-up pass: removing the listed row labels from actor_df
# (DataFrame.drop raises KeyError if any label is not present)
cleaned_actors = actor_df.drop(drop_list)

In [None]:
# top 20 actors by cumulative adjusted worldwide gross after the first clean-up pass
top20_2_grossing_actors = (
    cleaned_actors
    .groupby('primary_name')['adj_ww_gross']
    .sum()
    .sort_values(ascending=False)
    .head(20)
)
top20_2_actors = top20_2_grossing_actors.to_frame()
top20_2_actors

In [None]:
# printing the remaining titles of each top-20 actor for manual review
names2 = top20_2_actors.index.tolist()
for name in names2:
    print(title_checker(cleaned_actors, name))

In [None]:
# row label to remove in the second clean-up pass
drop_list2 =[12848]

In [None]:
# second clean-up pass: removing the listed row label from cleaned_actors
cleaned_actors2 = cleaned_actors.drop(drop_list2)

In [None]:
# top 20 actors by cumulative adjusted worldwide gross after the second clean-up pass
top20_3_grossing_actors = (
    cleaned_actors2
    .groupby('primary_name')['adj_ww_gross']
    .sum()
    .sort_values(ascending=False)
    .head(20)
)
top20_3_actors = top20_3_grossing_actors.to_frame()
top20_3_actors

In [None]:
# printing the remaining titles of each top-20 actor for manual review
names3 = top20_3_actors.index.tolist()
for name in names3:
    print(title_checker(cleaned_actors2, name))

In [None]:
# row label to remove in the third clean-up pass
drop_list3 = [10549]

In [None]:
# third clean-up pass: removing the listed row label from cleaned_actors2
cleaned_actors3 = cleaned_actors2.drop(drop_list3)

In [None]:
# top 20 actors by cumulative adjusted worldwide gross after the third clean-up pass
top20_4_grossing_actors = (
    cleaned_actors3
    .groupby('primary_name')['adj_ww_gross']
    .sum()
    .sort_values(ascending=False)
    .head(20)
)
top20_4_actors = top20_4_grossing_actors.to_frame()
top20_4_actors

In [None]:
# printing the remaining titles of each top-20 actor for a final manual review
names4 = top20_4_actors.index.tolist()
for name in names4:
    print(title_checker(cleaned_actors3, name))

In [None]:
# counting titles per actor on the same cleaned rows that produced the
# cumulative gross (cleaned_actors3), so the mean-per-title denominator does
# not include the rows that were dropped during clean-up
movie_count_m = cleaned_actors3.groupby(['primary_name'])['title'].count()
count_m = pd.DataFrame(movie_count_m)
count_m

In [None]:
# attaching each top-20 actor's title count to their cumulative gross (joined on the primary_name index)
joined_m = top20_4_actors.join(count_m)
joined_m.head()

In [None]:
# mean gross per title, then presentation-friendly column names for plotting
joined_m['mean_gross'] = joined_m['adj_ww_gross'] / joined_m['title']
actor_labels = {
    'primary_name': 'Actor',
    'title': 'Number of Titles',
    'adj_ww_gross': 'Cumulative Gross',
    'mean_gross': 'Mean Gross per Title',
}
joined_m = joined_m.reset_index().rename(columns=actor_labels)

In [None]:
# two orderings of the same actors: by cumulative gross and by mean gross per title
# NOTE(review): joined_m only holds the actors already selected by cumulative
# gross, so the mean ranking re-orders that group rather than ranking all actors
top20_cuml_gross_actors = joined_m.sort_values('Cumulative Gross', ascending=False).iloc[:20]
top20_mean_gross_actors = joined_m.sort_values('Mean Gross per Title', ascending=False).iloc[:20]
top20_cuml_gross_actors

In [None]:
# listing the actor names in cumulative-gross order
list(top20_cuml_gross_actors.Actor)

In [None]:
# the y-axis is labelled in billions, so convert the dollar totals to billions
# before plotting instead of relying on matplotlib's exponent offset
cuml_gross_bn = top20_cuml_gross_actors.assign(
    **{'Cumulative Gross': top20_cuml_gross_actors['Cumulative Gross'] / 1e9})
fig, ax = plt.subplots(figsize=(10,5))
# drawing on the explicit axes created above
sns.barplot(x='Actor', y='Cumulative Gross', data=cuml_gross_bn, ax=ax)
ax.tick_params(labelsize=15)
plt.xticks(rotation=45, ha='right')
ax.set_xlabel( "Actor", fontsize=20 )
ax.set_ylabel( "Cumulative Gross in Billions" , fontsize=20 )
ax.set_title("Actors With Top Cumulative Gross", fontsize=20)
plt.show()

In [None]:
# the y-axis is labelled in billions, so convert the dollar means to billions
# before plotting instead of relying on matplotlib's exponent offset
mean_gross_bn = top20_mean_gross_actors.assign(
    **{'Mean Gross per Title': top20_mean_gross_actors['Mean Gross per Title'] / 1e9})
fig, ax = plt.subplots(figsize=(10,5))
# drawing on the explicit axes created above
sns.barplot(x='Actor', y='Mean Gross per Title', data=mean_gross_bn, ax=ax)
ax.tick_params(labelsize=15)
plt.xticks(rotation=45, ha='right')
ax.set_xlabel( "Actor", fontsize=20 )
ax.set_ylabel( "Mean Gross per Title in Billions" , fontsize=20 )
ax.set_title("Actors With Top Average Gross per Movie", fontsize=20)
plt.show()

## Top 20 Actresses By Adjusted Gross

In [None]:
# keeping only the rows whose credit category contains 'actress'
is_actress = df['category'].str.contains('actress')
actress_df = df[is_actress]

In [None]:
# top 20 actresses by cumulative adjusted worldwide gross, before any clean-up
top20_grossing_actresses = (
    actress_df
    .groupby('primary_name')['adj_ww_gross']
    .sum()
    .sort_values(ascending=False)
    .head(20)
)
top20_actresses = top20_grossing_actresses.to_frame()
top20_actresses

In [None]:
# names of the top-20 actresses, in ranking order
names = list(top20_actresses.index)

In [None]:
def title_checker(DF, name):
    """Return the identifying columns for every row of `DF` credited to `name`.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame containing at least the five columns selected below.
    name : str
        Value to match exactly against `primary_name`.

    Returns
    -------
    pandas.DataFrame
        The matching rows (original row labels kept) restricted to
        primary_name, nconst, title, tconst and year.
    """
    wanted_cols = ['primary_name', 'nconst', 'title', 'tconst', 'year']
    credited = DF['primary_name'] == name
    return pd.DataFrame(DF.loc[credited, wanted_cols])

In [None]:
# printing the credited titles of each top-20 actress for manual review
names = top20_actresses.index.tolist()
for name in names:
    print(title_checker(actress_df, name))

In [None]:
# row labels of actress_df to remove in the next cell
# presumably the duplicate / mismatched title rows spotted in the printout above -- TODO confirm
# (this re-binds drop_list, replacing the actor list of the same name)
drop_list = [7461, 7466, 2196, 4893, 4896, 2423, 2431,\
             15716, 15653, 16810, 23679, 11183, 28285, 17618,\
             23903, 19032, 21676, 20100, 28394, 30195]

In [None]:
# first clean-up pass: removing the listed row labels from actress_df
# (DataFrame.drop raises KeyError if any label is not present)
cleaned_actresses = actress_df.drop(drop_list)

In [None]:
# top 20 actresses by cumulative adjusted worldwide gross after the first clean-up pass
top20_2_grossing_actresses = (
    cleaned_actresses
    .groupby('primary_name')['adj_ww_gross']
    .sum()
    .sort_values(ascending=False)
    .head(20)
)
top20_2_actresses = top20_2_grossing_actresses.to_frame()
top20_2_actresses

In [None]:
# printing the remaining titles of each top-20 actress for manual review
names2 = top20_2_actresses.index.tolist()
for name in names2:
    print(title_checker(cleaned_actresses, name))

In [None]:
# row labels to remove in the second actress clean-up pass
drop_list2 = [13082, 30138, 29111, 6719, 27509, 17879, 19965, 22193, 23039, 19181, 25135, 31683, 10173]

In [None]:
# second clean-up pass: removing the listed row labels from cleaned_actresses
cleaned_actresses2 = cleaned_actresses.drop(drop_list2)

In [None]:
# top 20 actresses by cumulative adjusted worldwide gross after the second clean-up pass
top20_3_grossing_actresses = (
    cleaned_actresses2
    .groupby('primary_name')['adj_ww_gross']
    .sum()
    .sort_values(ascending=False)
    .head(20)
)
top20_3_actresses = top20_3_grossing_actresses.to_frame()
top20_3_actresses

In [None]:
# printing the remaining titles of each top-20 actress for manual review
names3 = top20_3_actresses.index.tolist()
for name in names3:
    print(title_checker(cleaned_actresses2, name))

In [None]:
# row labels to remove in the third actress clean-up pass
drop_list3 = [2316, 2317, 5733, 5735, 920, 4003]

In [None]:
# third clean-up pass: removing the listed row labels from cleaned_actresses2
cleaned_actresses3 = cleaned_actresses2.drop(drop_list3)

In [None]:
# top 20 actresses by cumulative adjusted worldwide gross after the third clean-up pass
# (re-binds the top20_3_* names, replacing the values from the second pass)
top20_3_grossing_actresses = (
    cleaned_actresses3
    .groupby('primary_name')['adj_ww_gross']
    .sum()
    .sort_values(ascending=False)
    .head(20)
)
top20_3_actresses = top20_3_grossing_actresses.to_frame()
top20_3_actresses

In [None]:
# printing the remaining titles of each top-20 actress for a final manual review
names4 = top20_3_actresses.index.tolist()
for name in names4:
    print(title_checker(cleaned_actresses3, name))

In [None]:
# counting titles per actress on the same cleaned rows that produced the
# cumulative gross (cleaned_actresses3), so the mean-per-title denominator
# does not include the rows that were dropped during clean-up
movie_count_f = cleaned_actresses3.groupby(['primary_name'])['title'].count()
count_f = pd.DataFrame(movie_count_f)
count_f

In [None]:
# attaching each top-20 actress's title count to their cumulative gross (joined on the primary_name index)
joined_f = top20_3_actresses.join(count_f)
joined_f.head()

In [None]:
# mean gross per title, then presentation-friendly column names for plotting
joined_f['mean_gross'] = joined_f['adj_ww_gross'] / joined_f['title']
actress_labels = {
    'primary_name': 'Actress',
    'title': 'Number of Titles',
    'adj_ww_gross': 'Cumulative Gross',
    'mean_gross': 'Mean Gross per Title',
}
joined_f = joined_f.reset_index().rename(columns=actress_labels)

In [None]:
# two orderings of the same actresses: by cumulative gross and by mean gross per title
# NOTE(review): joined_f only holds the actresses already selected by cumulative
# gross, so the mean ranking re-orders that group rather than ranking all actresses
top20_cuml_gross_actresses = joined_f.sort_values('Cumulative Gross', ascending=False).iloc[:20]
top20_mean_gross_actresses = joined_f.sort_values('Mean Gross per Title', ascending=False).iloc[:20]
top20_cuml_gross_actresses

In [None]:
# the y-axis is labelled in billions, so convert the dollar totals to billions
# before plotting instead of relying on matplotlib's exponent offset
cuml_gross_bn = top20_cuml_gross_actresses.assign(
    **{'Cumulative Gross': top20_cuml_gross_actresses['Cumulative Gross'] / 1e9})
fig, ax = plt.subplots(figsize=(10,5))
# drawing on the explicit axes created above
sns.barplot(x='Actress', y='Cumulative Gross', data=cuml_gross_bn, ax=ax)
ax.tick_params(labelsize=15)
plt.xticks(rotation=45, ha='right')
ax.set_xlabel( "Actress", fontsize=20 )
ax.set_ylabel( "Cumulative Gross in Billions" , fontsize=20 )
ax.set_title("Actresses With Top Cumulative Gross", fontsize=20)
plt.show()

In [None]:
# the y-axis is labelled in billions, so convert the dollar means to billions
# before plotting instead of relying on matplotlib's exponent offset
mean_gross_bn = top20_mean_gross_actresses.assign(
    **{'Mean Gross per Title': top20_mean_gross_actresses['Mean Gross per Title'] / 1e9})
fig, ax = plt.subplots(figsize=(10,5))
# drawing on the explicit axes created above
sns.barplot(x='Actress', y='Mean Gross per Title', data=mean_gross_bn, ax=ax)
ax.tick_params(labelsize=15)
plt.xticks(rotation=45, ha='right')
ax.set_xlabel( "Actress", fontsize=20 )
ax.set_ylabel( "Mean Gross per Title in Billions" , fontsize=20 )
ax.set_title("Actresses With Top Average Gross per Movie", fontsize=20)
plt.show()

## Top Directors by Gross

In [None]:
# keeping only the rows whose credit category contains 'director'
is_director = df['category'].str.contains('director')
director_df = df[is_director]

In [None]:
# directors ranked by cumulative adjusted worldwide gross
# NOTE(review): this keeps 21 names although the variables say top 20 and
# later cells take 20 -- confirm the extra row is intentional
top20_grossing_directors = (
    director_df
    .groupby('primary_name')['adj_ww_gross']
    .sum()
    .sort_values(ascending=False)
    .head(21)
)
top20_directors = top20_grossing_directors.to_frame()
top20_directors

In [None]:
# number of title rows credited to each director
movie_count_d = director_df.groupby('primary_name')['title'].count()
count_d = movie_count_d.to_frame()
count_d

In [None]:
# attaching each selected director's title count to their cumulative gross (joined on the primary_name index)
joined_d = top20_directors.join(count_d)
joined_d.head()

In [None]:
# mean gross per title, then presentation-friendly column names for plotting
joined_d['mean_gross'] = joined_d['adj_ww_gross'] / joined_d['title']
director_labels = {
    'primary_name': 'Director',
    'title': 'Number of Titles',
    'adj_ww_gross': 'Cumulative Gross',
    'mean_gross': 'Mean Gross per Title',
}
joined_d = joined_d.reset_index().rename(columns=director_labels)

In [None]:
# two orderings of the selected directors: by cumulative gross and by mean gross per title
# NOTE(review): joined_d only holds directors already selected by cumulative
# gross, so the mean ranking re-orders that group rather than ranking all directors
top20_cuml_gross_directors = joined_d.sort_values('Cumulative Gross', ascending=False).iloc[:20]
top20_mean_gross_directors = joined_d.sort_values('Mean Gross per Title', ascending=False).iloc[:20]
top20_cuml_gross_directors

In [None]:
# the y-axis is labelled in billions, so convert the dollar totals to billions
# before plotting instead of relying on matplotlib's exponent offset
cuml_gross_bn = top20_cuml_gross_directors.assign(
    **{'Cumulative Gross': top20_cuml_gross_directors['Cumulative Gross'] / 1e9})
fig, ax = plt.subplots(figsize=(10,5))
# drawing on the explicit axes created above
sns.barplot(x='Director', y='Cumulative Gross', data=cuml_gross_bn, ax=ax)
ax.tick_params(labelsize=15)
plt.xticks(rotation=45, ha='right')
ax.set_xlabel( "Director", fontsize=20 )
ax.set_ylabel( "Cumulative Gross in Billions" , fontsize=20 )
ax.set_title("Directors With Top Cumulative Gross", fontsize=20)
plt.show()

In [None]:
# the y-axis is labelled in billions, so convert the dollar means to billions
# before plotting instead of relying on matplotlib's exponent offset
mean_gross_bn = top20_mean_gross_directors.assign(
    **{'Mean Gross per Title': top20_mean_gross_directors['Mean Gross per Title'] / 1e9})
fig, ax = plt.subplots(figsize=(10,5))
# drawing on the explicit axes created above
sns.barplot(x='Director', y='Mean Gross per Title', data=mean_gross_bn, ax=ax)
ax.tick_params(labelsize=15)
plt.xticks(rotation=45, ha='right')
ax.set_xlabel( "Director", fontsize=20 )
ax.set_ylabel( "Mean Gross per Title in Billions" , fontsize=20 )
ax.set_title("Directors With Top Average Gross per Movie", fontsize=20)
plt.show()

In [None]:
# inspecting every row credited to one specific name
df[df['primary_name'] == 'Sébastien Lifshitz']

In [None]:
# keeping only the rows whose credit category contains 'producer'
is_producer = df['category'].str.contains('producer')
producer_df = df[is_producer]
producer_df.head()

In [None]:
# 20 producer rows with the largest (unadjusted) worldwide gross
top_producers_films = producer_df.sort_values(by=['worldwide_gross'], ascending=False).head(20)
# 'movie' and 'release_date' were renamed to 'title' and 'date' when df was
# built, so selecting the old names raises KeyError
top_producers_films[['primary_name', 'title', 'worldwide_gross', 'date']].head()

In [None]:
# top 20 producers by cumulative worldwide gross
# NOTE(review): this uses the unadjusted worldwide_gross, unlike the actor and
# director rankings which use adj_ww_gross -- confirm that is intended
top20_grossing_producers = (
    producer_df
    .groupby('primary_name')['worldwide_gross']
    .sum()
    .sort_values(ascending=False)
    .head(20)
)
top20_grossing_producers

In [None]:
# keeping only the rows whose credit category contains 'writer'
is_writer = df['category'].str.contains('writer')
writer_df = df[is_writer]
writer_df.head()

In [None]:
# 20 writer rows with the largest (unadjusted) worldwide gross
top_writers_films = writer_df.sort_values(by=['worldwide_gross'], ascending=False).head(20)
# 'movie' and 'release_date' were renamed to 'title' and 'date' when df was
# built, so selecting the old names raises KeyError
top_writers_films[['primary_name', 'title', 'worldwide_gross', 'date']].head()

In [None]:
# top 20 writers by cumulative worldwide gross
# NOTE(review): this uses the unadjusted worldwide_gross, unlike the actor and
# director rankings which use adj_ww_gross -- confirm that is intended
top20_grossing_writers = (
    writer_df
    .groupby('primary_name')['worldwide_gross']
    .sum()
    .sort_values(ascending=False)
    .head(20)
)
top20_grossing_writers