### Import libraries and data

In [1]:
import pandas as pd

In [2]:
!ls data

bom.movie_gross.csv  title.akas.csv	   title.ratings.csv
name.basics.csv      title.basics.csv	   tmdb.movies.csv
rt.movie_info.tsv    title.crew.csv	   tn.movie_budgets.csv
rt.reviews.tsv	     title.principals.csv


In [3]:
# Gross receipts per film: title, studio, domestic gross, foreign gross, year
movie_gross_df = pd.read_csv('data/bom.movie_gross.csv')

# People: name ID (nconst), name of crew/principal, primary profession,
# known-for titles (tconst)
people_info_df = pd.read_csv('data/name.basics.csv')

# Rotten Tomatoes movie info (tab-separated): ID, rating, genre, director,
# writer, theater date, box office, runtime, studio
rt_movie_info_df = pd.read_csv('data/rt.movie_info.tsv', sep='\t')

# Rotten Tomatoes reviews (tab-separated, read as latin1): ID, rating, fresh,
# top critic
rt_movie_reviews_df = pd.read_csv('data/rt.reviews.tsv', encoding='latin1', sep='\t')

# Alternate titles: title ID, title, region, language
title_akas_df = pd.read_csv('data/title.akas.csv')

# Core title facts: title ID (tconst), primary title, original title, year,
# runtime, genres
title_basics_df = pd.read_csv('data/title.basics.csv')

# Crew per title: title ID (tconst), directors (nconst), writers (nconst)
title_crew_df = pd.read_csv('data/title.crew.csv')

# Principals per title: title ID (tconst), people (nconst), category
# (primary profession)
title_principals_df = pd.read_csv('data/title.principals.csv')

# Ratings per title: title ID (tconst), rating, number of votes
title_ratings_df = pd.read_csv('data/title.ratings.csv')

# TMDB movies: genre, language, original title, popularity, date, title,
# average rating, number of ratings
tmdb_movies_df = pd.read_csv('data/tmdb.movies.csv')

# Budgets: release date, title, budget, domestic gross, worldwide gross
movie_budgets_df = pd.read_csv('data/tn.movie_budgets.csv')

### Splitting up the work

Matt is going to work on joining any of the dataframes with the tconst or nconst variables.

Grace will work on joining the Rotten Tomatoes dataframes.

Jeff will work on joining the rest.

### Matt's dataframes for reference:

people_info_df

title_akas_df

title_basics_df

title_crew_df

title_principals_df

title_ratings_df

### Get column names, shape, and number of unique values

In [7]:
# Show the column index of each of the six frames that will be joined,
# in the same order used throughout this section.
for frame in (
    people_info_df,
    title_akas_df,
    title_basics_df,
    title_crew_df,
    title_principals_df,
    title_ratings_df,
):
    print(frame.columns)

Index(['nconst', 'primary_name', 'birth_year', 'death_year',
       'primary_profession', 'known_for_titles'],
      dtype='object')
Index(['title_id', 'ordering', 'title', 'region', 'language', 'types',
       'attributes', 'is_original_title'],
      dtype='object')
Index(['tconst', 'primary_title', 'original_title', 'start_year',
       'runtime_minutes', 'genres'],
      dtype='object')
Index(['tconst', 'directors', 'writers'], dtype='object')
Index(['tconst', 'ordering', 'nconst', 'category', 'job', 'characters'], dtype='object')
Index(['tconst', 'averagerating', 'numvotes'], dtype='object')


In [8]:
# Show (rows, columns) for each of the six frames that will be joined.
for frame in (
    people_info_df,
    title_akas_df,
    title_basics_df,
    title_crew_df,
    title_principals_df,
    title_ratings_df,
):
    print(frame.shape)

(606648, 6)
(331703, 8)
(146144, 6)
(146144, 3)
(1028186, 6)
(73856, 3)


In [9]:
# people_info_df has no title key, so only its shape is shown here.
# NOTE(review): a unique count of 'nconst' may have been intended - confirm.
print(people_info_df.shape)

# Count distinct title identifiers in each title-keyed frame. title_akas_df
# names its key 'title_id'; the other frames use 'tconst'.
for frame, key_column in (
    (title_akas_df, 'title_id'),
    (title_basics_df, 'tconst'),
    (title_crew_df, 'tconst'),
    (title_principals_df, 'tconst'),
    (title_ratings_df, 'tconst'),
):
    print(len(pd.unique(frame[key_column])))

(606648, 6)
122302
146144
146144
143454
73856


### Joining the dataframes

...through a series of successive left joins on the columns containing a 
'tconst' value, starting with the dataframe that has the highest number of 'tconst' items.

In [10]:
# Attach directors/writers to every title; both frames share the 'tconst' key.
first_join = pd.merge(title_basics_df, title_crew_df, on='tconst', how='left')
first_join

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,directors,writers
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama",nm0002411,
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama,nm0000080,"nm0000080,nm0462648"
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama",nm0611531,nm0347899
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy","nm0765384,nm0749914","nm1360635,nm0749914"
...,...,...,...,...,...,...,...,...
146139,tt9916538,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,2019,123.0,Drama,nm8185151,"nm4843252,nm4900525,nm2679404"
146140,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,,Documentary,"nm9272490,nm9272491","nm9272490,nm9272491"
146141,tt9916706,Dankyavar Danka,Dankyavar Danka,2013,,Comedy,nm7764440,nm7933903
146142,tt9916730,6 Gunn,6 Gunn,2017,116.0,,nm10538612,nm10538612


In [11]:
# Attach the principals (one row per person per title) on 'tconst'.
# Titles with several principals are repeated once per principal.
second_join = pd.merge(first_join, title_principals_df, on='tconst', how='left')
second_join

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,directors,writers,ordering,nconst,category,job,characters
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",10.0,nm0006210,composer,,
1,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",1.0,nm0474801,actor,,"[""Kundan S. Prasad"",""Bajrangi""]"
2,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",2.0,nm0904537,actress,,"[""Munni"",""Laila-E-Aasmaan""]"
3,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",3.0,nm0756379,actor,,"[""Ganeshi N. Prasad""]"
4,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",4.0,nm0474876,actor,,"[""Dwarka N. Prasad""]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1030871,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,2013,,Documentary,"nm9272490,nm8349149","nm8349149,nm9272490",5.0,nm9272490,director,supervising director,
1030872,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,2013,,Documentary,"nm9272490,nm8349149","nm8349149,nm9272490",6.0,nm8349149,director,co-director,
1030873,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,2013,,Documentary,"nm9272490,nm8349149","nm8349149,nm9272490",7.0,nm9272489,cinematographer,,
1030874,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,2013,,Documentary,"nm9272490,nm8349149","nm8349149,nm9272490",8.0,nm10538638,cinematographer,,


In [12]:
# Attach average rating and vote count on 'tconst'; titles without a rating
# keep NaN in both columns because this is a left join.
third_join = pd.merge(second_join, title_ratings_df, on='tconst', how='left')
third_join

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,directors,writers,ordering,nconst,category,job,characters,averagerating,numvotes
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",10.0,nm0006210,composer,,,7.0,77.0
1,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",1.0,nm0474801,actor,,"[""Kundan S. Prasad"",""Bajrangi""]",7.0,77.0
2,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",2.0,nm0904537,actress,,"[""Munni"",""Laila-E-Aasmaan""]",7.0,77.0
3,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",3.0,nm0756379,actor,,"[""Ganeshi N. Prasad""]",7.0,77.0
4,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",4.0,nm0474876,actor,,"[""Dwarka N. Prasad""]",7.0,77.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1030871,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,2013,,Documentary,"nm9272490,nm8349149","nm8349149,nm9272490",5.0,nm9272490,director,supervising director,,,
1030872,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,2013,,Documentary,"nm9272490,nm8349149","nm8349149,nm9272490",6.0,nm8349149,director,co-director,,,
1030873,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,2013,,Documentary,"nm9272490,nm8349149","nm8349149,nm9272490",7.0,nm9272489,cinematographer,,,,
1030874,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,2013,,Documentary,"nm9272490,nm8349149","nm8349149,nm9272490",8.0,nm10538638,cinematographer,,,,


In [13]:
# Attach alternate titles. The akas frame calls its key 'title_id', so the
# key columns are named explicitly and both survive in the result.
# 'ordering' exists on both sides and is suffixed to ordering_x / ordering_y.
# Every principal row is repeated once per alternate title, so the row count
# grows substantially at this step.
fourth_join = pd.merge(
    third_join,
    title_akas_df,
    left_on='tconst',
    right_on='title_id',
    how='left',
)
fourth_join

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,directors,writers,ordering_x,nconst,...,averagerating,numvotes,title_id,ordering_y,title,region,language,types,attributes,is_original_title
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",10.0,nm0006210,...,7.0,77.0,tt0063540,1.0,Sangharsh,IN,hi,,alternative transliteration,0.0
1,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",10.0,nm0006210,...,7.0,77.0,tt0063540,2.0,Sunghursh,,,original,,1.0
2,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",10.0,nm0006210,...,7.0,77.0,tt0063540,3.0,Sunghursh,IN,,,,0.0
3,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",10.0,nm0006210,...,7.0,77.0,tt0063540,4.0,Sunghursh,IN,hi,,alternative transliteration,0.0
4,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",10.0,nm0006210,...,7.0,77.0,tt0063540,5.0,Sungharsh,IN,hi,,alternative spelling,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2975780,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,2013,,Documentary,"nm9272490,nm8349149","nm8349149,nm9272490",5.0,nm9272490,...,,,,,,,,,,
2975781,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,2013,,Documentary,"nm9272490,nm8349149","nm8349149,nm9272490",6.0,nm8349149,...,,,,,,,,,,
2975782,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,2013,,Documentary,"nm9272490,nm8349149","nm8349149,nm9272490",7.0,nm9272489,...,,,,,,,,,,
2975783,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,2013,,Documentary,"nm9272490,nm8349149","nm8349149,nm9272490",8.0,nm10538638,...,,,,,,,,,,


In [14]:
# Number of joined rows that found no match in the akas frame.
fourth_join['title_id'].isnull().sum()

134509

In [15]:
# Non-null value count of every column, per original title.
rows_by_original_title = fourth_join.groupby('original_title')
rows_by_original_title.count()

Unnamed: 0_level_0,tconst,primary_title,start_year,runtime_minutes,genres,directors,writers,ordering_x,nconst,category,...,averagerating,numvotes,title_id,ordering_y,title,region,language,types,attributes,is_original_title
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#1 Serial Killer,30,30,30,30,30,30,30,30,30,30,...,30,30,30,30,30,10,0,20,0,30
#5,9,9,9,2,9,9,9,9,9,9,...,2,2,2,2,2,2,0,0,0,2
#50Fathers,9,9,9,9,9,9,9,9,9,9,...,0,0,9,9,9,9,0,0,0,9
#66,10,10,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,0,0,0,10
#82 Summits,5,5,5,5,5,5,0,5,5,5,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ärtico,50,50,50,50,50,50,50,50,50,50,...,50,50,50,50,50,40,0,30,0,50
è solo questione di punti di vista,8,8,8,8,8,8,8,8,8,8,...,0,0,8,8,8,8,0,0,0,8
è solo questione di tempo,15,15,15,15,15,15,15,15,15,15,...,0,0,15,15,15,10,0,10,0,15
élèctions piege à cons,6,6,6,0,6,6,0,6,6,6,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Preview the people frame before joining it on 'nconst'.
people_info_df

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"
...,...,...,...,...,...,...
606643,nm9990381,Susan Grobes,,,actress,
606644,nm9990690,Joo Yeon So,,,actress,"tt9090932,tt8737130"
606645,nm9991320,Madeline Smith,,,actress,"tt8734436,tt9615610"
606646,nm9991786,Michelle Modigliani,,,producer,


In [17]:
# Attach each principal's personal details (name, birth/death year,
# professions, known-for titles) via the shared 'nconst' key.
fifth_join = pd.merge(fourth_join, people_info_df, on='nconst', how='left')
fifth_join

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,directors,writers,ordering_x,nconst,...,region,language,types,attributes,is_original_title,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",10.0,nm0006210,...,IN,hi,,alternative transliteration,0.0,Naushad,1919.0,2006.0,"composer,soundtrack,music_department","tt0054910,tt0266765,tt0044392,tt0053999"
1,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",10.0,nm0006210,...,,,original,,1.0,Naushad,1919.0,2006.0,"composer,soundtrack,music_department","tt0054910,tt0266765,tt0044392,tt0053999"
2,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",10.0,nm0006210,...,IN,,,,0.0,Naushad,1919.0,2006.0,"composer,soundtrack,music_department","tt0054910,tt0266765,tt0044392,tt0053999"
3,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",10.0,nm0006210,...,IN,hi,,alternative transliteration,0.0,Naushad,1919.0,2006.0,"composer,soundtrack,music_department","tt0054910,tt0266765,tt0044392,tt0053999"
4,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276",10.0,nm0006210,...,IN,hi,,alternative spelling,0.0,Naushad,1919.0,2006.0,"composer,soundtrack,music_department","tt0054910,tt0266765,tt0044392,tt0053999"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2975780,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,2013,,Documentary,"nm9272490,nm8349149","nm8349149,nm9272490",5.0,nm9272490,...,,,,,,Angela Gurgel,,,"director,writer","tt10006546,tt10231530,tt7378802,tt9916754"
2975781,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,2013,,Documentary,"nm9272490,nm8349149","nm8349149,nm9272490",6.0,nm8349149,...,,,,,,Vinicius Augusto Bozzo,1985.0,,"writer,editor,director","tt9916754,tt10118270,tt7378802,tt10163584"
2975782,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,2013,,Documentary,"nm9272490,nm8349149","nm8349149,nm9272490",7.0,nm9272489,...,,,,,,Marcelo Alves,,,"cinematographer,producer,director","tt10011778,tt7373112,tt10056288,tt10011880"
2975783,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,2013,,Documentary,"nm9272490,nm8349149","nm8349149,nm9272490",8.0,nm10538638,...,,,,,,Wellington Barros,,,cinematographer,"tt10169058,tt9916754"


In [18]:
# Builds a GroupBy object keyed on original_title. No aggregation is applied,
# so the cell only displays the object's repr.
fifth_join.groupby('original_title')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fdf4cd83d90>

In [19]:
# Missing-value count for every column of the fully joined frame.
fifth_join.isnull().sum()

tconst                      0
primary_title               0
original_title             32
start_year                  0
runtime_minutes        280635
genres                  32591
directors               20435
writers                265826
ordering_x               2812
nconst                   2812
category                 2812
job                   2231222
characters            1824758
averagerating          519637
numvotes               519637
title_id               134509
ordering_y             134509
title                  134509
region                 595279
language              2596060
types                 1423140
attributes            2839834
is_original_title      134534
primary_name             3302
birth_year            1833225
death_year            2928068
primary_profession      95934
known_for_titles        49511
dtype: int64

### Join the large 'tconst' dataframe with the movie_gross_df. 

Make the join three times, once on each of the columns containing the movie title.

There's probably a better way to do this, but that's just how the cookie crumbles.

In [25]:
# Attempt 1: match gross figures on the primary title.
# Both frames already have a 'title' column, so pandas suffixes them
# (title_x / title_y) in the result.
# NOTE(review): later cells reassign sixth_join, so this result is not kept.
sixth_join = pd.merge(
    fifth_join,
    movie_gross_df,
    left_on='primary_title',
    right_on='title',
    how='left',
)

In [26]:
# (rows, columns) of the merge result currently held in sixth_join.
sixth_join.shape

(2975862, 33)

In [27]:
# Attempt 2: match gross figures on the original title.
# As with primary_title, the two 'title' columns are suffixed in the result.
# NOTE(review): a later cell reassigns sixth_join, so this result is not kept.
sixth_join = pd.merge(
    fifth_join,
    movie_gross_df,
    left_on='original_title',
    right_on='title',
    how='left',
)

In [28]:
# (rows, columns) of the merge result currently held in sixth_join.
sixth_join.shape

(2975785, 33)

In [29]:
# Attempt 3: match gross figures on the 'title' column that both frames share.
# This is the version of sixth_join that the cells below operate on.
# NOTE(review): rows are matched by title text only; films that share a name
# may be paired with another film's gross - consider also matching on year.
sixth_join = pd.merge(fifth_join, movie_gross_df, on='title', how='left')

In [30]:
# (rows, columns) of the merge result currently held in sixth_join.
sixth_join.shape

(2975817, 32)

In [60]:
# Column names of the final joined frame, as a plain Python list.
sixth_join.columns.tolist()

['tconst',
 'primary_title',
 'original_title',
 'start_year',
 'runtime_minutes',
 'genres',
 'directors',
 'writers',
 'ordering_x',
 'nconst',
 'category',
 'job',
 'characters',
 'averagerating',
 'numvotes',
 'title_id',
 'ordering_y',
 'title',
 'region',
 'language',
 'types',
 'attributes',
 'is_original_title',
 'primary_name',
 'birth_year',
 'death_year',
 'primary_profession',
 'known_for_titles',
 'studio',
 'domestic_gross',
 'foreign_gross',
 'year']

In [36]:
# Per-title mean of every numeric column (first 50 titles).
# numeric_only=True is required on current pandas: the frame holds many
# string columns (IDs, names, genres, ...), and GroupBy.mean() now raises a
# TypeError on them instead of silently dropping them as older versions did.
sixth_join.groupby('title').mean(numeric_only=True).head(50)

Unnamed: 0_level_0,start_year,runtime_minutes,ordering_x,averagerating,numvotes,ordering_y,is_original_title,birth_year,death_year,domestic_gross,year
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
!Women Art Revolution,2010.0,83.0,1.0,6.9,196.0,2.5,0.0,1941.0,,,
# Pire soirée,2017.0,101.0,5.5,5.2,39447.0,22.0,0.0,1985.75,,,
#1 Serial Killer,2013.0,87.0,5.5,5.6,40.0,2.5,0.5,1988.0,,,
#211,2018.0,86.0,5.5,4.4,5001.0,3.0,0.0,1966.5,,,
#5,2015.5,114.666667,4.833333,7.216667,577.666667,1.833333,0.0,1970.666667,,,
#50Fathers,2015.0,132.0,5.0,,,1.0,0.0,,,,
#66,2015.0,116.0,5.5,5.3,18.0,1.0,0.0,,,,
#9,2019.0,159.0,5.5,9.7,5600.0,2.0,0.0,1968.375,,,
#ActorsWanted,2011.0,87.0,5.5,4.3,82.0,1.0,0.0,1980.0,,,
#Afskåret,2016.0,82.0,5.5,7.8,17.0,1.0,0.0,1987.0,,,


### Export to CSV

In [37]:
# Persist the joined frame to the working directory. The row index is written
# as the first, unnamed column (pandas' default, made explicit here).
sixth_join.to_csv('bigol_df.csv', index=True)

### Remove all entries without a domestic gross & foreign gross (in progress)

In [59]:
# Keep only rows that have BOTH a domestic and a foreign gross, then average
# the numeric columns per primary title.
# numeric_only=True is required on current pandas: GroupBy.mean() raises a
# TypeError on the frame's string columns instead of silently dropping them
# as older versions did.
(
    sixth_join
    .dropna(subset=['domestic_gross', 'foreign_gross'], how='any')
    .groupby('primary_title')
    .mean(numeric_only=True)
)


Unnamed: 0_level_0,start_year,runtime_minutes,ordering_x,averagerating,numvotes,ordering_y,is_original_title,birth_year,death_year,domestic_gross,year
primary_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
'71,2014.0,99.0,5.5,7.2,46103.0,12.111111,0.111111,1977.666667,,1300000.0,2015.0
1,2019.0,100.0,4.0,9.2,13.0,2.000000,0.000000,1951.666667,,27800000.0,2010.0
10 Cloverfield Lane,2016.0,103.0,5.5,7.2,260383.0,17.800000,0.200000,1972.833333,,72100000.0,2016.0
102 Not Out,2018.0,102.0,5.5,7.5,4802.0,2.000000,0.500000,1947.000000,,1300000.0,2018.0
11-11-11,2011.0,90.0,5.5,4.0,11712.0,7.333333,0.166667,1966.400000,,32800.0,2011.0
...,...,...,...,...,...,...,...,...,...,...,...
Zero Dark Thirty,2012.0,157.0,5.5,7.4,251072.0,16.250000,0.125000,1969.800000,,95700000.0,2012.0
Zookeeper,2011.0,102.0,5.5,5.2,52396.0,12.666667,0.333333,1968.571429,,80400000.0,2011.0
Zoolander 2,2016.0,101.0,5.5,4.7,59914.0,12.000000,0.090909,1966.700000,2004.0,28800000.0,2016.0
Zootopia,2016.0,108.0,5.5,8.0,383446.0,34.600000,0.200000,1972.125000,,341300000.0,2016.0


### Questions that I think may be worth exploring:

1. Should we try to fill in missing box office numbers? If yes, we could see if domestic and foreign grosses usually correlate. If they do, we could come up with an average y=ax relationship and fill in missing values that way. If not, there are 1760 good records we have that I can make a clean dataframe from and we can just use that for gross returns.

2. What are the top 100 films by gross returns over the time period? 
    a. What are their genres? If we aggregate, what are the three most common genres?
    b. Who directed them?
    c. Who stars?
    d. Which studios made them?
    e. What are their runtimes? If we aggregate, what is the mean? How dispersed?
    
3. What are the bottom 100 films by gross returns? Same subquestions.

4. Many movie studios have their own "mini-studios" to compete with the independent ("indie") filmmaking scene. Are the relative profits of big-budget movies higher or lower than those of low-budget movies? Are the relative profits of one category more dispersed than those of the other? Depending on the answers, we might recommend that Microsoft either create a "mini-studio" of its own or forgo doing so.

5. Do studios that make a large number of movies per year make more profit (relative to budgets) than studios that make a small number of movies per year?

6. Does critical acclaim correlate with higher profits? If not, I think it's worth pointing this out, and specifically warning them against chasing after critical acclaim.