In [121]:
#Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
%matplotlib inline

#Turn off scientific notation in Pandas
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Import data
bom_gross = pd.read_csv('Data/Zipped_Data/bom.movie_gross.csv.gz', compression = 'gzip')
imbd_name = pd.read_csv('Data/Zipped_Data/imdb.name.basics.csv.gz', compression = 'gzip')
imbd_akas = pd.read_csv('Data/Zipped_Data/imdb.title.akas.csv.gz', compression = 'gzip')
imbd_basics = pd.read_csv('Data/Zipped_Data/imdb.title.basics.csv.gz', compression = 'gzip')
imbd_crews = pd.read_csv('Data/Zipped_Data/imdb.title.crew.csv.gz', compression = 'gzip')
imbd_principals = pd.read_csv('Data/Zipped_Data/imdb.title.principals.csv.gz', compression = 'gzip')
imbd_ratings = pd.read_csv('Data/Zipped_Data/imdb.title.ratings.csv.gz', compression = 'gzip')
rt_info = pd.read_csv('Data/Zipped_Data/rt.movie_info.tsv.gz', delimiter='\t', compression = 'gzip')
rt_reviews = pd.read_csv('Data/Zipped_Data/rt.reviews.tsv.gz', delimiter='\t', compression = 'gzip', encoding='latin-1')
tmbd = pd.read_csv('Data/Zipped_Data/tmdb.movies.csv.gz', compression = 'gzip')
tn_budget = pd.read_csv('Data/Zipped_Data/tn.movie_budgets.csv.gz', compression = 'gzip')

In [None]:
#Determined which title_ids have more than one original title listed. 
#Created a new dataset consisting of these title_ids, and dropped these title_ids from the main dataset. 
#Deleted duplicate title_ids within new dataset.
originals = (imbd_akas.loc[imbd_akas.is_original_title == 1])
original_repeats = list(originals.loc[originals['title_id'].duplicated() == True].title_id.unique())
or_duplicate = imbd_akas.loc[imbd_akas.title_id.isin(original_repeats)]
imbd_akas = imbd_akas.loc[imbd_akas.title_id.isin(original_repeats) == False]
or_duplicate = or_duplicate.loc[or_duplicate.is_original_title == 1]
or_duplicate = or_duplicate.loc[or_duplicate.title_id.duplicated() == False]

#Determined which title_ids have an original title listed. 
#Created a new dataset consisting of these title_ids, and dropped these title_ids from the main dataset. 
#Deleted duplicate title_ids within new dataset.
original_nrp = list(imbd_akas.loc[imbd_akas.is_original_title == 1].title_id.unique())
or_nodup = imbd_akas.loc[imbd_akas.title_id.isin(original_nrp)]
imbd_akas = imbd_akas.loc[imbd_akas.title_id.isin(original_nrp) == False]
or_nodup = or_nodup.loc[or_nodup.is_original_title == 1]

#Determined which title_ids have more than one title listed. Created a new dataset consisting of
#title_ids without more than one title listed, and dropped these title_ids from the main dataset.
non_or_rp = list(imbd_akas.loc[imbd_akas.title_id.duplicated()].title_id.unique())
non_or_nrp = imbd_akas.loc[imbd_akas.title_id.isin(non_or_rp) == False]
imbd_akas = imbd_akas.loc[imbd_akas.title_id.isin(non_or_rp)]

#Determined which title_ids have a row with a region listed as "US" or the language listed as "en".
#Created a new dataset consisting of these title_ids, and dropped these title_ids from the main dataset.
#In the new dataset, deleted rows that weren't listed either as "US" or "en", and then deleted duplicates.
us_or_en = list(imbd_akas.loc[(imbd_akas.region == "US") | (imbd_akas.language == "en")].title_id.unique())
us_en = imbd_akas.loc[imbd_akas.title_id.isin(us_or_en)]
imbd_akas = imbd_akas.loc[imbd_akas.title_id.isin(us_or_en) == False]
us_en = us_en.loc[(us_en.region == "US") | (us_en.language == "en")]
us_en = us_en.loc[us_en.title_id.duplicated() == False]

#Created a new dataset that is equal to the remaining original dataset, but with duplicates removed.
no_us_en = imbd_akas.loc[imbd_akas.title_id.duplicated() == False]

#Concated the sliced datafiles
imbd_akas_dfs = [or_duplicate, or_nodup, non_or_nrp, us_en, no_us_en]
imbd_akas_cleaned = pd.concat(imbd_akas_dfs)

#Dropped unnecessary columns
imbd_akas_cleaned = imbd_akas_cleaned.drop(columns = ['ordering', 'region', 'language', 'types', 
                                                      'attributes', 'is_original_title'])

#Removed spaces, capitalization and punctuation from title. emove potentially extraneous words.
imbd_akas_cleaned.title = imbd_akas_cleaned.title.str.strip()
imbd_akas_cleaned.title = imbd_akas_cleaned.title.apply(lambda x: x.lower())
imbd_akas_cleaned.title = imbd_akas_cleaned.title.apply(lambda x: x.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')))
imbd_akas_cleaned.title = imbd_akas_cleaned.title.replace(['the', 'and'], value='', regex=True)

In [58]:
#Remove 'studio' column. Set 'foreign_gross' to float and 'year' to string
bom_gross.drop(columns = ['studio'])
bom_gross['foreign_gross'] = pd.to_numeric(bom_gross.foreign_gross, downcast = 'float', errors = 'coerce')
bom_gross['year'] = bom_gross['year'].astype(str)

#Remove years and right whitespace from titles. Remove potentially extraneous words.
years = ['\(2010\)', "\(2011\)", 
         "\(2012\)", "\(2013\)", "\(2014\)", 
         "\(2015\)", "\(2016\)", "\(2017\)", "\(2018\)"]
bom_gross.title = bom_gross.title.replace(years, value='', regex=True)
bom_gross.title = bom_gross.title.str.strip()
bom_gross.title = bom_gross.title.apply(lambda x: x.lower())
bom_gross.title = bom_gross.title.apply(lambda x: x.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')))
bom_gross.title = bom_gross.title.replace(['the', 'and'], value='', regex=True)

In [59]:
bom_gross['domestic_gross'] = bom_gross.domestic_gross.fillna(value=bom_gross.domestic_gross.median())
bom_gross['foreign_gross'] = bom_gross.foreign_gross.fillna(value=bom_gross.foreign_gross.median())

In [60]:
bom_20 = bom_gross.loc[(bom_gross.domestic_gross >= bom_gross.domestic_gross.quantile(.8)) 
             | (bom_gross.foreign_gross >= bom_gross.foreign_gross.quantile(.8))]

In [None]:
merged_df = pd.merge(bom_20, imbd_akas_cleaned, on = "title", how = "left")

In [None]:
merged_df.head()

In [None]:
title_ids = list(merged_df.title_id.unique())

In [61]:
#Drop "birth_year" and "death_year" columns. Fill in null values with 'null'.
imbd_name = imbd_name.drop(columns = ['birth_year', 'death_year', 'primary_profession', 'known_for_titles'])
#imbd_name = imbd_name.fillna('null')

#Turn 'primary_profession' and 'known_for_titles' from string to list
#imbd_name['primary_profession'] = imbd_name['primary_profession'].str.split(',')
#imbd_name['known_for_titles'] = imbd_name['known_for_titles'].str.split(',')

In [None]:
imbd_name = imbd_name.loc[(imbd_name.primary_profession.str.contains('actress') == True) 
                          | (imbd_name.primary_profession.str.contains('actor') == True ) 
                          | (imbd_name.primary_profession.str.contains('director') == True) 
                          | (imbd_name.primary_profession.str.contains('writer') == True)]

In [62]:
imbd_basics.primary_title = imbd_basics.primary_title.str.strip()
imbd_basics.primary_title = imbd_basics.primary_title.apply(lambda x: x.lower())
imbd_basics.primary_title = imbd_basics.primary_title.apply(lambda x: x.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')))
imbd_basics.primary_title = imbd_basics.primary_title.replace(['the', 'and'], value='', regex=True)

In [63]:
imbd_basics = imbd_basics.drop(columns = ['original_title', 'runtime_minutes'])

In [64]:
imbd_basics = imbd_basics.rename(columns = {'primary_title': 'title', 'start_year': 'year'})

In [65]:
imbd_basics['year'] = imbd_basics['year'].astype(str)

In [66]:
merged_df = pd.merge(bom_20, imbd_basics, on = ["title", 'year'], how = "inner")

In [67]:
merged_df.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,tconst,genres
0,toy story 3,BV,415000000.0,652000000.0,2010,tt0435761,"Adventure,Animation,Comedy"
1,alice in wonderl,BV,334200000.0,691299968.0,2010,tt1014759,"Adventure,Family,Fantasy"
2,alice in wonderl,BV,334200000.0,691299968.0,2010,tt2049386,"Fantasy,Musical"
3,harry potter deathly hallows part 1,WB,296000000.0,664300032.0,2010,tt0926084,"Adventure,Fantasy,Mystery"
4,inception,WB,292600000.0,535700000.0,2010,tt1375666,"Action,Adventure,Sci-Fi"


In [35]:
list(merged_df.genres)

['Adventure,Animation,Comedy',
 'Adventure,Family,Fantasy',
 'Fantasy,Musical',
 'Adventure,Fantasy,Mystery',
 'Action,Adventure,Sci-Fi',
 'Adventure,Animation,Comedy',
 'Adventure,Drama,Fantasy',
 'Action,Adventure,Sci-Fi',
 'Adventure,Animation,Comedy',
 'Animation,Comedy,Family',
 'Action,Adventure,Animation',
 'Action,Adventure,Fantasy',
 'Adventure,Family,Fantasy',
 'Biography,Drama,History',
 'Action,Adventure,Fantasy',
 'Action,Drama,Family',
 'Action,Adventure,Fantasy',
 'Drama,Thriller',
 'Action,Animation,Comedy',
 'Action,Adventure,Drama',
 'Action,Adventure,Family',
 'Comedy,Romance',
 'Action,Adventure,Horror',
 'Mystery,Thriller',
 'Action,Mystery,Thriller',
 'Comedy,Drama,Romance',
 'Action,Adventure,Thriller',
 'Action,Adventure,Thriller',
 'Comedy',
 'Action,Adventure,Comedy',
 'Adventure,Drama,Western',
 'Adventure,Comedy,Family',
 'Adventure,Family,Fantasy',
 'Biography,Drama',
 'Comedy,Romance',
 'Action,Adventure,Family',
 'Adventure,Comedy',
 'Drama,Romance',
 'Ad

In [68]:
imbd_crews.head()

Unnamed: 0,tconst,directors,writers
0,tt0285252,nm0899854,nm0899854
1,tt0438973,,"nm0175726,nm1802864"
2,tt0462036,nm1940585,nm1940585
3,tt0835418,nm0151540,"nm0310087,nm0841532"
4,tt0878654,"nm0089502,nm2291498,nm2292011",nm0284943


In [69]:
imbd_principals.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"


In [70]:
imbd_principals = imbd_principals.drop(columns = ['ordering', 'job', 'characters'])

In [71]:
imbd_merge = pd.merge(imbd_name, imbd_principals, on = ['nconst'], how = "outer")

In [72]:
imbd_director = imbd_merge.rename(columns = {'nconst': 'directors', 'primary_name': 'director_name'})

In [73]:
imbd_director = imbd_director.drop(columns = 'category')

In [75]:
imbd_crews.head()

Unnamed: 0,tconst,directors,writers
0,tt0285252,nm0899854,nm0899854
1,tt0438973,,"nm0175726,nm1802864"
2,tt0462036,nm1940585,nm1940585
3,tt0835418,nm0151540,"nm0310087,nm0841532"
4,tt0878654,"nm0089502,nm2291498,nm2292011",nm0284943


In [76]:
imbd_director.head()

Unnamed: 0,directors,director_name,tconst
0,nm0061671,Mary Ellen Bauder,tt2398241
1,nm0061865,Joseph Bauer,tt0433397
2,nm0061865,Joseph Bauer,tt1681372
3,nm0061865,Joseph Bauer,tt2387710
4,nm0061865,Joseph Bauer,tt2281215


In [77]:
imbd_crews = pd.merge(imbd_crews, imbd_director, on = ['directors', 'tconst'])

In [81]:
imbd_crews.head()

Unnamed: 0,tconst,directors,writers,director_name
0,tt0285252,nm0899854,nm0899854,Tony Vitale
1,tt0462036,nm1940585,nm1940585,Bill Haley
2,tt0835418,nm0151540,"nm0310087,nm0841532",Jay Chandrasekhar
3,tt0879859,nm2416460,,Eric Manchester
4,tt0996958,nm2286991,"nm2286991,nm2651190",Tara Cardinal


In [83]:
imbd_directors = imbd_crews.drop(columns = "writers")

In [86]:
imbd_directors.head()

Unnamed: 0,tconst,directors,director_name
0,tt0285252,nm0899854,Tony Vitale
1,tt0462036,nm1940585,Bill Haley
2,tt0835418,nm0151540,Jay Chandrasekhar
3,tt0879859,nm2416460,Eric Manchester
4,tt0996958,nm2286991,Tara Cardinal


In [88]:
imbd_basics = pd.merge(imbd_basics, imbd_directors, on = 'tconst')

In [119]:
imbd_basics.loc[imbd_basics.title.str.contains("me")].title.unique()

array([' promise of perfume', 'american pastoral', 'gnomeo  juliet', ...,
       ' good americans one revolution two nations',
       'wien is t hof van commerce', 'vida em movimento'], dtype=object)

In [95]:
imbd_bom = pd.merge(bom_20, imbd_basics, on = ['title', 'year'], how = 'left')

In [110]:
imbd_bom.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 861 entries, 0 to 860
Data columns (total 9 columns):
title             861 non-null object
studio            860 non-null object
domestic_gross    861 non-null float64
foreign_gross     861 non-null float32
year              861 non-null object
tconst            700 non-null object
genres            700 non-null object
directors         700 non-null object
director_name     700 non-null object
dtypes: float32(1), float64(1), object(7)
memory usage: 63.9+ KB


In [112]:
imbd_bom.loc[imbd_bom.director_name.isna() == True]

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,tconst,genres,directors,director_name
8,tangled,BV,200800000.00,391000000.00,2010,,,,
9,despicable me,Uni.,251500000.00,291600000.00,2010,,,,
10,how to train your dragon,P/DW,217600000.00,277300000.00,2010,,,,
30,true grit,Par.,171200000.00,81000000.00,2010,,,,
46,book of eli,WB,94800000.00,62300000.00,2010,,,,
52,saw 3d,LGF,45700000.00,90400000.00,2010,,,,
62,girl with dragon tattoo,MBox,10100000.00,94300000.00,2010,,,,
64,aftershock tangshan dadizhen,CL,63000.00,100200000.00,2010,,,,
76,oceans,BV,19400000.00,63200000.00,2010,,,,
79,if you are one 2 fei cheng wu rao ii,CL,427000.00,75600000.00,2010,,,,


In [120]:
list_dup = list(imbd_bom.loc[imbd_bom.title.duplicated() == True].title.unique())

TypeError: 'list' object is not callable

In [103]:
imbd_bom.loc[imbd_bom.title.isin(list) == True]

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,tconst,genres,directors,director_name
1,alice in wonderl,BV,334200000.0,691299968.0,2010,tt1014759,"Adventure,Family,Fantasy",nm0000318,Tim Burton
2,alice in wonderl,BV,334200000.0,691299968.0,2010,tt2049386,"Fantasy,Musical",nm0288188,James Fotopoulos
19,robin hood,Uni.,105300000.0,216400000.0,2010,tt0955308,"Action,Adventure,Drama",nm0000631,Ridley Scott
62,girl with dragon tattoo,MBox,10100000.0,94300000.0,2010,,,,
70,burlesque,SGem,39400000.0,50100000.0,2010,tt1126591,"Drama,Music,Musical",nm0031078,Steve Antin
71,burlesque,SGem,39400000.0,50100000.0,2010,tt1586713,Drama,nm3313266,Dominic Deacon
116,girl with dragon tattoo,Sony,102500000.0,130100000.0,2011,tt1568346,"Crime,Drama,Mystery",nm0000399,David Fincher
152,artist,Wein.,44700000.0,88800000.0,2011,tt1655442,"Comedy,Drama,Romance",nm0371890,Michel Hazanavicius
153,artist,Wein.,44700000.0,88800000.0,2011,tt1825978,Thriller,nm3908851,Sunil Prem Vyas
172,abduction,LGF,28100000.0,54000000.0,2011,tt1600195,"Action,Mystery,Thriller",nm0005436,John Singleton


In [107]:
original_list = ['David F. Sandberg', 'Robert Zemeckis', 'Tom McCarthy', 'Jason Moore', 'Tim Johnson', 
               'Kenneth Branagh', 'Brad Anderson', 'John Singleton', 'Michel Hazanavicius', 'David Fincher',
              'Steve Antin', 'Ridley Scott', 'Tim Burton']

In [108]:
imbd_bom.loc[(imbd_bom.director_name.isin(original_list) == True)]

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,tconst,genres,directors,director_name
1,alice in wonderl,BV,334200000.0,691299968.0,2010,tt1014759,"Adventure,Family,Fantasy",nm0000318,Tim Burton
19,robin hood,Uni.,105300000.0,216400000.0,2010,tt0955308,"Action,Adventure,Drama",nm0000631,Ridley Scott
33,social network,Sony,97000000.0,128000000.0,2010,tt1285016,"Biography,Drama",nm0000399,David Fincher
70,burlesque,SGem,39400000.0,50100000.0,2010,tt1126591,"Drama,Music,Musical",nm0031078,Steve Antin
107,thor,Par.,181000000.0,268300000.0,2011,tt0800369,"Action,Adventure,Fantasy",nm0000110,Kenneth Branagh
116,girl with dragon tattoo,Sony,102500000.0,130100000.0,2011,tt1568346,"Crime,Drama,Mystery",nm0000399,David Fincher
152,artist,Wein.,44700000.0,88800000.0,2011,tt1655442,"Comedy,Drama,Romance",nm0371890,Michel Hazanavicius
172,abduction,LGF,28100000.0,54000000.0,2011,tt1600195,"Action,Mystery,Thriller",nm0005436,John Singleton
204,promeus,Fox,126500000.0,276900000.0,2012,tt1446714,"Adventure,Mystery,Sci-Fi",nm0000631,Ridley Scott
218,dark shadows,WB,79700000.0,165800000.0,2012,tt1077368,"Comedy,Fantasy,Horror",nm0000318,Tim Burton
