In [1]:
#Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
%matplotlib inline

#Turn off scientific notation in Pandas
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Import data
bom_gross = pd.read_csv('Data/Zipped_Data/bom.movie_gross.csv.gz', compression = 'gzip')
imbd_name = pd.read_csv('Data/Zipped_Data/imdb.name.basics.csv.gz', compression = 'gzip')
imbd_akas = pd.read_csv('Data/Zipped_Data/imdb.title.akas.csv.gz', compression = 'gzip')
imbd_basics = pd.read_csv('Data/Zipped_Data/imdb.title.basics.csv.gz', compression = 'gzip')
imbd_crews = pd.read_csv('Data/Zipped_Data/imdb.title.crew.csv.gz', compression = 'gzip')
imbd_principals = pd.read_csv('Data/Zipped_Data/imdb.title.principals.csv.gz', compression = 'gzip')
imbd_ratings = pd.read_csv('Data/Zipped_Data/imdb.title.ratings.csv.gz', compression = 'gzip')
rt_info = pd.read_csv('Data/Zipped_Data/rt.movie_info.tsv.gz', delimiter='\t', compression = 'gzip')
rt_reviews = pd.read_csv('Data/Zipped_Data/rt.reviews.tsv.gz', delimiter='\t', compression = 'gzip', encoding='latin-1')
tmbd = pd.read_csv('Data/Zipped_Data/tmdb.movies.csv.gz', compression = 'gzip')
tn_budget = pd.read_csv('Data/Zipped_Data/tn.movie_budgets.csv.gz', compression = 'gzip')

In [None]:
#Determined which title_ids have more than one original title listed. 
#Created a new dataset consisting of these title_ids, and dropped these title_ids from the main dataset. 
#Deleted duplicate title_ids within new dataset.
originals = (imbd_akas.loc[imbd_akas.is_original_title == 1])
original_repeats = list(originals.loc[originals['title_id'].duplicated() == True].title_id.unique())
or_duplicate = imbd_akas.loc[imbd_akas.title_id.isin(original_repeats)]
imbd_akas = imbd_akas.loc[imbd_akas.title_id.isin(original_repeats) == False]
or_duplicate = or_duplicate.loc[or_duplicate.is_original_title == 1]
or_duplicate = or_duplicate.loc[or_duplicate.title_id.duplicated() == False]

#Determined which title_ids have an original title listed. 
#Created a new dataset consisting of these title_ids, and dropped these title_ids from the main dataset. 
#Deleted duplicate title_ids within new dataset.
original_nrp = list(imbd_akas.loc[imbd_akas.is_original_title == 1].title_id.unique())
or_nodup = imbd_akas.loc[imbd_akas.title_id.isin(original_nrp)]
imbd_akas = imbd_akas.loc[imbd_akas.title_id.isin(original_nrp) == False]
or_nodup = or_nodup.loc[or_nodup.is_original_title == 1]

#Determined which title_ids have more than one title listed. Created a new dataset consisting of
#title_ids without more than one title listed, and dropped these title_ids from the main dataset.
non_or_rp = list(imbd_akas.loc[imbd_akas.title_id.duplicated()].title_id.unique())
non_or_nrp = imbd_akas.loc[imbd_akas.title_id.isin(non_or_rp) == False]
imbd_akas = imbd_akas.loc[imbd_akas.title_id.isin(non_or_rp)]

#Determined which title_ids have a row with a region listed as "US" or the language listed as "en".
#Created a new dataset consisting of these title_ids, and dropped these title_ids from the main dataset.
#In the new dataset, deleted rows that weren't listed either as "US" or "en", and then deleted duplicates.
us_or_en = list(imbd_akas.loc[(imbd_akas.region == "US") | (imbd_akas.language == "en")].title_id.unique())
us_en = imbd_akas.loc[imbd_akas.title_id.isin(us_or_en)]
imbd_akas = imbd_akas.loc[imbd_akas.title_id.isin(us_or_en) == False]
us_en = us_en.loc[(us_en.region == "US") | (us_en.language == "en")]
us_en = us_en.loc[us_en.title_id.duplicated() == False]

#Created a new dataset that is equal to the remaining original dataset, but with duplicates removed.
no_us_en = imbd_akas.loc[imbd_akas.title_id.duplicated() == False]

#Concated the sliced datafiles
imbd_akas_dfs = [or_duplicate, or_nodup, non_or_nrp, us_en, no_us_en]
imbd_akas_cleaned = pd.concat(imbd_akas_dfs)

#Dropped unnecessary columns
imbd_akas_cleaned = imbd_akas_cleaned.drop(columns = ['ordering', 'region', 'language', 'types', 
                                                      'attributes', 'is_original_title'])

#Removed spaces, capitalization and punctuation from title. emove potentially extraneous words.
imbd_akas_cleaned.title = imbd_akas_cleaned.title.str.strip()
imbd_akas_cleaned.title = imbd_akas_cleaned.title.apply(lambda x: x.lower())
imbd_akas_cleaned.title = imbd_akas_cleaned.title.apply(lambda x: x.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')))
imbd_akas_cleaned.title = imbd_akas_cleaned.title.replace(['the', 'and'], value='', regex=True)

In [2]:
#Remove 'studio' column. Set 'foreign_gross' to float and 'year' to string
bom_gross.drop(columns = ['studio'])
bom_gross['foreign_gross'] = pd.to_numeric(bom_gross.foreign_gross, downcast = 'float', errors = 'coerce')
bom_gross['year'] = bom_gross['year'].astype(str)

#Remove years and right whitespace from titles. Remove potentially extraneous words.
years = ['\(2010\)', "\(2011\)", 
         "\(2012\)", "\(2013\)", "\(2014\)", 
         "\(2015\)", "\(2016\)", "\(2017\)", "\(2018\)"]
bom_gross.title = bom_gross.title.replace(years, value='', regex=True)
bom_gross.title = bom_gross.title.str.strip()
bom_gross.title = bom_gross.title.apply(lambda x: x.lower())
bom_gross.title = bom_gross.title.apply(lambda x: x.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')))
bom_gross.title = bom_gross.title.replace(['the', 'and'], value='', regex=True)

In [3]:
bom_gross['domestic_gross'] = bom_gross.domestic_gross.fillna(value=bom_gross.domestic_gross.median())
bom_gross['foreign_gross'] = bom_gross.foreign_gross.fillna(value=bom_gross.foreign_gross.median())

In [4]:
bom_20 = bom_gross.loc[(bom_gross.domestic_gross >= bom_gross.domestic_gross.quantile(.8)) 
             | (bom_gross.foreign_gross >= bom_gross.foreign_gross.quantile(.8))]

In [5]:
merged_df = pd.merge(bom_20, imbd_akas_cleaned, on = "title", how = "left")

NameError: name 'imbd_akas_cleaned' is not defined

In [None]:
merged_df.head()

In [None]:
title_ids = list(merged_df.title_id.unique())

In [6]:
#Drop "birth_year" and "death_year" columns. Fill in null values with 'null'.
imbd_name = imbd_name.drop(columns = ['birth_year', 'death_year', 'primary_profession', 'known_for_titles'])
#imbd_name = imbd_name.fillna('null')

#Turn 'primary_profession' and 'known_for_titles' from string to list
#imbd_name['primary_profession'] = imbd_name['primary_profession'].str.split(',')
#imbd_name['known_for_titles'] = imbd_name['known_for_titles'].str.split(',')

In [7]:
imbd_basics.primary_title = imbd_basics.primary_title.str.strip()
imbd_basics.primary_title = imbd_basics.primary_title.apply(lambda x: x.lower())
imbd_basics.primary_title = imbd_basics.primary_title.apply(lambda x: x.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')))
imbd_basics.primary_title = imbd_basics.primary_title.replace(['the', 'and'], value='', regex=True)

In [8]:
imbd_basics = imbd_basics.drop(columns = ['original_title', 'runtime_minutes'])

In [9]:
imbd_basics = imbd_basics.rename(columns = {'primary_title': 'title', 'start_year': 'year'})

In [10]:
imbd_basics['year'] = imbd_basics['year'].astype(str)

In [11]:
merged_df = pd.merge(bom_20, imbd_basics, on = ["title", 'year'], how = "inner")

In [12]:
imbd_principals = imbd_principals.drop(columns = ['ordering', 'job', 'characters'])

In [13]:
imbd_merge = pd.merge(imbd_name, imbd_principals, on = ['nconst'], how = "outer")

In [14]:
imbd_director = imbd_merge.rename(columns = {'nconst': 'directors', 'primary_name': 'director_name'})

In [15]:
imbd_director = imbd_director.drop(columns = 'category')

In [16]:
imbd_crews = pd.merge(imbd_crews, imbd_director, on = ['directors', 'tconst'])

In [17]:
imbd_directors = imbd_crews.drop(columns = "writers")

In [18]:
imbd_basics = pd.merge(imbd_basics, imbd_directors, on = 'tconst')

In [19]:
imbd_bom = pd.merge(bom_20, imbd_basics, on = ['title', 'year'], how = 'left')

In [None]:
imbd_bom.info()

In [21]:
list_dup = list(imbd_bom.loc[imbd_bom.title.duplicated() == True].title.unique())

In [22]:
imbd_bom.loc[imbd_bom.title.isin(list_dup) == True]

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,tconst,genres,directors,director_name
1,alice in wonderl,BV,334200000.0,691299968.0,2010,tt1014759,"Adventure,Family,Fantasy",nm0000318,Tim Burton
2,alice in wonderl,BV,334200000.0,691299968.0,2010,tt2049386,"Fantasy,Musical",nm0288188,James Fotopoulos
19,robin hood,Uni.,105300000.0,216400000.0,2010,tt0955308,"Action,Adventure,Drama",nm0000631,Ridley Scott
62,girl with dragon tattoo,MBox,10100000.0,94300000.0,2010,,,,
70,burlesque,SGem,39400000.0,50100000.0,2010,tt1126591,"Drama,Music,Musical",nm0031078,Steve Antin
71,burlesque,SGem,39400000.0,50100000.0,2010,tt1586713,Drama,nm3313266,Dominic Deacon
116,girl with dragon tattoo,Sony,102500000.0,130100000.0,2011,tt1568346,"Crime,Drama,Mystery",nm0000399,David Fincher
152,artist,Wein.,44700000.0,88800000.0,2011,tt1655442,"Comedy,Drama,Romance",nm0371890,Michel Hazanavicius
153,artist,Wein.,44700000.0,88800000.0,2011,tt1825978,Thriller,nm3908851,Sunil Prem Vyas
172,abduction,LGF,28100000.0,54000000.0,2011,tt1600195,"Action,Mystery,Thriller",nm0005436,John Singleton


In [23]:
original_list = ['David F. Sandberg', 'Robert Zemeckis', 'Tom McCarthy', 'Jason Moore', 'Tim Johnson', 
               'Kenneth Branagh', 'Brad Anderson', 'John Singleton', 'Michel Hazanavicius', 'David Fincher',
              'Steve Antin', 'Ridley Scott', 'Tim Burton']

In [25]:
imbd_bom = imbd_bom.loc[(imbd_bom.title.isin(list_dup) == False) 
             | ((imbd_bom.director_name.isin(original_list) == True) 
               & (imbd_bom.title.isin(list_dup) == True))]

In [26]:
imbd_bom.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,tconst,genres,directors,director_name
0,toy story 3,BV,415000000.0,652000000.0,2010,tt0435761,"Adventure,Animation,Comedy",nm0881279,Lee Unkrich
1,alice in wonderl,BV,334200000.0,691299968.0,2010,tt1014759,"Adventure,Family,Fantasy",nm0000318,Tim Burton
3,harry potter deathly hallows part 1,WB,296000000.0,664300032.0,2010,tt0926084,"Adventure,Fantasy,Mystery",nm0946734,David Yates
4,inception,WB,292600000.0,535700000.0,2010,tt1375666,"Action,Adventure,Sci-Fi",nm0634240,Christopher Nolan
5,shrek forever after,P/DW,238700000.0,513900000.0,2010,tt0892791,"Adventure,Animation,Comedy",nm0593610,Mike Mitchell


In [29]:
imbd_bom.loc[(imbd_bom.director_name.duplicated() == True) & ( imbd_bom.director_name.isna() == False)].director_name.nunique()

153

In [30]:
imbd_principals.head()

Unnamed: 0,tconst,nconst,category
0,tt0111414,nm0246005,actor
1,tt0111414,nm0398271,director
2,tt0111414,nm3739909,producer
3,tt0323808,nm0059247,editor
4,tt0323808,nm3579312,actress


In [32]:
imbd_actor = imbd_principals.loc[imbd_principals.category == "actor"]

In [34]:
imbd_name.head()

Unnamed: 0,nconst,primary_name
0,nm0061671,Mary Ellen Bauder
1,nm0061865,Joseph Bauer
2,nm0062070,Bruce Baum
3,nm0062195,Axel Baumann
4,nm0062798,Pete Baxter


In [35]:
imbd_actor = pd.merge(imbd_actor, imbd_name, on = 'nconst', how = 'inner')

In [38]:
imbd_actor = imbd_actor.rename(columns = {'primary_name': 'actor_name'})

In [40]:
imbd_actress = imbd_principals.loc[imbd_principals.category == "actress"]

In [44]:
imbd_actress = pd.merge(imbd_actress, imbd_name, on = 'nconst', how = 'inner')

In [46]:
imbd_actress = imbd_actress.rename(columns = {'primary_name': 'actress_name'})

In [48]:
imbd_actress.head()

Unnamed: 0,tconst,nconst,category,actress_name
0,tt0323808,nm3579312,actress,Brittania Nicol
1,tt0323808,nm0502652,actress,Jacqueline Leonard
2,tt0417610,nm0330974,actress,Esther Goris
3,tt0426566,nm0330974,actress,Esther Goris
4,tt3355958,nm0330974,actress,Esther Goris


In [52]:
imbd_actors = pd.merge(imbd_actor, imbd_actress, on = ['tconst', 'nconst'])

In [55]:
imbd_actors = pd.concat[imbd_actor, imbd_actress]

TypeError: 'function' object is not subscriptable