In [5]:
#Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Show floats with two fixed decimals instead of scientific notation.
pd.set_option('display.float_format', '{:.2f}'.format)

# Load the raw, gzip-compressed source files. Each one is read into its own
# module-level DataFrame; nothing is cleaned at this stage.

# Box Office Mojo
bom_gross = pd.read_csv('Data/bom.movie_gross.csv.gz', compression='gzip')

# IMDb
imbd_name = pd.read_csv('Data/imdb.name.basics.csv.gz', compression='gzip')
imbd_akas = pd.read_csv('Data/imdb.title.akas.csv.gz', compression='gzip')
imbd_basics = pd.read_csv('Data/imdb.title.basics.csv.gz', compression='gzip')
imbd_crews = pd.read_csv('Data/imdb.title.crew.csv.gz', compression='gzip')
imbd_principals = pd.read_csv('Data/imdb.title.principals.csv.gz', compression='gzip')
imbd_ratings = pd.read_csv('Data/imdb.title.ratings.csv.gz', compression='gzip')

# Rotten Tomatoes files are tab-separated; the reviews file is read as latin-1.
rt_info = pd.read_csv('Data/rt.movie_info.tsv.gz', sep='\t', compression='gzip')
rt_reviews = pd.read_csv('Data/rt.reviews.tsv.gz', sep='\t', compression='gzip', encoding='latin-1')

# TMDb and The Numbers
tmbd = pd.read_csv('Data/tmdb.movies.csv.gz', compression='gzip')
tn_budget = pd.read_csv('Data/tn.movie_budgets.csv.gz', compression='gzip')

In [6]:
# Reduce imbd_akas to a single row per title_id. The rows are peeled off in
# stages; after each stage the title_ids it handled are removed from imbd_akas
# so that later stages only see what is left.
# NOTE: imbd_akas is overwritten along the way and ends up holding only the
# leftovers of the last stage, so reload the data before running this cell again.

# Stage 1: title_ids flagged as the original title on more than one row.
# Keep only their flagged rows, then the first flagged row per title_id.
originals = imbd_akas[imbd_akas['is_original_title'] == 1]
original_repeats = list(originals.loc[originals['title_id'].duplicated(), 'title_id'].unique())
or_duplicate = (
    imbd_akas[imbd_akas['title_id'].isin(original_repeats)]
    .loc[lambda rows: rows['is_original_title'] == 1]
    .drop_duplicates(subset='title_id')
)
imbd_akas = imbd_akas[~imbd_akas['title_id'].isin(original_repeats)]

# Stage 2: remaining title_ids that have a row flagged as the original title.
# The repeated ones were removed in stage 1, so each of these has exactly one
# flagged row and no de-duplication is required.
original_nrp = list(imbd_akas.loc[imbd_akas['is_original_title'] == 1, 'title_id'].unique())
or_nodup = (
    imbd_akas[imbd_akas['title_id'].isin(original_nrp)]
    .loc[lambda rows: rows['is_original_title'] == 1]
)
imbd_akas = imbd_akas[~imbd_akas['title_id'].isin(original_nrp)]

# Stage 3: title_ids that appear on exactly one row are taken as they are;
# only title_ids listed on several rows stay in imbd_akas.
non_or_rp = list(imbd_akas.loc[imbd_akas['title_id'].duplicated(), 'title_id'].unique())
non_or_nrp = imbd_akas[~imbd_akas['title_id'].isin(non_or_rp)]
imbd_akas = imbd_akas[imbd_akas['title_id'].isin(non_or_rp)]

# Stage 4: title_ids with at least one row whose region is "US" or whose
# language is "en". Keep only those rows, then the first one per title_id.
us_or_en = list(
    imbd_akas.loc[(imbd_akas['region'] == "US") | (imbd_akas['language'] == "en"), 'title_id'].unique()
)
us_en = (
    imbd_akas[imbd_akas['title_id'].isin(us_or_en)]
    .loc[lambda rows: (rows['region'] == "US") | (rows['language'] == "en")]
    .drop_duplicates(subset='title_id')
)
imbd_akas = imbd_akas[~imbd_akas['title_id'].isin(us_or_en)]

# Stage 5: everything that is left, reduced to the first row per title_id.
no_us_en = imbd_akas.drop_duplicates(subset='title_id')

# Stack the five slices back into one frame.
imbd_akas_dfs = [or_duplicate, or_nodup, non_or_nrp, us_en, no_us_en]
imbd_akas_cleaned = pd.concat(imbd_akas_dfs)

# Column drop is currently disabled (left commented out):
#imbd_akas_cleaned = imbd_akas_cleaned.drop(columns = ['ordering', 'region', 'language', 'types', 
#                                                      'attributes', 'is_original_title'])

In [None]:
# Basic type clean-up, applied to bom_gross itself:
#   - missing 'studio' values become the placeholder 'Unknown'
#   - 'foreign_gross' is converted to a float column
#   - 'year' is stored as a string
bom_gross['studio'] = bom_gross.studio.fillna('Unknown')
# errors='coerce' turns every value that cannot be parsed as a number into NaN,
# and those NaNs are later replaced by the median in bom_median.
# NOTE(review): confirm that no legitimate figures (e.g. strings containing
# thousands separators) are being discarded this way.
bom_gross['foreign_gross'] = pd.to_numeric(bom_gross.foreign_gross, downcast = 'float', errors = 'coerce')
bom_gross['year'] = bom_gross['year'].astype(str)

# Median-imputed variant. Work on an explicit copy: a plain
# `bom_median = bom_gross` would only bind a second name to the same DataFrame,
# so the fills below would silently overwrite the nulls in bom_gross as well.
bom_median = bom_gross.copy()
# Each median is taken from the column before it is filled.
bom_median['domestic_gross'] = bom_median.domestic_gross.fillna(value=bom_median.domestic_gross.median())
bom_median['foreign_gross'] = bom_median.foreign_gross.fillna(value=bom_median.foreign_gross.median())

In [None]:
# Preview the first five rows of the median-filled frame.
bom_median.head(n=5)

In [None]:
# Clean imbd_name in a single chain:
#   1. drop the 'birth_year' and 'death_year' columns
#   2. replace every remaining missing value with the string 'null'
#   3. split the comma-separated 'primary_profession' and 'known_for_titles'
#      strings into lists (a cell that was missing therefore becomes ['null'])
# NOTE: imbd_name is overwritten, so this cell cannot be re-run on its own
# output; the two dropped columns are no longer there for drop() to find.
imbd_name = (
    imbd_name
    .drop(columns=['birth_year', 'death_year'])
    .fillna('null')
    .assign(
        primary_profession=lambda names: names['primary_profession'].str.split(','),
        known_for_titles=lambda names: names['known_for_titles'].str.split(','),
    )
)