In [None]:
#Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
%matplotlib inline

# Show floats with two fixed decimals instead of scientific notation
pd.set_option('display.float_format', lambda value: '%.2f' % value)

# Load the gzip-compressed source tables (comma-separated unless noted)
bom_gross = pd.read_csv('Data/Zipped_Data/bom.movie_gross.csv.gz', compression='gzip')
imbd_name = pd.read_csv('Data/Zipped_Data/imdb.name.basics.csv.gz', compression='gzip')
imbd_akas = pd.read_csv('Data/Zipped_Data/imdb.title.akas.csv.gz', compression='gzip')
imbd_basics = pd.read_csv('Data/Zipped_Data/imdb.title.basics.csv.gz', compression='gzip')
imbd_crews = pd.read_csv('Data/Zipped_Data/imdb.title.crew.csv.gz', compression='gzip')
imbd_principals = pd.read_csv('Data/Zipped_Data/imdb.title.principals.csv.gz', compression='gzip')
imbd_ratings = pd.read_csv('Data/Zipped_Data/imdb.title.ratings.csv.gz', compression='gzip')
# The two rt.* files are tab-separated; the reviews file is read as latin-1
rt_info = pd.read_csv('Data/Zipped_Data/rt.movie_info.tsv.gz', sep='\t', compression='gzip')
rt_reviews = pd.read_csv('Data/Zipped_Data/rt.reviews.tsv.gz', sep='\t', compression='gzip', encoding='latin-1')
tmbd = pd.read_csv('Data/Zipped_Data/tmdb.movies.csv.gz', compression='gzip')
tn_budget = pd.read_csv('Data/Zipped_Data/tn.movie_budgets.csv.gz', compression='gzip')

In [None]:
# Remove the 'studio' column, make 'foreign_gross' numeric and store 'year' as text.
# DataFrame.drop returns a new frame, so the result has to be assigned back;
# errors='ignore' keeps the cell re-runnable once the column is already gone.
bom_gross = bom_gross.drop(columns=['studio'], errors='ignore')
# Entries that cannot be parsed as numbers become NaN (errors='coerce').
bom_gross['foreign_gross'] = pd.to_numeric(bom_gross['foreign_gross'], downcast='float', errors='coerce')
bom_gross['year'] = bom_gross['year'].astype(str)

# Normalise titles so they can be used as a merge key:
#   1. delete "(2010)" .. "(2018)" year tags (raw-string regex, no invalid escapes),
#   2. trim surrounding whitespace and lowercase,
#   3. delete every ASCII punctuation character,
#   4. delete the substrings 'the' and 'and'.
# The .str accessor passes missing titles through as NaN instead of raising.
# NOTE(review): step 4 matches inside words as well (e.g. "other" -> "or").
# It is kept unchanged because the same rule is applied to imbd_basics titles
# and both sides must stay identical for the title merge to line up.
bom_gross['title'] = (
    bom_gross['title']
    .str.replace(r'\(201[0-8]\)', '', regex=True)
    .str.strip()
    .str.lower()
    .str.translate(str.maketrans('', '', string.punctuation))
    .replace(['the', 'and'], value='', regex=True)
)

In [None]:
#Replace null values with medians
bom_gross['domestic_gross'] = bom_gross.domestic_gross.fillna(value=bom_gross.domestic_gross.median())
bom_gross['foreign_gross'] = bom_gross.foreign_gross.fillna(value=bom_gross.foreign_gross.median())

In [None]:
bom_gross.head()

In [None]:

bom_20 = bom_gross.loc[(bom_gross.domestic_gross >= bom_gross.domestic_gross.quantile(.8)) 
             | (bom_gross.foreign_gross >= bom_gross.foreign_gross.quantile(.8))]

In [None]:
tn_budget.head()

In [None]:
tn_budget['production_budget'] = tn_budget['production_budget'].apply(lambda x: x.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')))
tn_budget['domestic_gross'] = tn_budget['domestic_gross'].apply(lambda x: x.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')))
tn_budget['worldwide_gross'] = tn_budget['worldwide_gross'].apply(lambda x: x.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')))

In [None]:
tn_budget['production_budget'] = pd.to_numeric(tn_budget.production_budget, downcast = 'float', errors = 'coerce')
tn_budget['domestic_gross'] = pd.to_numeric(tn_budget.domestic_gross, downcast = 'float', errors = 'coerce')
tn_budget['worldwide_gross'] = pd.to_numeric(tn_budget.worldwide_gross, downcast = 'float', errors = 'coerce')

In [None]:
tn_budget.info()

In [None]:
tn_budget.head()

In [None]:
# Net = worldwide gross minus production budget, stored as a real column.
# Bracket assignment is required: `tn_budget.Net = ...` only sets a Python
# attribute on the object and does not add a 'Net' column to the frame.
tn_budget['Net'] = tn_budget['worldwide_gross'] - tn_budget['production_budget']

In [None]:
merged_df = pd.merge(bom_20, imbd_akas_cleaned, on = "title", how = "left")

In [None]:
merged_df.head()

In [None]:
title_ids = list(merged_df.title_id.unique())

In [None]:
# Drop the four columns listed below from the name table.
# (Null filling and splitting 'primary_profession' / 'known_for_titles' into
# lists are not performed; those two columns are removed here instead.)
imbd_name = imbd_name.drop(
    ['birth_year', 'death_year', 'primary_profession', 'known_for_titles'],
    axis=1,
)

In [None]:
imbd_basics.head()

In [None]:
imbd_basics.primary_title = imbd_basics.primary_title.str.strip()
imbd_basics.primary_title = imbd_basics.primary_title.apply(lambda x: x.lower())
imbd_basics.primary_title = imbd_basics.primary_title.apply(lambda x: x.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')))
imbd_basics.primary_title = imbd_basics.primary_title.replace(['the', 'and'], value='', regex=True)

In [None]:
imbd_basics = imbd_basics.drop(columns = ['original_title', 'runtime_minutes'])

In [None]:
imbd_basics = imbd_basics.rename(columns = {'primary_title': 'title', 'start_year': 'year'})

In [None]:
imbd_basics['year'] = imbd_basics['year'].astype(str)

In [None]:
merged_df = pd.merge(bom_20, imbd_basics, on = ["title", 'year'], how = "inner")

In [None]:
merged_df.head()

In [None]:
imbd_principals = imbd_principals.drop(columns = ['ordering', 'job', 'characters'])

In [None]:
imbd_principals.head()

In [None]:
imbd_merge = pd.merge(imbd_name, imbd_principals, on = ['nconst'], how = "outer")

In [None]:
imbd_merge.head()

In [None]:
imbd_director = imbd_merge.rename(columns = {'nconst': 'directors', 'primary_name': 'director_name'})

In [None]:
imbd_director = imbd_director.drop(columns = 'category')

In [None]:
imbd_director

In [None]:
imbd_crews = pd.merge(imbd_crews, imbd_director, on = ['directors', 'tconst'])

In [None]:
imbd_directors = imbd_crews.drop(columns = "writers")

In [None]:
imbd_basics = pd.merge(imbd_basics, imbd_directors, on = 'tconst')

In [None]:
imbd_bom = pd.merge(bom_20, imbd_basics, on = ['title', 'year'], how = 'left')

In [None]:
imbd_bom.info()

In [None]:
list_dup = list(imbd_bom.loc[imbd_bom.title.duplicated() == True].title.unique())

In [None]:
imbd_bom.loc[imbd_bom.title.isin(list_dup) == True]

In [None]:
original_list = ['David F. Sandberg', 'Robert Zemeckis', 'Tom McCarthy', 'Jason Moore', 'Tim Johnson', 
               'Kenneth Branagh', 'Brad Anderson', 'John Singleton', 'Michel Hazanavicius', 'David Fincher',
              'Steve Antin', 'Ridley Scott', 'Tim Burton']

In [None]:
imbd_bom = imbd_bom.loc[(imbd_bom.title.isin(list_dup) == False) 
             | ((imbd_bom.director_name.isin(original_list) == True) 
               & (imbd_bom.title.isin(list_dup) == True))]

In [None]:
imbd_bom.head()

In [None]:
imbd_bom.loc[(imbd_bom.director_name.duplicated() == True) & ( imbd_bom.director_name.isna() == False)].director_name.nunique()

In [None]:
imbd_principals.head()

In [None]:
imbd_actor = imbd_principals.loc[imbd_principals.category == "actor"]

In [None]:
imbd_name.head()

In [None]:
imbd_actor = pd.merge(imbd_actor, imbd_name, on = 'nconst', how = 'inner')

In [None]:
imbd_actor = imbd_actor.rename(columns = {'primary_name': 'actor_name'})

In [None]:
imbd_actress = imbd_principals.loc[imbd_principals.category == "actress"]

In [None]:
imbd_actress = pd.merge(imbd_actress, imbd_name, on = 'nconst', how = 'inner')

In [None]:
imbd_actress = imbd_actress.rename(columns = {'primary_name': 'actress_name'})

In [None]:
imbd_actress.head()

In [None]:
# Inner join of the actor and actress tables on title and person id.
# NOTE(review): `imbd_actors` is assigned again in the following cell, so this
# result is not used afterwards - confirm whether this cell is still needed.
imbd_actors = imbd_actor.merge(imbd_actress, on=['tconst', 'nconst'])

In [None]:
# Stack the actor and actress tables into one frame.
# pd.concat is a function and must be called with a list of frames;
# subscripting it (pd.concat[...]) raises TypeError.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating the index labels of the two inputs.
imbd_actors = pd.concat([imbd_actor, imbd_actress], ignore_index=True)