In [1]:
#Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
%matplotlib inline

# Show floats in pandas output as fixed-point numbers with two decimals (no scientific notation)
pd.set_option('display.float_format', '{:.2f}'.format)

# Load the five gzip-compressed CSV sources (paths are relative to the working directory).
# The generator yields one DataFrame per path, in the same order as the names on the left.
tn_budget, bom_gross, imbd_basics, imbd_name, imbd_principals = (
    pd.read_csv(csv_path, compression='gzip')
    for csv_path in (
        'Data/Zipped_Data/tn.movie_budgets.csv.gz',
        'Data/Zipped_Data/bom.movie_gross.csv.gz',
        'Data/Zipped_Data/imdb.title.basics.csv.gz',
        'Data/Zipped_Data/imdb.name.basics.csv.gz',
        'Data/Zipped_Data/imdb.title.principals.csv.gz',
    )
)

In [None]:
# Remove punctuation from production_budget and worldwide_gross, then switch both from string to float.
# Every ASCII punctuation character is stripped (string.punctuation is the same character set the
# literal used previously), so only the digits are left to parse.
# The result is kept as float64: downcasting to float32 cannot represent integers above 2**24
# exactly, which silently rounds dollar amounts in the tens of millions and above.
for money_col in ['production_budget', 'worldwide_gross']:
    digits_only = tn_budget[money_col].str.translate(str.maketrans('', '', string.punctuation))
    # errors='coerce' turns anything that still is not a number into NaN instead of raising
    tn_budget[money_col] = pd.to_numeric(digits_only, errors='coerce').astype('float64')

# Create column for net profit (worldwide gross minus production budget).
# Plain column arithmetic replaces the row-by-row apply; the values are computed the same way.
tn_budget['net'] = tn_budget['worldwide_gross'] - tn_budget['production_budget']

# Create column for ratio (net profit divided by production budget)
tn_budget['ratio'] = tn_budget['net'] / tn_budget['production_budget']

# Rename column 'movie' to 'title'
tn_budget = tn_budget.rename(columns={'movie': 'title'})

# Normalise titles so they can be used as a merge key:
#   1. remove a literal "(2010)" ... "(2018)" year tag,
#   2. trim surrounding whitespace,
#   3. lower-case,
#   4. remove ASCII punctuation,
#   5. remove the character sequences 'the' and 'and'.
# Raw strings are used for the regex patterns because '\(' is an invalid escape in a normal string.
# NOTE: step 5 matches anywhere in the title, including inside longer words (e.g. 'panther' -> 'panr').
# It is kept as-is on purpose: every table that is merged on 'title' must be normalised identically.
years = [rf'\({year}\)' for year in range(2010, 2019)]
tn_budget['title'] = (
    tn_budget['title']
    .replace(years, value='', regex=True)
    .str.strip()
    .str.lower()
    .str.translate(str.maketrans('', '', string.punctuation))
    .replace(['the', 'and'], value='', regex=True)
)

# Keep only the year (the last four characters of release_date), then drop the columns not used later
tn_budget['year'] = tn_budget['release_date'].str.slice(start=-4)
tn_budget = tn_budget.drop(['id', 'domestic_gross', 'release_date'], axis='columns')

In [10]:
# Drop the 'domestic_gross' and 'foreign_gross' columns ('studio' is kept) and store 'year' as a string
bom_gross = bom_gross.drop(['domestic_gross', 'foreign_gross'], axis='columns')
bom_gross = bom_gross.astype({'year': str})

# Normalise titles so they can be used as a merge key:
#   1. remove a literal "(2010)" ... "(2018)" year tag,
#   2. trim surrounding whitespace,
#   3. lower-case,
#   4. remove ASCII punctuation,
#   5. remove the character sequences 'the' and 'and'.
# Raw strings are used for the regex patterns because '\(' is an invalid escape in a normal string.
# NOTE: step 5 matches anywhere in the title, including inside longer words (e.g. 'panther' -> 'panr').
# It is kept as-is on purpose: every table that is merged on 'title' must be normalised identically.
years = [rf'\({year}\)' for year in range(2010, 2019)]
bom_gross['title'] = (
    bom_gross['title']
    .replace(years, value='', regex=True)
    .str.strip()
    .str.lower()
    .str.translate(str.maketrans('', '', string.punctuation))
    .replace(['the', 'and'], value='', regex=True)
)

In [None]:
# Left-join bom_gross onto tn_budget by normalised title and year (every tn_budget row is kept)
merged_df = tn_budget.merge(bom_gross, how='left', on=['title', 'year'])

# Rows without a match have no studio; label them 'Unknown'
merged_df = merged_df.fillna({'studio': 'Unknown'})

In [17]:
# Prepare imbd_basics for merging in one pass:
#   - add 'year' as the string form of 'start_year',
#   - drop the columns that are no longer needed,
#   - rename 'primary_title' to 'title' so the key name matches the other tables.
imbd_basics = (
    imbd_basics
    .assign(year=imbd_basics['start_year'].astype(str))
    .drop(['original_title', 'start_year'], axis='columns')
    .rename(columns={'primary_title': 'title'})
)

# Normalise titles so they can be used as a merge key:
#   1. remove a literal "(2010)" ... "(2018)" year tag,
#   2. trim surrounding whitespace,
#   3. lower-case,
#   4. remove ASCII punctuation,
#   5. remove the character sequences 'the' and 'and'.
# Raw strings are used for the regex patterns because '\(' is an invalid escape in a normal string.
# NOTE: step 5 matches anywhere in the title, including inside longer words (e.g. 'panther' -> 'panr').
# It is kept as-is on purpose: every table that is merged on 'title' must be normalised identically.
years = [rf'\({year}\)' for year in range(2010, 2019)]
imbd_basics['title'] = (
    imbd_basics['title']
    .replace(years, value='', regex=True)
    .str.strip()
    .str.lower()
    .str.translate(str.maketrans('', '', string.punctuation))
    .replace(['the', 'and'], value='', regex=True)
)

In [18]:
# Left-join imbd_basics onto the budget/studio table by normalised title and year
merged_df2 = merged_df.merge(imbd_basics, how='left', on=['title', 'year'])

In [37]:
# Change year to integer, and remove all films made before 2010. Turn year back to string.
merged_df2['year'] = merged_df2['year'].astype(int)
# .copy() makes the filtered rows an independent DataFrame; assigning a column to the bare
# .loc slice is what raises pandas' SettingWithCopyWarning.
merged_df2 = merged_df2.loc[merged_df2['year'] >= 2010].copy()
merged_df2['year'] = merged_df2['year'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [83]:
# Attach the person record from imbd_name to every imbd_principals row, matched on 'nconst'
imbd_name_prin = imbd_principals.merge(imbd_name, how='left', on='nconst')

In [86]:
# Discard the columns that are not used in the rest of the analysis
unused_columns = [
    'ordering', 'nconst', 'job', 'characters',
    'birth_year', 'death_year', 'primary_profession', 'known_for_titles',
]
imbd_name_prin = imbd_name_prin.drop(unused_columns, axis='columns')

In [90]:
# Keep only the rows whose category is 'director'
is_director = imbd_name_prin['category'].eq('director')
directors = imbd_name_prin[is_director]

In [93]:
# Left-join the director rows onto the merged table by 'tconst'
merged_df3 = merged_df2.merge(directors, how='left', on='tconst')

In [95]:
# Drop 'category' and expose the person's name under the column name 'director'
merged_df3 = (
    merged_df3
    .drop('category', axis='columns')
    .rename(columns={'primary_name': 'director'})
)

In [97]:
# Rows with no director get the placeholder 'Not listed'
merged_df3 = merged_df3.fillna({'director': 'Not listed'})

In [99]:
# Keep only the rows whose category is 'producer'
is_producer = imbd_name_prin['category'].eq('producer')
producers = imbd_name_prin[is_producer]

In [101]:
# Expose the person's name under the column name 'producer'
producers = producers.rename({'primary_name': 'producer'}, axis='columns')

In [102]:
# 'category' is no longer needed once only producer rows remain
producers = producers.drop('category', axis='columns')

In [104]:
# Left-join the producer rows onto the merged table by 'tconst'
merged_df4 = merged_df3.merge(producers, how='left', on='tconst')

In [106]:
# Rows with no producer get the placeholder 'Not listed'
merged_df4 = merged_df4.fillna({'producer': 'Not listed'})

In [70]:
# Group the ratio column by studio once and reuse the grouping for both statistics
ratio_by_studio = merged_df2.groupby('studio')['ratio']

# The 20 studios with the highest mean ratio, in descending order
studio_ratio_mean = ratio_by_studio.mean().sort_values(ascending=False).head(20)

# The 20 studios with the highest median ratio, in descending order
studio_ratio_median = ratio_by_studio.median().sort_values(ascending=False).head(20)

In [115]:
# Create a list of 20 directors with the top ratio means.
# merged_df4 is the result of a left merge with the producers table, so each film/director row
# is repeated once for every producer of that film. Averaging it directly would weight films by
# their number of producers. Dropping 'producer' and removing the resulting duplicate rows
# leaves one row per film/director before the mean is taken.
director_films = merged_df4.drop(columns='producer').drop_duplicates()
director_ratio_mean = (
    director_films.groupby('director')['ratio']
    .mean()
    .sort_values(ascending=False)
    .head(20)
)

In [117]:
# Create a list of 20 producers with the top ratio means.
# merged_df4 carries one row per film/director/producer combination, so a film/producer pair is
# repeated once for every director of that film. Averaging it directly would weight films by
# their number of directors. Dropping 'director' and removing the resulting duplicate rows
# leaves one row per film/producer before the mean is taken.
producer_films = merged_df4.drop(columns='director').drop_duplicates()
producer_ratio_mean = (
    producer_films.groupby('producer')['ratio']
    .mean()
    .sort_values(ascending=False)
    .head(20)
)