In [None]:
#Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#Turn off scientific notation in Pandas
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# Import data
bom_gross = pd.read_csv('Data/bom.movie_gross.csv.gz', compression = 'gzip')
imbd_name = pd.read_csv('Data/imdb.name.basics.csv.gz', compression = 'gzip')
imbd_akas = pd.read_csv('Data/imdb.title.akas.csv.gz', compression = 'gzip')
imbd_basics = pd.read_csv('Data/imdb.title.basics.csv.gz', compression = 'gzip')
imbd_crews = pd.read_csv('Data/imdb.title.crew.csv.gz', compression = 'gzip')
imbd_principals = pd.read_csv('Data/imdb.title.principals.csv.gz', compression = 'gzip')
imbd_ratings = pd.read_csv('Data/imdb.title.ratings.csv.gz', compression = 'gzip')
rt_info = pd.read_csv('Data/rt.movie_info.tsv.gz', delimiter='\t', compression = 'gzip')
rt_reviews = pd.read_csv('Data/rt.reviews.tsv.gz', delimiter='\t', compression = 'gzip', encoding='latin-1')
tmbd = pd.read_csv('Data/tmdb.movies.csv.gz', compression = 'gzip')
tn_budget = pd.read_csv('Data/tn.movie_budgets.csv.gz', compression = 'gzip')

In [None]:
#Remove null values for 'studio'. Set 'foreign_gross' to float and 'year' to string
bom_gross['studio'] = bom_gross.studio.fillna('Unknown')
bom_gross['foreign_gross'] = pd.to_numeric(bom_gross.foreign_gross, downcast = 'float', errors = 'coerce')
bom_gross['year'] = bom_gross['year'].astype(str)

#Use median data to fill in null values for domestic and foreign gross
bom_median = bom_gross
bom_median['domestic_gross'] = bom_median.domestic_gross.fillna(value=bom_median.domestic_gross.median())
bom_median['foreign_gross'] = bom_median.foreign_gross.fillna(value=bom_median.foreign_gross.median())

In [None]:
#Drop "birth_year" and "death_year" columns. Fill in null values with 'null'.
imbd_name = imbd_name.drop(columns = ['birth_year', 'death_year'])
imbd_name = imbd_name.fillna('null')

#Turn 'primary_profession' and 'known_for_titles' from string to list
imbd_name['primary_profession'] = imbd_name['primary_profession'].str.split(',')
imbd_name['known_for_titles'] = imbd_name['known_for_titles'].str.split(',')

In [2]:
#Create index of rows that are listed as the original title ('is_original_title'= 1), 
#but are duplicates. Drop duplicate rows.
originals = imbd_akas.loc[imbd_akas.is_original_title == 1]
original_repeats = list(originals.loc[originals['title_id'].duplicated() == True].title_id.unique())
repeat_index = imbd_akas[(imbd_akas.is_original_title == 1) & (imbd_akas.title_id.isin(original_repeats))].index
imbd_akas.drop(repeat_index, inplace = True)

#Create column specifying whether each title_id has a row specifying their original title.
#Remove rows in which the title_id is not the original title, but has an original title listed.
imbd_akas['has_original'] = imbd_akas.title_id.isin(list(imbd_akas.title_id.loc[imbd_akas.is_original_title == 1]))
repeat_index2 = imbd_akas[(imbd_akas.is_original_title != 1) 
                         & (imbd_akas.has_original == True)].index
imbd_akas.drop(repeat_index2, inplace = True)

In [4]:
imbd_akas.title_id.nunique()

122301

In [5]:
imbd_akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122589 entries, 38 to 331700
Data columns (total 9 columns):
title_id             122589 non-null object
ordering             122589 non-null int64
title                122589 non-null object
region               77925 non-null object
language             62 non-null object
types                44826 non-null object
attributes           18 non-null object
is_original_title    122564 non-null float64
has_original         122589 non-null bool
dtypes: bool(1), float64(1), int64(1), object(6)
memory usage: 8.5+ MB


In [None]:
no_original = imbd_akas.loc[imbd_akas.has_original == False]

In [None]:
no_original_repeats = list(no_original.loc[no_original.title_id.duplicated() == True].title_id.unique())

In [None]:
imbd_akas.info()

In [None]:
imbd_akas.loc[(imbd_akas.has_original == True) | 
              ((imbd_akas.has_original == False) & ~(imbd_akas.title_id.isin(no_original_repeats)))].info()

In [None]:
original_list = list(imbd_akas.loc[(imbd_akas.has_original == True) | 
              ((imbd_akas.has_original == False) & ~(imbd_akas.title_id.isin(no_original_repeats)))].title_id.unique())

In [None]:
no_original_repeats = imbd_akas.loc[~(imbd_akas.title_id.isin(original_list))]

In [None]:
no_original_repeats[~((no_original_repeats.region == "US") | (no_original_repeats.language == "en"))].title_id.unique()

In [None]:
imbd_akas.loc[(imbd_akas.title_id.isin(repeat) == True)&(imbd_akas.original == "yes")]

In [None]:
imbd_akas = imbd_akas.loc[(imbd_akas['original'] == "yes") | 
                          ((imbd_akas['original'] == "no") & (imbd_akas['has_original'] == False))]

In [None]:
len(repeat)

In [None]:
(imbd_akas.loc[(imbd_akas['has_original'] == False)]).title_id.nunique()

In [None]:
no_original = imbd_akas.loc[(imbd_akas['has_original'] == False)]

In [None]:
imbd_akas.loc[(imbd_akas['has_original'] == False) & ()]

In [None]:
duplicates = list((imbd_akas.loc[(imbd_akas.title_id.duplicated() == True)&((imbd_akas['has_original'] == False))]).title_id.unique())

In [None]:
len(duplicates)