In [1]:
#necessary libraries and modules to import

import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

In [2]:
#display settings: format floats to two decimals, show every column, show up to 200 rows

pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.max_columns', None)
#use the fully-qualified option name: the bare 'max_rows' pattern can match more than one
#registered option in newer pandas releases, which raises OptionError
pd.set_option('display.max_rows', 200)

In [3]:
#re-save the two tab-separated Rotten Tomatoes files as .csv.gz files

tsv_file = './zippedData/rt.movie_info.tsv.gz'
convert = pd.read_csv(tsv_file, sep='\t')
convert.to_csv('./zippedData/rt.movie_info.csv.gz')

#the reviews file is decoded with 'unicode_escape' instead of the default encoding
tsv_file_r = './zippedData/rt.reviews.tsv.gz'
convert_r = pd.read_csv(tsv_file_r, sep='\t', encoding='unicode_escape')
convert_r.to_csv('./zippedData/rt.reviews.csv.gz')

In [4]:
#collect the path of every .csv.gz file in ./zippedData with glob
#glob returns matches in arbitrary order, so sort them to keep the load order reproducible

csv_files = sorted(glob("./zippedData/*.csv.gz"))
csv_files

['./zippedData\\bom.movie_gross.csv.gz',
 './zippedData\\imdb.name.basics.csv.gz',
 './zippedData\\imdb.title.akas.csv.gz',
 './zippedData\\imdb.title.basics.csv.gz',
 './zippedData\\imdb.title.crew.csv.gz',
 './zippedData\\imdb.title.principals.csv.gz',
 './zippedData\\rt.movie_info.csv.gz',
 './zippedData\\rt.reviews.csv.gz',
 './zippedData\\tmdb.movies.csv.gz',
 './zippedData\\tn.movie_budgets.csv.gz']

In [5]:
# load every file in csv_files into a dataframe, keyed by a cleaned-up file name

csv_files_dict = {}
for filename in csv_files:
    #drop ".csv" from the base name, then turn the remaining dots into underscores
    #(the trailing ".gz" therefore survives as a "_gz" suffix on every key)
    stem = os.path.basename(filename).replace(".csv", "")
    filename_cleaned = stem.replace(".", "_")
    #first column becomes the index; file is decoded as utf-8
    filename_df = pd.read_csv(filename, index_col=0, encoding='utf-8')
    csv_files_dict[filename_cleaned] = filename_df

In [6]:
print(csv_files_dict.keys())

dict_keys(['bom_movie_gross_gz', 'imdb_name_basics_gz', 'imdb_title_akas_gz', 'imdb_title_basics_gz', 'imdb_title_crew_gz', 'imdb_title_principals_gz', 'rt_movie_info_gz', 'rt_reviews_gz', 'tmdb_movies_gz', 'tn_movie_budgets_gz'])


In [48]:
#give each loaded dataframe its own variable name
#the keys carry a "_gz" suffix left over from the ".csv.gz" file extension
bom_movie_gross_df = csv_files_dict['bom_movie_gross_gz']
imdb_name_basics_df = csv_files_dict['imdb_name_basics_gz']
imdb_title_akas_df = csv_files_dict['imdb_title_akas_gz']
imdb_title_basics_df = csv_files_dict['imdb_title_basics_gz']
imdb_title_crew_df = csv_files_dict['imdb_title_crew_gz']
imdb_title_principals_df = csv_files_dict['imdb_title_principals_gz']
rotten_movie_info_df = csv_files_dict['rt_movie_info_gz']
rotten_movie_reviews_df = csv_files_dict['rt_reviews_gz']
tmdb_movies_df = csv_files_dict['tmdb_movies_gz']
#the two TMDB lookup tables are read from JSON files in the working directory, not from zippedData
tmdb_genre_ids_df = pd.read_json('tmdb_genre_ids.json')
tmdb_languages_df = pd.read_json('tmdb_languages.json')
tn_movie_budgets_df = csv_files_dict['tn_movie_budgets_gz']

In [8]:
bom_movie_gross_df

Unnamed: 0_level_0,studio,domestic_gross,foreign_gross,year
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Toy Story 3,BV,415000000.00,652000000,2010
Alice in Wonderland (2010),BV,334200000.00,691300000,2010
Harry Potter and the Deathly Hallows Part 1,WB,296000000.00,664300000,2010
Inception,WB,292600000.00,535700000,2010
Shrek Forever After,P/DW,238700000.00,513900000,2010
...,...,...,...,...
The Quake,Magn.,6200.00,,2018
Edward II (2018 re-release),FM,4800.00,,2018
El Pacto,Sony,2500.00,,2018
The Swan,Synergetic,2400.00,,2018


In [9]:
rotten_movie_reviews_df.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"


In [10]:
tmdb_movies_df.iloc[50:100,:]

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
50,"[12, 28, 18]",23759,en,Centurion,13.55,2010-02-15,Centurion,6.1,673
51,"[28, 35, 80]",27581,en,The Other Guys,13.51,2010-08-06,The Other Guys,6.2,2187
52,"[80, 18, 53]",23168,en,The Town,13.48,2010-09-17,The Town,7.1,2386
53,"[28, 37, 18, 14, 53]",20533,en,Jonah Hex,13.47,2010-06-18,Jonah Hex,4.6,708
54,"[12, 28, 878]",20526,en,TRON: Legacy,13.46,2010-12-10,TRON: Legacy,6.3,4387
55,"[12, 14, 28, 10749]",9543,en,Prince of Persia: The Sands of Time,13.44,2010-05-27,Prince of Persia: The Sands of Time,6.2,4040
56,"[35, 16, 10751]",9994,en,The Great Mouse Detective,13.35,1986-07-02,The Great Mouse Detective,7.1,769
57,"[14, 18]",7980,en,The Lovely Bones,13.29,2010-01-15,The Lovely Bones,6.8,2226
58,[18],705,en,All About Eve,13.16,2000-10-06,All About Eve,8.2,685
59,"[18, 12, 37]",44264,en,True Grit,13.12,2010-12-22,True Grit,7.2,2816


In [11]:
tmdb_movies_df.shape

(26517, 9)

In [12]:
#cast the genre_ids column to str
tmdb_movies_df['genre_ids'] = tmdb_movies_df['genre_ids'].astype(str)

In [13]:
#remove the leading and trailing square brackets from each genre_ids string
tmdb_movies_df['genre_ids'] = tmdb_movies_df['genre_ids'].str.strip('[]')

In [14]:
tmdb_movies_df

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"12, 14, 10751",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.53,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.70,10788
1,"14, 12, 16, 10751",10191,en,How to Train Your Dragon,28.73,2010-03-26,How to Train Your Dragon,7.70,7610
2,"12, 28, 878",10138,en,Iron Man 2,28.52,2010-05-07,Iron Man 2,6.80,12368
3,"16, 35, 10751",862,en,Toy Story,28.00,1995-11-22,Toy Story,7.90,10174
4,"28, 878, 12",27205,en,Inception,27.92,2010-07-16,Inception,8.30,22186
...,...,...,...,...,...,...,...,...,...
26512,"27, 18",488143,en,Laboratory Conditions,0.60,2018-10-13,Laboratory Conditions,0.00,1
26513,"18, 53",485975,en,_EXHIBIT_84xxx_,0.60,2018-05-01,_EXHIBIT_84xxx_,0.00,1
26514,"14, 28, 12",381231,en,The Last One,0.60,2018-10-01,The Last One,0.00,1
26515,"10751, 12, 28",366854,en,Trailer Made,0.60,2018-06-22,Trailer Made,0.00,1


In [15]:
#turn each comma-separated genre_ids string into a list of id strings
#empty (falsy) values are passed through unchanged instead of being split

tmdb_movies_df['genre_ids'] = tmdb_movies_df['genre_ids'].apply(
    lambda raw_ids: raw_ids if not raw_ids else raw_ids.split(", ")
)

tmdb_movies_df.head()

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.53,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.73,2010-03-26,How to Train Your Dragon,7.7,7610
2,"[12, 28, 878]",10138,en,Iron Man 2,28.52,2010-05-07,Iron Man 2,6.8,12368
3,"[16, 35, 10751]",862,en,Toy Story,28.0,1995-11-22,Toy Story,7.9,10174
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [16]:
#gather every distinct genre id that appears in the genre_ids column into one set
#rows whose genre_ids value is empty contribute nothing

all_genres = set().union(*(ids for ids in tmdb_movies_df['genre_ids'] if ids))

In [17]:
all_genres

{'10402',
 '10749',
 '10751',
 '10752',
 '10770',
 '12',
 '14',
 '16',
 '18',
 '27',
 '28',
 '35',
 '36',
 '37',
 '53',
 '80',
 '878',
 '9648',
 '99'}

In [18]:
#add one column per genre id in all_genres, initialised to 0.0 for every film

for genre in all_genres:
    #a scalar float is broadcast down the whole column
    tmdb_movies_df[genre] = 0.0

tmdb_movies_df.head() #preview the first 5 rows with the new columns

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,16,28,14,10749,18,878,10751,10402,36,10752,80,35,9648,37,12,53,10770,27,99
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.53,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.73,2010-03-26,How to Train Your Dragon,7.7,7610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[12, 28, 878]",10138,en,Iron Man 2,28.52,2010-05-07,Iron Man 2,6.8,12368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"[16, 35, 10751]",862,en,Toy Story,28.0,1995-11-22,Toy Story,7.9,10174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
#set the genre value to 1 if the film is of that genre
#done one genre column at a time with a boolean mask, rather than writing into the
#dataframe row by row while iterating over it with iterrows()

genre_lists = tmdb_movies_df['genre_ids']
genres_present = {genre for ids in genre_lists if ids for genre in ids} #every genre id used by at least one film

for genre in genres_present:
    #True where the film's genre list contains this id; empty entries count as False
    has_genre = genre_lists.apply(lambda ids, g=genre: bool(ids) and g in ids)
    tmdb_movies_df.loc[has_genre, genre] = 1

tmdb_movies_df.head() #return first 5 rows of dataframe

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,16,28,14,10749,18,878,10751,10402,36,10752,80,35,9648,37,12,53,10770,27,99
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.53,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.73,2010-03-26,How to Train Your Dragon,7.7,7610,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,"[12, 28, 878]",10138,en,Iron Man 2,28.52,2010-05-07,Iron Man 2,6.8,12368,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,"[16, 35, 10751]",862,en,Toy Story,28.0,1995-11-22,Toy Story,7.9,10174,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [20]:
tmdb_genre_ids_df

Unnamed: 0,genres
0,"{'id': 28, 'name': 'Action'}"
1,"{'id': 12, 'name': 'Adventure'}"
2,"{'id': 16, 'name': 'Animation'}"
3,"{'id': 35, 'name': 'Comedy'}"
4,"{'id': 80, 'name': 'Crime'}"
5,"{'id': 99, 'name': 'Documentary'}"
6,"{'id': 18, 'name': 'Drama'}"
7,"{'id': 10751, 'name': 'Family'}"
8,"{'id': 14, 'name': 'Fantasy'}"
9,"{'id': 36, 'name': 'History'}"


In [21]:
#expand the dict stored in each row of the genres column into one column per dict key

keys = tmdb_genre_ids_df.genres.iloc[0].keys() #field names, taken from the first row's dict
genre_cols = ['genres_{}'.format(key) for key in keys] #names of the columns about to be added

for key, new_col in zip(keys, genre_cols):
    #pull this field out of every row's dict
    tmdb_genre_ids_df[new_col] = tmdb_genre_ids_df.genres.map(lambda record: record[key])

tmdb_genre_ids_df[genre_cols]

Unnamed: 0,genres_id,genres_name
0,28,Action
1,12,Adventure
2,16,Animation
3,35,Comedy
4,80,Crime
5,99,Documentary
6,18,Drama
7,10751,Family
8,14,Fantasy
9,36,History


In [22]:
#keep only the columns listed in genre_cols
tmdb_genre_cols = tmdb_genre_ids_df.loc[:, genre_cols]
tmdb_genre_cols

Unnamed: 0,genres_id,genres_name
0,28,Action
1,12,Adventure
2,16,Animation
3,35,Comedy
4,80,Crime
5,99,Documentary
6,18,Drama
7,10751,Family
8,14,Fantasy
9,36,History


In [23]:
#lookup table from genres_id to genres_name
genres_dict = {
    genre_id: genre_name
    for genre_id, genre_name in zip(tmdb_genre_cols['genres_id'], tmdb_genre_cols['genres_name'])
}

genres_dict

{28: 'Action',
 12: 'Adventure',
 16: 'Animation',
 35: 'Comedy',
 80: 'Crime',
 99: 'Documentary',
 18: 'Drama',
 10751: 'Family',
 14: 'Fantasy',
 36: 'History',
 27: 'Horror',
 10402: 'Music',
 9648: 'Mystery',
 10749: 'Romance',
 878: 'Science Fiction',
 10770: 'TV Movie',
 53: 'Thriller',
 10752: 'War',
 37: 'Western'}

In [24]:
#re-key the lookup table by the string form of each id
genres_dict = dict((str(genre_id), genre_name) for genre_id, genre_name in genres_dict.items())

In [25]:
#new dataframe whose column labels are replaced according to genres_dict
tmdb_movies_genres_df = tmdb_movies_df.rename(genres_dict, axis='columns')
tmdb_movies_genres_df

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,Animation,Action,Fantasy,Romance,Drama,Science Fiction,Family,Music,History,War,Crime,Comedy,Mystery,Western,Adventure,Thriller,TV Movie,Horror,Documentary
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.53,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.70,10788,0.00,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.73,2010-03-26,How to Train Your Dragon,7.70,7610,1.00,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
2,"[12, 28, 878]",10138,en,Iron Man 2,28.52,2010-05-07,Iron Man 2,6.80,12368,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
3,"[16, 35, 10751]",862,en,Toy Story,28.00,1995-11-22,Toy Story,7.90,10174,1.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.30,22186,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26512,"[27, 18]",488143,en,Laboratory Conditions,0.60,2018-10-13,Laboratory Conditions,0.00,1,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
26513,"[18, 53]",485975,en,_EXHIBIT_84xxx_,0.60,2018-05-01,_EXHIBIT_84xxx_,0.00,1,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00
26514,"[14, 28, 12]",381231,en,The Last One,0.60,2018-10-01,The Last One,0.00,1,0.00,1.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
26515,"[10751, 12, 28]",366854,en,Trailer Made,0.60,2018-06-22,Trailer Made,0.00,1,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00


#tmdb_movies_df.rename(columns={tmdb_genre_cols['genres_id']:tmdb_genre_cols['genres_name']})
 

keys = tmdb_genre_cols['genres_id']
values = tmdb_genre_cols['genres_name']

genre_names_list = tmdb_genre_cols['genres_name'].tolist()
genre_names_set = set(genre_names_list)
genre_names_set

#add new genre cols from all_genres set and fill with zero values using numpy with a for loop; 

for genre in genre_names_set: #for genre in all_genres set
    tmdb_movies_df[genre] = np.zeros(shape=tmdb_movies_df.shape[0]) 
        #create new genre column such that it is filled with zeroes 
        #with a shape equal to first dataframe col
                                
tmdb_movies_df.head() #return first 5 rows of dataframe

In [26]:
imdb_title_crew_df.head(10)

Unnamed: 0_level_0,directors,writers
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0285252,nm0899854,nm0899854
tt0438973,,"nm0175726,nm1802864"
tt0462036,nm1940585,nm1940585
tt0835418,nm0151540,"nm0310087,nm0841532"
tt0878654,"nm0089502,nm2291498,nm2292011",nm0284943
tt0879859,nm2416460,
tt0996958,nm2286991,"nm2286991,nm2651190"
tt0999913,nm0527109,"nm0527109,nm0329051,nm0001603,nm0930684"
tt10003792,nm10539228,nm10539228
tt10005130,nm10540239,"nm5482263,nm10540239"


In [27]:
imdb_title_principals_df.head(10)

Unnamed: 0_level_0,ordering,nconst,category,job,characters
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt0111414,1,nm0246005,actor,,"[""The Man""]"
tt0111414,2,nm0398271,director,,
tt0111414,3,nm3739909,producer,producer,
tt0323808,10,nm0059247,editor,,
tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"
tt0323808,2,nm2694680,actor,,"[""Steve Thomson""]"
tt0323808,3,nm0574615,actor,,"[""Sir Lachlan Morrison""]"
tt0323808,4,nm0502652,actress,,"[""Lady Delia Morrison""]"
tt0323808,5,nm0362736,director,,
tt0323808,6,nm0811056,producer,producer,


In [28]:
imdb_title_akas_df.head(50)

Unnamed: 0_level_0,ordering,title,region,language,types,attributes,is_original_title
title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tt0369610,10,Джурасик свят,BG,bg,,,0.0
tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0
tt0369610,12,Jurassic World: O Mundo dos Dinossauros,BR,,imdbDisplay,,0.0
tt0369610,13,O Mundo dos Dinossauros,BR,,,short title,0.0
tt0369610,14,Jurassic World,FR,,imdbDisplay,,0.0
tt0369610,15,Jurassic World,GR,,imdbDisplay,,0.0
tt0369610,16,Jurassic World,IT,,imdbDisplay,,0.0
tt0369610,17,Jurski svijet,HR,,imdbDisplay,,0.0
tt0369610,18,Olam ha'Yura,IL,he,imdbDisplay,,0.0
tt0369610,19,Jurassic World: Mundo Jurásico,MX,,imdbDisplay,,0.0


In [29]:
imdb_name_basics_df.head(10)

Unnamed: 0_level_0,primary_name,birth_year,death_year,primary_profession,known_for_titles
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"
nm0062879,Ruel S. Bayani,,,"director,production_manager,miscellaneous","tt2590280,tt0352080,tt0216559,tt2057445"
nm0063198,Bayou,,,actor,"tt6579724,tt0093116"
nm0063432,Stevie Be-Zet,,,"composer,soundtrack","tt3106212,tt0478239,tt0264917,tt1626606"
nm0063618,Jeff Beal,1963.0,,"composer,music_department,soundtrack","tt0183659,tt2545118,tt0384766,tt1856010"
nm0063750,Lindsay Beamish,,,"actress,miscellaneous","tt0404826,tt0111756,tt0367027,tt1492842"


In [30]:
imdb_title_basics_df.head(50)

Unnamed: 0_level_0,primary_title,original_title,start_year,runtime_minutes,genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"
tt0111414,A Thin Life,A Thin Life,2018,75.0,Comedy
tt0112502,Bigfoot,Bigfoot,2017,,"Horror,Thriller"
tt0137204,Joe Finds Grace,Joe Finds Grace,2017,83.0,"Adventure,Animation,Comedy"
tt0139613,O Silêncio,O Silêncio,2012,,"Documentary,History"
tt0144449,Nema aviona za Zagreb,Nema aviona za Zagreb,2012,82.0,Biography


## The Numbers dataset

### tn_movie_budgets_df

In [31]:
type(tn_movie_budgets_df) #call type of variable

pandas.core.frame.DataFrame

In [32]:
tn_movie_budgets_df.shape #return the dimensions of the dataframe array

(5782, 5)

In [33]:
tn_movie_budgets_df.info() #return basic summary of dataframe

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5782 entries, 1 to 82
Data columns (total 5 columns):
release_date         5782 non-null object
movie                5782 non-null object
production_budget    5782 non-null object
domestic_gross       5782 non-null object
worldwide_gross      5782 non-null object
dtypes: object(5)
memory usage: 271.0+ KB


In [34]:
tn_movie_budgets_df.isna().sum() #sum of all the NaN values in each col

release_date         0
movie                0
production_budget    0
domestic_gross       0
worldwide_gross      0
dtype: int64

In [35]:
tn_movie_budgets_df.head() #call first five rows

Unnamed: 0_level_0,release_date,movie,production_budget,domestic_gross,worldwide_gross
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [36]:
#define function called convert_amt_to_int
#cast datatype of col as string [added to avoid error], remove $, and change datatype to int64

def convert_amt_to_int(df, col):
    """Convert a currency-formatted column of ``df`` to int64, in place.

    Every value is cast to ``str`` first, so the function can safely be re-run
    on a column that has already been converted. Literal ``$`` and ``,``
    characters are then removed and the result is cast to ``int64``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    col : str
        Name of the column to convert, e.g. values like ``"$425,000,000"``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If a value is not a valid integer once ``$`` and ``,`` are removed.
    """
    as_text = df[col].astype('str')
    # regex=False makes both replacements literal: "$" is a regex anchor, and the
    # default value of `regex` differs between pandas versions.
    stripped = as_text.str.replace("$", "", regex=False).str.replace(",", "", regex=False)
    df[col] = stripped.astype('int64')
    return df

In [37]:
#columns to run through convert_amt_to_int, one after another

money_cols = ['production_budget', 'domestic_gross', 'worldwide_gross']

for money_col in money_cols:
    tn_movie_budgets_df = convert_amt_to_int(df=tn_movie_budgets_df, col=money_col)

In [38]:
tn_movie_budgets_df.info() #return basic summary of dataframe to confirm datatype changes

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5782 entries, 1 to 82
Data columns (total 5 columns):
release_date         5782 non-null object
movie                5782 non-null object
production_budget    5782 non-null int64
domestic_gross       5782 non-null int64
worldwide_gross      5782 non-null int64
dtypes: int64(3), object(2)
memory usage: 271.0+ KB


In [39]:
#films ordered from highest to lowest worldwide gross
tn_movie_sort = tn_movie_budgets_df.sort_values('worldwide_gross', ascending=False)
tn_movie_sort

Unnamed: 0_level_0,release_date,movie,production_budget,domestic_gross,worldwide_gross
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"Dec 18, 2009",Avatar,425000000,760507625,2776345279
43,"Dec 19, 1997",Titanic,200000000,659363944,2208208395
6,"Dec 18, 2015",Star Wars Ep. VII: The Force Awakens,306000000,936662225,2053311220
7,"Apr 27, 2018",Avengers: Infinity War,300000000,678815482,2048134200
34,"Jun 12, 2015",Jurassic World,215000000,652270625,1648854864
...,...,...,...,...,...
75,"Dec 31, 2005",Insomnia Manica,500000,0,0
74,"Jul 17, 2012",Girls Gone Dead,500000,0,0
73,"Apr 3, 2012",Enter Nowhere,500000,0,0
72,"Dec 31, 2010",Drones,500000,0,0


lang = tmdb_mgl_df.groupby(['original_language', 'original_title']).count()

lang

# tmdb_lang_df work

In [40]:
#load tmdb_lang.csv from the working directory
#NOTE(review): no visible cell writes tmdb_lang.csv - confirm where this file is produced
tmdb_lang_df = pd.read_csv('tmdb_lang.csv')
tmdb_lang_df

Unnamed: 0.1,Unnamed: 0,id,original_title,popularity,release_date,title,vote_average,vote_count,Comedy,Drama,Mystery,Western,Thriller,Romance,History,Horror,Fantasy,Music,Adventure,Documentary,Family,War,Crime,Science Fiction,Animation,Action,TV Movie,original_language
0,0,12444,Harry Potter and the Deathly Hallows: Part 1,33.53,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.70,10788,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,English
1,1,10191,How to Train Your Dragon,28.73,2010-03-26,How to Train Your Dragon,7.70,7610,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,English
2,2,10138,Iron Man 2,28.52,2010-05-07,Iron Man 2,6.80,12368,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,English
3,3,862,Toy Story,28.00,1995-11-22,Toy Story,7.90,10174,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,English
4,4,27205,Inception,27.92,2010-07-16,Inception,8.30,22186,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,English
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26512,26512,488143,Laboratory Conditions,0.60,2018-10-13,Laboratory Conditions,0.00,1,0.00,1.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,English
26513,26513,485975,_EXHIBIT_84xxx_,0.60,2018-05-01,_EXHIBIT_84xxx_,0.00,1,0.00,1.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,English
26514,26514,381231,The Last One,0.60,2018-10-01,The Last One,0.00,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,English
26515,26515,366854,Trailer Made,0.60,2018-06-22,Trailer Made,0.00,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00,English


In [41]:
#keep only the first row for each original_title value
#NOTE(review): rows are matched on title alone, so distinct films sharing a title are collapsed too - confirm this is intended
tmdb_lang_df = tmdb_lang_df.drop_duplicates(subset='original_title', keep='first')
tmdb_lang_df

Unnamed: 0.1,Unnamed: 0,id,original_title,popularity,release_date,title,vote_average,vote_count,Comedy,Drama,Mystery,Western,Thriller,Romance,History,Horror,Fantasy,Music,Adventure,Documentary,Family,War,Crime,Science Fiction,Animation,Action,TV Movie,original_language
0,0,12444,Harry Potter and the Deathly Hallows: Part 1,33.53,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.70,10788,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,English
1,1,10191,How to Train Your Dragon,28.73,2010-03-26,How to Train Your Dragon,7.70,7610,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,English
2,2,10138,Iron Man 2,28.52,2010-05-07,Iron Man 2,6.80,12368,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,English
3,3,862,Toy Story,28.00,1995-11-22,Toy Story,7.90,10174,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,English
4,4,27205,Inception,27.92,2010-07-16,Inception,8.30,22186,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,English
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26512,26512,488143,Laboratory Conditions,0.60,2018-10-13,Laboratory Conditions,0.00,1,0.00,1.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,English
26513,26513,485975,_EXHIBIT_84xxx_,0.60,2018-05-01,_EXHIBIT_84xxx_,0.00,1,0.00,1.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,English
26514,26514,381231,The Last One,0.60,2018-10-01,The Last One,0.00,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,English
26515,26515,366854,Trailer Made,0.60,2018-06-22,Trailer Made,0.00,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00,English


In [42]:
#drop every row that has a missing value in any column
tmdb_lang_df = tmdb_lang_df.dropna()
tmdb_lang_df

Unnamed: 0.1,Unnamed: 0,id,original_title,popularity,release_date,title,vote_average,vote_count,Comedy,Drama,Mystery,Western,Thriller,Romance,History,Horror,Fantasy,Music,Adventure,Documentary,Family,War,Crime,Science Fiction,Animation,Action,TV Movie,original_language
0,0,12444,Harry Potter and the Deathly Hallows: Part 1,33.53,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.70,10788,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,English
1,1,10191,How to Train Your Dragon,28.73,2010-03-26,How to Train Your Dragon,7.70,7610,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,English
2,2,10138,Iron Man 2,28.52,2010-05-07,Iron Man 2,6.80,12368,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,English
3,3,862,Toy Story,28.00,1995-11-22,Toy Story,7.90,10174,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,English
4,4,27205,Inception,27.92,2010-07-16,Inception,8.30,22186,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,English
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26512,26512,488143,Laboratory Conditions,0.60,2018-10-13,Laboratory Conditions,0.00,1,0.00,1.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,English
26513,26513,485975,_EXHIBIT_84xxx_,0.60,2018-05-01,_EXHIBIT_84xxx_,0.00,1,0.00,1.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,English
26514,26514,381231,The Last One,0.60,2018-10-01,The Last One,0.00,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,English
26515,26515,366854,Trailer Made,0.60,2018-06-22,Trailer Made,0.00,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00,English


In [43]:
#keep only the title, language, release date and popularity columns
tmdb_all_lang_df = tmdb_lang_df[['original_title', 'original_language', 'release_date', 'popularity']]
tmdb_all_lang_df

Unnamed: 0,original_title,original_language,release_date,popularity
0,Harry Potter and the Deathly Hallows: Part 1,English,2010-11-19,33.53
1,How to Train Your Dragon,English,2010-03-26,28.73
2,Iron Man 2,English,2010-05-07,28.52
3,Toy Story,English,1995-11-22,28.00
4,Inception,English,2010-07-16,27.92
...,...,...,...,...
26512,Laboratory Conditions,English,2018-10-13,0.60
26513,_EXHIBIT_84xxx_,English,2018-05-01,0.60
26514,The Last One,English,2018-10-01,0.60
26515,Trailer Made,English,2018-06-22,0.60


In [44]:
#rows whose original_language is anything other than 'English'
is_english = tmdb_all_lang_df['original_language'] == 'English'
tmdblangne = tmdb_all_lang_df[~is_english]
tmdblangne

Unnamed: 0,original_title,original_language,release_date,popularity
17,LelleBelle,Dutch,2010-10-09,18.59
49,Tres metros sobre el cielo,Spanish,2010-12-20,13.72
70,El secreto de sus ojos,Spanish,2010-04-16,12.53
75,サマーウォーズ,Japanese,2010-10-13,12.28
79,Luftslottet som sprängdes,Swedish,2010-10-29,12.23
...,...,...,...,...
26457,Maison du bonheur,French,2018-08-24,0.60
26469,Point of No Return,Portuguese,2018-09-19,0.60
26494,La última virgen,Spanish,2018-05-26,0.60
26503,Evolution 4K,German,2018-06-12,0.60


21,756 films have English as their original language vs. 3,079 non-English-language films in the dataset.

In [45]:
#row counts per original_language, largest count first, with the language back as a regular column
tmdblangnesort = (
    tmdblangne
    .groupby('original_language')
    .count()
    .sort_values(by=['original_title'], ascending=False)
    .reset_index()
)
tmdblangnesort

Unnamed: 0,original_language,original_title,release_date,popularity
0,French,479,479,479
1,Spanish,432,432,432
2,Russian,295,295,295
3,Japanese,243,243,243
4,German,224,224,224
5,Mandarin,174,174,174
6,Hindi,168,168,168
7,Italian,118,118,118
8,Portuguese,93,93,93
9,Korean,92,92,92


In [49]:
tn_movie_budgets_df

Unnamed: 0_level_0,release_date,movie,production_budget,domestic_gross,worldwide_gross
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"Dec 18, 2009",Avatar,425000000,760507625,2776345279
2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
3,"Jun 7, 2019",Dark Phoenix,350000000,42762350,149762350
4,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,1403013963
5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747
...,...,...,...,...,...
78,"Dec 31, 2018",Red 11,7000,0,0
79,"Apr 2, 1999",Following,6000,48482,240495
80,"Jul 13, 2005",Return to the Land of Wonders,5000,1338,1338
81,"Sep 29, 2015",A Plague So Pleasant,1400,0,0


In [51]:
#foreign gross is the worldwide gross with the domestic gross taken out
tn_movie_budgets_df['foreign_gross'] = tn_movie_budgets_df['worldwide_gross'].sub(tn_movie_budgets_df['domestic_gross'])
tn_movie_budgets_df

Unnamed: 0_level_0,release_date,movie,production_budget,domestic_gross,worldwide_gross,foreign_gross
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,"Dec 18, 2009",Avatar,425000000,760507625,2776345279,2015837654
2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,804600000
3,"Jun 7, 2019",Dark Phoenix,350000000,42762350,149762350,107000000
4,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,1403013963,944008095
5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,696540365
...,...,...,...,...,...,...
78,"Dec 31, 2018",Red 11,7000,0,0,0
79,"Apr 2, 1999",Following,6000,48482,240495,192013
80,"Jul 13, 2005",Return to the Land of Wonders,5000,1338,1338,0
81,"Sep 29, 2015",A Plague So Pleasant,1400,0,0,0


In [None]:
#re-order the table by worldwide gross (highest first) and give it a fresh 0..n-1 index
tn_movie_budgets_df = (
    tn_movie_budgets_df
    .sort_values(by=['worldwide_gross'], ascending=False)
    .reset_index(drop=True)
)
#.loc slicing is label-based and inclusive, so :9 selects the ten rows labelled 0 through 9
top10grosseng = tn_movie_budgets_df.loc[:9, ['movie', 'worldwide_gross']]
top10grosseng

In [57]:
#re-order the table by foreign gross (highest first) and give it a fresh 0..n-1 index
tn_movie_budgets_df = (
    tn_movie_budgets_df
    .sort_values(by=['foreign_gross'], ascending=False)
    .reset_index(drop=True)
)
#.loc slicing is label-based and inclusive, so :9 selects the ten rows labelled 0 through 9
top10foreng = tn_movie_budgets_df.loc[:9, ['movie', 'foreign_gross']]
top10foreng

Unnamed: 0,movie,foreign_gross
0,Avatar,2015837654
1,Titanic,1548844451
2,Avengers: Infinity War,1369318718
3,Furious 7,1165715774
4,Star Wars Ep. VII: The Force Awakens,1116648995
5,The Fate of the Furious,1009081502
6,Jurassic World,996584239
7,Harry Potter and the Deathly Hallows: Part II,960500000
8,Avengers: Age of Ultron,944008095
9,The Avengers,894656350


In [58]:
top10foreng.loc[:,'foreign_gross'].sum()

12021195778