# Data Cleaning & EDA

## Importing the data

In [1]:
#necessary libraries and modules to import

import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

In [2]:
#importing batch data using os

csv_files = glob("./zippedData/*.csv.gz")
csv_files

['./zippedData\\bom.movie_gross.csv.gz',
 './zippedData\\imdb.name.basics.csv.gz',
 './zippedData\\imdb.title.akas.csv.gz',
 './zippedData\\imdb.title.basics.csv.gz',
 './zippedData\\imdb.title.crew.csv.gz',
 './zippedData\\imdb.title.principals.csv.gz',
 './zippedData\\rt.movie_info.csv.gz',
 './zippedData\\rt.reviews.csv.gz',
 './zippedData\\tmdb.movies.csv.gz',
 './zippedData\\tn.movie_budgets.csv.gz']

In [3]:
# cleaning the filenames

csv_files_dict = {} #create a dictionary of datasets
for filename in csv_files: #create a for loop to batch clean files
    filename_cleaned = os.path.basename(filename).replace(".csv", "").replace(".", "_") #remove .csv file extensions
    filename_df = pd.read_csv(filename, index_col=0, encoding='utf-8') 
    csv_files_dict[filename_cleaned] = filename_df #load .csv file as dataframe using col 1 as index and encode in utf-8 and save as the cleaned filename

In [4]:
#call dictionary keys

print(csv_files_dict.keys())

dict_keys(['bom_movie_gross_gz', 'imdb_name_basics_gz', 'imdb_title_akas_gz', 'imdb_title_basics_gz', 'imdb_title_crew_gz', 'imdb_title_principals_gz', 'rt_movie_info_gz', 'rt_reviews_gz', 'tmdb_movies_gz', 'tn_movie_budgets_gz'])


In [5]:
#loading datasets into dataframes from csv & json files

bom_movie_gross_df = csv_files_dict['bom_movie_gross_gz']
imdb_name_basics_df = csv_files_dict['imdb_name_basics_gz']
imdb_title_akas_df = csv_files_dict['imdb_title_akas_gz']
imdb_title_basics_df = csv_files_dict['imdb_title_basics_gz']
imdb_title_crew_df = csv_files_dict['imdb_title_crew_gz']
imdb_title_principals_df = csv_files_dict['imdb_title_principals_gz']
rotten_movie_info_df = csv_files_dict['rt_movie_info_gz']
rotten_movie_reviews_df = csv_files_dict['rt_reviews_gz']
tmdb_movies_df = csv_files_dict['tmdb_movies_gz']
tmdb_genre_ids_df = pd.read_json('tmdb_genre_ids.json')
tmdb_languages_df = pd.read_json('tmdb_languages.json')
tn_movie_budgets_df = csv_files_dict['tn_movie_budgets_gz']

## TMDB datasets

### tmdb_movies_df

In [6]:
type(tmdb_movies_df) #call type of variable

pandas.core.frame.DataFrame

In [7]:
tmdb_movies_df.shape #return the dimensions of the dataframe array

(26517, 9)

In [8]:
tmdb_movies_df.info() #return basic summary of dataframe

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26517 entries, 0 to 26516
Data columns (total 9 columns):
genre_ids            26517 non-null object
id                   26517 non-null int64
original_language    26517 non-null object
original_title       26517 non-null object
popularity           26517 non-null float64
release_date         26517 non-null object
title                26517 non-null object
vote_average         26517 non-null float64
vote_count           26517 non-null int64
dtypes: float64(2), int64(2), object(5)
memory usage: 2.0+ MB


In [9]:
tmdb_movies_df.isna().sum() #sum of all the NaN values in each col

genre_ids            0
id                   0
original_language    0
original_title       0
popularity           0
release_date         0
title                0
vote_average         0
vote_count           0
dtype: int64

In [10]:
tmdb_movies_df.head() #call first five rows

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


### tmdb_genre_ids_df >>> tmdb_genre_cols_df

In [11]:
type(tmdb_genre_ids_df) #call type of variable

pandas.core.frame.DataFrame

In [12]:
tmdb_genre_ids_df.shape #return the dimensions of the dataframe array

(19, 1)

In [13]:
tmdb_genre_ids_df.info() #return basic summary of dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 1 columns):
genres    19 non-null object
dtypes: object(1)
memory usage: 280.0+ bytes


In [14]:
tmdb_genre_ids_df.isna().sum() #sum of all the NaN values in each col

genres    0
dtype: int64

In [15]:
tmdb_genre_ids_df.head() #call first five rows

Unnamed: 0,genres
0,"{'id': 28, 'name': 'Action'}"
1,"{'id': 12, 'name': 'Adventure'}"
2,"{'id': 16, 'name': 'Animation'}"
3,"{'id': 35, 'name': 'Comedy'}"
4,"{'id': 80, 'name': 'Crime'}"


In [16]:
keys = tmdb_genre_ids_df.genres.iloc[0].keys() #Get dictionary keys

#Keep track of columns we make for subsequent preview
genre_cols = []

#Create a new feature for each of these keys
for key in keys:
    new_col = 'genres_{}'.format(key) #Create new column name
    tmdb_genre_ids_df[new_col] = tmdb_genre_ids_df.genres.map(lambda x: x[key]) #Create a new column
    genre_cols.append(new_col)
    
tmdb_genre_ids_df[genre_cols]

Unnamed: 0,genres_id,genres_name
0,28,Action
1,12,Adventure
2,16,Animation
3,35,Comedy
4,80,Crime
5,99,Documentary
6,18,Drama
7,10751,Family
8,14,Fantasy
9,36,History


In [17]:
#set new dataframe of genres

tmdb_genre_cols_df = tmdb_genre_ids_df[genre_cols]
tmdb_genre_cols_df

Unnamed: 0,genres_id,genres_name
0,28,Action
1,12,Adventure
2,16,Animation
3,35,Comedy
4,80,Crime
5,99,Documentary
6,18,Drama
7,10751,Family
8,14,Fantasy
9,36,History


### tmdb_languages_df

In [18]:
type(tmdb_languages_df) #call type of variable

pandas.core.frame.DataFrame

In [19]:
tmdb_languages_df.shape #return the dimensions of the dataframe array

(187, 3)

In [20]:
tmdb_languages_df.info() #return basic summary of dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 3 columns):
iso_639_1       187 non-null object
english_name    187 non-null object
name            187 non-null object
dtypes: object(3)
memory usage: 4.5+ KB


In [21]:
tmdb_languages_df.isna().sum() #sum of all the NaN values in each col

iso_639_1       0
english_name    0
name            0
dtype: int64

In [22]:
tmdb_languages_df.head() #call first five rows

Unnamed: 0,iso_639_1,english_name,name
0,xx,No Language,No Language
1,aa,Afar,
2,af,Afrikaans,Afrikaans
3,ak,Akan,
4,an,Aragonese,


## The Numbers dataset

### tn_movie_budgets_df

In [23]:
type(tn_movie_budgets_df) #call type of variable

pandas.core.frame.DataFrame

In [24]:
tn_movie_budgets_df.shape #return the dimensions of the dataframe array

(5782, 5)

In [25]:
tn_movie_budgets_df.info() #return basic summary of dataframe

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5782 entries, 1 to 82
Data columns (total 5 columns):
release_date         5782 non-null object
movie                5782 non-null object
production_budget    5782 non-null object
domestic_gross       5782 non-null object
worldwide_gross      5782 non-null object
dtypes: object(5)
memory usage: 271.0+ KB


In [26]:
tn_movie_budgets_df.isna().sum() #sum of all the NaN values in each col

release_date         0
movie                0
production_budget    0
domestic_gross       0
worldwide_gross      0
dtype: int64

In [27]:
tn_movie_budgets_df.head() #call first five rows

Unnamed: 0_level_0,release_date,movie,production_budget,domestic_gross,worldwide_gross
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [28]:
#define function called convert_amt_to_int
#cast datatype of col as string [added to avoid error], remove $, and change datatype to int64

def convert_amt_to_int(df, col):
    df[col] = df[col].astype('str').str.replace("$", "").str.replace(",", "").astype('int64') 
    return df

In [29]:
#make a list of cols to batch-change datatype with a for loop

money_cols = ['production_budget', 'domestic_gross', 'worldwide_gross']

for col in money_cols: 
    tn_movie_budgets_df = convert_amt_to_int(tn_movie_budgets_df, col)

In [30]:
tn_movie_budgets_df.info() #return basic summary of dataframe to confirm datatype changes

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5782 entries, 1 to 82
Data columns (total 5 columns):
release_date         5782 non-null object
movie                5782 non-null object
production_budget    5782 non-null int64
domestic_gross       5782 non-null int64
worldwide_gross      5782 non-null int64
dtypes: int64(3), object(2)
memory usage: 271.0+ KB


In [31]:
tn_movie_budgets_df.head() #call first 5 rows, confirming the $ has been removed

Unnamed: 0_level_0,release_date,movie,production_budget,domestic_gross,worldwide_gross
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"Dec 18, 2009",Avatar,425000000,760507625,2776345279
2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
3,"Jun 7, 2019",Dark Phoenix,350000000,42762350,149762350
4,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,1403013963
5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747


In [32]:
#re-confirm no NaN values

tn_movie_budgets_df.isna().sum()

release_date         0
movie                0
production_budget    0
domestic_gross       0
worldwide_gross      0
dtype: int64

Checking the value counts

In [33]:
#looking at the most occuring values to see if there are any weird values
#using a for loop, returning normalized values (values as a percentage of the total values) in the first 5 rows  

for col in tn_movie_budgets_df:
    print(f'Viewing values in col: {col}')
    print(f'Top 5 values:\n{tn_movie_budgets_df[col].value_counts(normalize = True)[:5]}')
    print("-------------------")

Viewing values in col: release_date
Top 5 values:
Dec 31, 2014    0.004151
Dec 31, 2015    0.003978
Dec 31, 2010    0.002594
Dec 31, 2008    0.002421
Dec 31, 2013    0.002248
Name: release_date, dtype: float64
-------------------
Viewing values in col: movie
Top 5 values:
Halloween                0.000519
King Kong                0.000519
Home                     0.000519
The Birth of a Nation    0.000346
Conan the Barbarian      0.000346
Name: movie, dtype: float64
-------------------
Viewing values in col: production_budget
Top 5 values:
20000000    0.039952
10000000    0.036666
30000000    0.030612
15000000    0.029920
25000000    0.029575
Name: production_budget, dtype: float64
-------------------
Viewing values in col: domestic_gross
Top 5 values:
0           0.094777
8000000     0.001557
2000000     0.001211
7000000     0.001211
10000000    0.001038
Name: domestic_gross, dtype: float64
-------------------
Viewing values in col: worldwide_gross
Top 5 values:
0          0.063473
80

## Merging Datasets

### All TMDB datasets

### tmdb_movies_df + tmdb_genres_col_df

Adding the genre names to the dataset

In [34]:
tmdb_movies_df.head(1)

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788


In [35]:
tmdb_genre_cols_df.head(1)

Unnamed: 0,genres_id,genres_name
0,28,Action


In [36]:
#casting values as strings

tmdb_movies_df['genre_ids'] = tmdb_movies_df['genre_ids'].astype(str)

In [37]:
#removing brackets from the column values

tmdb_movies_df['genre_ids'] = tmdb_movies_df['genre_ids'].str.strip('[]')

In [38]:
tmdb_movies_df.head() #return first 5 rows

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"12, 14, 10751",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"14, 12, 16, 10751",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,"12, 28, 878",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,"16, 35, 10751",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,"28, 878, 12",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [39]:
#split the genre column to individual genre columns
#use a lambda function to split the genre list at the comma

tmdb_movies_df['genre_ids'] = tmdb_movies_df['genre_ids'].apply(lambda x: x.split(", ") if x else x)

tmdb_movies_df.head() #return first 5 rows

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [40]:
#create a set (unordered) of all genres (no duplicate values) using a for loop

all_genres = set()
for genres in tmdb_movies_df['genre_ids']: #for all genres in the move_details_df 'genres' column
    if genres: #if in genres
        all_genres.update(genres) #add to the set called all_genres

all_genres

{'10402',
 '10749',
 '10751',
 '10752',
 '10770',
 '12',
 '14',
 '16',
 '18',
 '27',
 '28',
 '35',
 '36',
 '37',
 '53',
 '80',
 '878',
 '9648',
 '99'}

In [41]:
#add new genre cols from all_genres set and fill with zero values using numpy with a for loop; 

for genre in all_genres: #for genre in all_genres set
    tmdb_movies_df[genre] = np.zeros(shape=tmdb_movies_df.shape[0]) 
        #create new genre column such that it is filled with zeroes 
        #with a shape equal to first dataframe col
                                
tmdb_movies_df.head() #return first row of dataframe

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,14,...,10402,10752,99,10770,16,12,27,878,35,80
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
#set the genre value to 1 if the film is of that genre with a nested for loop

for index, row in tmdb_movies_df.iterrows(): #for index, row position in dataframe, iterating throw rows
    if row['genre_ids']: #if in genres col
        for genre in row['genre_ids']: #for genre in genres col
            tmdb_movies_df.loc[index, genre] = 1 #set the value of each matching index/genre position to 1

tmdb_movies_df.head() #return first 5 rows of dataframe

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,14,...,10402,10752,99,10770,16,12,27,878,35,80
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [43]:
tmdb_genre_cols_df.head()

Unnamed: 0,genres_id,genres_name
0,28,Action
1,12,Adventure
2,16,Animation
3,35,Comedy
4,80,Crime


In [44]:
#create a dictionary of genre ids to genre names in tmdb_genre_cols_df

genres_dict = dict(zip(tmdb_genre_cols_df['genres_id'], tmdb_genre_cols_df['genres_name']))

genres_dict

{28: 'Action',
 12: 'Adventure',
 16: 'Animation',
 35: 'Comedy',
 80: 'Crime',
 99: 'Documentary',
 18: 'Drama',
 10751: 'Family',
 14: 'Fantasy',
 36: 'History',
 27: 'Horror',
 10402: 'Music',
 9648: 'Mystery',
 10749: 'Romance',
 878: 'Science Fiction',
 10770: 'TV Movie',
 53: 'Thriller',
 10752: 'War',
 37: 'Western'}

In [45]:
#iterate through the dictionary to cast all keys as strings using list comprehension

genres_dict = {str(key): value for key, value in genres_dict.items()}

In [46]:
#rename genre columns replacing id numbers with names

tmdb_movies_genres_df = tmdb_movies_df.rename(columns=genres_dict)

tmdb_movies_genres_df.head()

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,Fantasy,...,Music,War,Documentary,TV Movie,Animation,Adventure,Horror,Science Fiction,Comedy,Crime
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [47]:
#drop genre_ids column

tmdb_movies_genres_df = tmdb_movies_genres_df.drop(['genre_ids'], axis=1)

tmdb_movies_genres_df.head()

Unnamed: 0,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,Fantasy,Romance,...,Music,War,Documentary,TV Movie,Animation,Adventure,Horror,Science Fiction,Comedy,Crime
0,12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


### + tmdb_languages_df

In [48]:
tmdb_languages_df.head()

Unnamed: 0,iso_639_1,english_name,name
0,xx,No Language,No Language
1,aa,Afar,
2,af,Afrikaans,Afrikaans
3,ak,Akan,
4,an,Aragonese,


In [49]:
tmdb_languages_df.keys()

Index(['iso_639_1', 'english_name', 'name'], dtype='object')

In [50]:
#merge dataframes using different columns

tmdb_mgl_df = pd.merge(tmdb_movies_genres_df, tmdb_languages_df, 
                       left_on = ['original_language'], right_on = ['iso_639_1'],
                       how = 'left')

In [51]:
tmdb_mgl_df.head()

Unnamed: 0,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,Fantasy,Romance,...,TV Movie,Animation,Adventure,Horror,Science Fiction,Comedy,Crime,iso_639_1,english_name,name
0,12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,en,English,English
1,10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,en,English,English
2,10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,en,English,English
3,862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,en,English,English
4,27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,en,English,English


In [52]:
tmdb_mgl_df = tmdb_mgl_df.drop(['original_language', 'iso_639_1', 'name'], axis=1)

In [53]:
tmdb_mgl_df = tmdb_mgl_df.rename(columns={'english_name': 'original_language'})

In [54]:
tmdb_mgl_df.head()

Unnamed: 0,id,original_title,popularity,release_date,title,vote_average,vote_count,Fantasy,Romance,Drama,...,War,Documentary,TV Movie,Animation,Adventure,Horror,Science Fiction,Comedy,Crime,original_language
0,12444,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,English
1,10191,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,English
2,10138,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,English
3,862,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,English
4,27205,Inception,27.92,2010-07-16,Inception,8.3,22186,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,English


### + tn_movie_budgets_df

In [55]:
tn_movie_budgets_df.head()

Unnamed: 0_level_0,release_date,movie,production_budget,domestic_gross,worldwide_gross
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"Dec 18, 2009",Avatar,425000000,760507625,2776345279
2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
3,"Jun 7, 2019",Dark Phoenix,350000000,42762350,149762350
4,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,1403013963
5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747


In [56]:
#merge dataframes using different columns

tmdb_mglb_df = pd.merge(tmdb_mgl_df, tn_movie_budgets_df, 
                       left_on = ['original_title'], right_on = ['movie'],
                       how = 'left')

In [57]:
tmdb_mglb_df.head()

Unnamed: 0,id,original_title,popularity,release_date_x,title,vote_average,vote_count,Fantasy,Romance,Drama,...,Horror,Science Fiction,Comedy,Crime,original_language,release_date_y,movie,production_budget,domestic_gross,worldwide_gross
0,12444,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,English,,,,,
1,10191,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,English,"Mar 26, 2010",How to Train Your Dragon,165000000.0,217581232.0,494870992.0
2,10138,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,English,"May 7, 2010",Iron Man 2,170000000.0,312433331.0,621156389.0
3,862,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,English,"Nov 22, 1995",Toy Story,30000000.0,191796233.0,364545516.0
4,27205,Inception,27.92,2010-07-16,Inception,8.3,22186,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,English,"Jul 16, 2010",Inception,160000000.0,292576195.0,835524642.0


### Adding Features

#### Domestic & Worldwide Net (Gross - Production Budget)

In [58]:
#add a new column 'worldwide_budget_gross_diff' by subtracting 'production_budget' from 'worldwide_gross' cols 

tmdb_mglb_df['worldwide_budget_gross_diff'] = tmdb_mglb_df['worldwide_gross'] - tmdb_mglb_df['production_budget']

tn_movie_budgets_df.head() #call first 5 rows

Unnamed: 0,id,original_title,popularity,release_date_x,title,vote_average,vote_count,Fantasy,Romance,Drama,...,Science Fiction,Comedy,Crime,original_language,release_date_y,movie,production_budget,domestic_gross,worldwide_gross,worldwide_budget_gross_diff
0,12444,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,1.0,0.0,0.0,...,0.0,0.0,0.0,English,,,,,,
1,10191,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,1.0,0.0,0.0,...,0.0,0.0,0.0,English,"Mar 26, 2010",How to Train Your Dragon,165000000.0,217581232.0,494870992.0,329870992.0
2,10138,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,0.0,0.0,0.0,...,1.0,0.0,0.0,English,"May 7, 2010",Iron Man 2,170000000.0,312433331.0,621156389.0,451156389.0
3,862,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,0.0,0.0,0.0,...,0.0,1.0,0.0,English,"Nov 22, 1995",Toy Story,30000000.0,191796233.0,364545516.0,334545516.0
4,27205,Inception,27.92,2010-07-16,Inception,8.3,22186,0.0,0.0,0.0,...,1.0,0.0,0.0,English,"Jul 16, 2010",Inception,160000000.0,292576195.0,835524642.0,675524642.0


In [59]:
#add a new column 'domestic_budget_gross_diff' by subtracting 'production_budget' from 'domestic_gross' cols 

tmdb_mglb_df['domestic_budget_gross_diff'] = tmdb_mglb_df['domestic_gross'] - tmdb_mglb_df['production_budget']

tmdb_mglb_df.head(10) #call first 10 rows

Unnamed: 0,id,original_title,popularity,release_date_x,title,vote_average,vote_count,Fantasy,Romance,Drama,...,Comedy,Crime,original_language,release_date_y,movie,production_budget,domestic_gross,worldwide_gross,worldwide_budget_gross_diff,domestic_budget_gross_diff
0,12444,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,1.0,0.0,0.0,...,0.0,0.0,English,,,,,,,
1,10191,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,1.0,0.0,0.0,...,0.0,0.0,English,"Mar 26, 2010",How to Train Your Dragon,165000000.0,217581232.0,494871000.0,329871000.0,52581232.0
2,10138,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,0.0,0.0,0.0,...,0.0,0.0,English,"May 7, 2010",Iron Man 2,170000000.0,312433331.0,621156400.0,451156400.0,142433331.0
3,862,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,0.0,0.0,0.0,...,1.0,0.0,English,"Nov 22, 1995",Toy Story,30000000.0,191796233.0,364545500.0,334545500.0,161796233.0
4,27205,Inception,27.92,2010-07-16,Inception,8.3,22186,0.0,0.0,0.0,...,0.0,0.0,English,"Jul 16, 2010",Inception,160000000.0,292576195.0,835524600.0,675524600.0,132576195.0
5,32657,Percy Jackson & the Olympians: The Lightning T...,26.691,2010-02-11,Percy Jackson & the Olympians: The Lightning T...,6.1,4229,1.0,0.0,0.0,...,0.0,0.0,English,"Feb 12, 2010",Percy Jackson & the Olympians: The Lightning T...,95000000.0,88768303.0,223050900.0,128050900.0,-6231697.0
6,19995,Avatar,26.526,2009-12-18,Avatar,7.4,18676,1.0,0.0,0.0,...,0.0,0.0,English,"Dec 18, 2009",Avatar,425000000.0,760507625.0,2776345000.0,2351345000.0,335507625.0
7,10193,Toy Story 3,24.445,2010-06-17,Toy Story 3,7.7,8340,0.0,0.0,0.0,...,1.0,0.0,English,"Jun 18, 2010",Toy Story 3,200000000.0,415004880.0,1068880000.0,868879500.0,215004880.0
8,20352,Despicable Me,23.673,2010-07-09,Despicable Me,7.2,10057,0.0,0.0,0.0,...,1.0,0.0,English,"Jul 9, 2010",Despicable Me,69000000.0,251513985.0,543464600.0,474464600.0,182513985.0
9,38055,Megamind,22.855,2010-11-04,Megamind,6.8,3635,0.0,0.0,0.0,...,1.0,0.0,English,"Nov 5, 2010",Megamind,130000000.0,148415853.0,321887200.0,191887200.0,18415853.0


genreprof = tmdb_mgl_df.groupby(['original_language', 'original_title']).count()

lang

## By Genre, which films yield highest net profit?

short description

### Genre performance by year

    * Review net loss
    * Review net gain

### Genre performance by runtime

    * Review net loss
    * Review net gain

## Data Cleaning

### Dealing with datatypes

### Checking the value counts 

### Result: By Genre, which films yield the highest net profit?

## Data Visualization