# Using Kaggle API

See the Kaggle API documentation for setup instructions: https://github.com/Kaggle/kaggle-api  
You need a `kaggle.json` file for the API to work.  
You may also need to upgrade pip and other Python packages.

!pip install kaggle  
!kaggle datasets download -d rounakbanik/the-movies-dataset  
!kaggle datasets download -d tmdb/tmdb-movie-metadata  
!kaggle datasets download -d carolzhangdc/imdb-5000-movie-dataset  

# Unzip files

from zipfile import ZipFile  
ZipFile('imdb-5000-movie-dataset.zip').extractall()  
ZipFile('tmdb-movie-metadata.zip').extractall()  
ZipFile('the-movies-dataset.zip').extractall()  

# Load Packages

In [1]:
%matplotlib inline

import re
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from ast import literal_eval
from unicodedata import normalize
from dateutil.parser import parse

# Load and Clean Datasets

* TMDB

In [2]:
def _extract_names(json_text):
    """Return the 'name' value of every record in a JSON-encoded list of dicts.

    Parameters
    ----------
    json_text : str
        JSON text that decodes to a list of dicts.

    Returns
    -------
    list
        The value stored under the key 'name' for each dict that has one,
        in the original order. Dicts without a 'name' key are skipped.
    """
    return [record['name'] for record in json.loads(json_text) if 'name' in record]


# Credits table: the four columns are renamed positionally, then the
# JSON-encoded cast/crew columns are reduced to plain lists of names.
tmdf1 = pd.read_csv('tmdb_5000_credits.csv')
tmdf1.columns = ['id','title','cast','crew']
for column in ('cast', 'crew'):
    tmdf1[column] = tmdf1[column].apply(_extract_names)

# Movies table: drop the columns that are not used, then flatten every
# JSON-encoded column the same way as the credits columns.
tmdf2 = pd.read_csv('tmdb_5000_movies.csv').drop(
    columns=['budget', 'homepage', 'original_title', 'overview', 'status',
             'tagline', 'revenue', 'popularity'])
for column in ('keywords', 'production_companies', 'production_countries',
               'spoken_languages', 'genres'):
    tmdf2[column] = tmdf2[column].apply(_extract_names)

# Rename the genre label so it reads 'Sci-Fi' instead of 'Science Fiction'.
tmdf2['genres'] = tmdf2['genres'].apply(
    lambda names: [name.replace('Science Fiction', 'Sci-Fi') for name in names])

# Join credits and movie metadata; titles are lower-cased because the title
# is used as a join key further down.
df = tmdf1.merge(tmdf2, on=['id', 'title'])
df['title'] = df['title'].str.lower()

* IMDB

In [3]:
# Load the IMDB metadata, rename the title column to 'title', remove rows
# duplicated on (title, duration, color) and drop the unused columns.
imdf = (
    pd.read_csv('movie_metadata.csv')
    .rename(columns={'movie_title': 'title'})
    .drop_duplicates(subset=['title', 'duration', 'color'])
    .drop(columns=['budget', 'gross', 'facenumber_in_poster', 'movie_imdb_link'])
)

# Remove every column whose name contains 'likes' or 'reviews'.
engagement_columns = imdf.columns[imdf.columns.str.contains('likes|reviews')]
imdf = imdf.drop(columns=engagement_columns)

# Normalise titles: NFKD-normalise, trim surrounding whitespace, lower-case.
# Trimming replaces an unconditional `x[:-1]`, which removed the final
# character of every title whether or not it was padding -- presumably
# the raw titles end in a non-breaking space (TODO confirm against the CSV).
imdf['title'] = (
    imdf['title']
    .apply(lambda title: normalize('NFKD', title))
    .str.strip()
    .str.lower()
)

# Pipe-separated strings become lists; a missing plot_keywords value is
# filled with ' ' first, so it becomes the one-element list [' '].
imdf['genres'] = imdf['genres'].apply(lambda text: text.split(sep='|'))
imdf['plot_keywords'] = (
    imdf['plot_keywords'].fillna(' ').apply(lambda text: text.split(sep='|'))
)

* Merge TMDB and IMDB

In [4]:
def _unique(values):
    """Return the distinct items of ``values`` in first-seen order.

    Unlike ``list(set(values))`` the resulting order does not depend on
    string hashing, so it is identical on every kernel restart.
    """
    return list(dict.fromkeys(values))


def _as_list(value):
    """Wrap a scalar in a one-element list so it can be concatenated to a list column.

    The value is wrapped as-is, so a missing (NaN) entry yields ``[nan]``.
    """
    return [value]


def _larger(first, second):
    """Return ``first`` if ``first >= second``, otherwise ``second``.

    Kept as an explicit comparison rather than ``max()``: when ``first`` is
    NaN the comparison is False and ``second`` is returned.
    """
    return first if first >= second else second


# NOTE: this cell rebinds `df` and drops the columns it reads, so it cannot
# be re-run on its own; re-run the cells that build `df` and `imdf` first.

# Inner-join on title, then discard every title that matched more than once.
df = df.merge(imdf, on=['title'], how='inner').drop_duplicates(['title'], keep=False)

# Combine the list-valued columns from both sources and de-duplicate them.
df['genres'] = (df['genres_x'] + df['genres_y']).fillna(' ').apply(_unique)
df['cast'] = (
    df['cast']
    + df['actor_1_name'].apply(_as_list)
    + df['actor_2_name'].apply(_as_list)
    + df['actor_3_name'].apply(_as_list)
).fillna(' ').apply(_unique)
df['keywords'] = (df['keywords'] + df['plot_keywords']).fillna(' ').apply(_unique)
df['language'] = (
    df['language'].apply(_as_list)
    + df['original_language'].apply(_as_list)
    + df['spoken_languages']
).fillna(' ').apply(_unique)
df['crew'] = (df['crew'] + df['director_name'].apply(_as_list)).fillna(' ').apply(_unique)

# Year: take the later of title_year and the year parsed from release_date
# (a missing release_date is treated as 1880).
release_year = df['release_date'].fillna('1880').apply(lambda text: parse(text).year)
df['title_year'] = df['title_year'].combine(release_year, _larger)

# Runtime: take the longer of runtime and duration, treating missing as 0.
df['runtime'] = df['runtime'].fillna(0).combine(df['duration'].fillna(0), _larger)

# The source columns have been folded into the combined ones above.
df = df.drop(columns=['genres_x', 'genres_y', 'actor_1_name', 'actor_2_name',
                      'actor_3_name', 'plot_keywords', 'original_language',
                      'spoken_languages', 'director_name', 'release_date',
                      'duration'])

# A runtime of 0 means neither source had a value; mark it as missing.
df['runtime'] = df['runtime'].replace(0, np.nan)
df['title_year'] = df['title_year'].astype('int64')