# Data Preparation and Cleaning

### Essential Libraries

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd

# For API usage
import requests as r

# For progress bar
from tqdm import tqdm

### API Key

In [2]:
API_KEY = '7884759e98354ed855458b12c5c2bcdc'

### Helper Functions

In [67]:
def get_casts(movie_id):
    """returns list of casts from movie id"""
    resp = r.get(f"https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={API_KEY}&language=en-US")
    casts_dict = resp.json()['cast']
    
    casts = []
    
    for item in casts_dict:
        casts.append(item['name'])
    
    return casts

def get_keywords(movie_id):
    """returns list of keywords from movie id"""
    resp = r.get(f"https://api.themoviedb.org/3/movie/{movie_id}/keywords?api_key={API_KEY}")
    keywords_dict = resp.json()['keywords']

    keywords = []

    for item in keywords_dict:
        keywords.append(item['name'])
    
    return keywords


### Data Extraction from TMDB

In [68]:
def get_movies_data():

    # Create empty DF to store movie data
    movie = pd.DataFrame()

    # Get total pages of Discover section
    resp = r.get("https://api.themoviedb.org/3/discover/movie?" \
                 f"api_key={API_KEY}&language=en-US&sort_by=revenue.desc&" \
                 "include_adult=true&include_video=false&page=1")
    total_pages = resp.json()['total_pages']

    # Relevant movie columns
    cols = ['title', 'id', 'adult', 'popularity', 'vote_count', 'vote_average','release_date', 'original_language']

    # Iterating through DB pages and appending to movie DF

    for i in tqdm(range(1,2)):
        resp = r.get(f"https://api.themoviedb.org/3/discover/movie?api_key={API_KEY}&language=en-US&sort_by=revenue.desc&include_adult=True&include_video=false&page={i}")
        
        # Select the relevant columns
        data = pd.DataFrame(resp.json()['results'])[cols]
        movie = movie.append(data)
    
    movie = movie.reset_index(drop=True)
    
    return movie

# def get_movies_stats(movie_df)

In [89]:
movie_df = get_movies_data()

cols = ['casts', 'keywords']

rows = []

for i in tqdm(range(5)):
    row = []
    movie_id = movie_df['id'][i]
    casts = get_casts(movie_id)
    keywords = get_keywords(movie_id)
    row.append(casts)
    row.append(keywords)
    rows.append(row)


stats_df = pd.DataFrame(rows, columns=cols)
# stats_df.head()
df = pd.concat([movie_df, stats_df], axis = 1)
df.head(n=10)

  movie = movie.append(data)
100%|██████████| 1/1 [00:00<00:00, 22.22it/s]
100%|██████████| 5/5 [00:00<00:00, 14.39it/s]


Unnamed: 0,title,id,adult,popularity,vote_count,vote_average,release_date,original_language,casts,keywords
0,Watch Your Six,1072246,False,0.6,1,10.0,,es,"[Strahinja Nicic, Harris Mathieson, Harry Foster]",[]
1,Avatar,19995,False,432.987,28731,7.6,2009-12-15,en,"[Sam Worthington, Zoe Saldaña, Sigourney Weave...","[culture clash, future, space war, space colon..."
2,Avengers: Endgame,299534,False,152.326,22894,8.3,2019-04-24,en,"[Robert Downey Jr., Chris Evans, Mark Ruffalo,...","[space travel, time travel, time machine, sequ..."
3,Condemned,1106123,False,1.4,1,10.0,,en,"[Desiree Ruff, Larry Wachorn, David A Atwood, ...","[moon, killing, detective, journalist, rain, m..."
4,Avatar: The Way of Water,76600,False,6789.789,6098,7.7,2022-12-14,en,"[Sam Worthington, Zoe Saldaña, Sigourney Weave...","[loss of loved one, dying and death, alien lif..."
5,Titanic,597,False,140.29,22639,7.9,1997-11-18,en,,
6,Star Wars: The Force Awakens,140607,False,55.393,17857,7.3,2015-12-15,en,,
7,Avengers: Infinity War,299536,False,241.654,26589,8.3,2018-04-25,en,,
8,Spider-Man: No Way Home,634649,False,348.363,17053,8.0,2021-12-15,en,,
9,Jurassic World,135397,False,71.208,18927,6.7,2015-06-12,en,,
