In [4]:
# For reading configuration/secrets from the environment
import os

# For data processing
import numpy as np
import pandas as pd

# For API usage
import requests as r

# For progress bar
from tqdm import tqdm

In [5]:
# Read the TMDB key from the environment so the secret is never stored in
# (or committed with) the notebook.
API_KEY = os.environ.get('TMDB_API_KEY')
if not API_KEY:
    raise RuntimeError("Set the TMDB_API_KEY environment variable to your TMDB API key.")

DISCOVER_URL = "https://api.themoviedb.org/3/discover/movie"
MAX_PAGES = 9         # Upper bound on the no. of discover pages to collect
REQUEST_TIMEOUT = 10  # Seconds before a stalled request is abandoned

# Relevant movie columns
cols = ['title', 'id', 'popularity', 'vote_count', 'vote_average',
        'release_date', 'original_language', 'adult']


def _discover_page(page):
    """Download one page of the revenue-sorted discover listing.

    Parameters
    ----------
    page : int
        1-based page number to request.

    Returns
    -------
    dict or None
        The decoded JSON body, or None when the request fails, returns an
        HTTP error status, or does not decode to a JSON object.
    """
    query = {
        'api_key': API_KEY,
        'language': 'en-US',
        'sort_by': 'revenue.desc',
        'include_video': 'false',
        'page': page,
    }
    try:
        resp = r.get(DISCOVER_URL, params=query, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        payload = resp.json()
    except (r.RequestException, ValueError) as exc:
        # Report only the exception type: the full message embeds the request
        # URL, which contains the API key.
        tqdm.write(f"Skipping discover page {page}: {type(exc).__name__}")
        return None
    return payload if isinstance(payload, dict) else None


# Find number of pages in the API call (page 1 is reused below, not re-fetched)
first_page = _discover_page(1)
if first_page is None or 'total_pages' not in first_page:
    raise RuntimeError("Could not read 'total_pages' from the TMDB discover endpoint.")
total_pages = first_page['total_pages']

# Never ask for more pages than the API reports.
n_pages = min(MAX_PAGES, total_pages)

# Collect one frame per page and concatenate once at the end; growing a
# DataFrame inside the loop is quadratic and DataFrame.append no longer exists
# in pandas >= 2.0.
page_frames = []
for i in tqdm(range(1, n_pages + 1)):
    payload = first_page if i == 1 else _discover_page(i)
    if payload is None or 'results' not in payload:
        continue
    # Select the relevant columns
    page_frames.append(pd.DataFrame(payload['results'], columns=cols))

if page_frames:
    movie = pd.concat(page_frames, ignore_index=True)
else:
    # Every page failed: keep an empty frame with the expected schema.
    movie = pd.DataFrame(columns=cols)

more_cols = ['budget', 'genres', 'revenue', 'year_released', 'decade_released', 'keywords']


def _get_json(url, params, timeout=10):
    """GET ``url`` and return the decoded JSON object.

    Returns an empty dict when the request fails, the server answers with an
    HTTP error status, or the body is not a JSON object, so callers can treat
    every field as missing instead of aborting the whole download.
    """
    try:
        resp = r.get(url, params=params, timeout=timeout)
        resp.raise_for_status()
        payload = resp.json()
    except (r.RequestException, ValueError) as exc:
        # Report only the exception type: the full message embeds the request
        # URL, which contains the API key.
        tqdm.write(f"Request to {url} failed: {type(exc).__name__}")
        return {}
    return payload if isinstance(payload, dict) else {}


def _nan_if_missing(amount):
    """Map an absent or zero amount to NaN (a stored 0 is treated as unknown)."""
    return amount if amount else np.nan


def _decade_label(year):
    """Return the decade bucket for ``year``.

    Years before 1960 share one bucket, and everything from 2020 onwards is
    labelled '2020s'.
    """
    if year < 1960:
        return 'Before 1960s'
    return f"{min(year, 2020) // 10 * 10}s"


def _movie_stats(movie_id):
    """Fetch the per-movie fields that the discover listing does not provide.

    Parameters
    ----------
    movie_id : int
        TMDB identifier of the movie.

    Returns
    -------
    dict
        One value per name in ``more_cols``; any field that is missing or
        malformed in the API response is NaN.
    """
    # Parse the details response once and reuse it for every field.
    details = _get_json(f'https://api.themoviedb.org/3/movie/{movie_id}',
                        {'api_key': API_KEY, 'language': 'en-US'})

    try:
        genres = [item['name'] for item in details['genres']]
    except (KeyError, TypeError):
        genres = np.nan

    try:
        # release_date is sliced as 'YYYY...'; an empty or absent date falls
        # through to NaN.
        year_movie = int(details['release_date'][0:4])
        decade_num = _decade_label(year_movie)
    except (KeyError, TypeError, ValueError):
        year_movie = decade_num = np.nan

    # A failed keywords lookup yields no keywords rather than aborting the run.
    kw_payload = _get_json(f'https://api.themoviedb.org/3/movie/{movie_id}/keywords',
                           {'api_key': API_KEY})
    try:
        keywords = [kw['name'] for kw in kw_payload.get('keywords', [])]
    except (KeyError, TypeError):
        keywords = []

    return {
        'budget': _nan_if_missing(details.get('budget')),
        'genres': genres,
        'revenue': _nan_if_missing(details.get('revenue')),
        'year_released': year_movie,
        'decade_released': decade_num,
        'keywords': keywords if keywords else np.nan,
    }


# Get each movie's other data that is not found in the discover listing
rows = [_movie_stats(movie_id) for movie_id in tqdm(movie['id'])]

# Create a DF containing all the data for all movies
stats = pd.DataFrame(rows, columns=more_cols)

# `movie` has a fresh 0..n-1 index and `rows` was built in the same order,
# so a column-wise concat lines the two frames up row by row.
df = pd.concat([movie, stats], axis=1)

df


100%|██████████| 9/9 [00:02<00:00,  3.03it/s]
100%|██████████| 180/180 [00:10<00:00, 16.88it/s]


Unnamed: 0,title,id,popularity,vote_count,vote_average,release_date,original_language,adult,budget,genres,revenue,year_released,decade_released,keywords
0,Watch Your Six,1072246,0.600,1,10.0,,es,False,1.0,"[Action, Thriller, Comedy, Drama]",4999999999,,,
1,Avatar,19995,429.684,28636,7.6,2009-12-15,en,False,237000000.0,"[Action, Adventure, Fantasy, Science Fiction]",2920357254,2009.0,2000s,"[culture clash, future, space war, space colon..."
2,Avengers: Endgame,299534,164.067,22818,8.3,2019-04-24,en,False,356000000.0,"[Adventure, Science Fiction, Action]",2799439100,2019.0,2010s,"[space travel, time travel, time machine, sequ..."
3,Avatar: The Way of Water,76600,941.753,5756,7.7,2022-12-14,en,False,460000000.0,"[Science Fiction, Adventure, Action]",2293000000,2022.0,2020s,"[loss of loved one, dying and death, alien lif..."
4,Titanic,597,155.002,22565,7.9,1997-11-18,en,False,200000000.0,"[Drama, Romance]",2187463944,1997.0,1990s,"[drowning, evacuation, shipwreck, iceberg, for..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,Logan,263115,55.664,17601,7.8,2017-02-28,en,False,97000000.0,"[Action, Drama, Science Fiction]",619021436,2017.0,2010s,"[cyborg, experiment, self-destruction, mutant,..."
176,The Lost World: Jurassic Park,330,1.913,7395,6.5,1997-05-23,en,False,73000000.0,"[Adventure, Action, Science Fiction]",618638999,1997.0,1990s,"[exotic island, dna, paleontology, tyrannosaur..."
177,The Passion of the Christ,615,83.970,3724,7.4,2004-02-25,en,False,30000000.0,[Drama],611899420,2004.0,2000s,"[christianity, jewry, roman empire, suffering,..."
178,Mamma Mia!,11631,25.553,5788,7.0,2008-07-03,en,False,52000000.0,"[Comedy, Romance]",609841637,2008.0,2000s,"[single parent, parent child relationship, gre..."


In [6]:
# Print each column's dtype and non-null count to see which fields have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              180 non-null    object 
 1   id                 180 non-null    int64  
 2   popularity         180 non-null    float64
 3   vote_count         180 non-null    int64  
 4   vote_average       180 non-null    float64
 5   release_date       180 non-null    object 
 6   original_language  180 non-null    object 
 7   adult              180 non-null    bool   
 8   budget             178 non-null    float64
 9   genres             180 non-null    object 
 10  revenue            180 non-null    int64  
 11  year_released      177 non-null    float64
 12  decade_released    177 non-null    object 
 13  keywords           172 non-null    object 
dtypes: bool(1), float64(4), int64(3), object(6)
memory usage: 18.6+ KB
