In [2]:
# For data processing
import numpy as np
import pandas as pd

# For API usage
import requests as r

# For progress bar
from tqdm import tqdm

In [8]:
API_KEY = '96c7131158ec2d199c338e83be8f0fd5'

# Create empty DF to store the movie data
movie = pd.DataFrame()

resp = r.get("https://api.themoviedb.org/3/discover/movie?" \
                 f"api_key={API_KEY}&language=en-US&sort_by=revenue.desc&" \
                 "include_video=false&page=1")
# Find number of pages in the API call
total_pages = resp.json()['total_pages']

# Relevant movie columns
cols = ['title', 'id', 'vote_average','release_date', 'original_language', 'adult']
# No. of pages to collect responses from
for i in tqdm(range(1, 501)):
        resp = r.get(f"https://api.themoviedb.org/3/discover/movie?api_key={API_KEY}&language=en-US&sort_by=revenue.desc&include_video=false&page={i}")
        
        # Select the relevant columns
        try:
                data = pd.DataFrame(resp.json()['results'],columns=cols)
        except: continue

        movie = movie.append(data)
movie = movie.reset_index(drop=True)

more_cols = ['budget', 'genres', 'revenue', 'decade_released']

rows = []
for i in tqdm(range(len(movie))):
        row = []
        movie_id = movie['id'][i]
        
        
        # Get the movie's other data that is not found in all movies
        resp = r.get(f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={API_KEY}&language=en-US')


        try:
            budget = resp.json()['budget']
            if budget == 0:
                row.append(np.nan)
            else:
                row.append(budget)
        except KeyError:
            row.append(np.nan)
        try:
            genres = []
            for item in resp.json()['genres']:
                genres.append(item['name'])
            row.append(genres)
        except:
            row.append(np.nan)
        
        try:
            revenue = resp.json()['revenue']
            if revenue == 0:
                row.append(np.nan)
            else:
                row.append(revenue)
        except KeyError:
            row.append(np.nan)
        
        try:
            year_movie = int(resp.json()['release_date'][0:4])
            if year_movie >= 2020:
                decade_num = '2020s'
            elif year_movie >= 2010:
                decade_num = '2010s'
            elif year_movie >=2000:
                decade_num = '2000s'
            elif year_movie >=1990:
                decade_num = '1990s'
            elif year_movie >=1980:
                decade_num = '1980s'
            elif year_movie >=1970:
                decade_num = '1970s'
            elif year_movie >=1960:
                decade_num = '1960s'
            else:
                decade_num = 'Before 1960s'
            row.append(decade_num)
        except:
            row.append(np.nan)
        

        rows.append(row)

# Create a DF containing all the data for all movies
stats = pd.DataFrame(rows, columns=more_cols)

df = pd.concat([movie, stats], axis=1)

df


  movie = movie.append(data)
  movie = movie.append(data)
100%|██████████| 2/2 [00:00<00:00,  7.37it/s]
100%|██████████| 40/40 [00:02<00:00, 13.82it/s]


Unnamed: 0,title,id,vote_average,release_date,original_language,adult,budget,genres,revenue,decade_released
0,Watch Your Six,1072246,10.0,,es,False,1,"[Action, Thriller, Comedy, Drama]",4999999999,
1,Avatar,19995,7.6,2009-12-15,en,False,237000000,"[Action, Adventure, Fantasy, Science Fiction]",2920357254,2000s
2,Avengers: Endgame,299534,8.3,2019-04-24,en,False,356000000,"[Adventure, Science Fiction, Action]",2799439100,2010s
3,Condemned,1106123,10.0,,en,False,300,"[Horror, Mystery]",2470000000,
4,Avatar: The Way of Water,76600,7.8,2022-12-14,en,False,460000000,"[Science Fiction, Adventure, Action]",2310416014,2020s
5,Titanic,597,7.9,1997-11-18,en,False,200000000,"[Drama, Romance]",2187463944,1990s
6,Star Wars: The Force Awakens,140607,7.3,2015-12-15,en,False,245000000,"[Adventure, Action, Science Fiction, Fantasy]",2068223624,2010s
7,Avengers: Infinity War,299536,8.3,2018-04-25,en,False,300000000,"[Adventure, Action, Science Fiction]",2046239637,2010s
8,Spider-Man: No Way Home,634649,8.0,2021-12-15,en,False,200000000,"[Action, Adventure, Science Fiction]",1921847111,2020s
9,Jurassic World,135397,6.7,2015-06-12,en,False,150000000,"[Action, Adventure, Science Fiction, Thriller]",1671537444,2010s


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              40 non-null     object 
 1   id                 40 non-null     int64  
 2   vote_average       40 non-null     float64
 3   release_date       40 non-null     object 
 4   original_language  40 non-null     object 
 5   adult              40 non-null     bool   
 6   budget             40 non-null     int64  
 7   genres             40 non-null     object 
 8   revenue            40 non-null     int64  
 9   decade_released    38 non-null     object 
 10  keywords           39 non-null     object 
dtypes: bool(1), float64(1), int64(3), object(6)
memory usage: 3.3+ KB


In [None]:
df.to_csv('uncleaned_1015.csv')