<font size="+3"><strong>Analyzing Movie Data and Predicting Box Office Performance with the TMDb API</strong></font>

In [1]:
# Import libraries.
import time
import requests
import pandas as pd

# 1 Importation

In [2]:
def get_movies(api_key, base_url, params):
    """Request data from TMDb API.
    
    Parameters
    ----------
    api_key: string, 
        API key.
    
    base_url: string, 
        base URL for API request.
    
    params: dictionary, 
        list of tuples or bytes to send in the query string.
    
    Return
    ------
    movies: json, 
        lists of movies imported
    """
    
    # Initialize an empty list to store the movie data
    movies = []
    
    # Set a flag to indicate whether there are more pages to request
    more_pages = True
    
    # Start time and iteration index
    start_time = time.time()

    while more_pages:        
        # Make the API request
        response = requests.get(f'{base_url}discover/movie', params=params)

        # Check the status code to make sure the request was successful
        if response.status_code == 200:
            # Loop through the results and make a separate API request for each movie
            for movie in response.json()['results']:
                # Set the movie ID for the API request
                movie_id = movie['id']

                # Make the API request for the movie details
                movie_response = requests.get(f'{base_url}movie/{movie_id}', params={'api_key': api_key})

                # Check the status code to make sure the request was successful
                if movie_response.status_code == 200:
                    # Add the movie data to the list
                    movies.append(movie_response.json())

            # Update the page number for the next request
            params['page'] += 1

            # Check if there are more pages to request
            if params['page'] > response.json()['total_pages']:
                more_pages = False
        else:
            print(f'Request failed with status code {response.status_code}')
            break

        # Verbose printing
        if params['page'] % 25 == 0:
            print(f"Page {params['page']}. Load {len(movies)} movies. Elapsed time {time.time() - start_time} seconds.")

    return movies

In [3]:
# API key
api_key = "06af3e1b654bb9481c777bde394b620b"
# Set the base URL for the TMDB API
base_url = 'https://api.themoviedb.org/3/'
# Set the parameters for the API request
params = {
    'api_key': api_key,
    'language': 'en-US',
    'include_adult': 'false',
    'sort_by': 'popularity.desc',  # most popular movies
    'primary_release_date.gte': '1900-01-01',  # start date
    'primary_release_date.lte': '2015-12-31',  # end date
    'page': 1
}

movies = get_movies(api_key, base_url, params)

Page 25. Load 480 movies. Elapsed time 93.0046534538269 seconds.
Page 50. Load 980 movies. Elapsed time 189.97506618499756 seconds.
Page 75. Load 1480 movies. Elapsed time 286.50015687942505 seconds.
Page 100. Load 1980 movies. Elapsed time 381.6607096195221 seconds.
Page 125. Load 2480 movies. Elapsed time 476.96944546699524 seconds.
Page 150. Load 2980 movies. Elapsed time 573.2153375148773 seconds.
Page 175. Load 3480 movies. Elapsed time 668.552496433258 seconds.
Page 200. Load 3980 movies. Elapsed time 763.9174160957336 seconds.
Page 225. Load 4480 movies. Elapsed time 860.8392179012299 seconds.
Page 250. Load 4980 movies. Elapsed time 960.2945592403412 seconds.
Page 275. Load 5479 movies. Elapsed time 1057.0364940166473 seconds.
Page 300. Load 5979 movies. Elapsed time 1153.9656944274902 seconds.
Page 325. Load 6479 movies. Elapsed time 1251.696575641632 seconds.
Page 350. Load 6979 movies. Elapsed time 1349.213386297226 seconds.
Page 375. Load 7479 movies. Elapsed time 1448.0404

In [4]:
# Turn into DataFrame and save as pickle for further use.
# To avoid having to run again all the API requests.
movies_df = pd.DataFrame(movies)
movies_df.to_pickle("./data/movies_tmdb.pkl")