In [1]:
# Import libraries.
import os
import time

import pandas as pd
import requests

### 1 Importation

In [4]:
def get_movies(api_key, base_url, params, iteration_start=1, timeout=30, max_pages=None):
    """Download detailed movie records from the TMDb API.

    Pages through the ``discover/movie`` listing and, for every movie listed,
    requests the full record from ``movie/<id>``.

    Parameters
    ----------
    api_key: string,
        API key, sent with every per-movie detail request.

    base_url: string,
        base URL for API request (must end with a slash).

    params: dictionary,
        query-string parameters for the ``discover/movie`` request.
        ``params['page']`` is the first page requested (set to 1 when
        missing) and is advanced IN PLACE after every completed page, so
        after the call it holds the next page to request. Passing the same
        dictionary again therefore resumes where the previous call stopped.

    iteration_start: int, default 1
        initial value of the progress counter; a progress line is printed
        whenever the counter is a multiple of 25.

    timeout: float, default 30
        seconds to wait for each HTTP response; without it a stalled
        connection would block forever.

    max_pages: int or None, default None
        highest page number to request. ``None`` keeps paging until the API
        reports no further pages or a request fails.
        NOTE(review): TMDb appears to reject page numbers above 500 --
        confirm against the current API documentation.

    Return
    ------
    movies: list of dict,
        one decoded JSON record per movie whose detail request succeeded.
        When a listing request fails or a network error occurs, the records
        of all fully completed pages are still returned.
    """

    movies = []
    skipped = 0

    # Start time and iteration index
    start_time = time.time()
    iteration = iteration_start

    # Default to the first page instead of raising KeyError further down.
    params.setdefault('page', 1)

    while max_pages is None or params['page'] <= max_pages:
        # Records are staged per page and committed only once the whole page
        # succeeded, so a resumed call never duplicates a half-fetched page.
        page_movies = []
        page_skipped = 0

        try:
            response = requests.get(f'{base_url}discover/movie', params=params, timeout=timeout)

            if response.status_code != 200:
                print(f'Request failed with status code {response.status_code}')
                break

            # Decode the listing once and reuse it for results and page count.
            listing = response.json()

            for movie in listing['results']:
                movie_id = movie['id']
                movie_response = requests.get(
                    f'{base_url}movie/{movie_id}',
                    params={'api_key': api_key},
                    timeout=timeout,
                )
                if movie_response.status_code == 200:
                    page_movies.append(movie_response.json())
                else:
                    page_skipped += 1
        except requests.exceptions.RequestException as error:
            # Keep what was already downloaded instead of losing it to a
            # transient network problem.
            print(f'Request failed with error: {error}')
            break

        movies.extend(page_movies)
        skipped += page_skipped

        # Update the page number for the next request
        params['page'] += 1

        # Verbose printing (the page shown is the next one to be requested)
        if iteration % 25 == 0:
            print(f"Page {params['page']}. Elapsed time {time.time() - start_time} seconds.")
        iteration += 1

        # Stop once the listing has no further pages
        if params['page'] > listing['total_pages']:
            break

    if skipped:
        print(f'{skipped} movie detail request(s) did not return status 200 and were skipped.')

    return movies

In [3]:
# API key: read from the environment so the secret is never stored in the
# notebook. Any key that was previously committed here should be revoked.
api_key = os.environ.get("TMDB_API_KEY")
if not api_key:
    raise RuntimeError("Set the TMDB_API_KEY environment variable to your TMDb API key.")
# Set the base URL for the TMDB API
base_url = 'https://api.themoviedb.org/3/'
# Set the parameters for the API request
params = {
    'api_key': api_key,
    'language': 'en-US',
    'include_adult': 'false',
    'sort_by': 'popularity.desc',  # most popular movies
    'primary_release_date.gte': '1900-01-01',  # start date
    'primary_release_date.lte': '2015-12-31',  # end date
    'page': 1
}

# Long-running download. Note that get_movies advances params['page'] in place.
movies = get_movies(api_key, base_url, params)

Iteration 25. Elapsed time 83.2518720626831 seconds.
Iteration 50. Elapsed time 166.22432231903076 seconds.
Iteration 75. Elapsed time 249.17099475860596 seconds.
Iteration 100. Elapsed time 332.4762153625488 seconds.
Iteration 125. Elapsed time 415.5822877883911 seconds.
Iteration 150. Elapsed time 498.3738663196564 seconds.
Iteration 175. Elapsed time 582.491557598114 seconds.
Iteration 200. Elapsed time 666.1178531646729 seconds.
Iteration 225. Elapsed time 748.9675574302673 seconds.
Iteration 250. Elapsed time 831.1019570827484 seconds.
Iteration 275. Elapsed time 914.5418336391449 seconds.
Iteration 300. Elapsed time 998.1842296123505 seconds.
Iteration 325. Elapsed time 1081.4362106323242 seconds.
Iteration 350. Elapsed time 1165.123994588852 seconds.
Iteration 375. Elapsed time 1248.139220714569 seconds.
Iteration 400. Elapsed time 1337.7508780956268 seconds.
Iteration 425. Elapsed time 1424.3477404117584 seconds.
Iteration 450. Elapsed time 1509.579218864441 seconds.
Iteration 

In [16]:
# Turn the downloaded records into a DataFrame and cache it on disk as a
# pickle, so later cells can reload it instead of repeating the API download.
movies_df = pd.DataFrame(movies)
# to_pickle does not create missing directories; make sure ./data exists first.
os.makedirs("./data", exist_ok=True)
movies_df.to_pickle("./data/movies_tmdb.pkl")

In [21]:
# Read the pickled movie records back into a DataFrame, report its
# dimensions and preview the first two rows.
movies_df = pd.read_pickle("./data/movies_tmdb.pkl")
print(movies_df.shape)
movies_df.iloc[:2]

(10000, 25)


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/Yc9q6QuWrMp9nuDm5R8ExNqbEq.jpg,"{'id': 87096, 'name': 'Avatar Collection', 'po...",237000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.avatar.com/movies/avatar,19995,tt0499549,en,Avatar,...,2009-12-15,2920357254,162,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Enter the world of Pandora.,Avatar,False,7.541,26969
1,False,/uEwGFGtao9YG2JolmdvtHLLVbA9.jpg,,0,"[{'id': 99, 'name': 'Documentary'}]",,111332,tt1599280,en,Avatar: Creating the World of Pandora,...,2010-02-07,0,23,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Avatar: Creating the World of Pandora,False,7.328,29


In [22]:
# Retain only the rows where both budget and revenue are strictly positive,
# then report the remaining dimensions and preview the first two rows.
movies_df = movies_df.loc[(movies_df['budget'] > 0) & (movies_df['revenue'] > 0)]
print(movies_df.shape)
movies_df.iloc[:2]

(4974, 25)


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/Yc9q6QuWrMp9nuDm5R8ExNqbEq.jpg,"{'id': 87096, 'name': 'Avatar Collection', 'po...",237000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.avatar.com/movies/avatar,19995,tt0499549,en,Avatar,...,2009-12-15,2920357254,162,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Enter the world of Pandora.,Avatar,False,7.541,26969
3,False,/Avbc5QFUFMpN6RiPgFyRB4RshUP.jpg,,123000000,"[{'id': 10751, 'name': 'Family'}, {'id': 35, '...",,8871,tt0170016,en,How the Grinch Stole Christmas,...,2000-11-15,345823040,104,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,He puts the mean in green.,How the Grinch Stole Christmas,False,6.7,6242


### 2 Analyse descriptive

In [None]:
# 

### 3 Modélisation