In [1]:
# Import libraries.
import os
import time

import pandas as pd
import requests
from sklearn.impute import SimpleImputer

### 1  Importation

In [2]:
def get_movies(api_key, base_url, params):
    """Download detailed movie records from the TMDb API.

    Pages through the ``discover/movie`` endpoint and, for every movie
    listed on a page, requests ``movie/{id}`` to get its detail record.

    Parameters
    ----------
    api_key: string, 
        API key, sent with every per-movie detail request.
    
    base_url: string, 
        base URL for API request. Endpoint paths are appended directly,
        so it must end with a slash.
    
    params: dictionary, 
        query-string parameters for the discover request. The dictionary
        is copied, so the caller's object is left unchanged. Paging starts
        at ``params['page']``, or at page 1 when that key is absent.
    
    Return
    ------
    movies: list of dict, 
        one decoded JSON detail record per movie whose detail request
        returned status 200; other movies are skipped.
    """

    # Work on a copy: the page counter is advanced below and the caller's
    # dictionary must not change as a side effect of calling this function.
    query = dict(params)
    query.setdefault('page', 1)

    # Collected movie detail records.
    movies = []

    # Flag indicating whether there are more pages to request.
    more_pages = True

    # Start time, used only for the progress message.
    start_time = time.time()

    while more_pages:
        # Request one page of the discover listing.
        response = requests.get(f'{base_url}discover/movie', params=query)

        # Stop paging on the first failed listing request; whatever was
        # collected so far is still returned.
        if response.status_code != 200:
            print(f'Request failed with status code {response.status_code}')
            break

        # Decode the body once and reuse it for results and page count.
        payload = response.json()

        for movie in payload['results']:
            movie_id = movie['id']

            # Fetch the detail record for this movie.
            movie_response = requests.get(f'{base_url}movie/{movie_id}', params={'api_key': api_key})

            # Keep only successful detail responses.
            if movie_response.status_code == 200:
                movies.append(movie_response.json())

        # Advance to the next page and check whether it exists.
        query['page'] += 1
        more_pages = query['page'] <= payload['total_pages']

        # Verbose printing. Note that the page shown is the next page to be
        # requested, since the counter has already been advanced.
        if query['page'] % 25 == 0:
            print(f"Page {query['page']}. Load {len(movies)} movies. Elapsed time {time.time() - start_time} seconds.")

    return movies

In [3]:
# API key: read from the TMDB_API_KEY environment variable so the secret is
# not stored in the notebook.
# NOTE(review): a key that was ever committed in a notebook should be treated
# as exposed and rotated in the TMDb account settings.
api_key = os.environ.get("TMDB_API_KEY")
if not api_key:
    raise RuntimeError("Set the TMDB_API_KEY environment variable to your TMDb API key.")
# Set the base URL for the TMDB API
base_url = 'https://api.themoviedb.org/3/'
# Set the parameters for the API request
params = {
    'api_key': api_key,
    'language': 'en-US',
    'include_adult': 'false',
    'sort_by': 'popularity.desc',  # most popular movies
    'primary_release_date.gte': '1900-01-01',  # start date
    'primary_release_date.lte': '2015-12-31',  # end date
    'page': 1
}

movies = get_movies(api_key, base_url, params)

Page 25. Elapsed time 92.00552105903625 seconds.
Page 50. Elapsed time 190.91733288764954 seconds.
Page 75. Elapsed time 285.52460622787476 seconds.
Page 100. Elapsed time 380.7748918533325 seconds.
Page 125. Elapsed time 477.75760436058044 seconds.
Page 150. Elapsed time 582.7462062835693 seconds.
Page 175. Elapsed time 679.8796634674072 seconds.
Page 200. Elapsed time 777.3079595565796 seconds.
Page 225. Elapsed time 872.7655184268951 seconds.
Page 250. Elapsed time 971.5524516105652 seconds.
Page 275. Elapsed time 1068.8458898067474 seconds.
Page 300. Elapsed time 1164.7658591270447 seconds.
Page 325. Elapsed time 1260.5397877693176 seconds.
Page 350. Elapsed time 1357.7487869262695 seconds.
Page 375. Elapsed time 1454.621375799179 seconds.
Page 400. Elapsed time 1551.629147529602 seconds.
Page 425. Elapsed time 1649.9315621852875 seconds.
Page 450. Elapsed time 1748.1851208209991 seconds.
Page 475. Elapsed time 1848.4275760650635 seconds.
Page 500. Elapsed time 1952.9907190799713 s

In [16]:
# Turn into DataFrame and save as pickle for further use.
# To avoid having to run again all the API requests.
# Create the target directory first: to_pickle does not create missing
# directories, and a failure here would discard the downloaded records.
os.makedirs("./data", exist_ok=True)
movies_df = pd.DataFrame(movies)
movies_df.to_pickle("./data/movies_tmdb.pkl")

In [21]:
# Load the movie data into a pandas DataFrame from the pickle file
# ./data/movies_tmdb.pkl, print its (rows, columns) shape and display
# the first two rows.
movies_df = pd.read_pickle("./data/movies_tmdb.pkl")
print(movies_df.shape)
movies_df.head(2)

(10000, 25)


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/Yc9q6QuWrMp9nuDm5R8ExNqbEq.jpg,"{'id': 87096, 'name': 'Avatar Collection', 'po...",237000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.avatar.com/movies/avatar,19995,tt0499549,en,Avatar,...,2009-12-15,2920357254,162,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Enter the world of Pandora.,Avatar,False,7.541,26969
1,False,/uEwGFGtao9YG2JolmdvtHLLVbA9.jpg,,0,"[{'id': 99, 'name': 'Documentary'}]",,111332,tt1599280,en,Avatar: Creating the World of Pandora,...,2010-02-07,0,23,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Avatar: Creating the World of Pandora,False,7.328,29


In [22]:
# Keep only rows whose budget and revenue are both strictly positive.
# .copy() makes the result an independent frame, so assigning columns to it
# later does not raise pandas' SettingWithCopyWarning.
movies_df = movies_df.query('budget > 0 and revenue > 0').copy()
print(movies_df.shape)
movies_df.head(2)

(4974, 25)


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/Yc9q6QuWrMp9nuDm5R8ExNqbEq.jpg,"{'id': 87096, 'name': 'Avatar Collection', 'po...",237000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.avatar.com/movies/avatar,19995,tt0499549,en,Avatar,...,2009-12-15,2920357254,162,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Enter the world of Pandora.,Avatar,False,7.541,26969
3,False,/Avbc5QFUFMpN6RiPgFyRB4RshUP.jpg,,123000000,"[{'id': 10751, 'name': 'Family'}, {'id': 35, '...",,8871,tt0170016,en,How the Grinch Stole Christmas,...,2000-11-15,345823040,104,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,He puts the mean in green.,How the Grinch Stole Christmas,False,6.7,6242


### 2 Analyse descriptive

In [11]:
# Cleaning helper.
def cleaning_variable(cell):
    """Collect the ``"name"`` entry of every element in ``cell``.

    Parameters
    ----------
    cell: iterable,
        elements that can each be indexed with the key ``"name"``.

    Return
    ------
    list,
        the ``"name"`` values, in the order the elements appear.
    """
    return [entry["name"] for entry in cell]

# List of columns to which apply the function. 
colnames = ["genres", "production_countries", "production_companies"]

# Apply the function column by column. A column is left untouched when it
# cannot be converted, but the reason is reported instead of being silently
# swallowed.
for colname in colnames:
    try:
        movies_df[colname] = movies_df[colname].apply(cleaning_variable) 
    except (KeyError, TypeError) as error:
        # KeyError: the column is absent or an element has no "name" key.
        # TypeError: a cell is not an iterable of mappings, e.g. the column
        # was already converted by a previous run of this cell.
        print(f"Skipping column {colname!r}: {error!r}")
    
movies_df.head(2)

Unnamed: 0,id,title,release_date,budget,genres,popularity,production_companies,production_countries,runtime,tagline,vote_average,vote_count,revenue
0,19995,Avatar,2009-12-15,237000000,"[Action, Adventure, Fantasy, Science Fiction]",3424.983,"[20th Century Fox, Ingenious Media, Dune Enter...","[United States of America, United Kingdom]",162,Enter the world of Pandora.,7.542,27004,2920357254
3,8871,How the Grinch Stole Christmas,2000-11-15,123000000,"[Family, Comedy, Fantasy]",365.805,"[Imagine Entertainment, Universal Pictures, LU...","[Germany, United States of America]",104,He puts the mean in green.,6.737,6243,345823040


In [12]:
# Column names: display the column index of movies_df.
movies_df.columns

Index(['id', 'title', 'release_date', 'budget', 'genres', 'popularity',
       'production_companies', 'production_countries', 'runtime', 'tagline',
       'vote_average', 'vote_count', 'revenue'],
      dtype='object')

In [13]:
# Restrict movies_df to the listed columns, in this order, then display the
# resulting column index.
columns = [
    "id",
    "title",
    "release_date",
    "budget",
    "genres",
    "popularity",
    "production_companies",
    "production_countries",
    "runtime",
    "tagline",
    "vote_average",
    "vote_count",
    "revenue",
]
movies_df = movies_df.loc[:, columns]
movies_df.columns

Index(['id', 'title', 'release_date', 'budget', 'genres', 'popularity',
       'production_companies', 'production_countries', 'runtime', 'tagline',
       'vote_average', 'vote_count', 'revenue'],
      dtype='object')

### 3 Modélisation

In [15]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.4/72.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.5.1.post0


In [16]:
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, learning_curve, validation_curve

In [17]:
# Keep only the columns used for modelling; "revenue" is listed last.
cols = [
    "genres",
    "popularity",
    "production_companies",
    "production_countries",
    "budget",
    "runtime",
    "vote_average",
    "vote_count",
    "revenue",
]
movies_df = movies_df.loc[:, cols]

In [22]:
# Display the "genres" column of movies_df.
movies_df["genres"]

0        [Action, Adventure, Fantasy, Science Fiction]
3                            [Family, Comedy, Fantasy]
5                         [Adventure, Family, Fantasy]
6       [Action, Animation, Adventure, Comedy, Family]
8                                    [Comedy, Fantasy]
                             ...                      
9986         [Mystery, Action, Drama, Thriller, Crime]
9993                                           [Drama]
9994                 [Action, Drama, History, Romance]
9996                                  [Drama, Romance]
9997                                          [Comedy]
Name: genres, Length: 4974, dtype: object

In [None]:
# Split features and target.
target = "revenue"
X = movies_df.drop(columns=target)
y = movies_df[target]
print("X shape:", X.shape)
print("y shape:", y.shape)

# Train-Test randomized split (80 % train, 20 % test, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Baseline mean squared error: the error obtained by always predicting the
# mean of the training target. The target is continuous, so an accuracy
# computed from value counts would not be a meaningful baseline.
y_pred_baseline = [y_train.mean()] * len(y_train)
mse_baseline = mean_squared_error(y_train, y_pred_baseline)
print("Baseline MSE:", round(mse_baseline, 2))


## Iterate.

# NOTE(review): "genres", "production_companies" and "production_countries"
# hold Python lists at this point. OneHotEncoder presumably needs hashable
# cell values, so these columns likely have to be converted (e.g. joined
# into strings or expanded into indicator columns) before fitting -- confirm.

# Create a pipeline.
model = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    SimpleImputer(),
    Ridge()
)

# Fit the model.
model.fit(X_train, y_train)

# Evaluate.

# Communicate Results.