In [47]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, cv, Pool
from sklearn.metrics import root_mean_squared_error, make_scorer, mean_absolute_percentage_error
from sklearn.feature_selection import r_regression
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures, OrdinalEncoder, OneHotEncoder, StandardScaler, RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

pd.set_option("display.max_columns",None)

In [None]:
df = pd.read_json('DATASET_FINAL.json')
df
df['main_actor'] = df['actors'].apply(lambda x : x[0] if isinstance(x,list) and len(x)>0 else None)

df['jpbox_budget'] = df['jpbox_budget'].apply(lambda x : pd.NA if x == '' else x)
df['jpbox_budget'].isna().sum()
moyenne = round(pd.to_numeric(df['jpbox_budget']).dropna().mean(),0)
df['jpbox_budget'] = df['jpbox_budget'].fillna(moyenne)
df['jpbox_budget'] = df['jpbox_budget'].astype(int)


Unnamed: 0,fr_title,released_year,directors,writer,distribution,country,budget,category,released_date,classification,duration,weekly_entrances,duration_minutes,actor_1,actor_2,actor_3
0,La Ch’tite famille,2018,Dany Boon,Dany Boon,Pathé Films,France,27800000,Comédie,28/02/2018,Tout public,1h 47min,2429906,107,Dany Boon,François Berléand,Valérie Bonneton
1,Rien à déclarer,2011,Dany Boon,Dany Boon,Pathé Films,France,24434009,Comédie,02/02/2011,Interdit - 10 ans,1h 48min,2587056,108,Dany Boon,Benoît Poelvoorde,François Damiens
2,Avatar,2009,James Cameron,James Cameron,The Walt Disney Company France,Etats-Unis,315000000,Science Fiction,16/12/2009,Tout public,2h 40min,2648596,160,Sam Worthington,Zoe Saldaña,Sigourney Weaver
3,Avengers: Infinity War,2018,- Russo (brothers),Christopher Markus,The Walt Disney Company France,Etats-Unis,295000000,Comicbook,25/04/2018,Tout public,2h 36min,2565953,156,Chris Evans,Chris Hemsworth,Josh Brolin
4,Harry Potter et les reliques de la mort - part...,2010,David Yates,Joanne Kathleen Rowling,Warner Bros.,Etats-Unis,125000000,Fantasy,24/11/2010,Tout public,2h 30min,2537450,150,Daniel Radcliffe,Emma Watson,Helena Bonham Carter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7246,Fast & Furious 7,2015,James Wan,Chris Morgan,Universal Pictures International France,Etats-Unis,190000000,Aventure - Action,01/04/2015,Tout public,2h 20min,2158508,140,Vin Diesel,Paul Walker,Dwayne Johnson
7247,007 Spectre,2015,Sam Mendes,John Logan,Sony Pictures Releasing France,Grande-Bretagne,245000000,Aventure - Action,11/11/2015,Tout public,2h 30min,2203549,150,Daniel Craig,Christoph Waltz,Monica Bellucci
7248,Barbie,2023,Greta Gerwig,Greta Gerwig,Warner Bros. France,Etats-Unis,145000000,Film familial,19/07/2023,Tout public,1h 54min,1680490,114,Ryan Gosling,Margot Robbie,Will Ferrell
7249,Les Gardiens de la Galaxie 2,2017,James Gunn,James Gunn,The Walt Disney Company France,Etats-Unis,200000000,Comicbook,26/04/2017,Tout public,2h 16min,1401339,136,Chris Pratt,Kurt Russell,Sylvester Stallone


In [None]:
df

Index(['fr_title', 'released_year', 'directors', 'writer', 'distribution',
       'country', 'budget', 'category', 'released_date', 'classification',
       'duration', 'weekly_entrances', 'duration_minutes', 'actor_1',
       'actor_2', 'actor_3'],
      dtype='object')

In [None]:
df_top_100_actors = df.groupby('main_actor')['jpbox_weekly_entrances'].mean().sort_values(ascending= False)[0:100].reset_index()
df_top_100_actors['main_actor']

df['top100_actor'] = df['main_actor'].apply(lambda x : 1 if x in(df_top_100_actors['main_actor'].to_list()
) else 0)


df_top_100_directors = df.groupby('jpbox_directors')['jpbox_weekly_entrances'].mean().sort_values(ascending= False)[0:100].reset_index()
df_top_100_directors['jpbox_directors']

df['top100_director'] = df['jpbox_directors'].apply(lambda x : 1 if x in(df_top_100_directors['jpbox_directors'].to_list()
) else 0)

df_top_100_writers = df.groupby('allocine_writer')['jpbox_weekly_entrances'].mean().sort_values(ascending= False)[0:100].reset_index()
df_top_100_writers['allocine_writer']

df['top100_writer'] = df['allocine_writer'].apply(lambda x : 1 if x in(df_top_100_writers['allocine_writer'].to_list()
) else 0)

df_top_20_distributions = df.groupby('allocine_distribution')['jpbox_weekly_entrances'].mean().sort_values(ascending= False)[0:20].reset_index()
df_top_20_distributions['allocine_distribution']

df['top20_distribution'] = df['allocine_distribution'].apply(lambda x : 1 if x in(df_top_20_distributions['allocine_distribution'].to_list()
) else 0)


In [157]:
df_top_100_actors = df.groupby('allocine_distribution')['jpbox_weekly_entrances'].mean().sort_values(ascending= False)[0:].reset_index()
df_top_100_actors

Unnamed: 0,allocine_distribution,jpbox_weekly_entrances
0,1 prix et 4 nominations,1941238.0
1,CIC,1629779.0
2,Columbia TriStar,1629217.0
3,EuropaCorp Distribution / ARP Sélection,1524218.0
4,5 010 728 entrées,1250646.0
...,...,...
333,Wayna Pitch,489.0
334,DMVB Films,383.0
335,Too Cool Production & Distribution,352.0
336,CFA programmation et distribution,239.0


In [136]:
df

Unnamed: 0,jpbox_fr_title,allocine_fr_title,jpbox_released_year,jpbox_actors,jpbox_directors,allocine_writer,allocine_distribution,jpbox_country,jpbox_budget,jpbox_category,jpbox_released_date,allocine_classification,jpbox_duration,jpbox_weekly_entrances,duration_minutes,main_actor,top100_actor,top100_director
0,La Ch’tite famille,La Ch’tite famille,2018,"[Dany Boon, François Berléand, Valérie Bonneto...",Dany Boon,Dany Boon,Pathé Films,France,27800000,Comédie,28/02/2018,Tout public,1h 47min,2429906,107,Dany Boon,1,1
1,Rien à déclarer,Rien à déclarer,2011,"[Dany Boon, Benoît Poelvoorde, François Damien...",Dany Boon,Dany Boon,Pathé Films,France,24434009,Comédie,02/02/2011,Interdit - 10 ans,1h 48min,2587056,108,Dany Boon,1,1
2,Avatar,Avatar,2009,"[Sam Worthington, Sigourney Weaver]",James Cameron,James Cameron,The Walt Disney Company France,Etats-Unis,315000000,Science Fiction,16/12/2009,Tout public,2h 40min,2648596,160,Sam Worthington,1,1
3,Avengers: Infinity War,Avengers: Infinity War,2018,"[Chris Evans, Chris Hemsworth, Josh Brolin, Ro...",- Russo (brothers),Christopher Markus,The Walt Disney Company France,Etats-Unis,295000000,Comicbook,25/04/2018,Tout public,2h 36min,2565953,156,Chris Evans,1,1
4,"Pirates des Caraïbes, le secret du coffre maudit",Pirates des Caraïbes : le Secret du Coffre Maudit,2006,"[Johnny Depp, Orlando Bloom, Geoffrey Rush, Ke...",Gore Verbinski,Ted Elliott,Buena Vista International,Etats-Unis,225000000,Aventure - Action,02/08/2006,Interdit - 10 ans,2h 30min,2708112,150,Johnny Depp,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7845,Fast & Furious 7,Fast & Furious 7,2015,"[Vin Diesel, Paul Walker, Dwayne Johnson, Kurt...",James Wan,Chris Morgan,Universal Pictures International France,Etats-Unis,190000000,Aventure - Action,01/04/2015,Tout public,2h 20min,2158508,140,Vin Diesel,1,0
7846,Spectre,007 Spectre,2015,"[Daniel Craig, Christoph Waltz, Monica Bellucc...",Sam Mendes,John Logan,Sony Pictures Releasing France,Grande-Bretagne,245000000,Aventure - Action,11/11/2015,Tout public,2h 30min,2203549,150,Daniel Craig,1,1
7847,Barbie,Barbie,2023,"[Ryan Gosling, Margot Robbie, Will Ferrell, He...",Greta Gerwig,Greta Gerwig,Warner Bros. France,Etats-Unis,145000000,Film familial,19/07/2023,Tout public,1h 54min,1680490,114,Ryan Gosling,0,1
7848,Les Gardiens de la Galaxie 2,Les Gardiens de la Galaxie 2,2017,"[Chris Pratt, Kurt Russell, Sylvester Stallone...",James Gunn,James Gunn,The Walt Disney Company France,Etats-Unis,200000000,Comicbook,26/04/2017,Tout public,2h 16min,1401339,136,Chris Pratt,1,1


In [None]:
features_of_interest = [
    "main_actor",
    # "jpbox_directors",
    "allocine_writer",
    # "allocine_distribution",
    "jpbox_country",
    "jpbox_budget",
    'jpbox_category',
    'allocine_classification',
    'duration_minutes', 
    "top100_actor",
    "top100_director",
    'top100_writer',
    'top20_distribution'
]


numerical_column = [
    "jpbox_budget",
    "duration_minutes"
]


ordinal_column = [
    "top100_actor",
    "top100_director",
    'top100_writer',
    'top20_distribution'
]

categorical_column = [
        "main_actor",
        # "jpbox_directors",
        "allocine_writer",
        "allocine_distribution",
    "jpbox_country",
    'jpbox_category',
    'allocine_classification'
]


target_name = "jpbox_weekly_entrances"
data, target, numerical_data,categorical_data = (
    df[features_of_interest],
    df[target_name],
    df[numerical_column],
    df[categorical_column]
)


In [190]:
X_train, X_test, y_train, y_test = train_test_split(data, target, shuffle=True, train_size=0.85, random_state=42)

In [191]:
preprocessor = ColumnTransformer(
    [("categorical", OneHotEncoder(handle_unknown='ignore'), categorical_column),
    ("numeric", RobustScaler(), numerical_column),
    ("ordinal", OrdinalEncoder(), ordinal_column)
    ],
    remainder="passthrough",
)

catboost_model = make_pipeline(preprocessor, CatBoostRegressor())
catboost_model.fit(X_train,y_train)

# Prédictions sur le test
y_pred = catboost_model.predict(X_test)

# Calcul des métriques
rmse = root_mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
errors = y_pred - y_test
sum_errors = np.sum(errors)
print(f"✅ RMSE (test) Catboost: {rmse:.4f}")
print(f"✅ MAPE (test) Catboost: {mape:.4f}")
print(f"🔥 Somme des erreurs: {sum_errors:.4f}")

Learning rate set to 0.05526
0:	learn: 330624.4145855	total: 9.65ms	remaining: 9.64s
1:	learn: 322177.2138821	total: 21.9ms	remaining: 10.9s
2:	learn: 314233.8281123	total: 30.9ms	remaining: 10.3s
3:	learn: 306859.8435912	total: 38.3ms	remaining: 9.54s
4:	learn: 300041.6754197	total: 46.6ms	remaining: 9.27s
5:	learn: 293886.8668214	total: 54.7ms	remaining: 9.06s
6:	learn: 288139.5565177	total: 64.6ms	remaining: 9.17s
7:	learn: 282714.5429600	total: 72.4ms	remaining: 8.98s
8:	learn: 278055.2725351	total: 81.9ms	remaining: 9.02s
9:	learn: 273300.9916270	total: 89.6ms	remaining: 8.87s
10:	learn: 268713.0005051	total: 98.7ms	remaining: 8.87s
11:	learn: 264583.2389164	total: 108ms	remaining: 8.85s
12:	learn: 260508.1570790	total: 118ms	remaining: 8.99s
13:	learn: 256906.9426981	total: 130ms	remaining: 9.15s
14:	learn: 253692.6781373	total: 148ms	remaining: 9.69s
15:	learn: 250601.1406932	total: 158ms	remaining: 9.71s
16:	learn: 247850.2444465	total: 168ms	remaining: 9.7s
17:	learn: 244886.0

In [110]:
preprocessor = ColumnTransformer(
    [("categorical", OneHotEncoder(handle_unknown='ignore'), categorical_column),
    ("numeric", RobustScaler(), numerical_column)
    ],
    remainder="passthrough",
)

catboost_model = make_pipeline(preprocessor, CatBoostRegressor(  verbose=0, random_state=42))
catboost_model.fit(X_train,y_train)

# Prédictions sur le test
y_pred = catboost_model.predict(X_test)

# Calcul des métriques
rmse = root_mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
errors = y_pred - y_test
sum_errors = np.sum(errors)
print(f"✅ RMSE (test) Catboost: {rmse:.4f}")
print(f"✅ MAPE (test) Catboost: {mape:.4f}")
print(f"🔥 Somme des erreurs: {sum_errors:.4f}")

✅ RMSE (test) Catboost: 254894.8927
✅ MAPE (test) Catboost: 9.8806
🔥 Somme des erreurs: -20473912.6790


In [193]:
# 2. Grille de paramètres à tester (tu peux en ajouter)
param_grid = {
    'catboostregressor__depth': [4, 6, 8],
    'catboostregressor__learning_rate': [0.01, 0.05, 0.1],
    'catboostregressor__iterations': [200, 500],
}

# 3. GridSearchCV
grid_search = GridSearchCV(
    catboost_model,
    param_grid,
    cv=3,  # ou 5 pour une validation croisée plus robuste
    scoring='neg_root_mean_squared_error',  # on veut minimiser le RMSE
    n_jobs=-1,
    verbose=2
)

# 4. Entraînement
grid_search.fit(X_train, y_train)

# 5. Résultats
print("✅ Best params:", grid_search.best_params_)
print(f"✅ Best RMSE: {-grid_search.best_score_:.4f}")

Fitting 3 folds for each of 18 candidates, totalling 54 fits


0:	learn: 332808.4631325	total: 59.1ms	remaining: 11.8s
1:	learn: 331321.6242533	total: 68.7ms	remaining: 6.8s
2:	learn: 329942.3549894	total: 73.9ms	remaining: 4.85s
3:	learn: 328474.3460169	total: 84.1ms	remaining: 4.12s
0:	learn: 334884.5767304	total: 103ms	remaining: 20.4s
0:	learn: 341294.7876048	total: 69.3ms	remaining: 13.8s
4:	learn: 326958.6094492	total: 92.2ms	remaining: 3.6s
1:	learn: 339564.5424611	total: 88.5ms	remaining: 8.76s
0:	learn: 326540.7683003	total: 63.7ms	remaining: 12.7s
5:	learn: 325464.6778441	total: 105ms	remaining: 3.4s
0:	learn: 334577.5278980	total: 109ms	remaining: 21.6s
1:	learn: 326690.5008560	total: 132ms	remaining: 13s
2:	learn: 337942.5291441	total: 107ms	remaining: 7s
2:	learn: 319531.2087332	total: 144ms	remaining: 9.48s
1:	learn: 319559.8824538	total: 108ms	remaining: 10.7s
1:	learn: 327447.2481076	total: 137ms	remaining: 13.6s
3:	learn: 336372.8286143	total: 150ms	remaining: 7.35s
3:	learn: 312850.9784951	total: 182ms	remaining: 8.93s
2:	learn: 

In [114]:
from xgboost import XGBRegressor

preprocessor = ColumnTransformer(
    [("categorical", OneHotEncoder(handle_unknown='ignore'), categorical_column),
    ("numeric", RobustScaler(), numerical_column)
    ],
    remainder="passthrough",
)

xgboost_model = make_pipeline(preprocessor, XGBRegressor())
xgboost_model.fit(X_train,y_train)

# Prédictions sur le test
y_pred = catboost_model.predict(X_test)

# # Calcul des métriques
# rmse = root_mean_squared_error(y_test, y_pred)
# mape = mean_absolute_percentage_error(y_test, y_pred)

# print(f"✅ RMSE (test) de XGBOOST: {rmse:.4f}")
# print(f"✅ MAPE (test)de XGBOOST: {mape:.4f}")

# errors = y_pred - y_test
# sum_errors = np.sum(errors)
# print(f"🔥 Somme des erreurs XGBOOST: {sum_errors:.4f}")

# 2. Grille de paramètres à tester (tu peux en ajouter)
param_grid = {
    'xgbregressor__max_depth': [4, 6, 8],
    'xgbregressor__learning_rate': [0.01, 0.05, 0.1],
    'xgbregressor__n_estimators': [200, 500],
}

# GridSearchCV
grid_search = GridSearchCV(
    xgboost_model,
    param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
    n_jobs=5,
    verbose=2
)

# Fit
grid_search.fit(X_train, y_train)

# Meilleurs résultats
print("✅ Best params:", grid_search.best_params_)
print(f"✅ Best RMSE: {-grid_search.best_score_:.4f}")

# Prédictions + évaluation sur le test set
y_pred = grid_search.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
sum_errors = np.sum(y_pred - y_test)

print(f"📊 RMSE (test): {rmse:.4f}")
print(f"📊 MAPE (test): {mape:.4f}")
print(f"📊 Somme des erreurs: {sum_errors:.4f}")

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END xgbregressor__learning_rate=0.01, xgbregressor__max_depth=4, xgbregressor__n_estimators=200; total time=   1.2s
[CV] END xgbregressor__learning_rate=0.01, xgbregressor__max_depth=4, xgbregressor__n_estimators=200; total time=   1.1s
[CV] END xgbregressor__learning_rate=0.01, xgbregressor__max_depth=4, xgbregressor__n_estimators=200; total time=   1.2s
[CV] END xgbregressor__learning_rate=0.01, xgbregressor__max_depth=4, xgbregressor__n_estimators=500; total time=   2.4s
[CV] END xgbregressor__learning_rate=0.01, xgbregressor__max_depth=4, xgbregressor__n_estimators=500; total time=   2.5s
[CV] END xgbregressor__learning_rate=0.01, xgbregressor__max_depth=6, xgbregressor__n_estimators=200; total time=   2.7s
[CV] END xgbregressor__learning_rate=0.01, xgbregressor__max_depth=4, xgbregressor__n_estimators=500; total time=   2.8s
[CV] END xgbregressor__learning_rate=0.01, xgbregressor__max_depth=6, xgbregressor__n_estima

In [None]:
# from lightgbm import LGBMRegressor

# preprocessor = ColumnTransformer(
#     [("categorical", OneHotEncoder(handle_unknown='ignore'), categorical_column),
#     ("numeric", RobustScaler(), numerical_column)
#     ],
#     remainder="passthrough",
# )

# catboost_model = make_pipeline(preprocessor, LGBMRegressor())
# catboost_model.fit(X_train,y_train)

# # Prédictions sur le test
# y_pred = catboost_model.predict(X_test)
# rmse = root_mean_squared_error(y_test, y_pred)
# mape = mean_absolute_percentage_error(y_test, y_pred)
# # Calcul des métriques
# rmse = root_mean_squared_error(y_test, y_pred)
# mape = mean_absolute_percentage_error(y_test, y_pred)

# print(f"✅ RMSE (test) de Lightgbm: {rmse:.4f}")
# print(f"✅ MAPE (test) de Lightgbm: {mape:.4f}")
# print(f"🔥 Somme des erreurs LGBM: {sum_errors:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003414 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 647
[LightGBM] [Info] Number of data points in the train set: 6672, number of used features: 131
[LightGBM] [Info] Start training from score 202436.017236
✅ RMSE (test) de Lightgbm: 272228.8980
✅ MAPE (test) de Lightgbm: 8.0862
🔥 Somme des erreurs LGBM: -18468045.5879
