In [47]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, cv, Pool
from sklearn.metrics import root_mean_squared_error, make_scorer, mean_absolute_percentage_error
from sklearn.feature_selection import r_regression
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures, OrdinalEncoder, OneHotEncoder, StandardScaler, RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

pd.set_option("display.max_columns",None)

In [75]:
df = pd.read_json('dataset_nottoyé.json')
df['main_actor'] = df['jpbox_actors'].apply(lambda x : x[0] if isinstance(x,list) and len(x)>0 else None)

df['jpbox_budget'] = df['jpbox_budget'].apply(lambda x : pd.NA if x == '' else x)
df['jpbox_budget'].isna().sum()
moyenne = round(pd.to_numeric(df['jpbox_budget']).dropna().mean(),0)
df['jpbox_budget'] = df['jpbox_budget'].fillna(moyenne)
df['jpbox_budget'] = df['jpbox_budget'].astype(int)


In [76]:
features_of_interest = [
    "main_actor",
    "jpbox_directors",
    "allocine_writer",
    "allocine_distribution",
    "jpbox_country",
    "jpbox_budget",
    'jpbox_category',
    'allocine_classification',
    'duration_minutes'

]

numerical_column = [
    "jpbox_budget",
    "duration_minutes"
]


# ordinal_column = [
#     "sex",
#     "smoker",
# ]

categorical_column = [
        "main_actor","jpbox_directors","allocine_writer","allocine_distribution",
    "jpbox_country",'jpbox_category','allocine_classification'
]


target_name = "jpbox_weekly_entrances"
data, target, numerical_data,categorical_data = (
    df[features_of_interest],
    df[target_name],
    df[numerical_column],
    df[categorical_column]
)


In [77]:
X_train, X_test, y_train, y_test = train_test_split(data, target, shuffle=True, train_size=0.85, random_state=42)

In [78]:
print(df[features_of_interest].dtypes)
df[features_of_interest].head(50)

main_actor                 object
jpbox_directors            object
allocine_writer            object
allocine_distribution      object
jpbox_country              object
jpbox_budget                int64
jpbox_category             object
allocine_classification    object
duration_minutes            int64
dtype: object


Unnamed: 0,main_actor,jpbox_directors,allocine_writer,allocine_distribution,jpbox_country,jpbox_budget,jpbox_category,allocine_classification,duration_minutes
0,Dany Boon,Dany Boon,Dany Boon,Pathé Films,France,27800000,Comédie,Tout public,107
1,Dany Boon,Dany Boon,Dany Boon,Pathé Films,France,24434009,Comédie,Interdit - 10 ans,108
2,Sam Worthington,James Cameron,James Cameron,The Walt Disney Company France,Etats-Unis,315000000,Science Fiction,Tout public,160
3,Chris Evans,- Russo (brothers),Christopher Markus,The Walt Disney Company France,Etats-Unis,295000000,Comicbook,Tout public,156
4,Johnny Depp,Gore Verbinski,Ted Elliott,Buena Vista International,Etats-Unis,225000000,Aventure - Action,Interdit - 10 ans,150
5,Daniel Radcliffe,David Yates,Joanne Kathleen Rowling,,Etats-Unis,125000000,Fantasy,Tout public,150
6,Jamel Debbouze,Jon Favreau,Jeff Nathanson,The Walt Disney Company France,Etats-Unis,260000000,Aventure - Action,Interdit - 6 ans,118
7,Daniel Radcliffe,Alfonso Cuaron,Steve Kloves,Warner Bros. France,Etats-Unis,130000000,Fantasy,Interdit - 10 ans,141
8,Adam Driver,Rian Johnson,Rian Johnson,The Walt Disney Company France,Etats-Unis,200000000,Fantasy,Interdit - 8 ans,151
9,Robert Pattinson,David Yates,Michael Goldenberg,Warner Bros. France,Etats-Unis,150000000,Fantasy,Interdit - 10 ans,138


In [103]:
preprocessor = ColumnTransformer(
    [("categorical", OneHotEncoder(handle_unknown='ignore'), categorical_column),
    ("numeric", RobustScaler(), numerical_column)
    ],
    remainder="passthrough",
)

catboost_model = make_pipeline(preprocessor, CatBoostRegressor())
catboost_model.fit(X_train,y_train)

# Prédictions sur le test
y_pred = catboost_model.predict(X_test)

# Calcul des métriques
rmse = root_mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
errors = y_pred - y_test
sum_errors = np.sum(errors)
print(f"✅ RMSE (test) Catboost: {rmse:.4f}")
print(f"✅ MAPE (test) Catboost: {mape:.4f}")
print(f"🔥 Somme des erreurs: {sum_errors:.4f}")

Learning rate set to 0.05526
0:	learn: 333715.6224585	total: 16.6ms	remaining: 16.6s
1:	learn: 328039.6905851	total: 29.5ms	remaining: 14.7s
2:	learn: 321742.6443382	total: 39.7ms	remaining: 13.2s
3:	learn: 316401.5498607	total: 52.7ms	remaining: 13.1s
4:	learn: 311380.5670690	total: 66.4ms	remaining: 13.2s
5:	learn: 307506.3243639	total: 78ms	remaining: 12.9s
6:	learn: 303274.5708269	total: 90.2ms	remaining: 12.8s
7:	learn: 298965.1352866	total: 102ms	remaining: 12.6s
8:	learn: 295274.9028531	total: 115ms	remaining: 12.6s
9:	learn: 292317.6221543	total: 126ms	remaining: 12.5s
10:	learn: 289096.9414477	total: 137ms	remaining: 12.3s
11:	learn: 285926.5765243	total: 150ms	remaining: 12.3s
12:	learn: 282926.1052639	total: 162ms	remaining: 12.3s
13:	learn: 280519.5499841	total: 173ms	remaining: 12.2s
14:	learn: 277280.6337814	total: 187ms	remaining: 12.3s
15:	learn: 275174.1054593	total: 201ms	remaining: 12.3s
16:	learn: 273159.5788924	total: 213ms	remaining: 12.3s
17:	learn: 271119.290971

In [None]:


# Graphique des erreurs
plt.figure(figsize=(10, 6))

# Affichage des erreurs
plt.plot(errors, label="Erreurs (Prédictions - Réelles)", color="red", marker="o", linestyle="--", alpha=0.7)

# Tracer la ligne horizontale à 0 (zéro erreur)
plt.axhline(0, color='black', linewidth=2)

# Ajouter titre et labels
plt.title('Courbe des erreurs entre prédictions et vraies valeurs')
plt.xlabel('Index des échantillons')
plt.ylabel('Erreur')

# Afficher la légende
plt.legend()

# Afficher le graphique
plt.show()

In [105]:
from xgboost import XGBRegressor

preprocessor = ColumnTransformer(
    [("categorical", OneHotEncoder(handle_unknown='ignore'), categorical_column),
    ("numeric", RobustScaler(), numerical_column)
    ],
    remainder="passthrough",
)

catboost_model = make_pipeline(preprocessor, XGBRegressor())
catboost_model.fit(X_train,y_train)

# Prédictions sur le test
y_pred = catboost_model.predict(X_test)

# Calcul des métriques
rmse = root_mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"✅ RMSE (test) de XGBOOST: {rmse:.4f}")
print(f"✅ MAPE (test)de XGBOOST: {mape:.4f}")

errors = y_pred - y_test
sum_errors = np.sum(errors)
print(f"🔥 Somme des erreurs XGBOOST: {sum_errors:.4f}")

✅ RMSE (test) de XGBOOST: 261169.5781
✅ MAPE (test)de XGBOOST: 9.3463
🔥 Somme des erreurs XGBOOST: -18468045.5879


In [106]:
from lightgbm import LGBMRegressor

preprocessor = ColumnTransformer(
    [("categorical", OneHotEncoder(handle_unknown='ignore'), categorical_column),
    ("numeric", RobustScaler(), numerical_column)
    ],
    remainder="passthrough",
)

catboost_model = make_pipeline(preprocessor, LGBMRegressor())
catboost_model.fit(X_train,y_train)

# Prédictions sur le test
y_pred = catboost_model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
# Calcul des métriques
rmse = root_mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"✅ RMSE (test) de Lightgbm: {rmse:.4f}")
print(f"✅ MAPE (test) de Lightgbm: {mape:.4f}")
print(f"🔥 Somme des erreurs LGBM: {sum_errors:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003414 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 647
[LightGBM] [Info] Number of data points in the train set: 6672, number of used features: 131
[LightGBM] [Info] Start training from score 202436.017236
✅ RMSE (test) de Lightgbm: 272228.8980
✅ MAPE (test) de Lightgbm: 8.0862
🔥 Somme des erreurs LGBM: -18468045.5879
