In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, cv, Pool
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error, make_scorer, mean_absolute_percentage_error, mean_absolute_error, r2_score
from sklearn.feature_selection import r_regression
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures, OrdinalEncoder, OneHotEncoder, StandardScaler, RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

pd.set_option("display.max_columns",None)

In [51]:
df_prediction = pd.read_json('new_film.json')

df_prediction.released_date

0     23/04/2025
1     18/05/2005
2     23/04/2025
3     24/04/2025
4     23/04/2025
5     23/04/2025
6     23/04/2025
7     23/04/2025
8     23/04/2025
9     25/04/2025
10    23/04/2025
11    23/04/2025
12    23/04/2025
13    24/04/2025
14    23/04/2025
15    24/04/2025
16    26/04/2025
17    25/04/2025
18    23/04/2025
19    23/04/2025
20    23/04/2025
21    16/04/1986
22    23/04/2025
23    23/04/2025
24    28/04/2025
25    23/04/2025
26    23/04/2025
27    23/04/2025
28    23/04/2025
Name: released_date, dtype: object

In [57]:
df_1 = pd.read_json('DATASET_FINAL.json')

df = df_1.copy()

df_prediction = pd.read_json('new_film.json')

# Groupement des acteurs 1, 2, 3 , scénaristes, réalisateurs, et distributeurs qui font plus de 500k entrées 
# + ajout d'un groupe "mid" entre 250k et 500k

# Acteur 1
df_actor_1 = df.groupby('actor_1')['weekly_entrances'].mean().reset_index()
df_actor_1_mid = df_actor_1[(df_actor_1['weekly_entrances'] < 500001) & (df_actor_1['weekly_entrances'] > 250000)].sort_values(by='weekly_entrances', ascending=False)
df_actor_1 = df_actor_1[df_actor_1['weekly_entrances'] > 500000].sort_values(by='weekly_entrances', ascending=False)

# Acteur 2
df_actor_2 = df.groupby('actor_2')['weekly_entrances'].mean().reset_index()
df_actor_2_mid = df_actor_2[(df_actor_2['weekly_entrances'] < 500001) & (df_actor_2['weekly_entrances'] > 250000)].sort_values(by='weekly_entrances', ascending=False)
df_actor_2 = df_actor_2[df_actor_2['weekly_entrances'] > 500000].sort_values(by='weekly_entrances', ascending=False)

# Acteur 3
df_actor_3 = df.groupby('actor_3')['weekly_entrances'].mean().reset_index()
df_actor_3_mid = df_actor_3[(df_actor_3['weekly_entrances'] < 500001) & (df_actor_3['weekly_entrances'] > 250000)].sort_values(by='weekly_entrances', ascending=False)
df_actor_3 = df_actor_3[df_actor_3['weekly_entrances'] > 500000].sort_values(by='weekly_entrances', ascending=False)

# Réalisateurs
df_director = df.groupby('directors')['weekly_entrances'].mean().reset_index()
df_director_mid = df_director[(df_director['weekly_entrances'] < 500001) & (df_director['weekly_entrances'] > 250000)].sort_values(by='weekly_entrances', ascending=False)
df_director = df_director[df_director['weekly_entrances'] > 500000].sort_values(by='weekly_entrances', ascending=False)

# Scénaristes
df_writer = df.groupby('writer')['weekly_entrances'].mean().reset_index()
df_writer_mid = df_writer[(df_writer['weekly_entrances'] < 500001) & (df_writer['weekly_entrances'] > 250000)].sort_values(by='weekly_entrances', ascending=False)
df_writer = df_writer[df_writer['weekly_entrances'] > 500000].sort_values(by='weekly_entrances', ascending=False)

# Distributeurs
df_distribution = df.groupby('distribution')['weekly_entrances'].mean().reset_index()
df_distribution_mid = df_distribution[(df_distribution['weekly_entrances'] < 500001) & (df_distribution['weekly_entrances'] > 250000)].sort_values(by='weekly_entrances', ascending=False)
df_distribution = df_distribution[df_distribution['weekly_entrances'] > 500000].sort_values(by='weekly_entrances', ascending=False)


# Création des colonnes "top" et "top_mid" pour les différents groupes

df_prediction['top_actor_1'] = df_prediction['actor_1'].apply(lambda x: 1 if x in df_actor_1['actor_1'].to_list() else 0)
df_prediction['top_actor_1_mid'] = df_prediction['actor_1'].apply(lambda x: 1 if x in df_actor_1_mid['actor_1'].to_list() else 0)

df_prediction['top_actor_2'] = df_prediction['actor_2'].apply(lambda x: 1 if x in df_actor_2['actor_2'].to_list() else 0)
df_prediction['top_actor_2_mid'] = df_prediction['actor_2'].apply(lambda x: 1 if x in df_actor_2_mid['actor_2'].to_list() else 0)

df_prediction['top_actor_3'] = df_prediction['actor_3'].apply(lambda x: 1 if x in df_actor_3['actor_3'].to_list() else 0)
df_prediction['top_actor_3_mid'] = df_prediction['actor_3'].apply(lambda x: 1 if x in df_actor_3_mid['actor_3'].to_list() else 0)

df_prediction['top_director'] = df_prediction['directors'].apply(lambda x: 1 if x in df_director['directors'].to_list() else 0)
df_prediction['top_director_mid'] = df_prediction['directors'].apply(lambda x: 1 if x in df_director_mid['directors'].to_list() else 0)

df_prediction['top_writer'] = df_prediction['writer'].apply(lambda x: 1 if x in df_writer['writer'].to_list() else 0)
df_prediction['top_writer_mid'] = df_prediction['writer'].apply(lambda x: 1 if x in df_writer_mid['writer'].to_list() else 0)

df_prediction['top_distribution'] = df_prediction['distribution'].apply(lambda x: 1 if x in df_distribution['distribution'].to_list() else 0)
df_prediction['top_distribution_mid'] = df_prediction['distribution'].apply(lambda x: 1 if x in df_distribution_mid['distribution'].to_list() else 0)



df_prediction['top_pays'] = df_prediction.country.apply(lambda x : 1 if x in (['France','Etats-Unis','Grande-Bretagne']) else 0)

df_prediction['released_date'] = pd.to_datetime(
    df_prediction['released_date'],
    format="%d/%m/%Y",
    errors='coerce'  # Optionnel : mettra NaT si une date est mal formée
)

df_prediction["summer"] = df_prediction["released_date"].apply(lambda x: 1 if ((x.month == 6 and x.day >= 21) or x.month in [7, 8] or (x.month == 9 and x.day < 22)) else 0)
df_prediction["automn"] = df_prediction["released_date"].apply(lambda x: 1 if ((x.month == 9 and x.day >= 22) or x.month in [10, 11] or (x.month == 12 and x.day < 21)) else 0)
df_prediction["winter"] = df_prediction["released_date"].apply(lambda x: 1 if ((x.month == 12 and x.day >= 21) or x.month in [1, 2] or (x.month == 3 and x.day < 20)) else 0)
df_prediction["spring"] = df_prediction["released_date"].apply(lambda x: 1 if ((x.month == 3 and x.day >= 21) or x.month in [4, 5] or (x.month == 6 and x.day < 21)) else 0)

df_prediction["is_covid"] = df_prediction["released_date"].apply(lambda x: 1 if (
    (x >= pd.to_datetime("2020-03-17") and x <= pd.to_datetime("2020-05-11")) or
    (x >= pd.to_datetime("2020-10-30") and x <= pd.to_datetime("2020-12-15")) or
    (x >= pd.to_datetime("2021-04-03") and x <= pd.to_datetime("2021-05-03"))
) else 0)

df_prediction["post_streaming"] = df_prediction["released_date"].apply(lambda x: 1 if x >= pd.to_datetime("2014-09-15") else 0)

df_prediction["summer_holidays"] = df_prediction["released_date"].apply(lambda x: 1 if x.month >= 7 or (x.month <= 9 and x.day < 10) else 0)

df_prediction["christmas_period"] = df_prediction["released_date"].apply(lambda x: 1 if (x.month == 12 and x.day >= 20) or (x.month == 1 and x.day <= 5) else 0)

df_prediction["is_award_season"] = df_prediction["released_date"].apply(lambda x: 1 if (x.month == 2 or (x.month == 3 and x.day <= 10)) else 0)



In [53]:
df_prediction.released_date

0    2025-04-23
1    2005-05-18
2    2025-04-23
3    2025-04-24
4    2025-04-23
5    2025-04-23
6    2025-04-23
7    2025-04-23
8    2025-04-23
9    2025-04-25
10   2025-04-23
11   2025-04-23
12   2025-04-23
13   2025-04-24
14   2025-04-23
15   2025-04-24
16   2025-04-26
17   2025-04-25
18   2025-04-23
19   2025-04-23
20   2025-04-23
21   1986-04-16
22   2025-04-23
23   2025-04-23
24   2025-04-28
25   2025-04-23
26   2025-04-23
27   2025-04-23
28   2025-04-23
Name: released_date, dtype: datetime64[ns]

In [59]:
features_of_interest = [
    # "actor_1",
    # "actor_2",
    # 'actor_3',
    # "directors",
    #  "writer",
    # "distribution",
    #"fr_title",
    'released_year',
    "country",
    #"budget",
    'category',
    'classification',
    'duration_minutes', 
    "top_actor_1",
    "top_actor_2",
    "top_actor_3",
    "top_director",
    'top_writer',
    'top_distribution',
    "top_actor_1_mid",
    "top_actor_2_mid",
    "top_actor_3_mid",
    "top_director_mid",
    'top_writer_mid',
    'top_distribution_mid',
    #'budget_category',
    'top_pays',
    # 'summer',
    # 'automn',
    # 'winter',
    # 'spring',
    # 'is_covid',
    'post_streaming',
    'summer_holidays',
    'christmas_period',
    'is_award_season',
    #'month',
]


numerical_column = [
    #"budget",
    'released_year',
    "duration_minutes",
    #'month',

]


ordinal_column = [
    "top_actor_1",
    "top_actor_2",
    "top_actor_3",
    "top_director",
    'top_writer',
    "top_actor_1_mid",
    "top_actor_2_mid",
    "top_actor_3_mid",
    "top_director_mid",
    'top_writer_mid',
    'top_distribution_mid',
    'top_distribution',
    'top_pays',
    #     'summer',
    # 'automn',
    # 'winter',
    # 'spring',
    # 'is_covid',
    'post_streaming',
    'summer_holidays',
    'christmas_period',
    'is_award_season',
]

categorical_column = [
        # "actor_1",
        # "actor_2",
        # "actor_3",
        # "directors",
        # "writer",
        # "distribution",
    #"fr_title",
    "country",
    'category',
    'classification',
    #'budget_category'

]

#
# target_name = "weekly_entrances"
data, numerical_data,categorical_data = (
    df_prediction[features_of_interest],
    df_prediction[numerical_column],
    df_prediction[categorical_column]
)


In [60]:
# Import du modele

import pickle

# Remplace 'mon_modele.pkl' par le nom de ton fichier
with open("catboostmodel.pkl", "rb") as f:
    modele = pickle.load(f)


result = np.round(modele.predict(data),0)

print(result)


[ 65065.   1450.  14989.  15014. -33213.  44789.  45901.  44281.  21151.
   2301.  29029.  10563.  45746.  46614.   8485.  38285.  49895.  75868.
  15784.  49107.  36318.  34326. 440348. 100760.  46574.  26262.  86937.
 859527.  36114.  29029.  21151.  46614.  44444.  95924.  29145.  50035.
   9323.  10898.  42398.  23228.  26767.  14231.  50035.  59781.  44032.
  11619.  33466.  -5414.  41061.  15014.  30031.  68489.  35591.  44427.
  11303.]


In [22]:
data['prediction'] = result

In [62]:
df_prediction['prediction'] = result

df_prediction.sort_values(by = 'prediction', ascending=False).head(10)

Unnamed: 0,fr_title,original_title,released_date,released_year,actor_1,actor_2,actor_3,directors,writer,distribution,country,category,classification,duration,duration_minutes,allocine_url,image_url,top_actor_1,top_actor_1_mid,top_actor_2,top_actor_2_mid,top_actor_3,top_actor_3_mid,top_director,top_director_mid,top_writer,top_writer_mid,top_distribution,top_distribution_mid,top_pays,summer,automn,winter,spring,is_covid,post_streaming,summer_holidays,christmas_period,is_award_season,prediction
27,Star Wars : Episode III - La Revanche des Sith,Star Wars: Episode III - Revenge of the Sith,2005-05-18,2005,Hayden Christensen,Ewan McGregor,Natalie Portman,George Lucas,George Lucas,The Walt Disney Company France,U.S.A.,Action,Tout public,2h 20min,140,https://www.allocine.fr/film/fichefilm_gen_cfi...,https://fr.web.img6.acsta.net/c_310_420/medias...,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,859527.0
22,Sinners,Sinners,2025-04-16,2025,Michael B. Jordan,Hailee Steinfeld,Miles Caton,Ryan Coogler,Ryan Coogler,Warner Bros. France,U.S.A.,Action,Interdit - 12 ans,2h 17min,137,https://www.allocine.fr/film/fichefilm_gen_cfi...,https://fr.web.img5.acsta.net/c_310_420/img/92...,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,440348.0
23,Rapide,Rapide,2025-04-16,2025,Paola Locatelli,Alban Lenoir,Anne Marivin,Morgan S. Dalibert,Morgan S. Dalibert,Universal Pictures International France,France,Action,Tout public,1h 38min,98,https://www.allocine.fr/film/fichefilm_gen_cfi...,https://fr.web.img4.acsta.net/c_310_420/img/2d...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,100760.0
33,Until Dawn : La mort sans fin,Until Dawn,2025-04-23,2025,Ella Rubin,Michael Cimino (II),Odessa A’zion,David F. Sandberg,Blair Butler,Sony Pictures Releasing France,U.S.A.,Epouvante-horreur,Interdit - 12 ans avec avertissement,1h 43min,103,https://www.allocine.fr/film/fichefilm_gen_cfi...,https://fr.web.img6.acsta.net/c_310_420/img/4a...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,95924.0
26,La Légende d'Ochi,The Legend Of Ochi,2025-04-23,2025,Helena Zengel,Willem Dafoe,Emily Watson,Isaiah Saxon,Isaiah Saxon,KMBO,U.S.A.,Aventure,à partir de 8 ans,1h 35min,95,https://www.allocine.fr/film/fichefilm_gen_cfi...,https://fr.web.img6.acsta.net/c_310_420/img/f9...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,86937.0
17,Aimons-nous vivants,Aimons-nous vivants,2025-04-16,2025,Gérard Darmon,Valérie Lemercier,Patrick Timsit,Jean-Pierre Améris,Marion Michau,ARP Sélection,France,Comédie,Tout public,1h 30min,90,https://www.allocine.fr/film/fichefilm_gen_cfi...,https://fr.web.img4.acsta.net/c_310_420/img/a4...,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,75868.0
51,Drop Game,Drop,2025-04-23,2025,Meghann Fahy,Brandon Sklenar,Violett Beane,Christopher Landon,Chris Roach,Universal Pictures International France,U.S.A.,Thriller,Tout public avec avertissement,1h 40min,100,https://www.allocine.fr/film/fichefilm_gen_cfi...,https://fr.web.img2.acsta.net/c_310_420/img/5c...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,68489.0
0,La Réparation,La Réparation,2025-04-16,2025,Julia de Nunez,Clovis Cornillac,Julien De Saint-Jean,Régis Wargnier,Manon Feuvray,Nour Films,France,Drame,Tout public,1h 44min,104,https://www.allocine.fr/film/fichefilm_gen_cfi...,https://fr.web.img6.acsta.net/c_310_420/img/17...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,65065.0
43,Sumo,Sumo,2025-04-25,2025,Shiva,Yogi Babu,Priya Anand,S. P. Hosimin,Shiva,Night ed films,Inde,Drame,Tout public,2h 30min,150,https://www.allocine.fr/film/fichefilm_gen_cfi...,https://fr.web.img6.acsta.net/c_310_420/img/ee...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,59781.0
42,The Metropolitan Opera : Les Noces de Figaro,The Metropolitan Opera: Le Nozze di Figaro,2025-04-26,2025,Federica Lombardi,Elizabeth Bishop,Joshua Hopkins,unknown,unknown,Pathé Live,U.S.A.,Opéra,Tout public,3h 55min,235,https://www.allocine.fr/film/fichefilm_gen_cfi...,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,50035.0
