# Imports

In [671]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_hist_gradient_boosting  # Permet d'activer HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
import numpy as np

# DataFrame

In [672]:
df = pd.read_csv('data/allocine_langue_film.csv')

In [673]:
df.columns

Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'titre', 'acteur_1',
       'acteur_2', 'acteur_3', 'réalisateur', 'distributeur', 'note_presse',
       'duree', 'genre', 'pays', 'type', 'nominations', 'prix',
       'annee_production', 'Semaine', 'Entrées_1ère_semaine',
       'actor_1_popularity', 'actor_2_popularity', 'actor_3_popularity',
       'director_popularity', 'vacances', 'saison', 'actors_popularity',
       'langue'],
      dtype='object')

In [674]:

df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'Unnamed: 0.2'])
# , 'titre', 'acteur_1', 'acteur_2','acteur_3', 'réalisateur','Semaine','type', 'nominations', 'prix', 'annee_production', 'distributeur'

In [675]:
df = df.dropna()

*Features engineering+nettoyage*

In [676]:
rows_with_newline = df[df['distributeur'].str.contains('\n')]

In [677]:
df = df.drop(rows_with_newline.index)

In [678]:
# Calcul de la moyenne des entrées pour chaque distributeur
mean_entries_by_distributor = df.groupby('distributeur')['Entrées_1ère_semaine'].mean()

# Classement des distributeurs en fonction de la moyenne des entrées (du plus petit au plus grand)
ranked_distributors = mean_entries_by_distributor.rank()

# Création d'un dictionnaire de correspondance entre le classement et la note
rank_to_score = {rank: score for rank, score in enumerate(range(1, len(ranked_distributors) + 1), 1)}

# Création de la colonne 'reputation_distributeur' en utilisant le classement pour attribuer une note
df['reputation_distributeur'] = df['distributeur'].map(mean_entries_by_distributor.rank().map(rank_to_score))

In [679]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7485 entries, 0 to 8233
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   titre                    7485 non-null   object 
 1   acteur_1                 7485 non-null   object 
 2   acteur_2                 7485 non-null   object 
 3   acteur_3                 7485 non-null   object 
 4   réalisateur              7485 non-null   object 
 5   distributeur             7485 non-null   object 
 6   note_presse              7485 non-null   float64
 7   duree                    7485 non-null   int64  
 8   genre                    7485 non-null   object 
 9   pays                     7485 non-null   object 
 10  type                     7485 non-null   object 
 11  nominations              7485 non-null   int64  
 12  prix                     7485 non-null   int64  
 13  annee_production         7485 non-null   int64  
 14  Semaine                 

# Entrainement du model

In [680]:
df.describe()

Unnamed: 0,note_presse,duree,nominations,prix,annee_production,Entrées_1ère_semaine,actor_1_popularity,actor_2_popularity,actor_3_popularity,director_popularity,actors_popularity,reputation_distributeur
count,7485.0,7485.0,7485.0,7485.0,7485.0,7485.0,7485.0,7485.0,7485.0,7485.0,7485.0,7465.0
mean,3.078024,103.582899,1.658651,1.852505,2012.68978,159641.3,15.383564,13.426208,10.831014,3.913951,39.640786,344.67649
std,0.64245,19.667655,2.928893,4.651097,6.792977,323779.8,21.558655,22.610268,18.257015,5.665519,45.673172,90.129517
min,1.0,0.0,0.0,0.0,1926.0,4.0,0.6,0.6,0.6,0.6,1.8,1.0
25%,2.7,91.0,0.0,0.0,2009.0,7129.0,2.866,2.584,1.979,0.98,10.519,289.0
50%,3.1,100.0,0.0,0.0,2013.0,49046.0,8.017,7.283,5.214,1.994,23.137,381.0
75%,3.5,112.0,2.0,0.0,2018.0,167270.0,21.358,18.33,14.567,4.471,59.358,415.0
max,5.0,543.0,53.0,62.0,2023.0,4378720.0,544.768,544.768,544.768,86.909,667.184,435.0


In [681]:
df = df[df['Entrées_1ère_semaine'] < 3000000]

In [682]:
#df = df[df['director_popularity'] < 30]

In [683]:
#df = df[df['actors_popularity'] < 400]

In [684]:
# Appliquer le logarithme 
df['actors_popularity'] = round(np.log1p(df['actors_popularity']), 10)
df['director_popularity'] = round(np.log1p(df['director_popularity']), 10)
# df['duree'] = round(np.log1p(df['duree']), 10)
# df['reputation_distributeur'] = round(np.log1p(df['reputation_distributeur']), 10)


In [685]:
df = df.dropna()

In [686]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7455 entries, 0 to 8233
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   titre                    7455 non-null   object 
 1   acteur_1                 7455 non-null   object 
 2   acteur_2                 7455 non-null   object 
 3   acteur_3                 7455 non-null   object 
 4   réalisateur              7455 non-null   object 
 5   distributeur             7455 non-null   object 
 6   note_presse              7455 non-null   float64
 7   duree                    7455 non-null   int64  
 8   genre                    7455 non-null   object 
 9   pays                     7455 non-null   object 
 10  type                     7455 non-null   object 
 11  nominations              7455 non-null   int64  
 12  prix                     7455 non-null   int64  
 13  annee_production         7455 non-null   int64  
 14  Semaine                 

In [687]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import FunctionTransformer
import numpy as np


 # Séparer les colonnes catégorielles et numériques
categorical_features = ['annee_production', 'type', 'vacances', 'saison', 'pays','genre',  'langue']
numeric_features = ['reputation_distributeur', 'duree','actors_popularity','director_popularity']



# Créer les transformers pour les colonnes catégorielles et numériques
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numeric_transformer = StandardScaler()

# Créer une fonction pour appliquer le logarithme aux colonnes numériques
# def log_transform(X):
#     return np.log(X)

# Ajouter la fonction de transformation logarithmique au preprocesseur
# numeric_log_transformer = FunctionTransformer(log_transform)

# Mettre à jour le preprocesseur pour inclure la transformation logarithmique
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numeric_transformer, numeric_features)
    ]
)

In [688]:

# Diviser les données en caractéristiques (X) et la variable cible (y)
X = df[categorical_features+numeric_features]
y = df['Entrées_1ère_semaine']  # Variable cible
# Diviser les données en jeux d'entraînement (80%) et de test (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# shuffle=True, 

In [689]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor 
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor


# # Spécifier les hyperparamètres pour LightGBMRegressor
# hyperparameters_lgbm = {
#     "boosting_type": "gbdt",
#     "colsample_bytree": 0.8,
#     "learning_rate": 0.08421631578947368,
#     "max_bin": 63,
#     "max_depth": 5,
#     "min_data_in_leaf": 45,
#     "min_split_gain": 0.3157894736842105,
#     "n_estimators": 100,
#     "reg_alpha": 0.22499999999999998,
#     "reg_lambda": 1.2,
#     "subsample": 0.05,
#     "subsample_freq": 0
# }





# Créer le pipeline complet avec le nouveau modèle RandomForestRegressor
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
     
])


# RandomForest
param_distributions = {
    'regressor__n_estimators': [50, 100, 150, 200, 250],
    'regressor__max_depth': [None, 5, 10, 15, 20],
    'regressor__min_samples_split': [2, 5, 10, 15],
    'regressor__min_samples_leaf': [1, 2, 4, 6],
    # 'regressor__max_features': ['auto', 'sqrt', 'log2', None],
}



# # XGBoost
# param_distributions = {
#     'regressor__n_estimators': np.arange(50, 401, 50),
#     'regressor__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
#     'regressor__max_depth': [3, 5, 7, 9],
#     'regressor__min_child_weight': [1, 3, 5, 7],
#     'regressor__subsample': [0.7, 0.8, 0.9, 1.0],
#     'regressor__colsample_bytree': [0.7, 0.8, 0.9, 1.0],
#     'regressor__gamma': [0, 0.1, 0.2, 0.3, 0.4]
# }

# # Créer l'objet RandomizedSearchCV pour effectuer la recherche aléatoire
random_search = RandomizedSearchCV(
    model, param_distributions=param_distributions, n_iter=10, scoring='neg_mean_squared_error', cv=5, random_state=42
)

# Entraîner le modèle sur les données d'entraînement en utilisant la recherche aléatoire
random_search.fit(X_train, y_train)

# Afficher les meilleurs hyperparamètres trouvés
print("Meilleurs hyperparamètres :")
print(random_search.best_params_)

# Accéder au meilleur modèle trouvé (avec les meilleurs hyperparamètres)
best_model = random_search.best_estimator_

# Évaluer le modèle sur les données de test
y_pred = best_model.predict(X_test)

# # Entraîner le modèle sur les données d'entraînement
# model.fit(X_train, y_train)

Meilleurs hyperparamètres :
{'regressor__n_estimators': 100, 'regressor__min_samples_split': 5, 'regressor__min_samples_leaf': 2, 'regressor__max_depth': 15}


In [690]:
best_model.score(X_train, y_train)

0.8180366068412964

In [691]:
best_model.score(X_test, y_test)

0.5304790503675831

# Enregistrement le model dans Pickle

In [692]:
import pickle

# Supposons que best_model contienne le meilleur modèle sélectionné
# Enregistrez le meilleur modèle dans un fichier pickle
# with open('best_model.pkl', 'wb') as file:
#     pickle.dump(model, file)


In [693]:
df[df['acteur_2'] ==  'Morgan Freeman']

Unnamed: 0,titre,acteur_1,acteur_2,acteur_3,réalisateur,distributeur,note_presse,duree,genre,pays,...,Entrées_1ère_semaine,actor_1_popularity,actor_2_popularity,actor_3_popularity,director_popularity,vacances,saison,actors_popularity,langue,reputation_distributeur
1145,La Chute du président,Gerard Butler,Morgan Freeman,Nick Nolte,Ric Roman Waugh,M6 Vidéo,2.5,121,Action,U.S.A.,...,188852,60.882,67.425,20.43,2.088153,True,Automne,5.00888,en,409.0
2817,Ben-Hur,Jack Huston,Morgan Freeman,Toby Kebbell,Timur Bekmambetov,Paramount Pictures France,1.6,124,Péplum,U.S.A.,...,185275,14.212,67.425,20.376,1.651539,False,Automne,4.634855,en,426.0
3642,Lucy,Scarlett Johansson,Morgan Freeman,Min-sik Choi,Luc Besson,EuropaCorp Distribution,2.7,89,Science fiction,France,...,1941424,54.46,67.425,9.571,2.835681,True,Été,4.886251,en,415.0
6680,Sans plus attendre,Jack Nicholson,Morgan Freeman,Sean Hayes,Rob Reiner,Warner Bros. France,2.2,96,Drame,U.S.A.,...,107748,16.844,67.425,9.005,2.696315,True,Printemps,4.546205,en,432.0
6929,Evan tout-puissant,Steve Carell,Morgan Freeman,Lauren Graham,Tom Shadyac,Paramount Pictures France,1.5,96,Comédie,U.S.A.,...,150994,33.087,67.425,7.738,1.829055,True,Été,4.693639,en,426.0
6978,Le Contrat,John Cusack,Morgan Freeman,Jamie Anderson,Bruce Beresford,Metropolitan FilmExport,2.1,97,Thriller,U.S.A.,...,74260,33.916,67.425,1.502,1.172172,True,Été,4.64288,en,398.0
7526,Million Dollar Baby,Clint Eastwood,Morgan Freeman,Hilary Swank,Clint Eastwood,Mars Distribution,4.9,132,Drame,U.S.A.,...,774989,46.266,67.425,17.905,3.855791,False,Printemps,4.887307,en,412.0
7807,La Somme de toutes les peurs,Ben Affleck,Morgan Freeman,Ciarán Hinds,Phil Alden Robinson,United International Pictures (UIP),2.5,124,Thriller,U.S.A.,...,272609,37.807,67.425,18.71,1.390286,True,Été,4.82785,en,427.0
7882,Crimes et pouvoir,Ashley Judd,Morgan Freeman,Amanda Peet,Carl Franklin,UFD,2.7,115,Thriller,U.S.A.,...,139400,17.901,67.425,22.011,1.046266,True,Été,4.685247,en,422.0


In [694]:
df[df['Entrées_1ère_semaine']>800000]

Unnamed: 0,titre,acteur_1,acteur_2,acteur_3,réalisateur,distributeur,note_presse,duree,genre,pays,...,Entrées_1ère_semaine,actor_1_popularity,actor_2_popularity,actor_3_popularity,director_popularity,vacances,saison,actors_popularity,langue,reputation_distributeur
6,Les Trois Mousquetaires: D'Artagnan,François Civil,Vincent Cassel,Romain Duris,Martin Bourboulon,Pathé,3.7,121,Aventure,France,...,1007159,21.277,21.751,10.465,1.122654,True,Printemps,3.998072,fr,424.0
10,Astérix et Obélix : L'Empire du milieu,Guillaume Canet,Gilles Lellouche,Marion Cotillard,Guillaume Canet,Pathé,2.4,112,Aventure,France,...,1882686,14.502,13.133,28.489,2.941857,False,Hiver,4.045224,fr,424.0
15,"Super Mario Bros, le film",Pierre Tessier,Chris Pratt,Jérémie Covillault,Aaron Horvath,Universal Pictures International France,3.1,92,Animation,U.S.A.,...,1866914,2.088,46.533,2.529,0.830297,True,Printemps,3.954124,en,419.0
21,Mission: Impossible – Dead Reckoning Partie 1,Tom Cruise,Hayley Atwell,Ving Rhames,Christopher McQuarrie,Paramount Pictures France,4.0,163,Action,U.S.A.,...,930141,61.823,61.355,13.373,2.410452,True,Été,4.923995,en,426.0
35,Indiana Jones et le Cadran de la Destinée,Harrison Ford,Phoebe Waller-Bridge,Mads Mikkelsen,James Mangold,The Walt Disney Company France,3.5,154,Action,U.S.A.,...,1089875,31.086,16.301,54.518,2.907393,False,Été,4.633806,en,434.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8087,Ce que veulent les femmes,Mel Gibson,Helen Hunt,Marisa Tomei,Nancy Meyers,Bac Films,3.7,127,Comédie,U.S.A.,...,959614,59.839,32.989,21.715,1.719368,True,Hiver,4.749643,fr,383.0
8102,American Pie 2,Jason Biggs,Seann William Scott,Chris Klein,James B. Rogers,United International Pictures (UIP),2.2,105,Comédie,U.S.A.,...,1250508,19.984,13.571,12.678,1.611635,True,Automne,3.855093,en,427.0
8108,Le Placard,Daniel Auteuil,Gérard Depardieu,Thierry Lhermitte,Francis Veber,Gaumont Buena Vista International (GBVI),3.6,84,Comédie,France,...,1190496,7.955,18.644,3.174,1.524969,False,Hiver,3.426638,fr,431.0
8117,La Vérité si je mens ! 2,Richard Anconina,José Garcia,Gad Elmaleh,Thomas Gilou,Warner Bros. France,3.7,105,Comédie,France,...,2830489,3.254,10.095,6.070,0.768255,True,Hiver,3.016466,fr,432.0


In [695]:
df1 = df[df['pays'] == 'France']
df2 = df[df['pays'] == 'U.S.A.']

In [696]:
df1.describe()

Unnamed: 0,note_presse,duree,nominations,prix,annee_production,Entrées_1ère_semaine,actor_1_popularity,actor_2_popularity,actor_3_popularity,director_popularity,actors_popularity,reputation_distributeur
count,2971.0,2971.0,2971.0,2971.0,2971.0,2971.0,2971.0,2971.0,2971.0,2971.0,2971.0,2971.0
mean,3.062015,97.979468,1.566139,1.840458,2013.164591,139273.4,8.584148,7.060488,5.506184,0.993451,2.784345,339.414339
std,0.629303,15.418208,2.899655,4.665381,5.784704,240195.1,14.822846,8.327876,6.86798,0.553941,0.796436,92.53607
min,1.0,0.0,0.0,0.0,1983.0,5.0,0.6,0.6,0.6,0.470004,1.029619,1.0
25%,2.7,89.0,0.0,0.0,2009.0,11602.5,2.417,2.0975,1.683,0.470004,2.319491,313.0
50%,3.1,97.0,0.0,0.0,2014.0,59868.0,5.183,4.413,3.47,0.875469,2.803724,364.0
75%,3.5,105.0,2.0,0.0,2018.0,163628.5,9.896,8.652,6.6,1.302097,3.287972,407.0
max,4.7,220.0,53.0,62.0,2023.0,2830489.0,544.768,76.703,94.116,4.18785,6.39092,435.0


In [697]:
df1[df1['actor_1_popularity'] > 70]

Unnamed: 0,titre,acteur_1,acteur_2,acteur_3,réalisateur,distributeur,note_presse,duree,genre,pays,...,Entrées_1ère_semaine,actor_1_popularity,actor_2_popularity,actor_3_popularity,director_popularity,vacances,saison,actors_popularity,langue,reputation_distributeur
898,Irréversible - Inversion Intégrale,Monica Bellucci,Vincent Cassel,Albert Dupontel,Gaspar Noé,Carlotta Films,3.6,90,Drame,France,...,2905,76.703,21.751,6.437,2.141242,True,Été,4.66241,fr,318.0
2573,Mechanic Résurrection,Jason Statham,Jessica Alba,Tommy Lee Jones,Dennis Gansel,Metropolitan FilmExport,2.8,99,Action,France,...,198401,119.453,26.161,35.507,1.885098,True,Automne,5.204671,en,398.0
3617,3 Days to Kill,Kevin Costner,Amber Heard,Hailee Steinfeld,McG,EuropaCorp Distribution,2.0,116,Action,France,...,186899,86.909,23.793,49.125,3.418874,False,Printemps,5.080329,en,415.0
4264,Only God Forgives,Ryan Gosling,Kristin Scott Thomas,Vithaya Pansringarm,Nicolas Winding Refn,Wild Side Films / Le Pacte,3.4,90,Thriller,France,...,247780,171.69,16.807,6.758,1.811562,False,Printemps,5.279415,en,361.0
5090,Colombiana,Zoe Saldana,Amandla Stenberg,Michael Vartan,Olivier Megaton,EuropaCorp Distribution,2.3,105,Action,France,...,284215,81.067,23.651,7.719,1.089235,True,Été,4.731248,en,415.0
5171,Un Été Brûlant,Monica Bellucci,Louis Garrel,Céline Sallette,Philippe Garrel,Wild Bunch Distribution,3.3,95,Drame,France,...,10867,76.703,12.641,5.813,0.929799,False,Automne,4.565982,fr,392.0
5301,La Taupe,Gary Oldman,Mark Strong,John Hurt,Tomas Alfredson,StudioCanal,4.0,122,Espionnage,France,...,277172,544.768,34.643,15.994,1.333157,True,Hiver,6.39092,fr,407.0
5378,The Lady,Michelle Yeoh,David Thewlis,Benedict Wong,Luc Besson,EuropaCorp Distribution,2.4,127,Biopic,France,...,223314,71.795,26.989,23.817,2.835681,False,Hiver,4.817059,fr,415.0
6315,Le Transporteur III,Jason Statham,Natalya Rudakova,Robert Knepper,Olivier Megaton,EuropaCorp Distribution,2.6,105,Action,France,...,678945,119.453,22.354,20.818,1.089235,False,Automne,5.097577,en,415.0
6723,Une histoire italienne,Monica Bellucci,Luca Zingaretti,Alessio Boni,Marco Tullio Giordana,Océan Films,2.4,148,Drame,France,...,20013,76.703,7.072,6.069,1.141991,True,Été,4.509144,fr,348.0


In [698]:
df2.describe()

Unnamed: 0,note_presse,duree,nominations,prix,annee_production,Entrées_1ère_semaine,actor_1_popularity,actor_2_popularity,actor_3_popularity,director_popularity,actors_popularity,reputation_distributeur
count,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0
mean,2.947086,109.095238,1.730458,1.944295,2011.152291,295546.5,30.212436,27.114221,22.122357,1.804054,4.20615,401.466307
std,0.707037,19.146952,3.118792,5.135604,6.923576,425987.1,25.256046,32.969306,25.742628,0.733877,0.667639,51.457324
min,1.0,0.0,0.0,0.0,1926.0,34.0,0.6,0.6,0.6,0.470004,1.029619,13.0
25%,2.5,96.0,0.0,0.0,2007.0,51250.25,15.5415,13.84675,10.51925,1.292533,3.965938,398.0
50%,3.0,106.0,0.0,0.0,2011.0,141500.5,25.379,21.793,18.2225,1.729795,4.308172,419.0
75%,3.4,120.0,2.0,0.0,2016.0,337876.5,38.708,32.9675,27.777,2.221641,4.598654,426.0
max,4.9,201.0,26.0,37.0,2023.0,2935984.0,544.768,544.768,544.768,4.476302,6.479567,434.0


In [699]:
df2['distributeur'].unique()

array([' Paramount Pictures France ',
       ' Universal Pictures International France ',
       ' Sony Pictures Releasing France ', ' Metropolitan FilmExport ',
       ' The Walt Disney Company France ', ' Warner Bros. France ',
       ' SND ', ' KMBO ', ' Tandem ', ' ARP Sélection ', ' Apollo Films ',
       ' Damned Distribution ', ' Les Films du Losange ', ' Kinovista ',
       ' Diaphana Distribution ', ' The Jokers / Les Bookmakers ',
       ' Pathé Live ', ' ESC Films ', ' Star Invest Films France ',
       ' StudioCanal ', ' Le Pacte ', ' Carlotta Films ',
       ' Condor Distribution ', " L'Atelier Distribution ",
       ' Capricci Films ', ' Alba Films ', ' UFO Distribution ',
       ' M6 Vidéo ', ' Orange Studio Distribution / UGC Distribution ',
       ' Potemkine Films ', ' Saje Distribution ', ' JHR Films ',
       ' Wild Bunch Distribution ', ' Wayna Pitch ', ' Nour Films ',
       ' Twentieth Century Fox France ', ' Amazon Prime Video ',
       ' LFR Films ', ' Mars Fil

In [700]:
df['distributeur'].value_counts()

 Metropolitan FilmExport                     405
 Warner Bros. France                         292
 Universal Pictures International France     255
 Pathé                                       238
 Twentieth Century Fox France                227
                                            ... 
 Eivissa Productions                           1
 SBR PROD                                      1
 Seven 7                                       1
 Cartel Distrib                                1
 Swashbuckler Films                            1
Name: distributeur, Length: 415, dtype: int64

In [701]:
df[df['Entrées_1ère_semaine'] < 1000]['distributeur'].value_counts()


 Cinéma Saint-André des Arts     53
 Night ed films                  21
 Shellac                         17
 Bodega Films                    15
 Aanna Films                     15
                                 ..
 Meret Films                      1
 Lutine & Cie                     1
 Amazon Prime Video               1
 Sinociné                         1
 Swashbuckler Films               1
Name: distributeur, Length: 249, dtype: int64

In [702]:
df[df['titre']=='Une saison pour Maurice Pons']

Unnamed: 0,titre,acteur_1,acteur_2,acteur_3,réalisateur,distributeur,note_presse,duree,genre,pays,...,Entrées_1ère_semaine,actor_1_popularity,actor_2_popularity,actor_3_popularity,director_popularity,vacances,saison,actors_popularity,langue,reputation_distributeur


In [703]:
df

Unnamed: 0,titre,acteur_1,acteur_2,acteur_3,réalisateur,distributeur,note_presse,duree,genre,pays,...,Entrées_1ère_semaine,actor_1_popularity,actor_2_popularity,actor_3_popularity,director_popularity,vacances,saison,actors_popularity,langue,reputation_distributeur
0,L'Amour et les Forêts,Virginie Efira,Melvil Poupaud,Dominique Reymond,Valérie Donzelli,Diaphana Distribution,3.800000,105,Thriller,France,...,220327,32.967,4.671,1.294,1.859574,False,Printemps,3.687178,fr,362.0
1,Monsieur Constant,Jean-Claude Drouot,Cali,Danièle Evenou,Alan Simon,Babaika,2.300000,108,Comédie dramatique,France,...,40,2.312,1.387,2.066,0.603222,False,Printemps,1.911762,fr,18.0
2,Les Herbes sèches,Deniz Celiloğlu,Merve Dizdar,Musab Ekici,Nuri Bilge Ceylan,Memento Distribution,4.000000,197,Drame,Turquie,...,32179,5.924,15.291,1.316,1.892359,True,Été,3.158319,tr,355.0
3,Carmen,Paul Mescal,Melissa Barrera,Rossy de Palma,Benjamin Millepied,Pathé,2.500000,116,Drame,Australie,...,24397,8.724,36.632,12.756,1.521917,False,Été,4.079434,en,424.0
4,Carry On Jatta 3,Gippy Grewal,Sonam Bajwa,Binnu Dhillon,Smeep Kang,Friday Entertainment,3.082899,140,Comédie,Inde,...,427,3.024,12.425,1.550,1.178039,False,Été,2.890316,pa,272.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8229,The Strange Affair of Uncle Harry,George Sanders,Geraldine Fitzgerald,Ella Raines,Robert Siodmak,Swashbuckler Films,3.600000,81,Drame,U.S.A.,...,543,9.128,3.765,6.560,0.873801,True,Automne,3.018130,en,120.0
8230,Le Fils unique,Chôko Iida,Shin'ichi Himori,Masao Hayama,Yasujirô Ozu,Carlotta Films,4.600000,87,Comédie dramatique,Japon,...,3458,2.881,3.788,2.039,1.745192,False,Été,2.272950,ja,318.0
8231,La Maison de la mort,Boris Karloff,Melvyn Douglas,Charles Laughton,James Whale,Carlotta Films,2.900000,72,Epouvante-horreur,U.S.A.,...,911,7.142,3.723,7.058,0.875469,False,Automne,2.940378,en,318.0
8232,The Lodger: A Story of the London Fog,Marie Ault,Arthur Chesney,Reginald Gardiner,Alfred Hitchcock,Carlotta Films,5.000000,79,Drame,Grande-Bretagne,...,615,0.657,1.710,2.063,2.580520,False,Automne,1.691939,en,318.0


In [704]:
df['reputation_distributeur'].sort_values()

2809      1.0
2976      2.0
3347      3.0
1169      4.0
778       7.0
        ...  
621     434.0
379     434.0
490     434.0
761     434.0
1940    435.0
Name: reputation_distributeur, Length: 7455, dtype: float64

In [705]:
DF=df.groupby('distributeur')['Entrées_1ère_semaine'].mean()

In [706]:
DF.sort_values()

distributeur
 Meret Films                                  5.000000e+00
 Ciné Croisière                               1.100000e+01
 Festizicnema distribution                    1.200000e+01
 Sonata Films                                 1.300000e+01
 8½ Distribution                              1.700000e+01
                                                  ...     
 Gaumont Buena Vista International (GBVI)     4.594944e+05
 Warner Bros. France                          4.597354e+05
 Buena Vista International                    5.302640e+05
 The Walt Disney Company France               6.261127e+05
 EuropaCorp Distribution / ARP Sélection      1.524218e+06
Name: Entrées_1ère_semaine, Length: 415, dtype: float64

## Brouillon

In [707]:
df[df['distributeur'] == ' Warner Bros. France ']

Unnamed: 0,titre,acteur_1,acteur_2,acteur_3,réalisateur,distributeur,note_presse,duree,genre,pays,...,Entrées_1ère_semaine,actor_1_popularity,actor_2_popularity,actor_3_popularity,director_popularity,vacances,saison,actors_popularity,langue,reputation_distributeur
36,The Flash,Ezra Miller,Sasha Calle,Michael Shannon,Andy Muschietti,Warner Bros. France,2.5,144,Action,U.S.A.,...,388700,30.375,56.883,32.868,1.881143,False,Été,4.796831,en,432.0
52,Hawaii,Bérénice Bejo,Élodie Bouchez,Émilie Caen,Melissa Drigeard,Warner Bros. France,2.5,104,Comédie,France,...,71825,8.430,10.899,2.426,0.470004,False,Printemps,3.124785,en,432.0
82,Shazam! La Rage des Dieux,Zachary Levi,Asher Angel,Jack Dylan Grazer,David F. Sandberg,Warner Bros. France,2.4,130,Fantastique,U.S.A.,...,262525,17.647,29.778,17.391,1.533420,False,Printemps,4.186863,en,432.0
92,Sacrées momies,Lou Jean,Ana Esther Alborg,Roser Aldabó Arnau,Juan Jesús García Galocha,Warner Bros. France,3.7,89,Animation,Espagne,...,168937,2.926,0.721,1.094,1.115142,True,Hiver,1.747633,es,432.0
114,Sage-Homme,Karin Viard,Melvin Boomer,Steve Tientcheu,Jennifer Devoldere,Warner Bros. France,3.6,100,Comédie dramatique,France,...,279606,5.104,0.600,6.197,0.683097,False,Printemps,2.557305,fr,432.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8117,La Vérité si je mens ! 2,Richard Anconina,José Garcia,Gad Elmaleh,Thomas Gilou,Warner Bros. France,3.7,105,Comédie,France,...,2830489,3.254,10.095,6.070,0.768255,True,Hiver,3.016466,fr,432.0
8141,Wild Wild West,Will Smith,Kevin Kline,Kenneth Branagh,Barry Sonnenfeld,Warner Bros. France,3.4,104,Aventure,U.S.A.,...,37616,34.928,16.601,28.079,2.023665,False,Automne,4.389598,en,432.0
8151,Négociateur,Samuel L. Jackson,Kevin Spacey,David Morse,F. Gary Gray,Warner Bros. France,3.5,140,Thriller,Allemagne,...,85976,52.806,16.877,20.167,1.567782,False,Automne,4.509210,en,432.0
8154,"South Park, le film",Trey Parker,Matt Stone,Mary Kay Bergman,Trey Parker,Warner Bros. France,3.8,81,Animation,U.S.A.,...,73036,9.717,6.012,4.726,2.371831,False,Automne,3.065958,en,432.0


In [708]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.neighbors import KNeighborsRegressor
# from catboost import CatBoostRegressor
# from xgboost import XGBRegressor

# # Supposons que X_train et y_train sont vos données d'entraînement
# # Supposons que cat_cols est la liste des noms des colonnes catégorielles dans le dataframe
# # Supposons que num_cols est la liste des noms des colonnes numériques dans le dataframe

# # Créer les préprocesseurs pour les colonnes catégorielles et numériques
# cat_preprocessor = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# num_preprocessor = Pipeline(steps=[
#     ('scaler', MinMaxScaler())
# ])

# # Créer le preprocessor complet en utilisant ColumnTransformer pour gérer les colonnes catégorielles et numériques
# preprocessor = ColumnTransformer(transformers=[
#     ('num', num_preprocessor, numeric_features),
#     ('cat', cat_preprocessor, categorical_features)
# ])

# # Créer une liste de modèles avec les hyperparamètres à ajuster
# models = [
#     {
#         'name': 'RandomForestRegressor',
#         'model': RandomForestRegressor(),
#         'params': {
#             'randomforestregressor__n_estimators': [50, 100, 150],
#             'randomforestregressor__max_depth': [None, 5, 10],
#             'randomforestregressor__min_samples_split': [2, 5, 10]
#         }
#     },
  
# ]

# # Effectuer la recherche d'hyperparamètres avec validation croisée pour chaque modèle
# for model_info in models:
#     model = model_info['model']
#     params = model_info['params']
    
#     # Créer le pipeline pour le modèle en combinant le préprocesseur approprié avec le modèle
#     pipeline = Pipeline(steps=[('preprocessor', preprocessor), (model_info['name'].lower(), model)])
    
#     grid_search = GridSearchCV(pipeline, params, cv=5)  # cv=5 pour une validation croisée 5-fold
#     grid_search.fit(X_train, y_train)
#     best_params = grid_search.best_params_
#     best_score = grid_search.best_score_
    
#     print(f"Modèle : {model_info['name']}")
#     print(f"Meilleurs hyperparamètres : {best_params}")
#     print(f"Meilleur score : {best_score}")
