In [10]:
import pandas as pd  
# Read data/allocine_silver.csv into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="data/allocine_silver.csv")

In [11]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8637 entries, 0 to 8636
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   titre                 8637 non-null   object 
 1   acteur_1              8637 non-null   object 
 2   acteur_2              8637 non-null   object 
 3   acteur_3              8637 non-null   object 
 4   réalisateur           8637 non-null   object 
 5   distributeur          8637 non-null   object 
 6   note_presse           8637 non-null   float64
 7   duree                 8637 non-null   int64  
 8   genre                 8637 non-null   object 
 9   pays                  8637 non-null   object 
 10  type                  8637 non-null   object 
 11  nominations           8637 non-null   int64  
 12  prix                  8637 non-null   int64  
 13  annee_production      8637 non-null   int64  
 14  Semaine               8637 non-null   object 
 15  Entrées_1ère_semaine 

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Reload the dataset inside the cell so it does not rely on a previously defined `df`.
df = pd.read_csv("data/allocine_silver.csv")

# Feature columns, grouped by the preprocessing they receive.
categorical_features = [
    "acteur_1",
    "acteur_2",
    "acteur_3",
    "réalisateur",
    "distributeur",
    "genre",
    "pays",
]
numerical_features = [
    "note_presse",
    "duree",
    "nominations",
    "prix",
    "annee_production",
]

# One-hot encode the categorical columns; every other column is kept unchanged.
df_encoded = pd.get_dummies(df, columns=categorical_features)

# The target is "Entrées_1ère_semaine"; titre, type and Semaine are excluded from the features.
y = df_encoded["Entrées_1ère_semaine"]
X = df_encoded.drop(columns=["Entrées_1ère_semaine", "titre", "type", "Semaine"])

# 80 % of the rows go to training; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

# Only the numerical columns are standardised; the dummy columns pass through untouched.
numerical_transformer = StandardScaler()
preprocessor = ColumnTransformer(
    [("num", numerical_transformer, numerical_features)],
    remainder="passthrough",
)

rf_reg = RandomForestRegressor(random_state=42, max_depth=9, n_estimators=32)

pipe = Pipeline(steps=[("preprocessor", preprocessor), ("rf_reg", rf_reg)])

# Fit on the training split, then predict on both splits to compare their scores.
y_pred_train = pipe.fit(X_train, y_train).predict(X_train)
y_pred_test = pipe.predict(X_test)

print("######## R2 score : ")
for split_label, y_true, y_hat in (
    ("TRAIN :", y_train, y_pred_train),
    ("TEST :", y_test, y_pred_test),
):
    print(split_label, r2_score(y_true, y_hat))


######## R2 score : 
TRAIN : 0.6534614081069462
TEST : 0.4332830536399158


In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Reload the second dataset inside the cell so it does not rely on a previously defined `df`.
df = pd.read_csv("data/allocine_silver_2.csv")

# Feature columns, grouped by the preprocessing they receive.
categorical_features = [
    "acteur_1",
    "acteur_2",
    "acteur_3",
    "réalisateur",
    "distributeur",
    "genre",
    "pays",
]
numerical_features = [
    "note_presse",
    "duree",
    "nominations",
    "prix",
    "annee_production",
]

# One-hot encode the categorical columns; every other column is kept unchanged.
df_encoded = pd.get_dummies(df, columns=categorical_features)

# The target is "Entrées_1ère_semaine"; titre, type and Semaine are excluded from the features.
y = df_encoded["Entrées_1ère_semaine"]
X = df_encoded.drop(columns=["Entrées_1ère_semaine", "titre", "type", "Semaine"])

# 80 % of the rows go to training; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

# Only the numerical columns are standardised; the dummy columns pass through untouched.
numerical_transformer = StandardScaler()
preprocessor = ColumnTransformer(
    [("num", numerical_transformer, numerical_features)],
    remainder="passthrough",
)

rf_reg = RandomForestRegressor(random_state=42, max_depth=9, n_estimators=32)

pipe = Pipeline(steps=[("preprocessor", preprocessor), ("rf_reg", rf_reg)])

# Fit on the training split, then predict on both splits to compare their scores.
y_pred_train = pipe.fit(X_train, y_train).predict(X_train)
y_pred_test = pipe.predict(X_test)

print("######## R2 score : ")
for split_label, y_true, y_hat in (
    ("TRAIN :", y_train, y_pred_train),
    ("TEST :", y_test, y_pred_test),
):
    print(split_label, r2_score(y_true, y_hat))


######## R2 score : 
TRAIN : 0.6733288278549511
TEST : 0.3234760606146241


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import Normalizer

# Reload the second dataset inside the cell so it does not rely on a previously defined `df`.
df = pd.read_csv("data/allocine_silver_2.csv")

# Feature columns, grouped by the preprocessing they receive.
categorical_features = [
    "acteur_1",
    "acteur_2",
    "acteur_3",
    "réalisateur",
    "distributeur",
    "genre",
    "pays",
]
numerical_features = [
    "note_presse",
    "duree",
    "nominations",
    "prix",
    "annee_production",
]

# One-hot encode the categorical columns; every other column is kept unchanged.
df_encoded = pd.get_dummies(df, columns=categorical_features)

# The target is "Entrées_1ère_semaine"; titre, type and Semaine are excluded from the features.
y = df_encoded["Entrées_1ère_semaine"]
X = df_encoded.drop(columns=["Entrées_1ère_semaine", "titre", "type", "Semaine"])

# This experiment trains on 70 % of the rows; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    shuffle=True,
    random_state=42,
)

# Numerical columns are standardised and then passed through Normalizer.
# NOTE(review): Normalizer rescales each sample (row), not each column - confirm this is intended.
numerical_transformer = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("normalizer", Normalizer()),
    ]
)

# The dummy columns are forwarded unchanged through the remainder.
preprocessor = ColumnTransformer(
    [("num", numerical_transformer, numerical_features)],
    remainder="passthrough",
)

xgb_reg = XGBRegressor(random_state=42, max_depth=9, n_estimators=32)

pipe = Pipeline(steps=[("preprocessor", preprocessor), ("xgb_reg", xgb_reg)])

# Fit on the training split, then predict on both splits to compare their scores.
y_pred_train = pipe.fit(X_train, y_train).predict(X_train)
y_pred_test = pipe.predict(X_test)

print("######## R2 score : ")
for split_label, y_true, y_hat in (
    ("TRAIN :", y_train, y_pred_train),
    ("TEST :", y_test, y_pred_test),
):
    print(split_label, r2_score(y_true, y_hat))


######## R2 score : 
TRAIN : 0.9089379348793828
TEST : 0.4501884988986926


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import Normalizer

# Reload the dataset inside the cell so it does not rely on a previously defined `df`.
df = pd.read_csv("data/allocine_silver.csv")

# Feature columns, grouped by the preprocessing they receive.
categorical_features = [
    "acteur_1",
    "acteur_2",
    "acteur_3",
    "réalisateur",
    "distributeur",
    "genre",
    "pays",
]
numerical_features = [
    "note_presse",
    "duree",
    "nominations",
    "prix",
    "annee_production",
]

# One-hot encode the categorical columns; every other column is kept unchanged.
df_encoded = pd.get_dummies(df, columns=categorical_features)

# The target is "Entrées_1ère_semaine"; titre, type and Semaine are excluded from the features.
y = df_encoded["Entrées_1ère_semaine"]
X = df_encoded.drop(columns=["Entrées_1ère_semaine", "titre", "type", "Semaine"])

# 80 % of the rows go to training; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

# Numerical columns are standardised and then passed through Normalizer.
numerical_transformer = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("normalizer", Normalizer()),
    ]
)

# The dummy columns are forwarded unchanged through the remainder.
preprocessor = ColumnTransformer(
    [("num", numerical_transformer, numerical_features)],
    remainder="passthrough",
)

xgb_reg = XGBRegressor(random_state=42, max_depth=9, n_estimators=32)

pipe = Pipeline(steps=[("preprocessor", preprocessor), ("xgb_reg", xgb_reg)])

# Fit on the training split, then predict on both splits to compare their scores.
y_pred_train = pipe.fit(X_train, y_train).predict(X_train)
y_pred_test = pipe.predict(X_test)

print("######## R2 score : ")
for split_label, y_true, y_hat in (
    ("TRAIN :", y_train, y_pred_train),
    ("TEST :", y_test, y_pred_test),
):
    print(split_label, r2_score(y_true, y_hat))


######## R2 score : 
TRAIN : 0.8993297945099384
TEST : 0.43904308607047093


### Voici le code modifié pour réduire la complexité du modèle en ajustant les hyperparamètres de XGBRegressor. 
### J’ai réduit la profondeur maximale (max_depth) de 9 à 5 et fixé le taux d’apprentissage (learning_rate) à 0.1, soit une valeur inférieure à celle par défaut de XGBoost (0.3) :

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import Normalizer

# Reload the second dataset inside the cell so it does not rely on a previously defined `df`.
df = pd.read_csv("data/allocine_silver_2.csv")

# Feature columns, grouped by the preprocessing they receive.
categorical_features = [
    "acteur_1",
    "acteur_2",
    "acteur_3",
    "réalisateur",
    "distributeur",
    "genre",
    "pays",
]
numerical_features = [
    "note_presse",
    "duree",
    "nominations",
    "prix",
    "annee_production",
]

# One-hot encode the categorical columns; every other column is kept unchanged.
df_encoded = pd.get_dummies(df, columns=categorical_features)

# The target is "Entrées_1ère_semaine"; titre, type and Semaine are excluded from the features.
y = df_encoded["Entrées_1ère_semaine"]
X = df_encoded.drop(columns=["Entrées_1ère_semaine", "titre", "type", "Semaine"])

# 80 % of the rows go to training; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

# Numerical columns are standardised and then passed through Normalizer.
numerical_transformer = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("normalizer", Normalizer()),
    ]
)

# The dummy columns are forwarded unchanged through the remainder.
preprocessor = ColumnTransformer(
    [("num", numerical_transformer, numerical_features)],
    remainder="passthrough",
)

# Shallower trees (max_depth=5) with an explicit learning rate of 0.1.
xgb_reg = XGBRegressor(
    random_state=42,
    learning_rate=0.1,
    max_depth=5,
    n_estimators=32,
)

pipe = Pipeline(steps=[("preprocessor", preprocessor), ("xgb_reg", xgb_reg)])

# Fit on the training split, then predict on both splits to compare their scores.
y_pred_train = pipe.fit(X_train, y_train).predict(X_train)
y_pred_test = pipe.predict(X_test)

print("######## R2 score : ")
for split_label, y_true, y_hat in (
    ("TRAIN :", y_train, y_pred_train),
    ("TEST :", y_test, y_pred_test),
):
    print(split_label, r2_score(y_true, y_hat))


######## R2 score : 
TRAIN : 0.6216000196889995
TEST : 0.4229689346360884


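Voici le code modifié pour utiliser la régularisation L2 via le paramètre reg_lambda de XGBRegressor. 

J’ai défini reg_lambda à 1.0 (ce qui correspond à la valeur par défaut de XGBoost) pour appliquer une pénalité L2 aux poids des feuilles du modèle. Dans cette version, la variable note_presse est également retirée des variables explicatives et le pipeline entraîné est enregistré au format pickle :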

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
import pickle
from pathlib import Path


# Reload the dataset inside the cell so it does not rely on a previously defined `df`.
df = pd.read_csv("data/allocine_silver_2.csv")

# Feature columns, grouped by the preprocessing they receive.
categorical_features = ["acteur_1", "acteur_2", "acteur_3", "réalisateur", "distributeur", "genre", "pays"]
# note_presse is left out of the features in this experiment (it is dropped from X below).
numerical_features = [ "duree", "nominations", "prix", "annee_production"]

# Keep the categorical columns RAW here. The one-hot encoding lives inside the
# pipeline so that (1) the category vocabulary is learned from the training split
# only and (2) the pickled pipeline can score raw rows directly, without having to
# reproduce the exact dummy columns of the training frame.
X = df.drop(["Entrées_1ère_semaine","note_presse","titre","type","Semaine"], axis=1)
y = df["Entrées_1ère_semaine"]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=42)

# Numerical columns are standardised and then passed through Normalizer.
# NOTE(review): Normalizer rescales each sample (row), not each column - confirm this is intended.
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('normalizer', Normalizer())
])

# Categories that were not seen during fit are encoded as all zeros instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder="passthrough",  # Any additional column left in X is forwarded unchanged
    sparse_threshold=0  # Always emit a dense matrix, as the get_dummies-based version did
)

xgb_reg = XGBRegressor(n_estimators=32, max_depth=9, reg_lambda=1.0, random_state=42)

pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_reg', xgb_reg)
])

pipe.fit(X_train, y_train)
y_pred_train = pipe.predict(X_train)
y_pred_test = pipe.predict(X_test)

print("######## R2 score : ")
print("TRAIN :", r2_score(y_train, y_pred_train))
print("TEST :", r2_score(y_test, y_pred_test))


# Save the fitted pipeline (preprocessing + model) as a pickle file.
# The target directory is created first so the dump does not fail on a fresh checkout.
model_path = Path('pickle/test_cine.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as file:
    pickle.dump(pipe, file)

######## R2 score : 
TRAIN : 0.8873735476759437
TEST : 0.5036992155101085
