In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
import holidays


In [2]:
# Load the pre-processed traffic dataset; the path is resolved relative to the
# directory the notebook is run from.
DATA_PATH = '../data_processed/traffic_shift_nord.csv'
df = pd.read_csv(DATA_PATH)


In [7]:
# Build a datetime column from the separate year ('annee'), month ('mois') and
# day ('jour') columns: render each part as text, join them with dashes and
# let pandas parse the resulting strings.
year_txt = df['annee'].astype(str)
month_txt = df['mois'].astype(str)
day_txt = df['jour'].astype(str)
df['date'] = pd.to_datetime(year_txt + '-' + month_txt + '-' + day_txt)


In [10]:
# Sanity check: list the column names actually present in the frame.
print(df.columns)

# The 'date' column used below is built in an earlier cell.

# Calendar of French public holidays, provided by the `holidays` library.
fr_holidays = holidays.France()

# Flag every row: 1 when its date is a French public holiday, 0 otherwise.
df['jour_ferie'] = df['date'].map(lambda day: int(day in fr_holidays))

# Show the first rows to confirm the new column was added.
print(df.head())


Index(['Unnamed: 0', 'Identifiant_arc', 'C_AR', 'AM/PM', 'annee', 'mois',
       'jour', 'vacances', 'semaine', 'Débit_horaire', 'Taux_d'occupation',
       'Etat_trafic', 'date'],
      dtype='object')
   Unnamed: 0  Identifiant_arc  C_AR AM/PM  annee  mois  jour  vacances  \
0           0              256   9.0    AM   2023     6     1         0   
1           1              256   9.0    AM   2023     6     2         0   
2           2              256   9.0    AM   2023     6     3         0   
3           3              256   9.0    AM   2023     6     4         0   
4           4              256   9.0    AM   2023     6     5         0   

   semaine  Débit_horaire  Taux_d'occupation  Etat_trafic       date  \
0       22     370.000000           8.188256          1.0 2023-06-01   
1       22     425.571429          10.511111          1.0 2023-06-02   
2       22     241.428571           4.100240          1.0 2023-06-03   
3       22     182.428571           2.797226          1.0 

In [12]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [13]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Préparation des données
# Single seed shared by every stochastic step of this cell, so that a
# Restart & Run All reproduces the same split and the same fitted forest.
RANDOM_STATE = 42

# Features / target. The target is the hourly flow ('Débit_horaire'); the raw
# 'date' column is removed from the features as well.
X = df.drop(columns=['Débit_horaire', 'date'])
y = df['Débit_horaire']

# Columns handed to the model, grouped by the preprocessing they receive.
# Columns of X that appear in neither list (e.g. 'Unnamed: 0') are discarded
# by the ColumnTransformer below, whose default is remainder='drop'.
numeric_features = ['C_AR', 'annee', 'mois', 'jour', 'vacances', 'semaine', "Taux_d'occupation", 'Etat_trafic']
categorical_features = ['Identifiant_arc', 'AM/PM', 'jour_ferie']

# Per-group transformers: standardise numeric columns, one-hot encode
# categorical ones. handle_unknown='ignore' keeps prediction from failing on a
# category that was not seen during fit.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Column-wise preprocessing applied in front of every model.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Candidate models, all with default hyper-parameters.
# The random forest is seeded: without random_state its bootstrap samples and
# feature sub-sampling differ on every run, so the reported scores would not be
# reproducible even though the train/test split is.
# NOTE(review): SVR is fitted on the unscaled target with default C/epsilon;
# the recorded run shows a negative R2 for it -- consider scaling the target
# or tuning these parameters before drawing conclusions about SVR.
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=RANDOM_STATE),
    'SVR': SVR()
}

# Metrics collected per model name.
results = {}

# Hold out 20% of the rows as a test set (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Fit every candidate behind the shared preprocessing step and score it on the
# held-out test set.
for name, model in models.items():
    pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])

    # fit() returns the fitted pipeline, so prediction can be chained onto it.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = dict(MSE=mse, R2=r2)

# Report: one three-line paragraph per model, followed by a blank line.
for name, metrics in results.items():
    print(
        f"Model: {name}",
        f"Mean Squared Error: {metrics['MSE']}",
        f"R2 Score: {metrics['R2']}",
        "",
        sep="\n",
    )

# Example of hyper-parameter tuning for the random forest.
# Keys use the '<step name>__<parameter>' convention so that GridSearchCV
# routes them to the 'model' step of the pipeline.
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [10, 20, 30]
}

# The forest is seeded (same value as the train/test split). Without a fixed
# random_state each candidate fit draws different bootstrap samples, so
# best_params_, best_score_ and the test score would change from run to run.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42)),
])

# 5-fold cross-validated search on the training set, scored by R2, using all
# available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best R2 score: ", grid_search.best_score_)

# Evaluate the best pipeline found by the search on the held-out test set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(f"Test set R2 score: {r2_score(y_test, y_pred)}")


Model: LinearRegression
Mean Squared Error: 58103.49507930377
R2 Score: 0.9725546604757057

Model: RandomForest
Mean Squared Error: 14208.418213246074
R2 Score: 0.9932886160904182

Model: SVR
Mean Squared Error: 2192141.50743628
R2 Score: -0.03546383698211075

Best parameters found:  {'model__max_depth': 30, 'model__n_estimators': 300}
Best R2 score:  0.9921669708600692
Test set R2 score: 0.9931231534470438
