In [6]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from collections import Counter

# Load the accident dataset (one row per commune and per day, see the frame preview).
df = pd.read_csv('/Users/maurice/Documents/jedha/jedha/lead/00_projet_lead/datasets_projet/accidents_77_final.csv')

# The CSV carries its saved row index as an "Unnamed: 0" column. Left in place it
# ends up in X and the model trains on a plain row counter, which has no meaning
# at inference time. errors="ignore" keeps the cell working if the file is later
# re-exported without that column.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Binary target: 0 when no accident was recorded, 1 otherwise. The vectorised
# comparison yields a clean integer column (the former per-row lambda returned a
# mix of False and 1).
df["accident"] = (df["nombre_d_accidents"] != 0).astype(int)

# Drop the raw count (the target is derived from it) and the original calendar
# columns, which are rebuilt from "date" just below.
df = df.drop(columns=["nombre_d_accidents", "jour", "mois", "an"])

# Calendar features derived from the date.
df["date"] = pd.to_datetime(df["date"])
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["day"] = df["date"].dt.day
df["weekday"] = df["date"].dt.weekday  # Monday = 0, Sunday = 6
df["weekend"] = (df["weekday"] >= 5).astype(int)  # 1 on Saturday/Sunday
df = df.drop(columns=["date"])

# Cast the flag columns to integers.
bool_cols = ["jour_ferie", "vacances_Zone_A", "vacances_Zone_B", "vacances_Zone_C", "accident"]
df[bool_cols] = df[bool_cols].astype(int)

# Encode each commune as an integer id, numbered in order of first appearance.
commune_mapping = {com: idx for idx, com in enumerate(df["com"].unique())}
df["com"] = df["com"].map(commune_mapping)

# Persist the commune -> id table so the encoding can be reused or reversed later.
mapping_df = pd.DataFrame(list(commune_mapping.items()), columns=["commune", "id"])
mapping_df.to_csv("/Users/maurice/Documents/jedha/jedha/lead/00_projet_lead/datasets_projet/commune_mapping.csv", index=False)


# üîπ D√©finir la target et les features
# Features are every column except the label; the label is the "accident" column.
X = df.drop(columns=["accident"])
y = df["accident"]

# 80/20 split, stratified on the label so both parts keep the same class
# proportions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Ratio of class-0 to class-1 rows in the training labels, later passed to
# XGBoost as scale_pos_weight.
counter = Counter(y_train)
n_negative, n_positive = counter[0], counter[1]
scale_pos_weight = n_negative / n_positive

# üîπ Cr√©ation du mod√®le XGBoost avec gestion du d√©s√©quilibre
# Hyper-parameters gathered in one mapping so they can be read (or logged) at a glance.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "n_estimators": 100,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "n_jobs": -1,
    # Class-imbalance correction: ratio computed from the training labels.
    "scale_pos_weight": scale_pos_weight,
    "random_state": 42,
}
model = xgb.XGBClassifier(**xgb_params)

# üîπ Entra√Ænement du mod√®le
# Fit the classifier on the training split.
model.fit(X_train, y_train)

# Column 1 of predict_proba is the predicted probability of the "accident" class.
y_proba = model.predict_proba(X_test)[:, 1]

# Custom decision threshold: a row is labelled 1 only when its probability
# reaches 0.8.
optimal_threshold = 0.8
y_pred_adjusted = (y_proba >= optimal_threshold).astype(int)

# Metrics computed on the thresholded predictions.
conf_matrix = confusion_matrix(y_test, y_pred_adjusted)
f1 = f1_score(y_test, y_pred_adjusted)
accuracy = accuracy_score(y_test, y_pred_adjusted)

# Report the results obtained with the adjusted threshold.
print(f"‚úÖ Accuracy: {accuracy:.4f}")
print(f"‚úÖ F1 Score: {f1:.4f}")
print("‚úÖ Matrice de confusion:\n", conf_matrix)



‚úÖ Accuracy: 0.9938
‚úÖ F1 Score: 0.3353
‚úÖ Matrice de confusion:
 [[73446   380]
 [   80   116]]


In [2]:
# Display the preprocessed DataFrame as the cell output (pandas truncates the
# view to the first and last rows).
df

Unnamed: 0,com,jour_ferie,vacances_Zone_A,vacances_Zone_B,vacances_Zone_C,dep,temp,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,clouds_all,accident,year,month,day,weekday,weekend
0,0,1,1,1,1,77,8.610000,7.160833,5.34,14.29,1024.541667,94.625000,2.794583,59.666667,0,2022,1,1,5,1
1,1,1,1,1,1,77,8.610000,7.160833,5.34,14.29,1024.541667,94.625000,2.794583,59.666667,0,2022,1,1,5,1
2,0,1,1,1,1,77,8.610000,7.160833,5.34,14.29,1024.541667,94.625000,2.794583,59.666667,0,2022,1,1,5,1
3,2,1,1,1,1,77,8.610000,7.160833,5.34,14.29,1024.541667,94.625000,2.794583,59.666667,0,2022,1,1,5,1
4,3,1,1,1,1,77,8.610000,7.160833,5.34,14.29,1024.541667,94.625000,2.794583,59.666667,0,2022,1,1,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370105,9,0,1,1,1,77,8.809583,5.634583,6.72,10.84,1003.583333,84.583333,7.989583,74.000000,0,2023,12,31,6,1
370106,23,0,1,1,1,77,8.809583,5.634583,6.72,10.84,1003.583333,84.583333,7.989583,74.000000,0,2023,12,31,6,1
370107,42,0,1,1,1,77,8.809583,5.634583,6.72,10.84,1003.583333,84.583333,7.989583,74.000000,0,2023,12,31,6,1
370108,120,0,1,1,1,77,8.809583,5.634583,6.72,10.84,1003.583333,84.583333,7.989583,74.000000,0,2023,12,31,6,1


In [2]:
import os
import pickle

# Destination directory and file for the serialized model.
model_dir = "/Users/maurice/Documents/jedha/jedha/lead/00_projet_lead/models"
model_path = os.path.join(model_dir, "xgboost_accidents.pkl")

# Create the directory tree when it is missing; a no-op if it already exists.
os.makedirs(model_dir, exist_ok=True)

# Pickle the trained XGBoost classifier to disk.
with open(model_path, "wb") as model_file:
    pickle.dump(model, model_file)

# Last expression of the cell: echo the path of the saved file.
model_path


'/Users/maurice/Documents/jedha/jedha/lead/00_projet_lead/models/xgboost_accidents.pkl'

In [2]:
import pandas as pd
import pickle
import xgboost as xgb

# üìç Chemins des fichiers
# Input data, serialized model and output locations.
csv_path = "/Users/maurice/Documents/jedha/jedha/lead/00_projet_lead/output_final_reordered.csv"
model_path = "/Users/maurice/Documents/jedha/jedha/lead/00_projet_lead/models/xgboost_accidents.pkl"
output_csv = "/Users/maurice/Documents/jedha/jedha/lead/00_projet_lead/output_with_predictions.csv"

# Probability cut-off for labelling a row as an accident. This is the same 0.8
# threshold the training cell used to compute the reported accuracy / F1 /
# confusion matrix; model.predict() would instead apply the default 0.5 and
# produce predictions that do not match the evaluated operating point.
optimal_threshold = 0.8

# Load the rows to score.
df = pd.read_csv(csv_path)

# Load the pickled XGBoost classifier.
# NOTE: pickle.load can execute arbitrary code; only load model files that were
# produced by this project.
with open(model_path, "rb") as f:
    model = pickle.load(f)

# Feature names recorded in the booster at training time.
expected_features = model.get_booster().feature_names
if expected_features is None:
    raise ValueError("Le mod√®le ne contient pas de noms de colonnes. V√©rifie que le mod√®le a √©t√© entra√Æn√© avec `feature_names`.")

# The input must contain exactly the model's features: report any missing or
# unexpected column instead of predicting on a mismatched frame.
if set(expected_features) != set(df.columns):
    missing_cols = set(expected_features) - set(df.columns)
    extra_cols = set(df.columns) - set(expected_features)
    raise ValueError(f"Probl√®me de colonnes :\nManquantes: {missing_cols}\nEn trop: {extra_cols}")

# Reorder the columns to the order the model expects.
df = df[expected_features]

# Score the rows: column 1 of predict_proba is the accident probability, which is
# then thresholded into a 0/1 label. The probabilities are computed before the
# new column is added, so the model only ever sees its own features.
accident_proba = model.predict_proba(df)[:, 1]
df["accident"] = (accident_proba >= optimal_threshold).astype(int)

# Write the features together with the predicted label.
df.to_csv(output_csv, index=False)

print(f"‚úÖ Fichier sauvegard√© avec les pr√©dictions : {output_csv}")


‚úÖ Fichier sauvegard√© avec les pr√©dictions : /Users/maurice/Documents/jedha/jedha/lead/00_projet_lead/output_with_predictions.csv


In [3]:
import pandas as pd

# Load the predictions dataset.
file_path = "/Users/maurice/Documents/jedha/jedha/lead/00_projet_lead/output_with_predictions_mapping.csv"
df = pd.read_csv(file_path)

# Show the structure of the data. DataFrame.info() writes its report to stdout
# itself and returns None, so it is called directly: wrapping it in print()
# appended a stray "None" line after the summary.
df.info()

# Count missing values per column.
print(df.isnull().sum())

# Distribution of the values in the 'accident' column.
print(df['accident'].value_counts())

# Keep only the rows whose 'accident' value is 1.
df_accidents = df.loc[df["accident"].eq(1)]

# For each commune, collect the distinct values of the "day" column among those rows.
accident_days_by_com = (
    df_accidents
    .groupby("com")["day"]
    .unique()
    .reset_index()
)

# Show the per-commune result.
print(accident_days_by_com)

# Save the result as CSV.
output_path = "/Users/maurice/Documents/jedha/jedha/lead/00_projet_lead/accidents_by_commune.csv"
accident_days_by_com.to_csv(output_path, index=False)

print(f"R√©sultats enregistr√©s dans {output_path}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1521 entries, 0 to 1520
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   com              1521 non-null   int64  
 1   jour_ferie       1521 non-null   int64  
 2   vacances_Zone_A  1521 non-null   int64  
 3   vacances_Zone_B  1521 non-null   int64  
 4   vacances_Zone_C  1521 non-null   int64  
 5   dep              1521 non-null   int64  
 6   temp             1521 non-null   float64
 7   feels_like       1521 non-null   float64
 8   temp_min         1521 non-null   float64
 9   temp_max         1521 non-null   float64
 10  pressure         1521 non-null   int64  
 11  humidity         1521 non-null   int64  
 12  wind_speed       1521 non-null   float64
 13  clouds_all       1521 non-null   int64  
 14  year             1521 non-null   int64  
 15  month            1521 non-null   int64  
 16  day              1521 non-null   int64  
 17  weekday       