In [2]:
import requests
import os
import pandas as pd
import chardet
import re


In [3]:
search_url = "https://www.data.gouv.fr/api/1/datasets/"

params = {
    "q": "accidents corporels circulation routière",  # Mots-clés pertinents
    "page_size": 20  # Nombre de résultats max
}

response = requests.get(search_url, params=params)

if response.status_code == 200:
    datasets = response.json()["data"]
    
    print("Liste des datasets:")
    for dataset in datasets:
        print(f"- {dataset['title']} -> ID: {dataset['id']}")
else:
    print("Erreur lors de la récupération des datasets")


Liste des datasets:
- Accidents corporels de la circulation routière -> ID: 58d49a4ba3a7293affeffb39
- Données des accidents corporels de la circulation routière -> ID: 53a8bd3fa3a72905b7ce5961
- Bases de données annuelles des accidents corporels de la circulation routière - Années de 2005 à 2023 -> ID: 53698f4ca3a729239d2036df


In [None]:
dataset_id = "53698f4ca3a729239d2036df" 

dataset_url = f"https://www.data.gouv.fr/api/1/datasets/{dataset_id}/"

response = requests.get(dataset_url)

if response.status_code == 200:
    dataset_info = response.json()
    
    print("Liste complète des fichiers disponibles :")
    for resource in dataset_info.get("resources", []):
        file_title = resource.get("title", "Sans titre").lower()
        file_url = resource.get("url", "URL non disponible")
        
        if any(keyword in file_title for keyword in ["usager", "vehicule", "lieu", "caract"]) and "vehicules-immatricules" not in file_title:
            print(f"- {file_title} -> {file_url}")
else:
    print("Erreur lors de la récupération des fichiers du dataset")


Liste complète des fichiers disponibles :
- usagers-2023.csv -> https://static.data.gouv.fr/resources/bases-de-donnees-annuelles-des-accidents-corporels-de-la-circulation-routiere-annees-de-2005-a-2023/20241023-153328/usagers-2023.csv
- vehicules-2023.csv -> https://static.data.gouv.fr/resources/bases-de-donnees-annuelles-des-accidents-corporels-de-la-circulation-routiere-annees-de-2005-a-2023/20241023-153253/vehicules-2023.csv
- lieux-2023.csv -> https://static.data.gouv.fr/resources/bases-de-donnees-annuelles-des-accidents-corporels-de-la-circulation-routiere-annees-de-2005-a-2023/20241023-153219/lieux-2023.csv
- caract-2023.csv -> https://static.data.gouv.fr/resources/bases-de-donnees-annuelles-des-accidents-corporels-de-la-circulation-routiere-annees-de-2005-a-2023/20241028-103125/caract-2023.csv
- usagers-2022.csv -> https://static.data.gouv.fr/resources/bases-de-donnees-annuelles-des-accidents-corporels-de-la-circulation-routiere-annees-de-2005-a-2021/20231005-094229/usagers-2022

In [5]:
dataset_id = "53698f4ca3a729239d2036df" 
dataset_url = f"https://www.data.gouv.fr/api/1/datasets/{dataset_id}/"

save_folder = "datasets_accidents"
os.makedirs(save_folder, exist_ok=True)

response = requests.get(dataset_url)
if response.status_code == 200:
    dataset_info = response.json()

    print("Début du téléchargement")

    for resource in dataset_info.get("resources", []):
        file_title = resource.get("title", "Sans titre").lower()
        file_url = resource.get("url", "")

        if any(keyword in file_title for keyword in ["usager", "vehicule", "lieu", "caracteristique"]) and \
           "vehicules-immatricules-baac" not in file_title and \
           "description" not in file_title:

            file_name = file_title.replace(" ", "_")
            if not file_name.endswith(".csv"):
                file_name += ".csv"
            
            file_path = os.path.join(save_folder, file_name)
            print(f"Téléchargement de {file_name}...")
            file_response = requests.get(file_url)

            if file_response.status_code == 200:
                with open(file_path, "wb") as file:
                    file.write(file_response.content)
                print(f"{file_name} enregistré")
            else:
                print(f"Erreur lors du téléchargement de {file_name}")

    print("Tous les fichiers sélectionnés ont été traités !")
else:
    print("Erreur lors de la récupération des fichiers du dataset")


Début du téléchargement
Téléchargement de usagers-2023.csv...
usagers-2023.csv enregistré
Téléchargement de vehicules-2023.csv...
vehicules-2023.csv enregistré
Téléchargement de lieux-2023.csv...
lieux-2023.csv enregistré
Téléchargement de usagers-2022.csv...
usagers-2022.csv enregistré
Téléchargement de vehicules-2022.csv...
vehicules-2022.csv enregistré
Téléchargement de lieux-2022.csv...
lieux-2022.csv enregistré
Téléchargement de usagers-2021.csv...
usagers-2021.csv enregistré
Téléchargement de vehicules-2021.csv...
vehicules-2021.csv enregistré
Téléchargement de lieux-2021.csv...
lieux-2021.csv enregistré
Téléchargement de usagers-2020.csv...
usagers-2020.csv enregistré
Téléchargement de vehicules-2020.csv...
vehicules-2020.csv enregistré
Téléchargement de lieux-2020.csv...
lieux-2020.csv enregistré
Téléchargement de caracteristiques-2020.csv...
caracteristiques-2020.csv enregistré
Téléchargement de usagers-2019.csv...
usagers-2019.csv enregistré
Téléchargement de vehicules-2019.c

In [6]:
search_url = "https://www.data.gouv.fr/api/1/datasets/"

params = {
    "q": "accidents", 
    "page_size": 100 
}

response = requests.get(search_url, params=params)

if response.status_code == 200:
    datasets = response.json()["data"]
    
    print("Liste des datasets:")
    for dataset in datasets:
        print(f"- {dataset['title']} -> ID: {dataset['id']}")
else:
    print("Erreur lors de la récupération des datasets")

Liste des datasets:
- Evolution des accidents de service ou de travail et des accidents de trajet des civils, sur 23 ans -> ID: 53699532a3a729239d20465e
- Accidents de vélo -> ID: 5d56becc8b4c412584b830c9
- Accidents de la route -> ID: 53698e4ba3a729239d203415
- Les accidents du travail -> ID: 536997eaa3a729239d204dff
- Accidents passagers depuis 2008 -> ID: 540f9af6a3a72928898af4a8
- Les accidents de trajet  -> ID: 536997eba3a729239d204e00
- Accidents 2009 - 2013 - Haute-Garonne -> ID: 673778e2484c5f09bec253ba
- Accidents 2005 - 2009 - Haute-Garonne -> ID: 673778e2484c5f09bec253b9
- Accidents corporels sur Rennes Métropole -> ID: 64ffd45c425da93b05ae9bff
- Accidents de circulation (Brest métropole) -> ID: 64809565c802b9f92cd0ba5c
- Les accidents de la circulation -> ID: 536997eaa3a729239d204dfd
- Accidents corporels de la circulation routière -> ID: 58d49a4ba3a7293affeffb39
- Accidents de la circulation 76 2021 -> ID: 6662501cba764811e59fc8f3
- Mortalité due aux accidents de la route 

In [7]:
data_folder = "datasets_accidents"

output_folder = os.path.join("merge_datasets")
os.makedirs(output_folder, exist_ok=True)

categories = ["caracteristique", "usager", "lieu", "vehicule"]

merged_data = {category: [] for category in categories}

ignored_files = []

def detect_encoding(file_path):
    with open(file_path, "rb") as f:
        raw_data = f.read(10000) 
    return chardet.detect(raw_data)["encoding"]

def read_csv_with_fallback(file_path):
    encoding = detect_encoding(file_path)

    try:
        df = pd.read_csv(file_path, encoding=encoding, sep=None, engine="python")
    except Exception as e:
        try:
            df = pd.read_csv(file_path, encoding="ISO-8859-1", sep=None, engine="python")
        except Exception as e2:
            try:
                df = pd.read_csv(file_path, encoding="latin1", sep=None, engine="python")
            except Exception as e3:
                return None, f"Erreur de lecture : {e}, {e2}, {e3}"

    return df, None

for file_name in os.listdir(data_folder):
    file_path = os.path.join(data_folder, file_name)

    matched_category = None
    for category in categories:
        if category in file_name:
            matched_category = category
            break
    
    if matched_category:
        print(f"Lecture de {file_name}")

        df, error = read_csv_with_fallback(file_path)

        if error:
            print(f"{file_name} ignoré. Raison : {error}")
            ignored_files.append((file_name, error))
            continue

        match = re.search(r"(\d{4})", file_name)
        df["annee"] = match.group(1) if match else "unknown"

        merged_data[matched_category].append(df)

for category, df_list in merged_data.items():
    if df_list:
        print(f"Fusion des fichiers pour {category}.")
        merged_df = pd.concat(df_list, ignore_index=True, sort=False)

        output_path = os.path.join(output_folder, f"{category}.csv")
        merged_df.to_csv(output_path, index=False)
        print(f"Fichier fusionné enregistré : {output_path}")

if ignored_files:
    print("Fichiers ignorés en raison d'erreurs :")
    for file, reason in ignored_files:
        print(f"{file} → {reason}")

print("Fusion terminée avec succès !")


Lecture de usagers_2006.csv
Lecture de usagers_2012.csv
Lecture de usagers_2013.csv
Lecture de usagers_2007.csv
Lecture de lieux_2016.csv
Lecture de lieux_2014.csv
Lecture de usagers_2011.csv
Lecture de usagers_2005.csv
Lecture de usagers_2010.csv
Lecture de lieux_2015.csv
Lecture de lieux_2011.csv
Lecture de lieux_2005.csv
Lecture de usagers_2014.csv
Lecture de usagers_2015.csv
Lecture de lieux_2010.csv
Lecture de lieux_2006.csv
Lecture de lieux_2012.csv
Lecture de vehicules_2009.csv
Lecture de caracteristiques-2017.csv
Lecture de usagers_2016.csv
Lecture de vehicules_2008.csv
Lecture de lieux_2013.csv
Lecture de lieux_2007.csv
Lecture de lieux-2021.csv
Lecture de usagers-2018.csv
Lecture de usagers-2019.csv
Lecture de lieux-2020.csv
Lecture de lieux-2022.csv
Lecture de lieux-2023.csv
Lecture de usagers-2022.csv
Lecture de usagers-2023.csv
Lecture de vehicules-2017.csv
Lecture de lieux-2018.csv
Lecture de usagers-2021.csv
Lecture de caracteristiques_2009.csv
Lecture de caracteristique

In [8]:
fusion_folder = "merge_datasets"
output_file = os.path.join(fusion_folder, "all_accident.csv")

files_to_merge = ["caracteristique.csv", "usager.csv", "lieu.csv", "vehicule.csv"]

dfs = {}
for file in files_to_merge:
    file_path = os.path.join(fusion_folder, file)
    
    if os.path.exists(file_path):
        print(f"Lecture de {file}...")
        df = pd.read_csv(file_path, low_memory=False)
        
        if "Num_Acc" not in df.columns:
            print(f"Erreur : 'Num_Acc' absent dans {file}. Fichier ignoré.")
            continue
        
        dfs[file.split(".")[0]] = df

if len(dfs) < 2:
    print("Pas assez de fichiers valides pour la fusion.")
else:
    print("Fusion des fichiers sur 'Num_Acc'")

    merged_df = dfs.pop("caracteristique")
    for name, df in dfs.items():
        print(f"Fusion avec {name}")
        merged_df = merged_df.merge(df, on="Num_Acc", how="outer", suffixes=("", f"_{name}"))

    merged_df.to_csv("all_accident_2005_20223.csv",index=False)
    print(f"Fichier final enregistré : {output_file}")


Lecture de caracteristique.csv...
Lecture de usager.csv...
Lecture de lieu.csv...
Lecture de vehicule.csv...
Fusion des fichiers sur 'Num_Acc'
Fusion avec usager
Fusion avec lieu
Fusion avec vehicule
Fichier final enregistré : merge_datasets/all_accident.csv


In [9]:
df = pd.read_csv('/Users/maurice/Documents/jedha/jedha/lead/00_projet_lead/API_csv/all_accident_2005_20223.csv', low_memory=False)
df

Unnamed: 0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,...,catv,occutc,obs,obsm,choc,manv,num_veh_vehicule,annee_vehicule,id_vehicule_vehicule,motor
0,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,1.0,1.0,A01,2005,,
1,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,8.0,10.0,B02,2005,,
2,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,1.0,1.0,A01,2005,,
3,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,8.0,10.0,B02,2005,,
4,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,1.0,1.0,A01,2005,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5314179,202300054822,,,,,,,,,,...,50,,14.0,2.0,1.0,1.0,B01,2023,155 583 345,3.0
5314180,202300054822,,,,,,,,,,...,7,,0.0,2.0,7.0,22.0,A01,2023,155 583 344,1.0
5314181,202300054822,,,,,,,,,,...,50,,14.0,2.0,1.0,1.0,B01,2023,155 583 345,3.0
5314182,202300054822,,,,,,,,,,...,7,,0.0,2.0,7.0,22.0,A01,2023,155 583 344,1.0


In [10]:
mask_geoloc = (~df['lat'].isna()) & (~df['lat'].astype(str).str.match(r'^0+(\.0+)?$'))
df_geoloc = df[mask_geoloc]
df_geoloc

Unnamed: 0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,...,catv,occutc,obs,obsm,choc,manv,num_veh_vehicule,annee_vehicule,id_vehicule_vehicule,motor
0,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,1.0,1.0,A01,2005,,
1,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,8.0,10.0,B02,2005,,
2,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,1.0,1.0,A01,2005,,
3,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,8.0,10.0,B02,2005,,
4,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,1.0,1.0,A01,2005,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4515164,202000047743,2020.0,11.0,28.0,11:45,1.0,1.0,1.0,1.0,2.0,...,7,,0.0,2.0,1.0,2.0,A01,2020,154 658 352,1.0
4515165,202000047743,2020.0,11.0,28.0,11:45,1.0,1.0,1.0,1.0,2.0,...,7,,0.0,2.0,4.0,23.0,B01,2020,154 658 351,1.0
4515166,202000047743,2020.0,11.0,28.0,11:45,1.0,1.0,1.0,1.0,2.0,...,7,,0.0,2.0,1.0,2.0,A01,2020,154 658 352,1.0
4515167,202000047744,2020.0,11.0,18.0,17:44,4.0,2.0,1.0,1.0,6.0,...,32,,0.0,1.0,1.0,1.0,A01,2020,154 658 350,1.0


In [11]:
mask_mois = df['mois'].notna() & ~df['mois'].astype(str).str.match(r'^0+(\.0+)?$')
mask_jour = df['jour'].notna() & ~df['jour'].astype(str).str.match(r'^0+(\.0+)?$')

mask_date = mask_mois & mask_jour
df_date = df[mask_date]

df_date

Unnamed: 0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,...,catv,occutc,obs,obsm,choc,manv,num_veh_vehicule,annee_vehicule,id_vehicule_vehicule,motor
0,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,1.0,1.0,A01,2005,,
1,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,8.0,10.0,B02,2005,,
2,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,1.0,1.0,A01,2005,,
3,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,8.0,10.0,B02,2005,,
4,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,1.0,1.0,A01,2005,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4515164,202000047743,2020.0,11.0,28.0,11:45,1.0,1.0,1.0,1.0,2.0,...,7,,0.0,2.0,1.0,2.0,A01,2020,154 658 352,1.0
4515165,202000047743,2020.0,11.0,28.0,11:45,1.0,1.0,1.0,1.0,2.0,...,7,,0.0,2.0,4.0,23.0,B01,2020,154 658 351,1.0
4515166,202000047743,2020.0,11.0,28.0,11:45,1.0,1.0,1.0,1.0,2.0,...,7,,0.0,2.0,1.0,2.0,A01,2020,154 658 352,1.0
4515167,202000047744,2020.0,11.0,18.0,17:44,4.0,2.0,1.0,1.0,6.0,...,32,,0.0,1.0,1.0,1.0,A01,2020,154 658 350,1.0


In [12]:
mask_combined = mask_geoloc & mask_date
df_final = df[mask_combined]

df_final

Unnamed: 0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,...,catv,occutc,obs,obsm,choc,manv,num_veh_vehicule,annee_vehicule,id_vehicule_vehicule,motor
0,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,1.0,1.0,A01,2005,,
1,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,8.0,10.0,B02,2005,,
2,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,1.0,1.0,A01,2005,,
3,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,8.0,10.0,B02,2005,,
4,200500000001,5.0,1.0,12.0,1900,3.0,2.0,1.0,1.0,3.0,...,7,0.0,0.0,2.0,1.0,1.0,A01,2005,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4515164,202000047743,2020.0,11.0,28.0,11:45,1.0,1.0,1.0,1.0,2.0,...,7,,0.0,2.0,1.0,2.0,A01,2020,154 658 352,1.0
4515165,202000047743,2020.0,11.0,28.0,11:45,1.0,1.0,1.0,1.0,2.0,...,7,,0.0,2.0,4.0,23.0,B01,2020,154 658 351,1.0
4515166,202000047743,2020.0,11.0,28.0,11:45,1.0,1.0,1.0,1.0,2.0,...,7,,0.0,2.0,1.0,2.0,A01,2020,154 658 352,1.0
4515167,202000047744,2020.0,11.0,18.0,17:44,4.0,2.0,1.0,1.0,6.0,...,32,,0.0,1.0,1.0,1.0,A01,2020,154 658 350,1.0


In [13]:
print("Percentage of missing values: ")
display(100 * df.isnull().sum() / df.shape[0])

Percentage of missing values: 


Num_Acc                  0.000000
an                      15.035516
mois                    15.035516
jour                    15.035516
hrmn                    15.035516
                          ...    
manv                     0.049264
num_veh_vehicule         0.000000
annee_vehicule           0.000000
id_vehicule_vehicule    76.451512
motor                   76.451512
Length: 64, dtype: float64

In [14]:
missing_lat = df[df['lat'].isnull()]
display(missing_lat)


Unnamed: 0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,...,catv,occutc,obs,obsm,choc,manv,num_veh_vehicule,annee_vehicule,id_vehicule_vehicule,motor
169,200500000032,5.0,1.0,18.0,900,1.0,2.0,0.0,3.0,3.0,...,7,0.0,0.0,2.0,7.0,19.0,A01,2005,,
170,200500000032,5.0,1.0,18.0,900,1.0,2.0,0.0,3.0,3.0,...,7,0.0,0.0,2.0,1.0,1.0,B02,2005,,
171,200500000032,5.0,1.0,18.0,900,1.0,2.0,0.0,3.0,3.0,...,7,0.0,0.0,2.0,7.0,19.0,A01,2005,,
172,200500000032,5.0,1.0,18.0,900,1.0,2.0,0.0,3.0,3.0,...,7,0.0,0.0,2.0,1.0,1.0,B02,2005,,
200,200500000043,5.0,1.0,21.0,700,3.0,1.0,2.0,1.0,1.0,...,2,0.0,0.0,2.0,1.0,5.0,A01,2005,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5314179,202300054822,,,,,,,,,,...,50,,14.0,2.0,1.0,1.0,B01,2023,155 583 345,3.0
5314180,202300054822,,,,,,,,,,...,7,,0.0,2.0,7.0,22.0,A01,2023,155 583 344,1.0
5314181,202300054822,,,,,,,,,,...,50,,14.0,2.0,1.0,1.0,B01,2023,155 583 345,3.0
5314182,202300054822,,,,,,,,,,...,7,,0.0,2.0,7.0,22.0,A01,2023,155 583 344,1.0
