In [1]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def clean_quotes_semicolon(input_file:str, output_file:str) -> None:
    """
    Clean a CSV file by removing colons, semicolons, and quotes from each field.
    
    Parameters:
        input_file (str): The path to the input CSV file.
        output_file (str): The path to the output CSV file.
    
    Returns:
        None
    """
    data = list()
    with open(input_file, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=';', quotechar='"')

        # Create a new CSV file to write the cleaned data
        with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

            for row in reader:
                # Cleaning
                row = [field.replace('"', '').replace("'", "") for field in row]
                row = [field.replace(';', '').replace(',', '.') for field in row]
                
                # Cleaned row write
                writer.writerow(row)
    

In [3]:
current_path = os.getcwd()
data_path = current_path + '/data'

In [4]:
import csv

# Setting up input and output files
files = {
    "caracteristiques":["carcteristiques-{year}.csv", "carcteristiques-{year}-cleaned.csv"],
    "lieux":["lieux-{year}.csv", "lieux-{year}-cleaned.csv"],
    "usagers":["usagers-{year}.csv", "usagers-{year}-cleaned.csv"],
    "vehicules":["vehicules-{year}.csv", "vehicules-{year}-cleaned.csv"]
}

# Run cleaning
year = '2022'
for g_file in files.values():
    input_file = f'{data_path}/2022/{g_file[0].replace("{year}",year)}'  
    output_file = f'{data_path}/cleaned/{year}/{g_file[1].replace("{year}", year)}'    
    clean_quotes_semicolon(input_file=input_file,
                           output_file=output_file)     

In [5]:
caracteristiques = pd.read_csv(filepath_or_buffer=f'{data_path}/cleaned/{year}/{files["caracteristiques"][1].replace("{year}", year)}')
lieux = pd.read_csv(filepath_or_buffer=f'{data_path}/cleaned/{year}/{files["lieux"][1].replace("{year}", year)}')
usagers = pd.read_csv(filepath_or_buffer=f'{data_path}/cleaned/{year}/{files["usagers"][1].replace("{year}", year)}')
vehicules = pd.read_csv(filepath_or_buffer=f'{data_path}/cleaned/{year}/{files["vehicules"][1].replace("{year}", year)}')


  lieux = pd.read_csv(filepath_or_buffer=f'{data_path}/cleaned/{year}/{files["lieux"][1].replace("{year}", year)}')


In [6]:
usagers.head()


Unnamed: 0,num_acc,id_usager,id_vehicule,num_veh,place,catu,grav,sexe,an_nais,trajet,secu1,secu2,secu3,locp,actp,etatp
0,202200000001,1 099 700,813 952,A01,1,1,3,1,2008.0,5,2,8,-1,-1,-1,-1
1,202200000001,1 099 701,813 953,B01,1,1,1,1,1948.0,5,1,8,-1,-1,-1,-1
2,202200000002,1 099 698,813 950,B01,1,1,4,1,1988.0,9,1,0,-1,0,0,-1
3,202200000002,1 099 699,813 951,A01,1,1,1,1,1970.0,4,1,0,-1,0,0,-1
4,202200000003,1 099 696,813 948,A01,1,1,1,1,2002.0,0,1,0,-1,-1,-1,-1


In [7]:
merged_df = pd.merge(pd.merge(pd.merge(caracteristiques, lieux, on="num_acc")
                     , vehicules
                     , on="num_acc"), usagers, on="num_acc")
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241487 entries, 0 to 241486
Data columns (total 57 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   num_acc        241487 non-null  int64  
 1   jour           241487 non-null  int64  
 2   mois           241487 non-null  int64  
 3   an             241487 non-null  int64  
 4   hrmn           241487 non-null  object 
 5   lum            241487 non-null  int64  
 6   dep            241487 non-null  object 
 7   com            241487 non-null  object 
 8   agg            241487 non-null  int64  
 9   int            241487 non-null  int64  
 10  atm            241487 non-null  int64  
 11  col            241487 non-null  int64  
 12  adr            235980 non-null  object 
 13  lat            241487 non-null  float64
 14  long           241487 non-null  float64
 15  catr           241487 non-null  int64  
 16  voie           225068 non-null  object 
 17  v1             241487 non-nul

In [13]:
df_joined = pd.read_csv(filepath_or_buffer=f'{data_path}/Accidents_joined.csv', sep=',', header=0, index_col=0)
df_joined.head()


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

In [10]:
# Setting up duplicateds using ID as key
dubs_caracteristiques = caracteristiques.duplicated(subset=['num_acc'])
print('caracteristiques total duplicated: %s' % len(caracteristiques[dubs_caracteristiques]))

dubs_lieux = lieux.duplicated(subset=['num_acc'])
print('lieux total duplicated: %s' % len(lieux[dubs_lieux]))


dubs_usagers = usagers.duplicated(subset=['id_usager'])
print('usagers total duplicated: %s' % len(usagers[dubs_usagers]))

dubs_vehicules = vehicules.duplicated(subset=['id_vehicule'])
print('vehicules total duplicated: %s' % len(vehicules[dubs_vehicules]))


caracteristiques total duplicated: 0
lieux total duplicated: 0
usagers total duplicated: 0
vehicules total duplicated: 0


In [11]:
from numpy import nan 
from sklearn.feature_selection import VarianceThreshold


data = merged_df.iloc[:,1:5].values
X = data[:, :-1]
Y = data[:, -1]
print(
    'Shapes', 'X', X.shape, 'Y', Y.shape
)

feature = VarianceThreshold()
# feature.fit_transform(X)

Shapes X (241487, 3) Y (241487,)


In [12]:
from numpy import nan 
new_merge = merged_df.fillna(nan)
new_merge.head()

Unnamed: 0,num_acc,jour,mois,an,hrmn,lum,dep,com,agg,int,...,grav,sexe,an_nais,trajet,secu1,secu2,secu3,locp,actp,etatp
0,202200000001,19,10,2022,16:15,1,26,26198,2,3,...,3,1,2008.0,5,2,8,-1,-1,-1,-1
1,202200000001,19,10,2022,16:15,1,26,26198,2,3,...,1,1,1948.0,5,1,8,-1,-1,-1,-1
2,202200000001,19,10,2022,16:15,1,26,26198,2,3,...,3,1,2008.0,5,2,8,-1,-1,-1,-1
3,202200000001,19,10,2022,16:15,1,26,26198,2,3,...,1,1,1948.0,5,1,8,-1,-1,-1,-1
4,202200000002,20,10,2022,08:34,1,25,25204,2,3,...,4,1,1988.0,9,1,0,-1,0,0,-1


In [13]:
new_merge.to_csv(f'{data_path}/cleaned/2005-2021/cleaned.csv', sep=',', index=False)
