In [None]:
import os
import csv
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
from data_cleaning_utils import clean_csv, filter_columns
from utils import set_pandas_display_options
# Apply the project's pandas display settings once, before any DataFrame is shown.
# NOTE(review): the exact options are defined in utils.set_pandas_display_options — see that module.
set_pandas_display_options()

# Cleaning dataframes
In this section we clean all DataFrames using the following steps.
- The `clean_csv` function is applied to each CSV file before it is read into a DataFrame.
    - Clean quotes.
    - Clean spaces.
    - Replace each semicolon with a comma (the cleaned files are then read with pandas' default comma separator).

- Drop unneeded columns from each DataFrame (the column list is kept in Notion; only the columns marked "yes" are kept as necessary).
- Delete duplicates from `caracteristiques`.


In [None]:
# Locate the data directory relative to the directory the notebook is run from.
current_path = os.getcwd()
data_path = f'{current_path}/data'

In [None]:
# Input/output file-name templates per dataset: [raw file name, cleaned file name].
# "{year}" is a placeholder that is substituted with `year` below.
# NOTE(review): the "carcteristiques" spelling is kept untouched because it has to match
# the file names on disk — confirm against the data directory before renaming.
files = {
    "caracteristiques":["carcteristiques-{year}.csv", "carcteristiques-{year}-cleaned.csv"],
    "lieux":["lieux-{year}.csv", "lieux-{year}-cleaned.csv"],
    "usagers":["usagers-{year}.csv", "usagers-{year}-cleaned.csv"],
    "vehicules":["vehicules-{year}.csv", "vehicules-{year}-cleaned.csv"]
}

# Run cleaning for the selected year
year = '2022'
for g_file in files.values():
    # Fix: the raw-data folder now follows `year` instead of a hard-coded "2022",
    # so changing `year` reads from the matching directory.
    input_file = f'{data_path}/{year}/{g_file[0].replace("{year}", year)}'
    output_file = f'{data_path}/cleaned/{year}/{g_file[1].replace("{year}", year)}'
    clean_csv(input_file=input_file,
              output_file=output_file)

In [None]:
# Load the four cleaned CSV files of the selected year, in the same order as before:
# caracteristiques, lieux, usagers, vehicules.
cleaned_dir = f'{data_path}/cleaned/{year}'
caracteristiques, lieux, usagers, vehicules = (
    pd.read_csv(filepath_or_buffer=f'{cleaned_dir}/{files[dataset][1].replace("{year}", year)}')
    for dataset in ("caracteristiques", "lieux", "usagers", "vehicules")
)

In [None]:
# Apply filter_columns to each dataset with its own list of columns to discard.
# NOTE(review): filter_columns presumably removes the listed columns — confirm in data_cleaning_utils.
carac_col_drop = ['adr']
caracteristiques = filter_columns(caracteristiques, carac_col_drop)

lieux_col_drop = [
    'voie', 'v1', 'v2', 'circ', 'nbv', 'pr', 'pr1',
    'vosp', 'prof', 'plan', 'lartpc', 'larrout', 'situ',
]
lieux = filter_columns(lieux, lieux_col_drop)

usag_col_drop = ['secu1', 'secu2', 'secu3']
usagers = filter_columns(usagers, usag_col_drop)

vehic_col_drop = ['senc', 'motor', 'occutc']
vehicules = filter_columns(vehicules, vehic_col_drop)


In [None]:
# Check for and delete duplicated accidents (rows sharing the same `num_acc`) in caracteristiques.
rows_before = len(caracteristiques)

# Boolean mask: True for every repeated occurrence of a `num_acc` (the first occurrence stays False)
caract_dubs = caracteristiques.duplicated(subset=['num_acc'])
print('joined data total duplicated: %s / %s' % (caract_dubs.sum(), rows_before))

# Drop duplicates, keeping the first row of each `num_acc`
caracteristiques = caracteristiques.drop_duplicates(subset=['num_acc'])

# Fix: report the number of rows actually removed. The previous version indexed the
# de-duplicated frame with the stale pre-drop mask, which always printed 0 and made
# pandas warn that the boolean Series had to be reindexed.
print('Droped duplicates: %s / %s' % (rows_before - len(caracteristiques), rows_before))

## NOTE
- Identified: no unnecessary duplicates.
- Identified: all data is normalized (no categorical values).
- Identified: there is no threshold variance.

# Merge DF's

The DataFrames are merged using left joins:

- merge_1 = `caracteristiques` with `lieux` using `num_acc`.
- merge_2 = `usagers` with `vehicules` using `num_acc`, `id_vehicule` and `num_veh`.

Finally, merge_2 is left-joined with merge_1 using `num_acc` to build the full dataset.



In [None]:
# Accident-level table: caracteristiques left-joined with lieux on the accident number.
merge_1 = pd.merge(caracteristiques, lieux, how='left', on="num_acc")

# User-level table: each usager left-joined with the vehicle it is attached to.
user_vehicle_keys = ['num_acc', 'id_vehicule', 'num_veh']
merge_2 = pd.merge(usagers, vehicules, how='left', on=user_vehicle_keys)

# Full dataset: one row per usager, enriched with the accident-level columns.
full_data = pd.merge(merge_2, merge_1, how='left', on='num_acc')


In [None]:
# Final cleaning of the merged data.
# The former `full_data.fillna(nan)` step (and its cell-local `from numpy import nan`)
# was removed: replacing missing values with NaN leaves the data unchanged.

# Birth year (an_nais): missing values become 0, then the column is cast to integer
full_data['an_nais'] = full_data['an_nais'].fillna(0).astype(int)

# Build the accident datetime from the jour / mois / an / hrmn columns
full_data['date'] = pd.to_datetime(
    full_data['jour'].astype(str) + '-' + full_data['mois'].astype(str) + '-'
    + full_data['an'].astype(str) + ' ' + full_data['hrmn'].astype(str),
    format='%d-%m-%Y %H:%M',
)
# Unix timestamp in whole seconds. Dividing the offset from the epoch by one second does not
# depend on the platform's default integer width or on the datetime resolution, unlike
# `astype(int) // 10**9`.
full_data['timestamp'] = (full_data['date'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)

data_cols_drop = ['jour', 'mois', 'an', 'hrmn', 'date', 'num_veh', 'num_acc', 'adr', 'voie', 'v1', 'v2', 'occutc', 'lartpc']
# Several of these columns were already removed from the per-dataset frames before the merge,
# so only the ones still present are passed on.
# NOTE(review): filter_columns presumably drops the listed columns — confirm in data_cleaning_utils.
present_cols_drop = [column for column in data_cols_drop if column in full_data.columns]
full_data = filter_columns(full_data, present_cols_drop)

# Replacing row specific errors (exact cell matches only)
full_data = full_data.replace({'A': '10', 'B': '11', "(1)" : '1'}, regex=False)
# Replace the Corsican codes 2A / 2B (regex match, so they are also replaced inside longer values)
full_data = full_data.replace({'2A' : '96', '2B' : '97'}, regex=True)

# Remove rows flagged as invalid in the source data
full_data = full_data[full_data['com'] != "N/C"]
# Fix: 'nbv' is among the columns dropped from lieux earlier, so guard the lookup
# instead of raising KeyError when the column is absent.
if 'nbv' in full_data.columns:
    full_data = full_data[full_data['nbv'] != '#ERREUR']

# Removing rows that still contain NaN values
full_data = full_data.dropna()

full_data.head()

In [None]:
# List the columns that remain after cleaning, one per line.
# Original author's note: dep, com, num_veh and actp hold non-integer values
# and cannot be converted to integers.
for column_name in full_data.columns.tolist():
    print(column_name)


In [None]:
# Write the cleaned, merged dataset to the data directory (the index is not written).
full_data_csv = f'{data_path}/full_data.csv'
full_data.to_csv(path_or_buf=full_data_csv, index=False)