## Importing libraries

In [103]:
import pandas as pd

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any warnings pandas emits while loading or
# transforming the data below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [145]:
# Directory holding the raw CSV files; change this single value to relocate the data.
DATA_DIR = "archive"

# Only the characteristics file is decoded as ISO-8859-1; the other files are
# read with pandas' default encoding.
charac_df = pd.read_csv(f"{DATA_DIR}/caracteristics.csv", encoding="ISO-8859-1")
users_df = pd.read_csv(f"{DATA_DIR}/users.csv")
vehicles_df = pd.read_csv(f"{DATA_DIR}/vehicles.csv")
places_df = pd.read_csv(f"{DATA_DIR}/places.csv")
# NOTE(review): holidays_df is loaded but not referenced in the cells visible here.
holidays_df = pd.read_csv(f"{DATA_DIR}/holidays.csv")

## Renaming columns

#### Characteristics renaming

In [146]:
# Source column name -> English column name for the characteristics table.
# Targets ending in "??" are presumably columns whose meaning is still unclear.
charac_columns = {
    "an": "year",
    "mois": "month",
    "jour": "day",
    "hrmn": "time",
    "lum": "lighting",
    "agg": "agg??",
    "int": "intersection",
    "atm": "atmosphere",
    "col": "collision",
    "com": "municipality",
    "adr": "postal_address",
    "gps": "GPS",
    "lat": "latitude",
    "long": "longitude",
    "dep": "department",
}

# Rebind the name to the renamed frame instead of mutating it in place.
charac_df = charac_df.rename(columns=charac_columns)

In [147]:
# Preview the first rows to check the renamed characteristics columns.
charac_df.head()

Unnamed: 0,Num_Acc,year,month,day,time,lighting,agg??,intersection,atmosphere,collision,municipality,postal_address,GPS,latitude,longitude,department
0,201600000001,16,2,1,1445,1,2,1,8.0,3.0,5.0,"46, rue Sonneville",M,0.0,0.0,590
1,201600000002,16,3,16,1800,1,2,6,1.0,6.0,5.0,1a rue du cimetière,M,0.0,0.0,590
2,201600000003,16,7,13,1900,1,1,1,1.0,6.0,11.0,,M,0.0,0.0,590
3,201600000004,16,8,15,1930,2,2,1,7.0,3.0,477.0,52 rue victor hugo,M,0.0,0.0,590
4,201600000005,16,12,23,1100,1,2,3,1.0,3.0,11.0,rue Joliot curie,M,0.0,0.0,590


#### Users renaming

In [148]:
# Source column name -> English column name for the users table.
# Targets ending in "??" are presumably columns whose meaning is still unclear.
users_columns = {
    "place": "place??",
    "catu": "user_category",
    "grav": "accident_severity",
    "sexe": "user_sex",
    "trajet": "trajet??",
    "secu": "safety_equipt",
    "locp": "pedestrian_loc",
    "actp": "pedestrian_action",
    "etatp": "pedestrian_company",
    "an_nais": "an_nais??",
    "num_veh": "vehicle_ID",
}

# Rebind the name to the renamed frame instead of mutating it in place.
users_df = users_df.rename(columns=users_columns)

In [149]:
# Preview the first rows to check the renamed users columns.
users_df.head()

Unnamed: 0,Num_Acc,place??,user_category,accident_severity,user_sex,trajet??,safety_equipt,pedestrian_loc,pedestrian_action,pedestrian_company,an_nais??,vehicle_ID
0,201600000001,1.0,1,1,2,0.0,11.0,0.0,0.0,0.0,1983.0,B02
1,201600000001,1.0,1,3,1,9.0,21.0,0.0,0.0,0.0,2001.0,A01
2,201600000002,1.0,1,3,1,5.0,11.0,0.0,0.0,0.0,1960.0,A01
3,201600000002,2.0,2,3,1,0.0,11.0,0.0,0.0,0.0,2000.0,A01
4,201600000002,3.0,2,3,2,0.0,11.0,0.0,0.0,0.0,1962.0,A01


#### Vehicles renaming

In [150]:
# Source column name -> English column name for the vehicles table.
# Targets ending in "??" are presumably columns whose meaning is still unclear.
vehicles_columns = {
    "senc": "senc??",
    "catv": "vehicle_category",
    "occutc": "occutc??",
    "obs": "obs??",
    "obsm": "obsm??",
    "choc": "choc??",
    "manv": "manv??",
    "num_veh": "vehicle_ID",
}

# Rebind the name to the renamed frame instead of mutating it in place.
vehicles_df = vehicles_df.rename(columns=vehicles_columns)

In [151]:
# Count the distinct vehicle category codes (nunique skips missing values by default).
vehicles_df["vehicle_category"].nunique()

33

In [171]:
# Preview the first rows of the vehicles table.
vehicles_df.head()

Unnamed: 0,Num_Acc,vehicle_category,vehicle_ID
0,201600000001,7,B02
1,201600000001,2,A01
2,201600000002,7,A01
3,201600000003,7,A01
4,201600000004,32,B02


#### Places renaming

In [153]:
# No renames are defined for the places table yet: with an empty mapping the
# rename below leaves every column name unchanged.
places_columns = dict()

places_df = places_df.rename(columns=places_columns)

In [154]:
# Preview the first rows of the places table (column names are still the source ones).
places_df.head()

Unnamed: 0,Num_Acc,catr,voie,v1,v2,circ,nbv,pr,pr1,vosp,prof,plan,lartpc,larrout,surf,infra,situ,env1
0,201600000001,3.0,39,,,2.0,0.0,,,0.0,1.0,3.0,0.0,0.0,1.0,0.0,1.0,0.0
1,201600000002,3.0,39,,,1.0,0.0,,,0.0,1.0,2.0,0.0,58.0,1.0,0.0,1.0,0.0
2,201600000003,3.0,1,,,2.0,2.0,,,0.0,1.0,3.0,0.0,68.0,2.0,0.0,3.0,99.0
3,201600000004,4.0,0,,,2.0,0.0,,,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,99.0
4,201600000005,4.0,0,,,0.0,0.0,,,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,3.0


#### Dropping some irrelevant columns

In [155]:
# Remove the columns that are not carried into the merged dataset.
# Each drop raises KeyError if a listed column is absent, so this cell cannot
# be run twice on the same frames.
charac_df = charac_df.drop(columns=["agg??", "department", "municipality"])
users_df = users_df.drop(columns=["place??", "trajet??", "an_nais??"])
vehicles_df = vehicles_df.drop(
    columns=["senc??", "occutc??", "obs??", "obsm??", "choc??", "manv??"]
)
# TODO: no places_df columns are dropped yet.

## Mapping feature values

#### Characteristics mapping

In [156]:
# Creating the localisation column from postal address:
# True where postal_address is missing, False where an address is present.
# NOTE(review): the raw "agg" column (renamed to "agg??" and dropped earlier)
# may already encode in/out of agglomeration — confirm against the dataset
# documentation before relying on a missing address as a proxy.
charac_df["localisation"] = charac_df["postal_address"].isnull()

In [157]:
# Lookup tables translating the coded values of each characteristics column
# into human-readable labels.

# Keys are the booleans produced by postal_address.isnull().
localisation_map = {True: "Out of agglomeration", False: "In built-up areas"}

lighting_map = {
    1: "Full day",
    2: "Twilight or dawn",
    3: "Night without public lighting",
    4: "Night with public lighting not lit",
    5: "Night with public lighting on",
}

# Fix: the label for code 3 used to carry a stray leading space
# (" T Intersection"), which leaked into the mapped column and the exported data.
intersection_map = {
    1: "Out of intersection",
    2: "X Intersection",
    3: "T Intersection",
    4: "Y Intersection",
    5: "More than 4 branches",
    6: "Giratory",
    7: "Place",
    8: "Level crossing",
    9: "Other intersection",
}

atmosphere_map = {
    1: "Normal",
    2: "Light rain",
    3: "Heavy rain",
    4: "Snow - hail",
    5: "Fog - smoke",
    6: "Strong wind - storm",
    7: "Dazzling weather",
    8: "Cloudy weather",
    9: "Other",
}

collision_map = {
    1: "Frontal",
    2: "Rear",
    3: "By the side",
    4: "Chain",
    5: "Multiple",
    6: "Other",
    7: "Without",
}

GPS_map = {
    "M": "Métropole",
    "A": "Antilles (Martinique or Guadeloupe)",
    "G": "Guyane",
    "R": "Réunion",
    "Y": "Mayotte",
}

# Replace the codes with their labels. Series.map turns every value that is
# not a key of the dict into NaN, so (a) codes missing from a map are lost and
# (b) re-running this cell on already-mapped columns blanks them out — reload
# the data before running it again.
charac_df["localisation"] = charac_df["localisation"].map(localisation_map)
charac_df["lighting"] = charac_df["lighting"].map(lighting_map)
charac_df["intersection"] = charac_df["intersection"].map(intersection_map)
charac_df["atmosphere"] = charac_df["atmosphere"].map(atmosphere_map)
charac_df["collision"] = charac_df["collision"].map(collision_map)
charac_df["GPS"] = charac_df["GPS"].map(GPS_map)

#### Users mapping

#### Vehicles mapping

#### Places mapping

## Merging datasets

In [158]:
# Attach each user record to the characteristics of its accident.
# The join key is spelled out instead of relying on "all shared column names",
# so a future column that happens to exist in both frames cannot silently
# become part of the key. Inner join: rows without a match on the other side
# are dropped.
df = pd.merge(charac_df, users_df, on="Num_Acc", how="inner")
df.head()

Unnamed: 0,Num_Acc,year,month,day,time,lighting,intersection,atmosphere,collision,postal_address,...,longitude,localisation,user_category,accident_severity,user_sex,safety_equipt,pedestrian_loc,pedestrian_action,pedestrian_company,vehicle_ID
0,201600000001,16,2,1,1445,Full day,Out of intersection,Cloudy weather,By the side,"46, rue Sonneville",...,0.0,In built-up areas,1,1,2,11.0,0.0,0.0,0.0,B02
1,201600000001,16,2,1,1445,Full day,Out of intersection,Cloudy weather,By the side,"46, rue Sonneville",...,0.0,In built-up areas,1,3,1,21.0,0.0,0.0,0.0,A01
2,201600000002,16,3,16,1800,Full day,Giratory,Normal,Other,1a rue du cimetière,...,0.0,In built-up areas,1,3,1,11.0,0.0,0.0,0.0,A01
3,201600000002,16,3,16,1800,Full day,Giratory,Normal,Other,1a rue du cimetière,...,0.0,In built-up areas,2,3,1,11.0,0.0,0.0,0.0,A01
4,201600000002,16,3,16,1800,Full day,Giratory,Normal,Other,1a rue du cimetière,...,0.0,In built-up areas,2,3,2,11.0,0.0,0.0,0.0,A01


In [162]:
# Compare the size of the merged frame with the two frames it was built from.
print(df.shape, charac_df.shape, users_df.shape, sep="\n")

(839985, 14)
(1876005, 9)


In [164]:
# Add the vehicle information. With no explicit key, merge joins on every
# column name the two frames share and defaults to an inner join, so rows
# without a match are dropped.
df = df.merge(vehicles_df)
df.head()

Unnamed: 0,Num_Acc,year,month,day,time,lighting,intersection,atmosphere,collision,postal_address,...,localisation,user_category,accident_severity,user_sex,safety_equipt,pedestrian_loc,pedestrian_action,pedestrian_company,vehicle_ID,vehicle_category
0,201600000001,16,2,1,1445,Full day,Out of intersection,Cloudy weather,By the side,"46, rue Sonneville",...,In built-up areas,1,1,2,11.0,0.0,0.0,0.0,B02,7
1,201600000001,16,2,1,1445,Full day,Out of intersection,Cloudy weather,By the side,"46, rue Sonneville",...,In built-up areas,1,3,1,21.0,0.0,0.0,0.0,A01,2
2,201600000002,16,3,16,1800,Full day,Giratory,Normal,Other,1a rue du cimetière,...,In built-up areas,1,3,1,11.0,0.0,0.0,0.0,A01,7
3,201600000002,16,3,16,1800,Full day,Giratory,Normal,Other,1a rue du cimetière,...,In built-up areas,2,3,1,11.0,0.0,0.0,0.0,A01,7
4,201600000002,16,3,16,1800,Full day,Giratory,Normal,Other,1a rue du cimetière,...,In built-up areas,2,3,2,11.0,0.0,0.0,0.0,A01,7


In [165]:
# Compare the size of the merged frame with the vehicles table.
print(df.shape, vehicles_df.shape, sep="\n")

(1875983, 23)
(1433389, 3)


In [166]:
# Add the place information. With no explicit key, merge joins on every
# column name the two frames share and defaults to an inner join, so rows
# without a match are dropped.
df = df.merge(places_df)
df.head()

Unnamed: 0,Num_Acc,year,month,day,time,lighting,intersection,atmosphere,collision,postal_address,...,pr1,vosp,prof,plan,lartpc,larrout,surf,infra,situ,env1
0,201600000001,16,2,1,1445,Full day,Out of intersection,Cloudy weather,By the side,"46, rue Sonneville",...,,0.0,1.0,3.0,0.0,0.0,1.0,0.0,1.0,0.0
1,201600000001,16,2,1,1445,Full day,Out of intersection,Cloudy weather,By the side,"46, rue Sonneville",...,,0.0,1.0,3.0,0.0,0.0,1.0,0.0,1.0,0.0
2,201600000002,16,3,16,1800,Full day,Giratory,Normal,Other,1a rue du cimetière,...,,0.0,1.0,2.0,0.0,58.0,1.0,0.0,1.0,0.0
3,201600000002,16,3,16,1800,Full day,Giratory,Normal,Other,1a rue du cimetière,...,,0.0,1.0,2.0,0.0,58.0,1.0,0.0,1.0,0.0
4,201600000002,16,3,16,1800,Full day,Giratory,Normal,Other,1a rue du cimetière,...,,0.0,1.0,2.0,0.0,58.0,1.0,0.0,1.0,0.0


In [168]:
# Compare the size of the merged frame with the places table.
print(df.shape, places_df.shape, sep="\n")

(1875983, 40)
(839985, 18)


In [170]:
# Write the merged table to a CSV file in the working directory.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if downstream readers do not need it — confirm
# how the file is consumed before changing.
df.to_csv("merged_dataset.csv")