In [2]:
import featuretools as ft
import pandas as pd
import numpy as np
# Limit DataFrame/Series previews to five rows so cell outputs stay compact.
pd.options.display.max_rows = 5


In [5]:
# Load the 2018 "lieux" CSV and keep only the columns listed below
# (Num_Acc is used later as the entity index).
# NOTE(review): the recorded output under this cell shows no `dep` column, so it
# appears to predate this column list — re-run the cell to confirm.
lieux_columns = ["Num_Acc", "catr", "prof", "surf", "dep"]
lieux = pd.read_csv("original-utf8/lieux-2018.csv").loc[:, lieux_columns]
print(lieux.head())

Num_Acc  catr  prof  surf
0  201800000001     3   1.0   1.0
1  201800000002     4   1.0   1.0
2  201800000003     3   1.0   1.0
3  201800000004     3   1.0   1.0
4  201800000005     4   1.0   1.0


In [11]:
# Read time.csv, count how many rows carry each value of the "h" column, and
# show the counts as a JSON string (the cell's displayed result).
# NOTE(review): the recorded output lists "h" values such as 55, 59, 50, 34 and
# 30 — presumably malformed hours; verify how time.csv was produced.
time = pd.read_csv("time.csv")
hour_counts = time["h"].value_counts()
hour_counts.to_json()

'{"17":4992,"18":4806,"16":4130,"19":3697,"15":3518,"8":3438,"14":3167,"12":3114,"9":2963,"11":2933,"13":2848,"10":2614,"20":2605,"7":2499,"21":1857,"22":1490,"23":1257,"6":1224,"1":882,"5":881,"2":640,"3":637,"4":571,"55":2,"59":1,"50":1,"34":1,"30":1}'

In [13]:
# Load the 2018 "caracteristiques" CSV, keep the columns listed below, rescale
# the coordinates, export the result and preview it.
caracteristiques_columns = ["Num_Acc", "mois", "hrmn", "lum", "agg", "int", "atm", "lat", "long"]

# Raw lat/long are divided by this factor; the recorded preview shows values
# such as 50.55737 / 2.94992 after the division.
COORD_SCALE = 100000

caracteristiques = (
    pd.read_csv("original-utf8/caracteristiques-2018.csv")
    .loc[:, caracteristiques_columns]
    .assign(
        lat=lambda frame: frame["lat"] / COORD_SCALE,
        long=lambda frame: frame["long"] / COORD_SCALE,
    )
)

# NOTE(review): the output file name is spelled "carateristiques" (missing a
# "c"); it is kept unchanged here — confirm whether any consumer relies on it.
caracteristiques.to_csv("carateristiques-2018.csv")
print(caracteristiques.head())

Num_Acc  mois  hrmn  lum  agg  int  atm       lat     long
0  201800000001     1  1505    1    1    4  1.0  50.55737  2.94992
1  201800000002     2  1015    1    2    7  7.0  50.52936  2.93151
2  201800000003     3  1135    1    2    3  1.0  50.51243  2.91714
3  201800000004     5  1735    1    2    1  7.0  50.51974  2.89123
4  201800000005     6  1605    1    2    1  1.0  50.51607  2.90605


In [4]:
# Load the 2018 "usagers" CSV and keep only the accident id and the `grav`
# column. The recorded preview shows Num_Acc repeating across rows.
usagers_columns = ["Num_Acc", "grav"]
usagers = pd.read_csv("original-utf8/usagers-2018.csv").loc[:, usagers_columns]
print(usagers.head())

Num_Acc  grav
0  201800000001     3
1  201800000001     1
2  201800000002     1
3  201800000002     4
4  201800000003     3


In [5]:
# Load the 2018 "vehicules" CSV with all of its columns and preview it.
vehicules_path = "original-utf8/vehicules-2018.csv"
vehicules = pd.read_csv(vehicules_path)
print(vehicules.head())

Num_Acc  catv  obs  obsm  manv
0  201800000001     7  0.0   2.0   1.0
1  201800000001     7  0.0   2.0  15.0
2  201800000002     7  0.0   1.0   1.0
3  201800000003    33  1.0   2.0   1.0
4  201800000003     7  0.0   2.0  15.0


In [6]:
# Start an empty featuretools EntitySet with id "Num_Acc"; the tables and their
# relationships are attached in the following cells.
es = ft.EntitySet("Num_Acc")

In [7]:
# Register the `lieux` frame as an entity indexed by its Num_Acc column.
es = es.entity_from_dataframe("lieux", lieux, index="Num_Acc")

In [8]:
# Register the `caracteristiques` frame as an entity indexed by its Num_Acc column.
es = es.entity_from_dataframe("caracteristiques", caracteristiques, index="Num_Acc")

In [9]:
# Register the `usagers` frame as an entity. make_index=True requests a new
# "usagers_id" index column rather than using an existing one (Num_Acc repeats
# in the recorded usagers preview).
es = es.entity_from_dataframe("usagers", usagers, index="usagers_id", make_index=True)

In [10]:
# Register the `vehicules` frame as an entity. make_index=True requests a new
# "vehicules_id" index column rather than using an existing one (Num_Acc repeats
# in the recorded vehicules preview).
es = es.entity_from_dataframe("vehicules", vehicules, index="vehicules_id", make_index=True)

In [11]:
# Link each of the other entities to `lieux` through the shared Num_Acc column.
# The first Relationship argument is the `lieux` side; the recorded summary
# prints each link as "<other>.Num_Acc -> lieux.Num_Acc".
# NOTE(review): per the recorded row counts, `caracteristiques` has the same
# number of rows as `lieux`, so that link looks one-to-one — confirm that
# aggregating over it is intended.
for child_id in ("caracteristiques", "usagers", "vehicules"):
    r_acc_previous = ft.Relationship(es["lieux"]["Num_Acc"], es[child_id]["Num_Acc"])
    es = es.add_relationship(r_acc_previous)

# Display the resulting EntitySet summary as the cell result.
es

Entityset: Num_Acc
  Entities:
    lieux [Rows: 57783, Columns: 4]
    caracteristiques [Rows: 57783, Columns: 11]
    usagers [Rows: 130169, Columns: 3]
    vehicules [Rows: 98876, Columns: 6]
  Relationships:
    caracteristiques.Num_Acc -> lieux.Num_Acc
    usagers.Num_Acc -> lieux.Num_Acc
    vehicules.Num_Acc -> lieux.Num_Acc

In [12]:
# Run Deep Feature Synthesis with `lieux` as the target entity. The call yields
# a pair: `features` is what gets previewed below (one row per Num_Acc in the
# recorded output) and `feature_names` is passed on to the encoding step.
dfs_result = ft.dfs(entityset=es, target_entity="lieux")
features, feature_names = dfs_result
print(features.head())

catr  prof  surf  SUM(caracteristiques.an)  \
Num_Acc                                                    
201800000001     3   1.0   1.0                        18   
201800000002     4   1.0   1.0                        18   
201800000003     3   1.0   1.0                        18   
201800000004     3   1.0   1.0                        18   
201800000005     4   1.0   1.0                        18   

              SUM(caracteristiques.long)  SUM(caracteristiques.hrmn)  \
Num_Acc                                                                
201800000001                    294992.0                        1505   
201800000002                    293151.0                        1015   
201800000003                    291714.0                        1135   
201800000004                    289123.0                        1735   
201800000005                    290605.0                        1605   

              SUM(caracteristiques.mois)  SUM(caracteristiques.agg)  \
Num_Acc          

In [13]:
# Encode the DFS output with featuretools' encode_features; the call returns
# the encoded matrix together with the matching encoded feature list.
encoded_result = ft.encode_features(features, feature_names)
feature_matrix_enc, features_enc = encoded_result

In [14]:
# Persist the encoded feature matrix (including its index) as CSV.
encoded_csv_path = "test.csv"
feature_matrix_enc.to_csv(encoded_csv_path)