## Lecture du dataframe

In [67]:
import pandas as pd
import numpy as np
import src
from src.data.securite_routiere import print_memory
from src import INTDIR, RAWDIR

In [68]:
# Load the 2021 parquet file located under INTDIR, then show its shape and a
# preview of the frame.
# NOTE(review): "carcteristiques" looks like a misspelling of "caracteristiques";
# confirm the on-disk file name before changing the literal.
fn = INTDIR / "carcteristiques-2021.parquet"
df = pd.read_parquet(fn)
print(df.shape)
df

(56518, 12)


Unnamed: 0,an_mois_jour_hrmn,Num_Acc,luminosity,dep,com,agglomeration,intersection,weather,collision,adr,lat,long
0,2021-11-30 07:32:00,202100000001,2,30,30319,1,1,1,1,CD 981,44.038958,4.348022
1,2021-09-25 14:20:00,202100000002,1,51,51544,1,3,1,3,Aire de repos croisement D20E9 après sortie D977,49.242129,4.554546
2,2021-07-15 07:55:00,202100000003,1,85,85048,2,1,7,6,15 rue François Nicolas,46.921950,-0.964460
3,2021-03-27 19:45:00,202100000004,5,93,93005,2,2,3,6,Route de Mitry,48.949363,2.519664
4,2021-02-25 07:20:00,202100000005,5,76,76429,2,1,1,2,PARIS. ROUTE DE,49.408380,1.145810
...,...,...,...,...,...,...,...,...,...,...,...,...
56513,2021-01-01 06:10:00,202100056514,3,33,33021,1,1,5,6,,44.526619,-0.195544
56514,2021-01-01 10:20:00,202100056515,1,38,38405,1,1,2,6,D520,45.380291,5.704914
56515,2021-01-01 18:00:00,202100056516,3,26,26064,1,1,2,1,Route départementale 538,44.911210,5.019636
56516,2021-01-01 10:55:00,202100056517,1,33,33003,1,2,1,6,Voie rapide Bassens Ambes,44.954275,-0.517921


## Good practices: reducing memory usage

In [69]:
# Inspect the dtype pandas assigned to each column before any conversion.
df.dtypes

an_mois_jour_hrmn    datetime64[ns]
Num_Acc                       int64
luminosity                    int64
dep                          object
com                          object
agglomeration                 int64
intersection                  int64
weather                       int64
collision                     int64
adr                          object
lat                         float64
long                        float64
dtype: object

In [70]:
def print_memory(df, cols=None):
    """
    Print the total memory footprint of a DataFrame selection and return the
    per-column detail.

    NOTE: this definition shadows the ``print_memory`` imported from
    ``src.data.securite_routiere`` earlier in this notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to measure.
    cols : sequence of column labels, optional
        Columns to include (list, tuple, pandas Index, numpy array, ...).
        ``None`` or an empty selection means "all columns".

    Returns
    -------
    pandas.Series
        Memory per column in MB (1 MB = 1e6 bytes), rounded to one decimal,
        plus an ``Index`` entry for the frame's index. ``deep=True`` makes
        object columns report the size of the Python objects they hold.
    """
    # `cols or df.columns` raised ValueError for Index/ndarray arguments because
    # their truth value is ambiguous; test for "nothing selected" explicitly.
    if cols is None or len(cols) == 0:
        cols = df.columns
    mem = np.round(df[cols].memory_usage(deep=True) / 1e6, 1)
    # The printed total is the sum of the already-rounded per-column values.
    print("total_memory = {}MB".format(np.round(mem.sum(), 1)))
    return mem

In [71]:
# Per-column memory footprint (MB) of the frame as loaded, before any conversion.
print_memory(df)

total_memory = 15.6MB


Index                0.0
an_mois_jour_hrmn    0.5
Num_Acc              0.5
luminosity           0.5
dep                  3.3
com                  3.5
agglomeration        0.5
intersection         0.5
weather              0.5
collision            0.5
adr                  4.3
lat                  0.5
long                 0.5
dtype: float64

In [72]:
# Collect the integer-typed column names and measure only those columns.
int_cols = df.select_dtypes(include="integer").columns.tolist()
print_memory(df, int_cols)

total_memory = 3.0MB


Index            0.0
Num_Acc          0.5
luminosity       0.5
agglomeration    0.5
intersection     0.5
weather          0.5
collision        0.5
dtype: float64

In [73]:
# Downcast every integer column to the narrowest signed integer dtype that can
# represent all of its values. Columns that do not fit in int32 are left as is.
encodings = [np.int8, np.int16, np.int32]
for c in int_cols:
    lo, hi = df[c].min(), df[c].max()
    for e in encodings:
        bounds = np.iinfo(e)
        # Compare the real [min, max] range against both dtype bounds instead of
        # abs().max(): abs() overflows on the most negative int64, and the
        # signed range is asymmetric (e.g. int8 spans -128..127).
        if bounds.min <= lo and hi <= bounds.max:
            # astype keeps the result a Series aligned on the index rather than
            # relying on a numpy scalar constructor coercing a Series.
            df[c] = df[c].astype(e)
            break
df[int_cols].dtypes

Num_Acc          int64
luminosity        int8
agglomeration     int8
intersection      int8
weather           int8
collision         int8
dtype: object

In [74]:
# Store every floating-point column in single precision to halve its footprint.
# NOTE(review): float32 keeps roughly 7 significant digits; confirm that is
# precise enough for the lat/long coordinates.
float_cols = df.select_dtypes(include="float").columns.tolist()
for c in float_cols:
    df[c] = df[c].astype(np.float32)
df[float_cols].dtypes

lat     float32
long    float32
dtype: object

In [75]:
# Count the distinct values held by each object-dtype column; low counts
# relative to the row count make a column a good candidate for `category`.
nu = df.select_dtypes(include="object").nunique()
nu

dep      107
com    11150
adr    29668
dtype: int64

In [76]:
# Collect the object-dtype column names and measure only those columns.
obj_cols = df.select_dtypes(include="object").columns.tolist()
print_memory(df, obj_cols)

total_memory = 11.1MB


Index    0.0
dep      3.3
com      3.5
adr      4.3
dtype: float64

In [77]:
# Convert every object-dtype column to pandas' categorical dtype.
# NOTE(review): the saving depends on cardinality. The nunique output above
# shows `adr` has 29668 distinct values for 56518 rows, so its gain is modest
# (4.3MB -> 3.5MB in the recorded outputs).
for c in obj_cols:
    df[c] = df[c].astype("category")
df[obj_cols].dtypes

dep    category
com    category
adr    category
dtype: object

In [78]:
# Re-measure the whole frame after the integer, float and category conversions.
print_memory(df)

total_memory = 6.6MB


Index                0.0
an_mois_jour_hrmn    0.5
Num_Acc              0.5
luminosity           0.1
dep                  0.1
com                  1.1
agglomeration        0.1
intersection         0.1
weather              0.1
collision            0.1
adr                  3.5
lat                  0.2
long                 0.2
dtype: float64