In [1]:
import pandas as pd
import numpy as np

In [2]:
# Accidents
#
# Load the raw accidents export, derive calendar features from the accident
# timestamp, tidy a few columns and write the cleaned table back to disk.

accidents_df = pd.read_json("shapefiles/accidents/accidents_2015-2022.json")

# Extract date info
# FECHA_HORA_ACC is a Unix timestamp in milliseconds (UTC). It is converted to
# Bogotá local time through the tz database instead of subtracting a hard-coded
# 5 hours, and the tz info is then dropped so the column stays tz-naive (no
# trailing UTC-offset "flag" in the values).
accidents_df["FECHA_HORA_ACC_r"] = (
    pd.to_datetime(accidents_df["FECHA_HORA_ACC"], unit="ms", utc=True)
    .dt.tz_convert("America/Bogota")
    .dt.tz_localize(None)
)

accident_time = accidents_df["FECHA_HORA_ACC_r"]

accidents_df["ANO_OCURRENCIA_ACC"] = accident_time.dt.year

# Month and weekday names are rendered in Spanish and upper-cased to keep
# consistency with the original format.
# The es_ES.UTF-8 locale must be installed on the machine before running this:
# https://serverpilot.io/docs/how-to-install-locales/
accidents_df["MES_OCURRENCIA_ACC"] = accident_time.dt.month_name(locale="es_ES.UTF-8").str.upper()
accidents_df["MES_NRO_OCURRENCIA_ACC"] = accident_time.dt.month
accidents_df["DIA_OCURRENCIA_ACC"] = accident_time.dt.day_name(locale="es_ES.UTF-8").str.upper()

# dayofweek is 0-based (Monday = 0); shift it so 1 is Monday, 2 is Tuesday and so on
accidents_df["DIA_NRO_OCURRENCIA_ACC"] = accident_time.dt.dayofweek + 1

accidents_df["DIA_MES_OCURRENCIA_ACC"] = accident_time.dt.day
accidents_df["HORA_OCURRENCIA_ACC"] = accident_time.dt.hour

accidents_df = (
    accidents_df
    # Rename the longitude and latitude columns
    .rename(columns={"geometry.x": "LONGITUDE", "geometry.y": "LATITUDE"})
    # CIV and PK_CALZADA NaN treatment: fill with 0s and then change their type to int
    .fillna({"CIV": 0, "PK_CALZADA": 0})
    .astype({"CIV": int, "PK_CALZADA": int})
)

accidents_df.to_json("shapefiles/accidents_2015-2022_r1.json")

In [3]:
# Sanity check: column dtypes and non-null counts of the accidents frame
accidents_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 229238 entries, 0 to 229237
Data columns (total 18 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   OBJECTID                229238 non-null  int64         
 1   FORMULARIO              229238 non-null  object        
 2   LOCALIDAD               229192 non-null  object        
 3   CIV                     229238 non-null  int64         
 4   PK_CALZADA              229238 non-null  int64         
 5   CLASE_ACC               229238 non-null  object        
 6   GRAVEDAD                229238 non-null  object        
 7   FECHA_HORA_ACC          229238 non-null  int64         
 8   LONGITUDE               229238 non-null  float64       
 9   LATITUDE                229238 non-null  float64       
 10  FECHA_HORA_ACC_r        229238 non-null  datetime64[ns]
 11  ANO_OCURRENCIA_ACC      229238 non-null  int64         
 12  MES_OCURRENCIA_ACC      229238

In [4]:
# Preview the first rows of the accidents frame, including the derived date columns
accidents_df.head()

Unnamed: 0,OBJECTID,FORMULARIO,LOCALIDAD,CIV,PK_CALZADA,CLASE_ACC,GRAVEDAD,FECHA_HORA_ACC,LONGITUDE,LATITUDE,FECHA_HORA_ACC_r,ANO_OCURRENCIA_ACC,MES_OCURRENCIA_ACC,MES_NRO_OCURRENCIA_ACC,DIA_OCURRENCIA_ACC,DIA_NRO_OCURRENCIA_ACC,DIA_MES_OCURRENCIA_ACC,HORA_OCURRENCIA_ACC
0,398271,A10979,SUBA,11009611,202458,CHOQUE,SOLO DANOS,1438024200000,-74.0526,4.7173,2015-07-27 14:10:00,2015,JULIO,7,LUNES,1,27,14
1,397828,A000040348,KENNEDY,8006024,245295,ATROPELLO,CON HERIDOS,1420407480000,-74.1712,4.6241,2015-01-04 16:38:00,2015,ENERO,1,DOMINGO,7,4,16
2,397852,A2454,KENNEDY,8005274,197189,CHOQUE,CON HERIDOS,1424992500000,-74.1422,4.6272,2015-02-26 18:15:00,2015,FEBRERO,2,JUEVES,4,26,18
3,397889,A000277508,KENNEDY,50008335,271869,CHOQUE,CON HERIDOS,1448316900000,-74.1399,4.6184,2015-11-23 17:15:00,2015,NOVIEMBRE,11,LUNES,1,23,17
4,397891,A000238124,KENNEDY,8004223,197043,CHOQUE,SOLO DANOS,1443207900000,-74.1352,4.6322,2015-09-25 14:05:00,2015,SEPTIEMBRE,9,VIERNES,5,25,14


In [5]:
# Injured people
#
# Load the raw injured-people export, derive calendar features from the accident
# timestamp, tidy a few columns and write the cleaned table back to disk.

injured_people_df = pd.read_json("shapefiles/injured/injured_people_2015-2022.json")

# Extract date info
# FECHA_HORA_ACC is a Unix timestamp in milliseconds (UTC). It is converted to
# Bogotá local time through the tz database instead of subtracting a hard-coded
# 5 hours, and the tz info is then dropped so the column stays tz-naive (no
# trailing UTC-offset "flag" in the values).
injured_people_df["FECHA_HORA_ACC_r"] = (
    pd.to_datetime(injured_people_df["FECHA_HORA_ACC"], unit="ms", utc=True)
    .dt.tz_convert("America/Bogota")
    .dt.tz_localize(None)
)

injury_time = injured_people_df["FECHA_HORA_ACC_r"]

injured_people_df["ANO_OCURRENCIA_ACC"] = injury_time.dt.year

# Month and weekday names are rendered in Spanish and upper-cased to keep
# consistency with the original format.
# The es_ES.UTF-8 locale must be installed on the machine before running this:
# https://serverpilot.io/docs/how-to-install-locales/
injured_people_df["MES_OCURRENCIA_ACC"] = injury_time.dt.month_name(locale="es_ES.UTF-8").str.upper()
injured_people_df["MES_NRO_OCURRENCIA_ACC"] = injury_time.dt.month
injured_people_df["DIA_OCURRENCIA_ACC"] = injury_time.dt.day_name(locale="es_ES.UTF-8").str.upper()

# dayofweek is 0-based (Monday = 0); shift it so 1 is Monday, 2 is Tuesday and so on
injured_people_df["DIA_NRO_OCURRENCIA_ACC"] = injury_time.dt.dayofweek + 1

injured_people_df["DIA_MES_OCURRENCIA_ACC"] = injury_time.dt.day
injured_people_df["HORA_OCURRENCIA_ACC"] = injury_time.dt.hour

injured_people_df = (
    injured_people_df
    # Rename the longitude and latitude columns
    .rename(columns={"geometry.x": "LONGITUDE", "geometry.y": "LATITUDE"})
    # GENERO NaN treatment: fill with "SIN INFORMACION"
    .fillna({"GENERO": "SIN INFORMACION"})
)

# EDAD NaN treatment: since we don't have information on what a zero represents
# (is it a baby that hasn't turned one year old yet, or a null value?), this
# feature is left as it is.

injured_people_df.to_json("shapefiles/injured_people_2015-2022_r1.json")

In [6]:
# Sanity check: column dtypes and non-null counts of the injured-people frame
injured_people_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119973 entries, 0 to 119972
Data columns (total 18 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   OBJECTID                119973 non-null  int64         
 1   FORMULARIO              119973 non-null  object        
 2   LOCALIDAD               119950 non-null  object        
 3   CLASE_ACC               119973 non-null  object        
 4   CONDICION               119973 non-null  object        
 5   GENERO                  119973 non-null  object        
 6   EDAD                    119402 non-null  float64       
 7   FECHA_HORA_ACC          119973 non-null  int64         
 8   LONGITUDE               119973 non-null  float64       
 9   LATITUDE                119973 non-null  float64       
 10  FECHA_HORA_ACC_r        119973 non-null  datetime64[ns]
 11  ANO_OCURRENCIA_ACC      119973 non-null  int64         
 12  MES_OCURRENCIA_ACC      119973

In [7]:
# Preview the first rows of the injured-people frame, including the derived date columns
injured_people_df.head()

Unnamed: 0,OBJECTID,FORMULARIO,LOCALIDAD,CLASE_ACC,CONDICION,GENERO,EDAD,FECHA_HORA_ACC,LONGITUDE,LATITUDE,FECHA_HORA_ACC_r,ANO_OCURRENCIA_ACC,MES_OCURRENCIA_ACC,MES_NRO_OCURRENCIA_ACC,DIA_OCURRENCIA_ACC,DIA_NRO_OCURRENCIA_ACC,DIA_MES_OCURRENCIA_ACC,HORA_OCURRENCIA_ACC
0,750107,A284367,TEUSAQUILLO,CHOQUE,CICLISTA,MASCULINO,44.0,1444908600000,-74.091857,4.62946,2015-10-15 06:30:00,2015,OCTUBRE,10,JUEVES,4,15,6
1,731755,A5846,SUBA,CHOQUE,PASAJERO,FEMENINO,23.0,1432659000000,-74.074398,4.73915,2015-05-26 11:50:00,2015,MAYO,5,MARTES,2,26,11
2,731754,A5846,SUBA,CHOQUE,MOTOCICLISTA,MASCULINO,26.0,1432659000000,-74.074398,4.73915,2015-05-26 11:50:00,2015,MAYO,5,MARTES,2,26,11
3,760009,A000239038,KENNEDY,ATROPELLO,PEATON,FEMENINO,79.0,1442419200000,-74.169008,4.618647,2015-09-16 11:00:00,2015,SEPTIEMBRE,9,MIÉRCOLES,3,16,11
4,749002,A9675,FONTIBON,ATROPELLO,PEATON,MASCULINO,21.0,1437103200000,-74.138451,4.678535,2015-07-16 22:20:00,2015,JULIO,7,JUEVES,4,16,22


In [8]:
# Killed people
#
# Load the killed-people table, derive calendar features from the accident
# timestamp, tidy a few columns and write the cleaned table back to disk.
# NOTE(review): the input file already carries an "_r1" suffix and the output is
# "_r2" — presumably an earlier cleaning pass exists; confirm.

killed_people_df = pd.read_json("shapefiles/killed/killed_people_2015-2022_r1.json")

# Extract date info
# FECHA_HORA_ACC is a Unix timestamp in milliseconds (UTC). It is converted to
# Bogotá local time through the tz database instead of subtracting a hard-coded
# 5 hours, and the tz info is then dropped so the column stays tz-naive (no
# trailing UTC-offset "flag" in the values).
killed_people_df["FECHA_HORA_ACC_r"] = (
    pd.to_datetime(killed_people_df["FECHA_HORA_ACC"], unit="ms", utc=True)
    .dt.tz_convert("America/Bogota")
    .dt.tz_localize(None)
)

fatality_time = killed_people_df["FECHA_HORA_ACC_r"]

killed_people_df["ANO_OCURRENCIA_ACC"] = fatality_time.dt.year

# Month and weekday names are rendered in Spanish and upper-cased to keep
# consistency with the original format.
# The es_ES.UTF-8 locale must be installed on the machine before running this:
# https://serverpilot.io/docs/how-to-install-locales/
killed_people_df["MES_OCURRENCIA_ACC"] = fatality_time.dt.month_name(locale="es_ES.UTF-8").str.upper()
killed_people_df["MES_NRO_OCURRENCIA_ACC"] = fatality_time.dt.month
killed_people_df["DIA_OCURRENCIA_ACC"] = fatality_time.dt.day_name(locale="es_ES.UTF-8").str.upper()

# dayofweek is 0-based (Monday = 0); shift it so 1 is Monday, 2 is Tuesday and so on
killed_people_df["DIA_NRO_OCURRENCIA_ACC"] = fatality_time.dt.dayofweek + 1

killed_people_df["DIA_MES_OCURRENCIA_ACC"] = fatality_time.dt.day
killed_people_df["HORA_OCURRENCIA_ACC"] = fatality_time.dt.hour

killed_people_df = (
    killed_people_df
    # Rename the longitude and latitude columns
    .rename(columns={"geometry.x": "LONGITUDE", "geometry.y": "LATITUDE"})
    # GENERO NaN treatment: fill with "SIN INFORMACION".
    # MUERTE_POSTERIOR NaN treatment: fill with "N", i.e. a missing value is
    # treated as "the person died in the accident, not afterwards".
    .fillna({"GENERO": "SIN INFORMACION", "MUERTE_POSTERIOR": "N"})
)

# EDAD NaN treatment: since we don't have information on what a zero represents
# (is it a baby that hasn't turned one year old yet, or a null value?), this
# feature is left as it is.

killed_people_df.to_json("shapefiles/killed_people_2015-2022_r2.json")

In [9]:
# Sanity check: column dtypes and non-null counts of the killed-people frame
killed_people_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3829 entries, 0 to 3834
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   OBJECTID                3829 non-null   int64         
 1   FORMULARIO              3829 non-null   object        
 2   LOCALIDAD               3828 non-null   object        
 3   CLASE_ACC               3829 non-null   object        
 4   CONDICION               3829 non-null   object        
 5   GENERO                  3829 non-null   object        
 6   EDAD                    3590 non-null   float64       
 7   MUERTE_POSTERIOR        3829 non-null   object        
 8   FECHA_POSTERIOR_MUERTE  1373 non-null   float64       
 9   FECHA_HORA_ACC          3829 non-null   int64         
 10  LONGITUDE               3829 non-null   float64       
 11  LATITUDE                3829 non-null   float64       
 12  FECHA_HORA_ACC_r        3829 non-null   datetime

In [10]:
# Preview the first rows of the killed-people frame, including the derived date columns
killed_people_df.head()

Unnamed: 0,OBJECTID,FORMULARIO,LOCALIDAD,CLASE_ACC,CONDICION,GENERO,EDAD,MUERTE_POSTERIOR,FECHA_POSTERIOR_MUERTE,FECHA_HORA_ACC,LONGITUDE,LATITUDE,FECHA_HORA_ACC_r,ANO_OCURRENCIA_ACC,MES_OCURRENCIA_ACC,MES_NRO_OCURRENCIA_ACC,DIA_OCURRENCIA_ACC,DIA_NRO_OCURRENCIA_ACC,DIA_MES_OCURRENCIA_ACC,HORA_OCURRENCIA_ACC
0,1580567,A001342911,KENNEDY,CHOQUE,MOTOCICLISTA,FEMENINO,42.0,N,,1632825300000,-74.139547,4.655748,2021-09-28 05:35:00,2021,SEPTIEMBRE,9,MARTES,2,28,5
1,1572714,A001341550,KENNEDY,CHOQUE,CICLISTA,MASCULINO,38.0,N,,1630632900000,-74.125406,4.62958,2021-09-02 20:35:00,2021,SEPTIEMBRE,9,JUEVES,4,2,20
2,1560220,A001390796,KENNEDY,ATROPELLO,PEATON,MASCULINO,22.0,S,1644970000000.0,1644054600000,-74.139016,4.595142,2022-02-05 04:50:00,2022,FEBRERO,2,SÁBADO,6,5,4
3,1305496,A001390796,KENNEDY,ATROPELLO,MOTOCICLISTA,MASCULINO,35.0,N,,1644054600000,-74.139016,4.595142,2022-02-05 04:50:00,2022,FEBRERO,2,SÁBADO,6,5,4
4,1306185,A001447711,FONTIBON,CHOQUE,MOTOCICLISTA,MASCULINO,28.0,N,,1649113440000,-74.139,4.697,2022-04-04 18:04:00,2022,ABRIL,4,LUNES,1,4,18


In [11]:
# Actors

actors_df = pd.read_json("shapefiles/actors/actors_2015-2022.json")

# NaN treatment, column by column:
#   GENERO          -> "SIN INFORMACION"
#   CONDICION       -> "SIN INFORMACION"
#   CODIGO_VEHICULO -> 0, then cast to int. Null vehicle codes go together with
#                      CONDICION PASAJERO, PEATON or null.
#   ESTADO          -> "ILESO"
#   EDAD            -> left as it is: we don't know whether a zero stands for a
#                      baby under one year old or for a null value.
actors_df = actors_df.fillna(
    {
        "GENERO": "SIN INFORMACION",
        "CONDICION": "SIN INFORMACION",
        "CODIGO_VEHICULO": 0,
        "ESTADO": "ILESO",
    }
).astype({"CODIGO_VEHICULO": int})

# Reconciliation note:
#   len(actors_df[actors_df["ESTADO"] == "MUERTO"]) = 2738
#   len(actors_df[(actors_df["ESTADO"] == "HERIDO") & (actors_df["MUERTE_POSTERIOR"] == "S")]) = 1090
# These should add up to 3829 (the length of killed_people_df) but only reach 3828.
# The missing one is an uninjured actor (ESTADO ILESO) flagged MUERTE_POSTERIOR S;
# counting that actor brings the total to 3829.

# MUERTE_POSTERIOR NaN treatment: this feature is tied to ESTADO, and some of the
# combinations look odd next to the rest of the dataset (ESTADO MUERTO with
# MUERTE_POSTERIOR S, ESTADO HERIDO with MUERTE_POSTERIOR S, and the ILESO case
# above). ESTADO MUERTO - MUERTE_POSTERIOR S may really be ESTADO HERIDO -
# MUERTE_POSTERIOR S. Because of these inconsistencies ESTADO is not modified any
# further and MUERTE_POSTERIOR is left as it is; instead a new feature,
# ESTADO_FINAL, is engineered: "MUERTO" when the actor is recorded as dead or as
# having died afterwards, otherwise the ESTADO value.
ended_up_dead = (actors_df["ESTADO"] == "MUERTO") | (actors_df["MUERTE_POSTERIOR"] == "S")
actors_df["ESTADO_FINAL"] = np.where(ended_up_dead, "MUERTO", actors_df["ESTADO"])
# len(actors_df[actors_df["ESTADO_FINAL"] == "MUERTO"]) = 3829.
# A thorough approach would check actors_df against accidents_df GRAVEDAD,
# injured_people_df and killed_people_df and fix inconsistencies, if any.

actors_df.to_json("shapefiles/actors_2015-2022_r1.json")

In [12]:
# Sanity check: column dtypes and non-null counts of the actors frame
actors_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 494399 entries, 0 to 1507504
Data columns (total 11 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   OBJECTID                494399 non-null  int64  
 1   FORMULARIO              494399 non-null  object 
 2   CODIGO_VICTIMA          494399 non-null  int64  
 3   CODIGO_VEHICULO         494399 non-null  int64  
 4   CONDICION               494399 non-null  object 
 5   GENERO                  494399 non-null  object 
 6   EDAD                    484413 non-null  float64
 7   ESTADO                  494399 non-null  object 
 8   MUERTE_POSTERIOR        194590 non-null  object 
 9   FECHA_POSTERIOR_MUERTE  1373 non-null    float64
 10  ESTADO_FINAL            494399 non-null  object 
dtypes: float64(2), int64(3), object(6)
memory usage: 65.3+ MB


In [13]:
# Preview the first rows of the actors frame, including the engineered ESTADO_FINAL column
actors_df.head()

Unnamed: 0,OBJECTID,FORMULARIO,CODIGO_VICTIMA,CODIGO_VEHICULO,CONDICION,GENERO,EDAD,ESTADO,MUERTE_POSTERIOR,FECHA_POSTERIOR_MUERTE,ESTADO_FINAL
0,1,A001342709,1,2,CONDUCTOR,MASCULINO,43.0,ILESO,N,,ILESO
1,2,A001340900,0,1,CONDUCTOR,MASCULINO,42.0,ILESO,N,,ILESO
2,3,A001340900,1,2,CONDUCTOR,MASCULINO,35.0,ILESO,N,,ILESO
3,4,A001342643,0,1,CONDUCTOR,MASCULINO,28.0,ILESO,N,,ILESO
4,5,A001342643,1,2,CONDUCTOR,MASCULINO,27.0,ILESO,N,,ILESO


In [14]:
# Vehicles
#
# Load the raw vehicles export, fill missing values, normalise SERVICIO and write
# the cleaned table back to disk.

vehicles_df = pd.read_json("shapefiles/vehicles/vehicles_2015-2022.json")

# NaN treatment, column by column:
#   CODIGO_VEHICULO -> 1, then cast to int
#   CLASE           -> "SIN INFORMACION"
#   SERVICIO        -> "SIN INFORMACION"
#   MODALIDAD       -> "SIN INFORMACION". This used to be done in two passes
#                      (SERVICIO PUBLICO rows first, then all other rows) because
#                      MODALIDAD was understood to apply only to SERVICIO PUBLICO
#                      vehicles. Both passes wrote the same value and together
#                      covered every null row, so a single fill is equivalent.
#   ENFUGA          -> left as it is: there are only 3 null values, but the
#                      information is rather confusing.
vehicles_df = vehicles_df.fillna(
    {
        "CODIGO_VEHICULO": 1,
        "CLASE": "SIN INFORMACION",
        "SERVICIO": "SIN INFORMACION",
        "MODALIDAD": "SIN INFORMACION",
    }
).astype({"CODIGO_VEHICULO": int})

# Some rows carry " SIN INFORMACION" with a leading space; map that exact value
# onto the canonical label. Other SERVICIO values are left untouched.
vehicles_df["SERVICIO"] = vehicles_df["SERVICIO"].replace(" SIN INFORMACION", "SIN INFORMACION")

vehicles_df.to_json("shapefiles/vehicles_2015-2022_r1.json")

In [15]:
# Sanity check: column dtypes and non-null counts of the vehicles frame
vehicles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 434396 entries, 12393 to 848454
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   OBJECTID         434396 non-null  int64 
 1   FORMULARIO       434396 non-null  object
 2   CODIGO_VEHICULO  434396 non-null  int64 
 3   CLASE            434396 non-null  object
 4   SERVICIO         434396 non-null  object
 5   MODALIDAD        434396 non-null  object
 6   ENFUGA           434393 non-null  object
dtypes: int64(2), object(5)
memory usage: 46.5+ MB


In [16]:
# Preview the first rows of the vehicles frame
vehicles_df.head()

Unnamed: 0,OBJECTID,FORMULARIO,CODIGO_VEHICULO,CLASE,SERVICIO,MODALIDAD,ENFUGA
12393,17398,A000041000,1,CAMIONETA,PARTICULAR,SIN INFORMACION,N
12394,17399,A000041049,1,AUTOMOVIL,PUBLICO,PASAJEROS - INDIVIDUAL,N
12395,17400,A000041049,2,AUTOMOVIL,PARTICULAR,SIN INFORMACION,N
12396,17401,A000040444,1,AUTOMOVIL,PUBLICO,PASAJEROS - INDIVIDUAL,N
12397,17402,A000040444,2,AUTOMOVIL,PARTICULAR,SIN INFORMACION,N


In [17]:
# Causes

causes_df = pd.read_json("shapefiles/causes/causes_2015-2022.json")

# The only NaN values sit in TIPO and TIPO_CAUSA, in 11 rows associated with an
# undetermined cause. Every null in the frame is labelled "SIN ESTABLECER".
causes_df = causes_df.fillna("SIN ESTABLECER")

# Normalise the label "OTRAS" to "OTRA" (exact-value match only)
causes_df["NOMBRE"] = causes_df["NOMBRE"].replace("OTRAS", "OTRA")

causes_df.to_json("shapefiles/causes_2015-2022_r1.json")

In [18]:
# Sanity check: column dtypes and non-null counts of the causes frame
causes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 267525 entries, 30000 to 658182
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   OBJECTID         267525 non-null  int64 
 1   FORMULARIO       267525 non-null  object
 2   CODIGO_VEHICULO  267525 non-null  int64 
 3   CODIGO_CAUSA     267525 non-null  object
 4   NOMBRE           267525 non-null  object
 5   TIPO             267525 non-null  object
 6   TIPO_CAUSA       267525 non-null  object
dtypes: int64(2), object(5)
memory usage: 26.3+ MB


In [19]:
# Preview the first rows of the causes frame
causes_df.head()

Unnamed: 0,OBJECTID,FORMULARIO,CODIGO_VEHICULO,CODIGO_CAUSA,NOMBRE,TIPO,TIPO_CAUSA
30000,270001,A000690385,1,157,OTRA,CG,CONDUCTOR
30001,270002,A000690956,1,411,OTRA,PE,PEATON
30003,270004,A000688815,2,103,ADELANTAR CERRANDO,CG,CONDUCTOR
30004,270005,A000691245,1,157,OTRA,CG,CONDUCTOR
30005,270006,A000690664,2,134,REVERSO IMPRUDENTE,CG,CONDUCTOR
