In [33]:
import pandas as pd
import json
import os
import glob

In [None]:
# Accidents
# Merge every raw accidents JSON file into a single DataFrame and save it as one JSON file.

# Merged file produced by this cell. It is written into the same folder as the raw inputs
accidents_output_path = "shapefiles/accidents/accidents_2015-2022.json"

# Retrieve the list of raw JSON files.
# - The merged output is skipped: it matches the same pattern, but it is a dump of the
#   DataFrame rather than a raw file with a "features" list, so picking it up when the
#   cell is re-run would break the loading step below.
# - The list is sorted because glob returns files in arbitrary order, which would make the
#   row order of the merged file change between runs.
json_pattern = os.path.join("./shapefiles/accidents",'*.json')
file_list = sorted(
    path for path in glob.glob(json_pattern)
    if os.path.abspath(path) != os.path.abspath(accidents_output_path)
)
if not file_list:
    # Fail with a clear message instead of the generic error pd.concat raises on an empty list
    raise FileNotFoundError("No raw JSON files found for pattern " + json_pattern)

df_list = [] # One DataFrame per raw JSON file
for i, file in enumerate(file_list, start = 1):
    # NOTE(review): the file is decoded with the platform's default encoding; pass an explicit
    # encoding here once the encoding of the raw files is confirmed
    with open(file, 'r') as f:
        data = json.load(f)

    # Flatten the records stored under "features" into one row each
    # https://towardsdatascience.com/how-to-convert-json-into-a-pandas-dataframe-100b2ae1e0d8. Check Section 3
    df_list.append(pd.json_normalize(data, "features"))
    print("Iter " + str(i)) # Progress trace

# Concatenate all DataFrames into a single one.
# ignore_index gives the result a fresh 0..n-1 index; every per-file DataFrame starts at 0, and
# repeated index labels can cause problems when saving the data in a JSON
accidents_df = pd.concat(df_list, axis = 0, ignore_index = True)

# Rename the columns by removing "attributes." from their names
accidents_df = accidents_df.rename(columns = lambda name: name.replace("attributes.", ""))

# Save the merged data in a JSON file
accidents_df.to_json(accidents_output_path)

# Uncomment to reload the merged file instead of rebuilding it:
# accidents_df = pd.read_json("shapefiles/accidents/accidents_2015-2022.json")

In [None]:
# Injured people
# Merge every raw injured-people JSON file into a single DataFrame and save it as one JSON file.

# Merged file produced by this cell. It is written into the same folder as the raw inputs
injured_output_path = "shapefiles/injured/injured_people_2015-2022.json"

# Retrieve the list of raw JSON files.
# - The merged output is skipped: it matches the same pattern, but it is a dump of the
#   DataFrame rather than a raw file with a "features" list, so picking it up when the
#   cell is re-run would break the loading step below.
# - The list is sorted because glob returns files in arbitrary order, which would make the
#   row order of the merged file change between runs.
json_pattern = os.path.join("./shapefiles/injured",'*.json')
file_list = sorted(
    path for path in glob.glob(json_pattern)
    if os.path.abspath(path) != os.path.abspath(injured_output_path)
)
if not file_list:
    # Fail with a clear message instead of the generic error pd.concat raises on an empty list
    raise FileNotFoundError("No raw JSON files found for pattern " + json_pattern)

df_list = [] # One DataFrame per raw JSON file
for i, file in enumerate(file_list, start = 1):
    # NOTE(review): the file is decoded with the platform's default encoding; pass an explicit
    # encoding here once the encoding of the raw files is confirmed
    with open(file, 'r') as f:
        data = json.load(f)

    # Flatten the records stored under "features" into one row each
    # https://towardsdatascience.com/how-to-convert-json-into-a-pandas-dataframe-100b2ae1e0d8. Check Section 3
    df_list.append(pd.json_normalize(data, "features"))
    print("Iter " + str(i)) # Progress trace

# Concatenate all DataFrames into a single one.
# ignore_index gives the result a fresh 0..n-1 index; every per-file DataFrame starts at 0, and
# repeated index labels can cause problems when saving the data in a JSON
injured_people_df = pd.concat(df_list, axis = 0, ignore_index = True)

# Rename the columns by removing "attributes." from their names
injured_people_df = injured_people_df.rename(columns = lambda name: name.replace("attributes.", ""))

# Save the merged data in a JSON file
injured_people_df.to_json(injured_output_path)

# Uncomment to reload the merged file instead of rebuilding it (same path it is written to above):
# injured_people_df = pd.read_json("shapefiles/injured/injured_people_2015-2022.json")

In [None]:
# Killed people

# The killed-people download covers 2015 up to the day it was fetched, while the rest of the
# data set stops at Aug 2022. Records from Sep 2022 onwards are removed here by keeping only
# the rows whose accident code also appears in the accidents data.
# Requires accidents_df (built in the accidents cell, or reloaded with the line further down).

# Load the raw killed-people file and flatten the records stored under "features"
with open('shapefiles/killed/killed_people_2015-2022.json','r') as f:
    data = json.load(f)
killed_people_df = pd.json_normalize(data, record_path = ["features"])

# Old column name -> same name with "attributes." removed
column_dict = {
    old_name: old_name.replace("attributes.", "")
    for old_name in killed_people_df.columns
}
killed_people_df = killed_people_df.rename(columns = column_dict)

# Accident codes used as the filter reference
# accidents_df = pd.read_json("shapefiles/accidents/accidents_2015-2022.json")
accident_code_list = accidents_df["FORMULARIO"].to_list()

# Keep only the killed-people rows that belong to a known accident
has_known_accident = killed_people_df["FORMULARIO"].isin(accident_code_list)
killed_people_df_r = killed_people_df[has_known_accident].copy()

# Save the filtered DataFrame in a JSON file
killed_people_df_r.to_json("shapefiles/killed/killed_people_2015-2022_r1.json")

# Uncomment to reload the filtered file instead of rebuilding it:
# killed_people_df = pd.read_json("shapefiles/killed/killed_people_2015-2022_r1.json")

In [None]:
# Vehicles
# Merge every raw vehicles JSON file, keep only the rows tied to a known accident, and save
# the result as one JSON file.
# Requires accidents_df (built in the accidents cell, or reloaded with the line further down).

# Merged file produced by this cell. It is written into the same folder as the raw inputs
vehicles_output_path = "shapefiles/vehicles/vehicles_2015-2022.json"

# Retrieve the list of raw JSON files.
# - The merged output is skipped: it matches the same pattern, but it is a dump of the
#   DataFrame rather than a raw file with a "features" list, so picking it up when the
#   cell is re-run would break the loading step below.
# - The list is sorted because glob returns files in arbitrary order, which would make the
#   row order of the merged file change between runs.
json_pattern = os.path.join("./shapefiles/vehicles",'*.json')
file_list = sorted(
    path for path in glob.glob(json_pattern)
    if os.path.abspath(path) != os.path.abspath(vehicles_output_path)
)
if not file_list:
    # Fail with a clear message instead of the generic error pd.concat raises on an empty list
    raise FileNotFoundError("No raw JSON files found for pattern " + json_pattern)

df_list = [] # One DataFrame per raw JSON file
for i, file in enumerate(file_list, start = 1):
    # NOTE(review): the file is decoded with the platform's default encoding; pass an explicit
    # encoding here once the encoding of the raw files is confirmed
    with open(file, 'r') as f:
        data = json.load(f)

    # Flatten the records stored under "features" into one row each
    df_list.append(pd.json_normalize(data, "features"))
    print("Iter " + str(i)) # Progress trace

# Concatenate all DataFrames into a single one with a fresh 0..n-1 index
vehicles_df = pd.concat(df_list, axis = 0, ignore_index = True)

# Rename the columns by removing "attributes." from their names
vehicles_df = vehicles_df.rename(columns = lambda name: name.replace("attributes.", ""))

# We make sure to remove unnecessary data from vehicles.
# We're working with data from 2015 up to Aug 2022. When bringing data from vehicles, we brought it all up to the current day

# Filter vehicles by accident codes. The codes are read straight from accidents_df so this
# cell does not depend on a list created in the killed-people cell
# accidents_df = pd.read_json("shapefiles/accidents/accidents_2015-2022.json")
vehicles_df_r = vehicles_df[vehicles_df["FORMULARIO"].isin(accidents_df["FORMULARIO"])].copy()

# Save the filtered data in a JSON file
vehicles_df_r.to_json(vehicles_output_path)

# Uncomment to reload the saved file instead of rebuilding it:
# vehicles_df = pd.read_json("shapefiles/vehicles/vehicles_2015-2022.json")

In [None]:
# Causes
# Merge every raw causes JSON file, keep only the rows tied to a known accident, and save
# the result as one JSON file.
# Requires accidents_df (built in the accidents cell, or reloaded with the line further down).

# Merged file produced by this cell. It is written into the same folder as the raw inputs
causes_output_path = "shapefiles/causes/causes_2015-2022.json"

# Retrieve the list of raw JSON files.
# - The merged output is skipped: it matches the same pattern, but it is a dump of the
#   DataFrame rather than a raw file with a "features" list, so picking it up when the
#   cell is re-run would break the loading step below.
# - The list is sorted because glob returns files in arbitrary order, which would make the
#   row order of the merged file change between runs.
json_pattern = os.path.join("./shapefiles/causes",'*.json')
file_list = sorted(
    path for path in glob.glob(json_pattern)
    if os.path.abspath(path) != os.path.abspath(causes_output_path)
)
if not file_list:
    # Fail with a clear message instead of the generic error pd.concat raises on an empty list
    raise FileNotFoundError("No raw JSON files found for pattern " + json_pattern)

df_list = [] # One DataFrame per raw JSON file
for i, file in enumerate(file_list, start = 1):
    # NOTE(review): the file is decoded with the platform's default encoding; pass an explicit
    # encoding here once the encoding of the raw files is confirmed
    with open(file, 'r') as f:
        data = json.load(f)

    # Flatten the records stored under "features" into one row each
    df_list.append(pd.json_normalize(data, "features"))
    print("Iter " + str(i)) # Progress trace

# Concatenate all DataFrames into a single one with a fresh 0..n-1 index
causes_df = pd.concat(df_list, axis = 0, ignore_index = True)

# Rename the columns by removing "attributes." from their names
causes_df = causes_df.rename(columns = lambda name: name.replace("attributes.", ""))

# We make sure to remove unnecessary data from causes.
# We're working with data from 2015 up to Aug 2022. When bringing data from causes, we brought it all up to the current day

# Filter causes by accident codes. The codes are read straight from accidents_df so this
# cell does not depend on a list created in the killed-people cell
# accidents_df = pd.read_json("shapefiles/accidents/accidents_2015-2022.json")
causes_df_r = causes_df[causes_df["FORMULARIO"].isin(accidents_df["FORMULARIO"])].copy()

# Save the filtered data in a JSON file
causes_df_r.to_json(causes_output_path)

# Uncomment to reload the saved file instead of rebuilding it:
# causes_df = pd.read_json("shapefiles/causes/causes_2015-2022.json")

In [None]:
# Actors
# Merge every raw actors JSON file, keep only the rows tied to a known accident, and save
# the result as one JSON file.
# Requires accidents_df (built in the accidents cell, or reloaded with the line further down).

# Merged file produced by this cell. It is written into the same folder as the raw inputs
actors_output_path = "shapefiles/actors/actors_2015-2022.json"

# Retrieve the list of raw JSON files.
# - The merged output is skipped: it matches the same pattern, but it is a dump of the
#   DataFrame rather than a raw file with a "features" list, so picking it up when the
#   cell is re-run would break the loading step below.
# - The list is sorted because glob returns files in arbitrary order, which would make the
#   row order of the merged file change between runs.
json_pattern = os.path.join("./shapefiles/actors",'*.json')
file_list = sorted(
    path for path in glob.glob(json_pattern)
    if os.path.abspath(path) != os.path.abspath(actors_output_path)
)
if not file_list:
    # Fail with a clear message instead of the generic error pd.concat raises on an empty list
    raise FileNotFoundError("No raw JSON files found for pattern " + json_pattern)

df_list = [] # One DataFrame per raw JSON file
for i, file in enumerate(file_list, start = 1):
    # NOTE(review): the file is decoded with the platform's default encoding; pass an explicit
    # encoding here once the encoding of the raw files is confirmed
    with open(file, 'r') as f:
        data = json.load(f)

    # Flatten the records stored under "features" into one row each
    df_list.append(pd.json_normalize(data, "features"))
    print("Iter " + str(i)) # Progress trace

# Concatenate all DataFrames into a single one with a fresh 0..n-1 index
actors_df = pd.concat(df_list, axis = 0, ignore_index = True)

# Rename the columns by removing "attributes." from their names
actors_df = actors_df.rename(columns = lambda name: name.replace("attributes.", ""))

# We make sure to remove unnecessary data from actors.
# We're working with data from 2015 up to Aug 2022. When bringing data from actors, we brought it all up to the current day

# Filter actors by accident codes. The codes are read straight from accidents_df so this
# cell does not depend on a list created in the killed-people cell
# accidents_df = pd.read_json("shapefiles/accidents/accidents_2015-2022.json")
actors_df_r = actors_df[actors_df["FORMULARIO"].isin(accidents_df["FORMULARIO"])].copy()

# Save the filtered data in a JSON file
actors_df_r.to_json(actors_output_path)

# Uncomment to reload the saved file instead of rebuilding it:
# actors_df = pd.read_json("shapefiles/actors/actors_2015-2022.json")