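**TODO:** Delete pregnant and set age to 0. (Note: the "Handle pregnancy" section below currently replaces the age 999 with null rather than 0.)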

# Victims

In [2]:
import pandas as pd
import numpy as np

In [3]:
import pickle

def save_obj(obj, name):
    """Pickle ``obj`` to ``obj/<name>.pkl`` with the highest pickle protocol.

    Args:
        obj: Any picklable Python object.
        name: File stem (without ``.pkl``), relative to the ``obj/`` directory.

    Raises:
        OSError: If the target file cannot be created or written.
        pickle.PicklingError: If ``obj`` cannot be pickled.
    """
    import os  # local import so the helper does not depend on another cell

    path = 'obj/' + name + '.pkl'
    # Create the target directory on first use; otherwise open() raises
    # FileNotFoundError on a fresh checkout where obj/ does not exist yet.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object stored in ``obj/<name>.pkl``.

    Args:
        name: File stem (without ``.pkl``) inside the ``obj/`` directory.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If the pickle file does not exist.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that
    # this project wrote itself.
    pickle_path = 'obj/' + name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [57]:
# Column -> dtype for the victims CSV. The keys also act as the column
# whitelist when the file is read. Nullable Int16 is used wherever a numeric
# column may contain missing values.
_nullable_int16 = pd.Int16Dtype()

dtype_victims = dict(
    case_id=str,
    id=int,
    party_number=int,
    victim_age=_nullable_int16,          # 999 is treated later as the "unborn" sentinel
    victim_degree_of_injury=str,         # text labels, translated to integer codes later
    victim_ejected=_nullable_int16,
    victim_role=int,                     # read directly as int; no translation is applied
    victim_safety_equipment_1=str,       # single-character codes, kept as strings here
    victim_safety_equipment_2=str,       # single-character codes, kept as strings here
    victim_seating_position=_nullable_int16,  # no letters in data so we can use ints
    victim_sex=str,                      # text labels, translated to 'M'/'F' later
)

In [58]:
# Read only the columns declared in dtype_victims, with their declared dtypes.
victims_data = pd.read_csv(
    "CSV-2018/victims2018.csv",
    usecols=list(dtype_victims),
    dtype=dtype_victims,
)

In [59]:
# Work on an independent copy so the raw frame stays available for later cells.
clean_data = victims_data.copy(deep=True)

In [65]:
# Text label -> integer code for victim_degree_of_injury.
# The bare string "7" is also accepted (presumably it appears as-is in the raw
# column -- verify against the data) and keeps the code 7.
translation_victim_degree_of_injury = dict([
    ("killed", 1),
    ("severe injury", 2),
    ("other visible injury", 3),
    ("complaint of pain", 4),
    ("suspected serious injury", 5),
    ("suspected minor injury", 6),
    ("possible injury", 7),
    ("no injury", 0),
    ("7", 7),
])

# Text label -> integer code for victim_ejected; the code is the label's
# position in the tuple below.
translation_victim_ejected = {
    label: code
    for code, label in enumerate(
        ("not ejected", "fully ejected", "partially ejected", "unknown")
    )
}

# victim_role is read as an integer column, so no translation table is applied.
# Reference for the codes (kept from the earlier, disabled table):
#   1: "driver"
#   2: "passenger (includes non-operator on bicycle or any victim on/in parked vehicle or multiple victims on/in non-motor vehicle)"
#   3: "pedestrian"
#   4: "bicyclist"
#   5: "other (single victim on/in non-motor vehicle; e.g. ridden animal, horse-drawn carriage, train, or building)"
#   6: "non-injured party"

# Text label -> single-character code for victim_sex.
translation_victim_sex = dict(male='M', female='F')

In [66]:
def _valid_ejected(code):
    """Keep ejection codes 0-3; anything else (including missing) becomes None."""
    if pd.isna(code) or code > 3 or code < 0:
        return None
    return code


def _collapse_seat(seat):
    """Fold seating positions 3-6 into 2; other values pass through, missing becomes None."""
    if pd.isna(seat):
        return None
    return 2 if seat in (3, 4, 5, 6) else seat


# Every cleaned column is derived from the raw frame, so this cell can be re-run safely.
# Labels that are absent from a translation table become missing values.
clean_data["victim_degree_of_injury"] = (
    victims_data["victim_degree_of_injury"]
    .map(translation_victim_degree_of_injury.get)
    .astype("Int16")
)
clean_data["victim_sex"] = victims_data["victim_sex"].map(translation_victim_sex.get)
clean_data["victim_ejected"] = (
    victims_data["victim_ejected"].map(_valid_ejected).astype("Int16")
)
clean_data["victim_seating_position"] = (
    victims_data["victim_seating_position"].map(_collapse_seat).astype("Int16")
)

For `victim_seating_position`, the values 2 through 6 are redundant for our purposes, so we map all of them to 2.

In the `victim_ejected` column, four rows had the value 4. Since 4 is not one of the valid options for `victim_ejected` (0 to 3), we replaced those values with null.

---------------------------------------

# Create CSV files

## Victim CSV

In [67]:
# Columns that go into the victims table. The two safety-equipment columns are
# left out here; they are exported as a separate relation further down.
tmp_keys = [
    'case_id', 'id', 'party_number',
    'victim_age', 'victim_degree_of_injury', 'victim_ejected',
    'victim_role', 'victim_seating_position', 'victim_sex',
]
victims_csv = clean_data.loc[:, tmp_keys].copy()

### Handle pregnancy

To clean the data and make querying easier, we added a new field, `unborn`: a 'T'/'F' flag indicating whether the victim is an unborn child (it is left null when the age is missing). By convention, an age of 999 means that the victim is the fetus of a pregnant woman, so `unborn` is 'T' exactly when the recorded age is 999. We then replace the age 999 with null. We chose null rather than 0 because it makes more sense: with 0, the mean age of a 30-year-old pregnant woman and her unborn child would come out as a misleading 15 years.

In [68]:
# An age of 999 marks an unborn victim (fetus of a pregnant woman).
UNBORN_AGE_SENTINEL = 999

# Derive BOTH columns from the raw ages. Reading `victim_age` back from
# victims_csv would make this cell non-idempotent: after the first run the 999s
# are gone, so a second run would mark every victim as 'F'.
raw_victim_age = victims_data["victim_age"]

# 'T' = unborn, 'F' = born, None when the age itself is missing.
victims_csv["unborn"] = raw_victim_age.map(
    lambda age: None if pd.isna(age) else 'T' if age == UNBORN_AGE_SENTINEL else 'F'
)
# Replace the sentinel with a missing value; real ages are kept unchanged.
victims_csv["victim_age"] = raw_victim_age.map(
    lambda age: None if not pd.isna(age) and age == UNBORN_AGE_SENTINEL else age
).astype(pd.Int16Dtype())

### Associate party_id to each victim

Here we use the cleaned parties data to map each (`case_id`, `party_number`) pair to its unique `party_id`.

In [69]:
# Load the pickled parties frame (read from obj/clean_parties.pkl by load_obj).
clean_parties = load_obj(name="clean_parties")

In [70]:
# Lookup table: (case_id, party_number) -> party_id.
# If a pair occurs more than once in clean_parties, the last id wins.
party_keys = zip(clean_parties["case_id"], clean_parties["party_number"])
dict_party_id = dict(zip(party_keys, clean_parties["id"]))

In [71]:
# Resolve each victim's party via the lookup table. A victim whose
# (case_id, party_number) pair is missing from the table raises KeyError.
victims_csv["party_id"] = [
    dict_party_id[(case_id, party_number)]
    for case_id, party_number in zip(victims_csv["case_id"], victims_csv["party_number"])
]

In [72]:
# Final column set and order for the exported victims table: case_id and
# party_number are dropped now that party_id identifies the party.
relevant_keys = [
    'id', 'party_id',
    'victim_age', 'victim_degree_of_injury', 'victim_ejected',
    'victim_role', 'victim_seating_position', 'victim_sex',
    'unborn',
]
victims_csv = victims_csv.loc[:, relevant_keys].copy()

In [38]:
# Write the victims table without the DataFrame index.
victims_csv.to_csv(path_or_buf='out/victims.csv', index=False)

## safety_equipment relation CSV

In [73]:
# Both equipment columns are renamed to the same target column so they can be
# stacked into one (victim_id, safety_equipment_id) relation.
translate = {
    "id": "victim_id",
    "victim_safety_equipment_1": "safety_equipment_id",
    "victim_safety_equipment_2": "safety_equipment_id",
}

# Keep only the rows where the respective equipment slot is filled.
has_equipment_1 = clean_data["victim_safety_equipment_1"].notna()
has_equipment_2 = clean_data["victim_safety_equipment_2"].notna()
safety_equipment_1 = clean_data.loc[
    has_equipment_1, ["id", "victim_safety_equipment_1"]
].rename(columns=translate)
safety_equipment_2 = clean_data.loc[
    has_equipment_2, ["id", "victim_safety_equipment_2"]
].rename(columns=translate)

# Stack both slots; a victim listing the same equipment twice yields a single row.
safety_equipment_csv = pd.concat([safety_equipment_1, safety_equipment_2]).drop_duplicates()
sort_safety_equipment_csv = safety_equipment_csv.sort_values(by="victim_id")


In [74]:
# Write the victim <-> safety equipment relation without the DataFrame index.
sort_safety_equipment_csv.to_csv(
    path_or_buf='out/victim_equiped_with_safety_equipment.csv',
    index=False,
)

KeyboardInterrupt: 

In [75]:
# Distinct equipment codes present in the relation.
set(sort_safety_equipment_csv["safety_equipment_id"].unique())

{'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'J',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y'}

In [None]:
# Preview only the first rows instead of dumping the whole relation.
sort_safety_equipment_csv.head(10)

In [82]:
# Sanity check: rows whose victim_id starts with "0".
# Comparing on the string form works whether the ids are stored as ints or as
# strings. It also avoids a row-wise apply with positional Series indexing
# (x[0]), which is deprecated and fails on integer ids.
victim_id_text = sort_safety_equipment_csv["victim_id"].astype(str)
sort_safety_equipment_csv[victim_id_text.str.startswith("0")]

Unnamed: 0,victim_id,safety_equipment_id


In [None]:
# Sanity check: rows whose victim_id is not made up of digits only.
# Comparing on the string form works whether the ids are stored as ints or as
# strings. It also avoids a row-wise apply with positional Series indexing
# (x[0]), which is deprecated and fails on integer ids.
victim_id_text = sort_safety_equipment_csv["victim_id"].astype(str)
sort_safety_equipment_csv[~victim_id_text.str.isdigit()]