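Delete the "pregnancy" age value and set the age to 0 for those victims. (Note: the export cell below currently stores a null age together with an `unborn` flag instead of 0.)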

# Victims

In [1]:
import pandas as pd
import numpy as np

In [2]:
import pickle

def save_obj(obj, name ):
    """Pickle ``obj`` to ``obj/<name>.pkl`` using the highest pickle protocol.

    The target directory is created when it does not exist yet, so the first
    save in a fresh working directory does not fail with ``FileNotFoundError``.

    Parameters
    ----------
    obj : any picklable object
        The value to persist.
    name : str
        File stem, without the ``.pkl`` extension.
    """
    import os  # local import so this cell stays runnable on its own

    path = 'obj/' + name + '.pkl'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object stored in ``obj/<name>.pkl``.

    Unpickling can execute arbitrary code, so only load files that this
    project wrote itself.
    """
    pickle_path = 'obj/' + name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [3]:
# Column -> dtype for the victims CSV. The key order matters: later cells
# derive their column lists from it.
dtype_victims = dict(
    # The largest case id is about 9.87e18, which does not fit a signed 64-bit
    # int, so an unsigned 64-bit integer (0 .. 2**64 - 1) is used.
    case_id=np.uint64,
    id=int,
    party_number=int,
    # Nullable integer; 999 is handled as a special code by the export cell.
    victim_age=pd.Int16Dtype(),
    # Read as text, translated to integer codes during cleaning.
    victim_degree_of_injury=str,
    # Nullable integer.
    victim_ejected=pd.Int16Dtype(),
    victim_role=int,
    # Safety equipment is encoded as characters, so both slots are read as text.
    victim_safety_equipment_1=str,
    victim_safety_equipment_2=str,
    # No letters occur in the data, so a nullable integer works here.
    victim_seating_position=pd.Int16Dtype(),
    # Read as text, translated to a single character during cleaning.
    victim_sex=str,
)

In [4]:
# Read only the columns declared in dtype_victims, each with its declared dtype.
victims_data = pd.read_csv(
    "CSV-2018/victims2018.csv",
    usecols=list(dtype_victims),
    dtype=dtype_victims,
)

In [5]:
# Preview the loaded frame; the rendered output elides the middle rows.
victims_data

Unnamed: 0,case_id,id,party_number,victim_age,victim_degree_of_injury,victim_ejected,victim_role,victim_safety_equipment_1,victim_safety_equipment_2,victim_seating_position,victim_sex
0,3,3,1,21,no injury,0,2,G,,3,male
1,5,4,1,44,severe injury,0,1,G,,1,male
2,8,5,1,59,no injury,0,2,G,,3,female
3,8,6,2,31,no injury,0,2,G,,3,male
4,9,7,2,14,complaint of pain,0,2,C,,6,female
...,...,...,...,...,...,...,...,...,...,...,...
4082680,9870011226102009803,8817531,2,20,other visible injury,0,1,G,,1,female
4082681,9870011231152508671,8817532,1,27,complaint of pain,0,2,H,,3,female
4082682,9870011231152508671,8817533,2,74,complaint of pain,0,2,G,,3,female
4082683,9870011231152508671,8817534,2,64,complaint of pain,0,1,G,,1,male


In [6]:
# Clean on a copy so the raw frame stays available to the later cells that read it.
clean_data = victims_data.copy()

In [7]:
# Numeric codes for the two non-numeric victim_age labels.
# NOTE(review): no cell visible in this notebook references this dict; the
# export cell compares victim_age against 999 directly — confirm it is still needed.
translation_victim_age = {
    "not stated": 998,
    "pregnancy": 999,
}

# Injury label -> integer code, applied to victim_degree_of_injury in the
# cleaning cell; labels that are not listed here become null there.
# The literal "7" maps to the same code as "possible injury" — presumably the
# raw data sometimes carries the code itself instead of the label; TODO confirm.
translation_victim_degree_of_injury = {
    "killed": 1,
    "severe injury": 2,
    "other visible injury": 3,
    "complaint of pain": 4,
    "suspected serious injury": 5,
    "suspected minor injury": 6,
    "possible injury": 7,
    "no injury": 0,
    "7": 7,
}

# Ejection label -> integer code (0..3).
# NOTE(review): no cell visible in this notebook references this dict; the
# victim_ejected column is read as integers and the cleaning cell only keeps
# values in the 0..3 range, which matches the codes listed here.
translation_victim_ejected = {
    "not ejected": 0,
    "fully ejected": 1,
    "partially ejected": 2,
    "unknown": 3,
}

# Victim role label -> integer code (1..6).
# NOTE(review): no cell visible in this notebook references this dict; the
# victim_role column is read directly as int — confirm whether it is still needed.
translation_victim_role = {
    "driver": 1,
    "passenger (includes non-operator on bicycle or any victim on/in parked vehicle or multiple victims on/in non-motor vehicle)": 2,
    "pedestrian": 3,
    "bicyclist": 4,
    "other (single victim on/in non-motor vehicle; e.g. ridden animal, horse-drawn carriage, train, or building)": 5,
    "non-injured party": 6,
}

# Sex label -> single-character code, applied to victim_sex in the cleaning
# cell; any value not listed here becomes None there.
translation_victim_sex = {
    "male": 'M',
    "female": 'F',
}

In [8]:
# Injury severity: label -> integer code (labels missing from the translation
# table become null), stored as a nullable 16-bit integer.
clean_data["victim_degree_of_injury"] = (
    victims_data["victim_degree_of_injury"]
    .map(translation_victim_degree_of_injury.get)
    .astype(pd.Int16Dtype())
)

# Sex: "male"/"female" -> 'M'/'F'; every other value becomes None.
clean_data["victim_sex"] = victims_data["victim_sex"].map(translation_victim_sex.get)


def _valid_ejected_code(code):
    """Return ``code`` when it is a known ejection code (0..3), else None."""
    if pd.isna(code):
        return None
    if code > 3 or code < 0:
        return None
    return code


# Ejection status: values outside 0..3 are replaced by null.
clean_data["victim_ejected"] = (
    victims_data["victim_ejected"]
    .map(_valid_ejected_code)
    .astype(pd.Int16Dtype())
)

In the `victim_ejected` column, four rows had the value 4. Since 4 is not one of the preset options for `victim_ejected`, we replaced those values with null.

---------------------------------------

# Create CSV files

## Victim CSV

In [9]:
# Load the cleaned parties data from obj/clean_parties.pkl.
# NOTE(review): presumably written earlier with save_obj by the parties-cleaning
# step — confirm the pickle exists before running this cell.
clean_parties = load_obj("clean_parties")

In [10]:
# Lookup table: (case_id, party_number) -> id of the matching row in clean_parties.
# If a (case_id, party_number) pair occurs more than once, the last id wins.
dict_party_id = dict(
    zip(
        zip(clean_parties["case_id"], clean_parties["party_number"]),
        clean_parties["id"],
    )
)

In [13]:
# Columns needed to build the victims table: everything except the two
# safety-equipment slots, which are exported as a separate relation.
tmp_keys = [
    column
    for column in dtype_victims
    if column not in ("victim_safety_equipment_1", "victim_safety_equipment_2")
]

# Columns actually written to the victims CSV: case_id and party_number are
# dropped and the derived party_id and unborn columns are appended.
relevant_keys = [
    column for column in tmp_keys if column not in ("case_id", "party_number")
]
relevant_keys += ["party_id", "unborn"]

In [14]:
def _unborn_flag(age):
    """Return 'T' for the age code 999, 'F' for any other age, None when missing."""
    if pd.isna(age):
        return None
    return 'T' if age == 999 else 'F'


def _age_without_unborn_code(age):
    """Return ``age`` unchanged, except that the code 999 becomes None."""
    if pd.isna(age) or age != 999:
        return age
    return None


victims_csv = clean_data[tmp_keys].copy()

# The age code 999 is turned into an explicit flag and removed from the age column.
# NOTE(review): translation_victim_age also lists 998 ("not stated"); such
# values, if present, are exported as a literal age of 998 — confirm this is intended.
victims_csv["unborn"] = victims_csv['victim_age'].map(_unborn_flag)
victims_csv["victim_age"] = (
    victims_data["victim_age"]
    .map(_age_without_unborn_code)
    .astype(pd.Int16Dtype())
)

# Resolve each victim's (case_id, party_number) pair to the party id.
# A pair missing from dict_party_id raises KeyError.
victims_csv["party_id"] = [
    dict_party_id[(case_id, party_number)]
    for case_id, party_number in zip(victims_csv["case_id"], victims_csv["party_number"])
]

# Keep only the exported columns, in their final order.
victims_csv = victims_csv[relevant_keys].copy()

In [15]:
# Write the victims table without the DataFrame index; the out/ directory must already exist.
victims_csv.to_csv('out/victims.csv', index=False)

## safety_equipment relation CSV

In [25]:
# Column renames for the victim <-> safety-equipment relation: both equipment
# slots collapse into the same safety_equipment_id column.
translate = {
    "id": "victim_id",
    **dict.fromkeys(
        ("victim_safety_equipment_1", "victim_safety_equipment_2"),
        "safety_equipment_id",
    ),
}

In [39]:
# (victim_id, safety_equipment_id) pairs for victims whose first equipment slot is filled.
has_equipment_1 = clean_data["victim_safety_equipment_1"].notna()
safety_equipment_1 = (
    clean_data.loc[has_equipment_1, ["id", "victim_safety_equipment_1"]]
    .rename(columns=translate)
)

In [41]:
# (victim_id, safety_equipment_id) pairs for victims whose second equipment slot is filled.
# The variable name keeps its original spelling because later cells refer to it.
has_equipment_2 = clean_data["victim_safety_equipment_2"].notna()
safety_equipement_2 = (
    clean_data.loc[has_equipment_2, ["id", "victim_safety_equipment_2"]]
    .rename(columns=translate)
)

In [47]:
# Preview the second-slot relation rows; the rendered output elides the middle rows.
safety_equipement_2

Unnamed: 0,victim_id,safety_equipment_id
35,38,G
66,69,G
114,117,L
115,118,L
116,119,C
...,...,...
3799842,8476890,G
3799843,8515485,H
3799844,8533030,G
3799845,8533031,G


In [49]:
# Stack the relation rows from all sources into one frame (row-wise concatenation).
# NOTE(review): `null_safety_equipment` is not defined in any cell visible in this
# notebook, and the execution counter jumps from 47 to 49, so its defining cell may
# have been deleted. On a fresh kernel this line would raise NameError — restore
# or re-create that cell before relying on Restart & Run All.
safety_equipment_csv = pd.concat([null_safety_equipment, safety_equipment_1, safety_equipement_2], axis=0)

In [50]:
# Order the relation rows by victim id so each victim's rows are adjacent in the export.
sort_safety_equipment_csv = safety_equipment_csv.sort_values("victim_id")

In [51]:
# Write the relation table without the DataFrame index; the out/ directory must already exist.
sort_safety_equipment_csv.to_csv('out/victim_equiped_with_safety_equipment.csv', index=False)