# Data preparation

### Data Loading and Merging

The data comes from two separate sources: Manually coded data and ISS data. These are joined.

In [1158]:
import numpy as np
import pandas as pd

In [1159]:
data_dir = "../data/"
dd = pd.read_csv(
    data_dir + "raw/data_dictionary.csv",
    delimiter = ","
)
sample = pd.read_csv(
    data_dir + "raw/sample_iss_18012020.csv",
    low_memory = False
)[["centre", "pid", "iss"]]
nomesco_snomed = pd.read_csv(
    data_dir + "raw/ttris_nomesco_snomed.csv", 
    delimiter = ";",
    low_memory = False
)

In [1160]:
data = pd.merge(nomesco_snomed, sample, on = ["pid", "centre"])

In [1161]:
data.head()

Unnamed: 0,centre,pid,cr,qcrec,vsqs,ic,age,sex,moi,tyi,...,iopinj4,iopinj4icd,iopinj5,iopinj5icd,iopinj6,iopinj6icd,testvar,locked,X0,iss
0,1515,5042,0,0,0,1,31,1,W01,0,...,999,999,999,999,999,999,0,0,,9.0
1,1515,5108,0,0,0,1,72,1,W01,0,...,999,999,999,999,999,999,0,0,,
2,1515,5077,0,0,0,1,39,1,W13,0,...,999,999,999,999,999,999,0,0,,9.0
3,1515,5028,0,0,0,1,33,1,W01,0,...,999,999,999,999,999,999,0,0,,
4,1515,5024,0,0,0,1,35,1,W01,0,...,999,999,999,999,999,999,0,0,,


Subset relevant variables

In [1162]:
sv = ["age", "moi", "sex", "mot", "tran", "s30d", "egcs", "mgcs", "vgcs", "avpu", "hr",
      "sbp", "dbp", "spo2", "rr", "tc", "ic", "doar", "toar", "toa", "doa", "doi", "toi", "s24h", "hd",
      "taicu", "daicu", "tos", "dos", "nomesco", "snomed", "s", "iss"]
data = data[sv]

In [1163]:
data.to_csv(data_dir + "interim/merged_samples.csv")

### Recoding of Data

Recode,
* `999` indicates missing..
* `0` for doa and toa is not admitted, same for doar and toar, hence missing.
* `0` for dos and tos is not taken to surgery within 30 days of admission.

In [1164]:
data.replace([999, "999"], np.nan, inplace = True)

In [1165]:
data[[
    "toa",
    "toi",
    "taicu",
    "tos",
    "toar"
]] = data[["toa", "toi", "taicu", "tos", "toar"]].replace('0', '00:00:00')

In [1166]:
data[[
    "doa",
    "daicu",
    "dos",
    "doar"
]] = data[["doa", "daicu", "dos", "doar"]].replace('0', '1970-01-01')

### Exclusion criteria

Include only
* Patients that infomed consent,
* Adult patients (Those >= 18 years old)
* Patients with complete data on s30d, and
* Patients with complete data on triage category.

In [1167]:
data_excl = data.loc[
    (data.ic == 1)&
    (data.age >= 18) &
    (data.s30d.notnull()) &
    (data.tc.notnull())
]

In [1168]:
data_excl.to_csv(data_dir + "interim/merged_samples_clean.csv")

### Feature engineering

* Create `time_to_arrival` feature
* Add admission to ICU within 48H
* Add binary for major surgery within 24H
* Add ISS15, binary if iss is greater than or equal to 15
* Add composite outcome
* Collapse mechanism of injury
* Conditional for outcome

#### Time to arrival feature

Enter 0 if any is 0, nan if nan, else convert to datetime. Probably exists a more efficient way to do this.

In [1169]:
time_of_injury = pd.to_datetime(data_excl.doi + " " + data_excl.toi)

In [1170]:
time_of_arrival = pd.to_datetime(data_excl.doa + " " + data_excl.toa)

In [1171]:
data_excl = data_excl.assign(delay = (time_of_arrival - time_of_injury).astype('timedelta64[m]'))
data_excl.delay = [0 if x < 0 else x for x in data_excl.delay]

#### Admission to ICU with 48h feature

In [1172]:
ed_admit = pd.to_datetime(data_excl.doar + " " + data_excl.toar)
icu_admit = pd.to_datetime(data_excl.daicu + " " + data_excl.taicu)
time_to_icu = (icu_admit - ed_admit).astype('timedelta64[h]')

In [1173]:
icu48h = [1 if x <= 48 else 0 for x in time_to_icu]
data_excl = data_excl.assign(icu48h = icu48h)

In [1174]:
data_excl.drop(columns = [
    "daicu",
    "taicu"
], inplace = True)

#### Binary for major surgery within 24h

In [1175]:
true_major_codes = "446816008.0|446683008.0|446115005.0|178674000.0|275093002.0|272300006.0|112777008.0|57470004.0|8476009.0|35340001.0|23036009.0|439756000.0|274457001.0|74011006.0|73231008.0|67319007.0|56413001.0|36777000.0|74770008.0"

In [1176]:
true_major = data_excl.snomed.astype(str).str.contains(true_major_codes)

In [1177]:
date_time_surgery = pd.to_datetime(data_excl.dos + " " + data_excl.tos)
date_time_ed = pd.to_datetime(data_excl.doar + " " + data_excl.toar)
time_to_surgery = (date_time_surgery - date_time_ed).astype('timedelta64[h]')

In [1178]:
majors24h = ((true_major == True) & (time_to_surgery <= 24)).astype(int)

If `dos` or `tos` is 0, then s should be coded as 0 

In [1179]:
cond = (data_excl.dos == '1970-01-01') | (data_excl.tos == '00:00:00')
data_excl.loc[:, "s"] = pd.Series(['0' if b == True else x for b, x in zip(cond, data_excl.s)])

If NA in `s`, then `majors24h` should also be NA

In [1180]:
majors24h = pd.Series([x if pd.isnull(x) else m24h for x, m24h in zip(s, majors24h)])

Drop the irrelevant columns

In [1181]:
data_excl.drop(columns = [
    "s",
    "dos",
    "tos",
    "doar",
    "toar",
    "toa",
    "doa",
    "doi", 
    "toi",
    "snomed",
    "nomesco",
], inplace = True)

In [1182]:
data_excl = data_excl.assign(majors24h = majors24h)

#### Add ISS15

In [1183]:
iss15 = np.where(data_excl.iss >= 15, 1, 0)
iss15 = pd.Series([np.nan if pd.isnull(x) else y for x,y in zip(data_excl.iss, iss15)])

In [1184]:
data_excl = data_excl.assign(iss15 = iss15).drop(columns = ["iss"])

#### Add composite outcome

In [1185]:
data_excl = data_excl.assign(
    composite = ((icu48h == 1) | (data_excl.s24h == "Yes") | (majors24h == 1) | (iss15 == 1)).astype(int)
)

#### Collapse mechanism of injury to single variable

In [1186]:
below_ten = ["0" + str(x) for x in range(10)]

In [1187]:
d = {
    "Transportation accident" : ["V" + str(x) for x in below_ten + [str(y) for y in range(10, 100)]],  ## Transport accidents
    "Falls" : ["W" + str(x) for x in below_ten + [str(y) for y in range(10, 20)]],                     ## Falls
    "Burns" : ["X" + str(x) for x in below_ten + [str(y) for y in range(10, 20)]],                     ## Burns
    "Other external causes of accidental injury" : ["W" + str(x) for x in range(20, 100)] + ["X" + str(x) for x in range(20, 60)],  ## Other external causes of accidental injury
    "Intentional self-harm" : ["X" + str(x) for x in range(60, 85)],                                   ## Intential self-harm
    "Assault" : ["X" + str(x) for x in range(85, 100)] + ["Y" + str(x) for x in below_ten],            ## Assault
    "Event of undetermined intent" : ["Y" + str(x) for x in range(10, 35)],                            ## Events of undetermined intent
    "Legal intervention" : ["Y" + str(x) for x in [35, 36]],                                           ## Legal intervention
}

In [1188]:
for k, v in d.items():
    for i, x in enumerate(data_excl.moi):
        if x in v: data_excl.moi.iloc[i, :] = k

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value


#### Set to outcome

Outcome variable is set to dead if patients were dead on discharge after 30 days or if dead after 24 hours

In [1189]:
data_excl.loc[((data_excl.hd == 1) & pd.notnull(data_excl.hd)) | ((data_excl.s24h == 1) & (pd.notnull(data_excl.s24h))), "s30d"] = 1

Remove the informed consent variable variable

In [1190]:
data_excl.drop(columns = ["ic", "icu48h", "majors24h", "iss15"], inplace = True)

Write the prepared sample to disk

### Write to disk

Write to separate datasets to disk,
* For the `s30d` outcome, drop `s24h` and `composite`
* For the `composite` outcome, drop `s30d`, `s24h`

Split datasets first

In [1191]:
from sklearn.model_selection import train_test_split

For `s30d`

In [1192]:
X_train_s30d, X_test_s30d, y_train_s30d, y_test_s30d = train_test_split(
    data_excl.drop(columns = ["s24h", "composite"]),
    data_excl.s30d,
    random_state = 123,
    stratify = data_excl.s30d
)

In [1193]:
y_train_s30d.value_counts()

0.0    5860
1.0     328
2.0      33
Name: s30d, dtype: int64

In [1194]:
X_train_s30d.to_csv(data_dir + "processed/s30d/X_train_s30d.csv")
X_test_s30d.to_csv(data_dir + "processed/s30d/X_test_s30d.csv")
y_train_s30d.to_csv(data_dir + "processed/s30d/y_train_s30d.csv")
y_test_s30d.to_csv(data_dir + "processed/s30d/y_test_s30d.csv")

For `composite`

In [1195]:
X_train_composite, X_test_composite, y_train_composite, y_test_composite = train_test_split(
    data_excl.drop(columns = ["s30d", "s24h"]),
    data_excl.composite,
    random_state = 123,
    stratify = data_excl.composite
)

In [1196]:
y_train_composite.value_counts()

0    6101
1     120
Name: composite, dtype: int64

In [1197]:
X_train_composite.to_csv(data_dir + "processed/composite/X_train_composite.csv")
X_test_composite.to_csv(data_dir + "processed/composite/X_test_composite.csv")
y_train_composite.to_csv(data_dir + "processed/composite/y_train_composite.csv")
y_test_composite.to_csv(data_dir + "processed/composite/y_test_composite.csv")