# Data preparation

### Data Loading and Merging

The data comes from two separate sources: Manually coded data and ISS data. These are joined.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

from tableone import TableOne

In [3]:
import os

In [4]:
os.chdir("..")

In [5]:
data_dir = "./data/"
data_dictionary = pd.read_csv(
    data_dir + "raw/data_dictionary.csv",
    delimiter = ","
)
sample = pd.read_csv(
    data_dir + "raw/sample_iss_18012020.csv",
    low_memory = False
)[["centre", "pid", "iss"]]
nomesco_snomed = pd.read_csv(
    data_dir + "raw/ttris_nomesco_snomed.csv", 
    delimiter = ";",
    low_memory = False
)

In [6]:
data = pd.merge(nomesco_snomed, sample, on = ["pid", "centre"])

Subset relevant variables

In [7]:
sv = ["age", "moi", "sex", "mot", "tran", "s30d", "egcs", "mgcs", "vgcs", "avpu", "hr",
      "sbp", "dbp", "spo2", "rr", "tc", "ic", "doar", "toar", "toa", "doa", "doi", "toi", "s24h", "hd",
      "taicu", "daicu", "tos", "dos", "nomesco", "snomed", "s", "iss"]
data = data[sv]

In [8]:
data.to_csv(data_dir + "interim/merged_samples.csv")

### Recoding of Data

Recode,
* `999` indicates missing..
* `0` for doa and toa is not admitted, same for doar and toar, hence missing.
* `0` for dos and tos is not taken to surgery within 30 days of admission.

In [9]:
data.replace([999, "999"], np.nan, inplace = True)

In [10]:
data[[
    "toa",
    "toi",
    "taicu",
    "tos",
    "toar"
]] = data[["toa", "toi", "taicu", "tos", "toar"]].replace('0', '00:00:00')

In [11]:
data[[
    "doa",
    "daicu",
    "dos",
    "doar"
]] = data[["doa", "daicu", "dos", "doar"]].replace('0', '1970-01-01')

### Exclusion criteria

Include only
* Patients that infomed consent,
* Adult patients (Those >= 18 years old)
* Patients with complete data on s30d, and
* Patients with complete data on triage category.

In [31]:
data_before_ic = data
data_after_ic = data_before_ic.loc[(data.ic == 1)]
n_removed_ic = len(data.index) - len(data_after_ic.index)

data_before_age = data_after_ic
data_after_age = data_before_age.loc[(data_before_age.age >= 18)]
n_removed_age = len(data_before_age.index) - len(data_after_age.index)

data_before_tc = data_after_age
data_after_tc = data_before_tc.loc[data_before_tc.tc.notnull()]
n_removed_tc = len(data_before_tc.index) - len(data_after_tc.index)

data_before_s30d = data_after_tc
data_after_s30d = data_before_s30d.loc[data_before_s30d.s30d.notnull()]
n_removed_s30d = len(data_before_s30d.index) - len(data_after_s30d.index)

data_excl = data_after_tc.copy(deep=True)

In [33]:
labels = ["original",
          "n_removed_ic", 
          "n_ic",
          "n_removed_age", 
          "n_age", 
          "n_removed_tc",
          "n_tc",
          "n_removed_s30d",
          "n_final",]

excl_series = pd.Series([
    len(data.index), 
    n_removed_ic,
    len(data_after_ic.index), 
    n_removed_age,
    len(data_after_age.index), 
    n_removed_tc,
    len(data_after_tc.index),
    n_removed_s30d, 
    len(data_after_s30d.index)
], index = labels)
excl_series.to_csv(data_dir + "interim/excl.csv")

In [34]:
pd.read_csv(data_dir + "interim/excl.csv", index_col = 0).iloc[:, 0]

original          12412
n_removed_ic        533
n_ic              11879
n_removed_age         7
n_age             11872
n_removed_tc        297
n_tc              11575
n_removed_s30d     3280
n_final            8295
Name: 0, dtype: int64

In [28]:
data_excl.to_csv(data_dir + "interim/merged_samples_clean.csv")

### Feature engineering

* Create `time_to_arrival` feature
* Add admission to ICU within 48H
* Add binary for major surgery within 24H
* Add ISS15, binary if iss is greater than or equal to 15
* Add composite outcome
* Collapse mechanism of injury
* Conditional for outcome

#### Time to arrival feature

Enter 0 if any is 0, nan if nan, else convert to datetime. Probably exists a more efficient way to do this.

In [17]:
time_of_injury = pd.to_datetime(data_excl.doi + " " + data_excl.toi)

In [18]:
time_of_arrival = pd.to_datetime(data_excl.doa + " " + data_excl.toa)

In [19]:
data_excl = data_excl.assign(delay = (time_of_arrival - time_of_injury).astype('timedelta64[m]'))
data_excl.delay = [0 if x < 0 else x for x in data_excl.delay]

#### Admission to ICU within 48h feature

In [20]:
ed_admit = pd.to_datetime(data_excl.doar + " " + data_excl.toar)
icu_admit = pd.to_datetime(data_excl.daicu + " " + data_excl.taicu)
time_to_icu = (icu_admit - ed_admit).astype('timedelta64[h]')

In [21]:
icu48h = [1 if x <= 48 else 0 for x in time_to_icu]
data_excl = data_excl.assign(icu48h = icu48h)

In [22]:
data_excl.drop(columns = [
    "daicu",
    "taicu"
], inplace = True)

#### Binary for major surgery within 24h

In [23]:
true_major_codes = "446816008.0|446683008.0|446115005.0|178674000.0|275093002.0|272300006.0|112777008.0|57470004.0|8476009.0|35340001.0|23036009.0|439756000.0|274457001.0|74011006.0|73231008.0|67319007.0|56413001.0|36777000.0|74770008.0"

In [24]:
true_major = data_excl.snomed.astype(str).str.contains(true_major_codes)

In [25]:
date_time_surgery = pd.to_datetime(data_excl.dos + " " + data_excl.tos)
date_time_ed = pd.to_datetime(data_excl.doar + " " + data_excl.toar)
time_to_surgery = (date_time_surgery - date_time_ed).astype('timedelta64[h]')

In [26]:
majors24h = ((true_major == True) & (time_to_surgery <= 24)).astype(int)

If `dos` or `tos` is 0, then s should be coded as 0 

In [27]:
cond = (data_excl.dos == '1970-01-01') | (data_excl.tos == '00:00:00')
data_excl.loc[:, "s"] = pd.Series(['0' if b == True else x for b, x in zip(cond, data_excl.s)])

If NA in `s`, then `majors24h` should also be NA

In [28]:
majors24h = pd.Series([x if pd.isnull(x) else m24h for x, m24h in zip(data_excl.s, majors24h)])

Drop the irrelevant columns

In [29]:
data_excl.drop(columns = [
    "s",
    "dos",
    "tos",
    "doar",
    "toar",
    "toa",
    "doa",
    "doi", 
    "toi",
    "snomed",
    "nomesco",
], inplace = True)

In [30]:
data_excl = data_excl.assign(majors24h = majors24h)

#### Add ISS15

In [31]:
iss15 = np.where(data_excl.iss >= 15, 1, 0)
iss15 = pd.Series([np.nan if pd.isnull(x) else y for x, y in zip(data_excl.iss, iss15)])

In [32]:
data_excl = data_excl.assign(iss15 = iss15).drop(columns = ["iss"])

#### Add composite outcome

In [33]:
data_excl = data_excl.assign(
    composite = ((icu48h == 1) | (data_excl.s24h == "Yes") | (majors24h == 1) | (iss15 == 1)).astype(int)
)

#### Collapse mechanism of injury to single variable

In [34]:
below_ten = ["0" + str(x) for x in range(10)]

In [35]:
l = [
    {k : "Transportation accident" for k in ["V" + str(x) for x in below_ten + [str(y) for y in range(10, 100)]]},  ## Transport accidents
    {k : "Falls" for k in ["W" + str(x) for x in below_ten + [str(y) for y in range(10, 20)]]},                     ## Falls
    {k : "Burns" for k in ["X" + str(x) for x in below_ten + [str(y) for y in range(10, 20)]]},                     ## Burns
    {k : "Other external causes of accidental injury" for k in ["W" + str(x) for x in range(20, 100)] + ["X" + str(x) for x in range(20, 60)]},  ## Other external causes of accidental injury
    {k : "Intentional self-harm" for k in ["X" + str(x) for x in range(60, 85)]},                                   ## Intential self-harm
    {k : "Assault" for k in ["X" + str(x) for x in list(range(85, 100))] + ["Y" + str(x) for x in below_ten]},      ## Assault
    {k : "Event of undetermined intent" for k in ["Y" + str(x) for x in range(10, 35)]},                            ## Events of undetermined intent
    {k : "Legal intervention" for k in ["Y" + str(x) for x in [35, 36]]},                                           ## Legal intervention
]
dd = {}
for d in l: dd.update(d)

In [36]:
moi_codes = data_excl.moi.value_counts().index.to_series()
moi_to_unlabelled = moi_codes[~moi_codes.isin(pd.Series(dd.keys()))]
dd = {**dd, **{k : "Unlabelled" for k in moi_to_unlabelled}}

In [37]:
data_excl.loc[:, "moi"] = data_excl.loc[:, "moi"].replace(dd)

Set values of 2 in s30d to 1 (I.e. if admitted to other hospital and alive, then set alive)

In [38]:
data_excl.loc[data_excl.s30d == 2, "s30d"] = 1

Outcome variable is set to dead if patients were dead on discharge after 30 days or if dead after 24 hours

In [39]:
data_excl.loc[((data_excl.hd == 1) & pd.notnull(data_excl.hd)) | ((data_excl.s24h == 1) & (pd.notnull(data_excl.s24h))), "s30d"] = 1

Remove the informed consent variable variable

In [40]:
data_excl.drop(columns = ["ic", "icu48h", "majors24h", "iss15", "hd"], inplace = True)

### Write to disk

Write to separate datasets to disk,
* For the `s30d` outcome, drop `s24h` and `composite`
* For the `composite` outcome, drop `s30d`, `s24h`

Do encoding of categorical variables before splitting 

In [41]:
cont_features = ["age", "hr", "sbp", "dbp", "spo2", "rr", "delay"]
cat_features = list(data_excl.loc[:, ~data_excl.columns.isin(cont_features + ["s24h", "composite", "s30d", "tc"])].columns)

Split datasets first

In [42]:
from sklearn.model_selection import train_test_split

For `s30d`

In [43]:
X_train_s30d, X_test_s30d, y_train_s30d, y_test_s30d, tc_train_s30d, tc_test_s30d = train_test_split(
    data_excl.drop(columns = ["s24h", "composite", "s30d", "tc"]),
    data_excl.s30d,
    data_excl.tc,
    random_state = 27,
    stratify = data_excl.s30d
)

Merge the dataset in order to generate summary table

In [44]:
df_s30d_train = X_train_s30d.assign(
    s30d=y_train_s30d, 
    tc=tc_train_s30d,
    partition=["Train"] * len(y_train_s30d)
)
df_s30d_test = X_test_s30d.assign(
    s30d=y_test_s30d, 
    tc=tc_test_s30d,
    partition=["Holdout"] * len(y_test_s30d)
)
df_s30d = pd.concat([df_s30d_train, df_s30d_test])

In [45]:
data_excl.to_csv(data_dir + "interim/merged_samples_clean.csv")

In [46]:
df_s30d.to_csv(data_dir + "interim/table_sample_s30d.csv")

Replace the values of the data with labels

In [47]:
from src.data.transform import parse_value_labels, label_categorical
from src.visualization.visualize import create_sample_characteristics_table

In [48]:
t1_s30d = create_sample_characteristics_table(
    df=df_s30d,
    data_dictionary=data_dictionary,
    categorical=cat_features + ["s30d", "tc"],
    nonnormal=cont_features, 
    groupby="partition"
)



Save features, outcome, and clinicians triage to separate files

In [49]:
X_train_s30d = label_categorical(X_train_s30d, cat_features)
X_test_s30d = label_categorical(X_test_s30d, cat_features)

In [50]:
X_train_s30d.to_csv(data_dir + "processed/s30d/X_train.csv")
X_test_s30d.to_csv(data_dir + "processed/s30d/X_test.csv")
y_train_s30d.to_csv(data_dir + "processed/s30d/y_train.csv")
y_test_s30d.to_csv(data_dir + "processed/s30d/y_test.csv")
tc_train_s30d.to_csv(data_dir + "processed/s30d/tc_train.csv")
tc_test_s30d.to_csv(data_dir + "processed/s30d/tc_test.csv")

For `composite`

In [57]:
cont_features

['age', 'hr', 'sbp', 'dbp', 'spo2', 'rr', 'delay']

In [51]:
X_train_composite, X_test_composite, y_train_composite, y_test_composite, tc_train_composite, tc_test_composite = train_test_split(
    data_excl.drop(columns = ["s30d", "s24h", "tc", "composite"]),
    data_excl.composite,
    data_excl.tc,
    random_state = 123,
    stratify = data_excl[["composite"]]
)

In [52]:
df_composite_train = X_train_composite.assign(
    composite=y_train_composite, 
    tc=tc_train_composite,
    partition=["Train"] * len(y_train_composite)
)
df_composite_test = X_test_composite.assign(
    composite=y_test_composite, 
    tc=tc_test_composite,
    partition=["Holdout"] * len(y_test_composite)
)
df_composite = pd.concat([df_composite_train, df_composite_test])

In [53]:
df_composite.to_csv(data_dir + "interim/table_sample_composite.csv")

In [54]:
t1_composite = create_sample_characteristics_table(
    df=df_composite,
    data_dictionary=data_dictionary,
    categorical=cat_features + ["composite", "tc"],
    nonnormal=cont_features, 
    groupby="partition"
)



In [55]:
t1_composite.to_markdown()

AttributeError: 'TableOne' object has no attribute 'to_markdown'

In [68]:
X_train_composite = label_categorical(X_train_composite, cat_features)
X_test_composite = label_categorical(X_test_composite, cat_features)

In [69]:
X_train_composite.to_csv(data_dir + "processed/composite/X_train.csv")
X_test_composite.to_csv(data_dir + "processed/composite/X_test.csv")
y_train_composite.to_csv(data_dir + "processed/composite/y_train.csv")
y_test_composite.to_csv(data_dir + "processed/composite/y_test.csv")
tc_train_composite.to_csv(data_dir + "processed/composite/tc_train.csv")
tc_test_composite.to_csv(data_dir + "processed/composite/tc_test.csv")