### Load data and filter it down to cols to keep

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("HES_Sample_APC.txt", sep="\t", low_memory=False)
cols_to_keep = pd.read_csv("cols_to_keep.csv", header=None).values.ravel()

In [3]:
data = data.loc[:,cols_to_keep]

### Save data into and SQLite DB

In [4]:
import sqlalchemy
engine = sqlalchemy.create_engine('sqlite:///my_db.sqlite')
data.to_sql('Resolved', engine, if_exists='append')

### Aggregate primary and secondary diagnoses on patient level

- First and foremost all ICD codes in the `DIAG_` columns have to be cleaned to only first four characters.
- Then all `DIAG_` columns need to be trunkated to either 1, 2, 3, or 4 characters.
- These are categorical variables so they need to be one hot encoded first. 
- ICD cdes of `DIAG_01` should become `DIAG_1_ICD`
- ICD codes from either of `DIAG_02`-`DIAG_20` columns have to be merged into `DIAG_2_ICD`. 

Following the generation of dummy variables for the primary and secondary ICD codes we need t aggregate them
- Each patient has several episodes (marked by `EPIKEY` columns). 
- Within each episode there could be several spells. 
- We want to aggregate ICD diagnoses codes on the patient level from all episodes but without counting them multiple times on the spell level. 
- So we will first aggregate on the episode level, if any ICD code has a count > 1 we set it to 1, then aggregate on patient level.

In [50]:
def create_aggregated_diagnoses_table(data, ICD_trim=4):
    # create list of ICD diagnoses columns
    diag_cols = ["DIAG_0" + str(i+1) for i in range(9)] + ["DIAG_" + str(i+10) for i in range(11)]

    # create function to clean and trim ICD codes to fixed length
    def get_first_n_chars(df, cols, n):
        df = df.copy()
        def converter(col):
            ICD_code = (col.astype(str)
                        .str.replace("\.", "")
                        .str.replace("-", "")
                        .str[:n])
            return ICD_code
        df[cols] = df[cols].apply(converter)
        return df

    # make trimmed ICD code data 
    data_ICD4 = data.pipe(get_first_n_chars, diag_cols, ICD_trim)

    # generate dummy vars from primary diagnosis codes
    diag_1_df = pd.get_dummies(data_ICD4.DIAG_01, prefix="DIAG_1")

    # generate dummy vars from primary diagnosis codes: DIAG_02
    diag_2_df = pd.get_dummies(data_ICD4[diag_cols[1]], prefix="DIAG_2")

    # generate dummies from and add all other secondary diagnoses codes DIAG_03 - DIAG_20 to DIAG_02
    diag_2_df = diag_2_df.astype(dtype=np.float64)
    for col in diag_cols[2:]:
        print(col)
        dummy_tmp = pd.get_dummies(data_ICD4[col], prefix="DIAG_2")
        dummy_tmp = dummy_tmp.astype(dtype=np.float64)
        diag_2_df = diag_2_df.add(dummy_tmp, fill_value=0)

    # create table of patient ids with episode ids as index
    ids = data[["ENCRYPTED_HESID", "EPIKEY"]]
    ids = ids.set_index("EPIKEY")

    # concatenate episode keys with the icd diagnoses tables
    diag_df = pd.concat([data["EPIKEY"], diag_1_df, diag_2_df], axis=1)

    # aggregate on episode level, then correct if there were multiple spells
    epi_level_agg = diag_df.groupby("EPIKEY").sum()
    #ela_larger_than_one_mask = (epi_level_agg > 1).values
    epi_level_agg[epi_level_agg > 1] = 1

    # get patient ids for each of the aggregated episode keys
    ids = ids.loc[epi_level_agg.index]

    # join ICD tables with patient ids series, on episode ids 
    epi_level_agg = ids.join(epi_level_agg)

    # unstack episode ids as index in the aggregated ICD table
    epi_level_agg.reset_index(level=0, inplace=True)

    # group on patient level, we need to sum diag vars, but count epikeys
    agg_funcs = {c:"sum" if c!="EPIKEY" else "count" for c in epi_level_agg.columns}
    
    # delete the patient ids from the dict as we'll group by that variable
    agg_funcs.pop('ENCRYPTED_HESID', None)
    
    # get aggregates on patient level
    patient_level_agg = epi_level_agg.groupby('ENCRYPTED_HESID').agg(agg_funcs)

    # unstack patient ids as index 
    patient_level_agg.reset_index(level=0, inplace=True)

    return patient_level_agg

In [51]:
icd_4 = create_aggregated_diagnoses_table(data, 4)
icd_4.to_csv("apc_icd_4.csv")
icd_3 = create_aggregated_diagnoses_table(data, 3)
icd_3.to_csv("apc_icd_3.csv")
icd_2 = create_aggregated_diagnoses_table(data, 2)
icd_2.to_csv("apc_icd_2.csv")
icd_1 = create_aggregated_diagnoses_table(data, 1)
icd_1.to_csv("apc_icd_1.csv")

DIAG_03
DIAG_04
DIAG_05
DIAG_06
DIAG_07
DIAG_08
DIAG_09
DIAG_10
DIAG_11
DIAG_12
DIAG_13
DIAG_14
DIAG_15
DIAG_16
DIAG_17
DIAG_18
DIAG_19
DIAG_20
DIAG_03
DIAG_04
DIAG_05
DIAG_06
DIAG_07
DIAG_08
DIAG_09
DIAG_10
DIAG_11
DIAG_12
DIAG_13
DIAG_14
DIAG_15
DIAG_16
DIAG_17
DIAG_18
DIAG_19
DIAG_20
DIAG_03
DIAG_04
DIAG_05
DIAG_06
DIAG_07
DIAG_08
DIAG_09
DIAG_10
DIAG_11
DIAG_12
DIAG_13
DIAG_14
DIAG_15
DIAG_16
DIAG_17
DIAG_18
DIAG_19
DIAG_20
DIAG_03
DIAG_04
DIAG_05
DIAG_06
DIAG_07
DIAG_08
DIAG_09
DIAG_10
DIAG_11
DIAG_12
DIAG_13
DIAG_14
DIAG_15
DIAG_16
DIAG_17
DIAG_18
DIAG_19
DIAG_20
