## Load data and dates

Make sure all records fall between the lookback date and the index date.

In [1]:
import pandas as pd
import numpy as np
import re

# Index / lookback dates; the first CSV column becomes the frame index
# (assumed to be the patient id ENCRYPTED_HESID, since the join below relies on it).
pos_index_lookback = pd.read_csv(
    "Mock_data/results/pos_index_lookback.csv",
    low_memory=False,
    index_col=0,
    parse_dates=["DATE_INDEX", "DATE_LOOKBACK"],
)
neg_index_lookback = pd.read_csv(
    "Mock_data/results/neg_index_lookback.csv",
    low_memory=False,
    index_col=0,
    parse_dates=["DATE_INDEX", "DATE_LOOKBACK"],
)

# Merged records of both cohorts, with DATE parsed as a timestamp.
pos_data = pd.read_csv("Mock_data/merged_pos.csv", low_memory=False, index_col=0, parse_dates=["DATE"])
neg_data = pd.read_csv("Mock_data/merged_neg.csv", low_memory=False, index_col=0, parse_dates=["DATE"])

# Attach the index and lookback dates to every record. The join is an inner one,
# so patients without index/lookback dates are dropped.
pos_data = pos_index_lookback.join(pos_data.set_index("ENCRYPTED_HESID"), how="inner")
neg_data = neg_index_lookback.join(neg_data.set_index("ENCRYPTED_HESID"), how="inner")

# Keep only the rows dated within [DATE_LOOKBACK, DATE_INDEX] (both ends inclusive),
# then move the patient id from the index back into a regular column.
pos_rows_to_keep = (pos_data.DATE <= pos_data.DATE_INDEX) & (pos_data.DATE_LOOKBACK <= pos_data.DATE)
pos_data = pos_data[pos_rows_to_keep].reset_index()
neg_rows_to_keep = (neg_data.DATE <= neg_data.DATE_INDEX) & (neg_data.DATE_LOOKBACK <= neg_data.DATE)
neg_data = neg_data[neg_rows_to_keep].reset_index()

## Define Charlson coding

This code is taken from [here](http://gforge.se/calc-comorbidity-measures/), which was found by Harvey. I adapted and simplified the code.

In [2]:
# define the Charlson_v2 dict -  based on Quan et al 2005
# Layout: category -> {'icd10': [regex, ...], 'icd9': [regex, ...]}.
# The expressions are written without the decimal point of the ICD code.
charlsons_v2 = {
    'MI': {
        'icd10': ['I2([12]|52)'],
        'icd9': ['41[02]'],
    },
    'CHF': {
        'icd10': ['I099', 'I1(10|3[02])', 'I255', 'I4(2[056789]|3)', 'I50', 'P290'],
        'icd9': ['39891', '402(01|11|91)', '404(01|03|[19][13])', '42(5[456789]|8)'],
    },
    'PVD': {
        'icd10': ['I7([01]|3[189]|71|9[02])', 'K55[189]', 'Z95[89]'],
        'icd9': ['0930', '4373', '44([01]|3[123456789]|71)', '557[19]', 'V434'],
    },
    'CEVD': {
        'icd10': ['G4[56]', 'H340', 'I6'],
        'icd9': ['36234', '43[012345678]'],
    },
    'DEM': {
        'icd10': ['F0([0123]|51)', 'G3(0|11)'],
        'icd9': ['29(0|41)', '3312'],
    },
    'COPD': {
        'icd10': ['I27[89]', 'J4[01234567]', 'J6([01234567]|84)', 'J70[13]'],
        'icd9': ['416[89]', '49', '50([012345]|64|8[18])'],
    },
    'Rheum': {
        'icd10': ['M0[56]', 'M3(15|[234]|5[13]|60)'],
        'icd9': ['4465', '71(0[01234]|4[0128])', '725'],
    },
    'PUD': {
        'icd10': ['K2[5678]'],
        'icd9': ['53[1234]'],
    },
    'MILDLD': {
        'icd10': ['B18', 'K7(0[01239]|1[3457]|[34]|6[023489])', 'Z944'],
        'icd9': ['070([23][23]|[45]4|[69])', '57([01]|3[3489])', 'V427'],
    },
    'DIAB_UC': {
        'icd10': ['E1[01234][01689]'],
        'icd9': ['250[012389]'],
    },
    'DIAB_C': {
        'icd10': ['E1[01234][23457]'],
        'icd9': ['250[4567]'],
    },
    'PARA': {
        'icd10': ['G041', 'G114', 'G8(0[12]|[12]|3[012349])'],
        'icd9': ['3341', '34([23]|4[01234569])'],
    },
    'RD': {
        'icd10': ['I120', 'I131', 'N0(3[234567]|5[234567])', 'N1[89]', 'N250', 'Z49[012]', 'Z940', 'Z992'],
        'icd9': ['403[019]1', '404[019][23]', '58(2|3[01234567]|[56]|80)', 'V4(20|51)', 'V56'],
    },
    'CANCER': {
        'icd10': ['C[01]', 'C2[0123456]', 'C3[01234789]', 'C4[01356789]', 'C5[012345678]', 'C6', 'C7[0123456]', 'C8[123458]', 'C9[01234567]'],
        'icd9': ['1[456]', '17[012456789]', '18', '19[012345]', '20[012345678]', '2386'],
    },
    'MSLD': {
        'icd10': ['I8(5[09]|64)', 'I982', 'K7(04|[12]1|29|6[567])'],
        'icd9': ['456[012]', '572[2345678]'],
    },
    'METS': {
        'icd10': ['C7[789]', 'C80'],
        'icd9': ['19[6789]'],
    },
    'HIV': {
        'icd10': ['B2[0124]'],
        'icd9': ['04[234]'],
    },
}

# define charlson weights
# One integer weight per category key used in charlsons_v2.
charlsons_weight = {
    'MI': 1,
    'CHF': 1,
    'PVD': 1,
    'CEVD': 1,
    'DEM': 1,
    'COPD': 1,
    'Rheum': 1,
    'PUD': 1,
    'MILDLD': 1,
    'DIAB_UC': 1,
    'DIAB_C': 2,
    'PARA': 2,
    'RD': 2,
    'CANCER': 2,
    'METS': 6,
    'MSLD': 3,
    'HIV': 6,
}

def find_charlson_match(icd_codes, charlson_dict, charlsons_weight, icd_version='icd10'):
    """Map ICD codes to the Charlson categories they fall into.

    Every code is tested with ``re.match`` (anchored at the start of the code,
    case-insensitive) against the regular expressions of each category. A
    category is recorded once, as soon as any code matches any of its expressions.

    Parameters
    ----------
    icd_codes : str or iterable of str
        The codes to classify. A bare string is treated as ONE code rather than
        being iterated character by character (which could never match).
    charlson_dict : dict
        Maps category name -> dict holding a list of regular expressions under
        the key given by ``icd_version``.
    charlsons_weight : dict
        Maps category name -> weight.
    icd_version : str, default 'icd10'
        Which list of expressions of ``charlson_dict`` to use.

    Returns
    -------
    dict
        ``{category: weight}`` for every matched category, in the order in which
        the categories were first matched. Empty if nothing matched.
    """
    # A single code may arrive as a plain string, e.g. when a one-row selection
    # of a DataFrame collapses to a scalar.
    if isinstance(icd_codes, str):
        icd_codes = [icd_codes]

    matches = {}
    for icd_code in icd_codes:
        for key_disease, reg_expr in charlson_dict.items():
            # each category is counted at most once
            if key_disease in matches:
                continue
            if any(re.match(expr, icd_code, re.IGNORECASE) for expr in reg_expr[icd_version]):
                matches[key_disease] = charlsons_weight[key_disease]
    return matches

## Define helper functions

These functions do the following:
- `get_all_icd_for_patients`: extracts all ICD codes for each patient together with the number of times each code occurred (if `unique=True`, every count is simply set to 1)
- `get_charlson_codes`: given a dataframe returned by `get_all_icd_for_patients`, this will return a dictionary where the patient id is the key and the values are dictionaries where each key is a Charlson code and its value is the corresponding Charlson weight

In [3]:
# function to clean and trim ICD codes to fixed length
def get_first_n_chars(df, cols, n):
    """Return a copy of ``df`` whose ``cols`` hold cleaned, truncated codes.

    Every "." and "-" is removed from the string values in ``cols`` and the
    result is cut down to its first ``n`` characters. The input frame itself is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the code columns.
    cols : list
        Labels of the columns to clean.
    n : int
        Number of leading characters to keep.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``cols`` replaced by their cleaned version.
    """
    df = df.copy()

    def converter(col):
        # A column without any value has nothing to clean. It may also carry a
        # numeric dtype, on which the .str accessor raises.
        if col.isna().all():
            return col
        # regex=False removes the characters literally. Relying on the default
        # is not portable: since pandas 2.0 the default is regex=False, under
        # which the former pattern "\." no longer matches a plain dot.
        return (col
                .str.replace(".", "", regex=False)
                .str.replace("-", "", regex=False)
                .str[:n])

    df[cols] = df[cols].apply(converter)
    return df

# this function extracts all ICD codes for a given patient and counts them
def get_all_icd_for_patients(data, diag_cols, id_col, unique=True, trim_level=4):
    """List the distinct diagnosis codes of every patient in long format.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with one patient-id column and several diagnosis columns.
        It is not modified.
    diag_cols : list
        Labels of the diagnosis columns.
    id_col : str
        Label of the patient-id column.
    unique : bool, default True
        If True every count is set to 1; otherwise the count is the number of
        times the (trimmed) code occurred for the patient.
    trim_level : int, default 4
        Number of leading characters kept of each cleaned code.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``id_col``, with one row per (patient, code) pair and the
        columns "DIAG" (the code) and "COUNT".
    """
    df = data.set_index(id_col)
    # trim ICD codes
    df = df.pipe(get_first_n_chars, diag_cols, trim_level)
    # Stack the diagnosis columns into one long Series and discard the inner
    # index level that stacking introduces (the original column label).
    codes = (df[diag_cols]
             .stack()
             .reset_index(level=1, drop=True)
             .rename("DIAG"))
    # Count for each patient how often a given code occurred. Grouping uses
    # id_col, so the function also works for id columns other than
    # "ENCRYPTED_HESID".
    counts = (codes.reset_index()
              .groupby([id_col, "DIAG"])
              .size()
              .reset_index(name="COUNT")
              .set_index(id_col))
    # if unique=True, set all counts to simply 1
    if unique:
        counts["COUNT"] = 1
    return counts

def get_charlson_codes(patient_icd_df):
    """Determine the Charlson categories of every patient.

    Parameters
    ----------
    patient_icd_df : pandas.DataFrame
        Frame indexed by patient id with a "DIAG" column holding ICD codes;
        a patient may occupy several rows.

    Returns
    -------
    dict
        Maps patient id -> result of ``find_charlson_match`` for all codes of
        that patient (evaluated against ``charlsons_v2`` and ``charlsons_weight``).
    """
    patients_icds = {}
    # Group on the index so that each patient is handled exactly once and always
    # contributes a list of codes, even when there is just a single row.
    # sort=False keeps the patients in order of first appearance.
    for patient, codes in patient_icd_df.groupby(level=0, sort=False)["DIAG"]:
        # get charlson codes
        patients_icds[patient] = find_charlson_match(codes.tolist(), charlsons_v2, charlsons_weight)
    return patients_icds

## Compute comorbidity index for patients

For each patient with any ICD10 codes that belong to any of the Charlson categories, we sum up the weights of these categories.

In [4]:
# names of the diagnosis columns: DIAG_01 ... DIAG_20
diag_cols = ["DIAG_{:02d}".format(position) for position in range(1, 21)]
# ICD codes of every patient, along with how often each one occurred
patient_icd_df = get_all_icd_for_patients(
    pos_data,
    diag_cols,
    "ENCRYPTED_HESID",
    unique=False,
)
# Charlson categories (and their weights) per patient
patient_charlson_dict = get_charlson_codes(patient_icd_df)
# one row per patient, one column per Charlson category
patient_charlson_df = pd.DataFrame.from_dict(patient_charlson_dict, orient="index")
# the comorbidity score is the row-wise sum of the weights; place it first
patient_charlson_df.insert(
    loc=0,
    column="COMORB_SCORE",
    value=patient_charlson_df.sum(axis=1),
)
patient_charlson_df

Unnamed: 0,COMORB_SCORE,DIAB_UC,MI,COPD,CANCER,MSLD,MILDLD,RD,CHF,METS,Rheum,PUD,CEVD,PVD,DIAB_C,PARA,DEM
0017E9B890C74E54AF5B93AB663D2F65,3.0,1.0,1.0,1.0,,,,,,,,,,,,,
024D2F127EB94E81A6A3A79D6BD6D89C,5.0,,,,2.0,3.0,,,,,,,,,,,
044E3A98BC6D4E63B74B0FD25F751629,1.0,1.0,,,,,,,,,,,,,,,
046B06A93C5C459597D9BF07AA033599,1.0,,,1.0,,,,,,,,,,,,,
04BAD143C2F14556A72DF01D5486E807,2.0,,,,2.0,,,,,,,,,,,,
0564E88FD5D84287B4B4CC4011A8E18D,2.0,,,,2.0,,,,,,,,,,,,
0686F3EF4D5642298B71B8297B59CFFE,3.0,,,,2.0,,1.0,,,,,,,,,,
098C62946E8541FC87396327B876050E,1.0,,,1.0,,,,,,,,,,,,,
0A24A301A01442848EE59FA886AD531E,1.0,,,1.0,,,,,,,,,,,,,
0C8739BBCC104C5F99A47C82F2F461C6,1.0,,,1.0,,,,,,,,,,,,,
