In [None]:
# Load IPython's autoreload extension and enable mode 2, so edited modules are
# re-imported automatically instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp old

# old functions





In [None]:
def persons_with(df,
                 codes,
                 cols,
                 pid='pid',
                 sep=None,
                 merge=True,
                 first_date=None,
                 last_date=None,
                 group=False,
                 _fix=True):
    """
    Determine, for every person, whether they have received any of the codes.

    Args:
        df (DataFrame): event level data that contains the ``pid`` column
            and the code column(s)
        codes (str, list or dict): codes to search for. When ``_fix`` is True
            the argument is normalised by ``_fix_args``. When ``_fix`` is
            False it must already be a dict that maps the name of each
            indicator to its code (str) or codes (list of str).
        cols (str or list of str): column(s) with the codes
        pid (str): column with the person identifier
        sep (str): passed on to ``_fix_args`` and ``get_rows``
        merge (bool): passed on to ``_fix_args``
        first_date (str): accepted but not used by this implementation
        last_date (str): accepted but not used by this implementation
        group (bool): passed on to ``_fix_args``
        _fix (bool): if True, normalise the arguments and restrict the search
            to rows selected by ``get_rows``; if False, use the arguments
            as given and search all rows

    Returns:
        DataFrame: indexed by the unique values of ``df[pid]``, with one
        boolean column per key in ``codes``. Persons without any matching
        code are marked False.

    Examples:
        fracture = persons_with(df=df, codes='S72*', cols='icdmain')
        fracture = persons_with(df=df, codes={'frac':'S72*'}, cols='icdmain')

    Note:
        - The codes of each indicator are joined with '|' and used as a
          regular expression on the string form of each person's list of
          unique codes, so matching is pattern (substring) based.
        - persons_with is defined twice in this notebook; the later
          definition is the one in effect at runtime.

    Todo:
        - function may check if pid_index is unique, in which it does not have to aggregate
        - this may apply in general? functions that work on event data may then also work on person level data
        - allow user to input person level dataframe source?
    """
    sub = df

    if _fix:
        df, cols = _to_df(df=df, cols=cols)
        codes, cols, allcodes, sep = _fix_args(df=df, codes=codes, cols=cols, sep=sep, merge=merge, group=group)
        rows = get_rows(df=df, codes=allcodes, cols=cols, sep=sep, _fix=False)
        sub = df[rows]

    # One row per person in the result, including persons without relevant rows
    persondf = pd.DataFrame(index=df[pid].unique().tolist())

    # Nothing to search: nobody has any of the codes
    if len(sub) == 0:
        for name in codes:
            persondf[name] = False
        return persondf

    # One string per person: the string form of the list of that person's unique codes
    df_persons = sub.groupby(pid)[cols].apply(lambda s: pd.unique(s.values.ravel()).tolist()).astype(str)

    # Alternative approach that avoids creating a person level object:
    #    c=df.icdbi.str.split(', ', expand=True).to_sparse()
    #    c.isin(['S720', 'I10']).any(axis=1).any(level=0)
    # The regex approach is kept here; the original author noted that it is fast.

    for name, codelist in codes.items():
        # a bare string would otherwise be joined character by character
        if isinstance(codelist, str):
            codelist = [codelist]
        codes_regex = '|'.join(codelist)
        has_code = df_persons.str.contains(codes_regex, na=False)
        # persons that were filtered away have no entry in has_code; mark
        # them False explicitly instead of letting index alignment insert NaN
        persondf[name] = has_code.reindex(persondf.index, fill_value=False)

    return persondf

In [None]:
#export
def _format_codes(codes, merge=True):
    """
    Standardise the codes argument to a dict of {name (str): list of codes}.

    Background: Several functions let the user pass a single string when
    there is only one code, a list when no renaming or aggregation is
    needed, or a dict. To keep the rest of the code simple the input is
    converted to one common format as early as possible.

    Args:
        codes (str, list or dict): the codes as supplied by the user
        merge (bool): only relevant when codes is a list (after _listify).
            True: all codes form one group, named by joining the codes with '_'.
            False: every code becomes its own group, named after itself.
            Example: 'Z51*' may mean "count all Z51 codes together" or
            "count every code starting with Z51 separately"; merge lets the
            user choose between these two interpretations.

    Returns:
        dict for list and dict input; any other type is returned as
        produced by _listify.

    Examples:
            codes = '4AB02'
            codes='4AB*'
            codes = ['4AB02', '4AB04', '4AC*']
            codes = ['4AB02', '4AB04']
            codes = {'tumor' : 'a4*', 'diabetes': ['d3*', 'd5-d9']}
            codes = 'S72*'
            codes = ['K50*', 'K51*']

            _format_codes(codes, merge=False)

    TODO: test for correctness of input, not just reformat (is the key a str?)
    """
    codes = _listify(codes)

    if isinstance(codes, dict):
        # make sure every value is a list, also when a single code was given as a string
        return {name: ([codelist] if isinstance(codelist, str) else codelist)
                for name, codelist in codes.items()}

    if isinstance(codes, list):
        if not merge:
            return {code: [code] for code in codes}
        return {'_'.join(codes): codes}

    return codes

In [None]:
#export
def _expand_regex(expr, full_list):
    """
    Expand regular expression(s) to the matching codes in a collection of codes.

    Args:
        expr (str or list of str): regular expression(s); normalised with _listify
        full_list (Series, list, set or other iterable of str): the codes to
            search in

    Returns:
        list: for each expression in turn, the codes in full_list that
        contain a match. A code matched by several expressions is included
        once per expression.
    """
    exprs = _listify(expr)

    # A Series is used as it is; every other iterable is converted. Previously
    # a Series (or e.g. a tuple) left the variable unassigned and the
    # function failed with an UnboundLocalError.
    if isinstance(full_list, pd.Series):
        unique_series = full_list
    else:
        unique_series = pd.Series(list(full_list), dtype=object)

    expanded = []

    # an empty collection cannot match anything
    if unique_series.empty:
        return expanded

    for pattern in exprs:
        # na=False: missing values count as "no match" so the result is a valid boolean mask
        match = unique_series.str.contains(pattern, na=False)
        expanded.extend(unique_series[match])
    return expanded

In [None]:
#export
def _reverse_dict(dikt):
    """
    Invert a {name: code(s)} dict to a {code: name} dict.

    Every value is passed through _listify before its elements are used as
    keys. If the same code occurs under several names, the name that comes
    last in dikt is the one that is kept.
    """
    return {code: name
            for name, codelist in dikt.items()
            for code in _listify(codelist)}

In [None]:
def persons_with(df,
                 codes,
                 cols,
                 pid='pid',
                 sep=None,
                 merge=True,
                 first_date=None,
                 last_date=None,
                 group=False,
                 _fix=True):
    """
    Determine, for every person, whether they have received any of the codes.

    Args:
        df (DataFrame): event level data that contains the ``pid`` column
            and the code column(s)
        codes (str, list or dict): codes to search for. When ``_fix`` is True
            the argument is normalised by ``_fix_args``. When ``_fix`` is
            False it must already be a dict that maps the name of each
            indicator to its code (str) or codes (list of str).
        cols (str or list of str): column(s) with the codes
        pid (str): column with the person identifier
        sep (str): passed on to ``_fix_args`` and ``get_rows``
        merge (bool): passed on to ``_fix_args``
        first_date (str): accepted but not used by this implementation
        last_date (str): accepted but not used by this implementation
        group (bool): passed on to ``_fix_args``
        _fix (bool): if True, normalise the arguments and restrict the search
            to rows selected by ``get_rows``; if False, use the arguments
            as given and search all rows

    Returns:
        DataFrame: indexed by the unique values of ``df[pid]``, with one
        boolean column per key in ``codes``. Persons without any matching
        code are marked False.

    Examples:
        fracture = persons_with(df=df, codes='S72*', cols='icdmain')
        fracture = persons_with(df=df, codes={'frac':'S72*'}, cols='icdmain')

    Note:
        - The codes of each indicator are joined with '|' and used as a
          regular expression on the string form of each person's list of
          unique codes, so matching is pattern (substring) based.
        - persons_with is defined twice in this notebook; the later
          definition is the one in effect at runtime.

    Todo:
        - function may check if pid_index is unique, in which it does not have to aggregate
        - this may apply in general? functions that work on event data may then also work on person level data
        - allow user to input person level dataframe source?
    """
    sub = df

    if _fix:
        df, cols = _to_df(df=df, cols=cols)
        codes, cols, allcodes, sep = _fix_args(df=df, codes=codes, cols=cols, sep=sep, merge=merge, group=group)
        rows = get_rows(df=df, codes=allcodes, cols=cols, sep=sep, _fix=False)
        sub = df[rows]

    # One row per person in the result, including persons without relevant rows
    persondf = pd.DataFrame(index=df[pid].unique().tolist())

    # Nothing to search: nobody has any of the codes
    if len(sub) == 0:
        for name in codes:
            persondf[name] = False
        return persondf

    # One string per person: the string form of the list of that person's unique codes
    df_persons = sub.groupby(pid)[cols].apply(lambda s: pd.unique(s.values.ravel()).tolist()).astype(str)

    # Alternative approach that avoids creating a person level object:
    #    c=df.icdbi.str.split(', ', expand=True).to_sparse()
    #    c.isin(['S720', 'I10']).any(axis=1).any(level=0)
    # The regex approach is kept here; the original author noted that it is fast.

    for name, codelist in codes.items():
        # a bare string would otherwise be joined character by character
        if isinstance(codelist, str):
            codelist = [codelist]
        codes_regex = '|'.join(codelist)
        has_code = df_persons.str.contains(codes_regex, na=False)
        # persons that were filtered away have no entry in has_code; mark
        # them False explicitly instead of letting index alignment insert NaN
        persondf[name] = has_code.reindex(persondf.index, fill_value=False)

    return persondf

# get inpatient data

To test the functions and to calculate the Charlson index we need some data. Here we will use data on hospital visits from Medicare:


In [None]:
# Use pandas
import pandas as pd

In [None]:
# Location of the synthetic Medicare sample data on inpatient hospital stays
path = 'https://www.cms.gov/Research-Statistics-Data-and-Systems/Downloadable-Public-Use-Files/SynPUFs/Downloads/'
inpatient_file = 'DE1_0_2008_to_2010_Inpatient_Claims_Sample_1.zip'

inpatient = (
    pd.read_csv(path + inpatient_file)
    # lower-case column names are easier to type
    .rename(columns=str.lower)
    # a column called 'pid' is easier to use than 'desynpuf_id'
    .assign(pid=lambda frame: frame['desynpuf_id'])
    # use the personal id as index, but also keep it as a regular column
    .set_index('pid', drop=False)
    # give the index its own name so it does not clash with the 'pid' column
    .rename_axis('pid_index')
)



In [None]:
# Have a look at the first rows of the loaded inpatient data
inpatient.head()

In [None]:
# Names of all columns that hold diagnostic codes (they share the prefix 'icd9_dgns_cd')
icd_cols = [column for column in inpatient.columns if column.startswith('icd9_dgns_cd')]
icd_cols

Make a list of all unique ICD-9 codes that exist in the data (used as `all_codes` below):

In [None]:
# Codes to calculate CCI (the Charlson comorbidity index) using ICD-9 (CM, US, Enhanced)
# Source: http://mchp-appserv.cpe.umanitoba.ca/concept/Charlson%20Comorbidities%20-%20Coding%20Algorithms%20for%20ICD-9-CM%20and%20ICD-10.pdf
#
# Each variable in this cell holds a whitespace separated list of code
# expressions. A later cell splits every string with str.split() and passes
# the tokens to expand_code, so entries must be separated by whitespace only.
# NOTE(review): '*' looks like a wildcard and 'a-b' like a range; how they are
# interpreted is decided by expand_code -- confirm there.

infarction = '''
      410* 
      412*
      '''

# Congestive heart failure.
# The first two rows follow the enhanced ICD-9-CM coding algorithm:
# 398.91 (not 390.91) and 402.01 (not 402.21); the codes 390.91 and 402.21
# do not exist in ICD-9-CM, so they could never match any record.
heart_failure = '''
        398.91 
        402.01 402.11 402.91 
        404.01 404.03 404.11 404.13 404.91 404.93 
        425.4-425.9 
        428*
        '''

# Peripheral vascular disease
peripheral_vascular = '''
        093.0
        437.3
        440*
        441*
        443.1-443.9
        447.1
        557.1 557.9
        V43.4
        '''

# Cerebrovascular disease
cerebrovascular = '''
        362.34
        430*-438*
        '''
# Dementia
dementia = '''
        290*
        294.1
        331.2
        '''

# Chronic pulmonary disease
pulmonary ='''
      416.8 416.9
      490*-505* 
      506.4
      508.1 508.8
      '''
# Rheumatic disease
rheumatic = '''
      446.5
      710.0-710.4
      714.0-714.2 714.8
      725*
      '''

# Peptic ulcer disease
peptic_ulcer = '531*-534*'

# Mild liver disease
# NOTE(review): '570.*' and '571.*' use '.*' where most other entries use a
# bare '*'; confirm that expand_code (called with drop_dot=True in a later
# cell) treats both notations the same way.
liver_mild ='''
      070.22
      070.23
      070.32
      070.33
      070.44
      070.54
      070.6
      070.9
      570.*
      571.*
      573.3 573.4 573.8 573.9
      V42.7
      '''
# Interesting: the diabetes codes in the data seem to be 5 digits long, while
# the codes in the specification are shorter; hence the trailing '*' below.
diabetes_without_complication = '250.0*-250.3* 250.8* 250.9*'

diabetes_with_complication = '250.4*-250.7*'

# Hemiplegia or paraplegia
# NOTE(review): '342.*' and '343.*' also use the '.*' notation -- see liver_mild.
plegia = '''
    334.1
    342.*
    343.*
    344.0-344.6
    344.9
    '''

# Renal disease.
# Entries must be separated by whitespace only: the string is later split
# with str.split(), so a comma between two codes (as in '403.11,403.91')
# would fuse them into one token that matches nothing.
renal = '''
    403.01 403.11 403.91 
    404.02 404.03 404.12 404.13 404.92 404.93
    582.*  
    583.0-583.7
    585*
    586*
    588.0
    V42.0
    V45.1
    V56*
    '''

# Any malignancy (the metastatic range 196-199 is listed separately as 'tumor')
malignancy = '''
    140*-172*
    174.0-195.8
    200*-208*
    238.6
    '''

# Moderate or severe liver disease
liver_not_mild = '''
    456.0-456.2
    572.2-572.8
    '''

# Metastatic solid tumor
tumor = '196*-199*'

# HIV/AIDS
# NOTE(review): these codes start with a zero, and expand_code is called with
# drop_leading_zero=True in a later cell -- confirm that the expanded codes
# match the format used in the data.
hiv = '042*-044*'

Put all the strings that describe the codes for the comorbidities in a single data structure:

In [None]:
# Collect the codes that occur in the diagnosis columns of the data. The result
# is passed as `all_codes` when the comorbidity definitions are expanded below.
# NOTE(review): `unique` is defined outside this section; presumably it returns
# the distinct values, converted to strings because of all_str=True -- confirm.
icd9 = unique(df=inpatient, cols = icd_cols, all_str=True)

In [None]:
# Map the name of each comorbidity to the string that lists its medical codes
code_string = dict(
    infarction=infarction,
    heart_failure=heart_failure,
    peripheral_vascular=peripheral_vascular,
    cerebrovascular=cerebrovascular,
    dementia=dementia,
    pulmonary=pulmonary,
    rheumatic=rheumatic,
    peptic_ulcer=peptic_ulcer,
    liver_mild=liver_mild,
    diabetes_without_complication=diabetes_without_complication,
    diabetes_with_complication=diabetes_with_complication,
    plegia=plegia,
    renal=renal,
    malignancy=malignancy,
    liver_not_mild=liver_not_mild,
    tumor=tumor,
    hiv=hiv,
)

Having created the list of all codes (`icd9`, passed as `all_codes`), we can use the functions we have created to expand the descriptions of the different comorbidities to include all the specific codes:

In [None]:
# Expand the code description of every comorbidity to the specific codes:
# the description is split on whitespace and each token is expanded against
# the codes that exist in the data.
codes = {
    disease: expand_code(
        description.split(),
        all_codes=icd9,
        drop_dot=True,
        drop_leading_zero=True,
    )
    for disease, description in code_string.items()
}

And we can check if it really expanded the codes, for instance by examining the codes for mild liver disease:

In [None]:
# Show the expanded codes for mild liver disease
codes['liver_mild']

In order to do the calculations, we need the weights associated with each comorbidity. These weights are related to the predictive power of the comorbidity for the probability of dying in a given time period. There are a few different standards, but with relatively minor variations. Here we use the following:


In [None]:
charlson_points = { 'infarction': 1, 
                   'heart_failure': 1, 
                   'peripheral_vascular': 1, 
                   'cerebrovascular': 1, 
                   'dementia': 1, 
                   'pulmonary': 1, 
                   'rheumatic': 1, 
                   'peptic_ulcer': 1, 
                   'liver_mild': 1, 
                   'diabetes_without_complication': 1, 
                   'diabetes_with_complication': 2, 
                   'plegia': 2, 
                   'renal': 2, 
                   'malignancy': 2, 
                   'liver_not_mild': 3, 
                   'tumor': 6, 
                   'hiv': 6}


We also need the function that takes a set of codes and identifies the rows and persons who have the codes (a function we developed in a previous notebook):

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
import nbdev

In [None]:
from nbdev.sync import script2notebook 

In [None]:
# Run nbdev's notebook-to-script export; the recorded output below lists the
# notebooks that were converted.
from nbdev.export import *
notebook2script()

Converted 00_core.ipynb.
Converted index.ipynb.
Converted old_functions.ipynb.
Converted pattern_finder.ipynb.
Converted pattern_finder_only.ipynb.
Converted utilities.ipynb.
