# Preliminaries and Dataframe Construction

In [None]:
# Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

#Import Encounters from Database Query
#df = pd.read_pickle("encounters.pkl")

#Import Excluded Encounters from Pickle
#excluded_df = pd.read_pickle("excluded_processed.pkl")

### Discharge Dictionaries and Race Roll-Up

In [1]:
### OMB RACE DICTIONARY ###
#Hispanic-inclusive race reporting by OMB standard (use method here: https://stackoverflow.com/questions/26886653/pandas-create-new-column-based-on-values-from-other-columns-apply-a-function-o)
#See also https://www.cdc.gov/mmwr/volumes/70/wr/mm7032a2.htm

from pandas.api.types import CategoricalDtype

def dict_race_ethnicity(row):
    """Return the Hispanic-inclusive race label for a single record.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series passed by ``DataFrame.apply(axis=1)``)
        Must provide ``'Ethnicity'`` and ``'Race'`` entries.

    Returns
    -------
    ``'Hispanic'`` when ``row['Ethnicity']`` equals the string ``'Hispanic'``;
    otherwise ``row['Race']`` unchanged.

    Only the string label is matched: a numerically coded ethnicity (e.g. 1)
    is NOT treated as Hispanic, so decode ethnicity before calling this.
    """
    is_hispanic = row['Ethnicity'] == 'Hispanic'
    return 'Hispanic' if is_hispanic else row['Race']

# Roll-up of race values into seven reporting labels:
# 'Unknown', 'AIAN', 'AAPI', 'Black', 'White', '>1 Race', 'Hispanic'.
# Accepts integer codes, the short labels themselves (identity mapping, so the
# roll-up can be re-applied safely), and long-form descriptions.
# Each key appears exactly once; the original literal repeated 'Unknown',
# 'Hispanic' and 'White', which Python silently collapses.
dict_race_small = {
    # Integer codes
    0: 'Unknown',
    1: 'AIAN',
    2: 'AAPI',
    3: 'Black',
    4: 'AAPI',
    5: 'White',
    6: '>1 Race',

    # Short labels (identity)
    'AIAN': 'AIAN',
    'AAPI': 'AAPI',
    'Black': 'Black',
    'Unknown': 'Unknown',
    'Hispanic': 'Hispanic',
    'White': 'White',
    '>1 Race': '>1 Race',

    # Long-form descriptions
    'American Indian or Alaska Native': 'AIAN',
    'Asian': 'AAPI',
    'Black or African American': 'Black',
    'Native Hawaiian or Other Pacific Islander': 'AAPI',
    'More than one race': '>1 Race',
}

# Reverse lookups: integer code -> display label.
ethnicity_dict_reverse_lkp = {1: 'Hispanic', 2: 'Not Hispanic'}

sex_dict_reverse_lkp = {1: 'Male', 2: 'Female'}
###

### Discharge_Status Dictionary ###
#Both "Patient Expired" and "To Hospice/Medical Facility" are coded as expired
def dict_discharge(row):
    """Classify one record as 'Expired' or 'Survived' from its discharge code.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series passed by ``DataFrame.apply(axis=1)``)
        Must provide a ``'Discharge_Status'`` entry holding the numeric code.

    Returns
    -------
    ``'Expired'`` for codes 2 and 11, ``'Survived'`` for every other value.
    Code 8 is deliberately not counted as expired here.
    """
    status = row['Discharge_Status']
    expired = (status == 2) or (status == 11)
    return 'Expired' if expired else 'Survived'
###

# Maps discharge-disposition description strings to their integer codes.
# Codes 2 and 11 are the ones dict_discharge classifies as 'Expired'; code 8
# appears only in a commented-out condition there, so it currently counts as
# 'Survived'.
dict_discharge_long = {
    # Death / hospice dispositions
    'Patient Expired': 2,
    'To Hospice/Medical Facility': 11,
    'To Hospice/Home Care': 8,

    # Home, custody, nursing, rehab, and against-medical-advice dispositions
    'Home or Self Care': 1,
    'To Home Health Org Care': 3,
    'To Jail / Law Enforcement Facility': 4,
    'To SNF (Skilled Nursing)': 5,
    'To Inpatient Rehab Facility or Unit': 6,
    'Left against medical advice': 7,

    # Transfers to psychiatric, acute-care, and long-term-care facilities
    'To Psychiatric Hospital or Unit': 9,
    'To Short Term Acute Care Hosp': 10,
    'To LTC Facility (Long Term Care)': 12,

    # Remaining codes 13-22 (site transfers, planned readmissions, other)
    'Sent to SMH': 13,
    'Sent to HH': 14,
    'To Short Term General Hospital for Inpatient Care with Planned Hospital Readmission': 15,
    'To Inpatient Rehab Facility or Unit with Planned Hospital Readmission': 16,
    'To Other Facility not otherwise defined': 17,
    'To ICF (Intermediate Care)': 18,
    'To Federal Hospital': 19,
    "To Designated Cancer Ctr or Children's Hospital with Planned Hospital Readmission": 20,
    'Still Inpatient': 21,
    'To Psychiatric Hospital or Unit with Planned Hospital Readmission': 22
}

### Calculators for Descriptive Stats on Included/Excluded Cohorts
For a given sub-population partition, these calculators will calculate:
- mean (SD and normal 95% CIs) for most individual variables
- survival rates (95% CIs using the Fay-Feuer method)
- age-adjusted rates (95% CIs using the Tiwari modification of Fay-Feuer)

##### Age-Adjusted Rates
The following functions and statistics implement the crude and age-adjusted rates, and their associated confidence intervals, as outlined in the SEER*Stat documentation: \
        https://seer.cancer.gov/seerstat/WebHelp/Rate_Algorithms.htm \
        Note: the inverse chi-square distribution function is available in Python as SciPy's chi2.ppf.

        _References_
        Fay MP, Feuer EJ. Confidence intervals for directly standardized rates: a method based on the gamma distribution. Statistics in Medicine 1997 Apr 15;16(7):791-801.

An implementation in R using the inverse Gamma distribution (with equivalent results) can be found at:
        https://github.com/cran/epitools/blob/master/R/ageadjust.direct.R 
        Note: the Python equivalent of R's qgamma is SciPy's gamma.ppf function (see https://stackoverflow.com/questions/51297532/gamma-distribution-in-python).


A general introduction to age-adjustment can be found here https://www.health.pa.gov/topics/HealthStatistics/Statistical-Resources/UnderstandingHealthStats/Pages/Age-Adjusted-Rates.aspx


        

In [1]:
from scipy.stats.distributions import chi2

#DEFINE Raw Stats Calculator#
def get_raw_stats_base(df_, groups, alpha=0.05):
    """Descriptive statistics and crude survival rates for each `groups` partition.

    Parameters
    ----------
    df_ : pandas.DataFrame
        One row per subject. Must contain the aggregated columns: Survived,
        Age, InitialSOFA, StayLength, Cho_LE, COVID_Status, CCS_raw, CCS_age,
        CCS_Colorado and ECI_raw. `Survived` is summed to count survivors, so
        it is assumed to be 0/1 coded -- TODO confirm against the loader.
    groups : str or list of str
        Column(s) to partition by; they form the index of the result.
    alpha : float, default 0.05
        Two-sided significance level for the survival-rate interval. The
        intervals on the means always use a fixed 1.96 normal multiplier
        (i.e. 95%), independent of `alpha`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `groups`, rounded to 4 decimals, with Pop_N, Survived, the
        group means (plus Age_SD / SOFA_SD), S_rate with S_rate_CI_lo/_hi, and
        <var>_CI_lo/_hi for each mean. The CCS_Colorado bounds are named
        CCS_Colorado_lo/_hi (no `_CI_` infix); that naming is kept so existing
        consumers of the table keep working.

    Notes
    -----
    The death-rate interval is the chi-square (gamma) interval for a crude
    rate: lower = 0.5*chi2.ppf(alpha/2, 2*D)/N, upper =
    0.5*chi2.ppf(1-alpha/2, 2*(D+1))/N. When D == 0 the lower limit is defined
    as 0; chi2.ppf returns NaN for zero degrees of freedom, so that case is
    set explicitly (previously S_rate_CI_hi came out as NaN).
    NOTE(review): for small groups the upper death-rate limit can exceed 1,
    making S_rate_CI_lo negative; it is not clipped here.
    """
    def _death_rate_ci_lo(d):
        # Lower limit; zero deaths -> exactly 0 rather than chi2.ppf(., 0) == NaN.
        limit = pd.Series(0.5 * chi2.ppf(alpha / 2, 2 * d['Deaths']), index=d.index) / d['Pop_N']
        return limit.mask(d['Deaths'] == 0, 0.0)

    def _death_rate_ci_hi(d):
        # Upper limit uses D + 1 events, so its shape parameter is never zero.
        return pd.Series(0.5 * chi2.ppf(1 - alpha / 2, 2 * (d['Deaths'] + 1)), index=d.index) / d['Pop_N']

    return (df_
        .groupby(groups, as_index=True, observed=True)
        .agg(Pop_N=pd.NamedAgg(column="Survived", aggfunc="count"),
             Survived=pd.NamedAgg(column="Survived", aggfunc="sum"),
             Age=pd.NamedAgg(column="Age", aggfunc="mean"),
             Age_SD=pd.NamedAgg(column="Age", aggfunc="std"),
             Age_SE=pd.NamedAgg(column="Age", aggfunc="sem"),
             SOFA=pd.NamedAgg(column="InitialSOFA", aggfunc="mean"),
             SOFA_SD=pd.NamedAgg(column="InitialSOFA", aggfunc="std"),
             SOFA_SE=pd.NamedAgg(column="InitialSOFA", aggfunc="sem"),
             StayLength=pd.NamedAgg(column="StayLength", aggfunc="mean"),
             StayLength_SE=pd.NamedAgg(column="StayLength", aggfunc="sem"),
             Cho_LE=pd.NamedAgg(column="Cho_LE", aggfunc="mean"),
             Cho_LE_SE=pd.NamedAgg(column="Cho_LE", aggfunc="sem"),
             COVID_Status_mean=pd.NamedAgg(column="COVID_Status", aggfunc="mean"),
             COVID_Status_SE=pd.NamedAgg(column="COVID_Status", aggfunc="sem"),
             CCS_raw=pd.NamedAgg(column="CCS_raw", aggfunc="mean"),
             CCS_raw_SE=pd.NamedAgg(column="CCS_raw", aggfunc="sem"),
             CCS_age=pd.NamedAgg(column="CCS_age", aggfunc="mean"),
             CCS_age_SE=pd.NamedAgg(column="CCS_age", aggfunc="sem"),
             CCS_Colorado=pd.NamedAgg(column="CCS_Colorado", aggfunc="mean"),
             CCS_Colorado_SE=pd.NamedAgg(column="CCS_Colorado", aggfunc="sem"),
             ECI_raw=pd.NamedAgg(column="ECI_raw", aggfunc="mean"),
             ECI_raw_SE=pd.NamedAgg(column="ECI_raw", aggfunc="sem")
             )
        .reset_index()
        # Survival rate is derived from the death rate and its interval.
        .assign(Deaths = lambda d: d['Pop_N'] - d['Survived'])
        .assign(D_rate = lambda d: d['Deaths'] / d['Pop_N'])
        .assign(D_rate_CI_lo = _death_rate_ci_lo,
                D_rate_CI_hi = _death_rate_ci_hi)
        # Survival bounds are the complements of the opposite death-rate bounds.
        .assign(S_rate = lambda d: 1 - d['D_rate'],
                S_rate_CI_lo = lambda d: 1 - d['D_rate_CI_hi'],
                S_rate_CI_hi = lambda d: 1 - d['D_rate_CI_lo'],
                # Normal-approximation 95% intervals on the means (mean +/- 1.96*SE).
                Age_CI_lo = lambda d: d['Age'] - 1.96 * d['Age_SE'],
                Age_CI_hi = lambda d: d['Age'] + 1.96 * d['Age_SE'],
                SOFA_CI_lo = lambda d: d['SOFA'] - 1.96 * d['SOFA_SE'],
                SOFA_CI_hi = lambda d: d['SOFA'] + 1.96 * d['SOFA_SE'],
                StayLength_CI_lo = lambda d: d['StayLength'] - 1.96 * d['StayLength_SE'],
                StayLength_CI_hi = lambda d: d['StayLength'] + 1.96 * d['StayLength_SE'],
                Cho_LE_CI_lo = lambda d: d['Cho_LE'] - 1.96 * d['Cho_LE_SE'],
                Cho_LE_CI_hi = lambda d: d['Cho_LE'] + 1.96 * d['Cho_LE_SE'],
                COVID_Status_CI_lo = lambda d: d['COVID_Status_mean'] - 1.96 * d['COVID_Status_SE'],
                COVID_Status_CI_hi = lambda d: d['COVID_Status_mean'] + 1.96 * d['COVID_Status_SE'],
                CCS_raw_CI_lo = lambda d: d['CCS_raw'] - 1.96 * d['CCS_raw_SE'],
                CCS_raw_CI_hi = lambda d: d['CCS_raw'] + 1.96 * d['CCS_raw_SE'],
                CCS_age_CI_lo = lambda d: d['CCS_age'] - 1.96 * d['CCS_age_SE'],
                CCS_age_CI_hi = lambda d: d['CCS_age'] + 1.96 * d['CCS_age_SE'],
                CCS_Colorado_lo = lambda d: d['CCS_Colorado'] - 1.96 * d['CCS_Colorado_SE'],
                CCS_Colorado_hi = lambda d: d['CCS_Colorado'] + 1.96 * d['CCS_Colorado_SE'],
                ECI_raw_CI_lo = lambda d: d['ECI_raw'] - 1.96 * d['ECI_raw_SE'],
                ECI_raw_CI_hi = lambda d: d['ECI_raw'] + 1.96 * d['ECI_raw_SE'],
            )
        # Drop the intermediate death-rate and standard-error columns.
        .drop(['Deaths', 'D_rate', 'D_rate_CI_hi', 'D_rate_CI_lo', 'Age_SE', 'SOFA_SE', 'StayLength_SE', 'Cho_LE_SE', 'COVID_Status_SE', 'CCS_raw_SE', 'CCS_age_SE', 'CCS_Colorado_SE', 'ECI_raw_SE'], axis=1)
        .round(4)
        .set_index(groups)
    )

#DEFINE Age-Adjusted Calculator#
def get_age_adjusted_stats_base(df_, groups, alpha=0.05):
    """Directly age-standardised survival rates with gamma (Fay-Feuer) intervals.

    Parameters
    ----------
    df_ : pandas.DataFrame
        One row per subject with an `Age_Group` column (labels must match the
        standard-population table below), a `Survived` column and the
        `groups` columns. `Survived` is summed to count survivors, so it is
        assumed to be 0/1 coded -- TODO confirm against the loader.
    groups : str or list of str
        Column(s) defining the sub-populations; they form the result's index.
        If 'Age_Group' is itself listed, each age band is its own stratum.
    alpha : float, default 0.05
        Two-sided significance level of the interval.

    Returns
    -------
    pandas.DataFrame
        Indexed by `groups`, rounded to 4 decimals, with columns
        Age_Adj_S_rate, Age_Adj_S_rate_CI_lo, Age_Adj_S_rate_CI_hi.

    Notes
    -----
    Per stratum i (age band within a group): w_i = Std_Pop_i / Pop_N_i,
    rate = sum(w_i * D_i), var = sum(w_i**2 * D_i), and w_max = max(w_i)
    taken over the age bands actually present in that group.

    Changes relative to the previous implementation:
    * w_max is taken directly as the per-group maximum. It used to be summed
      over the group's age rows and divided by the number of age bands in the
      whole input, which shrank it for any group missing an age band (and
      whenever 'Age_Group' was in `groups`), so a group's interval depended on
      which other rows were in `df_`.
    * With zero deaths the lower death-rate limit is defined as 0 (it was NaN
      from 0/0), so Age_Adj_S_rate_CI_hi is 1.0 in that case.
    * The collapsing groupby passes observed=True, like the first one, so
      unobserved categorical combinations do not produce phantom rows.
    * `groups` may be a single column name.

    NOTE(review): age bands absent from a group contribute no weight and the
    remaining weights are not renormalised; rows whose Age_Group label is not
    in the standard table are dropped by the inner merge.
    """
    groups = [groups] if isinstance(groups, str) else list(groups)

    # Standard-population weights per age band (they sum to 1).
    # NOTE(review): values look like 2000 U.S. standard population proportions,
    # with the youngest bands pooled into '<25' -- confirm the source.
    std_pop = pd.DataFrame({
    'Age_Group': ['<25', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '>85'],
    'Std_Pop': [(0.013818 + 0.055317 + 0.145565 + 0.138646), 0.135573, 0.162613, 0.134834, 0.087247, 0.066037, 0.044842, 0.015508]})

    groups_age = groups if 'Age_Group' in groups else groups + ['Age_Group']

    def _death_rate_ci_lo(d):
        rate, var = d['Age_Adj_D_rate'], d['Age_Adj_D_var']
        limit = var / (2 * rate) * chi2.ppf(alpha / 2, (2 * rate**2) / var)
        # No deaths -> 0/0 above; the lower limit is defined as 0 in that case.
        return limit.mask(rate == 0, 0.0)

    def _death_rate_ci_hi(d):
        rate_w = d['Age_Adj_D_rate'] + d['w_max']
        var_w = d['Age_Adj_D_var'] + d['w_max']**2
        return (var_w / (2 * rate_w)) * chi2.ppf(1 - alpha / 2, (2 * rate_w**2) / var_w)

    return (df_
        # Population and survivor counts per age band within each group.
        .groupby(groups_age, as_index=True, observed=True)
        .agg(Pop_N=pd.NamedAgg(column="Survived", aggfunc="count"),
             Survived=pd.NamedAgg(column="Survived", aggfunc="sum"),
             )
        .reset_index()
        .merge(std_pop, on='Age_Group') # bring in the standard population
        .assign(Std_Pop = lambda d: d['Std_Pop'].mask(d['Pop_N'] == 0, 0), # no weight for strata without subjects
                Deaths = lambda d: d['Pop_N'] - d['Survived'],
            )
        # Per-stratum contribution to the adjusted rate, its variance, and weight.
        .assign(Age_Adj_D_rate = lambda d: (d['Deaths'] / d['Pop_N']) * d['Std_Pop'],
                Age_Adj_D_var = lambda d: d['Deaths'] * ((d['Std_Pop'] / d['Pop_N'])**2),
                w_i = lambda d: d['Std_Pop'] / d['Pop_N'],
            )
        # Collapse the age bands: rate and variance add up, w_max is the largest weight.
        .groupby(groups, as_index=False, observed=True)
        .agg(Age_Adj_D_rate=pd.NamedAgg(column="Age_Adj_D_rate", aggfunc="sum"),
             Age_Adj_D_var=pd.NamedAgg(column="Age_Adj_D_var", aggfunc="sum"),
             w_max=pd.NamedAgg(column="w_i", aggfunc="max"),
             )
        # Gamma intervals for the age-adjusted death rate.
        .assign(Age_Adj_D_rate_CI_lo = _death_rate_ci_lo,
                Age_Adj_D_rate_CI_hi = _death_rate_ci_hi,
            )
        # Survival rate and bounds are the complements of the death-rate ones.
        .assign(Age_Adj_S_rate = lambda d: 1 - d['Age_Adj_D_rate'],
                Age_Adj_S_rate_CI_lo = lambda d: 1 - d['Age_Adj_D_rate_CI_hi'],
                Age_Adj_S_rate_CI_hi = lambda d: 1 - d['Age_Adj_D_rate_CI_lo'],
            )
        .drop(['w_max', 'Age_Adj_D_var', 'Age_Adj_D_rate', 'Age_Adj_D_rate_CI_hi', 'Age_Adj_D_rate_CI_lo'], axis=1)
        .round(4)
        .set_index(groups)
    )
