# Test Run: Creating Clinical Profiles for Diabetes Cohorts

In [1]:
import pandas as pd
import numpy as np
import glob
from fhir.resources.clinicalprofile import ClinicalProfile
from datetime import datetime

In [2]:
# Assuming file structure as in sample dataset... can adjust later
def getData(basePath, dtypes_dict, sep='|', encoding='Latin-1'):
    """Load the six cohort tables whose file names start with ``basePath``.

    Each table is located with ``glob`` using ``basePath`` followed by a
    wildcard suffix (``*demographics.txt``, ``*labs.txt``, ``*diagnoses*.txt``,
    ``*encounter.txt``, ``*meds.txt``, ``*procedure.txt``). When a pattern
    matches several files, the first match returned by ``glob`` is used.

    Parameters
    ----------
    basePath : str
        Path prefix shared by all six files (directory plus file-name prefix).
    dtypes_dict : dict
        Column-name -> dtype mapping. Accepted for interface compatibility;
        it is currently NOT passed to ``pd.read_csv``, so columns keep the
        dtypes pandas infers.
    sep : str, default '|'
        Field delimiter used by every file.
    encoding : str, default 'Latin-1'
        Text encoding used for the procedures file only; the other five files
        are read with the ``pd.read_csv`` default encoding.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_demographics, df_labs, df_diagnoses, df_encounter, df_meds,
        df_procedures)``.

    Raises
    ------
    FileNotFoundError
        If any of the six patterns matches no file. All patterns are resolved
        before anything is read, so the error is raised without loading data.
    """
    def _locate(pattern):
        # Report the exact pattern that failed instead of a bare IndexError.
        matches = glob.glob(basePath + pattern)
        if not matches:
            raise FileNotFoundError(
                "No file matching '{}' was found".format(basePath + pattern))
        return matches[0]

    # Resolve every path first so a missing file fails fast, before any
    # (potentially large) table has been parsed.
    demographics_path = _locate('*demographics.txt')
    labs_path = _locate('*labs.txt')
    diagnoses_path = _locate('*diagnoses*.txt')
    encounter_path = _locate('*encounter.txt')
    meds_path = _locate('*meds.txt')
    procedure_path = _locate('*procedure.txt')

    df_demographics = pd.read_csv(demographics_path, sep=sep)
    df_labs = pd.read_csv(labs_path, sep=sep)
    df_diagnoses = pd.read_csv(diagnoses_path, sep=sep)
    df_encounter = pd.read_csv(encounter_path, sep=sep)
    df_meds = pd.read_csv(meds_path, sep=sep)
    # Only the procedures file is read with the explicit encoding.
    df_procedures = pd.read_csv(procedure_path, sep=sep, encoding=encoding)

    return df_demographics, df_labs, df_diagnoses, df_encounter, df_meds, df_procedures

In [3]:
# Explicit column dtypes (intended as a performance aid); this mapping is
# handed to getData as its dtypes_dict argument.
# Date columns, kept for reference (not part of the mapping):
#all_dates = ['DOB', 'Ordering_datetime','Result_datetime','Entry_Date',
#            'Encounter_date','Order_datetime','Start_date','End_date']
all_dtypes = {
    # demographics
    'PatientID': np.int64,
    'Gender': 'category',
    'Race': 'category',
    'Ethnicity': 'category',
    # labs
    'EncounterID': np.int64,
    'Result_numeric': np.float64,
    'Lab_Name': 'category',
    'Base_Name': 'category',
    'Loinc_Code': 'category',
    'LONG_COMMON_NAME': 'category',
    'status': 'category',
    'Category': 'category',
    'GroupId': 'category',
    'unit': 'category',
    'range': 'category',
    # diagnoses
    'icd_10': 'category',
    'icd_name': 'category',
    'hpo': 'category',
    'hpo_term': 'category',
    # encounters
    'Encounter_type': 'category',
    # medications
    'Medication_Name': 'category',
    'Dose': 'category',
    'Route': 'category',
    'Frequency': 'category',
    'Quantity': 'category',
    'RXNorm': 'category',
    'Therapeutic_Class': 'category',
    'Pharmaceutical_Class': 'category',
    'Pharmaceutical_Subclass': 'category',
    # procedures
    'Procedure_ID': np.int64,
    'Procedure_Code': 'category',
    'Procedure_Name': 'category',
}

In [4]:
# Load the six Diabetes cohort tables; the path is a file-name prefix that
# getData extends with per-table wildcard suffixes.
(df_demographics, df_labs, df_diagnoses,
 df_encounter, df_meds, df_procedures) = getData(
    r'S:\NCATS\Clinical_Profiles\clean_data\Diabetes\jh', all_dtypes)

In [5]:
# Median calendar year over all encounters; calculateProfile subtracts ages
# from this reference year to obtain birth-year bounds.
medianEncounterYear = pd.to_datetime(df_encounter['Encounter_date']).dt.year.median()

In [13]:
def calculateProfile(df_demographics, df_labs, df_diagnoses, df_encounter, df_meds, df_procedures,
                     medianEncounterYear, gender='All', race='All', age_low='All', age_high=None):
    """Build the profile header and summary statistics for one demographic sub-cohort.

    The demographics table is filtered by gender, race and age band; the lab,
    medication, diagnosis and procedure tables are then restricted to the
    selected patients and per-code statistics are computed.

    NOTE: work in progress. The statistics are computed but are not yet
    attached to the ClinicalProfile nor returned; the function always returns
    the string 'Done'.

    Parameters
    ----------
    df_demographics : pandas.DataFrame
        Uses columns PatientID, Gender, Race, Ethnicity and DOB.
    df_labs : pandas.DataFrame
        Uses PatientID, Loinc_Code, Ordering_datetime, Result_numeric, unit and
        range; Result_datetime, Base_Name, status, Category and GroupId are
        dropped and must therefore exist.
    df_diagnoses : pandas.DataFrame
        Uses PatientID, icd_10 and Entry_Date.
    df_encounter : pandas.DataFrame
        Uses PatientID, EncounterID and Encounter_date.
    df_meds : pandas.DataFrame
        Uses PatientID, RXNorm and Start_date.
    df_procedures : pandas.DataFrame
        Uses PatientID, EncounterID and Procedure_Code.
    medianEncounterYear : number
        Reference year; a patient's age is taken as this year minus birth year.
    gender, race : str, default 'All'
        Value to match in the Gender / Race column, or 'All' for no filter.
    age_low : number or 'All', default 'All'
        Minimum age (inclusive). 'All' disables the age filter entirely.
        Numeric strings are accepted.
    age_high : number or None, default None
        Maximum age (inclusive). None or NaN means no upper age limit.

    Returns
    -------
    str
        The literal 'Done'.
    """
    # ---- Select the sub-cohort ------------------------------------------
    df_sub_demographics = df_demographics
    if gender != 'All':
        df_sub_demographics = df_sub_demographics[df_sub_demographics.Gender == gender]
    if race != 'All':
        df_sub_demographics = df_sub_demographics[df_sub_demographics.Race == race]

    # An empty CSV cell arrives as NaN rather than None, so test for both.
    has_age_high = age_high is not None and not pd.isna(age_high)

    if age_low != 'All':
        # Parse DOB once and reuse it for both bounds.
        birth_year = pd.to_datetime(df_sub_demographics.DOB).dt.year
        # Older patients have earlier birth years: age_low bounds the birth
        # year from above, age_high bounds it from below.
        in_age_band = birth_year <= medianEncounterYear - float(age_low)
        if has_age_high:
            in_age_band &= birth_year >= medianEncounterYear - float(age_high)
        df_sub_demographics = df_sub_demographics[in_age_band]

    # ---- Profile header ---------------------------------------------------
    clinicalProfile = ClinicalProfile()

    # str() keeps a numeric age_low from raising when age_high is omitted.
    cohort_id = 'diabetes' + '_' + gender + '_' + race + '_' + str(age_low)
    if has_age_high:
        cohort_id += '_' + str(age_high)
    clinicalProfile.identifier = {'value': cohort_id}
    clinicalProfile.cohort = {'reference': cohort_id}
    clinicalProfile.status = 'draft'
    clinicalProfile.population = {'reference': 'diabetes'}

    clinicalProfile.date = str(datetime.now())
    clinicalProfile.reporter = {'reference': 'Organization/JHM',
                                'type': 'Organization',
                                'display': 'Johns Hopkins School of Medicine'}

    # ---- Lab info -----------------------------------------------------------
    # Right merge keeps exactly the selected patients (including those with
    # no lab rows) and discards labs of patients outside the cohort.
    df_labs_full = (df_labs.merge(df_sub_demographics, on='PatientID', how='right')
                    .drop(columns=['Result_datetime', 'Base_Name', 'status', 'Category', 'GroupId']))

    labs_counts = df_labs_full.Loinc_Code.value_counts()
    labs_codes = labs_counts.index
    df_labs_full['orderYear'] = pd.to_datetime(df_labs_full.Ordering_datetime).dt.year
    # Orders per patient per year, then averaged over patients for each code and year.
    labs_frequencyPerYear = (df_labs_full.groupby(['Loinc_Code', 'PatientID', 'orderYear']).PatientID.size()
                             .groupby(['Loinc_Code', 'orderYear']).mean())
    labs_fractionOfSubjects = (np.divide(df_labs_full.groupby(['Loinc_Code']).PatientID.nunique(),
                                         df_labs_full.PatientID.nunique()))
    labs_units = df_labs_full.groupby(['Loinc_Code']).unit.unique()

    def percentile(n):
        # Build an aggregator for the n-th percentile; its __name__ becomes
        # the column label in the agg() result.
        def percentile_(x):
            return x.quantile(n*0.01)
        percentile_.__name__ = '%s' % n
        return percentile_

    labs_stats = (df_labs_full.groupby(['Loinc_Code'])
                  .Result_numeric.agg(['min', 'max', 'mean', 'median', 'std',
                                       percentile(10), percentile(20), percentile(30),
                                       percentile(40), percentile(50), percentile(60),
                                       percentile(70), percentile(80), percentile(90)]))

    # skipping normalizedHigh and normalizedLow

    # 'range' is split on ',' into low (part 0) and high (part 1); rows with
    # a missing range are left as NaN by index alignment.
    range_parts = df_labs_full['range'].dropna().astype('str').str.split(',', expand=True)
    df_labs_full['range_high'] = pd.to_numeric(range_parts[1]).astype('float')
    df_labs_full['range_low'] = pd.to_numeric(range_parts[0]).astype('float')

    def fracsAboveBelowNormal(x):
        # Fraction of a code's results above / below its reference range.
        aboveNorm = np.divide(np.sum(x.Result_numeric > x.range_high), x.Result_numeric.size)
        belowNorm = np.divide(np.sum(x.Result_numeric < x.range_low), x.Result_numeric.size)
        return pd.Series({'aboveNorm': aboveNorm, 'belowNorm': belowNorm})

    labs_aboveBelowNorm = (df_labs_full.groupby(['Loinc_Code'])
                           .apply(fracsAboveBelowNormal))

    # NOTE(review): results are lined up by their position within each code
    # (not by patient or time) before correlating -- confirm this is intended.
    labs_correlatedLabsCoefficients = (df_labs_full.groupby('Loinc_Code').Result_numeric
                                       .apply(lambda x: pd.Series(x.values))
                                       .unstack().transpose().corr())

    labs_abscorrelation = labs_correlatedLabsCoefficients.abs()

    # ---- Medication info --------------------------------------------------
    # Right merge (as for labs): an outer merge would also keep medication
    # rows of patients outside the selected cohort and skew every statistic.
    df_meds_full = df_meds.merge(df_sub_demographics, on='PatientID', how='right')

    meds_medication = df_meds_full.RXNorm.unique()

    uniqDropNA = lambda x: np.unique(x.dropna())

    #meds_dosageInfo = (df_meds_full.groupby('RXNorm')
    #              .agg({'Route':uniqDropNA, 'Dose':uniqDropNA,'Quantity':uniqDropNA}))

    df_meds_full['startYear'] = pd.to_datetime(df_meds_full.Start_date).dt.year

    meds_frequencyPerYear = (df_meds_full.groupby(['RXNorm', 'startYear', 'PatientID']).PatientID
                             .count().groupby(['RXNorm', 'startYear']).mean())

    meds_fractionOfSubjects = (np.divide(df_meds_full.groupby(['RXNorm']).PatientID.nunique(),
                                         df_meds_full.PatientID.nunique()))

    # ---- Diagnosis info ---------------------------------------------------
    # Right merge for the same reason as medications.
    df_diagnoses_hpo_full = df_diagnoses.merge(df_sub_demographics, on='PatientID', how='right')

    diagnoses_code = df_diagnoses_hpo_full.icd_10.unique()

    df_diagnoses_hpo_full['entryYear'] = pd.to_datetime(df_diagnoses_hpo_full.Entry_Date).dt.year

    diagnoses_frequencyPerYear = (df_diagnoses_hpo_full.groupby(['icd_10', 'entryYear', 'PatientID']).PatientID
                                  .count().groupby(['icd_10', 'entryYear']).mean())

    diagnoses_fractionOfSubjects = (np.divide(df_diagnoses_hpo_full.groupby(['icd_10']).PatientID.nunique(),
                                              df_diagnoses_hpo_full.PatientID.nunique()))

    # ---- Procedure info ---------------------------------------------------
    df_procedures_full = (df_procedures
                          .merge(df_sub_demographics, on='PatientID', how='right')
                          .drop(columns=['DOB', 'Gender', 'Race', 'Ethnicity']))

    # Procedures carry no date of their own; take it from the matching encounter.
    df_procedures_full = df_procedures_full.merge(df_encounter, on=['EncounterID', 'PatientID'], how='left')

    procedures_code = df_procedures_full.Procedure_Code.unique()

    df_procedures_full['encounterYear'] = pd.to_datetime(df_procedures_full.Encounter_date).dt.year

    procedures_frequencyPerYear = (df_procedures_full.groupby(['Procedure_Code', 'encounterYear', 'PatientID']).PatientID.count()
                                   .groupby(['Procedure_Code', 'encounterYear']).mean())

    procedures_fractionOfSubjects = (np.divide(df_procedures_full.groupby(['Procedure_Code']).PatientID.nunique(),
                                               df_procedures_full.PatientID.nunique()))

    # HPO info

    return 'Done'

In [7]:
# Cohort definitions table; the displayed output shows gender, race, age_low
# and age_high columns, with age_low 'All' and an empty age_high for
# cohorts that are not restricted by age.
cohortDefinitions = pd.read_csv(r'DiabetesCohorts.csv')

In [8]:
# Display the loaded cohort definitions for a visual check.
cohortDefinitions

Unnamed: 0,gender,race,age_low,age_high
0,All,All,All,
1,All,All,18,44.0
2,All,All,45,64.0
3,All,All,65,74.0
4,All,All,75,200.0
5,All,White or Caucasian,All,
6,All,White or Caucasian,18,44.0
7,All,White or Caucasian,45,64.0
8,All,White or Caucasian,65,74.0
9,All,White or Caucasian,75,200.0


In [14]:
%%time
# Timed smoke test: males aged 45-64, race left at its default of 'All'.
calculateProfile(
    df_demographics, df_labs, df_diagnoses, df_encounter, df_meds, df_procedures,
    medianEncounterYear,
    gender='Male', age_low=45, age_high=64,
)

Wall time: 1min 12s


'Done'