# Test Run of Creation of Clinical Profiles for Diabetes Cohorts

In [1]:
import pandas as pd
import numpy as np
import glob
from fhir.resources.clinicalprofile import *
from fhir.resources.coding import Coding
from datetime import datetime
import json

In [2]:
# Assuming file structure as in sample dataset... can adjust later
def getData(basePath, dtypes_dict, sep='|', encoding='Latin-1'):
    demographics_path = glob.glob(basePath+'*demographics.txt')
    labs_path = glob.glob(basePath+'*labs.txt')
    diagnoses_path = glob.glob(basePath+'*diagnoses*.txt')
    encounter_path = glob.glob(basePath+'*encounter.txt')
    meds_path = glob.glob(basePath+'*meds.txt')
    procedure_path = glob.glob(basePath+'*procedure.txt')
    
    df_demographics = pd.read_csv(demographics_path[0], sep=sep)
    
    df_labs = pd.read_csv(labs_path[0], sep=sep)
    
    df_diagnoses = pd.read_csv(diagnoses_path[0], sep=sep)
    
    df_encounter = pd.read_csv(encounter_path[0], sep=sep)
    
    df_meds = pd.read_csv(meds_path[0], sep=sep)
    
    df_procedures = pd.read_csv(procedure_path[0], sep=sep, encoding=encoding)
               
    
    return df_demographics, df_labs, df_diagnoses, df_encounter, df_meds, df_procedures

In [3]:
# Want to specify dtypes for performance
#all_dates = ['DOB', 'Ordering_datetime','Result_datetime','Entry_Date',
#            'Encounter_date','Order_datetime','Start_date','End_date']
all_dtypes = {'PatientID':np.int64, 'Gender':'category','Race':'category','Ethnicity':'category',
              'EncounterID':np.int64, 'Result_numeric':np.float64,'Lab_Name':'category',
               'Base_Name':'category','Loinc_Code':'category','LONG_COMMON_NAME':'category',
                'status':'category','Category':'category','GroupId':'category','unit':'category',
               'range':'category', 'icd_10':'category','icd_name':'category','hpo':'category',
              'hpo_term':'category', 'Encounter_type':'category','Medication_Name':'category',
              'Dose':'category', 'Route':'category', 'Frequency':'category', 'Quantity':'category', 
              'RXNorm': 'category','Therapeutic_Class':'category','Pharmaceutical_Class':'category', 
              'Pharmaceutical_Subclass':'category', 'Procedure_ID':np.int64,'Procedure_Code':'category',
                   'Procedure_Name':'category'}

In [4]:
df_demographics, df_labs, df_diagnoses, df_encounter, df_meds, df_procedures = getData(r'S:\NCATS\Clinical_Profiles\clean_data\Diabetes\jh', all_dtypes)

In [5]:
medianEncounterYear = (pd.to_datetime(df_encounter.Encounter_date).dt.year).median()

In [6]:
cohortDefinitions = pd.read_csv(r'DiabetesCohorts.csv')

In [8]:
df_labs.Lab_Name.head()

0    Est GFR NonAfrAm(MDRD Eqn)
1          Mean Corpus Hgb Conc
2              Prothrombin Time
3                       Calcium
4         Glucose Point of Care
Name: Lab_Name, dtype: object

In [7]:
def calculateProfile(df_demographics, df_labs, df_diagnoses, df_encounter, df_meds, df_procedures,
                     medianEncounterYear, gender='All', race='All', age_low='All', age_high=None, numCorrs=10):
    
    if (gender == 'All'):
        # grab whole dataframe
        df_sub_demographics = df_demographics
    else:
        # grab gender
        df_sub_demographics = df_demographics[df_demographics.Gender == gender]
        
    if (race != 'All'):
        # grab race
        df_sub_demographics = df_sub_demographics[df_sub_demographics.Race == race]
        
    if (age_low != 'All'):
        # grab age
        dob_ub = medianEncounterYear - float(age_low)
        dob_lb = medianEncounterYear - float(age_high)
        df_sub_demographics = (df_sub_demographics[
            (pd.to_datetime(df_sub_demographics.DOB).dt.year >= dob_lb) & 
            (pd.to_datetime(df_sub_demographics.DOB).dt.year <= dob_ub)])
        
    # Initialize  profile
    clinicalProfile = ClinicalProfile()
    clinicalProfile.resourceType = 'ClinicalProfile'
    
    # Header  info
    if age_high != None:
        clinicalProfile.identifier  = [identifier.Identifier({'value': 
                                                              'Group/diabetes'+'-'+gender+'-'+race+'-'+str(age_low)+'-'+str(age_high)})]
        clinicalProfile.cohort = fhirreference.FHIRReference({'reference': 
                                                      'Group/diabetes'+'-'+gender+'-'+race+'-'+str(age_low)+'-'+str(age_high)}) 
    else:
        clinicalProfile.identifier  = [identifier.Identifier({'value': 
                                                              'Group/diabetes'+'-'+gender+'-'+race+'-'+str(age_low)+'-'+str(age_high)})]
        clinicalProfile.cohort = fhirreference.FHIRReference({'reference': 
                                                      'Group/diabetes'+'-'+gender+'-'+race+'-'+str(age_low)})
    clinicalProfile.status = 'draft'
    clinicalProfile.population = fhirreference.FHIRReference({'reference': 'Group/diabetes'})
     
    clinicalProfile.date = fhirdate.FHIRDate(str(datetime.now()).replace(' ', 'T'))
    clinicalProfile.reporter = fhirreference.FHIRReference({'reference': 'Organization/JHM',
                           'type': 'Organization',
                           'display': 'Johns Hopkins School of Medicine'})
    
    # Lab info
    df_labs_full = df_labs.merge(df_sub_demographics, on='PatientID', how='right')
    df_labs_full.drop(['Result_datetime','Base_Name','status','Category','GroupId','LONG_COMMON_NAME'],
                      axis=1,inplace=True)
    
    # Calculate first
    labs_counts = df_labs_full.Loinc_Code.value_counts()
    df_labs_full['orderYear'] = pd.to_datetime(df_labs_full.Ordering_datetime).dt.year
    grouped_labs = df_labs_full.groupby(['Loinc_Code','orderYear'])
    labs_frequencyPerYear = (df_labs_full.groupby(['Loinc_Code','PatientID','orderYear']).PatientID.size()
                                    .groupby(['Loinc_Code','orderYear']).aggregate(np.mean))
    labs_fractionOfSubjects = (np.divide(grouped_labs.PatientID.nunique(),
                                              df_labs_full.PatientID.nunique()))
    labs_units = df_labs_full.groupby(['Loinc_Code']).unit.unique()
    labs_names = df_labs_full.groupby(['Loinc_Code']).Lab_Name.unique()
    
    def percentile(n):
        def percentile_(x):
            return x.quantile(n*0.01)
        percentile_.__name__ = '%s' % n
        return percentile_
    
    labs_stats = (grouped_labs.Result_numeric.agg(['min','max', 'mean','median','std',
                                   percentile(10), percentile(20), percentile(30),
                                   percentile(40), percentile(50), percentile(60),
                                   percentile(70), percentile(80), percentile(90)]))
    
    # skipping normalizedHigh and normalizedLow
    
    df_labs_full['range_high'] = (pd.to_numeric(df_labs_full.range.dropna()
               .astype('str').str.split(',',expand=True)[1]).astype('float'))

    df_labs_full['range_low'] = (pd.to_numeric(df_labs_full.range.dropna()
               .astype('str').str.split(',',expand=True)[0]).astype('float'))
    
   
    def fracsAboveBelowNormal(x):
        aboveNorm = np.divide(np.sum(x.Result_numeric > x.range_high), x.Result_numeric.size)
        belowNorm = np.divide(np.sum(x.Result_numeric < x.range_low), x.Result_numeric.size)
        return pd.Series({'aboveNorm':aboveNorm, 'belowNorm':belowNorm})
    
    labs_aboveBelowNorm = (grouped_labs.apply(fracsAboveBelowNormal))
    
    labs_correlatedLabsCoefficients = (df_labs_full.groupby(['Loinc_Code','orderYear','PatientID'])
                                       .Result_numeric.mean())
    #corrmat = pd.DataFrame(labs_correlatedLabsCoefficients).unstack(level=[0,1]).corr()
    #corrmat=corrmat.droplevel(level=0).droplevel(level=0,axis=1)
    #
    #yearly_vals = dict()
    #for year in corrmat.loc[thisLab].index:
    #    crosstab = corrmat.loc[(thisLab, year)]
    #    yearly_vals[year] = crosstab[crosstab.index.get_level_values(level=1) == year].droplevel(level=1)
    #    
    #topNcorrs = pd.DataFrame(yearly_vals).apply(np.mean, axis=1).drop(thisLab).nlargest(numCorrs)
    
    labs_abscorrelation = labs_correlatedLabsCoefficients.abs()
    
       
    ## LABS TO MEDS
    # Need medication info to do correlation
    df_meds_full = df_meds.merge(df_sub_demographics, on='PatientID', how='outer')
    df_meds_full['startYear'] = pd.to_datetime(df_meds_full.Start_date).dt.year
    rxInfo = df_meds_full[['RXNorm','PatientID', 'startYear']]
    
    
    def patientsAboveBelowNormalMeds(x):
        # Get patients above and below normal
        patientsAboveNorm = x.PatientID[x.Result_numeric > x.range_high].tolist()
        patientsBelowNorm = x.PatientID[x.Result_numeric < x.range_low].tolist()
        
        # Get unique patient IDs for above & below normal
        patientsAboveBelowNorm = list(set(patientsAboveNorm + patientsBelowNorm))
        
        # Link to meds table
        abnormalPatientsMeds = rxInfo[(rxInfo.PatientID.isin(patientsAboveBelowNorm)) &
                                     (rxInfo.startYear == pd.to_datetime(x.Ordering_datetime).dt.year.unique()[0])]
        
        #return print(pd.to_datetime(x.Ordering_datetime).dt.year.unique()[0])
        return pd.Series({'medsAboveBelowNorm': abnormalPatientsMeds.RXNorm.value_counts().index,
                        'counts': abnormalPatientsMeds.RXNorm.value_counts().values})
    
    # Need to grab the indices of those with abnormal lab, grab their medications, count and rank them 
    labs_correlatedMedsCoefficients = (grouped_labs.apply(patientsAboveBelowNormalMeds))
    
    # Currently a little hacky, but seems fast
    mytups = list()
    multiIndex = list()

    for lab in labs_correlatedMedsCoefficients.index:
        thisLabYear = labs_correlatedMedsCoefficients.loc[lab]
        thisLab = lab[0]
        thisYear = lab[1]
        totalCrossTab = np.sum(thisLabYear.counts)
        for medInd in range(len(labs_correlatedMedsCoefficients.loc[lab].medsAboveBelowNorm.values)):
            mytups.append((thisLabYear.medsAboveBelowNorm.values[medInd], thisLabYear.counts[medInd]/totalCrossTab))
            multiIndex.append((thisLab, thisYear))
    
    
    index = pd.MultiIndex.from_tuples(multiIndex)
    labs_correlatedMedsCoefficients = (pd.DataFrame.from_records(mytups, columns=['RXNorm','Relative_Counts'], index=index))
                                       
    
    ## LABS TO PROCEDURES
    
    df_procedures_full = (df_procedures.merge(df_sub_demographics, on='PatientID', how='right'))
    df_procedures_full = (df_procedures_full.merge(df_encounter, on='EncounterID', how='inner'))
    
    df_procedures_full.drop(['DOB','Gender','Race','Ethnicity', 'Encounter_type','PatientID_y'], axis=1, inplace=True)
    df_procedures_full['encounterYear'] = pd.to_datetime(df_procedures_full.Encounter_date).dt.year
    df_procedures_full.rename({'PatientID_x':'PatientID'}, axis=1, inplace=True)
    procInfo = df_procedures_full[['Procedure_Code','PatientID','encounterYear']]
    
    def patientsAboveBelowNormalProcs(x):
        # Get patients above and below normal
        patientsAboveNorm = x.PatientID[x.Result_numeric > x.range_high].tolist()
        patientsBelowNorm = x.PatientID[x.Result_numeric < x.range_low].tolist()
        
        # Get unique patient IDs for above & below normal
        patientsAboveBelowNorm = list(set(patientsAboveNorm + patientsBelowNorm))
        
        # Link to procs table
        abnormalPatientsProcs = procInfo[procInfo.PatientID.isin(patientsAboveBelowNorm) &
                                     (procInfo.encounterYear == pd.to_datetime(x.Ordering_datetime).dt.year.unique()[0])]
                        
        return pd.Series({'procsAboveBelowNorm': abnormalPatientsProcs.Procedure_Code.value_counts().index,
                         'counts': abnormalPatientsProcs.Procedure_Code.value_counts().values})
    
    # Need to grab the indices of those with abnormal lab, grab their medications, count and rank them 
    labs_correlatedProceduresCoefficients = (grouped_labs.apply(patientsAboveBelowNormalProcs))
        
    # Currently a little hacky, but seems fast
    mytups = list()
    multiIndex = list()

    for lab in labs_correlatedProceduresCoefficients.index:
        thisLabYear = labs_correlatedProceduresCoefficients.loc[lab]
        thisLab = lab[0]
        thisYear = lab[1]
        totalCrossTab = np.sum(thisLabYear.counts)
        for procInd in range(len(labs_correlatedProceduresCoefficients.loc[lab].procsAboveBelowNorm.values)):
            mytups.append((thisLabYear.procsAboveBelowNorm.values[procInd], thisLabYear.counts[procInd]/totalCrossTab))
            multiIndex.append((thisLab, thisYear))
        
    index = pd.MultiIndex.from_tuples(multiIndex)
    labs_correlatedProceduresCoefficients = (pd.DataFrame.from_records(mytups, columns=['Procedure_Code','Relative_Counts'], 
                                                                       index=index))
    
    ## LABS TO DIAGNOSES
    
    df_diagnoses_hpo_full = (df_diagnoses.merge(df_sub_demographics, on='PatientID', how='outer'))
    df_diagnoses_hpo_full['entryYear'] = pd.to_datetime(df_diagnoses_hpo_full.Entry_Date).dt.year
    diagInfo = df_diagnoses_hpo_full[['icd_10','PatientID','entryYear']]
    
    def patientsAboveBelowNormalDiags(x):
        # Get patients above and below normal
        patientsAboveNorm = x.PatientID[x.Result_numeric > x.range_high].tolist()
        patientsBelowNorm = x.PatientID[x.Result_numeric < x.range_low].tolist()
        
        # Get unique patient IDs for above & below normal
        patientsAboveBelowNorm = list(set(patientsAboveNorm + patientsBelowNorm))
        
        # Link to procs table
        abnormalPatientsDiags = diagInfo[diagInfo.PatientID.isin(patientsAboveBelowNorm)&
                                     (diagInfo.entryYear == pd.to_datetime(x.Ordering_datetime).dt.year.unique()[0])]
                        
        return pd.Series({'diagsAboveBelowNorm': abnormalPatientsDiags.icd_10.value_counts().index,
                         'counts': abnormalPatientsDiags.icd_10.value_counts().values})
    
    # Need to grab the indices of those with abnormal lab, grab their medications, count and rank them 
    labs_correlatedDiagnosisCoefficients = (grouped_labs.apply(patientsAboveBelowNormalDiags))
    
    # Currently a little hacky, but seems fast
    mytups = list()
    multiIndex = list()

    for lab in labs_correlatedDiagnosisCoefficients.index:
        thisLabYear = labs_correlatedDiagnosisCoefficients.loc[lab]
        thisLab = lab[0]
        thisYear = lab[1]
        totalCrossTab = np.sum(thisLabYear.counts)
        for diagInd in range(len(labs_correlatedDiagnosisCoefficients.loc[lab].diagsAboveBelowNorm.values)):
            mytups.append((thisLabYear.diagsAboveBelowNorm.values[diagInd], thisLabYear.counts[diagInd]/totalCrossTab))
            multiIndex.append((thisLab, thisYear))
    
    index = pd.MultiIndex.from_tuples(multiIndex)
    labs_correlatedDiagnosisCoefficients = (pd.DataFrame.from_records(mytups, columns=['icd_10','Relative_Counts'], 
                                                                       index=index))
    
    # Medication info
    
    """meds_medication = df_meds_full.RXNorm.unique()

    uniqDropNA = lambda x: np.unique(x.dropna())

    #meds_dosageInfo = (df_meds_full.groupby('RXNorm')
    #              .agg({'Route':uniqDropNA, 'Dose':uniqDropNA,'Quantity':uniqDropNA}))

    meds_frequencyPerYear = (df_meds_full.groupby(['RXNorm','startYear','PatientID']).PatientID
                        .count().groupby(['RXNorm','startYear']).mean())

    meds_fractionOfSubjects = (np.divide(df_meds_full.groupby(['RXNorm']).PatientID.nunique(),
                                    df_meds_full.PatientID.nunique()))
    
    grouped_meds = df_meds_full.groupby(['RXNorm','startYear'])
    labInfo = df_labs_full[['Result_numeric','Loinc_Code','range_high','range_low','PatientID', 'orderYear']]
    
    #meds_correlatedLabsCoefficients
    def medsPatientsAboveBelowNormal(x):
        
        patientsWithThisRX = list(set(x.PatientID.tolist()))
        
        # Link to labs table
        abnormalPatientsLabs = labInfo[(labInfo.PatientID.isin(patientsWithThisRX)) & 
                                       ((labInfo.Result_numeric > labInfo.range_high) | 
                                        (labInfo.Result_numeric < labInfo.range_low)) &
                                      (labInfo.orderYear == pd.to_datetime(x.Start_date).dt.year.unique()[0])]
                        
        return pd.Series({'labsAboveBelowNorm': abnormalPatientsLabs.Loinc_Code.value_counts().index,
                         'counts': abnormalPatientsLabs.Loinc_Code.value_counts().values})

    meds_correlatedLabsCoefficients = (grouped_meds.apply(medsPatientsAboveBelowNormal))
    
    # Currently a little hacky, but seems fast
    mytups = list()
    multiIndex = list()

    for med in meds_correlatedLabsCoefficients.index:
        thisMedYear = meds_correlatedLabsCoefficients.loc[med]
        thisMed = med[0]
        thisYear = med[1]
        totalCrossTab = np.sum(thisMedYear.counts)
        for labInd in range(len(meds_correlatedLabsCoefficients.loc[med].labsAboveBelowNorm.values)):
            mytups.append((thisMedYear.labsAboveBelowNorm.values[labInd], thisMedYear.counts[labInd]/totalCrossTab))
            multiIndex.append((thisMed, thisYear))
    
    index = pd.MultiIndex.from_tuples(multiIndex)
    meds_correlatedLabsCoefficients = (pd.DataFrame.from_records(mytups, columns=['Loinc_Code','Relative_Counts'], 
                                                                       index=index))
    
    # Meds to DX 
    def medsDxAssociation(x):
        
        patientsWithThisRX = list(set(x.PatientID.tolist()))
        
        # Link to labs table
        medDxCrosstab = diagInfo[diagInfo.PatientID.isin(patientsWithThisRX) &
                                      (diagInfo.entryYear == pd.to_datetime(x.Start_date).dt.year.unique()[0])]
                        
        return pd.Series({'Dx_crosstab': medDxCrosstab.icd_10.value_counts().index,
                         'counts': medDxCrosstab.icd_10.value_counts().values})

    meds_correlatedDxCoefficients = (grouped_meds.apply(medsDxAssociation))
    
    # Currently a little hacky, but seems fast
 
    mytups = list()
    multiIndex = list()

    for med in meds_correlatedDxCoefficients.index:
        thisMedYear = meds_correlatedDxCoefficients.loc[med]
        thisMed = med[0]
        thisYear = med[1]
        totalCrossTab = np.sum(thisMedYear.counts)
        for dxInd in range(len(meds_correlatedDxCoefficients.loc[med].Dx_crosstab.values)):
            mytups.append((thisMedYear.Dx_crosstab.values[dxInd], thisMedYear.counts[dxInd]/totalCrossTab))
            multiIndex.append((thisMed, thisYear))
    
    index = pd.MultiIndex.from_tuples(multiIndex)
    meds_correlatedDxCoefficients = (pd.DataFrame.from_records(mytups, columns=['icd_10','Relative_Counts'], 
                                                                       index=index))
    """
    # Diagnosis info
    
    diagnoses_code = df_diagnoses_hpo_full.icd_10.unique()
    
    diagnoses_counts = df_diagnoses_hpo_full.icd_10.value_counts()

    df_diagnoses_hpo_full['entryYear'] = pd.to_datetime(df_diagnoses_hpo_full.Entry_Date).dt.year

    diagnoses_frequencyPerYear = (df_diagnoses_hpo_full.groupby(['icd_10','entryYear','PatientID']).PatientID
                        .count().groupby(['icd_10','entryYear']).mean())

    diagnoses_fractionOfSubjects = (np.divide(df_diagnoses_hpo_full.groupby(['icd_10']).PatientID.nunique(),
                                    df_diagnoses_hpo_full.PatientID.nunique()))
    
    # Procedure info
    
    procedures_code = df_procedures_full.Procedure_Code.unique()
    procedures_counts = df_procedures_full.Procedure_Code.value_counts()

    procedures_frequencyPerYear = (df_procedures_full.groupby(['Procedure_Code','encounterYear','PatientID']).PatientID.count()
                                            .groupby(['Procedure_Code','encounterYear']).mean())

    procedures_fractionOfSubjects = (np.divide(df_procedures_full.groupby(['Procedure_Code']).PatientID.nunique(),
                                    df_procedures_full.PatientID.nunique()))
                        
    
    # HPO info
    
    return (clinicalProfile, labs_counts, labs_frequencyPerYear, labs_fractionOfSubjects, labs_units, labs_names,
           labs_stats, labs_aboveBelowNorm, labs_correlatedLabsCoefficients, labs_abscorrelation,labs_correlatedMedsCoefficients,
    labs_correlatedProceduresCoefficients, labs_correlatedDiagnosisCoefficients,
            diagnoses_code, diagnoses_counts,
            diagnoses_frequencyPerYear, diagnoses_fractionOfSubjects, procedures_code, procedures_counts,
            procedures_frequencyPerYear, procedures_fractionOfSubjects)

In [106]:
(clinicalProfile, labs_counts, labs_frequencyPerYear, labs_fractionOfSubjects, labs_units, labs_names,
           labs_stats, labs_aboveBelowNorm, labs_correlatedLabsCoefficients, labs_abscorrelation,labs_correlatedMedsCoefficients,
    labs_correlatedProceduresCoefficients, labs_correlatedDiagnosisCoefficients,
           meds_medication, meds_frequencyPerYear, meds_fractionOfSubjects, meds_correlatedLabsCoefficients,
            meds_correlatedDxCoefficients, diagnoses_code, diagnoses_counts,
            diagnoses_frequencyPerYear, diagnoses_fractionOfSubjects, procedures_code, procedures_counts,
            procedures_frequencyPerYear, procedures_fractionOfSubjects) = calculateProfile(df_demographics, df_labs, df_diagnoses, df_encounter, df_meds, 
                 df_procedures, medianEncounterYear,gender='Male', age_low=45, age_high=64)

KeyboardInterrupt: 

In [87]:
lab_names = pd.DataFrame({'lab_name':labs_names}).reset_index()
lab_counts = pd.DataFrame({'lab_counts':labs_counts}).reset_index().rename({'index':'Loinc_Code'},axis=1)

In [88]:
lab_info = lab_names.merge(lab_counts, how='inner', on='Loinc_Code').set_index('Loinc_Code')

In [8]:
def writeProfile(clinicalProfile, labs_counts, labs_frequencyPerYear, labs_fractionOfSubjects, labs_units, labs_names,
           labs_stats, labs_aboveBelowNorm, labs_correlatedLabsCoefficients, labs_abscorrelation,
                 labs_correlatedMedsCoefficients,labs_correlatedProceduresCoefficients, 
                 labs_correlatedDiagnosisCoefficients, diagnoses_code, diagnoses_counts,
            diagnoses_frequencyPerYear, diagnoses_fractionOfSubjects, procedures_code, procedures_counts,
            procedures_frequencyPerYear, procedures_fractionOfSubjects, cohort, gender='All', race='All', age_low='All', 
                 age_high=None):
    ## LABS
    labs = list()
    corrmat = (pd.DataFrame(labs_correlatedLabsCoefficients).unstack(level=[0,1]).corr()
                        .droplevel(level=0).droplevel(level=0,axis=1))
    lab_names = pd.DataFrame({'lab_name':labs_names}).reset_index()
    lab_counts = pd.DataFrame({'lab_counts':labs_counts}).reset_index().rename({'index':'Loinc_Code'},axis=1)
    lab_info = lab_names.merge(lab_counts, how='inner', on='Loinc_Code').set_index('Loinc_Code')

    for thisLab in lab_info.index:
        thisCPLab = ClinicalProfileLab()
        try:
            thisCPLab.code = [codeableconcept.CodeableConcept(dict(coding=[dict(system='https://loinc.org', 
                                                                                code=thisLab)],
                                                                  text=lab_info.loc[thisLab]['lab_name'][0]))]
            thisCPLab.count = int(lab_info.loc[thisLab]['lab_counts'])
            thisCPLab.frequencyPerYear = round(float(labs_frequencyPerYear.loc[thisLab].mean()),3)
            thisCPLab.fractionOfSubjects = round(float(labs_fractionOfSubjects.loc[thisLab].mean()),3)
            thisCPLab.scalarDistribution = ClinicalProfileLabScalarDistribution()
            thisCPLab.scalarDistribution.units = quantity.Quantity(dict(unit=str(labs_units.loc[thisLab][0])))
            thisCPLab.scalarDistribution.min = round(float(labs_stats.loc[thisLab]['min'].min()),3)
            thisCPLab.scalarDistribution.max = round(float(labs_stats.loc[thisLab]['max'].max()),3)
            thisCPLab.scalarDistribution.mean = round(float(labs_stats.loc[thisLab]['mean'].mean()),3)
            thisCPLab.scalarDistribution.median = round(float(labs_stats.loc[thisLab]['median'].median()),3)
            thisCPLab.scalarDistribution.stdDev = round(float(labs_stats.loc[thisLab]['std'].std()),3)
            deciles = list()
            for dec in labs_stats.columns[5:]:
                deciles.append(ClinicalProfileLabScalarDistributionDecile(
                                                                    dict(nth=int(dec), 
                                                                        value=round(labs_stats.loc[thisLab][dec].mean(),3))))
            thisCPLab.scalarDistribution.decile = deciles

            thisCPLab.scalarDistribution.fractionAboveNormal = round(float(labs_aboveBelowNorm.loc[thisLab].aboveNorm.mean()),3)
            thisCPLab.scalarDistribution.fractionBelowNormal = round(float(labs_aboveBelowNorm.loc[thisLab].belowNorm.mean()),3)

            yearly_vals = dict()
            for year in corrmat.loc[thisLab].index:
                crosstab = corrmat.loc[(thisLab, year)]
                yearly_vals[year] = crosstab[crosstab.index.get_level_values(level=1) == year].droplevel(level=1)

            top10corrs = pd.DataFrame(yearly_vals).apply(np.mean, axis=1).drop(thisLab).nlargest(10).round(3)

            entries = list()
            for code, corr in top10corrs.iteritems():
                otherLoinc = [(dict(coding=[dict(system='https://loinc.org', code=code)],
                                                                  text=lab_info.loc[code]['lab_name'][0]))]
                entries.append(dict(labcode=otherLoinc, coefficient=corr))

            try:
                thisCPLab.scalarDistribution.correlatedLabs = ClinicalProfileLabScalarDistributionCorrelatedLabs(
                                                                dict(topn=10, 
                                                                     abscorrelation=top10corrs[-1],
                                                                     entry=entries))
            except:
                print('No correlated Labs for Lab ', thisLab)

            try:
                top10corrs = (pd.DataFrame(labs_correlatedMedsCoefficients.loc[thisLab].groupby(['RXNorm'])
                                                                                    .Relative_Counts.mean())
                                                                                    .Relative_Counts.nlargest(10).round(3))
                entries = list()
                for code, corr in top10corrs.iteritems():
                    otherRX = [(dict(coding=[dict(system='https://www.nlm.nih.gov/research/umls/rxnorm/', code=code)]))]
                    entries.append(dict(medicationCodeableConcept=otherRX, coefficient=corr))

                thisCPLab.scalarDistribution.correlatedMedications = ClinicalProfileLabScalarDistributionCorrelatedMedications(
                                                                        dict(topn=10, 
                                                                          entry=entries))
            except:
                print('No correlated Meds for Lab ', thisLab)

            try:
                top10corrs = (pd.DataFrame(labs_correlatedDiagnosisCoefficients.loc[thisLab].groupby(['icd_10'])
                                                                                    .Relative_Counts.mean())
                                                                                    .Relative_Counts.nlargest(10).round(3))
                entries = list()
                for code, corr in top10corrs.iteritems():
                    otherDX = [(dict(coding=[dict(system='https://www.icd10data.com/', code=code)]))]
                    entries.append(dict(code=otherDX, coefficient=corr))

                thisCPLab.scalarDistribution.correlatedDiagnoses = ClinicalProfileLabScalarDistributionCorrelatedDiagnoses(
                                                                        dict(topn=10, 
                                                                          entry=entries))
            except:
                print('No correlated Diagnoses for Lab ', thisLab)

            try:      
                top10corrs = (pd.DataFrame(labs_correlatedProceduresCoefficients.loc[thisLab].groupby(['Procedure_Code'])
                                                                                    .Relative_Counts.mean())
                                                                                    .Relative_Counts.nlargest(10).round(3))
                entries = list()
                for code, corr in top10corrs.iteritems():
                    otherProc = [(dict(coding=[dict(system='https://www.ama-assn.org/practice-management/cpt', code=code)]))]
                    entries.append(dict(code=otherProc, coefficient=corr))

                thisCPLab.scalarDistribution.correlatedProcedures = ClinicalProfileLabScalarDistributionCorrelatedProcedures(
                                                                        dict(topn=10, 
                                                                          entry=entries))
            except:
                print('No correlated Procedures for Lab ', thisLab)
            
            labs.append(thisCPLab)
        
        except:
            print('This lab did not work ', thisLab)
        

    ## MEDICATIONS
    """meds = list()
    for thisMed in meds_medication[~np.isnan(meds_medication)]:
        thisCPMed = ClinicalProfileMedication()
        thisCPMed.medication = [codeableconcept.CodeableConcept(dict(
                                                    coding=[dict(
                                                        system='https://www.nlm.nih.gov/research/umls/rxnorm/', 
                                                        code=str(thisMed))]))]
        try:
            thisCPMed.frequencyPerYear = round(float(meds_frequencyPerYear.loc[thisMed].mean()),3)
            thisCPMed.fractionOfSubjects = round(float(meds_fractionOfSubjects.loc[thisMed].mean()),3)
        except:
            print('No frequency per year for med ', thisMed)

        try:
            top10corrs = (pd.DataFrame(meds_correlatedLabsCoefficients.loc[thisMed].groupby(['Loinc_Code'])
                                                                                .Relative_Counts.mean())
                                                                                .Relative_Counts.nlargest(10).round(3))
            entries = list()
            for code, corr in top10corrs.iteritems():
                otherLab = [(dict(coding=[dict(system='https://loinc.org', code=code)]))]
                entries.append(dict(labcode=otherLab, coefficient=corr))


            thisCPMed.correlatedLabs = ClinicalProfileLabScalarDistributionCorrelatedLabs(dict(topn=10, entry=entries))
        except:
            print('No correlated Labs for Med ', thisMed)

        try:
            top10corrs = (pd.DataFrame(meds_correlatedDxCoefficients.loc[thisMed].groupby(['icd_10'])
                                                                                .Relative_Counts.mean())
                                                                                .Relative_Counts.nlargest(10).round(3))
            entries = list()
            for code, corr in top10corrs.iteritems():
                otherDX = [(dict(coding=[dict(system='https://www.icd10data.com/', code=code)]))]
                entries.append(dict(code=otherDX, coefficient=corr))

            thisCPMed.correlatedDiagnoses = ClinicalProfileLabScalarDistributionCorrelatedDiagnoses(dict
                                                                                                    (topn=10, 
                                                                                                     entry=entries))
        except:
            print('No correlated DX for Med ', thisMed)

        meds.append(thisCPMed)"""

    ## DIAGNOSES 
    dxs = list()
    for thisDX in diagnoses_code:
        thisCPdx = ClinicalProfileDiagnosis()
        try:
            thisCPdx.code = [codeableconcept.CodeableConcept(dict(coding=[dict(
                                                            system='https://www.icd10data.com/', 
                                                            code=str(thisDX))]))]
            thisCPdx.count = int(diagnoses_counts.loc[thisDX])

            thisCPdx.frequencyPerYear = round(float(diagnoses_frequencyPerYear.loc[thisDX].mean()),3)
            thisCPdx.fractionOfSubjects = round(float(diagnoses_fractionOfSubjects.loc[thisDX].mean()),3)
            dxs.append(thisCPdx)
        except:
            print('This DX did not work ', thisDX)


    ## PROCEDURES
    procs = list()
    for thisProc in procedures_code:
        thisCPProc = ClinicalProfileProcedure()
        try:
            thisCPProc.code = [codeableconcept.CodeableConcept(dict(coding=[dict(
                                                            system='https://www.ama-assn.org/practice-management/cpt', 
                                                            code=str(thisProc))]))]

            thisCPProc.frequencyPerYear = round(float(procedures_frequencyPerYear.loc[thisProc].mean()),3)
            thisCPProc.fractionOfSubjects = round(float(procedures_fractionOfSubjects.loc[thisProc].mean()),3)
            procs.append(thisCPProc)
        except:
            print('This procedure did not work ', thisProc)


    clinicalProfile.lab = labs
    clinicalProfile.diagnosis = dxs
    clinicalProfile.procedure = procs

    if age_high != 'nan':
        filename = 'diabetes_resources/'+cohort+'-'+gender+'-'+race+'-'+age_low+'-'+age_high+'.json'
    else:
        filename = 'diabetes_resources/'+cohort+'-'+gender+'-'+race+'-'+age_low+'.json'
        
    with open(filename, 'w') as outfile:
        json.dump(clinicalProfile.as_json(), outfile, indent=4)
    
    del(clinicalProfile)
    return print('Write to '+ filename + ' successful')

In [102]:
clinicalProfile.lab = labs
clinicalProfile.diagnosis = dxs
clinicalProfile.procedure = procs

In [103]:
import json

In [104]:
with open('trial_cp.json', 'w') as outfile:
    json.dump(clinicalProfile.as_json(), outfile)

In [134]:
males = cohortDefinitions.iloc[21:40]
males_test = cohortDefinitions.iloc[38:40]
males_test

Unnamed: 0,gender,race,age_low,age_high
38,Male,Other,65,74.0
39,Male,Other,75,200.0


In [None]:
males = cohortDefinitions.iloc[25:39]
males_test = cohortDefinitions.iloc[38:40]
females = cohortDefinitions.iloc[41:]
for ind, female_cohort in females.iterrows():
    (clinicalProfile, labs_counts, labs_frequencyPerYear, labs_fractionOfSubjects, labs_units, labs_names,
           labs_stats, labs_aboveBelowNorm, labs_correlatedLabsCoefficients, labs_abscorrelation,labs_correlatedMedsCoefficients,
    labs_correlatedProceduresCoefficients, labs_correlatedDiagnosisCoefficients,
           diagnoses_code, diagnoses_counts,
            diagnoses_frequencyPerYear, diagnoses_fractionOfSubjects, procedures_code, procedures_counts,
            procedures_frequencyPerYear, procedures_fractionOfSubjects) = calculateProfile(df_demographics, 
                     df_labs, df_diagnoses, df_encounter, df_meds, df_procedures, medianEncounterYear, 
                 gender='Female', race=female_cohort.race, age_low=female_cohort.age_low,
                                                    age_high=female_cohort.age_high)
    

    writeProfile(clinicalProfile, labs_counts, labs_frequencyPerYear, labs_fractionOfSubjects, labs_units, labs_names,
           labs_stats, labs_aboveBelowNorm, labs_correlatedLabsCoefficients, labs_abscorrelation,
                 labs_correlatedMedsCoefficients,labs_correlatedProceduresCoefficients, 
                 labs_correlatedDiagnosisCoefficients, diagnoses_code, diagnoses_counts,
            diagnoses_frequencyPerYear, diagnoses_fractionOfSubjects, procedures_code, procedures_counts,
            procedures_frequencyPerYear, procedures_fractionOfSubjects, 'diabetes', gender='Female', race=female_cohort.race, 
                 age_low=str(female_cohort.age_low),age_high=str(female_cohort.age_high))

No correlated Meds for Lab  12610-2
No correlated Diagnoses for Lab  12610-2
No correlated Procedures for Lab  12610-2
No correlated Meds for Lab  12646-6
No correlated Diagnoses for Lab  12646-6
No correlated Procedures for Lab  12646-6
No correlated Meds for Lab  12962-7
No correlated Diagnoses for Lab  12962-7
No correlated Procedures for Lab  12962-7
No correlated Meds for Lab  13457-7
No correlated Diagnoses for Lab  13457-7
No correlated Meds for Lab  14749-6
No correlated Diagnoses for Lab  14749-6


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


No correlated Labs for Lab  14869-2
No correlated Meds for Lab  14869-2
No correlated Diagnoses for Lab  14869-2
No correlated Procedures for Lab  14869-2
No correlated Meds for Lab  1501-6
No correlated Diagnoses for Lab  1501-6
No correlated Meds for Lab  1504-0
No correlated Diagnoses for Lab  1504-0
No correlated Meds for Lab  1507-3
No correlated Diagnoses for Lab  1507-3
No correlated Meds for Lab  1514-9
No correlated Diagnoses for Lab  1514-9
No correlated Meds for Lab  1530-5
No correlated Diagnoses for Lab  1530-5
No correlated Meds for Lab  1558-6
No correlated Diagnoses for Lab  1558-6
No correlated Meds for Lab  1742-6
No correlated Diagnoses for Lab  1742-6
No correlated Meds for Lab  17856-6
No correlated Diagnoses for Lab  17856-6
No correlated Procedures for Lab  17856-6
No correlated Meds for Lab  17861-6
No correlated Diagnoses for Lab  17861-6
No correlated Labs for Lab  17863-2
No correlated Meds for Lab  17863-2
No correlated Diagnoses for Lab  17863-2
No correlat

  return np.nanmean(a, axis, out=out, keepdims=keepdims)



No correlated Diagnoses for Lab  25428-4
No correlated Procedures for Lab  25428-4
No correlated Meds for Lab  2565-0
No correlated Diagnoses for Lab  2565-0
No correlated Procedures for Lab  2565-0
No correlated Meds for Lab  2571-8
No correlated Diagnoses for Lab  2571-8
No correlated Meds for Lab  2601-3
No correlated Diagnoses for Lab  2601-3
No correlated Labs for Lab  2729-2
No correlated Meds for Lab  2729-2
No correlated Diagnoses for Lab  2729-2
No correlated Meds for Lab  2777-1
No correlated Diagnoses for Lab  2777-1
No correlated Meds for Lab  2823-3
No correlated Diagnoses for Lab  2823-3
No correlated Meds for Lab  2885-2
No correlated Diagnoses for Lab  2885-2
No correlated Meds for Lab  2951-2
No correlated Diagnoses for Lab  2951-2
No correlated Meds for Lab  3049-4
No correlated Diagnoses for Lab  3049-4
No correlated Meds for Lab  3094-0
No correlated Diagnoses for Lab  3094-0
No correlated Meds for Lab  33256-9
No correlated Diagnoses for Lab  33256-9
No correlated

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


No correlated Labs for Lab  1507-3
No correlated Meds for Lab  1507-3
No correlated Diagnoses for Lab  1507-3
No correlated Labs for Lab  15087-0
No correlated Meds for Lab  15087-0
No correlated Diagnoses for Lab  15087-0
No correlated Procedures for Lab  15087-0
No correlated Meds for Lab  1558-6
No correlated Diagnoses for Lab  1558-6
No correlated Meds for Lab  1742-6
No correlated Diagnoses for Lab  1742-6
No correlated Meds for Lab  17856-6
No correlated Diagnoses for Lab  17856-6
No correlated Procedures for Lab  17856-6
No correlated Meds for Lab  17861-6
No correlated Diagnoses for Lab  17861-6
No correlated Meds for Lab  17863-2
No correlated Diagnoses for Lab  17863-2
No correlated Meds for Lab  17864-0
No correlated Diagnoses for Lab  17864-0
No correlated Meds for Lab  19123-9
No correlated Diagnoses for Lab  19123-9
No correlated Meds for Lab  1920-8
No correlated Diagnoses for Lab  1920-8
No correlated Meds for Lab  1975-2
No correlated Diagnoses for Lab  1975-2
No corre

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


No correlated Meds for Lab  1742-6
No correlated Diagnoses for Lab  1742-6
No correlated Meds for Lab  17856-6
No correlated Diagnoses for Lab  17856-6
No correlated Procedures for Lab  17856-6
No correlated Meds for Lab  17861-6
No correlated Diagnoses for Lab  17861-6
No correlated Meds for Lab  17863-2
No correlated Diagnoses for Lab  17863-2
No correlated Procedures for Lab  17863-2
No correlated Meds for Lab  17864-0
No correlated Diagnoses for Lab  17864-0
No correlated Meds for Lab  19123-9
No correlated Diagnoses for Lab  19123-9
No correlated Meds for Lab  1920-8
No correlated Diagnoses for Lab  1920-8
No correlated Meds for Lab  1975-2
No correlated Diagnoses for Lab  1975-2
No correlated Meds for Lab  1995-0
No correlated Diagnoses for Lab  1995-0
No correlated Procedures for Lab  1995-0
No correlated Meds for Lab  2000-8
No correlated Diagnoses for Lab  2000-8
No correlated Meds for Lab  20436-2
No correlated Diagnoses for Lab  20436-2
No correlated Procedures for Lab  2043

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


No correlated Meds for Lab  1558-6
No correlated Diagnoses for Lab  1558-6
No correlated Meds for Lab  1742-6
No correlated Diagnoses for Lab  1742-6
No correlated Meds for Lab  17856-6
No correlated Diagnoses for Lab  17856-6
No correlated Procedures for Lab  17856-6
No correlated Meds for Lab  17861-6
No correlated Diagnoses for Lab  17861-6
No correlated Meds for Lab  17864-0
No correlated Diagnoses for Lab  17864-0
No correlated Procedures for Lab  17864-0
No correlated Meds for Lab  19123-9
No correlated Diagnoses for Lab  19123-9
No correlated Meds for Lab  1920-8
No correlated Diagnoses for Lab  1920-8
No correlated Meds for Lab  1975-2
No correlated Diagnoses for Lab  1975-2
No correlated Meds for Lab  1995-0
No correlated Diagnoses for Lab  1995-0
No correlated Procedures for Lab  1995-0
No correlated Meds for Lab  2000-8
No correlated Diagnoses for Lab  2000-8
No correlated Meds for Lab  20436-2
No correlated Diagnoses for Lab  20436-2
No correlated Procedures for Lab  20436-

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


 2777-1
No correlated Diagnoses for Lab  2777-1
No correlated Meds for Lab  2823-3
No correlated Diagnoses for Lab  2823-3
No correlated Meds for Lab  2885-2
No correlated Diagnoses for Lab  2885-2
No correlated Meds for Lab  2951-2
No correlated Diagnoses for Lab  2951-2
No correlated Meds for Lab  3049-4
No correlated Diagnoses for Lab  3049-4
No correlated Meds for Lab  3094-0
No correlated Diagnoses for Lab  3094-0
No correlated Meds for Lab  33256-9
No correlated Diagnoses for Lab  33256-9
No correlated Procedures for Lab  33256-9
No correlated Meds for Lab  33914-3
No correlated Diagnoses for Lab  33914-3
No correlated Procedures for Lab  33914-3
No correlated Meds for Lab  39789-3
No correlated Diagnoses for Lab  39789-3
No correlated Procedures for Lab  39789-3
No correlated Meds for Lab  41653-7
No correlated Diagnoses for Lab  41653-7
No correlated Meds for Lab  4544-3
No correlated Diagnoses for Lab  4544-3
No correlated Meds for Lab  4548-4
No correlated Diagnoses for Lab  

In [None]:
for ind, male_cohort in males.iterrows():
    (clinicalProfile, labs_counts, labs_frequencyPerYear, labs_fractionOfSubjects, labs_units, labs_names,
           labs_stats, labs_aboveBelowNorm, labs_correlatedLabsCoefficients, labs_abscorrelation,labs_correlatedMedsCoefficients,
    labs_correlatedProceduresCoefficients, labs_correlatedDiagnosisCoefficients,
           diagnoses_code, diagnoses_counts,
            diagnoses_frequencyPerYear, diagnoses_fractionOfSubjects, procedures_code, procedures_counts,
            procedures_frequencyPerYear, procedures_fractionOfSubjects) = calculateProfile(df_demographics, 
                     df_labs, df_diagnoses, df_encounter, df_meds, df_procedures, medianEncounterYear, 
                 gender='Male', race=male_cohort.race, age_low=male_cohort.age_low,
                                                    age_high=male_cohort.age_high)
    

    writeProfile(clinicalProfile, labs_counts, labs_frequencyPerYear, labs_fractionOfSubjects, labs_units, labs_names,
           labs_stats, labs_aboveBelowNorm, labs_correlatedLabsCoefficients, labs_abscorrelation,
                 labs_correlatedMedsCoefficients,labs_correlatedProceduresCoefficients, 
                 labs_correlatedDiagnosisCoefficients, diagnoses_code, diagnoses_counts,
            diagnoses_frequencyPerYear, diagnoses_fractionOfSubjects, procedures_code, procedures_counts,
            procedures_frequencyPerYear, procedures_fractionOfSubjects, 'diabetes', gender='Male', race=male_cohort.race, 
                 age_low=str(male_cohort.age_low),age_high=str(male_cohort.age_high))

In [42]:
with open('diabetes_resources/diabetes-Male-Other-65-74.0.json') as json_file:
    data = json.load(json_file)

In [48]:
data.pop('resourceType')

'ClinicalProfile'

In [46]:
data['resourceType'] = 'ClinicalProfile'

In [13]:
import fhir_loader

In [16]:
fhir_loader.fhir_loader(args=['http://hapi.clinicalprofiles.org/baseR4',
                              r'S:\NCATS\Clinical_Profiles\showson1\diabetes_resources'])

S:\NCATS\Clinical_Profiles\showson1\diabetes_resources\diabetes-Male-Black or African American-45-64.0.json failure: (400) Bad Request
S:\NCATS\Clinical_Profiles\showson1\diabetes_resources\diabetes-Male-Black or African American-All.json failure: (400) Bad Request
S:\NCATS\Clinical_Profiles\showson1\diabetes_resources\diabetes-Male-White or Caucasian-18-44.0.json failure: (400) Bad Request
S:\NCATS\Clinical_Profiles\showson1\diabetes_resources\diabetes-Male-White or Caucasian-75-200.0.json failure: (400) Bad Request
S:\NCATS\Clinical_Profiles\showson1\diabetes_resources\diabetes-Male-White or Caucasian-45-64.0.json failure: (400) Bad Request
S:\NCATS\Clinical_Profiles\showson1\diabetes_resources\diabetes-Male-Black or African American-65-74.0.json failure: (400) Bad Request
S:\NCATS\Clinical_Profiles\showson1\diabetes_resources\diabetes-Male-White or Caucasian-65-74.0.json failure: (400) Bad Request
S:\NCATS\Clinical_Profiles\showson1\diabetes_resources\diabetes-Male-Black or African 

1

In [None]:
ClinicalProfile.

In [330]:
corrmat = pd.DataFrame(labs_correlatedLabsCoefficients).unstack(level=[0,1]).corr()

In [46]:
corrmat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Result_numeric,Result_numeric,Result_numeric,Result_numeric,Result_numeric,Result_numeric,Result_numeric,Result_numeric,Result_numeric,Result_numeric,Result_numeric,Result_numeric,Result_numeric,Result_numeric,Result_numeric,Result_numeric,Result_numeric,Result_numeric,Result_numeric,Result_numeric,Result_numeric
Unnamed: 0_level_1,Unnamed: 1_level_1,Loinc_Code,12962-7,12962-7,12962-7,12962-7,13457-7,13457-7,13457-7,13457-7,14749-6,14749-6,...,788-0,788-0,788-0,788-0,788-0,789-8,789-8,789-8,789-8,789-8
Unnamed: 0_level_2,Unnamed: 1_level_2,orderYear,2015.0,2016.0,2017.0,2018.0,2015.0,2016.0,2017.0,2018.0,2015.0,2016.0,...,2014.0,2015.0,2016.0,2017.0,2018.0,2014.0,2015.0,2016.0,2017.0,2018.0
Unnamed: 0_level_3,Loinc_Code,orderYear,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3
Result_numeric,12962-7,2015.0,1.0,0.794516,0.675569,,,,,,-0.158466,-0.553238,...,,0.134099,0.190168,0.226637,,,-0.35716,-0.355347,-0.333103,
Result_numeric,12962-7,2016.0,0.794516,1.0,0.777654,0.730387,,-0.062948,0.759371,,-0.216095,-0.243302,...,,0.132199,0.244731,0.239149,0.162303,,-0.37963,-0.427639,-0.393235,-0.324591
Result_numeric,12962-7,2017.0,0.675569,0.777654,1.0,0.76681,,-0.29484,0.320304,,-0.471451,-0.240195,...,,0.128389,0.198729,0.240713,0.261824,,-0.286288,-0.406105,-0.424339,-0.396701
Result_numeric,12962-7,2018.0,,0.730387,0.76681,1.0,,,1.0,,,-0.111986,...,,,0.171335,0.208277,0.212639,,,-0.375196,-0.389312,-0.434687
Result_numeric,13457-7,2015.0,,,,,1.0,0.832568,0.766745,,,,...,,0.342558,0.352908,0.053266,,,0.750696,0.468441,0.275227,


In [331]:
corrmat=corrmat.droplevel(level=0).droplevel(level=0,axis=1)

In [332]:
corrmat.head(20)

Unnamed: 0_level_0,Loinc_Code,12962-7,12962-7,12962-7,12962-7,13457-7,13457-7,13457-7,13457-7,14749-6,14749-6,...,788-0,788-0,788-0,788-0,788-0,789-8,789-8,789-8,789-8,789-8
Unnamed: 0_level_1,orderYear,2015.0,2016.0,2017.0,2018.0,2015.0,2016.0,2017.0,2018.0,2015.0,2016.0,...,2014.0,2015.0,2016.0,2017.0,2018.0,2014.0,2015.0,2016.0,2017.0,2018.0
Loinc_Code,orderYear,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
12962-7,2015.0,1.0,0.794516,0.675569,,,,,,-0.158466,-0.553238,...,,0.134099,0.190168,0.226637,,,-0.35716,-0.355347,-0.333103,
12962-7,2016.0,0.794516,1.0,0.777654,0.730387,,-0.062948,0.759371,,-0.216095,-0.243302,...,,0.132199,0.244731,0.239149,0.162303,,-0.37963,-0.427639,-0.393235,-0.324591
12962-7,2017.0,0.675569,0.777654,1.0,0.76681,,-0.29484,0.320304,,-0.471451,-0.240195,...,,0.128389,0.198729,0.240713,0.261824,,-0.286288,-0.406105,-0.424339,-0.396701
12962-7,2018.0,,0.730387,0.76681,1.0,,,1.0,,,-0.111986,...,,,0.171335,0.208277,0.212639,,,-0.375196,-0.389312,-0.434687
13457-7,2015.0,,,,,1.0,0.832568,0.766745,,,,...,,0.342558,0.352908,0.053266,,,0.750696,0.468441,0.275227,
13457-7,2016.0,,-0.062948,-0.29484,,0.832568,1.0,0.493864,0.589956,,,...,,8.3e-05,-0.023279,-0.016576,0.146357,,0.447562,0.08129,0.067646,0.468814
13457-7,2017.0,,0.759371,0.320304,1.0,0.766745,0.493864,1.0,0.874144,,,...,,0.386185,0.154318,0.008304,-0.251732,,0.219473,-0.009105,0.279846,0.339681
13457-7,2018.0,,,,,,0.589956,0.874144,1.0,,,...,,,-0.196318,-0.626851,-0.111222,,,0.059152,0.533298,0.38575
14749-6,2015.0,-0.158466,-0.216095,-0.471451,,,,,,1.0,0.641188,...,,-0.10819,-0.010046,0.086261,,,-0.020999,-0.1384,-0.185873,
14749-6,2016.0,-0.553238,-0.243302,-0.240195,-0.111986,,,,,0.641188,1.0,...,,-0.376322,-0.041573,-0.155266,-0.123724,,0.12566,0.130591,0.270512,0.199875


In [337]:
corrmat.loc['12962-7'].index

Float64Index([2015.0, 2016.0, 2017.0, 2018.0], dtype='float64', name='orderYear')

In [333]:
c1 = corrmat.loc[('12962-7',2015)]
v2015 = c1[c1.index.get_level_values(level=1) == 2015].droplevel(level=1)

c2 = corrmat.loc[('12962-7',2016)]
v2016 = c2[c2.index.get_level_values(level=1) == 2016].droplevel(level=1)

c3 = corrmat.loc[('12962-7',2017)]
v2017 = c3[c3.index.get_level_values(level=1) == 2017].droplevel(level=1)

c4 = corrmat.loc[('12962-7',2018)]
v2018 = c4[c4.index.get_level_values(level=1) == 2018].droplevel(level=1)

In [340]:
pd.DataFrame({'2015':v2015,'2016':v2016,'2017':v2017,'2018':v2018}).apply(np.mean, axis=1).drop('12962-7').nlargest(10).index

Index(['1507-3', '20436-2', '50206-2', '3094-0', '6299-2', '2777-1', '1558-6',
       '19123-9', '39789-3', '5902-2'],
      dtype='object')

In [200]:
df_diagnoses.columns

Index(['PatientID', 'Entry_Date', 'icd_10', 'icd_name', 'hpo', 'hpo_term'], dtype='object')

##### labs_codes

In [20]:
labs_frequencyPerYear

Loinc_Code  orderYear
12962-7     2015.0       3.560377
            2016.0       5.944706
            2017.0       6.895515
            2018.0       5.253977
13457-7     2015.0       1.000000
            2016.0       1.536585
            2017.0       1.511364
            2018.0       1.111111
14749-6     2015.0       3.566667
            2016.0       4.325301
            2017.0       2.000000
1501-6      2016.0       1.000000
1507-3      2016.0       1.000000
15087-0     2016.0       1.000000
1514-9      2016.0       1.000000
1530-5      2016.0       1.000000
1558-6      2016.0       1.138889
            2017.0       1.900000
            2018.0       1.000000
1742-6      2014.0       1.000000
            2015.0       2.036896
            2016.0       3.730369
            2017.0       4.233217
            2018.0       3.230663
17856-6     2015.0       1.000000
            2016.0       1.608108
            2017.0       1.740741
            2018.0       1.071429
17861-6     2014.0       1

In [21]:
labs_fractionOfSubjects 

Loinc_Code
12962-7    0.543091
13457-7    0.008780
14749-6    0.013035
1501-6     0.000068
1507-3     0.000135
15087-0    0.000203
1514-9     0.000068
1530-5     0.000068
1558-6     0.004930
1742-6     0.676347
17856-6    0.007497
17861-6    0.713765
17864-0    0.000675
19123-9    0.213427
1920-8     0.672430
1975-2     0.666892
1995-0     0.003174
2000-8     0.013035
20436-2    0.000203
20438-8    0.000135
20448-7    0.001891
20636-7    0.000068
2085-9     0.532487
2093-3     0.531744
21394-2    0.000068
21395-9    0.000068
2339-0     0.106578
2341-6     0.047683
2345-7     0.700797
25428-4    0.019519
             ...   
41653-7    0.281575
4544-3     0.613873
4548-4     0.475415
4549-2     0.004323
48642-3    0.642442
48643-1    0.701000
50206-2    0.000405
5902-2     0.033095
5905-5     0.416655
5964-2     0.238147
62238-1    0.048156
6298-4     0.009793
6299-2     0.009726
6690-2     0.200257
6768-6     0.671147
704-7      0.004930
706-2      0.495813
713-8      0.498582
714-6    

In [22]:
labs_units

Loinc_Code
12962-7              [mg/dL]
13457-7              [mg/dL]
14749-6              [mg/dL]
1501-6               [mg/dL]
1507-3               [mg/dL]
15087-0             [pmol/L]
1514-9               [mg/dL]
1530-5               [mg/dL]
1558-6               [mg/dL]
1742-6                 [U/L]
17856-6              [% HGB]
17861-6              [mg/dL]
17864-0              [mg/dL]
19123-9              [mg/dL]
1920-8                [IU/L]
1975-2               [mg/dL]
1995-0              [mmol/L]
2000-8               [mg/dL]
20436-2              [mg/dL]
20438-8              [mg/dL]
20448-7             [uIU/mL]
20636-7             [umol/L]
2085-9               [mg/dL]
2093-3               [mg/dL]
21394-2                  [%]
21395-9                  [%]
2339-0               [mg/dL]
2341-6               [mg/dL]
2345-7               [mg/dL]
25428-4                [nan]
                 ...        
41653-7              [mg/dL]
4544-3                   [%]
4548-4                   [%]
454

In [23]:
labs_stats 

Unnamed: 0_level_0,min,max,mean,median,std,10,20,30,40,50,60,70,80,90
Loinc_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
12962-7,1.00,256.00,27.603849,20.00,21.049680,10.000,13.00,15.00,17.00,20.00,24.00,30.00,39.00,56.000
13457-7,17.00,209.00,85.222222,82.00,33.519861,47.000,57.00,66.00,73.00,82.00,91.00,100.00,110.60,127.000
14749-6,36.00,741.00,187.718713,164.00,94.449388,97.000,116.00,130.00,146.00,164.00,189.00,213.00,250.00,302.000
1501-6,199.00,199.00,199.000000,199.00,,199.000,199.00,199.00,199.00,199.00,199.00,199.00,199.00,199.000
1507-3,240.00,248.00,244.000000,244.00,5.656854,240.800,241.60,242.40,243.20,244.00,244.80,245.60,246.40,247.200
15087-0,,,,,,,,,,,,,,
1514-9,196.00,196.00,196.000000,196.00,,196.000,196.00,196.00,196.00,196.00,196.00,196.00,196.00,196.000
1530-5,150.00,150.00,150.000000,150.00,,150.000,150.00,150.00,150.00,150.00,150.00,150.00,150.00,150.000
1558-6,49.00,533.00,150.495327,132.00,76.634685,85.200,94.20,108.80,121.00,132.00,144.60,159.40,177.60,246.800
1742-6,3.00,11310.00,46.494896,24.00,196.803949,11.000,14.00,17.00,20.00,24.00,28.00,34.00,43.00,67.000


In [24]:
labs_aboveBelowNorm

Unnamed: 0_level_0,aboveNorm,belowNorm
Loinc_Code,Unnamed: 1_level_1,Unnamed: 2_level_1
12962-7,0.000000,0.000000
13457-7,0.311419,0.000000
14749-6,0.890346,0.010727
1501-6,1.000000,0.000000
1507-3,1.000000,0.000000
15087-0,0.000000,0.000000
1514-9,1.000000,0.000000
1530-5,1.000000,0.000000
1558-6,0.747664,0.018692
1742-6,0.449559,0.000000


In [25]:
labs_correlatedLabsCoefficients

Loinc_Code,12962-7,13457-7,14749-6,1501-6,1507-3,15087-0,1514-9,1530-5,1558-6,1742-6,...,731-0,736-9,737-7,744-3,777-3,785-6,786-4,787-2,788-0,789-8
Loinc_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12962-7,1.000000,-0.055581,0.035161,,1.0,,,,-0.068888,0.020743,...,-0.269085,-0.087230,-0.013561,-0.047541,0.007685,-0.001462,-0.003409,-0.000741,0.011870,0.010305
13457-7,-0.055581,1.000000,-0.008309,,-1.0,,,,-0.080521,0.081259,...,-0.009641,-0.041775,0.004295,-0.133184,-0.064775,0.025906,0.051309,-0.086771,-0.035549,-0.071983
14749-6,0.035161,-0.008309,1.000000,,1.0,,,,0.095745,-0.001746,...,0.118635,0.071101,-0.002947,0.044364,0.053145,0.004197,-0.088947,0.034674,-0.035769,0.038527
1501-6,,,,,,,,,,,...,,,,,,,,,,
1507-3,1.000000,-1.000000,1.000000,,1.0,,,,1.000000,1.000000,...,-1.000000,1.000000,,-1.000000,1.000000,-1.000000,1.000000,1.000000,-1.000000,-1.000000
15087-0,,,,,,,,,,,...,,,,,,,,,,
1514-9,,,,,,,,,,,...,,,,,,,,,,
1530-5,,,,,,,,,,,...,,,,,,,,,,
1558-6,-0.068888,-0.080521,0.095745,,1.0,,,,1.000000,0.002801,...,0.080810,0.100359,0.095210,0.113871,-0.095739,0.014187,-0.074418,0.016898,0.189401,0.152925
1742-6,0.020743,0.081259,-0.001746,,1.0,,,,0.002801,1.000000,...,-0.020606,-0.020532,-0.011792,0.017662,-0.020986,0.013528,0.033515,-0.008508,-0.001703,0.010882


In [26]:
 labs_abscorrelation

Loinc_Code,12962-7,13457-7,14749-6,1501-6,1507-3,15087-0,1514-9,1530-5,1558-6,1742-6,...,731-0,736-9,737-7,744-3,777-3,785-6,786-4,787-2,788-0,789-8
Loinc_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12962-7,1.000000,0.055581,0.035161,,1.0,,,,0.068888,0.020743,...,0.269085,0.087230,0.013561,0.047541,0.007685,0.001462,0.003409,0.000741,0.011870,0.010305
13457-7,0.055581,1.000000,0.008309,,1.0,,,,0.080521,0.081259,...,0.009641,0.041775,0.004295,0.133184,0.064775,0.025906,0.051309,0.086771,0.035549,0.071983
14749-6,0.035161,0.008309,1.000000,,1.0,,,,0.095745,0.001746,...,0.118635,0.071101,0.002947,0.044364,0.053145,0.004197,0.088947,0.034674,0.035769,0.038527
1501-6,,,,,,,,,,,...,,,,,,,,,,
1507-3,1.000000,1.000000,1.000000,,1.0,,,,1.000000,1.000000,...,1.000000,1.000000,,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
15087-0,,,,,,,,,,,...,,,,,,,,,,
1514-9,,,,,,,,,,,...,,,,,,,,,,
1530-5,,,,,,,,,,,...,,,,,,,,,,
1558-6,0.068888,0.080521,0.095745,,1.0,,,,1.000000,0.002801,...,0.080810,0.100359,0.095210,0.113871,0.095739,0.014187,0.074418,0.016898,0.189401,0.152925
1742-6,0.020743,0.081259,0.001746,,1.0,,,,0.002801,1.000000,...,0.020606,0.020532,0.011792,0.017662,0.020986,0.013528,0.033515,0.008508,0.001703,0.010882


In [27]:
 labs_correlatedMedsCoefficients

Unnamed: 0_level_0,RXNorm,Relative_Counts
Loinc_Code,Unnamed: 1_level_1,Unnamed: 2_level_1
13457-7,6809.0,0.051690
13457-7,161.0,0.047714
13457-7,29046.0,0.031809
13457-7,301542.0,0.027833
13457-7,83367.0,0.025845
13457-7,52175.0,0.025845
13457-7,51428.0,0.023857
13457-7,1191.0,0.021869
13457-7,40790.0,0.021869
13457-7,5487.0,0.019881


In [28]:
 labs_correlatedProceduresCoefficients

Unnamed: 0_level_0,Procedure_Code,Relative_Counts
Loinc_Code,Unnamed: 1_level_1,Unnamed: 2_level_1
13457-7,99214,7.341715e-02
13457-7,99213,4.894477e-02
13457-7,90471,4.011376e-02
13457-7,90688,3.577309e-02
13457-7,99215,3.457566e-02
13457-7,83036,3.427631e-02
13457-7,36415,2.634336e-02
13457-7,93000,2.267625e-02
13457-7,99396,1.983236e-02
13457-7,47135,1.856010e-02


In [29]:
 labs_correlatedDiagnosisCoefficients

Unnamed: 0_level_0,icd_10,Relative_Counts
Loinc_Code,Unnamed: 1_level_1,Unnamed: 2_level_1
13457-7,E78.5,0.120219
13457-7,E66.9,0.076503
13457-7,E55.9,0.065574
13457-7,G47.33,0.054645
13457-7,R07.9,0.043716
13457-7,J30.9,0.043716
13457-7,G47.30,0.032787
13457-7,M25.519,0.021858
13457-7,D22.9,0.021858
13457-7,B35.1,0.021858


In [30]:
 meds_medication

array([3.26750e+04, 2.55100e+03, 1.61000e+02, ..., 4.91498e+05,
       9.44740e+04, 2.18900e+03])

In [31]:
 meds_frequencyPerYear

RXNorm     startYear
38.0       2017.0       2.500000
90.0       2016.0       1.500000
           2017.0       1.000000
99.0       2016.0       1.400000
           2017.0       1.500000
           2018.0       1.000000
103.0      2016.0       2.500000
           2017.0       2.500000
149.0      2015.0       2.000000
161.0      2015.0       1.696517
           2016.0       2.169723
           2017.0       2.339214
           2018.0       2.118842
167.0      2015.0       1.250000
           2016.0       1.755814
           2017.0       1.719008
           2018.0       1.750000
168.0      2016.0       1.000000
           2017.0       1.000000
           2018.0       1.500000
197.0      2015.0       2.000000
           2016.0       1.676923
           2017.0       1.591667
           2018.0       1.580645
211.0      2017.0       1.125000
           2018.0       1.000000
281.0      2015.0       1.000000
           2016.0       2.047619
           2017.0       1.953488
           2018.0     

In [32]:
meds_fractionOfSubjects

RXNorm
38.0         0.000261
44.0         0.000104
90.0         0.003095
94.0         0.000087
99.0         0.000382
103.0        0.000348
110.0        0.000052
149.0        0.000087
161.0        0.304949
167.0        0.005998
168.0        0.001512
197.0        0.004033
211.0        0.000817
281.0        0.007562
296.0        0.001860
376.0        0.000017
388.0        0.002938
430.0        0.000017
435.0        0.164983
448.0        0.000313
519.0        0.032718
569.0        0.000104
596.0        0.026303
598.0        0.000574
612.0        0.029763
620.0        0.000974
644.0        0.000834
658.0        0.000035
689.0        0.001200
703.0        0.024913
               ...   
1792393.0    0.000035
1796083.0    0.000017
1798370.0    0.000017
1799218.0    0.000504
1801082.0    0.000035
1801174.0    0.000035
1801186.0    0.015716
1801322.0    0.000017
1801840.0    0.001843
1808552.0    0.000070
1858267.0    0.000417
1862590.0    0.000052
1865962.0    0.000035
1869699.0    0.000139
187

In [33]:
 meds_correlatedLabsCoefficients

Unnamed: 0_level_0,Loinc_Code,Relative_Counts
RXNorm,Unnamed: 1_level_1,Unnamed: 2_level_1
38.0,2345-7,0.300000
38.0,4548-4,0.300000
38.0,2571-8,0.100000
38.0,17861-6,0.100000
38.0,2951-2,0.100000
38.0,1742-6,0.100000
44.0,718-7,0.098876
44.0,41653-7,0.094382
44.0,789-8,0.080899
44.0,788-0,0.080899


In [34]:
 meds_correlatedDxCoefficients

Unnamed: 0_level_0,icd_10,Relative_Counts
RXNorm,Unnamed: 1_level_1,Unnamed: 2_level_1
38.0,E66.9,0.067568
38.0,E78.5,0.054054
38.0,E55.9,0.040541
38.0,D64.9,0.040541
38.0,G47.33,0.040541
38.0,N91.2,0.027027
38.0,R06.02,0.027027
38.0,E22.1,0.027027
38.0,R32,0.027027
38.0,R10.9,0.027027


In [35]:
diagnoses_code

array(['H20.9', 'R80.9', 'E78.5', 'G47.33', 'R31.0', 'E03.9', 'J40',
       'E66.9', 'L21.9', 'K60.2', 'M54.5', 'M54.2', 'R39.11', 'R51',
       'L03.90', 'E55.9', 'E83.52', 'J42', 'B35.1', 'J30.1', 'G47.30',
       'R21', 'G51.0', 'E73.9', 'M25.569', 'D64.9', 'R10.9', 'N94.6',
       'M25.529', 'J30.9', 'R60.9', 'M10.9', 'I44.2', 'H26.9', 'R49.0',
       'M86.9', 'R31.9', 'G62.9', 'R27.0', 'E23.0', 'I49.5', 'R06.02',
       'M00.9', 'N18.9', 'K56.0', 'G03.9', 'R33.9', 'I49.9', 'R01.1',
       'Q21.1', 'E87.5', 'R05', 'K92.2', 'I86.1', 'Q44.6', 'M72.0',
       'N50.82', 'R79.81', 'D50.9', 'R11.10', 'F70', 'R73.9', 'I47.2',
       'K92.1', 'H40.9', 'E66.3', 'J31.0', 'I49.1', 'J01.90', 'L29.9',
       'R13.10', 'H70.90', 'G47.00', 'H71.90', 'R07.9', 'R55', 'R50.9',
       'R06.83', 'I49.3', 'L57.0', 'J33.9', 'E86.0', 'K59.00', 'R00.2',
       'E21.0', 'K31.84', 'M54.9', 'I44.7', 'H93.19', 'J32.9', 'R32',
       'K02.9', 'R35.0', 'R11.2', 'R19.7', 'I42.9', 'R60.1', 'R25.1',
       'E87.2'

In [36]:
 diagnoses_frequencyPerYear

icd_10   entryYear
A46      2016.0       1.000000
         2017.0       1.000000
         2018.0       1.000000
B00.4    1900.0       1.000000
         2005.0       1.000000
         2013.0       1.000000
         2017.0       1.000000
         2018.0       1.000000
         2019.0       1.000000
B35.1    1899.0       1.000000
         1900.0       1.000000
         2004.0       1.000000
         2005.0       1.000000
         2006.0       1.000000
         2007.0       1.000000
         2008.0       1.000000
         2009.0       1.000000
         2010.0       1.000000
         2011.0       1.000000
         2012.0       1.000000
         2013.0       1.019231
         2014.0       1.014493
         2015.0       1.023077
         2016.0       1.011429
         2017.0       1.009615
         2018.0       1.026549
         2019.0       1.050000
B96.20   2013.0       1.000000
         2014.0       1.000000
         2015.0       1.000000
                        ...   
R82.991  2016.0     

In [37]:
 diagnoses_fractionOfSubjects

icd_10
A46        0.000051
B00.4      0.000136
B35.1      0.034956
B96.20     0.000597
C18.9      0.005459
C22.0      0.003207
C43.9      0.003429
C45.0      0.000068
C45.1      0.000017
C46.9      0.000136
C61        0.027518
C72.9      0.000017
C83.70     0.000068
D18.00     0.000699
D18.1      0.000102
D22.9      0.006090
D25.9      0.013358
D46.1      0.000017
D46.9      0.001552
D47.01     0.000034
D50.9      0.035468
D58.9      0.000239
D59.3      0.000119
D59.5      0.000051
D61.9      0.000563
D64.9      0.112034
D65        0.000119
D68.51     0.001262
D68.9      0.001757
D69.6      0.016275
             ...   
R57.1      0.000205
R57.9      0.000853
R59.1      0.001382
R60.1      0.002235
R60.9      0.047905
R63.0      0.002559
R63.1      0.000461
R63.2      0.000239
R63.3      0.000188
R64        0.000529
R68.3      0.000136
R68.84     0.000631
R70.0      0.001808
R73.9      0.022093
R78.81     0.009844
R79.81     0.000171
R80.3      0.000409
R80.9      0.061587
R81        0.

In [38]:
 procedures_code

array(['92012', '66821', '92083', ..., '27786', '65210', nan],
      dtype=object)

In [39]:
 procedures_frequencyPerYear

Procedure_Code  encounterYear
01996           2015.0            5.571429
                2016.0           19.244898
                2017.0           30.055556
                2018.0           10.500000
0290T           2016.0            4.000000
                2017.0            7.000000
03000           2013.0            1.000000
                2014.0            1.000000
                2015.0            4.655172
                2016.0           16.380407
                2017.0           23.381776
                2018.0           11.300683
0346T           2015.0            1.500000
                2016.0           19.428571
                2017.0           15.555556
                2018.0            2.750000
0387T           2016.0           48.000000
                2017.0           52.000000
                2018.0           22.000000
0391T           2016.0           24.000000
                2017.0           26.000000
                2018.0           11.000000
0392T           2015.0  

In [40]:
procedures_fractionOfSubjects

Procedure_Code
01996     0.003985
0290T     0.000068
03000     0.064096
0346T     0.000608
0387T     0.000068
0391T     0.000068
0392T     0.000068
0449T     0.000135
10021     0.000203
10022     0.003512
10030     0.000743
10040     0.000135
10060     0.003850
10061     0.003512
10080     0.000068
10120     0.000338
10121     0.000203
10140     0.001283
10160     0.001418
10180     0.000338
11000     0.000203
11004     0.000270
11005     0.000203
11008     0.000135
11012     0.000135
11042     0.006889
11043     0.002972
11044     0.003985
11045     0.000810
11046     0.001013
            ...   
Q4038     0.000270
Q4046     0.000068
Q9967     0.019249
S0189     0.000135
S0280     0.003647
S0281     0.002161
S0515     0.000135
S2900     0.002567
S8450     0.000068
S9986     0.001148
STOPG     0.001418
UCODE     0.000270
UNBILL    0.001013
V2615     0.000068
V2718     0.000203
V2787     0.000405
V2788     0.000068
V2799     0.000135
V5010     0.001216
V5011     0.000135
V5014     0.0002