# Generate HAI dataframe

In [1]:
import pandas as pd
import warnings
from IPython.utils import io
import sys
import numpy as np
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

main_dir = '/Users/kenlocey/Desktop/Rush/CMS_HospitalArchives/'


## Define Custom Functions

In [2]:

def check_lists(lists):
    for i, ls in enumerate(lists):
        for i2, ls2 in enumerate(lists):
            for i3 in ls:
                if i3 not in ls2:
                    print('\n')
                    print(i3 + ': NOT FOUND IN')
                    print(ls2)
                    sys.exit()
                        
                        
def curate(df):

    try:
        df = df[df['Facility ID'] != np.nan]
        df['Facility ID'] = df['Facility ID'].values.astype(str)
        
        ids = df['Facility ID'].tolist()
        ids2 = []
        for i in ids:
            if len(i) < 6:
                i = '0' + i
            ids2.append(i)
        df['Facility ID'] = ids2
        
    except:
        pass
    try:
        df = df[df['Facility Name'] != np.nan]
    except:
        pass
    
    for c in list(df):    
        try:
            df[c] = df[c].str.replace("\t","")
        except:
            pass

    return df


def rename_and_fill(df):
    
    cols1 = ['Provider ID', "Measure Start Date", "Measure End Date",
             'Hospital Name', 'Address 1', 
             ]
    
    
    cols2 = ['Facility ID', "Start Date", "End Date",
             'Facility Name', 'Address',
             ]
    
    for i, col in enumerate(cols1):
        if col in list(df):
            df.rename(columns={col: cols2[i]}, inplace=True)
            
            
    cols = [
            ]
    
    for col in cols:
        if col not in list(df):
            df[col] = float('NaN')
    
    l = list(df)
    l = list(set([x for x in l if l.count(x) > 1]))
    if len(l) > 0:
        print('duplicates:', l)
        sys.exit()
        
    return df


def process2(df, lists, yr, mo):
    df = rename_and_fill(df)
    df = curate(df)
    lists.append(list(df))
    df['file_month'] = [mo]* df.shape[0]
    df['file_year'] = [yr]* df.shape[0]
    df = df.reindex(sorted(df.columns), axis=1)
    return df, lists


## Load HAI Files

In [3]:

df_list = []
lists = []

yrs = ['2023', '2023',
       '2022', '2022', '2022',
       '2021','2021','2021', '2021', '2021',
       '2020', '2020', '2020', '2020', 
       '2019', '2019', '2019', '2019', 
       '2018', '2018', '2018', '2018',
       '2017', '2017', '2017',
       '2016', '2016', '2016', '2016',
       '2015', '2015', '2015', '2015', '2015', '2015',
       '2014', '2014', '2014',
       '2013', '2013', '2013',
       ]

mos = ['01', '04',
       '01', '04', '07',
       '01', '03', '04', '07', '10', 
       '10', '07', '04', '01', 
       '10', '07', '04', '03', 
       '10', '07', '05', '01',
       '10', '07', '04',
       '12', '11', '08', '05',
       '12', '10', '07', '05', '04', '01',
       '12', '10', '07',
       '10', '07', '04',
       ]

subdirs = ['2023/hospitals_01_2023/Healthcare_Associated_Infections-Hospital.csv', 
           '2023/hospitals_04_2023/Healthcare_Associated_Infections-Hospital.csv',
           
           '2022/hospitals_01_2022/Healthcare_Associated_Infections-Hospital.csv', 
           '2022/hospitals_04_2022/Healthcare_Associated_Infections-Hospital.csv',
           '2022/hospitals_07_2022/Healthcare_Associated_Infections-Hospital.csv',
           
           '2021/hospitals_01_2021/Healthcare_Associated_Infections-Hospital.csv',
           '2021/hospitals_03_2021/Healthcare_Associated_Infections-Hospital.csv',
           '2021/hospitals_04_2021/Healthcare_Associated_Infections-Hospital.csv',
           '2021/hospitals_07_2021/Healthcare_Associated_Infections-Hospital.csv',
           '2021/hospitals_10_2021/Healthcare_Associated_Infections-Hospital.csv',
           
           '2020/hospitals_archive_10_2020/Healthcare_Associated_Infections_Hospital.csv',
           '2020/hospitals_archive_07_2020/Healthcare_Associated_Infections_Hospital.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200422/Healthcare Associated Infections - Hospital.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200129/Healthcare Associated Infections - Hospital.csv',
           
           '2019/HOSArchive_Revised_Flatfiles_20191030/Healthcare Associated Infections - Hospital.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190702/Healthcare Associated Infections - Hospital.csv',
           '2019/HOSArchive_Revised_FlatFiles_20190424/Healthcare Associated Infections - Hospital.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190321/Healthcare Associated Infections - Hospital.csv',
           
           '2018/HOSArchive_Revised_FlatFiles_20181031/Healthcare Associated Infections - Hospital.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180725/Healthcare Associated Infections - Hospital.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180523/Healthcare Associated Infections - Hospital.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180126/Healthcare Associated Infections - Hospital.csv',
           
           '2017/HOSArchive_Revised_FlatFiles_20171024/Healthcare Associated Infections - Hospital.csv',
           '2017/HOSArchive_Revised_FlatFiles_20170726/Healthcare Associated Infections - Hospital.csv',
           '2017/HOSArchive_Revised_Flatfiles_20170428/Healthcare Associated Infections - Hospital.csv',
           
           '2016/HOSArchive_Revised_Flatfiles_20161219/Healthcare Associated Infections - Hospital.csv',
           '2016/Hospital_Revised_FlatFiles_20161110/Healthcare Associated Infections - Hospital.csv',
           '2016/HOSArchive_Revised_FlatFiles_20160810/Healthcare Associated Infections - Hospital.csv',
           '2016/HOSArchive_Revised_FlatFiles_20160504/Healthcare Associated Infections - Hospital.csv',
           
           '2015/HOSArchive_Revised_FlatFiles_20151210/Healthcare Associated Infections - Hospital.csv',
           '2015/HOSArchive_Revised_FlatFiles_20151008/Healthcare Associated Infections - Hospital.csv',
           '2015/HOSArchive_Revised_FlatFiles_20150716/Healthcare Associated Infections - Hospital.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150506/Healthcare Associated Infections - Hospital.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150416/Healthcare Associated Infections - Hospital.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150122/Healthcare Associated Infections - Hospital.csv',
           
           '2014/HOSArchive_Revised_Flatfiles_20141218/Healthcare Associated Infections - Hospital.csv',
           '2014/HOSArchive_Revised_Flatfiles_20141023/Healthcare Associated Infections - Hospital.csv',
           '2014/HOSArchive_Revised_Flatfiles_20140717/Healthcare Associated Infections - Hospital.csv',
           
           '2013/HOSArchive_Revised_Flatfiles_20131001/Healthcare_Associated_Infections.csv',
           '2013/HOSArchive_Revised_Flatfiles_20130701/Healthcare_Associated_Infections.csv',
           '2013/HOSArchive_Revised_Flatfiles_20130401/Healthcare_Associated_Infections.csv',
           
           ]

for i, subdir in enumerate(subdirs):
    with io.capture_output() as captured: df = pd.read_csv(main_dir + subdir, encoding = "ISO-8859-1")
    print(subdir + ' :  (rows, columns) =', df.shape)
    
    if yrs[i] == '2013':
        df.drop(labels=['Address 2', 'Address 3'], axis=1, inplace=True)
        df.rename(columns={'Address 1': 'Address',
                           'Measure': 'Measure Name'}, inplace=True)
        
        df["Measure ID"] = ['Not given']*df.shape[0]
        df['Compared to National'] = ['Not given']*df.shape[0]
        
        if mos[i] == '04':
            
            '''
            Central Line Associated Bloodstream Infection		07/01/2011		06/30/2012
            Catheter Associated Urinary Tract Infections		    01/01/2012		06/30/2012
            SSI - Colon Surgery		                            01/01/2012		06/30/2012
            SSI - Abdominal Hysterectomy		                    01/01/2012	    06/30/2012
            '''
            
            ls = ['Central-Line-Associated Blood Stream Infections (CLABSI)',
                  'CLABSI Compared to National',
                  'CLABSI Lower Confidence Limit',
                  'CLABSI Observed Cases',
                  'CLABSI Predicted Cases',
                  'CLABSI Upper Confidence Limit',
                  'CLABSI Central Line Days',
                  ]
            clabsi_df = df[df['Measure Name'].isin(ls)]
            clabsi_df["Start Date"] = ['07/01/2011']*clabsi_df.shape[0]
            clabsi_df["End Date"] = ['06/30/2012']*clabsi_df.shape[0]
            
            ls = ['Catheter-Associated Urinary Tract Infections (CAUTI)',
                  'CAUTI Compared to National',
                  'CAUTI Lower Confidence Limit',
                  'CAUTI Observed Cases',
                  'CAUTI Predicted Cases',
                  'CAUTI Upper Confidence Limit',
                  'CAUTI Urinary Catheter Days',
                  ]
            cauti_df = df[df['Measure Name'].isin(ls)]
            cauti_df["Start Date"] = ['01/01/2012']*cauti_df.shape[0]
            cauti_df["End Date"] = ['06/30/2012']*cauti_df.shape[0]
            
            ls = ['Surgical Site Infection from colon surgery (SSI: Colon)',
                  'SSI: Colon Compared to National',
                  'SSI: Colon Lower Confidence Limit',
                  'SSI: Colon Observed Cases',
                  'SSI: Colon Predicted Cases',
                  'SSI: Colon Upper Confidence Limit',
                  'SSI: Colon, Number of Procedures',
                ]
            ssi_df1 = df[df['Measure Name'].isin(ls)]
            ssi_df1["Start Date"] = ['01/01/2011']*ssi_df1.shape[0]
            ssi_df1["End Date"] = ['06/30/2012']*ssi_df1.shape[0]
            
            ls = ['Surgical Site Infection from abdominal hysterectomy (SSI: Hysterectomy)',
                  'SSI: Abdominal Lower Confidence Limit',
                  'SSI: Abdominal Observed Cases',
                  'SSI: Abdominal Predicted Cases',
                  'SSI: Abdominal Upper Confidence Limit',
                  'SSI: Abdominal, Number of Procedures',
                  'SSI: Hysterectomy Compared to National',
                ]
            ssi_df2 = df[df['Measure Name'].isin(ls)]
            ssi_df2["Start Date"] = ['01/01/2011']*ssi_df2.shape[0]
            ssi_df2["End Date"] = ['06/30/2012']*ssi_df2.shape[0]
            
            df = pd.concat([clabsi_df, cauti_df, ssi_df1, ssi_df2])
            
            
        elif mos[i] == '07':
            
            '''
            Central Line Associated Bloodstream Infection		10/01/2011		09/30/2012
            Catheter Associated Urinary Tract Infections		    01/01/2012		09/30/2012
            SSI - Colon Surgery		                            01/01/2012		09/30/2012
            SSI - Abdominal Hysterectomy		                    01/01/2012	    09/30/2012
            '''
            
            #df["Measure Start Date"] = []*df.shape[0]
            #df["Measure End Date"] = []*df.shape[0]
            
            ls = ['Central-Line-Associated Blood Stream Infections (CLABSI)',
                  'CLABSI Compared to National',
                  'CLABSI Lower Confidence Limit',
                  'CLABSI Observed Cases',
                  'CLABSI Predicted Cases',
                  'CLABSI Upper Confidence Limit',
                  'CLABSI Central Line Days',
                  ]
            clabsi_df = df[df['Measure Name'].isin(ls)]
            clabsi_df["Start Date"] = ['10/01/2011']*clabsi_df.shape[0]
            clabsi_df["End Date"] = ['09/30/2012']*clabsi_df.shape[0]
            
            ls = ['Catheter-Associated Urinary Tract Infections (CAUTI)',
                  'CAUTI Compared to National',
                  'CAUTI Lower Confidence Limit',
                  'CAUTI Observed Cases',
                  'CAUTI Predicted Cases',
                  'CAUTI Upper Confidence Limit',
                  'CAUTI Urinary Catheter Days',
                  ]
            cauti_df = df[df['Measure Name'].isin(ls)]
            cauti_df["Start Date"] = ['01/01/2012']*cauti_df.shape[0]
            cauti_df["End Date"] = ['09/30/2012']*cauti_df.shape[0]
            
            ls = ['Surgical Site Infection from colon surgery (SSI: Colon)',
                  'SSI: Colon Compared to National',
                  'SSI: Colon Lower Confidence Limit',
                  'SSI: Colon Observed Cases',
                  'SSI: Colon Predicted Cases',
                  'SSI: Colon Upper Confidence Limit',
                  'SSI: Colon, Number of Procedures',
                ]
            ssi_df1 = df[df['Measure Name'].isin(ls)]
            ssi_df1["Start Date"] = ['01/01/2012']*ssi_df1.shape[0]
            ssi_df1["End Date"] = ['09/30/2012']*ssi_df1.shape[0]
            
            ls = ['Surgical Site Infection from abdominal hysterectomy (SSI: Hysterectomy)',
                  'SSI: Abdominal Lower Confidence Limit',
                  'SSI: Abdominal Observed Cases',
                  'SSI: Abdominal Predicted Cases',
                  'SSI: Abdominal Upper Confidence Limit',
                  'SSI: Abdominal, Number of Procedures',
                  'SSI: Hysterectomy Compared to National',
                ]
            ssi_df2 = df[df['Measure Name'].isin(ls)]
            ssi_df2["Start Date"] = ['01/01/2012']*ssi_df2.shape[0]
            ssi_df2["End Date"] = ['09/30/2012']*ssi_df2.shape[0]
            
            df = pd.concat([clabsi_df, cauti_df, ssi_df1, ssi_df2])
            
            
        if mos[i] == '10':
            
            '''
            Central Line Associated Bloodstream Infection		01/01/2012		12/31/2012
            Catheter Associated Urinary Tract Infections		    01/01/2012		12/31/2012
            SSI - Colon Surgery		                            01/01/2012		12/31/2012
            SSI - Abdominal Hysterectomy		                    01/01/2012	    12/31/2012
            '''

            ls = ['Central-Line-Associated Blood Stream Infections (CLABSI)',
                  'CLABSI Compared to National',
                  'CLABSI Lower Confidence Limit',
                  'CLABSI Observed Cases',
                  'CLABSI Predicted Cases',
                  'CLABSI Upper Confidence Limit',
                  'CLABSI Central Line Days',
                  ]
            clabsi_df = df[df['Measure Name'].isin(ls)]
            clabsi_df["Start Date"] = ['01/01/2012']*clabsi_df.shape[0]
            clabsi_df["End Date"] = ['12/31/2012']*clabsi_df.shape[0]
            
            ls = ['Catheter-Associated Urinary Tract Infections (CAUTI)',
                  'CAUTI Compared to National',
                  'CAUTI Lower Confidence Limit',
                  'CAUTI Observed Cases',
                  'CAUTI Predicted Cases',
                  'CAUTI Upper Confidence Limit',
                  'CAUTI Urinary Catheter Days',
                  ]
            cauti_df = df[df['Measure Name'].isin(ls)]
            cauti_df["Start Date"] = ['01/01/2012']*cauti_df.shape[0]
            cauti_df["End Date"] = ['12/31/2012']*cauti_df.shape[0]
            
            ls = ['Surgical Site Infection from colon surgery (SSI: Colon)',
                  'SSI: Colon Compared to National',
                  'SSI: Colon Lower Confidence Limit',
                  'SSI: Colon Observed Cases',
                  'SSI: Colon Predicted Cases',
                  'SSI: Colon Upper Confidence Limit',
                  'SSI: Colon, Number of Procedures',
                ]
            ssi_df1 = df[df['Measure Name'].isin(ls)]
            ssi_df1["Start Date"] = ['01/01/2012']*ssi_df1.shape[0]
            ssi_df1["End Date"] = ['12/31/2012']*ssi_df1.shape[0]
            
            ls = ['Surgical Site Infection from abdominal hysterectomy (SSI: Hysterectomy)',
                  'SSI: Abdominal Lower Confidence Limit',
                  'SSI: Abdominal Observed Cases',
                  'SSI: Abdominal Predicted Cases',
                  'SSI: Abdominal Upper Confidence Limit',
                  'SSI: Abdominal, Number of Procedures',
                  'SSI: Hysterectomy Compared to National',
                ]
            ssi_df2 = df[df['Measure Name'].isin(ls)]
            ssi_df2["Start Date"] = ['01/01/2012']*ssi_df2.shape[0]
            ssi_df2["End Date"] = ['12/31/2012']*ssi_df2.shape[0]
            
            df = pd.concat([clabsi_df, cauti_df, ssi_df1, ssi_df2])
            
            
            
    df, lists = process2(df, lists, yrs[i], mos[i])
    df_list.append(df)

check_lists(lists)
subdir = 'HAI/CombinedFiles_HAI'

hai_df = pd.concat(df_list)
print('hai_df.shape:', hai_df.shape)

2023/hospitals_01_2023/Healthcare_Associated_Infections-Hospital.csv :  (rows, columns) = (174528, 15)
2023/hospitals_04_2023/Healthcare_Associated_Infections-Hospital.csv :  (rows, columns) = (174420, 15)
2022/hospitals_01_2022/Healthcare_Associated_Infections-Hospital.csv :  (rows, columns) = (174528, 15)
2022/hospitals_04_2022/Healthcare_Associated_Infections-Hospital.csv :  (rows, columns) = (174528, 15)
2022/hospitals_07_2022/Healthcare_Associated_Infections-Hospital.csv :  (rows, columns) = (174348, 15)
2021/hospitals_01_2021/Healthcare_Associated_Infections-Hospital.csv :  (rows, columns) = (175752, 15)
2021/hospitals_03_2021/Healthcare_Associated_Infections-Hospital.csv :  (rows, columns) = (175752, 15)
2021/hospitals_04_2021/Healthcare_Associated_Infections-Hospital.csv :  (rows, columns) = (176832, 15)
2021/hospitals_07_2021/Healthcare_Associated_Infections-Hospital.csv :  (rows, columns) = (175176, 15)
2021/hospitals_10_2021/Healthcare_Associated_Infections-Hospital.csv :  (

In [4]:
hai_df = hai_df.filter(items=['Facility ID', 'Start Date', 'End Date', #'Compared to National',
                              'Footnote', 'Measure ID', 'Measure Name', 'Score',  'file_month', 
                              'file_year'], axis=1)

hai_df = hai_df[~hai_df['Measure ID'].isin(['Not given'])]
print(hai_df.shape)

(6751920, 9)


In [5]:
d = {
     'CAUTI: Number of Urinary Catheter Days': 'CAUTI Urinary Catheter Days',
     'CAUTI Number of Urinary Catheter Days': 'CAUTI Urinary Catheter Days',
     'Catheter Associated Urinary Tract Infections (ICU + select Wards): Number of Urinary Catheter Days': 'CAUTI Urinary Catheter Days',                                                 
     'CAUTI Lower Confidence Limit': 'CAUTI lower CL',
     'CAUTI: Lower Confidence Limit': 'CAUTI lower CL',
     'Catheter Associated Urinary Tract Infections (ICU + select Wards): Lower Confidence Limit': 'CAUTI lower CL',                                                
     'CAUTI Upper Confidence Limit': 'CAUTI upper CL',
     'CAUTI: Upper Confidence Limit': 'CAUTI upper CL',
     'Catheter Associated Urinary Tract Infections (ICU + select Wards): Upper Confidence Limit': 'CAUTI upper CL',                                      
     'Catheter-Associated Urinary Tract Infections (CAUTI)': 'CAUTI (SIR)',
     'Catheter-associated urinary tract infections (CAUTI) in ICUs and select wards': 'CAUTI (SIR)',
     'Catheter Associated Urinary Tract Infections (ICU + select Wards)': 'CAUTI (SIR)',
     'Catheter-Associated Urinary Tract Infections (CAUTI) in ICUs only': 'CAUTI SIR (ICUs only)',                                          
     'CAUTI: Observed Cases': 'CAUTI Observed Cases',
     'Catheter Associated Urinary Tract Infections (ICU + select Wards): Observed Cases': 'CAUTI Observed Cases',                                                 
     'CAUTI: Observed Cases': 'CAUTI Observed Cases',
     'Catheter Associated Urinary Tract Infections (ICU + select Wards): Observed Cases': 'CAUTI Observed Cases',                                             
     'Catheter Associated Urinary Tract Infections (ICU + select Wards): Predicted Cases': 'CAUTI Predicted Cases',
     'CAUTI: Predicted Cases': 'CAUTI Predicted Cases',                                                 
     'CAUTI: Number of Procedures': 'CAUTI Number of Procedures',

    
     'Central Line Associated Bloodstream Infection: Number of Device Days': 'CLABSI Number of Device Days',
     'CLABSI: Number of Device Days': 'CLABSI Number of Device Days',
     'CLABSI Central Line Days': 'CLABSI Number of Device Days',                                               
     'CLABSI Lower Confidence Limit': 'CLABSI lower CL',
     'CLABSI: Lower Confidence Limit': 'CLABSI lower CL',
     'Central Line Associated Bloodstream Infection (ICU + select Wards): Lower Confidence Limit': 'CLABSI lower CL',                                                 
     'CLABSI Upper Confidence Limit': 'CLABSI upper CL',
     'CLABSI: Upper Confidence Limit': 'CLABSI upper CL',
     'Central Line Associated Bloodstream Infection (ICU + select Wards): Upper Confidence Limit': 'CLABSI upper CL',                                   
     'Central line-associated blood stream infections (CLABSI) in ICUs only': 'CLABSI SIR (ICUs only)',
     'Central line-associated bloodstream infections (CLABSI) in ICUs only': 'CLABSI SIR (ICUs only)',
     'Central Line Associated Bloodstream Infection (ICU + select Wards)': 'CLABSI (SIR)',
     'Central line-associated bloodstream infections (CLABSI) in ICUs and select wards': 'CLABSI (SIR)',
     'Central line-associated blood stream infections (CLABSI)': 'CLABSI (SIR)',
     'Central-Line-Associated Blood Stream Infections (CLABSI)': 'CLABSI (SIR)',
     'CLABSI: Observed Cases': 'CLABSI Observed Cases',
     'Central Line Associated Bloodstream Infection (ICU + select Wards): Observed Cases': 'CLABSI Observed Cases',                                                 
     'Central Line Associated Bloodstream Infection (ICU + select Wards): Predicted Cases': 'CLABSI Predicted Cases',
     'CLABSI: Predicted Cases': 'CLABSI Predicted Cases',                                                
     'CLABSI: Number of Procedures': 'CLABSI Number of Procedures',
     
     
     'MRSA Lower Confidence Limit': 'MRSA lower CL',
     'MRSA Bacteremia: Lower Confidence Limit': 'MRSA lower CL',
     'Methicillin-resistant Staphylococcus Aureus (MRSA) Blood Laboratory-identified Events (Bloodstream infections)': 'MRSA (SIR)',
     'Methicillin-resistant Staphylococcus Aureus (MRSA) blood infections': 'MRSA (SIR)',
     'MRSA Bacteremia': 'MRSA (SIR)',
     'MRSA Observed Cases': 'MRSA Observed Cases', 
     'MRSA Bacteremia: Observed Cases': 'MRSA Observed Cases',
     'MRSA Predicted Cases': 'MRSA Predicted Cases', 
     'MRSA Bacteremia: Predicted Cases': 'MRSA Predicted Cases',
     'MRSA Upper Confidence Limit': 'MRSA upper CL', 
     'MRSA Bacteremia: Upper Confidence Limit': 'MRSA upper CL',
     'MRSA Bacteremia: Patient Days': 'MRSA patient days',
     'MRSA Patient Days': 'MRSA patient days',
     
     
     'Clostridium Difficile (C.Diff): Observed Cases': 'CDIFF Observed Cases',
     'C.diff Observed Cases': 'CDIFF Observed Cases',
     'Clostridium Difficile (C.Diff)': 'CDIFF (SIR)',
     'Clostridium difficile (C.diff.) Laboratory-identified Events (Intestinal infections)': 'CDIFF (SIR)',
     'Clostridium difficile (C.diff.) intestinal infections': 'CDIFF (SIR)',
     'Clostridium Difficile (C.Diff): Patient Days': 'CDIFF patient days',
     'C.diff Patient Days': 'CDIFF patient days',
     'C.diff Upper Confidence Limit': 'CDIFF upper CL',
     'Clostridium Difficile (C.Diff): Upper Confidence Limit': 'CDIFF upper CL',
     'Clostridium Difficile (C.Diff): Lower Confidence Limit': 'CDIFF lower CL',
     'C.diff Lower Confidence Limit': 'CDIFF lower CL',
     'C.diff Predicted Cases': 'CDIFF Predicted Cases',
     'Clostridium Difficile (C.Diff): Predicted Cases': 'CDIFF Predicted Cases',
     
     
     'Surgical Site Infection from abdominal hysterectomy (SSI: Hysterectomy)': 'SSI Abdominal Hysterectomy (SIR)',
     'Surgical site infections (SSI) from abdominal hysterectomy': 'SSI Abdominal Hysterectomy (SIR)',
     'SSI - Abdominal Hysterectomy': 'SSI Abdominal Hysterectomy (SIR)',
     'SSI - Abdominal Hysterectomy: Number of Procedures': 'SSI Abdominal Number of Procedures',
     'SSI - Abdominal Hysterectomy: Observed Cases': 'SSI Abdominal Observed Cases', 
     'SSI - Abdominal Hysterectomy: Predicted Cases': 'SSI Abdominal Predicted Cases', 
     'SSI - Abdominal Hysterectomy: Upper Confidence Limit': 'SSI Abdominal upper CL', 
     'SSI - Abdominal Hysterectomy: Lower Confidence Limit': 'SSI Abdominal lower CL',
     'SSI: Abdominal Lower Confidence Limit': 'SSI Abdominal lower CL', 
     'SSI: Abdominal Observed Cases': 'SSI Abdominal Observed Cases',  
     'SSI: Abdominal Predicted Cases': 'SSI Abdominal Predicted Cases',  
     'SSI: Abdominal Upper Confidence Limit': 'SSI Abdominal upper CL', 
     'SSI: Abdominal, Number of Procedures': 'SSI Abdominal Number of Procedures',
     
     'Surgical Site Infection from colon surgery (SSI: Colon)': 'SSI Colon Surgery (SIR)',
     'Surgical site infections (SSI) from colon surgery':  'SSI Colon Surgery (SIR)',
     'SSI - Colon Surgery': 'SSI Colon Surgery (SIR)', 
     'SSI - Colon Surgery: Lower Confidence Limit': 'SSI Colon lower CL',
     'SSI - Colon Surgery: Number of Procedures': 'SSI Colon Number of Procedures', 
     'SSI - Colon Surgery: Observed Cases': 'SSI Colon Observed Cases', 
     'SSI - Colon Surgery: Predicted Cases': 'SSI Colon Predicted Cases', 
     'SSI - Colon Surgery: Upper Confidence Limit': 'SSI Colon upper CL', 
     'SSI: Colon Lower Confidence Limit':'SSI Colon lower CL', 
     'SSI: Colon Observed Cases': 'SSI Colon Observed Cases', 
     'SSI: Colon Predicted Cases': 'SSI Colon Predicted Cases', 
     'SSI: Colon Upper Confidence Limit': 'SSI Colon upper CL',  
     'SSI: Colon, Number of Procedures': 'SSI Colon, Number of Procedures',
     }


hai_df['Measure Name'].replace(to_replace=d, inplace=True)



d = {'HAI-1-DOPC-DAYS': 'HAI-1 DOPC', 
     'HAI-1-ELIGCASES': 'HAI-1 ELIG CASES', 
     'HAI-1-NUMERATOR': 'HAI-1 NUMERATOR', 
     'HAI-1-SIR': 'HAI-1 SIR',
     'HAI_1_DOPC': 'HAI-1 DOPC',
     'HAI_1_DOPC_DAYS': 'HAI-1 DOPC', 
     'HAI_1_ELIGCASES': 'HAI-1 ELIG CASES', 
     'HAI_1_NUMERATOR': 'HAI-1 NUMERATOR',
     'HAI_1_SIR': 'HAI-1 SIR', 
     'HAI_1_CILOWER': 'HAI-1 CI LOWER',
     'HAI_1_CIUPPER': 'HAI-1 CI UPPER',
     'HAI_1_CI_LOWER': 'HAI-1 CI LOWER',
     'HAI_1_CI_UPPER': 'HAI-1 CI UPPER',
     'HAI-1-CI-LOWER': 'HAI-1 CI LOWER',
     'HAI-1-CI-UPPER': 'HAI-1 CI UPPER',
     
     'HAI_1a_CI_LOWER': 'HAI-1a CI LOWER',
     'HAI_1a_CI_UPPER': 'HAI-1a CI UPPER',
     'HAI_1a_DOPC_DAYS': 'HAI-1a DOPC', 
     'HAI_1a_ELIGCASES': 'HAI-1a ELIG CASES',  
     'HAI_1a_NUMERATOR': 'HAI-1a NUMERATOR',  
     'HAI_1a_SIR': 'HAI-1a SIR',  
     
     'HAI_2a_CI_LOWER': 'HAI-2a CI LOWER',
     'HAI_2a_CI_UPPER': 'HAI-2a CI UPPER',
     'HAI_2a_DOPC_DAYS': 'HAI-2a DOPC',
     'HAI_2a_ELIGCASES': 'HAI-2a ELIG CASES', 
     'HAI_2a_NUMERATOR': 'HAI-2a NUMERATOR', 
     'HAI_2a_SIR': 'HAI-2a SIR', 

     'HAI-2-DOPC-DAYS': 'HAI-2 DOPC', 
     'HAI-2-ELIGCASES': 'HAI-2 ELIG CASES', 
     'HAI-2-NUMERATOR': 'HAI-2 NUMERATOR', 
     'HAI-2-SIR': 'HAI-2 SIR', 
     'HAI_2_DOPC': 'HAI-2 DOPC',  
     'HAI_2_DOPC_DAYS': 'HAI-2 DOPC',  
     'HAI_2_ELIGCASES': 'HAI-2 ELIG CASES', 
     'HAI_2_NUMERATOR': 'HAI-2 NUMERATOR', 
     'HAI_2_SIR': 'HAI-2 SIR',
     'HAI_2_CILOWER': 'HAI-2 CI LOWER',
     'HAI_2_CIUPPER': 'HAI-2 CI UPPER',
     'HAI_2_CI_LOWER': 'HAI-2 CI LOWER',
     'HAI_2_CI_UPPER': 'HAI-2 CI UPPER',
     'HAI-2-CI-LOWER': 'HAI-2 CI LOWER',
     'HAI-2-CI-UPPER': 'HAI-2 CI UPPER',
     
     'HAI-3-CI-LOWER': 'HAI-3 CI LOWER', 
     'HAI-3-CI-UPPER': 'HAI-3 CI UPPER', 
     'HAI-3-DOPC-DAYS': 'HAI-3 DOPC', 
     'HAI-3-ELIGCASES': 'HAI-3 ELIG CASES', 
     'HAI-3-NUMERATOR': 'HAI-3 NUMERATOR', 
     'HAI-3-SIR': 'HAI-3 SIR', 
     'HAI_3_CILOWER': 'HAI-3 CI LOWER', 
     'HAI_3_CI_LOWER': 'HAI-3 CI LOWER', 
     'HAI_3_CIUPPER': 'HAI-3 CI UPPER', 
     'HAI_3_CI_UPPER': 'HAI-3 CI UPPER', 
     'HAI_3_DOPC': 'HAI-3 DOPC', 
     'HAI_3_DOPC_DAYS': 'HAI-3 DOPC', 
     'HAI_3_ELIGCASES': 'HAI-3 ELIG CASES', 
     'HAI_3_NUMERATOR': 'HAI-3 NUMERATOR', 
     'HAI_3_SIR': 'HAI-3 SIR', 
     
     'HAI-4-CI-LOWER': 'HAI-4 CI LOWER', 
     'HAI-4-CI-UPPER': 'HAI-4 CI UPPER', 
     'HAI_4_CI_LOWER': 'HAI-4 CI LOWER', 
     'HAI_4_CI_UPPER': 'HAI-4 CI UPPER', 
     'HAI_4_CILOWER': 'HAI-4 CI LOWER', 
     'HAI_4_CIUPPER': 'HAI-4 CI UPPER', 
     'HAI_4_DOPC': 'HAI-4 DOPC', 
     'HAI_4_DOPC_DAYS': 'HAI-4 DOPC', 
     'HAI-4-DOPC-DAYS': 'HAI-4 DOPC', 
     'HAI-4-ELIGCASES': 'HAI-4 ELIG CASES', 
     'HAI_4_ELIGCASES': 'HAI-4 ELIG CASES', 
     'HAI-4-NUMERATOR': 'HAI-4 NUMERATOR', 
     'HAI_4_NUMERATOR': 'HAI-4 NUMERATOR', 
     'HAI-4-SIR': 'HAI-4 SIR',  
     'HAI_4_SIR': 'HAI-4 SIR',  
     
     'HAI-5-CI-LOWER': 'HAI-5 CI LOWER', 
     'HAI-5-CI-UPPER': 'HAI-5 CI UPPER', 
     'HAI_5_CI_LOWER': 'HAI-5 CI LOWER', 
     'HAI_5_CI_UPPER': 'HAI-5 CI UPPER', 
     'HAI_5_CILOWER': 'HAI-5 CI LOWER', 
     'HAI_5_CIUPPER': 'HAI-5 CI UPPER', 
     'HAI-5-DOPC-DAYS': 'HAI-5 DOPC',
     'HAI-5-ELIGCASES': 'HAI-5 ELIG CASES', 
     'HAI-5-NUMERATOR': 'HAI-5 NUMERATOR', 
     'HAI-5-SIR': 'HAI-5 SIR', 
     'HAI_5_DOPC': 'HAI-5 DOPC',
     'HAI_5_DOPC_DAYS': 'HAI-5 DOPC',
     'HAI_5_ELIGCASES': 'HAI-5 ELIG CASES', 
     'HAI_5_NUMERATOR': 'HAI-5 NUMERATOR', 
     'HAI_5_SIR': 'HAI-5 SIR',  
     
     'HAI-6-CI-LOWER': 'HAI-6 CI LOWER', 
     'HAI-6-CI-UPPER': 'HAI-6 CI UPPER', 
     'HAI_6_CI_LOWER': 'HAI-6 CI LOWER', 
     'HAI_6_CI_UPPER': 'HAI-6 CI UPPER', 
     'HAI_6_CILOWER': 'HAI-6 CI LOWER', 
     'HAI_6_CIUPPER': 'HAI-6 CI UPPER', 
     'HAI-6-DOPC-DAYS': 'HAI-6 DOPC', 
     'HAI-6-ELIGCASES': 'HAI-6 ELIG CASES', 
     'HAI-6-NUMERATOR': 'HAI-6 NUMERATOR', 
     'HAI-6-SIR': 'HAI-6 SIR',  
     'HAI_6_DOPC': 'HAI-6 DOPC', 
     'HAI_6_DOPC_DAYS': 'HAI-6 DOPC', 
     'HAI_6_ELIGCASES': 'HAI-6 ELIG CASES',  
     'HAI_6_NUMERATOR': 'HAI-6 NUMERATOR',  
     'HAI_6_SIR': 'HAI-6 SIR', 
    } 

hai_df['Measure ID'].replace(to_replace=d, inplace=True)
hai_df['name_id'] = hai_df['Measure ID'] + ': ' + hai_df['Measure Name']
measures = sorted(hai_df['name_id'].unique())

to_drop = []
for m in measures:
    if 'LOWER' in m or 'UPPER' in m or '1a' in m or '2a' in m:
        to_drop.append(m)

hai_df = hai_df[~hai_df['name_id'].isin(to_drop)]
hai_df.drop(labels=['name_id', 'Footnote', 'Start Date', 'End Date'], axis=1, inplace=True)
hai_df = hai_df.filter(items = ['Facility ID',
                                'file_month', 
                                'file_year',
                                #'Measure ID',
                                'Measure Name',
                                'Score'], axis=1)
hai_df.head()


Unnamed: 0,Facility ID,file_month,file_year,Measure Name,Score
2,10001,1,2023,CLABSI Number of Device Days,10024.0
3,10001,1,2023,CLABSI Predicted Cases,10.597
4,10001,1,2023,CLABSI Observed Cases,7.0
5,10001,1,2023,CLABSI (SIR),0.661
8,10001,1,2023,CAUTI Urinary Catheter Days,17731.0


In [6]:
measures = sorted(hai_df['Measure Name'].unique())
print(len(measures), 'measure names:')
print(measures, '\n')

hai_df.head()

27 measure names:
['CAUTI (SIR)', 'CAUTI Number of Procedures', 'CAUTI Observed Cases', 'CAUTI Predicted Cases', 'CAUTI Urinary Catheter Days', 'CDIFF (SIR)', 'CDIFF Observed Cases', 'CDIFF Predicted Cases', 'CDIFF patient days', 'CLABSI (SIR)', 'CLABSI Number of Device Days', 'CLABSI Number of Procedures', 'CLABSI Observed Cases', 'CLABSI Predicted Cases', 'MRSA (SIR)', 'MRSA Observed Cases', 'MRSA Predicted Cases', 'MRSA patient days', 'SSI Abdominal Hysterectomy (SIR)', 'SSI Abdominal Number of Procedures', 'SSI Abdominal Observed Cases', 'SSI Abdominal Predicted Cases', 'SSI Colon Number of Procedures', 'SSI Colon Observed Cases', 'SSI Colon Predicted Cases', 'SSI Colon Surgery (SIR)', 'SSI Colon, Number of Procedures'] 



Unnamed: 0,Facility ID,file_month,file_year,Measure Name,Score
2,10001,1,2023,CLABSI Number of Device Days,10024.0
3,10001,1,2023,CLABSI Predicted Cases,10.597
4,10001,1,2023,CLABSI Observed Cases,7.0
5,10001,1,2023,CLABSI (SIR),0.661
8,10001,1,2023,CAUTI Urinary Catheter Days,17731.0


In [7]:
#hais = ['HAI-1 ', 'HAI-2 ', 'HAI-3', 'HAI-4', 'HAI-5', 'HAI-6']
hais = ['CLABSI', 'CAUTI', 'MRSA', 'CDI', 'SSI C', 'SSI A']
hai_df2 = pd.DataFrame(columns=['Facility ID', 'file_month', 'file_year'])
main_df = pd.DataFrame(columns=['Facility ID', 'file_month', 'file_year'])


In [8]:
for i, hai in enumerate(hais):
    
    tdf = hai_df.loc[hai_df['Measure Name'].str.startswith(hai, na=False)]
    measures = sorted(tdf['Measure Name'].unique())
    
    hai_df2 = pd.DataFrame(columns=['Facility ID', 'file_month', 'file_year'])

    for j, m in enumerate(measures):
    
        tdf2 = tdf[tdf['Measure Name'] == m]
        for n in list(tdf2):
            if n == 'Measure Name' or n in ['Facility ID', 'file_month', 'file_year']:
                continue

            else:
                tdf2[n] = pd.to_numeric(tdf2[n], errors='coerce')
                tdf2.rename(columns={n: m}, inplace=True)
        
        tdf2.drop(labels=['Measure Name'], axis=1, inplace=True)
        
        hai_df2 = hai_df2.merge(tdf2, on=['Facility ID', 'file_month', 
                                          'file_year'], how='outer')
    
    main_df = main_df.merge(hai_df2, on=['Facility ID', 'file_month', 
                                          'file_year'], how='outer')



In [9]:
del hai_df2
hai_df = main_df.copy(deep=True)
del main_df

print(hai_df.shape)
hai_df = hai_df.loc[:, ~hai_df.T.duplicated(keep='first')]
hai_df.dropna(how='all', axis=1, inplace=True)
print(hai_df.shape)
hai_df.head()


(181311, 30)
(181311, 30)


Unnamed: 0,Facility ID,file_month,file_year,CLABSI (SIR),CLABSI Number of Device Days,CLABSI Number of Procedures,CLABSI Observed Cases,CLABSI Predicted Cases,CAUTI (SIR),CAUTI Number of Procedures,CAUTI Observed Cases,CAUTI Predicted Cases,CAUTI Urinary Catheter Days,MRSA (SIR),MRSA Observed Cases,MRSA Predicted Cases,MRSA patient days,CDIFF (SIR),CDIFF Observed Cases,CDIFF Predicted Cases,CDIFF patient days,SSI Colon Number of Procedures,SSI Colon Observed Cases,SSI Colon Predicted Cases,SSI Colon Surgery (SIR),"SSI Colon, Number of Procedures",SSI Abdominal Hysterectomy (SIR),SSI Abdominal Number of Procedures,SSI Abdominal Observed Cases,SSI Abdominal Predicted Cases
0,10001,1,2023,0.661,10024.0,,7.0,10.597,0.3,,8.0,26.63,17731.0,0.85,8.0,9.412,101908.0,0.66,48.0,72.686,101451.0,154.0,5.0,4.548,1.099,,0.0,200.0,0.0,1.845
1,10005,1,2023,3.673,3713.0,,9.0,2.45,1.201,,6.0,4.995,8670.0,0.0,0.0,1.999,38413.0,0.858,9.0,10.484,35686.0,88.0,3.0,2.512,1.194,,,38.0,0.0,0.316
2,10006,1,2023,0.757,7318.0,,6.0,7.924,0.196,,3.0,15.296,11755.0,1.441,6.0,4.164,62709.0,0.088,2.0,22.618,54159.0,91.0,1.0,2.523,0.396,,,35.0,0.0,0.373
3,10007,1,2023,,268.0,,0.0,0.165,,,2.0,0.774,1417.0,,0.0,0.132,5484.0,0.466,1.0,2.148,5413.0,6.0,0.0,0.152,,,,,,
4,10008,1,2023,,14.0,,0.0,0.008,,,0.0,0.265,488.0,,0.0,0.051,2171.0,,0.0,0.398,2171.0,,,,,,,,,


In [10]:
for i, n in enumerate(list(hai_df)):
    if n in ['Facility ID', 'file_month', 'file_year']:
        continue
            
    hai_df.rename(columns={n: 'HAI: ' + n}, inplace=True)
    
print(hai_df.shape)
hai_df.head()

(181311, 30)


Unnamed: 0,Facility ID,file_month,file_year,HAI: CLABSI (SIR),HAI: CLABSI Number of Device Days,HAI: CLABSI Number of Procedures,HAI: CLABSI Observed Cases,HAI: CLABSI Predicted Cases,HAI: CAUTI (SIR),HAI: CAUTI Number of Procedures,HAI: CAUTI Observed Cases,HAI: CAUTI Predicted Cases,HAI: CAUTI Urinary Catheter Days,HAI: MRSA (SIR),HAI: MRSA Observed Cases,HAI: MRSA Predicted Cases,HAI: MRSA patient days,HAI: CDIFF (SIR),HAI: CDIFF Observed Cases,HAI: CDIFF Predicted Cases,HAI: CDIFF patient days,HAI: SSI Colon Number of Procedures,HAI: SSI Colon Observed Cases,HAI: SSI Colon Predicted Cases,HAI: SSI Colon Surgery (SIR),"HAI: SSI Colon, Number of Procedures",HAI: SSI Abdominal Hysterectomy (SIR),HAI: SSI Abdominal Number of Procedures,HAI: SSI Abdominal Observed Cases,HAI: SSI Abdominal Predicted Cases
0,10001,1,2023,0.661,10024.0,,7.0,10.597,0.3,,8.0,26.63,17731.0,0.85,8.0,9.412,101908.0,0.66,48.0,72.686,101451.0,154.0,5.0,4.548,1.099,,0.0,200.0,0.0,1.845
1,10005,1,2023,3.673,3713.0,,9.0,2.45,1.201,,6.0,4.995,8670.0,0.0,0.0,1.999,38413.0,0.858,9.0,10.484,35686.0,88.0,3.0,2.512,1.194,,,38.0,0.0,0.316
2,10006,1,2023,0.757,7318.0,,6.0,7.924,0.196,,3.0,15.296,11755.0,1.441,6.0,4.164,62709.0,0.088,2.0,22.618,54159.0,91.0,1.0,2.523,0.396,,,35.0,0.0,0.373
3,10007,1,2023,,268.0,,0.0,0.165,,,2.0,0.774,1417.0,,0.0,0.132,5484.0,0.466,1.0,2.148,5413.0,6.0,0.0,0.152,,,,,,
4,10008,1,2023,,14.0,,0.0,0.008,,,0.0,0.265,488.0,,0.0,0.051,2171.0,,0.0,0.398,2171.0,,,,,,,,,


In [11]:
hai_df.to_pickle('dataframes/hai_df.pkl.gz', protocol=5, compression='gzip')