# Generate HAI dataframe

In [None]:
import pandas as pd
import warnings
from IPython.utils import io
import sys
import numpy as np
import time 

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Let pandas display frames without truncating columns or rows.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

# Root folder that holds the downloaded CMS hospital archive files.
main_dir = '~/Desktop/Rush/CMS_HospitalArchives/'


## Define Custom Functions

In [None]:
def curate(df):
    """Clean the identifier and text columns of one archive table.

    Rows with a missing 'Facility ID' or 'Facility Name' are removed,
    facility IDs are converted to strings and left-padded with zeros to six
    characters, and tab characters are stripped from every text column.
    Either key column may be absent, in which case its step is skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table as read from an archive CSV (after column renaming).

    Returns
    -------
    pandas.DataFrame
        A cleaned copy; the frame passed in is left untouched.
    """
    # NaN never compares equal to anything, so `column != np.nan` is True for
    # every row and filters nothing; missing keys must be removed with dropna.
    key_cols = [c for c in ('Facility ID', 'Facility Name') if c in df.columns]
    df = df.dropna(subset=key_cols).copy() if key_cols else df.copy()

    if 'Facility ID' in df.columns:
        # IDs parsed as numbers lose their leading zeros; restore them.
        df['Facility ID'] = df['Facility ID'].astype(str).str.zfill(6)

    for c in list(df):
        try:
            df[c] = df[c].str.replace("\t", "", regex=False)
        except AttributeError:
            # The .str accessor is unavailable: the column holds no text.
            continue

    return df


def rename_and_fill(df):
    """Rename older archive column headers to the current names.

    The frame is renamed in place and also returned.  If any column name
    occurs more than once after renaming, the repeated names are printed and
    ``sys.exit()`` is called.
    """
    old_to_new = {
        'Provider ID': 'Facility ID',
        "Measure Start Date": "Start Date",
        "Measure End Date": "End Date",
        'Hospital Name': 'Facility Name',
        'Address 1': 'Address',
        'City/Town': 'City',
        'County/Parish': 'County Name',
        'Telephone Number': 'Phone Number',
    }

    # Only rename the old headers this particular file actually uses.
    applicable = {old: new for old, new in old_to_new.items() if old in df.columns}
    df.rename(columns=applicable, inplace=True)

    # Collect every label that appears more than once among the columns.
    labels = df.columns
    repeated = list(set(labels[labels.duplicated(keep=False)]))
    if repeated:
        print('duplicates:', repeated)
        sys.exit()
    return df


def process2(df, lists, yr, mo):
    """Standardise one archive table and tag it with its file date.

    The table is passed through ``rename_and_fill`` and ``curate``; its
    column names at that point are appended to ``lists``.  Constant
    'file_month' and 'file_year' columns are then added and the columns are
    put in sorted order.

    Returns the processed frame together with the (mutated) ``lists``.
    """
    cleaned = curate(rename_and_fill(df))
    # Record the column names before the file-date columns are added.
    lists.append(list(cleaned))

    n_rows = cleaned.shape[0]
    cleaned['file_month'] = [mo] * n_rows
    cleaned['file_year'] = [yr] * n_rows

    ordered = sorted(cleaned.columns)
    return cleaned.reindex(columns=ordered), lists

## Load HAI Files

In [None]:
# Accumulators filled by the loading loop: one processed frame per file, and
# the column names each file had after cleaning.
df_list = []
lists = []

# yrs, mos and subdirs are parallel lists: entry i of each one describes the
# same archive file, so they must keep the same length (44) and order.

# Release year of each file.
yrs = ['2023', '2023', '2023', '2023',
       '2022', '2022', '2022', '2022',
       '2021', '2021', '2021', '2021', '2021',
       '2020', '2020', '2020', '2020', 
       '2019', '2019', '2019', '2019', 
       '2018', '2018', '2018', '2018',
       '2017', '2017', '2017',
       '2016', '2016', '2016', '2016',
       '2015', '2015', '2015', '2015', '2015', '2015',
       '2014', '2014', '2014',
       '2013', '2013', '2013',
       ]

# Release month of each file (matches the month in the folder name below).
mos = ['01', '04', '07', '10',
       '01', '04', '07', '10',
       '01', '03', '04', '07', '10', 
       '10', '07', '04', '01', 
       '10', '07', '04', '03', 
       '10', '07', '05', '01',
       '10', '07', '04',
       '12', '11', '08', '05',
       '12', '10', '07', '05', '04', '01',
       '12', '10', '07',
       '10', '07', '04',
       ]

# Path of each HAI file relative to main_dir; folder and file naming
# conventions differ between years.
subdirs = ['2023/hospitals_01_2023/Healthcare_Associated_Infections-Hospital.csv', 
           '2023/hospitals_04_2023/Healthcare_Associated_Infections-Hospital.csv',
           '2023/hospitals_07_2023/Healthcare_Associated_Infections-Hospital.csv',
           '2023/hospitals_10_2023/Healthcare_Associated_Infections-Hospital.csv',
           
           '2022/hospitals_01_2022/Healthcare_Associated_Infections-Hospital.csv', 
           '2022/hospitals_04_2022/Healthcare_Associated_Infections-Hospital.csv',
           '2022/hospitals_07_2022/Healthcare_Associated_Infections-Hospital.csv',
           '2022/hospitals_10_2022/Healthcare_Associated_Infections-Hospital.csv',
           
           '2021/hospitals_01_2021/Healthcare_Associated_Infections-Hospital.csv',
           '2021/hospitals_03_2021/Healthcare_Associated_Infections-Hospital.csv',
           '2021/hospitals_04_2021/Healthcare_Associated_Infections-Hospital.csv',
           '2021/hospitals_07_2021/Healthcare_Associated_Infections-Hospital.csv',
           '2021/hospitals_10_2021/Healthcare_Associated_Infections-Hospital.csv',
           
           '2020/hospitals_archive_10_2020/Healthcare_Associated_Infections_Hospital.csv',
           '2020/hospitals_archive_07_2020/Healthcare_Associated_Infections_Hospital.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200422/Healthcare Associated Infections - Hospital.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200129/Healthcare Associated Infections - Hospital.csv',
           
           '2019/HOSArchive_Revised_Flatfiles_20191030/Healthcare Associated Infections - Hospital.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190702/Healthcare Associated Infections - Hospital.csv',
           '2019/HOSArchive_Revised_FlatFiles_20190424/Healthcare Associated Infections - Hospital.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190321/Healthcare Associated Infections - Hospital.csv',
           
           '2018/HOSArchive_Revised_FlatFiles_20181031/Healthcare Associated Infections - Hospital.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180725/Healthcare Associated Infections - Hospital.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180523/Healthcare Associated Infections - Hospital.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180126/Healthcare Associated Infections - Hospital.csv',
           
           '2017/HOSArchive_Revised_FlatFiles_20171024/Healthcare Associated Infections - Hospital.csv',
           '2017/HOSArchive_Revised_FlatFiles_20170726/Healthcare Associated Infections - Hospital.csv',
           '2017/HOSArchive_Revised_Flatfiles_20170428/Healthcare Associated Infections - Hospital.csv',
           
           '2016/HOSArchive_Revised_Flatfiles_20161219/Healthcare Associated Infections - Hospital.csv',
           '2016/Hospital_Revised_FlatFiles_20161110/Healthcare Associated Infections - Hospital.csv',
           '2016/HOSArchive_Revised_FlatFiles_20160810/Healthcare Associated Infections - Hospital.csv',
           '2016/HOSArchive_Revised_FlatFiles_20160504/Healthcare Associated Infections - Hospital.csv',
           
           '2015/HOSArchive_Revised_FlatFiles_20151210/Healthcare Associated Infections - Hospital.csv',
           '2015/HOSArchive_Revised_FlatFiles_20151008/Healthcare Associated Infections - Hospital.csv',
           '2015/HOSArchive_Revised_FlatFiles_20150716/Healthcare Associated Infections - Hospital.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150506/Healthcare Associated Infections - Hospital.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150416/Healthcare Associated Infections - Hospital.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150122/Healthcare Associated Infections - Hospital.csv',
           
           '2014/HOSArchive_Revised_Flatfiles_20141218/Healthcare Associated Infections - Hospital.csv',
           '2014/HOSArchive_Revised_Flatfiles_20141023/Healthcare Associated Infections - Hospital.csv',
           '2014/HOSArchive_Revised_Flatfiles_20140717/Healthcare Associated Infections - Hospital.csv',
           
           '2013/HOSArchive_Revised_Flatfiles_20131001/Healthcare_Associated_Infections.csv',
           '2013/HOSArchive_Revised_Flatfiles_20130701/Healthcare_Associated_Infections.csv',
           '2013/HOSArchive_Revised_Flatfiles_20130401/Healthcare_Associated_Infections.csv',
           
           ]

# Read every archive file, standardise it and collect the result.
for subdir, yr, mo in zip(subdirs, yrs, mos):
    # Anything the CSV reader prints is captured and discarded.
    with io.capture_output() as captured:
        df = pd.read_csv(main_dir + subdir, encoding="ISO-8859-1")
    print(subdir + ' :  (rows, columns) =', df.shape)
    df, lists = process2(df, lists, yr, mo)
    df_list.append(df)

subdir = 'HAI/CombinedFiles_HAI'
df = pd.concat(df_list)

print('df.shape:', df.shape)
# Discard rows whose score is missing or marked as unavailable.
no_score = df['Score'].isin([np.nan, float("NaN"), 'Not Available'])
df = df[~no_score]
print('df.shape:', df.shape)
df = df.drop(columns=['Footnote']).drop_duplicates()
print('df.shape:', df.shape)

print(sorted(df.columns))
df.head()


In [None]:
# Keep only the columns needed downstream (names absent from df are ignored).
wanted_columns = [
    'Facility ID', 'Facility Name', 'file_month', 'file_year',
    'Measure ID', 'Measure Name', 'Start Date', 'End Date',
    'Score', 'Higher Estimate', 'Lower Estimate',
    'Compared to National', 'Denominator', 'Footnote',
]
df = df.filter(items=wanted_columns, axis=1)

# Drop rows whose measure ID or measure name is missing or a placeholder.
placeholders = [np.nan, float("NaN"), 'Not given']
unusable = df['Measure ID'].isin(placeholders) | df['Measure Name'].isin(placeholders)
df = df[~unusable]

# Map every historical spelling of a measure name to one canonical label.
# (Keys that were listed twice in the literal have been removed; a repeated
# key in a dict literal silently overwrites the earlier entry.)
d = {
     # Catheter-associated urinary tract infections (CAUTI)
     'CAUTI: Number of Urinary Catheter Days': 'CAUTI Urinary Catheter Days',
     'CAUTI Number of Urinary Catheter Days': 'CAUTI Urinary Catheter Days',
     'Catheter Associated Urinary Tract Infections (ICU + select Wards): Number of Urinary Catheter Days': 'CAUTI Urinary Catheter Days',
     'CAUTI Lower Confidence Limit': 'CAUTI lower CL',
     'CAUTI: Lower Confidence Limit': 'CAUTI lower CL',
     'Catheter Associated Urinary Tract Infections (ICU + select Wards): Lower Confidence Limit': 'CAUTI lower CL',
     'CAUTI Upper Confidence Limit': 'CAUTI upper CL',
     'CAUTI: Upper Confidence Limit': 'CAUTI upper CL',
     'Catheter Associated Urinary Tract Infections (ICU + select Wards): Upper Confidence Limit': 'CAUTI upper CL',
     'Catheter-Associated Urinary Tract Infections (CAUTI)': 'CAUTI (SIR)',
     'Catheter-associated urinary tract infections (CAUTI) in ICUs and select wards': 'CAUTI (SIR)',
     'Catheter Associated Urinary Tract Infections (ICU + select Wards)': 'CAUTI (SIR)',
     'Catheter-Associated Urinary Tract Infections (CAUTI) in ICUs only': 'CAUTI SIR (ICUs only)',
     'CAUTI: Observed Cases': 'CAUTI Observed Cases',
     'Catheter Associated Urinary Tract Infections (ICU + select Wards): Observed Cases': 'CAUTI Observed Cases',
     'Catheter Associated Urinary Tract Infections (ICU + select Wards): Predicted Cases': 'CAUTI Predicted Cases',
     'CAUTI: Predicted Cases': 'CAUTI Predicted Cases',
     'CAUTI: Number of Procedures': 'CAUTI Number of Procedures',

     # Central line-associated bloodstream infections (CLABSI)
     'Central Line Associated Bloodstream Infection: Number of Device Days': 'CLABSI Number of Device Days',
     'CLABSI: Number of Device Days': 'CLABSI Number of Device Days',
     'CLABSI Central Line Days': 'CLABSI Number of Device Days',
     'CLABSI Lower Confidence Limit': 'CLABSI lower CL',
     'CLABSI: Lower Confidence Limit': 'CLABSI lower CL',
     'Central Line Associated Bloodstream Infection (ICU + select Wards): Lower Confidence Limit': 'CLABSI lower CL',
     'CLABSI Upper Confidence Limit': 'CLABSI upper CL',
     'CLABSI: Upper Confidence Limit': 'CLABSI upper CL',
     'Central Line Associated Bloodstream Infection (ICU + select Wards): Upper Confidence Limit': 'CLABSI upper CL',
     'Central line-associated blood stream infections (CLABSI) in ICUs only': 'CLABSI SIR (ICUs only)',
     'Central line-associated bloodstream infections (CLABSI) in ICUs only': 'CLABSI SIR (ICUs only)',
     'Central Line Associated Bloodstream Infection (ICU + select Wards)': 'CLABSI (SIR)',
     'Central line-associated bloodstream infections (CLABSI) in ICUs and select wards': 'CLABSI (SIR)',
     'Central line-associated blood stream infections (CLABSI)': 'CLABSI (SIR)',
     'Central-Line-Associated Blood Stream Infections (CLABSI)': 'CLABSI (SIR)',
     'CLABSI: Observed Cases': 'CLABSI Observed Cases',
     'Central Line Associated Bloodstream Infection (ICU + select Wards): Observed Cases': 'CLABSI Observed Cases',
     'Central Line Associated Bloodstream Infection (ICU + select Wards): Predicted Cases': 'CLABSI Predicted Cases',
     'CLABSI: Predicted Cases': 'CLABSI Predicted Cases',
     'CLABSI: Number of Procedures': 'CLABSI Number of Procedures',

     # Methicillin-resistant Staphylococcus aureus (MRSA)
     'MRSA Lower Confidence Limit': 'MRSA lower CL',
     'MRSA Bacteremia: Lower Confidence Limit': 'MRSA lower CL',
     'Methicillin-resistant Staphylococcus Aureus (MRSA) Blood Laboratory-identified Events (Bloodstream infections)': 'MRSA (SIR)',
     'Methicillin-resistant Staphylococcus Aureus (MRSA) blood infections': 'MRSA (SIR)',
     'MRSA Bacteremia': 'MRSA (SIR)',
     'MRSA Observed Cases': 'MRSA Observed Cases',
     'MRSA Bacteremia: Observed Cases': 'MRSA Observed Cases',
     'MRSA Predicted Cases': 'MRSA Predicted Cases',
     'MRSA Bacteremia: Predicted Cases': 'MRSA Predicted Cases',
     'MRSA Upper Confidence Limit': 'MRSA upper CL',
     'MRSA Bacteremia: Upper Confidence Limit': 'MRSA upper CL',
     'MRSA Bacteremia: Patient Days': 'MRSA patient days',
     'MRSA Patient Days': 'MRSA patient days',

     # Clostridium difficile (C.diff)
     'Clostridium Difficile (C.Diff): Observed Cases': 'CDIFF Observed Cases',
     'C.diff Observed Cases': 'CDIFF Observed Cases',
     'Clostridium Difficile (C.Diff)': 'CDIFF (SIR)',
     'Clostridium difficile (C.diff.) Laboratory-identified Events (Intestinal infections)': 'CDIFF (SIR)',
     'Clostridium difficile (C.diff.) intestinal infections': 'CDIFF (SIR)',
     'Clostridium Difficile (C.Diff): Patient Days': 'CDIFF patient days',
     'C.diff Patient Days': 'CDIFF patient days',
     'C.diff Upper Confidence Limit': 'CDIFF upper CL',
     'Clostridium Difficile (C.Diff): Upper Confidence Limit': 'CDIFF upper CL',
     'Clostridium Difficile (C.Diff): Lower Confidence Limit': 'CDIFF lower CL',
     'C.diff Lower Confidence Limit': 'CDIFF lower CL',
     'C.diff Predicted Cases': 'CDIFF Predicted Cases',
     'Clostridium Difficile (C.Diff): Predicted Cases': 'CDIFF Predicted Cases',

     # Surgical site infections: abdominal hysterectomy
     'Surgical Site Infection from abdominal hysterectomy (SSI: Hysterectomy)': 'SSI Abdominal Hysterectomy (SIR)',
     'Surgical site infections (SSI) from abdominal hysterectomy': 'SSI Abdominal Hysterectomy (SIR)',
     'SSI - Abdominal Hysterectomy': 'SSI Abdominal Hysterectomy (SIR)',
     'SSI - Abdominal Hysterectomy: Number of Procedures': 'SSI Abdominal Number of Procedures',
     'SSI - Abdominal Hysterectomy: Observed Cases': 'SSI Abdominal Observed Cases',
     'SSI - Abdominal Hysterectomy: Predicted Cases': 'SSI Abdominal Predicted Cases',
     'SSI - Abdominal Hysterectomy: Upper Confidence Limit': 'SSI Abdominal upper CL',
     'SSI - Abdominal Hysterectomy: Lower Confidence Limit': 'SSI Abdominal lower CL',
     'SSI: Abdominal Lower Confidence Limit': 'SSI Abdominal lower CL',
     'SSI: Abdominal Observed Cases': 'SSI Abdominal Observed Cases',
     'SSI: Abdominal Predicted Cases': 'SSI Abdominal Predicted Cases',
     'SSI: Abdominal Upper Confidence Limit': 'SSI Abdominal upper CL',
     'SSI: Abdominal, Number of Procedures': 'SSI Abdominal Number of Procedures',

     # Surgical site infections: colon surgery
     'Surgical Site Infection from colon surgery (SSI: Colon)': 'SSI Colon Surgery (SIR)',
     'Surgical site infections (SSI) from colon surgery':  'SSI Colon Surgery (SIR)',
     'SSI - Colon Surgery': 'SSI Colon Surgery (SIR)',
     'SSI - Colon Surgery: Lower Confidence Limit': 'SSI Colon lower CL',
     'SSI - Colon Surgery: Number of Procedures': 'SSI Colon Number of Procedures',
     'SSI - Colon Surgery: Observed Cases': 'SSI Colon Observed Cases',
     'SSI - Colon Surgery: Predicted Cases': 'SSI Colon Predicted Cases',
     'SSI - Colon Surgery: Upper Confidence Limit': 'SSI Colon upper CL',
     'SSI: Colon Lower Confidence Limit':'SSI Colon lower CL',
     'SSI: Colon Observed Cases': 'SSI Colon Observed Cases',
     'SSI: Colon Predicted Cases': 'SSI Colon Predicted Cases',
     'SSI: Colon Upper Confidence Limit': 'SSI Colon upper CL',
     'SSI: Colon, Number of Procedures': 'SSI Colon Number of Procedures',
     }

# Assign the result back to the column. Calling replace(..., inplace=True) on
# df['Measure Name'] acts on a temporary Series and does not update df when
# pandas Copy-on-Write is active.
df['Measure Name'] = df['Measure Name'].replace(to_replace=d)


# Map every historical spelling of a measure ID (hyphen/underscore variants,
# with or without a separator in "CI_LOWER"/"CILOWER", "DOPC"/"DOPC_DAYS")
# to one canonical ID.
d = {'HAI-1-DOPC-DAYS': 'HAI-1 DOPC',
     'HAI-1-ELIGCASES': 'HAI-1 ELIG CASES',
     'HAI-1-NUMERATOR': 'HAI-1 NUMERATOR',
     'HAI-1-SIR': 'HAI-1 SIR',
     'HAI_1_DOPC': 'HAI-1 DOPC',
     'HAI_1_DOPC_DAYS': 'HAI-1 DOPC',
     'HAI_1_ELIGCASES': 'HAI-1 ELIG CASES',
     'HAI_1_NUMERATOR': 'HAI-1 NUMERATOR',
     'HAI_1_SIR': 'HAI-1 SIR',
     'HAI_1_CILOWER': 'HAI-1 CI LOWER',
     'HAI_1_CIUPPER': 'HAI-1 CI UPPER',
     'HAI_1_CI_LOWER': 'HAI-1 CI LOWER',
     'HAI_1_CI_UPPER': 'HAI-1 CI UPPER',
     'HAI-1-CI-LOWER': 'HAI-1 CI LOWER',
     'HAI-1-CI-UPPER': 'HAI-1 CI UPPER',

     'HAI_1a_CI_LOWER': 'HAI-1a CI LOWER',
     'HAI_1a_CI_UPPER': 'HAI-1a CI UPPER',
     'HAI_1a_DOPC_DAYS': 'HAI-1a DOPC',
     'HAI_1a_ELIGCASES': 'HAI-1a ELIG CASES',
     'HAI_1a_NUMERATOR': 'HAI-1a NUMERATOR',
     'HAI_1a_SIR': 'HAI-1a SIR',

     'HAI_2a_CI_LOWER': 'HAI-2a CI LOWER',
     'HAI_2a_CI_UPPER': 'HAI-2a CI UPPER',
     'HAI_2a_DOPC_DAYS': 'HAI-2a DOPC',
     'HAI_2a_ELIGCASES': 'HAI-2a ELIG CASES',
     'HAI_2a_NUMERATOR': 'HAI-2a NUMERATOR',
     'HAI_2a_SIR': 'HAI-2a SIR',

     'HAI-2-DOPC-DAYS': 'HAI-2 DOPC',
     'HAI-2-ELIGCASES': 'HAI-2 ELIG CASES',
     'HAI-2-NUMERATOR': 'HAI-2 NUMERATOR',
     'HAI-2-SIR': 'HAI-2 SIR',
     'HAI_2_DOPC': 'HAI-2 DOPC',
     'HAI_2_DOPC_DAYS': 'HAI-2 DOPC',
     'HAI_2_ELIGCASES': 'HAI-2 ELIG CASES',
     'HAI_2_NUMERATOR': 'HAI-2 NUMERATOR',
     'HAI_2_SIR': 'HAI-2 SIR',
     'HAI_2_CILOWER': 'HAI-2 CI LOWER',
     'HAI_2_CIUPPER': 'HAI-2 CI UPPER',
     'HAI_2_CI_LOWER': 'HAI-2 CI LOWER',
     'HAI_2_CI_UPPER': 'HAI-2 CI UPPER',
     'HAI-2-CI-LOWER': 'HAI-2 CI LOWER',
     'HAI-2-CI-UPPER': 'HAI-2 CI UPPER',

     'HAI-3-CI-LOWER': 'HAI-3 CI LOWER',
     'HAI-3-CI-UPPER': 'HAI-3 CI UPPER',
     'HAI-3-DOPC-DAYS': 'HAI-3 DOPC',
     'HAI-3-ELIGCASES': 'HAI-3 ELIG CASES',
     'HAI-3-NUMERATOR': 'HAI-3 NUMERATOR',
     'HAI-3-SIR': 'HAI-3 SIR',
     'HAI_3_CILOWER': 'HAI-3 CI LOWER',
     'HAI_3_CI_LOWER': 'HAI-3 CI LOWER',
     'HAI_3_CIUPPER': 'HAI-3 CI UPPER',
     'HAI_3_CI_UPPER': 'HAI-3 CI UPPER',
     'HAI_3_DOPC': 'HAI-3 DOPC',
     'HAI_3_DOPC_DAYS': 'HAI-3 DOPC',
     'HAI_3_ELIGCASES': 'HAI-3 ELIG CASES',
     'HAI_3_NUMERATOR': 'HAI-3 NUMERATOR',
     'HAI_3_SIR': 'HAI-3 SIR',

     'HAI-4-CI-LOWER': 'HAI-4 CI LOWER',
     'HAI-4-CI-UPPER': 'HAI-4 CI UPPER',
     'HAI_4_CI_LOWER': 'HAI-4 CI LOWER',
     'HAI_4_CI_UPPER': 'HAI-4 CI UPPER',
     'HAI_4_CILOWER': 'HAI-4 CI LOWER',
     'HAI_4_CIUPPER': 'HAI-4 CI UPPER',
     'HAI_4_DOPC': 'HAI-4 DOPC',
     'HAI_4_DOPC_DAYS': 'HAI-4 DOPC',
     'HAI-4-DOPC-DAYS': 'HAI-4 DOPC',
     'HAI-4-ELIGCASES': 'HAI-4 ELIG CASES',
     'HAI_4_ELIGCASES': 'HAI-4 ELIG CASES',
     'HAI-4-NUMERATOR': 'HAI-4 NUMERATOR',
     'HAI_4_NUMERATOR': 'HAI-4 NUMERATOR',
     'HAI-4-SIR': 'HAI-4 SIR',
     'HAI_4_SIR': 'HAI-4 SIR',

     'HAI-5-CI-LOWER': 'HAI-5 CI LOWER',
     'HAI-5-CI-UPPER': 'HAI-5 CI UPPER',
     'HAI_5_CI_LOWER': 'HAI-5 CI LOWER',
     'HAI_5_CI_UPPER': 'HAI-5 CI UPPER',
     'HAI_5_CILOWER': 'HAI-5 CI LOWER',
     'HAI_5_CIUPPER': 'HAI-5 CI UPPER',
     'HAI-5-DOPC-DAYS': 'HAI-5 DOPC',
     'HAI-5-ELIGCASES': 'HAI-5 ELIG CASES',
     'HAI-5-NUMERATOR': 'HAI-5 NUMERATOR',
     'HAI-5-SIR': 'HAI-5 SIR',
     'HAI_5_DOPC': 'HAI-5 DOPC',
     'HAI_5_DOPC_DAYS': 'HAI-5 DOPC',
     'HAI_5_ELIGCASES': 'HAI-5 ELIG CASES',
     'HAI_5_NUMERATOR': 'HAI-5 NUMERATOR',
     'HAI_5_SIR': 'HAI-5 SIR',

     'HAI-6-CI-LOWER': 'HAI-6 CI LOWER',
     'HAI-6-CI-UPPER': 'HAI-6 CI UPPER',
     'HAI_6_CI_LOWER': 'HAI-6 CI LOWER',
     'HAI_6_CI_UPPER': 'HAI-6 CI UPPER',
     'HAI_6_CILOWER': 'HAI-6 CI LOWER',
     'HAI_6_CIUPPER': 'HAI-6 CI UPPER',
     'HAI-6-DOPC-DAYS': 'HAI-6 DOPC',
     'HAI-6-ELIGCASES': 'HAI-6 ELIG CASES',
     'HAI-6-NUMERATOR': 'HAI-6 NUMERATOR',
     'HAI-6-SIR': 'HAI-6 SIR',
     'HAI_6_DOPC': 'HAI-6 DOPC',
     'HAI_6_DOPC_DAYS': 'HAI-6 DOPC',
     'HAI_6_ELIGCASES': 'HAI-6 ELIG CASES',
     'HAI_6_NUMERATOR': 'HAI-6 NUMERATOR',
     'HAI_6_SIR': 'HAI-6 SIR',
    }

# Assign the result back to the column. Calling replace(..., inplace=True) on
# df['Measure ID'] acts on a temporary Series and does not update df when
# pandas Copy-on-Write is active.
df['Measure ID'] = df['Measure ID'].replace(to_replace=d)

# Make each measure label unique by appending its ID, e.g. "<name> (<id>)".
df['Measure Name'] = df['Measure Name'] + ' (' + df['Measure ID'] + ')'

# From here on only the keys, the combined label and the score are needed.
df = df.filter(items=['Facility ID', 'Facility Name', 'file_month', 'file_year',
                      'Measure Name', 'Start Date', 'End Date', 'Score'], axis=1)
df.head()

## Reshape to one column per measure and collapse duplicate rows

In [None]:
# Columns that identify a row: facility, archive file and measurement window.
cols = ['Facility ID', 'Facility Name', 'file_month', 'file_year', 'Start Date', 'End Date']
main_df = pd.DataFrame(columns=cols)
Measures = sorted(df['Measure Name'].unique())

# Turn the long table into a wide one: each measure contributes its own value
# column(s), outer-merged onto the growing frame by the key columns.
for mi in Measures:
    tdf = df[df['Measure Name'] == mi]
    df2 = pd.DataFrame(columns=cols)

    for m in sorted(tdf['Measure Name'].unique()):
        tdf2 = tdf[tdf['Measure Name'] == m]

        # Everything that is neither a key nor the label is a value column.
        value_cols = [n for n in list(tdf2) if n != 'Measure Name' and n not in cols]
        for n in value_cols:
            tdf2[n] = pd.to_numeric(tdf2[n], errors='coerce')

        relabelled = {n: m + ' (' + n + ')' for n in value_cols}
        tdf2 = tdf2.rename(columns=relabelled).drop(columns=['Measure Name'])

        df2 = df2.merge(tdf2, on=cols, how='outer')

    main_df = main_df.merge(df2, on=cols, how='outer')

tdf = main_df.copy(deep=True)
del df2, main_df

print(tdf.shape)
# Remove columns that duplicate an earlier column, then columns with no data.
first_of_its_kind = ~tdf.T.duplicated(keep='first')
tdf = tdf.loc[:, first_of_its_kind].dropna(how='all', axis=1)
print(tdf.shape)
tdf = tdf.drop_duplicates()
print(tdf.shape)
# Finally keep a single row per combination of key columns.
tdf = tdf.drop_duplicates(subset=cols)
print(tdf.shape)
tdf.head()


In [None]:
# Work on a copy without the measurement window, minus exact duplicate rows.
ttdf = tdf.drop(columns=['Start Date', 'End Date']).drop_duplicates()
print(ttdf.shape)


In [None]:

def _report(started, frame):
    """Print the seconds elapsed since ``started`` and the shape of ``frame``."""
    print("Run time = {:.3f} seconds".format(time.time() - started))
    print(frame.shape)


# Build one string key per facility and archive file.
start_time = time.time()
ttdf['marker'] = ttdf['Facility ID'] + ' | ' + ttdf['Facility Name']  + ' | ' + ttdf['file_month'] + ' | ' + ttdf['file_year']
_report(start_time, ttdf)

# Move the key to the front of the frame.
start_time = time.time()
column = ttdf.pop('marker')
ttdf.insert(0, column.name, column)
_report(start_time, ttdf)

start_time = time.time()
ttdf = ttdf.drop_duplicates()
_report(start_time, ttdf)

# Collapse all rows sharing a key into one row holding, for every column, the
# first non-missing value in the group.  GroupBy.first() skips missing values,
# so it yields the same values as ffill().bfill().head(1) per group, without
# a Python-level call per group and without depending on groupby.apply
# passing the grouping column through to the result.
start_time = time.time()
collapsed_df = ttdf.groupby('marker', as_index=False).first()
_report(start_time, collapsed_df)

start_time = time.time()
ttdf = collapsed_df.reset_index(drop=True)
_report(start_time, ttdf)

start_time = time.time()
ttdf = ttdf.drop_duplicates()
_report(start_time, ttdf)

ttdf.head()

## Save dataframe

In [None]:
# The helper key is not part of the saved table.
ttdf = ttdf.drop(columns=['marker'])
ttdf.to_pickle(
    '~/GitHub/hospitals-data-archive/dataframes/partial_dataframes/hai_df.pkl.gz',
    compression='gzip',
    protocol=5,
)


In [None]:
# m1: the measure columns of the saved table, i.e. all columns but the keys.
ls = ['Facility ID','Facility Name','file_month','file_year']
m1 = ttdf.columns.tolist()
for l in ls:
    m1.remove(l)

## Save Measurement Dates

In [None]:
# Key columns carried through the reshape unchanged.
id_cols = ['Facility ID', 'Facility Name', 'file_month', 'file_year', 'Start Date', 'End Date']

# Reshape to long form: one row per combination of key columns and measure
# column, with that cell's value in 'Score'.
measures_df = tdf.melt(id_vars=id_cols, var_name='Measure Name', value_name='Score')

# Keep only cells that hold a value.  A row of tdf has NaN in every measure
# column that was not reported for that row's Start/End window, so without
# this filter each measure would be paired with every window in the file
# rather than only with the windows it was actually reported for.
measures_df = measures_df.dropna(subset=['Score'])
measures_df = measures_df.drop(columns=['Score', 'Facility ID', 'Facility Name'])

print(measures_df.shape)
measures_df = measures_df.drop_duplicates().reset_index(drop=True)
print(measures_df.shape)

measures_df['Start Date'] = pd.to_datetime(measures_df['Start Date'])
measures_df['End Date'] = pd.to_datetime(measures_df['End Date'])
measures_df.to_csv('~/GitHub/hospitals-data-archive/measure_dates/hai_df.csv')

measures_df.head()

In [None]:
# Check that the dates table covers exactly the measure columns of the saved table.
m2 = measures_df['Measure Name'].drop_duplicates().tolist()
sorted(m1) == sorted(m2)