# Generate HACRP dataframe

In [1]:
import pandas as pd
import warnings
from IPython.utils import io
import sys
import numpy as np
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Remove pandas' display limits so rendered frames show all columns and all rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Base directory that the relative archive file paths are appended to when loading.
main_dir = '~/Desktop/Rush/CMS_HospitalArchives/'

## Define custom functions

In [2]:
                       
def curate(df):
    """Clean the identifier columns and strip tab characters from text columns.

    Steps (each one is skipped when the column it needs is absent):

    1. Drop rows whose 'Facility ID' is missing, cast the remaining IDs to
       strings and left-pad them with zeros to six characters.
    2. Drop rows whose 'Facility Name' is missing.
    3. Remove tab characters from every column that supports the ``.str``
       accessor.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    # Work on a copy so the caller's frame is never written to.
    df = df.copy()

    if 'Facility ID' in df.columns:
        # `!= np.nan` is True for every value (NaN never compares equal to
        # anything), so missing IDs have to be detected with notna().
        df = df[df['Facility ID'].notna()].copy()
        # The length check in the previous implementation targeted six
        # characters; zfill pads all the way to that width.
        df['Facility ID'] = df['Facility ID'].astype(str).str.zfill(6)

    if 'Facility Name' in df.columns:
        df = df[df['Facility Name'].notna()].copy()

    for c in list(df):
        try:
            df[c] = df[c].str.replace("\t", "", regex=False)
        except AttributeError:
            # Raised by the .str accessor for columns that do not hold
            # strings; there is nothing to strip in those.
            continue

    return df


def rename_and_fill(df):
    """Rename the column headers used by older archive files to one naming scheme.

    Columns that do not appear in the mapping are left untouched. Despite the
    function's name, no values are filled in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be standardized.

    Returns
    -------
    pandas.DataFrame
        A renamed copy; the input frame is not modified.

    Raises
    ------
    ValueError
        If two or more columns end up with the same name after renaming
        (for example when a file carries two spellings of the same column).
    """
    # Source header -> standardized header. Kept as a single dict so that a
    # source name and its target can never drift out of alignment.
    column_map = {
        'Provider ID': 'Facility ID',
        'Hospital Name': 'Facility Name',
        'PSI 90 Start Date': 'PSI-90 Start Date',
        'PSI 90 End Date': 'PSI-90 End Date',
        'PSI 90 W Z Score': 'PSI-90 W Z Score',
        'PSI 90 Footnote': 'PSI-90 Footnote',
        'FISCAL_YEAR': 'Fiscal Year',
        'DOMAIN_1_SCORE': 'Domain 1 Score',
        'DOMAIN_1_FOOTNOTE': 'Domain 1 Footnote',
        'DOMAIN_1_START_DATE': 'Domain 1 Start Date',
        'DOMAIN_1_END_DATE': 'Domain 1 End Date',
        'PSI_90_W_Z_Score': 'PSI-90 W Z Score',
        'PSI_90_FOOTNOTE': 'PSI-90 Footnote',
        'DOMAIN_2_SCORE': 'Domain 2 Score',
        'DOMAIN_2_FOOTNOTE': 'Domain 2 Footnote',
        'CLABSI_W_Z_SCORE': 'CLABSI W Z Score',
        'CLABSI_FOOTNOTE': 'CLABSI Footnote',
        'CAUTI_W_Z_SCORE': 'CAUTI W Z Score',
        'CAUTI_FOOTNOTE': 'CAUTI Footnote',
        'SSI_W_Z_SCORE': 'SSI W Z Score',
        'SSI_FOOTNOTE': 'SSI Footnote',
        'MRSA_W_Z_SCORE': 'MRSA W Z Score',
        'MRSA_FOOTNOTE': 'MRSA Footnote',
        'CDI_W_Z_SCORE': 'CDI W Z Score',
        'CDI_FOOTNOTE': 'CDI Footnote',
        'DOMAIN_2_START_DATE': 'Domain 2 Start Date',
        'DOMAIN_2_END_DATE': 'Domain 2 End Date',
        'TOTAL_HAC_SCORE': 'Total HAC Score',
        'TOTAL_HAC_FOOTNOTE': 'Total HAC Footnote',
        'PAYMENT_REDUCTION': 'Payment Reduction',
        'PAYMENT_REDUCTION_FOOTNOTE': 'Payment Reduction Footnote',
        'HOSPITAL_NAME': 'Facility Name',
        'PROVIDER_ID': 'Facility ID',
        'STATE': 'State',
        'Hospital_Name': 'Facility Name',
        'Provider_ID': 'Facility ID',
        'Domain_1_Score': 'Domain 1 Score',
        'Domain_1_Score_Footnote': 'Domain 1 Score Footnote',
        'Domain_1_Start_Date': 'Domain 1 Start Date',
        'Domain_1_End_Date': 'Domain 1 End Date',
        'AHRQ_PSI_90_Score': 'PSI-90 Score',
        'AHRQ_PSI_90_Score_Footnote': 'PSI-90 Footnote',
        'Domain_2_Score': 'Domain 2 Score',
        'Domain_2_Score_Footnote': 'Domain 2 Score Footnote',
        'CLABSI_Score': 'CLABSI Score',
        'CLABSI_Score_Footnote': 'CLABSI Footnote',
        'CAUTI_Score': 'CAUTI Score',
        'CAUTI_Score_Footnote': 'CAUTI Footnote',
        'SSI_Score': 'SSI Score',
        'SSI_Score_Footnote': 'SSI Footnote',
        'MRSA_Score': 'MRSA Score',
        'MRSA_Footnote': 'MRSA Footnote',
        'CDI_Score': 'CDI Score',
        'CDI_Footnote': 'CDI Footnote',
        'Domain_2_Start_Date': 'Domain 2 Start Date',
        'Domain_2_End_Date': 'Domain 2 End Date',
        'Total_HAC_Score': 'Total HAC Score',
        'Total_HAC_Score_Footnote': 'Total HAC Footnote',
        'Payment_Reduction': 'Payment Reduction',
        'Payment_Reduction_Footnote': 'Payment Reduction Footnote',
        'CLABSI Footnote': 'CLABSI Footnote',
        'CAUTI Footnote': 'CAUTI Footnote',
        'PROVIDER ID': 'Facility ID',
        'PSI_90_W_Z_SCORE': 'PSI-90 W Z Score',
        'Fiscal_Year': 'Fiscal Year',
        'PSI 90 Composite': 'PSI-90 Score',
        'PSI 90 Composite Footnote': 'PSI-90 Footnote',
        'PSI 90 W Z Footnote': 'PSI-90 W Z Footnote',
        'CLABSI SIR': 'CLABSI SIR',
        'CLABSI SIR Footnote': 'CLABSI Footnote',
    }

    # rename() ignores mapping keys that are not present and returns a new frame.
    df = df.rename(columns=column_map)

    # Two source spellings can collapse onto one target name; fail loudly
    # instead of carrying ambiguous columns forward.
    dupes = df.columns[df.columns.duplicated()].unique().tolist()
    if dupes:
        raise ValueError(f'duplicates: {dupes}')

    return df


def process2(df, lists, yr, mo):
    """Standardize one archive file and tag it with its release year and month.

    The frame is passed through ``rename_and_fill`` and ``curate``; the
    resulting column names are appended to ``lists``; 'file_month' and
    'file_year' columns are added; and the columns are sorted by name.

    Returns the processed frame together with the (mutated) ``lists``.
    """
    cleaned = curate(rename_and_fill(df))

    # Record the column names as they stand before the tag columns are added.
    lists.append(list(cleaned))

    n_rows = cleaned.shape[0]
    tagged = cleaned.assign(file_month=[mo] * n_rows, file_year=[yr] * n_rows)

    column_order = sorted(tagged.columns)
    return tagged.reindex(columns=column_order), lists

## Load HACRP Files

In [3]:
df_list = []
lists = []

# Release year, release month and relative path of every archive file.
# The three lists are matched by position.
yrs = ['2023', '2023', '2023', '2023',
       '2022', '2022', '2022', '2022',
       '2021','2021','2021', '2021', '2021',
       '2020', '2020', '2020', '2020', 
       '2019', '2019', '2019', '2019', 
       '2018', '2018', '2018', '2018',
       '2017', '2017', '2017',
       '2016', '2016', '2016', '2016',
       '2015', '2015', '2015', '2015', '2015', '2015',
       '2014',
       ]

mos = ['01', '04', '07', '10',
       '01', '04', '07', '10',
       '01', '03', '04', '07', '10', 
       '10', '07', '04', '01', 
       '10', '07', '04', '03', 
       '10', '07', '05', '01',
       '10', '07', '04',
       '12', '11', '08', '05',
       '12', '10', '07', '05', '04', '01',
       '12',
       ]

subdirs = ['2023/hospitals_01_2023/FY_2023_HAC_Reduction_Program_Hospital.csv', 
           '2023/hospitals_04_2023/FY_2023_HAC_Reduction_Program_Hospital.csv',
           '2023/hospitals_07_2023/FY_2023_HAC_Reduction_Program_Hospital.csv',
           '2023/hospitals_10_2023/FY_2023_HAC_Reduction_Program_Hospital.csv',
           
           '2022/hospitals_01_2022/FY_2022_HAC_Reduction_Program_Hospital.csv', 
           '2022/hospitals_04_2022/FY_2022_HAC_Reduction_Program_Hospital.csv',
           '2022/hospitals_07_2022/FY_2022_HAC_Reduction_Program_Hospital.csv',
           '2022/hospitals_10_2022/FY_2022_HAC_Reduction_Program_Hospital.csv',
           
           '2021/hospitals_01_2021/FY_2021_HAC_Reduction_Program_Hospital.csv',
           '2021/hospitals_03_2021/FY_2021_HAC_Reduction_Program_Hospital.csv',
           '2021/hospitals_04_2021/FY_2021_HAC_Reduction_Program_Hospital.csv',
           '2021/hospitals_07_2021/FY_2021_HAC_Reduction_Program_Hospital.csv',
           '2021/hospitals_10_2021/FY_2021_HAC_Reduction_Program_Hospital.csv',
           
           '2020/hospitals_archive_10_2020/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv',
           '2020/hospitals_archive_07_2020/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200422/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200129/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv',
           
           '2019/HOSArchive_Revised_Flatfiles_20191030/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190702/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv',
           '2019/HOSArchive_Revised_FlatFiles_20190424/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190321/HOSPITAL_QUARTERY_HAC_DOMAIN_HOSPITAL.csv',
           
           '2018/HOSArchive_Revised_FlatFiles_20181031/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180725/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv', 
           '2018/HOSArchive_Revised_FlatFiles_20180523/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180126/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv',
           
           '2017/HOSArchive_Revised_FlatFiles_20171024/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv',
           '2017/HOSArchive_Revised_FlatFiles_20170726/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv', 
           '2017/HOSArchive_Revised_Flatfiles_20170428/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv', 
           
           '2016/HOSArchive_Revised_Flatfiles_20161219/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv',
           '2016/Hospital_Revised_FlatFiles_20161110/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL_02_26_2016.csv', 
           '2016/HOSArchive_Revised_FlatFiles_20160810/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL_02_26_2016.csv',
           '2016/HOSArchive_Revised_FlatFiles_20160504/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL_02_26_2016.csv',
           
           '2015/HOSArchive_Revised_FlatFiles_20151210/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv',
           '2015/HOSArchive_Revised_FlatFiles_20151008/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv',
           '2015/HOSArchive_Revised_FlatFiles_20150716/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150506/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150416/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150122/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL_11_24_2014.csv',
           
           '2014/HOSArchive_Revised_Flatfiles_20141218/HOSPITAL_QUARTERLY_HAC_DOMAIN_HOSPITAL_11_24_2014.csv',
           ]

# Because the lists are matched by position, a missing or extra entry would
# tag files with the wrong release date (or silently skip entries), so check
# the lengths before loading anything.
assert len(yrs) == len(mos) == len(subdirs), 'yrs, mos and subdirs must have the same length'

for subdir, yr, mo in zip(subdirs, yrs, mos):
    # Anything written to stdout/stderr while reading is captured and discarded.
    with io.capture_output():
        df = pd.read_csv(main_dir + subdir, encoding="ISO-8859-1")
    print(subdir + ' :  (rows, columns) =', df.shape)
    df, lists = process2(df, lists, yr, mo)
    df_list.append(df)

# NOTE(review): this value is not read by any later cell visible in this
# notebook — confirm whether it is still needed.
subdir = 'HACRP/CombinedFiles_HACRP'

# Stack all releases; columns missing from a given release are filled with NaN.
df = pd.concat(df_list)
print('df.shape:', df.shape)
print(sorted(list(df)))
df.head()

2023/hospitals_01_2023/FY_2023_HAC_Reduction_Program_Hospital.csv :  (rows, columns) = (3165, 36)
2023/hospitals_04_2023/FY_2023_HAC_Reduction_Program_Hospital.csv :  (rows, columns) = (3165, 36)
2023/hospitals_07_2023/FY_2023_HAC_Reduction_Program_Hospital.csv :  (rows, columns) = (3165, 36)
2023/hospitals_10_2023/FY_2023_HAC_Reduction_Program_Hospital.csv :  (rows, columns) = (3165, 36)
2022/hospitals_01_2022/FY_2022_HAC_Reduction_Program_Hospital.csv :  (rows, columns) = (3170, 24)
2022/hospitals_04_2022/FY_2022_HAC_Reduction_Program_Hospital.csv :  (rows, columns) = (3170, 24)
2022/hospitals_07_2022/FY_2022_HAC_Reduction_Program_Hospital.csv :  (rows, columns) = (3170, 24)
2022/hospitals_10_2022/FY_2022_HAC_Reduction_Program_Hospital.csv :  (rows, columns) = (3170, 24)
2021/hospitals_01_2021/FY_2021_HAC_Reduction_Program_Hospital.csv :  (rows, columns) = (3204, 24)
2021/hospitals_03_2021/FY_2021_HAC_Reduction_Program_Hospital.csv :  (rows, columns) = (3204, 24)
2021/hospitals_04_20

Unnamed: 0,CAUTI SIR,CAUTI SIR Footnote,CAUTI W Z Footnote,CAUTI W Z Score,CDI SIR,CDI SIR Footnote,CDI W Z Footnote,CDI W Z Score,CLABSI Footnote,CLABSI SIR,CLABSI W Z Footnote,CLABSI W Z Score,Facility ID,Facility Name,Fiscal Year,HAI Measures End Date,HAI Measures Start Date,MRSA SIR,MRSA SIR Footnote,MRSA W Z Footnote,MRSA W Z Score,PSI-90 End Date,PSI-90 Footnote,PSI-90 Score,PSI-90 Start Date,PSI-90 W Z Footnote,PSI-90 W Z Score,Payment Reduction,Payment Reduction Footnote,SSI SIR,SSI SIR Footnote,SSI W Z Footnote,SSI W Z Score,State,Total HAC Footnote,Total HAC Score,file_month,file_year,CAUTI Footnote,CDI Footnote,MRSA Footnote,SSI Footnote,Domain 1 End Date,Domain 1 Footnote,Domain 1 Score,Domain 1 Start Date,Domain 2 End Date,Domain 2 Footnote,Domain 2 Score,Domain 2 Start Date,CAUTI Score,CDI Score,CLABSI Score,Domain 1 Score Footnote,Domain 2 Score Footnote,MRSA Score,SSI Score,Footnotes
0,0.306,,5.0,,0.644,,5.0,,,0.476,5.0,,10001,SOUTHEAST HEALTH MEDICAL CENTER,2023,12/31/2021,01/01/2021,0.912,,5.0,,,5,,,5.0,,,5,0.822,,5.0,,AL,5,0.0,1,2023,,,,,,,,,,,,,,,,,,,,
1,2.35,,5.0,,0.86,,5.0,,,3.311,5.0,,10005,MARSHALL MEDICAL CENTERS,2023,12/31/2021,01/01/2021,0.0,,5.0,,,5,,,5.0,,,5,0.356,,5.0,,AL,5,0.0,1,2023,,,,,,,,,,,,,,,,,,,,
2,0.602,,5.0,,0.08,,5.0,,,0.507,5.0,,10006,NORTH ALABAMA MEDICAL CENTER,2023,12/31/2021,01/01/2021,1.64,,5.0,,,5,,,5.0,,,5,0.75,,5.0,,AL,5,0.0,1,2023,,,,,,,,,,,,,,,,,,,,
3,,13.0,5.0,,0.41,,5.0,,13.0,,5.0,,10007,MIZELL MEMORIAL HOSPITAL,2023,12/31/2021,01/01/2021,,13.0,5.0,,,5,,,5.0,,,5,,13.0,5.0,,AL,5,0.0,1,2023,,,,,,,,,,,,,,,,,,,,
4,,13.0,5.0,,,13.0,5.0,,13.0,,5.0,,10008,CRENSHAW COMMUNITY HOSPITAL,2023,12/31/2021,01/01/2021,,13.0,5.0,,,5,,,5.0,,,5,,12.0,5.0,,AL,5,0.0,1,2023,,,,,,,,,,,,,,,,,,,,


In [4]:
# Columns of interest, grouped by measure. DataFrame.filter silently skips
# any name that is not present in the frame.
ls = ['Facility ID', 'Facility Name', 'Fiscal Year', 'file_month', 'file_year',
      
      'CAUTI Score', 'CAUTI Footnote', 
      'CAUTI SIR', 'CAUTI SIR Footnote', 
      'CAUTI W Z Score', 'CAUTI W Z Footnote', 
      
      'CDI Score', 'CDI Footnote', 
      'CDI SIR', 'CDI SIR Footnote', 
      'CDI W Z Score', 'CDI W Z Footnote', 
      
      'CLABSI Score', 'CLABSI Footnote', 
      'CLABSI SIR', 
      'CLABSI W Z Score', 'CLABSI W Z Footnote',  
      
      'MRSA Score', 'MRSA Footnote', 
      'MRSA SIR', 'MRSA SIR Footnote',  
      'MRSA W Z Score', 'MRSA W Z Footnote',  
      
      'SSI Score', 'SSI Footnote', 
      'SSI SIR', 'SSI SIR Footnote',  
      'SSI W Z Score', 'SSI W Z Footnote',  
      
      'PSI-90 Score', 'PSI-90 Footnote', 
      'PSI-90 W Z Score', 'PSI-90 W Z Footnote',
      'PSI-90 Start Date', 'PSI-90 End Date',    
      
      'Domain 1 Start Date', 'Domain 1 End Date', 'Domain 1 Footnote',
      'Domain 1 Score', 'Domain 1 Score Footnote',  
      'Domain 2 Start Date', 'Domain 2 End Date', 'Domain 2 Footnote',
      'Domain 2 Score', 'Domain 2 Score Footnote',   
      
      'Footnotes', 'HAI Measures Start Date', 'HAI Measures End Date',
      'Total HAC Score', 'Total HAC Footnote',
      'Payment Reduction', 'Payment Reduction Footnote']

df = df.filter(items=ls, axis=1)

# Score columns that must end up numeric.
labs = ['CAUTI Score', 'CAUTI SIR', 'CAUTI W Z Score', 'CDI Score', 'CDI SIR', 'CDI W Z Score', 'CLABSI Score', 
        'CLABSI SIR', 'CLABSI W Z Score', 'MRSA Score', 'MRSA SIR', 'MRSA W Z Score', 'SSI Score', 'SSI SIR', 
        'SSI W Z Score', 'PSI-90 Score', 'PSI-90 W Z Score', 'Domain 1 Score', 'Domain 2 Score', 'Total HAC Score']

# Asterisk-suffixed values are mapped to their numeric part before conversion;
# any other non-numeric value becomes NaN (errors='coerce').
d = {'9**':9, '10**': 10, '6**': 6}
for l in labs:
    # Assign the result back to the frame. Calling replace(inplace=True) on
    # `df[l]` operates on a temporary selection and is not guaranteed to reach
    # `df` (it does not under pandas Copy-on-Write), in which case the
    # asterisk values would be coerced to NaN instead of kept.
    df[l] = pd.to_numeric(df[l].replace(d), errors='coerce')


# Final column set: identifiers, numeric scores, payment flag and date ranges
# (footnote columns are dropped here).
labs = ['Facility ID', 'Facility Name', 'file_month', 'file_year',
        'CAUTI Score', 'CAUTI SIR', 'CAUTI W Z Score',
        'CDI Score', 'CDI SIR', 'CDI W Z Score',
        'CLABSI Score', 'CLABSI SIR', 'CLABSI W Z Score',
        'MRSA Score', 'MRSA SIR', 'MRSA W Z Score',
        'SSI Score', 'SSI SIR', 'SSI W Z Score',
        'PSI-90 Score', 'PSI-90 W Z Score',
        'Domain 1 Score', 'Domain 2 Score',
        'Total HAC Score', 'Payment Reduction',
        'HAI Measures End Date', 'HAI Measures Start Date',
        'Domain 1 Start Date', 'Domain 1 End Date',
        'Domain 2 Start Date', 'Domain 2 End Date',
        'PSI-90 Start Date', 'PSI-90 End Date',
       ]

df = df.filter(items=labs, axis=1)

# Shapes are printed before and after each de-duplication pass so any dropped
# rows are visible in the cell output.
print(df.shape)
df = df.drop_duplicates()
print(df.shape)
df = df.drop_duplicates(subset = ['Facility ID', 'Facility Name', 'file_month','file_year'])
print(df.shape)
df.head()

(127489, 33)
(127489, 33)
(127489, 33)


Unnamed: 0,Facility ID,Facility Name,file_month,file_year,CAUTI Score,CAUTI SIR,CAUTI W Z Score,CDI Score,CDI SIR,CDI W Z Score,CLABSI Score,CLABSI SIR,CLABSI W Z Score,MRSA Score,MRSA SIR,MRSA W Z Score,SSI Score,SSI SIR,SSI W Z Score,PSI-90 Score,PSI-90 W Z Score,Domain 1 Score,Domain 2 Score,Total HAC Score,Payment Reduction,HAI Measures End Date,HAI Measures Start Date,Domain 1 Start Date,Domain 1 End Date,Domain 2 Start Date,Domain 2 End Date,PSI-90 Start Date,PSI-90 End Date
0,10001,SOUTHEAST HEALTH MEDICAL CENTER,1,2023,,0.306,,,0.644,,,0.476,,,0.912,,,0.822,,,,,,0.0,,12/31/2021,01/01/2021,,,,,,
1,10005,MARSHALL MEDICAL CENTERS,1,2023,,2.35,,,0.86,,,3.311,,,0.0,,,0.356,,,,,,0.0,,12/31/2021,01/01/2021,,,,,,
2,10006,NORTH ALABAMA MEDICAL CENTER,1,2023,,0.602,,,0.08,,,0.507,,,1.64,,,0.75,,,,,,0.0,,12/31/2021,01/01/2021,,,,,,
3,10007,MIZELL MEMORIAL HOSPITAL,1,2023,,,,,0.41,,,,,,,,,,,,,,,0.0,,12/31/2021,01/01/2021,,,,,,
4,10008,CRENSHAW COMMUNITY HOSPITAL,1,2023,,,,,,,,,,,,,,,,,,,,0.0,,12/31/2021,01/01/2021,,,,,,


## Save dataframe

In [5]:
# The measurement-period columns are left out of the saved frame.
date_columns = [
    'HAI Measures End Date', 'HAI Measures Start Date',
    'Domain 1 Start Date', 'Domain 1 End Date',
    'Domain 2 Start Date', 'Domain 2 End Date',
    'PSI-90 Start Date', 'PSI-90 End Date',
]
tdf = df.drop(columns=date_columns)

# Report the shape before and after removing exact duplicate rows.
print(tdf.shape)
tdf = tdf.drop_duplicates()
print(tdf.shape)

tdf.to_pickle(
    '~/GitHub/hospitals-data-archive/dataframes/partial_dataframes/hacrp_df.pkl.gz',
    protocol=5,
    compression='gzip',
)


(127489, 25)
(127489, 25)


In [6]:
# m1: the columns of the saved frame minus the identifier columns,
# compared against the melted measure names in the final cell.
ls = ['Facility ID','Facility Name','file_month','file_year']
m1 = list(tdf.columns)
for l in ls:
    # list.remove raises ValueError if an identifier column is missing.
    m1.remove(l)

## Save measurement dates

In [7]:
# Columns to keep as is
id_cols = ['Facility ID', 'Facility Name', 'file_month', 'file_year', 
           'HAI Measures End Date', 'HAI Measures Start Date',
           'Domain 1 Start Date', 'Domain 1 End Date',
           'Domain 2 Start Date', 'Domain 2 End Date',
           'PSI-90 Start Date', 'PSI-90 End Date',]

# Melt the remaining columns into one row per (record, measure):
# 'Measure Name' holds the former column name, 'Score' its value.
measures_df = df.melt(id_vars=id_cols, var_name='Measure Name', value_name='Score')

# Where the HAI / PSI-90 dates are missing, fall back to the Domain 2 /
# Domain 1 dates. The result is assigned back to the frame: calling
# fillna(inplace=True) on a column selection is not guaranteed to update the
# frame itself (it does not under pandas Copy-on-Write).
date_fallbacks = {
    'HAI Measures End Date': 'Domain 2 End Date',
    'HAI Measures Start Date': 'Domain 2 Start Date',
    'PSI-90 End Date': 'Domain 1 End Date',
    'PSI-90 Start Date': 'Domain 1 Start Date',
}
for target_col, fallback_col in date_fallbacks.items():
    measures_df[target_col] = measures_df[target_col].fillna(measures_df[fallback_col])

# Measures that get no date range at all.
undated_measures = ['Total HAC Score', 'Payment Reduction']
# Measures that take the HAI date range; every other dated measure takes the
# PSI-90 date range.
hai_measures = ['CAUTI Score', 'CAUTI SIR', 'CAUTI W Z Score', 'CDI Score', 'CDI SIR', 'CDI W Z Score', 
                'CLABSI Score', 'CLABSI SIR', 'CLABSI W Z Score', 'MRSA Score', 'MRSA SIR', 'MRSA W Z Score', 
                'SSI Score', 'SSI SIR', 'SSI W Z Score', 'Domain 2 Score']

is_undated = measures_df['Measure Name'].isin(undated_measures)
is_hai = measures_df['Measure Name'].isin(hai_measures)

# Vectorized equivalent of a per-row if/elif/else: take the HAI date where the
# measure is an HAI measure, otherwise the PSI-90 date, then blank out the
# undated measures.
measures_df['Start Date'] = (
    measures_df['HAI Measures Start Date']
    .where(is_hai, measures_df['PSI-90 Start Date'])
    .mask(is_undated)
)
measures_df['End Date'] = (
    measures_df['HAI Measures End Date']
    .where(is_hai, measures_df['PSI-90 End Date'])
    .mask(is_undated)
)

# Keep only the release tag, the measure name and its date range; facility
# identifiers, scores and the source date columns are dropped.
measures_df = measures_df[['file_month', 'file_year', 'Measure Name', 'Start Date', 'End Date']]

print(measures_df.shape)
measures_df = measures_df.drop_duplicates().reset_index(drop=True)
print(measures_df.shape)
measures_df.head()


(2677269, 5)
(819, 5)


Unnamed: 0,file_month,file_year,Measure Name,Start Date,End Date
0,1,2023,CAUTI Score,01/01/2021,12/31/2021
1,4,2023,CAUTI Score,01/01/2021,12/31/2021
2,7,2023,CAUTI Score,01/01/2021,12/31/2021
3,10,2023,CAUTI Score,01/01/2021,12/31/2021
4,1,2022,CAUTI Score,01/01/2019,12/31/2019


In [8]:
# Every start-date spelling found in the archives -> MM/DD/YYYY.
# Integer keys match values that were read as numbers (e.g. 1012014).
start_date_map = {
     '01/01/2021': '01/01/2021', 
     '01/01/2019': '01/01/2019', 
     '01/01/2018': '01/01/2018', 
     '01/01/2017': '01/01/2017', 
     '01/01/2016': '01/01/2016', 
     '01/01/2015': '01/01/2015', 
     1012014:      '01/01/2014', 
     '01-JAN-14':  '01/01/2014', 
     1012013:      '01/01/2013', 
     '01/01/2013': '01/01/2013', 
     '01-JAN-12':  '01/01/2012', 
     '01/01/2012': '01/01/2012', 
     '07/01/2018': '07/01/2018', 
     '07/01/2017': '07/01/2017', 
     '07/01/2016': '07/01/2016', 
     '10/01/2015': '10/01/2015', 
     '07/01/2014': '07/01/2014', 
     7012013:      '07/01/2013', 
     '01-JUL-13':  '07/01/2013',  
     7012012:      '07/01/2012', 
     '07/01/2012': '07/01/2012',  
     '01-JUL-11':  '07/01/2011', 
     '07/01/2011': '07/01/2011',
    }

# Same for end dates.
end_date_map = {
    '12/31/2021': '12/31/2021', 
    '12/31/2019': '12/31/2019',
    '12/31/2018': '12/31/2018', 
    '12/31/2017': '12/31/2017',
    '12/31/2016': '12/31/2016',
    12312015:     '12/31/2015',
    '31-DEC-15':  '12/31/2015', 
    12312014:     '12/31/2014',
    '12/31/2014': '12/31/2014', 
    '31-DEC-13':  '12/31/2013',
    '12/31/2013': '12/31/2013',
    '06/30/2019': '06/30/2019', 
    '06/30/2018': '06/30/2018',
    '06/30/2017': '06/30/2017',
    '09/30/2015': '09/30/2015',
    6302015:      '06/30/2015',
    '30-JUN-15':  '06/30/2015', 
    6302014:      '06/30/2014',
    '06/30/2014': '06/30/2014', 
    '30-JUN-13':  '06/30/2013',
    '06/30/2013': '06/30/2013',
    }

# Assign the replaced columns back to the frame: replace(inplace=True) on a
# column selection is not guaranteed to update the frame itself (it does not
# under pandas Copy-on-Write), which would leave the raw spellings in place.
# Parsing with an explicit format means any spelling missing from the maps
# raises an error here instead of being guessed at.
measures_df['Start Date'] = pd.to_datetime(
    measures_df['Start Date'].replace(start_date_map), format='%m/%d/%Y'
)
measures_df['End Date'] = pd.to_datetime(
    measures_df['End Date'].replace(end_date_map), format='%m/%d/%Y'
)

measures_df.to_csv('~/GitHub/hospitals-data-archive/measure_dates/hacrp_df.csv')
measures_df.head()

Unnamed: 0,file_month,file_year,Measure Name,Start Date,End Date
0,1,2023,CAUTI Score,2021-01-01,2021-12-31
1,4,2023,CAUTI Score,2021-01-01,2021-12-31
2,7,2023,CAUTI Score,2021-01-01,2021-12-31
3,10,2023,CAUTI Score,2021-01-01,2021-12-31
4,1,2022,CAUTI Score,2019-01-01,2019-12-31


In [9]:
# Consistency check: the measure names in the dates table should be exactly
# the non-identifier columns of the saved frame (order-insensitive).
m2 = list(measures_df['Measure Name'].unique())
sorted(m2) == sorted(m1)

True