# Generate the HRRP (Hospital Readmissions Reduction Program) dataframe

In [1]:
import pandas as pd
import numpy as np
import warnings
from IPython.utils import io
import sys
# Notebook-wide configuration.

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Never truncate DataFrame displays, neither by column nor by row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Base directory that the relative archive paths used later are appended to.
main_dir = '/Users/kenlocey/Desktop/Rush/CMS_HospitalArchives/'

## Define Custom Functions

In [2]:

def check_lists(lists):
    """Verify that every list contains every item of every other list.

    Each list is compared against all lists (itself included). On the first
    item that is missing from some list, the item and the offending list are
    printed and the interpreter is asked to exit via ``sys.exit()``.

    Parameters
    ----------
    lists : list of list of str
        Collections of labels to cross-check. Items must be strings because
        the failure message is built by string concatenation.

    Returns
    -------
    None
        Returned only when no item is missing anywhere.
    """
    for source in lists:
        for target in lists:
            for label in source:
                if label in target:
                    continue
                # First mismatch found: report it and stop.
                print('\n')
                print(label + ': NOT FOUND IN')
                print(target)
                sys.exit()
                        
                        
def curate(df):
    """Clean the identifier and text columns of one HRRP table.

    Steps, each applied only when the relevant column exists:

    1. Drop rows whose 'Facility ID' is missing, then store the IDs as
       strings, prefixing a single '0' to any ID shorter than 6 characters.
    2. Drop rows whose 'Facility Name' is missing (skipped when the whole
       column is missing, see the comment in the body).
    3. Remove tab characters from every string column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean. It is not modified; a cleaned copy is returned.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.
    """
    # Work on a copy so the caller's frame is never mutated.
    df = df.copy()

    if 'Facility ID' in df.columns:
        # NaN never compares equal to anything, so `!= np.nan` keeps every
        # row; notna() is the test that actually removes missing IDs.
        df = df[df['Facility ID'].notna()].copy()

        ids = df['Facility ID']
        # A numeric ID column that contained NaN is read as float; convert
        # whole-number floats back to int so 10001.0 becomes '10001' rather
        # than '10001.0'.
        if pd.api.types.is_float_dtype(ids) and (ids % 1 == 0).all():
            ids = ids.astype('int64')
        ids = ids.astype(str)

        # Restore one leading zero on IDs shorter than 6 characters.
        # NOTE(review): only a single '0' is added, so IDs shorter than 5
        # characters stay shorter than 6 -- confirm that is intended.
        df['Facility ID'] = ids.where(ids.str.len() >= 6, '0' + ids)

    if 'Facility Name' in df.columns:
        has_name = df['Facility Name'].notna()
        # rename_and_fill() adds an all-NaN placeholder when a file has no
        # name column; filtering on such a column would discard every row,
        # so the filter only runs when at least one name is present.
        if has_name.any():
            df = df[has_name].copy()

    for col in list(df):
        try:
            df[col] = df[col].str.replace("\t", "", regex=False)
        except AttributeError:
            # Column does not hold strings (no .str accessor): nothing to strip.
            continue

    return df


def rename_and_fill(df):
    """Harmonise column names across file vintages and add missing columns.

    The frame is modified in place and also returned.

    1. Columns carrying an older label are renamed to the current label.
    2. Any expected column that is still absent is added and filled with NaN.
    3. If the result contains repeated column names, they are printed and
       ``sys.exit()`` is called.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table as read from one archive file.

    Returns
    -------
    pandas.DataFrame
        The same object, with harmonised and completed columns.
    """
    # Older label -> current label.
    legacy_names = {
        'Provider ID': 'Facility ID',
        'Measure Start Date': 'Start Date',
        'Measure End Date': 'End Date',
        'Hospital Name': 'Facility Name',
        'Address 1': 'Address',
        'Start_Date': 'Start Date',
        'End_Date': 'End Date',
        'Excess_Readmission_Ratio': 'Excess Readmission Ratio',
        'Predicted_Readmission_Rate': 'Predicted Readmission Rate',
        'Number_of_Discharges': 'Number of Discharges',
        'Hospital_Name': 'Facility Name',
        'Number_of_Readmissions': 'Number of Readmissions',
        'Provider_Number': 'Facility ID',
        'Measure_Name': 'Measure Name',
        'Provider Number': 'Facility ID',
        'Expected_Readmission_Rate': 'Expected Readmission Rate',
    }
    # No legacy label is also a current label, so one rename call gives the
    # same result as renaming one column at a time.
    present = {old: new for old, new in legacy_names.items() if old in df.columns}
    df.rename(columns=present, inplace=True)

    # Columns every table must expose; absent ones are appended in this order.
    expected = (
        'Excess Readmission Ratio', 'Number of Readmissions', 'Facility ID',
        'Predicted Readmission Rate', 'Start Date', 'Measure Name', 'End Date',
        'State', 'file_year', 'Footnote', 'Number of Discharges', 'file_month',
        'Facility Name', 'Expected Readmission Rate',
    )
    for name in expected:
        if name not in df.columns:
            df[name] = float('NaN')

    # Two legacy labels mapping to the same current label would leave a
    # repeated column name behind; treat that as fatal.
    names = list(df)
    repeated = list({name for name in names if names.count(name) > 1})
    if repeated:
        print('duplicates:', repeated)
        sys.exit()

    return df


def process2(df, lists, yr, mo):
    """Standardise one archive table and tag it with its release date.

    The table is passed through ``rename_and_fill`` and ``curate``, its
    column labels are recorded in ``lists``, the 'file_month' and 'file_year'
    columns are set to ``mo`` and ``yr`` on every row, and the columns are
    put in sorted order.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table read from one archive file.
    lists : list
        Accumulator; the column labels of the cleaned table are appended to it.
    yr : str
        Value written to the 'file_year' column.
    mo : str
        Value written to the 'file_month' column.

    Returns
    -------
    tuple
        ``(table, lists)``: the processed table and the same accumulator.
    """
    cleaned = curate(rename_and_fill(df))

    # Labels are recorded before the release columns are overwritten; both
    # already exist at this point because rename_and_fill adds them.
    lists.append(list(cleaned.columns))

    n_rows = cleaned.shape[0]
    cleaned['file_month'] = [mo] * n_rows
    cleaned['file_year'] = [yr] * n_rows

    return cleaned.reindex(columns=sorted(cleaned.columns)), lists


## Load Files

In [3]:

# Accumulators filled by the loading loop below:
#   df_list - one processed DataFrame per archive file
#   lists   - the column labels of each processed DataFrame (cross-checked later)
df_list = []
lists = []

# yrs, mos and subdirs are parallel lists: entry k of each describes the same
# archive file. Keep all three the same length and in the same order.

# Release year of each file (written to the 'file_year' column).
yrs = ['2023', '2023',
       '2022', '2022', '2022',
       '2021', '2021', '2021', '2021', '2021',
       '2020', '2020', '2020', '2020', 
       '2019', '2019', '2019', '2019', 
       '2018', '2018', '2018', '2018',
       '2017', '2017', '2017',
       '2016', '2016', '2016', '2016',
       '2015', '2015', '2015', '2015', '2015', '2015',
       '2014', '2014', '2014', '2014', '2014',
       '2013', '2013', '2013',
       ]

# Release month of each file (written to the 'file_month' column).
# The order within a year follows subdirs, which is not always ascending.
mos = ['01', '04',
       '01', '04', '07',
       '01', '03', '04', '07', '10', 
       '10', '07', '04', '01', 
       '10', '07', '04', '03', 
       '10', '07', '05', '01',
       '10', '07', '04',
       '12', '11', '08', '05',
       '12', '10', '07', '05', '04', '01',
       '12', '10', '07', '04', '01',
       '10', '07', '04',
       ]

# Path of each readmission-reduction CSV, relative to main_dir.
# Folder and file naming (including letter case) varies between releases.
subdirs = ['2023/hospitals_01_2023/FY_2023_Hospital_Readmissions_Reduction_Program_Hospital.csv', 
           '2023/hospitals_04_2023/FY_2023_Hospital_Readmissions_Reduction_Program_Hospital.csv',
           
           '2022/hospitals_01_2022/FY_2022_Hospital_Readmissions_Reduction_Program_Hospital.csv', 
           '2022/hospitals_04_2022/FY_2022_Hospital_Readmissions_Reduction_Program_Hospital.csv',
           '2022/hospitals_07_2022/FY_2022_Hospital_Readmissions_Reduction_Program_Hospital.csv',
           
           '2021/hospitals_01_2021/FY_2021_Hospital_Readmissions_Reduction_Program_Hospital.csv',
           '2021/hospitals_03_2021/FY_2021_Hospital_Readmissions_Reduction_Program_Hospital.csv',
           '2021/hospitals_04_2021/FY_2021_Hospital_Readmissions_Reduction_Program_Hospital.csv',
           '2021/hospitals_07_2021/FY_2021_Hospital_Readmissions_Reduction_Program_Hospital.csv',
           '2021/hospitals_10_2021/FY_2021_Hospital_Readmissions_Reduction_Program_Hospital.csv',
           
           '2020/hospitals_archive_10_2020/HOSPITAL_QUARTERLY_QUALITYMEASURE_RRP_HOSPITAL.csv',
           '2020/hospitals_archive_07_2020/HOSPITAL_QUARTERLY_QUALITYMEASURE_RRP_HOSPITAL.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200422/HOSPITAL_QUARTERLY_QUALITYMEASURE_RRP_HOSPITAL.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200129/HOSPITAL_QUARTERLY_QUALITYMEASURE_RRP_HOSPITAL.csv',
           
           '2019/HOSArchive_Revised_Flatfiles_20191030/HOSPITAL_QUARTERLY_QUALITYMEASURE_RRP_HOSPITAL.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190702/HOSPITAL_QUARTERLY_QUALITYMEASURE_RRP_HOSPITAL.csv',
           '2019/HOSArchive_Revised_FlatFiles_20190424/HOSPITAL_QUARTERLY_QUALITYMEASURE_RRP_HOSPITAL.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190321/HOSPITAL_QUARTERLY_QUALITYMEASURE_RRP_HOSPITAL.csv',
           
           '2018/HOSArchive_Revised_FlatFiles_20181031/Readmission Reduction.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180725/Readmission Reduction.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180523/Readmission Reduction.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180126/READMISSION REDUCTION.csv',
           
           '2017/HOSArchive_Revised_FlatFiles_20171024/READMISSION REDUCTION.csv',
           '2017/HOSArchive_Revised_FlatFiles_20170726/READMISSION REDUCTION.csv',
           '2017/HOSArchive_Revised_Flatfiles_20170428/READMISSION REDUCTION.csv',
           
           '2016/HOSArchive_Revised_Flatfiles_20161219/READMISSION REDUCTION.csv',
           '2016/Hospital_Revised_FlatFiles_20161110/READMISSION REDUCTION.csv',
           '2016/HOSArchive_Revised_FlatFiles_20160810/READMISSION REDUCTION.csv',
           '2016/HOSArchive_Revised_FlatFiles_20160504/READMISSION REDUCTION.csv',
           
           '2015/HOSArchive_Revised_FlatFiles_20151210/READMISSION REDUCTION.csv',
           '2015/HOSArchive_Revised_FlatFiles_20151008/READMISSION REDUCTION.csv',
           '2015/HOSArchive_Revised_FlatFiles_20150716/READMISSION REDUCTION.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150506/READMISSION REDUCTION.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150416/READMISSION REDUCTION.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150122/READMISSION REDUCTION.csv',
           
           '2014/HOSArchive_Revised_Flatfiles_20141218/READMISSION REDUCTION.csv',
           '2014/HOSArchive_Revised_Flatfiles_20141023/READMISSION REDUCTION.csv',
           '2014/HOSArchive_Revised_Flatfiles_20140717/READMISSION REDUCTION.csv',
           '2014/HOSArchive_Revised_Flatfiles_20140417/READMISSION REDUCTION.CSV',
           '2014/HOSArchive_Revised_Flatfiles_20140101/READMISSION REDUCTION.CSV',
           
           '2013/HOSArchive_Revised_Flatfiles_20131001/READMISSION REDUCTION.CSV',
           '2013/HOSArchive_Revised_Flatfiles_20130701/READMISSION REDUCTION.CSV',
           '2013/HOSArchive_Revised_Flatfiles_20130401/READMISSION REDUCTION.CSV',
           
           # The 2012 releases are disabled and have no matching yrs/mos entries.
           # NOTE(review): the reason they are excluded is not recorded here.
           #'2012/HOSArchive_Revised_Flatfiles_20121001/READMISSION REDUCTION.CSV',
           #'2012/HOSArchive_Revised_Flatfiles_20120701/READMISSION REDUCTION.CSV',
           #'2012/HOSArchive_Flatfiles_20120701/READMISSION REDUCTION.CSV',
           
           ]

# Read every archive file, standardise it with process2(), and stack the
# results into one long table `df`.

# yrs, mos and subdirs are matched by position. Fail up front if they have
# drifted out of sync, instead of mislabelling files or stopping mid-run.
if not (len(subdirs) == len(yrs) == len(mos)):
    raise ValueError(
        'subdirs, yrs and mos must have equal lengths; got '
        + str((len(subdirs), len(yrs), len(mos)))
    )

# Union of the column labels seen across all processed files.
col_labs = set()

for subdir, yr, mo in zip(subdirs, yrs, mos):
    # capture_output() swallows anything written to stdout/stderr while the
    # file is being read, keeping the cell output to one line per file.
    with io.capture_output():
        df = pd.read_csv(main_dir + subdir, encoding="ISO-8859-1")
    print(subdir + ' :  (rows, columns) =', df.shape)

    df, lists = process2(df, lists, yr, mo)
    df_list.append(df)
    col_labs.update(df.columns)

# Sorted so the printed list is the same on every run (set order is not).
col_labs = sorted(col_labs)
print('\n')
print(col_labs)

# Stop here if any file ended up with a column the others lack.
check_lists(lists)

# NOTE(review): this value is not used by any cell visible here; kept in
# case a later cell relies on it.
subdir = 'HRRP/CombinedFiles_HRRP'

df = pd.concat(df_list)
print('df.shape:', df.shape)


2023/hospitals_01_2023/FY_2023_Hospital_Readmissions_Reduction_Program_Hospital.csv :  (rows, columns) = (18990, 12)
2023/hospitals_04_2023/FY_2023_Hospital_Readmissions_Reduction_Program_Hospital.csv :  (rows, columns) = (18990, 12)
2022/hospitals_01_2022/FY_2022_Hospital_Readmissions_Reduction_Program_Hospital.csv :  (rows, columns) = (19020, 12)
2022/hospitals_04_2022/FY_2022_Hospital_Readmissions_Reduction_Program_Hospital.csv :  (rows, columns) = (19020, 12)
2022/hospitals_07_2022/FY_2022_Hospital_Readmissions_Reduction_Program_Hospital.csv :  (rows, columns) = (19020, 12)
2021/hospitals_01_2021/FY_2021_Hospital_Readmissions_Reduction_Program_Hospital.csv :  (rows, columns) = (19224, 12)
2021/hospitals_03_2021/FY_2021_Hospital_Readmissions_Reduction_Program_Hospital.csv :  (rows, columns) = (19224, 12)
2021/hospitals_04_2021/FY_2021_Hospital_Readmissions_Reduction_Program_Hospital.csv :  (rows, columns) = (19224, 12)
2021/hospitals_07_2021/FY_2021_Hospital_Readmissions_Reduction_P

In [4]:
# List the distinct measure labels found across all files; the different
# spellings shown here are what the mapping in the next code cell normalises.
print(df['Measure Name'].unique())

['READM-30-HIP-KNEE-HRRP' 'READM-30-COPD-HRRP' 'READM-30-AMI-HRRP'
 'READM-30-PN-HRRP' 'READM-30-HF-HRRP' 'READM-30-CABG-HRRP'
 'READM_30_AMI_HRRP' 'READM_30_CABG_HRRP' 'READM_30_COPD_HRRP'
 'READM_30_HF_HRRP' 'READM_30_HIP_KNEE_HRRP' 'READM_30_PN_HRRP'
 'Acute Myocardial Infarction (AMI) 30-Day Readmissions'
 'Heart Failure (HF) 30-Day Readmissions'
 'Pneumonia (PN) 30-Day Readmissions']


In [5]:
# Keep only the identifier, release, period and measure-value columns.
# filter() skips any requested label that is not present instead of raising.
df = df.filter(
    items=[
        'Facility ID', 'file_month', 'file_year',
        'Start Date', 'End Date',
        'Measure Name',
        'Excess Readmission Ratio', 'Expected Readmission Rate',
        'Number of Discharges', 'Number of Readmissions',
        'Predicted Readmission Rate',
        'Footnote',
    ],
    axis='columns',
)

print(df.shape)

(745170, 12)


In [6]:
# Map every spelling of a measure label that occurs in the files
# (hyphenated, underscored, or written out in words) to one short code.
d = {
     'READM-30-HIP-KNEE-HRRP': 'READM-30-HIP-KNEE', 
     'READM-30-COPD-HRRP': 'READM-30-COPD',
     'READM-30-AMI-HRRP': 'READM-30-AMI',
     'READM-30-PN-HRRP': 'READM-30-PN',
     'READM-30-HF-HRRP': 'READM-30-HF',
     'READM-30-CABG-HRRP': 'READM-30-CABG',
     'READM_30_AMI_HRRP': 'READM-30-AMI',
     'READM_30_CABG_HRRP': 'READM-30-CABG',
     'READM_30_COPD_HRRP': 'READM-30-COPD',
     'READM_30_HF_HRRP': 'READM-30-HF',
     'READM_30_HIP_KNEE_HRRP': 'READM-30-HIP-KNEE',
     'READM_30_PN_HRRP': 'READM-30-PN',
     'Acute Myocardial Infarction (AMI) 30-Day Readmissions': 'READM-30-AMI',
     'Heart Failure (HF) 30-Day Readmissions': 'READM-30-HF', 
     'Pneumonia (PN) 30-Day Readmissions': 'READM-30-PN',
     }

# Assign the result back to the column. Calling replace(inplace=True) on
# df['Measure Name'] acts on a column selection, which under pandas
# Copy-on-Write is a temporary object, so df itself would be left unchanged.
df['Measure Name'] = df['Measure Name'].replace(to_replace=d)


In [7]:
# Reshape from long (one row per facility, release and measure) to wide
# (one row per facility and release, one column group per measure).
key_cols = ['Facility ID', 'file_month', 'file_year']

Measures = sorted(df['Measure Name'].unique())
main_df = pd.DataFrame(columns=key_cols)

for mi in Measures:

    # Rows belonging to this measure only.
    tdf2 = df[df['Measure Name'] == mi]

    # Everything except the keys and the measure label is a value column.
    value_cols = [c for c in list(tdf2)
                  if c != 'Measure Name' and c not in key_cols]

    # NOTE(review): errors='coerce' turns every value that is not numeric
    # into NaN (presumably including the Start Date / End Date strings --
    # TODO confirm that losing them is intended).
    for n in value_cols:
        tdf2[n] = pd.to_numeric(tdf2[n], errors='coerce')

    # Tag each value column with its measure, e.g. 'READM-30-AMI (Footnote)'.
    tdf2 = tdf2.rename(columns={n: mi + ' (' + n + ')' for n in value_cols})
    tdf2 = tdf2.drop(labels=['Measure Name'], axis=1)

    # Two-step outer merge: first onto an empty key frame, then onto the
    # running result. `df2` and `main_df` stay defined at top level because
    # the following cell deletes them by name.
    df2 = pd.DataFrame(columns=key_cols).merge(tdf2, on=key_cols, how='outer')
    main_df = main_df.merge(df2, on=key_cols, how='outer')



In [8]:
# Take ownership of the wide table under the name `df` and release the
# intermediates left over from the reshape.
del df2
df = main_df.copy()
del main_df

print(df.shape)

# Drop any column whose values repeat an earlier column exactly (the first
# one is kept), then drop columns that contain nothing but NaN.
df = df.loc[:, ~df.T.duplicated()]
df = df.dropna(axis='columns', how='all')

print(df.shape)
df.head()


(141676, 51)
(141676, 39)


Unnamed: 0,Facility ID,file_month,file_year,READM-30-AMI (Excess Readmission Ratio),READM-30-AMI (Expected Readmission Rate),READM-30-AMI (Number of Discharges),READM-30-AMI (Number of Readmissions),READM-30-AMI (Predicted Readmission Rate),READM-30-AMI (Footnote),READM-30-CABG (Excess Readmission Ratio),READM-30-CABG (Expected Readmission Rate),READM-30-CABG (Number of Discharges),READM-30-CABG (Number of Readmissions),READM-30-CABG (Predicted Readmission Rate),READM-30-CABG (Footnote),READM-30-COPD (Excess Readmission Ratio),READM-30-COPD (Expected Readmission Rate),READM-30-COPD (Number of Discharges),READM-30-COPD (Number of Readmissions),READM-30-COPD (Predicted Readmission Rate),READM-30-COPD (Footnote),READM-30-HF (Excess Readmission Ratio),READM-30-HF (Expected Readmission Rate),READM-30-HF (Number of Discharges),READM-30-HF (Number of Readmissions),READM-30-HF (Predicted Readmission Rate),READM-30-HF (Footnote),READM-30-HIP-KNEE (Excess Readmission Ratio),READM-30-HIP-KNEE (Expected Readmission Rate),READM-30-HIP-KNEE (Number of Discharges),READM-30-HIP-KNEE (Number of Readmissions),READM-30-HIP-KNEE (Predicted Readmission Rate),READM-30-HIP-KNEE (Footnote),READM-30-PN (Excess Readmission Ratio),READM-30-PN (Expected Readmission Rate),READM-30-PN (Number of Discharges),READM-30-PN (Number of Readmissions),READM-30-PN (Predicted Readmission Rate),READM-30-PN (Footnote)
0,10001,1,2023,0.9958,14.5498,319.0,46.0,14.4888,,0.9836,11.3202,165.0,18.0,11.1341,,0.9903,19.3765,202.0,38.0,19.1885,,1.0551,21.8333,757.0,178.0,23.0374,,1.0301,4.4481,,,4.5819,,0.9568,17.7701,437.0,71.0,17.0029,
1,10005,1,2023,0.9603,16.9033,,,16.2327,,,,,,,5.0,0.9034,18.2089,235.0,31.0,16.4506,,1.0057,20.0336,157.0,32.0,20.1475,,0.9816,4.1886,,,4.1113,,0.9777,16.0472,361.0,55.0,15.6895,
2,10006,1,2023,1.1127,12.5406,312.0,49.0,13.9536,,1.3316,11.9544,109.0,24.0,15.9187,,0.9027,18.2945,235.0,31.0,16.5144,,0.944,20.7534,550.0,104.0,19.592,,1.1885,4.1172,246.0,15.0,4.8932,,1.1525,16.5034,538.0,113.0,19.0197,
3,10007,1,2023,,,,,,1.0,,,,,,5.0,0.9979,18.3236,72.0,13.0,18.2845,,1.1129,19.3933,51.0,18.0,21.5828,,,,,,,1.0,0.9628,13.273,,,12.7792,
4,10008,1,2023,,,,,,1.0,,,,,,5.0,,,,,,1.0,,,,,,1.0,,,,,,5.0,,,,,,1.0


In [9]:
# Prefix every measure column with 'HRRP: '; the three key columns keep
# their plain names.
id_cols = ['Facility ID', 'file_month', 'file_year']
df.rename(
    columns={n: 'HRRP: ' + n for n in list(df) if n not in id_cols},
    inplace=True,
)

print(df.shape)
df.head()

(141676, 39)


Unnamed: 0,Facility ID,file_month,file_year,HRRP: READM-30-AMI (Excess Readmission Ratio),HRRP: READM-30-AMI (Expected Readmission Rate),HRRP: READM-30-AMI (Number of Discharges),HRRP: READM-30-AMI (Number of Readmissions),HRRP: READM-30-AMI (Predicted Readmission Rate),HRRP: READM-30-AMI (Footnote),HRRP: READM-30-CABG (Excess Readmission Ratio),HRRP: READM-30-CABG (Expected Readmission Rate),HRRP: READM-30-CABG (Number of Discharges),HRRP: READM-30-CABG (Number of Readmissions),HRRP: READM-30-CABG (Predicted Readmission Rate),HRRP: READM-30-CABG (Footnote),HRRP: READM-30-COPD (Excess Readmission Ratio),HRRP: READM-30-COPD (Expected Readmission Rate),HRRP: READM-30-COPD (Number of Discharges),HRRP: READM-30-COPD (Number of Readmissions),HRRP: READM-30-COPD (Predicted Readmission Rate),HRRP: READM-30-COPD (Footnote),HRRP: READM-30-HF (Excess Readmission Ratio),HRRP: READM-30-HF (Expected Readmission Rate),HRRP: READM-30-HF (Number of Discharges),HRRP: READM-30-HF (Number of Readmissions),HRRP: READM-30-HF (Predicted Readmission Rate),HRRP: READM-30-HF (Footnote),HRRP: READM-30-HIP-KNEE (Excess Readmission Ratio),HRRP: READM-30-HIP-KNEE (Expected Readmission Rate),HRRP: READM-30-HIP-KNEE (Number of Discharges),HRRP: READM-30-HIP-KNEE (Number of Readmissions),HRRP: READM-30-HIP-KNEE (Predicted Readmission Rate),HRRP: READM-30-HIP-KNEE (Footnote),HRRP: READM-30-PN (Excess Readmission Ratio),HRRP: READM-30-PN (Expected Readmission Rate),HRRP: READM-30-PN (Number of Discharges),HRRP: READM-30-PN (Number of Readmissions),HRRP: READM-30-PN (Predicted Readmission Rate),HRRP: READM-30-PN (Footnote)
0,10001,1,2023,0.9958,14.5498,319.0,46.0,14.4888,,0.9836,11.3202,165.0,18.0,11.1341,,0.9903,19.3765,202.0,38.0,19.1885,,1.0551,21.8333,757.0,178.0,23.0374,,1.0301,4.4481,,,4.5819,,0.9568,17.7701,437.0,71.0,17.0029,
1,10005,1,2023,0.9603,16.9033,,,16.2327,,,,,,,5.0,0.9034,18.2089,235.0,31.0,16.4506,,1.0057,20.0336,157.0,32.0,20.1475,,0.9816,4.1886,,,4.1113,,0.9777,16.0472,361.0,55.0,15.6895,
2,10006,1,2023,1.1127,12.5406,312.0,49.0,13.9536,,1.3316,11.9544,109.0,24.0,15.9187,,0.9027,18.2945,235.0,31.0,16.5144,,0.944,20.7534,550.0,104.0,19.592,,1.1885,4.1172,246.0,15.0,4.8932,,1.1525,16.5034,538.0,113.0,19.0197,
3,10007,1,2023,,,,,,1.0,,,,,,5.0,0.9979,18.3236,72.0,13.0,18.2845,,1.1129,19.3933,51.0,18.0,21.5828,,,,,,,1.0,0.9628,13.273,,,12.7792,
4,10008,1,2023,,,,,,1.0,,,,,,5.0,,,,,,1.0,,,,,,1.0,,,,,,5.0,,,,,,1.0


In [10]:
# Save the wide HRRP table as a gzip-compressed pickle (pickle protocol 5).
# The path is relative to the working directory; the 'dataframes' folder
# presumably has to exist already -- verify before running on a new machine.
df.to_pickle('dataframes/hrrp_df.pkl.gz', protocol=5, compression='gzip')