# Generate HVBP clinical outcomes dataframe

In [1]:
import pandas as pd
import numpy as np
import warnings
from IPython.utils import io
import sys
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

main_dir = '~/Desktop/Rush/CMS_HospitalArchives/'

## Define Custom Functions

In [2]:
def curate(df):

    try:
        df = df[df['Facility ID'] != np.nan]
        df['Facility ID'] = df['Facility ID'].values.astype(str)
        
        ids = df['Facility ID'].tolist()
        ids2 = []
        for i in ids:
            if len(i) < 6:
                i = '0' + i
            ids2.append(i)
        df['Facility ID'] = ids2
        
    except:
        pass
    try:
        df = df[df['Facility Name'] != np.nan]
    except:
        pass
    
    for c in list(df):    
        try:
            df[c] = df[c].str.replace("\t","")
        except:
            pass

    return df


def rename_and_fill(df):
    
    cols1 = ['Provider ID', "Measure Start Date", "Measure End Date",
             'Hospital Name', 'Provider Number', ' ZIP Code',
             'Hospital Name', 'Address 1', 'City/Town', 'County/Parish',
             'Telephone Number',
             ]
    
    
    cols2 = ['Facility ID', "Start Date", "End Date",
             'Facility Name', 'Facility ID', 'ZIP Code',
             'Facility Name', 'Address', 'City', 'County Name',
             'Phone Number',
             ]
    
    for i, col in enumerate(cols1):
        if col in list(df):
            df.rename(columns={col: cols2[i]}, inplace=True)
            
    l = list(df)
    l = list(set([x for x in l if l.count(x) > 1]))
    if len(l) > 0:
        print('duplicates:', l)
        sys.exit()
        
    return df


def process2(df, lists, yr, mo):
    df = rename_and_fill(df)
    df = curate(df)
    lists.append(list(df))
    df['file_month'] = [mo]* df.shape[0]
    df['file_year'] = [yr]* df.shape[0]
    df = df.reindex(sorted(df.columns), axis=1)
    return df, lists


## Load Files

In [3]:

df_list = []
lists = []

yrs = ['2023', '2023', '2023', '2023',
       '2022', '2022', '2022', '2022',
       '2021','2021','2021', '2021', '2021',
       '2020', '2020', '2020', '2020', 
       '2019', '2019', '2019', '2019', 
       '2018', '2018', '2018', '2018',
       '2017', '2017', '2017',
       '2016', '2016', '2016', '2016',
       '2015', '2015', '2015', '2015', '2015', '2015',
       '2014', '2014', '2014',
       ]

mos = ['01', '04', '07', '10',
       '01', '04', '07', '10',
       '01', '03', '04', '07', '10', 
       '10', '07', '04', '01', 
       '10', '07', '04', '03', 
       '10', '07', '05', '01',
       '10', '07', '04',
       '12', '11', '08', '05',
       '12', '10', '07', '05', '04', '01',
       '12', '10', '07',
       ]


subdirs = ['2023/hospitals_01_2023/hvbp_clinical_outcomes.csv', 
           '2023/hospitals_04_2023/hvbp_clinical_outcomes.csv',
           '2023/hospitals_07_2023/hvbp_clinical_outcomes.csv',
           '2023/hospitals_10_2023/hvbp_clinical_outcomes.csv',
           
           '2022/hospitals_01_2022/hvbp_clinical_outcomes.csv', 
           '2022/hospitals_04_2022/hvbp_clinical_outcomes.csv',
           '2022/hospitals_07_2022/hvbp_clinical_outcomes.csv',
           '2022/hospitals_10_2022/hvbp_clinical_outcomes.csv',
           
           '2021/hospitals_01_2021/hvbp_clinical_outcomes.csv',
           '2021/hospitals_03_2021/hvbp_clinical_outcomes.csv',
           '2021/hospitals_04_2021/hvbp_clinical_outcomes.csv',
           '2021/hospitals_07_2021/hvbp_clinical_outcomes.csv',
           '2021/hospitals_10_2021/hvbp_clinical_outcomes.csv',
           
           '2020/hospitals_archive_10_2020/hvbp_clinical_outcomes_12_09_2019.csv',
           '2020/hospitals_archive_07_2020/hvbp_clinical_outcomes.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200422/hvbp_clinical_outcomes_12_09_2019.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200129/hvbp_clinical_outcomes_12_09_2019.csv',
           
           '2019/HOSArchive_Revised_Flatfiles_20191030/hvbp_clinical_care_11_09_2018.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190702/hvbp_clinical_care_11_09_2018.csv',
           '2019/HOSArchive_Revised_FlatFiles_20190424/hvbp_clinical_care_11_09_2018.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190321/hvbp_clinical_care_11_09_2018.csv',
           
           '2018/HOSArchive_Revised_FlatFiles_20181031/hvbp_clinical_care_11_07_2017.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180725/hvbp_clinical_care_11_07_2017.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180523/hvbp_clinical_care_11_07_2017.csv', 
           '2018/HOSArchive_Revised_FlatFiles_20180126/hvbp_clinical_care_11_07_2017.csv',
           
           '2017/HOSArchive_Revised_FlatFiles_20171024/hvbp_clinical_care_outcomes_11_10_2016.csv',
           '2017/HOSArchive_Revised_FlatFiles_20170726/hvbp_clinical_care_outcomes_11_10_2016.csv', 
           '2017/HOSArchive_Revised_Flatfiles_20170428/hvbp_clinical_care_outcomes_11_10_2016.csv',
           
           '2016/HOSArchive_Revised_Flatfiles_20161219/hvbp_clinical_care_outcomes_11_10_2016.csv', 
           '2016/Hospital_Revised_FlatFiles_20161110/hvbp_outcome_08_26_2016.csv', 
           '2016/HOSArchive_Revised_FlatFiles_20160810/hvbp_outcome_06_08_2016.csv',
           '2016/HOSArchive_Revised_FlatFiles_20160504/hvbp_outcome_02_18_2016.csv', 
           
           '2015/HOSArchive_Revised_FlatFiles_20151210/hvbp_outcome_11_17_2015.csv', 
           '2015/HOSArchive_Revised_FlatFiles_20151008/hvbp_outcome_08_06_2015.csv', 
           '2015/HOSArchive_Revised_FlatFiles_20150716/hvbp_outcome_05_18_2015.csv', 
           '2015/HOSArchive_Revised_Flatfiles_20150506/hvbp_outcome_02_18_2015.csv', 
           '2015/HOSArchive_Revised_Flatfiles_20150416/hvbp_outcome_02_18_2015.csv', 
           '2015/HOSArchive_Revised_Flatfiles_20150122/hvbp_outcome_10_28_2014.csv',
           
           '2014/HOSArchive_Revised_Flatfiles_20141218/hvbp_outcome_10_28_2014.csv', 
           '2014/HOSArchive_Revised_Flatfiles_20141023/hvbp_outcome_02_25_2014.csv', 
           '2014/HOSArchive_Revised_Flatfiles_20140717/hvbp_outcome_02_25_2014.csv',
           ]


for i, subdir in enumerate(subdirs):
    with io.capture_output() as captured: df = pd.read_csv(main_dir + subdir, encoding = "ISO-8859-1")
    print(subdir + ' :  (rows, columns) =', df.shape)
    df, lists = process2(df, lists, yrs[i], mos[i])
    df_list.append(df)

df = pd.concat(df_list)
print('df.shape:', df.shape)

del df_list
df.head()

2023/hospitals_01_2023/hvbp_clinical_outcomes.csv :  (rows, columns) = (2517, 50)
2023/hospitals_04_2023/hvbp_clinical_outcomes.csv :  (rows, columns) = (2517, 50)
2023/hospitals_07_2023/hvbp_clinical_outcomes.csv :  (rows, columns) = (2517, 50)
2023/hospitals_10_2023/hvbp_clinical_outcomes.csv :  (rows, columns) = (2517, 50)
2022/hospitals_01_2022/hvbp_clinical_outcomes.csv :  (rows, columns) = (2676, 43)
2022/hospitals_04_2022/hvbp_clinical_outcomes.csv :  (rows, columns) = (2236, 50)
2022/hospitals_07_2022/hvbp_clinical_outcomes.csv :  (rows, columns) = (2236, 50)
2022/hospitals_10_2022/hvbp_clinical_outcomes.csv :  (rows, columns) = (2236, 50)
2021/hospitals_01_2021/hvbp_clinical_outcomes.csv :  (rows, columns) = (2676, 43)
2021/hospitals_03_2021/hvbp_clinical_outcomes.csv :  (rows, columns) = (2676, 43)
2021/hospitals_04_2021/hvbp_clinical_outcomes.csv :  (rows, columns) = (2676, 43)
2021/hospitals_07_2021/hvbp_clinical_outcomes.csv :  (rows, columns) = (2676, 43)
2021/hospitals_1

Unnamed: 0,Address,COMP-HIP-KNEE Achievement Points,COMP-HIP-KNEE Achievement Threshold,COMP-HIP-KNEE Baseline Rate,COMP-HIP-KNEE Benchmark,COMP-HIP-KNEE Improvement Points,COMP-HIP-KNEE Measure Score,COMP-HIP-KNEE Performance Rate,City,County Name,Facility ID,Facility Name,Fiscal Year,MORT-30-AMI Achievement Points,MORT-30-AMI Achievement Threshold,MORT-30-AMI Baseline Rate,MORT-30-AMI Benchmark,MORT-30-AMI Improvement Points,MORT-30-AMI Measure Score,MORT-30-AMI Performance Rate,MORT-30-CABG Achievement Points,MORT-30-CABG Achievement Threshold,MORT-30-CABG Baseline Rate,MORT-30-CABG Benchmark,MORT-30-CABG Improvement Points,MORT-30-CABG Measure Score,MORT-30-CABG Performance Rate,MORT-30-COPD Achievement Points,MORT-30-COPD Achievement Threshold,MORT-30-COPD Baseline Rate,MORT-30-COPD Benchmark,MORT-30-COPD Improvement Points,MORT-30-COPD Measure Score,MORT-30-COPD Performance Rate,MORT-30-HF Achievement Points,MORT-30-HF Achievement Threshold,MORT-30-HF Baseline Rate,MORT-30-HF Benchmark,MORT-30-HF Improvement Points,MORT-30-HF Measure Score,MORT-30-HF Performance Rate,MORT-30-PN Achievement Points,MORT-30-PN Achievement Threshold,MORT-30-PN Baseline Rate,MORT-30-PN Benchmark,MORT-30-PN Improvement Points,MORT-30-PN Measure Score,MORT-30-PN Performance Rate,State,ZIP Code,file_month,file_year,Combined SSI Measure Score,HAI-1 Achievement Points,HAI-1 Improvement Points,HAI-1 Measure Score,HAI-1_Performance_Rate,HAI-2 Achievement Points,HAI-2 Improvement Points,HAI-2 Measure Score,HAI-2 Performance Rate,HAI-3 Achievement Points,HAI-3 Improvement Points,HAI-3 Measure Score,HAI-3 Performance Rate,HAI-4 Achievement Points,HAI-4 Improvement Points,HAI-4 Measure Score,HAI-4 Performance Rate,PSI-90 Achievement Points,PSI-90 Improvement Points,PSI-90 Measure Score,PSI-90 Performance Rate,HAI-1 Performance Rate
0,1108 ROSS CLARK CIRCLE,4 out of 10,0.027428,0.041143,0.019779,7 out of 9,7 out of 10,0.024390,DOTHAN,HOUSTON,10001,SOUTHEAST HEALTH MEDICAL CENTER,2023.0,4 out of 10,0.866548,0.870565,0.885499,2 out of 9,4 out of 10,0.874377,0 out of 10,0.968747,0.964957,0.97962,0 out of 9,0 out of 10,0.953400,0 out of 10,0.919769,0.899833,0.936349,5 out of 9,5 out of 10,0.9184,10 out of 10,0.881939,0.885073,0.906798,9 out of 9,10 out of 10,0.914148,Not Available,0.840138,0.843051,0.871741,Not Available,Not Available,0.838418,AL,36301,1,2023,,,,,,,,,,,,,,,,,,,,,,
1,2505 U S HIGHWAY 431 NORTH,10 out of 10,0.027428,0.031273,0.019779,9 out of 9,10 out of 10,0.017911,BOAZ,MARSHALL,10005,MARSHALL MEDICAL CENTERS,2023.0,4 out of 10,0.866548,0.856504,0.885499,5 out of 9,5 out of 10,0.873459,Not Available,0.968747,Not Available,0.97962,Not Available,Not Available,Not Available,0 out of 10,0.919769,0.908701,0.936349,1 out of 9,1 out of 10,0.912705,0 out of 10,0.881939,0.848032,0.906798,0 out of 9,0 out of 10,0.834068,Not Available,0.840138,0.797336,0.871741,Not Available,Not Available,0.776138,AL,35957,1,2023,,,,,,,,,,,,,,,,,,,,,,
2,1701 VETERANS DRIVE,0 out of 10,0.027428,0.038103,0.019779,2 out of 9,2 out of 10,0.032998,FLORENCE,LAUDERDALE,10006,NORTH ALABAMA MEDICAL CENTER,2023.0,0 out of 10,0.866548,0.844498,0.885499,0 out of 9,0 out of 10,0.836183,0 out of 10,0.968747,0.955079,0.97962,3 out of 9,3 out of 10,0.964657,1 out of 10,0.919769,0.927433,0.936349,0 out of 9,1 out of 10,0.921463,0 out of 10,0.881939,0.870836,0.906798,0 out of 9,0 out of 10,0.874387,Not Available,0.840138,0.823683,0.871741,Not Available,Not Available,0.823979,AL,35630,1,2023,,,,,,,,,,,,,,,,,,,,,,
3,702 N MAIN ST,Not Available,0.027428,0.027864,0.019779,Not Available,Not Available,Not Available,OPP,COVINGTON,10007,MIZELL MEMORIAL HOSPITAL,2023.0,Not Available,0.866548,0.865244,0.885499,Not Available,Not Available,Not Available,Not Available,0.968747,Not Available,0.97962,Not Available,Not Available,Not Available,0 out of 10,0.919769,0.919674,0.936349,0 out of 9,0 out of 10,0.899788,0 out of 10,0.881939,0.853712,0.906798,2 out of 9,2 out of 10,0.865286,Not Available,0.840138,0.7755,0.871741,Not Available,Not Available,0.777866,AL,36467,1,2023,,,,,,,,,,,,,,,,,,,,,,
4,50 MEDICAL PARK EAST DRIVE,5 out of 10,0.027428,0.035758,0.019779,7 out of 9,7 out of 10,0.023839,BIRMINGHAM,JEFFERSON,10011,ST. VINCENT'S EAST,2023.0,1 out of 10,0.866548,0.868232,0.885499,0 out of 9,1 out of 10,0.866983,0 out of 10,0.968747,0.961724,0.97962,1 out of 9,1 out of 10,0.965239,0 out of 10,0.919769,0.917173,0.936349,0 out of 9,0 out of 10,0.916934,0 out of 10,0.881939,0.875159,0.906798,1 out of 9,1 out of 10,0.879817,Not Available,0.840138,0.829594,0.871741,Not Available,Not Available,0.824947,AL,35235,1,2023,,,,,,,,,,,,,,,,,,,,,,


In [4]:
df.drop(labels=['Address', 'City', 'County Name',
                    'State', 'ZIP Code', 'Fiscal Year',
                    ], axis=1, inplace=True)

df.rename(columns={'HAI-1_Performance_Rate': 'HAI-1 Performance_Rate'}, inplace=True)
print(df.shape)

ls_10 = []
ls_9 = []

for l in list(df):
    ls = df[l].tolist()
    for i in ls:
        try:
            if 'out of 10' in i:
                ls_10.append(l)
                break
            elif 'out of 9' in i:
                ls_9.append(l)
                break
        except:
            continue

for l in ls_9:
    df[l] = df[l].replace(['0 out of 9', 
                                   '1 out of 9', '2 out of 9', '3 out of 9',
                                   '4 out of 9', '5 out of 9', '6 out of 9', 
                                   '7 out of 9', '8 out of 9', '9 out of 9'],
                                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    df[l] = pd.to_numeric(df[l], errors='coerce')

for l in ls_10:
    df[l] = df[l].replace(['0 out of 10', 
                                   '1 out of 10', '2 out of 10', '3 out of 10',
                                   '4 out of 10', '5 out of 10', '6 out of 10', 
                                   '7 out of 10', '8 out of 10', '9 out of 10', 
                                   '10 out of 10'],
                                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    df[l] = pd.to_numeric(df[l], errors='coerce')

ls = ['file_year', 'file_month', 'Facility Name', 'Facility ID']

for l in ls:
    col = df.pop(l)
    df.insert(0, col.name, col)
    
    
print(df.shape)
df.drop_duplicates(inplace=True)
print(df.shape)
df.drop_duplicates(subset = ['Facility ID', 'Facility Name', 'file_month','file_year'], inplace=True)
print(df.shape)
df.head()

(114015, 68)
(114015, 68)
(114015, 68)
(114015, 68)


Unnamed: 0,Facility ID,Facility Name,file_month,file_year,COMP-HIP-KNEE Achievement Points,COMP-HIP-KNEE Achievement Threshold,COMP-HIP-KNEE Baseline Rate,COMP-HIP-KNEE Benchmark,COMP-HIP-KNEE Improvement Points,COMP-HIP-KNEE Measure Score,COMP-HIP-KNEE Performance Rate,MORT-30-AMI Achievement Points,MORT-30-AMI Achievement Threshold,MORT-30-AMI Baseline Rate,MORT-30-AMI Benchmark,MORT-30-AMI Improvement Points,MORT-30-AMI Measure Score,MORT-30-AMI Performance Rate,MORT-30-CABG Achievement Points,MORT-30-CABG Achievement Threshold,MORT-30-CABG Baseline Rate,MORT-30-CABG Benchmark,MORT-30-CABG Improvement Points,MORT-30-CABG Measure Score,MORT-30-CABG Performance Rate,MORT-30-COPD Achievement Points,MORT-30-COPD Achievement Threshold,MORT-30-COPD Baseline Rate,MORT-30-COPD Benchmark,MORT-30-COPD Improvement Points,MORT-30-COPD Measure Score,MORT-30-COPD Performance Rate,MORT-30-HF Achievement Points,MORT-30-HF Achievement Threshold,MORT-30-HF Baseline Rate,MORT-30-HF Benchmark,MORT-30-HF Improvement Points,MORT-30-HF Measure Score,MORT-30-HF Performance Rate,MORT-30-PN Achievement Points,MORT-30-PN Achievement Threshold,MORT-30-PN Baseline Rate,MORT-30-PN Benchmark,MORT-30-PN Improvement Points,MORT-30-PN Measure Score,MORT-30-PN Performance Rate,Combined SSI Measure Score,HAI-1 Achievement Points,HAI-1 Improvement Points,HAI-1 Measure Score,HAI-1 Performance_Rate,HAI-2 Achievement Points,HAI-2 Improvement Points,HAI-2 Measure Score,HAI-2 Performance Rate,HAI-3 Achievement Points,HAI-3 Improvement Points,HAI-3 Measure Score,HAI-3 Performance Rate,HAI-4 Achievement Points,HAI-4 Improvement Points,HAI-4 Measure Score,HAI-4 Performance Rate,PSI-90 Achievement Points,PSI-90 Improvement Points,PSI-90 Measure Score,PSI-90 Performance Rate,HAI-1 Performance Rate
0,10001,SOUTHEAST HEALTH MEDICAL CENTER,1,2023,4.0,0.027428,0.041143,0.019779,7.0,7.0,0.024390,4.0,0.866548,0.870565,0.885499,2.0,4.0,0.874377,0.0,0.968747,0.964957,0.97962,0.0,0.0,0.953400,0.0,0.919769,0.899833,0.936349,5.0,5.0,0.9184,10.0,0.881939,0.885073,0.906798,9.0,10.0,0.914148,,0.840138,0.843051,0.871741,,,0.838418,,,,,,,,,,,,,,,,,,,,,,
1,10005,MARSHALL MEDICAL CENTERS,1,2023,10.0,0.027428,0.031273,0.019779,9.0,10.0,0.017911,4.0,0.866548,0.856504,0.885499,5.0,5.0,0.873459,,0.968747,Not Available,0.97962,,,Not Available,0.0,0.919769,0.908701,0.936349,1.0,1.0,0.912705,0.0,0.881939,0.848032,0.906798,0.0,0.0,0.834068,,0.840138,0.797336,0.871741,,,0.776138,,,,,,,,,,,,,,,,,,,,,,
2,10006,NORTH ALABAMA MEDICAL CENTER,1,2023,0.0,0.027428,0.038103,0.019779,2.0,2.0,0.032998,0.0,0.866548,0.844498,0.885499,0.0,0.0,0.836183,0.0,0.968747,0.955079,0.97962,3.0,3.0,0.964657,1.0,0.919769,0.927433,0.936349,0.0,1.0,0.921463,0.0,0.881939,0.870836,0.906798,0.0,0.0,0.874387,,0.840138,0.823683,0.871741,,,0.823979,,,,,,,,,,,,,,,,,,,,,,
3,10007,MIZELL MEMORIAL HOSPITAL,1,2023,,0.027428,0.027864,0.019779,,,Not Available,,0.866548,0.865244,0.885499,,,Not Available,,0.968747,Not Available,0.97962,,,Not Available,0.0,0.919769,0.919674,0.936349,0.0,0.0,0.899788,0.0,0.881939,0.853712,0.906798,2.0,2.0,0.865286,,0.840138,0.7755,0.871741,,,0.777866,,,,,,,,,,,,,,,,,,,,,,
4,10011,ST. VINCENT'S EAST,1,2023,5.0,0.027428,0.035758,0.019779,7.0,7.0,0.023839,1.0,0.866548,0.868232,0.885499,0.0,1.0,0.866983,0.0,0.968747,0.961724,0.97962,1.0,1.0,0.965239,0.0,0.919769,0.917173,0.936349,0.0,0.0,0.916934,0.0,0.881939,0.875159,0.906798,1.0,1.0,0.879817,,0.840138,0.829594,0.871741,,,0.824947,,,,,,,,,,,,,,,,,,,,,,


## Save dataframe

In [5]:
df.to_pickle('~/GitHub/hospitals-data-archive/dataframes/partial_dataframes/hvbp_clinical_outcomes_df.pkl.gz', protocol=5, compression='gzip')
