# Generate HVBP safety dataframe

In [1]:
import pandas as pd
import numpy as np
import warnings
from IPython.utils import io
import sys
# Silence every Python warning for the rest of the session. Note that this
# also hides any pandas warnings raised by the cells below.
warnings.filterwarnings('ignore')
# Remove pandas' display limits so frames are shown with all columns and rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Root folder of the CMS hospital archive files; the per-file relative paths
# used when loading are appended to this string.
main_dir = '~/Desktop/Rush/CMS_HospitalArchives/'

## Define Custom Functions

In [2]:

def curate(df):
    """Clean the identifier, name and text columns of one flat file.

    Each step is applied only when the relevant column exists:

    * rows whose 'Facility ID' or 'Facility Name' is missing are dropped;
    * 'Facility ID' is stored as a string, left-padded with zeros to six
      characters;
    * tab characters are removed from every text value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; a cleaned copy is returned.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    # NaN never compares equal to anything, so `column != np.nan` is True for
    # every row and filters nothing; missing values must be found with notna().
    keep = np.ones(len(df), dtype=bool)
    for required in ('Facility ID', 'Facility Name'):
        if required in df.columns:
            keep &= df[required].notna().to_numpy()
    # Copy so the assignments below never write into the caller's frame.
    df = df[keep].copy()

    if 'Facility ID' in df.columns:
        ids = df['Facility ID']
        # A column that contained NaN is parsed as float; convert whole-number
        # floats back to integers so 10001.0 becomes '10001', not '10001.0'.
        if pd.api.types.is_float_dtype(ids) and (ids % 1 == 0).all():
            ids = ids.astype('int64')
        # zfill pads to the full width, however many leading zeros were lost.
        df['Facility ID'] = ids.astype(str).str.zfill(6)

    for c in list(df):
        try:
            stripped = df[c].str.replace("\t", "", regex=False)
        except AttributeError:
            # Not a text column (no .str accessor): nothing to strip.
            continue
        # .str methods return NaN for non-string cells of a mixed column;
        # put the original values back instead of silently losing them.
        df[c] = stripped.where(stripped.notna(), df[c])

    return df


def rename_and_fill(df):
    """Rename legacy column headers to the current naming scheme, in place.

    Every legacy header listed below that is present in ``df`` is renamed to
    its current equivalent. If the frame then contains repeated column names
    (for example because both 'Provider ID' and 'Provider Number' were
    present), the repeated names are printed and ``sys.exit()`` is called.

    Despite the name, no values are filled in by this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are renamed; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    legacy_to_current = (
        ('Provider ID', 'Facility ID'),
        ("Measure Start Date", "Start Date"),
        ("Measure End Date", "End Date"),
        ('Hospital Name', 'Facility Name'),
        ('Provider Number', 'Facility ID'),
        (' ZIP Code', 'ZIP Code'),
        ('Address 1', 'Address'),
        ('City/Town', 'City'),
        ('County/Parish', 'County Name'),
        ('Telephone Number', 'Phone Number'),
    )

    # No new name is itself a legacy name, so one combined rename gives the
    # same result as renaming the columns one after another.
    present = {old: new for old, new in legacy_to_current if old in df.columns}
    if present:
        df.rename(columns=present, inplace=True)

    names = df.columns.tolist()
    repeated = list({name for name in names if names.count(name) > 1})
    if repeated:
        print('duplicates:', repeated)
        sys.exit()

    return df


def process2(df, lists, yr, mo):
    """Standardise one archive file and tag it with its release date.

    The frame is passed through ``rename_and_fill`` and then ``curate``. The
    resulting column names are appended to ``lists`` (before the two tag
    columns are added), 'file_month' and 'file_year' columns holding ``mo``
    and ``yr`` are added, and the columns are put in sorted order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame read from one archive file.
    lists : list
        Accumulator of per-file column-name lists; appended to in place.
    yr, mo
        Values written to every row of 'file_year' and 'file_month'.

    Returns
    -------
    tuple
        ``(frame_with_sorted_columns, lists)``.
    """
    cleaned = curate(rename_and_fill(df))
    lists.append(cleaned.columns.tolist())

    n_rows = len(cleaned)
    cleaned['file_month'] = [mo] * n_rows
    cleaned['file_year'] = [yr] * n_rows

    ordered = cleaned.reindex(columns=sorted(cleaned.columns))
    return ordered, lists


## Load Files

In [3]:
df_list = []
lists = []

# yrs, mos and subdirs are parallel lists: entry k of each describes the same
# archive release (year tag, month tag, path relative to main_dir).
yrs = ['2023', '2023', '2023', '2023',
       '2022', '2022', '2022', '2022',
       '2021','2021','2021', '2021', '2021',
       '2020', '2020', '2020', '2020', 
       '2019', '2019', '2019', '2019', 
       '2018', '2018', '2018', '2018',
       '2017', '2017', '2017',
       '2016',
       ]

mos = ['01', '04', '07', '10',
       '01', '04', '07', '10',
       '01', '03', '04', '07', '10', 
       '10', '07', '04', '01', 
       '10', '07', '04', '03', 
       '10', '07', '05', '01',
       '10', '07', '04',
       '12',
       ]


subdirs = ['2023/hospitals_01_2023/hvbp_safety.csv', 
           '2023/hospitals_04_2023/hvbp_safety.csv',
           '2023/hospitals_07_2023/hvbp_safety.csv',
           '2023/hospitals_10_2023/hvbp_safety.csv',
           
           '2022/hospitals_01_2022/hvbp_safety.csv', 
           '2022/hospitals_04_2022/hvbp_safety.csv',
           '2022/hospitals_07_2022/hvbp_safety.csv',
           '2022/hospitals_10_2022/hvbp_safety.csv',
           
           '2021/hospitals_01_2021/hvbp_safety.csv',
           '2021/hospitals_03_2021/hvbp_safety.csv',
           '2021/hospitals_04_2021/hvbp_safety.csv',
           '2021/hospitals_07_2021/hvbp_safety.csv',
           '2021/hospitals_10_2021/hvbp_safety.csv',
           
           # NOTE(review): the next entry reads an hvbp_clinical_outcomes file,
           # not an hvbp_safety file, and the combined frame contains
           # MORT-30-* and COMP-HIP-KNEE columns. This looks like a copy-paste
           # slip; confirm the safety filename in this archive and replace it.
           '2020/hospitals_archive_10_2020/hvbp_clinical_outcomes_12_09_2019.csv',
           '2020/hospitals_archive_07_2020/hvbp_safety.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200422/hvbp_safety_12_09_2019.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200129/hvbp_safety_12_09_2019.csv',
           
           '2019/HOSArchive_Revised_Flatfiles_20191030/hvbp_safety_11_09_2018.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190702/hvbp_safety_11_09_2018.csv',
           '2019/HOSArchive_Revised_FlatFiles_20190424/hvbp_safety_11_09_2018.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190321/hvbp_safety_11_09_2018.csv',
           
           '2018/HOSArchive_Revised_FlatFiles_20181031/hvbp_safety_11_07_2017.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180725/hvbp_safety_11_07_2017.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180523/hvbp_safety_11_07_2017.csv', 
           '2018/HOSArchive_Revised_FlatFiles_20180126/hvbp_safety_11_07_2017.csv',
           
           '2017/HOSArchive_Revised_FlatFiles_20171024/hvbp_safety_11_10_2016.csv',
           '2017/HOSArchive_Revised_FlatFiles_20170726/hvbp_safety_11_10_2016.csv', 
           '2017/HOSArchive_Revised_Flatfiles_20170428/hvbp_safety_11_10_2016.csv',
           
           '2016/HOSArchive_Revised_Flatfiles_20161219/hvbp_safety_11_10_2016.csv', 
           
           ]

# The three lists are matched purely by position, so a missing or extra entry
# would tag files with the wrong year/month. Fail loudly before loading.
if not (len(yrs) == len(mos) == len(subdirs)):
    raise ValueError(
        'yrs, mos and subdirs must have the same length, got '
        + str((len(yrs), len(mos), len(subdirs)))
    )

for yr, mo, subdir in zip(yrs, mos, subdirs):
    # Anything read_csv prints while parsing is captured rather than shown.
    with io.capture_output() as captured:
        df = pd.read_csv(main_dir + subdir, encoding="ISO-8859-1")
    print(subdir + ' :  (rows, columns) =', df.shape)
    df, lists = process2(df, lists, yr, mo)
    df_list.append(df)

# Stack all releases; columns missing from a given file are filled with NaN.
df = pd.concat(df_list)
print('df.shape:', df.shape)

# Release the per-file frames now that they are combined.
del df_list
df.head()

2023/hospitals_01_2023/hvbp_safety.csv :  (rows, columns) = (2517, 51)
2023/hospitals_04_2023/hvbp_safety.csv :  (rows, columns) = (2517, 51)
2023/hospitals_07_2023/hvbp_safety.csv :  (rows, columns) = (2517, 51)
2023/hospitals_10_2023/hvbp_safety.csv :  (rows, columns) = (2517, 51)
2022/hospitals_01_2022/hvbp_safety.csv :  (rows, columns) = (2676, 51)
2022/hospitals_04_2022/hvbp_safety.csv :  (rows, columns) = (2236, 51)
2022/hospitals_07_2022/hvbp_safety.csv :  (rows, columns) = (2236, 51)
2022/hospitals_10_2022/hvbp_safety.csv :  (rows, columns) = (2236, 51)
2021/hospitals_01_2021/hvbp_safety.csv :  (rows, columns) = (2676, 51)
2021/hospitals_03_2021/hvbp_safety.csv :  (rows, columns) = (2676, 51)
2021/hospitals_04_2021/hvbp_safety.csv :  (rows, columns) = (2676, 51)
2021/hospitals_07_2021/hvbp_safety.csv :  (rows, columns) = (2676, 51)
2021/hospitals_10_2021/hvbp_safety.csv :  (rows, columns) = (2676, 51)
2020/hospitals_archive_10_2020/hvbp_clinical_outcomes_12_09_2019.csv :  (rows

Unnamed: 0,Address,City,Combined SSI Measure Score,County Name,Facility ID,Facility Name,Fiscal Year,HAI-1 Achievement Points,HAI-1 Achievement Threshold,HAI-1 Baseline Rate,HAI-1 Benchmark,HAI-1 Improvement Points,HAI-1 Measure Score,HAI-1 Performance Rate,HAI-2 Achievement Points,HAI-2 Achievement Threshold,HAI-2 Baseline Rate,HAI-2 Benchmark,HAI-2 Improvement Points,HAI-2 Measure Score,HAI-2 Performance Rate,HAI-3 Achievement Points,HAI-3 Achievement Threshold,HAI-3 Baseline Rate,HAI-3 Benchmark,HAI-3 Improvement Points,HAI-3 Measure Score,HAI-3 Performance Rate,HAI-4 Achievement Points,HAI-4 Achievement Threshold,HAI-4 Baseline Rate,HAI-4 Benchmark,HAI-4 Improvement Points,HAI-4 Measure Score,HAI-4 Performance Rate,HAI-5 Achievement Points,HAI-5 Achievement Threshold,HAI-5 Baseline Rate,HAI-5 Benchmark,HAI-5 Improvement Points,HAI-5 Measure Score,HAI-5 Performance Rate,HAI-6 Achievement Points,HAI-6 Achievement Threshold,HAI-6 Baseline Rate,HAI-6 Benchmark,HAI-6 Improvement Points,HAI-6 Measure Score,HAI-6 Performance Rate,State,ZIP Code,file_month,file_year,COMP-HIP-KNEE Achievement Points,COMP-HIP-KNEE Achievement Threshold,COMP-HIP-KNEE Baseline Rate,COMP-HIP-KNEE Benchmark,COMP-HIP-KNEE Improvement Points,COMP-HIP-KNEE Measure Score,COMP-HIP-KNEE Performance Rate,MORT-30-AMI Achievement Points,MORT-30-AMI Achievement Threshold,MORT-30-AMI Baseline Rate,MORT-30-AMI Benchmark,MORT-30-AMI Improvement Points,MORT-30-AMI Measure Score,MORT-30-AMI Performance Rate,MORT-30-HF Achievement Points,MORT-30-HF Achievement Threshold,MORT-30-HF Baseline Rate,MORT-30-HF Benchmark,MORT-30-HF Improvement Points,MORT-30-HF Measure Score,MORT-30-HF Performance Rate,MORT-30-PN Achievement Points,MORT-30-PN Achievement Threshold,MORT-30-PN Baseline Rate,MORT-30-PN Benchmark,MORT-30-PN Improvement Points,MORT-30-PN Measure Score,MORT-30-PN Performance Rate,PC-01 Achievement Points,PC-01 Achievement Threshold,PC-01 Baseline Rate,PC-01 Benchmark,PC-01 Improvement Points,PC-01 
Measure Score,PC-01 Performance Rate,PSI-90 Achievement Points,PSI-90 Achievement Threshold,PSI-90 Baseline Rate,PSI-90 Benchmark,PSI-90 Improvement Points,PSI-90 Measure Score,PSI-90 Performance Rate
0,1108 ROSS CLARK CIRCLE,DOTHAN,Not Available,HOUSTON,10001,SOUTHEAST HEALTH MEDICAL CENTER,2023.0,Not Available,0.589,0.742,0.0,Not Available,Not Available,0.476,Not Available,0.65,0.324,0.0,Not Available,Not Available,0.306,Not Available,0.717,0.650,0.0,Not Available,Not Available,1.228,Not Available,0.738,0.000,0.0,Not Available,Not Available,0.000,Not Available,0.726,0.548,0.0,Not Available,Not Available,0.912,Not Available,0.52,0.537,0.014,Not Available,Not Available,0.644,AL,36301,1,2023,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2505 U S HIGHWAY 431 NORTH,BOAZ,Not Available,MARSHALL,10005,MARSHALL MEDICAL CENTERS,2023.0,Not Available,0.589,0.000,0.0,Not Available,Not Available,3.311,Not Available,0.65,1.073,0.0,Not Available,Not Available,2.350,Not Available,0.717,0.508,0.0,Not Available,Not Available,0.400,Not Available,0.738,Not Available,0.0,Not Available,Not Available,Not Available,Not Available,0.726,1.938,0.0,Not Available,Not Available,0.000,Not Available,0.52,0.565,0.014,Not Available,Not Available,0.86,AL,35957,1,2023,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1701 VETERANS DRIVE,FLORENCE,Not Available,LAUDERDALE,10006,NORTH ALABAMA MEDICAL CENTER,2023.0,Not Available,0.589,0.000,0.0,Not Available,Not Available,0.507,Not Available,0.65,0.222,0.0,Not Available,Not Available,0.602,Not Available,0.717,0.820,0.0,Not Available,Not Available,0.871,Not Available,0.738,Not Available,0.0,Not Available,Not Available,Not Available,Not Available,0.726,0.654,0.0,Not Available,Not Available,1.640,Not Available,0.52,0.426,0.014,Not Available,Not Available,0.08,AL,35630,1,2023,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,702 N MAIN ST,OPP,Not Available,COVINGTON,10007,MIZELL MEMORIAL HOSPITAL,2023.0,Not Available,0.589,Not Available,0.0,Not Available,Not Available,Not Available,Not Available,0.65,Not Available,0.0,Not Available,Not Available,Not Available,Not Available,0.717,Not Available,0.0,Not Available,Not Available,Not Available,Not Available,0.738,Not Available,0.0,Not Available,Not Available,Not Available,Not Available,0.726,Not Available,0.0,Not Available,Not Available,Not Available,Not Available,0.52,0.0,0.014,Not Available,Not Available,0.41,AL,36467,1,2023,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,50 MEDICAL PARK EAST DRIVE,BIRMINGHAM,Not Available,JEFFERSON,10011,ST. VINCENT'S EAST,2023.0,Not Available,0.589,0.421,0.0,Not Available,Not Available,0.728,Not Available,0.65,0.598,0.0,Not Available,Not Available,0.927,Not Available,0.717,0.930,0.0,Not Available,Not Available,0.268,Not Available,0.738,Not Available,0.0,Not Available,Not Available,Not Available,Not Available,0.726,0.964,0.0,Not Available,Not Available,1.346,Not Available,0.52,0.343,0.014,Not Available,Not Available,0.607,AL,35235,1,2023,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [4]:
# Location and fiscal-year columns are dropped from the combined frame.
df.drop(columns=['Address', 'City', 'County Name',
                 'State', 'ZIP Code', 'Fiscal Year'],
        inplace=True)

# Classify each column by the first text cell that mentions a points scale:
# 'out of 10' is tested first, then 'out of 9'. Non-text cells are skipped.
ls_10 = []
ls_9 = []
for column in df.columns:
    first_hit = next(
        (value for value in df[column].tolist()
         if isinstance(value, str)
         and ('out of 10' in value or 'out of 9' in value)),
        None,
    )
    if first_hit is None:
        continue
    if 'out of 10' in first_hit:
        ls_10.append(column)
    else:
        ls_9.append(column)

# Map 'k out of N' to the integer k, then coerce whatever is left: cells that
# cannot be parsed as numbers become NaN.
for columns, top in ((ls_9, 9), (ls_10, 10)):
    labels = [f'{points} out of {top}' for points in range(top + 1)]
    numbers = list(range(top + 1))
    for column in columns:
        df[column] = df[column].replace(labels, numbers)
        df[column] = pd.to_numeric(df[column], errors='coerce')

# Columns holding at least one text cell with a '%' sign.
labs = [
    column for column in df.columns
    if any(isinstance(value, str) and '%' in value
           for value in df[column].tolist())
]

# In those columns, drop the final character of every cell containing '%',
# then coerce the column to numbers (unparseable cells become NaN).
for column in labs:
    df[column] = [
        value[:-1] if isinstance(value, str) and '%' in value else value
        for value in df[column].tolist()
    ]
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Move the key columns to the front. Each one is inserted at position 0, so
# the final order is Facility ID, Facility Name, file_month, file_year.
for name in ('file_year', 'file_month', 'Facility Name', 'Facility ID'):
    df.insert(0, name, df.pop(name))

# Report the shape before and after each de-duplication pass: first on fully
# identical rows, then on the facility/release key.
print(df.shape)
df.drop_duplicates(inplace=True)
print(df.shape)
df.drop_duplicates(subset=['Facility ID', 'Facility Name', 'file_month', 'file_year'],
                   inplace=True)
print(df.shape)
df.head()

(77952, 89)
(77952, 89)
(77952, 89)


Unnamed: 0,Facility ID,Facility Name,file_month,file_year,Combined SSI Measure Score,HAI-1 Achievement Points,HAI-1 Achievement Threshold,HAI-1 Baseline Rate,HAI-1 Benchmark,HAI-1 Improvement Points,HAI-1 Measure Score,HAI-1 Performance Rate,HAI-2 Achievement Points,HAI-2 Achievement Threshold,HAI-2 Baseline Rate,HAI-2 Benchmark,HAI-2 Improvement Points,HAI-2 Measure Score,HAI-2 Performance Rate,HAI-3 Achievement Points,HAI-3 Achievement Threshold,HAI-3 Baseline Rate,HAI-3 Benchmark,HAI-3 Improvement Points,HAI-3 Measure Score,HAI-3 Performance Rate,HAI-4 Achievement Points,HAI-4 Achievement Threshold,HAI-4 Baseline Rate,HAI-4 Benchmark,HAI-4 Improvement Points,HAI-4 Measure Score,HAI-4 Performance Rate,HAI-5 Achievement Points,HAI-5 Achievement Threshold,HAI-5 Baseline Rate,HAI-5 Benchmark,HAI-5 Improvement Points,HAI-5 Measure Score,HAI-5 Performance Rate,HAI-6 Achievement Points,HAI-6 Achievement Threshold,HAI-6 Baseline Rate,HAI-6 Benchmark,HAI-6 Improvement Points,HAI-6 Measure Score,HAI-6 Performance Rate,COMP-HIP-KNEE Achievement Points,COMP-HIP-KNEE Achievement Threshold,COMP-HIP-KNEE Baseline Rate,COMP-HIP-KNEE Benchmark,COMP-HIP-KNEE Improvement Points,COMP-HIP-KNEE Measure Score,COMP-HIP-KNEE Performance Rate,MORT-30-AMI Achievement Points,MORT-30-AMI Achievement Threshold,MORT-30-AMI Baseline Rate,MORT-30-AMI Benchmark,MORT-30-AMI Improvement Points,MORT-30-AMI Measure Score,MORT-30-AMI Performance Rate,MORT-30-HF Achievement Points,MORT-30-HF Achievement Threshold,MORT-30-HF Baseline Rate,MORT-30-HF Benchmark,MORT-30-HF Improvement Points,MORT-30-HF Measure Score,MORT-30-HF Performance Rate,MORT-30-PN Achievement Points,MORT-30-PN Achievement Threshold,MORT-30-PN Baseline Rate,MORT-30-PN Benchmark,MORT-30-PN Improvement Points,MORT-30-PN Measure Score,MORT-30-PN Performance Rate,PC-01 Achievement Points,PC-01 Achievement Threshold,PC-01 Baseline Rate,PC-01 Benchmark,PC-01 Improvement Points,PC-01 Measure Score,PC-01 Performance Rate,PSI-90 Achievement 
Points,PSI-90 Achievement Threshold,PSI-90 Baseline Rate,PSI-90 Benchmark,PSI-90 Improvement Points,PSI-90 Measure Score,PSI-90 Performance Rate
0,10001,SOUTHEAST HEALTH MEDICAL CENTER,1,2023,,,0.589,0.742,0.0,,,0.476,,0.65,0.324,0.0,,,0.306,,0.717,0.650,0.0,,,1.228,,0.738,0.000,0.0,,,0.000,,0.726,0.548,0.0,,,0.912,,0.52,0.537,0.014,,,0.644,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,10005,MARSHALL MEDICAL CENTERS,1,2023,,,0.589,0.000,0.0,,,3.311,,0.65,1.073,0.0,,,2.350,,0.717,0.508,0.0,,,0.400,,0.738,Not Available,0.0,,,Not Available,,0.726,1.938,0.0,,,0.000,,0.52,0.565,0.014,,,0.86,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,10006,NORTH ALABAMA MEDICAL CENTER,1,2023,,,0.589,0.000,0.0,,,0.507,,0.65,0.222,0.0,,,0.602,,0.717,0.820,0.0,,,0.871,,0.738,Not Available,0.0,,,Not Available,,0.726,0.654,0.0,,,1.640,,0.52,0.426,0.014,,,0.08,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,10007,MIZELL MEMORIAL HOSPITAL,1,2023,,,0.589,Not Available,0.0,,,Not Available,,0.65,Not Available,0.0,,,Not Available,,0.717,Not Available,0.0,,,Not Available,,0.738,Not Available,0.0,,,Not Available,,0.726,Not Available,0.0,,,Not Available,,0.52,0.0,0.014,,,0.41,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,10011,ST. VINCENT'S EAST,1,2023,,,0.589,0.421,0.0,,,0.728,,0.65,0.598,0.0,,,0.927,,0.717,0.930,0.0,,,0.268,,0.738,Not Available,0.0,,,Not Available,,0.726,0.964,0.0,,,1.346,,0.52,0.343,0.014,,,0.607,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Save dataframe

In [5]:
# Write the combined frame as a gzip-compressed pickle (pickle protocol 5).
df.to_pickle(
    '~/GitHub/hospitals-data-archive/dataframes/partial_dataframes/hvbp_safety_df.pkl.gz',
    compression='gzip',
    protocol=5,
)
