# Generate HVBP efficiency and cost reduction dataframe

In [1]:
import pandas as pd
import numpy as np
import warnings
from IPython.utils import io
import sys
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

main_dir = '/Users/kenlocey/Desktop/Rush/CMS_HospitalArchives/'

## Define Custom Functions

In [2]:
def check_lists(lists):
    for i, ls in enumerate(lists):
        for i2, ls2 in enumerate(lists):
            for i3 in ls:
                if i3 not in ls2:
                    print('\n')
                    print(i3 + ': NOT FOUND IN')
                    print(ls2)
                    sys.exit()
                        
                        
def curate(df):

    try:
        df = df[df['Facility ID'] != np.nan]
        df['Facility ID'] = df['Facility ID'].values.astype(str)
        
        ids = df['Facility ID'].tolist()
        ids2 = []
        for i in ids:
            if len(i) < 6:
                i = '0' + i
            ids2.append(i)
        df['Facility ID'] = ids2
        
    except:
        pass
    try:
        df = df[df['Facility Name'] != np.nan]
    except:
        pass
    
    for c in list(df):    
        try:
            df[c] = df[c].str.replace("\t","")
        except:
            pass

    return df


def rename_and_fill(df):
    
    cols1 = ['Provider ID', "Measure Start Date", "Measure End Date",
             'Hospital Name', 'Provider Number', ' ZIP Code', 'Hospital_Name', 
             'ZIP_Code', 'County_Name', 'Provider_Number',
             ]
    
    
    cols2 = ['Facility ID', "Start Date", "End Date",
             'Facility Name', 'Facility ID', 'ZIP Code', 'Facility Name',
             'ZIP Code', 'County Name', 'Facility ID',
             ]
    
    for i, col in enumerate(cols1):
        if col in list(df):
            df.rename(columns={col: cols2[i]}, inplace=True)
            
            
    cols = ['MSPB-1 Achievement Threshold', 'MSPB-1 Baseline Rate', 'MSPB-1 Benchmark', 'Fiscal Year',
            'Facility Name',
            ]
    
    for col in cols:
        if col not in list(df):
            df[col] = float('NaN')
    
    l = list(df)
    l = list(set([x for x in l if l.count(x) > 1]))
    if len(l) > 0:
        print('duplicates:', l)
        sys.exit()
        
    return df


def process2(df, lists, yr, mo):
    df = rename_and_fill(df)
    df = curate(df)
    lists.append(list(df))
    df['file_month'] = [mo]* df.shape[0]
    df['file_year'] = [yr]* df.shape[0]
    df = df.reindex(sorted(df.columns), axis=1)
    return df, lists


###############################################################################
################################  FILES  ######################################
###############################################################################


df_list = []
lists = []

yrs = ['2023', '2023',
       '2022', '2022', '2022', 
       '2021','2021','2021', '2021', '2021',
       '2020', '2020', '2020', '2020', 
       '2019', '2019', '2019', '2019', 
       '2018', '2018', '2018', '2018',
       '2017', '2017', '2017',
       '2016', '2016', '2016', '2016',
       '2015', '2015', '2015', '2015', '2015', '2015',
       '2014',
       ]

mos = ['01', '04',
       '01', '04', '07',
       '01', '03', '04', '07', '10', 
       '10', '07', '04', '01', 
       '10', '07', '04', '03', 
       '10', '07', '05', '01',
       '10', '07', '04',
       '12', '11', '08', '05',
       '12', '10', '07', '05', '04', '01',
       '12',
       ]


subdirs = ['2023/hospitals_01_2023/hvbp_efficiency_and_cost_reduction.csv', 
           '2023/hospitals_04_2023/hvbp_efficiency_and_cost_reduction.csv',
           
           '2022/hospitals_01_2022/hvbp_efficiency_and_cost_reduction.csv', 
           '2022/hospitals_04_2022/hvbp_efficiency_and_cost_reduction.csv',
           '2022/hospitals_07_2022/hvbp_efficiency_and_cost_reduction.csv',
           
           '2021/hospitals_01_2021/hvbp_efficiency_and_cost_reduction.csv',
           '2021/hospitals_03_2021/hvbp_efficiency_and_cost_reduction.csv',
           '2021/hospitals_04_2021/hvbp_efficiency_and_cost_reduction.csv',
           '2021/hospitals_07_2021/hvbp_efficiency_and_cost_reduction.csv',
           '2021/hospitals_10_2021/hvbp_efficiency_and_cost_reduction.csv',
           
           '2020/hospitals_archive_10_2020/hvbp_efficiency_12_09_2019.csv',
           '2020/hospitals_archive_07_2020/hvbp_efficiency.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200422/hvbp_efficiency_12_09_2019.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200129/hvbp_efficiency_12_09_2019.csv',
           
           '2019/HOSArchive_Revised_Flatfiles_20191030/hvbp_efficiency_11_09_2018.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190702/hvbp_efficiency_11_09_2018.csv',
           '2019/HOSArchive_Revised_FlatFiles_20190424/hvbp_efficiency_11_09_2018.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190321/hvbp_efficiency_11_09_2018.csv',
           
           '2018/HOSArchive_Revised_FlatFiles_20181031/hvbp_efficiency_11_07_2017.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180725/hvbp_efficiency_11_07_2017.csv', 
           '2018/HOSArchive_Revised_FlatFiles_20180523/hvbp_efficiency_11_07_2017.csv', 
           '2018/HOSArchive_Revised_FlatFiles_20180126/hvbp_efficiency_11_07_2017.csv',
           
           '2017/HOSArchive_Revised_FlatFiles_20171024/hvbp_efficiency_11_10_2016.csv',
           '2017/HOSArchive_Revised_FlatFiles_20170726/hvbp_efficiency_11_10_2016.csv',
           '2017/HOSArchive_Revised_Flatfiles_20170428/hvbp_efficiency_11_10_2016.csv',
           
           '2016/HOSArchive_Revised_Flatfiles_20161219/hvbp_efficiency_11_10_2016.csv',
           '2016/Hospital_Revised_FlatFiles_20161110/hvbp_efficiency_08_26_2016.csv',
           '2016/HOSArchive_Revised_FlatFiles_20160810/hvbp_efficiency_06_08_2016.csv',
           '2016/HOSArchive_Revised_FlatFiles_20160504/hvbp_efficiency_02_18_2016.csv',
           
           '2015/HOSArchive_Revised_FlatFiles_20151210/hvbp_Efficiency_10_28_2015.csv',
           '2015/HOSArchive_Revised_FlatFiles_20151008/hvbp_Efficiency_08_06_2015.csv',
           '2015/HOSArchive_Revised_FlatFiles_20150716/hvbp_Efficiency_05_20_2015.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150506/hvbp_Efficiency_02_18_2015.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150416/hvbp_Efficiency_02_18_2015.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150122/hvbp_efficiency_10_28_2014.csv',
           
           '2014/HOSArchive_Revised_Flatfiles_20141218/hvbp_efficiency_10_28_2014.csv',
           ]

for i, subdir in enumerate(subdirs):
    with io.capture_output() as captured: df = pd.read_csv(main_dir + subdir, encoding = "ISO-8859-1")
    print(subdir + ' :  (rows, columns) =', df.shape)
    df, lists = process2(df, lists, yrs[i], mos[i])
    df_list.append(df)

check_lists(lists)
subdir = 'HVBP/CombinedFiles_HVBP'
hvb_df = pd.concat(df_list)
print('hvb_df.shape:', hvb_df.shape)


2023/hospitals_01_2023/hvbp_efficiency_and_cost_reduction.csv :  (rows, columns) = (2517, 15)
2023/hospitals_04_2023/hvbp_efficiency_and_cost_reduction.csv :  (rows, columns) = (2517, 15)
2022/hospitals_01_2022/hvbp_efficiency_and_cost_reduction.csv :  (rows, columns) = (2676, 15)
2022/hospitals_04_2022/hvbp_efficiency_and_cost_reduction.csv :  (rows, columns) = (2236, 15)
2022/hospitals_07_2022/hvbp_efficiency_and_cost_reduction.csv :  (rows, columns) = (2236, 15)
2021/hospitals_01_2021/hvbp_efficiency_and_cost_reduction.csv :  (rows, columns) = (2676, 15)
2021/hospitals_03_2021/hvbp_efficiency_and_cost_reduction.csv :  (rows, columns) = (2676, 15)
2021/hospitals_04_2021/hvbp_efficiency_and_cost_reduction.csv :  (rows, columns) = (2676, 15)
2021/hospitals_07_2021/hvbp_efficiency_and_cost_reduction.csv :  (rows, columns) = (2676, 15)
2021/hospitals_10_2021/hvbp_efficiency_and_cost_reduction.csv :  (rows, columns) = (2676, 15)
2020/hospitals_archive_10_2020/hvbp_efficiency_12_09_2019.cs

In [3]:
hvb_df = pd.read_pickle(main_dir + 'HVBP/CombinedFiles_HVBP/' + 'hvbp_efficiency.pkl')
print('HAI df:', hvb_df.shape)

hvb_df = hvb_df.filter(items=['Facility ID', 'file_month', 'file_year',
                              'Fiscal Year', 'MSPB-1 Achievement Points',
                              'MSPB-1 Achievement Threshold',
                              'MSPB-1 Baseline Rate',
                              'MSPB-1 Benchmark', 
                              'MSPB-1 Improvement Points',
                              'MSPB-1 Measure Score',
                              'MSPB-1 Performance Rate',
                             ], axis=1)

hvb_df['Fiscal Year'] = hvb_df['Fiscal Year'].astype(str)

ls = []
for i in hvb_df['Fiscal Year']:
    try:
        ls.append(i[:-2])
    except:
        ls.append(np.nan)
        
hvb_df['Fiscal Year'] = ls

print(hvb_df.shape)
hvb_df.head()

HAI df: (101299, 17)
(101299, 11)


Unnamed: 0,Facility ID,file_month,file_year,Fiscal Year,MSPB-1 Achievement Points,MSPB-1 Achievement Threshold,MSPB-1 Baseline Rate,MSPB-1 Benchmark,MSPB-1 Improvement Points,MSPB-1 Measure Score,MSPB-1 Performance Rate
0,10001,1,2023,2023,0 out of 10,0.98791,1.020251,0.844806,0 out of 9,0 out of 10,1.016841
1,10005,1,2023,2023,2 out of 10,0.98791,1.011867,0.844806,3 out of 9,3 out of 10,0.95781
2,10006,1,2023,2023,0 out of 10,0.98791,1.016924,0.844806,1 out of 9,1 out of 10,0.994638
3,10007,1,2023,2023,0 out of 10,0.98791,1.110418,0.844806,3 out of 9,3 out of 10,1.016687
4,10011,1,2023,2023,0 out of 10,0.98791,1.029416,0.844806,1 out of 9,1 out of 10,0.993673


In [4]:
for i, n in enumerate(list(hvb_df)):
    if n in ['Facility ID', 'file_month', 'file_year']:
        continue
            
    hvb_df.rename(columns={n: 'HVBP Efficiency: ' + n}, inplace=True)
    
print(hvb_df.shape)
hvb_df.head()


(101299, 11)


Unnamed: 0,Facility ID,file_month,file_year,HVBP Efficiency: Fiscal Year,HVBP Efficiency: MSPB-1 Achievement Points,HVBP Efficiency: MSPB-1 Achievement Threshold,HVBP Efficiency: MSPB-1 Baseline Rate,HVBP Efficiency: MSPB-1 Benchmark,HVBP Efficiency: MSPB-1 Improvement Points,HVBP Efficiency: MSPB-1 Measure Score,HVBP Efficiency: MSPB-1 Performance Rate
0,10001,1,2023,2023,0 out of 10,0.98791,1.020251,0.844806,0 out of 9,0 out of 10,1.016841
1,10005,1,2023,2023,2 out of 10,0.98791,1.011867,0.844806,3 out of 9,3 out of 10,0.95781
2,10006,1,2023,2023,0 out of 10,0.98791,1.016924,0.844806,1 out of 9,1 out of 10,0.994638
3,10007,1,2023,2023,0 out of 10,0.98791,1.110418,0.844806,3 out of 9,3 out of 10,1.016687
4,10011,1,2023,2023,0 out of 10,0.98791,1.029416,0.844806,1 out of 9,1 out of 10,0.993673


In [5]:
print(sorted(hvb_df['HVBP Efficiency: MSPB-1 Achievement Points'].unique()))
hvb_df['HVBP Efficiency: MSPB-1 Achievement Points'] = hvb_df['HVBP Efficiency: MSPB-1 Achievement Points'].replace([
                                                    '0 out of 10', '1 out of 10', '2 out of 10', '3 out of 10',
                                                    '4 out of 10', '5 out of 10', '6 out of 10', '7 out of 10', 
                                                    '8 out of 10', '9 out of 10', '10 out of 10'],
                                                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
hvb_df['HVBP Efficiency: MSPB-1 Achievement Points'] = pd.to_numeric(hvb_df['HVBP Efficiency: MSPB-1 Achievement Points'], errors='coerce')
print(hvb_df['HVBP Efficiency: MSPB-1 Achievement Points'].unique())


['0 out of 10', '1 out of 10', '10 out of 10', '2 out of 10', '3 out of 10', '4 out of 10', '5 out of 10', '6 out of 10', '7 out of 10', '8 out of 10', '9 out of 10', 'Not Available']
[ 0.  2.  1.  8.  3.  4.  7.  9. 10.  5.  6. nan]


In [6]:
print(sorted(hvb_df['HVBP Efficiency: MSPB-1 Improvement Points'].unique()))
hvb_df['HVBP Efficiency: MSPB-1 Improvement Points'] = hvb_df['HVBP Efficiency: MSPB-1 Achievement Points'].replace([
                                                    '0 out of 9', '1 out of 9', '2 out of 9', '3 out of 9',
                                                    '4 out of 9', '5 out of 9', '6 out of 9', '7 out of 9', 
                                                    '8 out of 9', '9 out of 9', ],
                                                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
hvb_df['HVBP Efficiency: MSPB-1 Improvement Points'] = pd.to_numeric(hvb_df['HVBP Efficiency: MSPB-1 Improvement Points'], errors='coerce')
print(sorted(hvb_df['HVBP Efficiency: MSPB-1 Improvement Points'].unique()))


['0 out of 9', '1 out of 9', '2 out of 9', '3 out of 9', '4 out of 9', '5 out of 9', '6 out of 9', '7 out of 9', '8 out of 9', '9 out of 9', 'Not Available']
[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, nan]


In [7]:
print(sorted(hvb_df['HVBP Efficiency: MSPB-1 Measure Score'].unique()))
hvb_df['HVBP Efficiency: MSPB-1 Measure Score'] = hvb_df['HVBP Efficiency: MSPB-1 Measure Score'].replace([
                                                    '0 out of 10', '1 out of 10', '2 out of 10', '3 out of 10',
                                                    '4 out of 10', '5 out of 10', '6 out of 10', '7 out of 10', 
                                                    '8 out of 10', '9 out of 10', '10 out of 10'],
                                                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
hvb_df['HVBP Efficiency: MSPB-1 Measure Score'] = pd.to_numeric(hvb_df['HVBP Efficiency: MSPB-1 Measure Score'], errors='coerce')
print(sorted(hvb_df['HVBP Efficiency: MSPB-1 Measure Score'].unique()))


['0 out of 10', '1 out of 10', '10 out of 10', '2 out of 10', '3 out of 10', '4 out of 10', '5 out of 10', '6 out of 10', '7 out of 10', '8 out of 10', '9 out of 10', 'Not Available']
[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, nan]


In [8]:
hvb_df.head()

Unnamed: 0,Facility ID,file_month,file_year,HVBP Efficiency: Fiscal Year,HVBP Efficiency: MSPB-1 Achievement Points,HVBP Efficiency: MSPB-1 Achievement Threshold,HVBP Efficiency: MSPB-1 Baseline Rate,HVBP Efficiency: MSPB-1 Benchmark,HVBP Efficiency: MSPB-1 Improvement Points,HVBP Efficiency: MSPB-1 Measure Score,HVBP Efficiency: MSPB-1 Performance Rate
0,10001,1,2023,2023,0.0,0.98791,1.020251,0.844806,0.0,0.0,1.016841
1,10005,1,2023,2023,2.0,0.98791,1.011867,0.844806,2.0,3.0,0.95781
2,10006,1,2023,2023,0.0,0.98791,1.016924,0.844806,0.0,1.0,0.994638
3,10007,1,2023,2023,0.0,0.98791,1.110418,0.844806,0.0,3.0,1.016687
4,10011,1,2023,2023,0.0,0.98791,1.029416,0.844806,0.0,1.0,0.993673


In [9]:
hvb_df.to_pickle('dataframes/hvbp_efficiency_df.pkl.gz', protocol=5, compression='gzip')