# Generate HVBP total performance scores dataframe

In [1]:
import pandas as pd
import numpy as np
import warnings
from IPython.utils import io
import sys
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

main_dir = '/Users/kenlocey/Desktop/Rush/CMS_HospitalArchives/'

## Define Custom Functions

In [2]:
def check_lists(lists):
    """Verify that every list contains every element of every other list.

    Each list is compared against all the others. On the first element that
    is missing from another list, the element and the list lacking it are
    printed and the interpreter is stopped via ``sys.exit()``.

    Parameters
    ----------
    lists : list of list
        Collections of labels to cross-check (in this file: the column
        names of each loaded dataframe).

    Returns
    -------
    None

    Raises
    ------
    SystemExit
        When an element of one list is absent from another list.
    """
    for ls in lists:
        for ls2 in lists:
            # A list trivially contains all of its own elements.
            if ls2 is ls:
                continue
            for item in ls:
                if item not in ls2:
                    print('\n')
                    # str() keeps the report working for non-string labels
                    # (e.g. integer column names), which would otherwise
                    # raise TypeError on concatenation.
                    print(str(item) + ': NOT FOUND IN')
                    print(ls2)
                    sys.exit()
                        
                        
def curate(df):
    """Return a cleaned copy of ``df``.

    Steps, each applied only when the relevant column exists:

    1. Drop rows whose 'Facility ID' or 'Facility Name' is missing.
    2. Convert 'Facility ID' to strings, left-padded with zeros to six
       characters.
    3. Remove tab characters from every column that supports the pandas
       ``.str`` accessor.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    # Missing values must be tested with notna(): `series != np.nan` is True
    # for every element (NaN never compares equal to anything, itself
    # included), so such a comparison filters nothing.
    for key in ('Facility ID', 'Facility Name'):
        if key in df.columns:
            df = df[df[key].notna()]

    # Everything below assigns columns; copy so the caller's frame (or a
    # filtered view of it) is never written to.
    df = df.copy()

    if 'Facility ID' in df.columns:
        # zfill pads to the full six characters regardless of how many
        # leading zeros were lost; prepending a single '0' only repairs
        # five-character IDs.
        # NOTE(review): if the column was parsed as float, the strings look
        # like '10001.0' and are left as-is here -- confirm the input dtype.
        df['Facility ID'] = df['Facility ID'].astype(str).str.zfill(6)

    for col in list(df):
        try:
            df[col] = df[col].str.replace("\t", "")
        except AttributeError:
            # Column does not hold strings (no .str accessor); leave it.
            continue

    return df


def rename_and_fill(df):
    """Harmonise column names and add any missing score columns.

    Legacy column names are renamed in place to their current equivalents,
    then every expected score column that is absent is created and filled
    with NaN. If the resulting frame has repeated column names they are
    printed and the interpreter is stopped via ``sys.exit()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # Old label -> current label. Labels absent from df are simply ignored
    # by DataFrame.rename.
    legacy_names = {
        'Provider ID': 'Facility ID',
        "Measure Start Date": "Start Date",
        "Measure End Date": "End Date",
        'Hospital Name': 'Facility Name',
        'Provider Number': 'Facility ID',
        ' ZIP Code': 'ZIP Code',
        'Zip Code': 'ZIP Code',
    }
    df.rename(columns=legacy_names, inplace=True)

    expected = [
        'Unweighted Person and Community Engagement Domain Score',
        'Weighted Person and Community Engagement Domain Score',
        'Unweighted Normalized Efficiency and Cost Reduction Domain Score',
        'Weighted Efficiency and Cost Reduction Domain Score',
        'Fiscal Year',
        'Unweighted Normalized Clinical Outcomes Domain Score',
        'Weighted Normalized Clinical Outcomes Domain Score',
        'Unweighted Normalized Safety Domain Score',
        'Weighted Safety Domain Score',
        'Total Performance Score',
        'Unweighted Person And Community Engagement Domain Score',
        'Weighted Person And Community Engagement Domain Score',
        'Unweighted Normalized Efficiency And Cost Reduction Domain Score',
        'Weighted Efficiency And Cost Reduction Domain Score',
        'Unweighted Normalized Clinical Care Domain Score',
        'Weighted Normalized Clinical Care Domain Score',
        'Unweighted Patient and Caregiver Centered Experience of Care/Care Coordination Domain Score',
        'Weighted Patient and Caregiver Centered Experience of Care/Care Coordination Domain Score',
        'Unweighted Normalized Clinical Care - Process Domain Score',
        'Weighted Clinical Care - Process Domain Score',
        'Unweighted Normalized Clinical Care - Outcomes Domain Score',
        'Weighted Normalized Clinical Care - Outcomes Domain Score',
        'Unweighted Normalized Clinical Process of Care Domain Score',
        'Weighted Clinical Process of Care Domain Score',
        'Unweighted Patient Experience of Care Domain Score',
        'Weighted Patient Experience of Care Domain Score',
        'Unweighted Normalized Outcome Domain Score',
        'Weighted Outcome Domain Score',
        'Unweighted Normalized Efficiency Domain Score',
        'Weighted Efficiency Domain Score',
    ]

    # Append each absent column, in the order listed above, filled with NaN.
    for name in expected:
        if name in df.columns:
            continue
        df[name] = float('NaN')

    # Two legacy labels can map to the same new label; stop if that produced
    # repeated column names.
    names = list(df)
    repeated = list({name for name in names if names.count(name) > 1})
    if repeated:
        print('duplicates:', repeated)
        sys.exit()

    return df


def process2(df, lists, yr, mo):
    """Standardise one loaded file and tag it with its archive date.

    The frame is passed through ``rename_and_fill`` and ``curate``; its
    column names at that point are appended to ``lists``. Constant
    'file_month' and 'file_year' columns are then added and the columns are
    put in sorted order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame read from one archive file.
    lists : list
        Accumulator of column-name lists; appended to in place.
    yr, mo
        Values stored in every row of 'file_year' and 'file_month'.

    Returns
    -------
    tuple
        ``(df, lists)``: the processed frame and the same ``lists`` object.
    """
    df = curate(rename_and_fill(df))

    # Recorded before the file_* columns are added.
    lists.append(list(df))

    n_rows = len(df)
    df['file_month'] = [mo] * n_rows
    df['file_year'] = [yr] * n_rows

    ordered = df.reindex(columns=sorted(df.columns))
    return ordered, lists


## Load Files

In [3]:
# Each archive file is described exactly once as
# (file_year, file_month, path relative to main_dir). Keeping the three
# values together prevents the labels from drifting out of step with the
# paths, which is what had happened for the first two 2019 entries: archive
# 20190702 was labelled month '10' and archive 20191030 was labelled '07'.
archives = [
    ('2023', '01', '2023/hospitals_01_2023/hvbp_tps.csv'),
    ('2023', '04', '2023/hospitals_04_2023/hvbp_tps.csv'),

    ('2022', '01', '2022/hospitals_01_2022/hvbp_tps.csv'),
    ('2022', '04', '2022/hospitals_04_2022/hvbp_tps.csv'),
    ('2022', '07', '2022/hospitals_07_2022/hvbp_tps.csv'),

    ('2021', '01', '2021/hospitals_01_2021/hvbp_tps.csv'),
    ('2021', '03', '2021/hospitals_03_2021/hvbp_tps.csv'),
    ('2021', '04', '2021/hospitals_04_2021/hvbp_tps.csv'),
    ('2021', '07', '2021/hospitals_07_2021/hvbp_tps.csv'),
    ('2021', '10', '2021/hospitals_10_2021/hvbp_tps.csv'),

    ('2020', '10', '2020/hospitals_archive_10_2020/hvbp_tps_12_09_2019.csv'),
    ('2020', '07', '2020/hospitals_archive_07_2020/hvbp_tps.csv'),
    ('2020', '04', '2020/HOSArchive_Revised_Flatfiles_20200422/hvbp_tps_12_09_2019.csv'),
    ('2020', '01', '2020/HOSArchive_Revised_Flatfiles_20200129/hvbp_tps_12_09_2019.csv'),

    ('2019', '07', '2019/HOSArchive_Revised_Flatfiles_20190702/hvbp_tps_11_09_2018.csv'),
    ('2019', '10', '2019/HOSArchive_Revised_Flatfiles_20191030/hvbp_tps_11_09_2018.csv'),
    ('2019', '04', '2019/HOSArchive_Revised_FlatFiles_20190424/hvbp_tps_11_09_2018.csv'),
    ('2019', '03', '2019/HOSArchive_Revised_Flatfiles_20190321/hvbp_tps_11_09_2018.csv'),

    ('2018', '10', '2018/HOSArchive_Revised_FlatFiles_20181031/hvbp_tps_11_07_2017.csv'),
    ('2018', '07', '2018/HOSArchive_Revised_FlatFiles_20180725/hvbp_tps_11_07_2017.csv'),
    ('2018', '05', '2018/HOSArchive_Revised_FlatFiles_20180523/hvbp_tps_11_07_2017.csv'),
    ('2018', '01', '2018/HOSArchive_Revised_FlatFiles_20180126/hvbp_tps_11_07_2017.csv'),

    ('2017', '10', '2017/HOSArchive_Revised_FlatFiles_20171024/hvbp_tps_11_10_2016.csv'),
    ('2017', '07', '2017/HOSArchive_Revised_FlatFiles_20170726/hvbp_tps_11_10_2016.csv'),
    ('2017', '04', '2017/HOSArchive_Revised_Flatfiles_20170428/hvbp_tps_11_10_2016.csv'),

    ('2016', '12', '2016/HOSArchive_Revised_Flatfiles_20161219/hvbp_tps_11_10_2016.csv'),
    ('2016', '11', '2016/Hospital_Revised_FlatFiles_20161110/hvbp_tps_08_26_2016.csv'),
    ('2016', '08', '2016/HOSArchive_Revised_FlatFiles_20160810/hvbp_tps_06_08_2016.csv'),
    ('2016', '05', '2016/HOSArchive_Revised_FlatFiles_20160504/hvbp_tps_02_18_2016.csv'),

    ('2015', '12', '2015/HOSArchive_Revised_FlatFiles_20151210/hvbp_tps_10_28_2015.csv'),
    ('2015', '10', '2015/HOSArchive_Revised_FlatFiles_20151008/hvbp_tps_08_06_2015.csv'),
    ('2015', '07', '2015/HOSArchive_Revised_FlatFiles_20150716/hvbp_tps_05_28_2015.csv'),
    ('2015', '05', '2015/HOSArchive_Revised_Flatfiles_20150506/hvbp_tps_02_18_2015.csv'),
    ('2015', '04', '2015/HOSArchive_Revised_Flatfiles_20150416/hvbp_tps_02_18_2015.csv'),
    ('2015', '01', '2015/HOSArchive_Revised_Flatfiles_20150122/hvbp_tps_10_28_2014.csv'),

    ('2014', '12', '2014/HOSArchive_Revised_Flatfiles_20141218/hvbp_tps_10_28_2014.csv'),
    ('2014', '10', '2014/HOSArchive_Revised_Flatfiles_20141023/hvbp_tps_02_25_2014.csv'),
    ('2014', '07', '2014/HOSArchive_Revised_Flatfiles_20140717/hvbp_tps_02_25_2014.csv'),
    ('2014', '04', '2014/HOSArchive_Revised_Flatfiles_20140417/hvbp_tps_02_25_2014.csv'),
    ('2014', '01', '2014/HOSArchive_Revised_Flatfiles_20140101/hvbp_tps_10_28_2013.csv'),

    ('2013', '10', '2013/HOSArchive_Revised_Flatfiles_20131001/hvbp_tps_08_16_2013.csv'),
    ('2013', '07', '2013/HOSArchive_Revised_Flatfiles_20130701/hvbp_tps_05_28_2013.csv'),
    ('2013', '04', '2013/HOSArchive_Revised_Flatfiles_20130401/hvbp_tps_02_07_2013.csv'),
]

# The parallel lists are still exposed under their original names.
yrs = [yr for yr, _, _ in archives]
mos = [mo for _, mo, _ in archives]
subdirs = [path for _, _, path in archives]

df_list = []   # one processed dataframe per archive file
lists = []     # column names of each processed dataframe

for yr, mo, subdir in archives:
    # Anything read_csv prints is captured and discarded.
    with io.capture_output() as captured:
        df = pd.read_csv(main_dir + subdir, encoding="ISO-8859-1")
    print(subdir + ' :  (rows, columns) =', df.shape)
    df, lists = process2(df, lists, yr, mo)
    df_list.append(df)

# Stop if the processed files do not all share the same set of columns.
check_lists(lists)
subdir = 'HVBP/CombinedFiles_HVBP'
# NOTE(review): the concatenated frame is not written to disk here, and the
# original row index of each file is kept (no ignore_index) -- confirm both
# are intended.
hvb_df = pd.concat(df_list)
print('hvb_df.shape:', hvb_df.shape)

2023/hospitals_01_2023/hvbp_tps.csv :  (rows, columns) = (2517, 17)
2023/hospitals_04_2023/hvbp_tps.csv :  (rows, columns) = (2517, 17)
2022/hospitals_01_2022/hvbp_tps.csv :  (rows, columns) = (2676, 17)
2022/hospitals_04_2022/hvbp_tps.csv :  (rows, columns) = (2236, 17)
2022/hospitals_07_2022/hvbp_tps.csv :  (rows, columns) = (2236, 17)
2021/hospitals_01_2021/hvbp_tps.csv :  (rows, columns) = (2676, 17)
2021/hospitals_03_2021/hvbp_tps.csv :  (rows, columns) = (2676, 17)
2021/hospitals_04_2021/hvbp_tps.csv :  (rows, columns) = (2676, 17)
2021/hospitals_07_2021/hvbp_tps.csv :  (rows, columns) = (2676, 17)
2021/hospitals_10_2021/hvbp_tps.csv :  (rows, columns) = (2676, 17)
2020/hospitals_archive_10_2020/hvbp_tps_12_09_2019.csv :  (rows, columns) = (2731, 16)
2020/hospitals_archive_07_2020/hvbp_tps.csv :  (rows, columns) = (2731, 16)
2020/HOSArchive_Revised_Flatfiles_20200422/hvbp_tps_12_09_2019.csv :  (rows, columns) = (2731, 16)
2020/HOSArchive_Revised_Flatfiles_20200129/hvbp_tps_12_09_

In [4]:
# Rebind hvb_df to the dataframe pickled under HVBP/CombinedFiles_HVBP.
hvb_df = pd.read_pickle(main_dir + 'HVBP/CombinedFiles_HVBP/' + 'hvbp_tps.pkl')
print('HVBP df:', hvb_df.shape)

# Remove the descriptive (non-score) columns in place; drop() raises
# KeyError if any of them is absent.
hvb_df.drop(
    columns=[
        'Address',
        'City',
        'County Name',
        'Facility Name',
        'State',
        'ZIP Code',
        'Fiscal Year',
    ],
    inplace=True,
)

print(hvb_df.shape)
hvb_df.head()

HVBP df: (121153, 39)
(121153, 32)


Unnamed: 0,Facility ID,Total Performance Score,Unweighted Normalized Clinical Care - Outcomes Domain Score,Unweighted Normalized Clinical Care - Process Domain Score,Unweighted Normalized Clinical Care Domain Score,Unweighted Normalized Clinical Outcomes Domain Score,Unweighted Normalized Clinical Process of Care Domain Score,Unweighted Normalized Efficiency And Cost Reduction Domain Score,Unweighted Normalized Efficiency Domain Score,Unweighted Normalized Efficiency and Cost Reduction Domain Score,Unweighted Normalized Outcome Domain Score,Unweighted Normalized Safety Domain Score,Unweighted Patient Experience of Care Domain Score,Unweighted Patient and Caregiver Centered Experience of Care/Care Coordination Domain Score,Unweighted Person And Community Engagement Domain Score,Unweighted Person and Community Engagement Domain Score,Weighted Clinical Care - Process Domain Score,Weighted Clinical Process of Care Domain Score,Weighted Efficiency And Cost Reduction Domain Score,Weighted Efficiency Domain Score,Weighted Efficiency and Cost Reduction Domain Score,Weighted Normalized Clinical Care - Outcomes Domain Score,Weighted Normalized Clinical Care Domain Score,Weighted Normalized Clinical Outcomes Domain Score,Weighted Outcome Domain Score,Weighted Patient Experience of Care Domain Score,Weighted Patient and Caregiver Centered Experience of Care/Care Coordination Domain Score,Weighted Person And Community Engagement Domain Score,Weighted Person and Community Engagement Domain Score,Weighted Safety Domain Score,file_month,file_year
0,10001,Not Available,,,,52.0,,0.0,,,,Not Available,,,Not Available,,,,0.0,,,,,13.0,,,,Not Available,,Not Available,1,2023
1,10005,Not Available,,,,40.0,,30.0,,,,Not Available,,,Not Available,,,,7.5,,,,,10.0,,,,Not Available,,Not Available,1,2023
2,10006,Not Available,,,,12.0,,10.0,,,,Not Available,,,Not Available,,,,2.5,,,,,3.0,,,,Not Available,,Not Available,1,2023
3,10007,Not Available,,,,10.0,,30.0,,,,Not Available,,,Not Available,,,,7.5,,,,,2.5,,,,Not Available,,Not Available,1,2023
4,10011,Not Available,,,,20.0,,10.0,,,,Not Available,,,Not Available,,,,2.5,,,,,5.0,,,,Not Available,,Not Available,1,2023


In [5]:
print(hvb_df.shape)
# Discard columns, then rows, that contain no data at all.
hvb_df.dropna(how='all', axis=1, inplace=True)
hvb_df.dropna(how='all', axis=0, inplace=True)
print(hvb_df.shape)

# Convert every column except the identifier/date columns to numbers.
# errors='coerce' turns unparseable entries (e.g. 'Not Available') into NaN.
last_numeric_col = None
for col in list(hvb_df):
    if col in ['Facility ID', 'file_month', 'file_year']:
        continue

    try:
        hvb_df[col] = pd.to_numeric(hvb_df[col], errors='coerce')
    except (TypeError, ValueError) as err:
        # Report the column instead of silently skipping it.
        print('could not convert', repr(col), 'to numeric:', err)
        continue
    last_numeric_col = col

# Sorting inside the loop re-sorted the whole frame once per column, yet only
# the final sort determines the resulting row order (the default sort is not
# stable, so earlier sorts do not act as tie-breakers). Sort once instead.
if last_numeric_col is not None:
    hvb_df.sort_values(by=[last_numeric_col], ascending=False, inplace=True)

hvb_df.head()

(121153, 32)
(121153, 32)


Unnamed: 0,Facility ID,Total Performance Score,Unweighted Normalized Clinical Care - Outcomes Domain Score,Unweighted Normalized Clinical Care - Process Domain Score,Unweighted Normalized Clinical Care Domain Score,Unweighted Normalized Clinical Outcomes Domain Score,Unweighted Normalized Clinical Process of Care Domain Score,Unweighted Normalized Efficiency And Cost Reduction Domain Score,Unweighted Normalized Efficiency Domain Score,Unweighted Normalized Efficiency and Cost Reduction Domain Score,Unweighted Normalized Outcome Domain Score,Unweighted Normalized Safety Domain Score,Unweighted Patient Experience of Care Domain Score,Unweighted Patient and Caregiver Centered Experience of Care/Care Coordination Domain Score,Unweighted Person And Community Engagement Domain Score,Unweighted Person and Community Engagement Domain Score,Weighted Clinical Care - Process Domain Score,Weighted Clinical Process of Care Domain Score,Weighted Efficiency And Cost Reduction Domain Score,Weighted Efficiency Domain Score,Weighted Efficiency and Cost Reduction Domain Score,Weighted Normalized Clinical Care - Outcomes Domain Score,Weighted Normalized Clinical Care Domain Score,Weighted Normalized Clinical Outcomes Domain Score,Weighted Outcome Domain Score,Weighted Patient Experience of Care Domain Score,Weighted Patient and Caregiver Centered Experience of Care/Care Coordination Domain Score,Weighted Person And Community Engagement Domain Score,Weighted Person and Community Engagement Domain Score,Weighted Safety Domain Score,file_month,file_year
36,250151,38.666667,,0.0,,,,,,0.0,,96.666667,,,,,0.0,,,,0.0,,,,,,,,,38.666667,7,2017
36,250151,38.666667,,0.0,,,,,,0.0,,96.666667,,,,,0.0,,,,0.0,,,,,,,,,38.666667,10,2017
36,250151,38.666667,,0.0,,,,,,0.0,,96.666667,,,,,0.0,,,,0.0,,,,,,,,,38.666667,12,2016
36,250151,38.666667,,0.0,,,,,,0.0,,96.666667,,,,,0.0,,,,0.0,,,,,,,,,38.666667,4,2017
388,220062,82.333333,,0.0,,,,,,90.0,,93.333333,,,,,0.0,,,,45.0,,,,,,,,,37.333333,4,2017


In [6]:
# Prefix every column except the identifier/date columns with
# 'HVBP Total Performance: ', using one rename over a prebuilt mapping.
hvb_df.rename(
    columns={
        n: 'HVBP Total Performance: ' + n
        for n in list(hvb_df)
        if n not in ['Facility ID', 'file_month', 'file_year']
    },
    inplace=True,
)

print(hvb_df.shape)
hvb_df.head()


(121153, 32)


Unnamed: 0,Facility ID,HVBP Total Performance: Total Performance Score,HVBP Total Performance: Unweighted Normalized Clinical Care - Outcomes Domain Score,HVBP Total Performance: Unweighted Normalized Clinical Care - Process Domain Score,HVBP Total Performance: Unweighted Normalized Clinical Care Domain Score,HVBP Total Performance: Unweighted Normalized Clinical Outcomes Domain Score,HVBP Total Performance: Unweighted Normalized Clinical Process of Care Domain Score,HVBP Total Performance: Unweighted Normalized Efficiency And Cost Reduction Domain Score,HVBP Total Performance: Unweighted Normalized Efficiency Domain Score,HVBP Total Performance: Unweighted Normalized Efficiency and Cost Reduction Domain Score,HVBP Total Performance: Unweighted Normalized Outcome Domain Score,HVBP Total Performance: Unweighted Normalized Safety Domain Score,HVBP Total Performance: Unweighted Patient Experience of Care Domain Score,HVBP Total Performance: Unweighted Patient and Caregiver Centered Experience of Care/Care Coordination Domain Score,HVBP Total Performance: Unweighted Person And Community Engagement Domain Score,HVBP Total Performance: Unweighted Person and Community Engagement Domain Score,HVBP Total Performance: Weighted Clinical Care - Process Domain Score,HVBP Total Performance: Weighted Clinical Process of Care Domain Score,HVBP Total Performance: Weighted Efficiency And Cost Reduction Domain Score,HVBP Total Performance: Weighted Efficiency Domain Score,HVBP Total Performance: Weighted Efficiency and Cost Reduction Domain Score,HVBP Total Performance: Weighted Normalized Clinical Care - Outcomes Domain Score,HVBP Total Performance: Weighted Normalized Clinical Care Domain Score,HVBP Total Performance: Weighted Normalized Clinical Outcomes Domain Score,HVBP Total Performance: Weighted Outcome Domain Score,HVBP Total Performance: Weighted Patient Experience of Care Domain Score,HVBP Total Performance: Weighted Patient and Caregiver Centered Experience of Care/Care 
Coordination Domain Score,HVBP Total Performance: Weighted Person And Community Engagement Domain Score,HVBP Total Performance: Weighted Person and Community Engagement Domain Score,HVBP Total Performance: Weighted Safety Domain Score,file_month,file_year
36,250151,38.666667,,0.0,,,,,,0.0,,96.666667,,,,,0.0,,,,0.0,,,,,,,,,38.666667,7,2017
36,250151,38.666667,,0.0,,,,,,0.0,,96.666667,,,,,0.0,,,,0.0,,,,,,,,,38.666667,10,2017
36,250151,38.666667,,0.0,,,,,,0.0,,96.666667,,,,,0.0,,,,0.0,,,,,,,,,38.666667,12,2016
36,250151,38.666667,,0.0,,,,,,0.0,,96.666667,,,,,0.0,,,,0.0,,,,,,,,,38.666667,4,2017
388,220062,82.333333,,0.0,,,,,,90.0,,93.333333,,,,,0.0,,,,45.0,,,,,,,,,37.333333,4,2017


In [7]:
# Write the final frame as a gzip-compressed pickle (pickle protocol 5).
# The path is relative to the current working directory.
hvb_df.to_pickle(
    'dataframes/hvbp_total_performance_scores_df.pkl.gz',
    compression='gzip',
    protocol=5,
)