# Generate HVBP (Hospital Value-Based Purchasing) total performance scores dataframe

In [1]:
import pandas as pd
import numpy as np
import warnings
from IPython.utils import io
import sys
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Remove pandas' display caps so frames are rendered with all columns and rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Base directory that the per-snapshot CSV paths are appended to.
main_dir = '~/Desktop/Rush/CMS_HospitalArchives/'

## Define Custom Functions

In [2]:
def curate(df):
    """Clean one snapshot frame and return the cleaned copy.

    Steps, each applied only when the relevant column is present:

    * drop rows whose 'Facility ID' or 'Facility Name' is missing;
    * turn 'Facility ID' into a string left-padded with zeros to 6 characters;
    * remove tab characters from every text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with unique column names. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the surviving rows.
    """
    # NaN never compares equal to anything (not even itself), so a test such as
    # `column != np.nan` keeps every row; missing values need notna().
    keep = np.ones(len(df), dtype=bool)
    for required in ('Facility ID', 'Facility Name'):
        if required in df.columns:
            keep &= df[required].notna().to_numpy()

    # Work on an explicit copy so the caller's frame is never altered.
    df = df[keep].copy()

    if 'Facility ID' in df.columns:
        ids = df['Facility ID']
        # A numeric ID column that contained NaN has float dtype; cast whole
        # numbers back to integers so 10001.0 is not rendered as '10001.0'.
        if pd.api.types.is_float_dtype(ids) and (ids % 1 == 0).all():
            ids = ids.astype('int64')
        # Restore leading zeros that are lost when IDs are parsed as numbers.
        df['Facility ID'] = ids.astype(str).str.zfill(6)

    for c in list(df):
        try:
            df[c] = df[c].str.replace("\t", "", regex=False)
        except AttributeError:
            # Column has no string accessor (e.g. numeric dtype): nothing to strip.
            continue

    return df


def rename_and_fill(df):
    """Rename older column headers to the current ones, in place.

    Despite the name, no values are filled; only column labels change.
    Headers that are not present are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are renamed in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with harmonised column names.

    Raises
    ------
    ValueError
        If the frame has repeated column names after renaming (for example
        when both 'Provider ID' and 'Provider Number' were present).
    """
    # old header -> current header; several old spellings can share one target.
    legacy_to_current = {
        'Provider ID': 'Facility ID',
        'Hospital Name': 'Facility Name',
        'Provider Number': 'Facility ID',
        ' ZIP Code': 'ZIP Code',
        'Zip Code': 'ZIP Code',
        'Address 1': 'Address',
        'City/Town': 'City',
        'County/Parish': 'County Name',
        'Telephone Number': 'Phone Number',
    }
    df.rename(columns=legacy_to_current, inplace=True)

    # Repeated labels would make later column selection ambiguous, so stop
    # with an ordinary exception instead of terminating the interpreter.
    repeated = df.columns[df.columns.duplicated()].unique().tolist()
    if repeated:
        raise ValueError('duplicates: {}'.format(repeated))

    return df


def process2(df, lists, yr, mo):
    """Harmonise, clean and label one snapshot frame.

    The frame is passed through `rename_and_fill` and then `curate`. Its column
    names at that point are appended to `lists`, after which the constant
    columns 'file_month' (`mo`) and 'file_year' (`yr`) are added and all
    columns are put in sorted order.

    Returns the processed frame together with the (mutated) `lists`.
    """
    cleaned = curate(rename_and_fill(df))

    # Record the column names as they are before the two file_* columns exist.
    lists.append(list(cleaned))

    n_rows = cleaned.shape[0]
    cleaned['file_month'] = [mo] * n_rows
    cleaned['file_year'] = [yr] * n_rows

    ordered_columns = sorted(cleaned.columns)
    return cleaned.reindex(ordered_columns, axis=1), lists


## Load Files

In [3]:
# One entry per archived snapshot: (file_year, file_month, path relative to main_dir).
# Keeping the three values together means a label cannot drift out of step with
# its file; each month below matches the date embedded in the snapshot's folder name.
snapshots = [
    ('2023', '01', '2023/hospitals_01_2023/hvbp_tps.csv'),
    ('2023', '04', '2023/hospitals_04_2023/hvbp_tps.csv'),
    ('2023', '07', '2023/hospitals_07_2023/hvbp_tps.csv'),
    ('2023', '10', '2023/hospitals_10_2023/hvbp_tps.csv'),

    ('2022', '01', '2022/hospitals_01_2022/hvbp_tps.csv'),
    ('2022', '04', '2022/hospitals_04_2022/hvbp_tps.csv'),
    ('2022', '07', '2022/hospitals_07_2022/hvbp_tps.csv'),
    ('2022', '10', '2022/hospitals_10_2022/hvbp_tps.csv'),

    ('2021', '01', '2021/hospitals_01_2021/hvbp_tps.csv'),
    ('2021', '03', '2021/hospitals_03_2021/hvbp_tps.csv'),
    ('2021', '04', '2021/hospitals_04_2021/hvbp_tps.csv'),
    ('2021', '07', '2021/hospitals_07_2021/hvbp_tps.csv'),
    ('2021', '10', '2021/hospitals_10_2021/hvbp_tps.csv'),

    ('2020', '10', '2020/hospitals_archive_10_2020/hvbp_tps_12_09_2019.csv'),
    ('2020', '07', '2020/hospitals_archive_07_2020/hvbp_tps.csv'),
    ('2020', '04', '2020/HOSArchive_Revised_Flatfiles_20200422/hvbp_tps_12_09_2019.csv'),
    ('2020', '01', '2020/HOSArchive_Revised_Flatfiles_20200129/hvbp_tps_12_09_2019.csv'),

    ('2019', '07', '2019/HOSArchive_Revised_Flatfiles_20190702/hvbp_tps_11_09_2018.csv'),
    ('2019', '10', '2019/HOSArchive_Revised_Flatfiles_20191030/hvbp_tps_11_09_2018.csv'),
    ('2019', '04', '2019/HOSArchive_Revised_FlatFiles_20190424/hvbp_tps_11_09_2018.csv'),
    ('2019', '03', '2019/HOSArchive_Revised_Flatfiles_20190321/hvbp_tps_11_09_2018.csv'),

    ('2018', '10', '2018/HOSArchive_Revised_FlatFiles_20181031/hvbp_tps_11_07_2017.csv'),
    ('2018', '07', '2018/HOSArchive_Revised_FlatFiles_20180725/hvbp_tps_11_07_2017.csv'),
    ('2018', '05', '2018/HOSArchive_Revised_FlatFiles_20180523/hvbp_tps_11_07_2017.csv'),
    ('2018', '01', '2018/HOSArchive_Revised_FlatFiles_20180126/hvbp_tps_11_07_2017.csv'),

    ('2017', '10', '2017/HOSArchive_Revised_FlatFiles_20171024/hvbp_tps_11_10_2016.csv'),
    ('2017', '07', '2017/HOSArchive_Revised_FlatFiles_20170726/hvbp_tps_11_10_2016.csv'),
    ('2017', '04', '2017/HOSArchive_Revised_Flatfiles_20170428/hvbp_tps_11_10_2016.csv'),

    ('2016', '12', '2016/HOSArchive_Revised_Flatfiles_20161219/hvbp_tps_11_10_2016.csv'),
    ('2016', '11', '2016/Hospital_Revised_FlatFiles_20161110/hvbp_tps_08_26_2016.csv'),
    ('2016', '08', '2016/HOSArchive_Revised_FlatFiles_20160810/hvbp_tps_06_08_2016.csv'),
    ('2016', '05', '2016/HOSArchive_Revised_FlatFiles_20160504/hvbp_tps_02_18_2016.csv'),

    ('2015', '12', '2015/HOSArchive_Revised_FlatFiles_20151210/hvbp_tps_10_28_2015.csv'),
    ('2015', '10', '2015/HOSArchive_Revised_FlatFiles_20151008/hvbp_tps_08_06_2015.csv'),
    ('2015', '07', '2015/HOSArchive_Revised_FlatFiles_20150716/hvbp_tps_05_28_2015.csv'),
    ('2015', '05', '2015/HOSArchive_Revised_Flatfiles_20150506/hvbp_tps_02_18_2015.csv'),
    ('2015', '04', '2015/HOSArchive_Revised_Flatfiles_20150416/hvbp_tps_02_18_2015.csv'),
    ('2015', '01', '2015/HOSArchive_Revised_Flatfiles_20150122/hvbp_tps_10_28_2014.csv'),

    ('2014', '12', '2014/HOSArchive_Revised_Flatfiles_20141218/hvbp_tps_10_28_2014.csv'),
    ('2014', '10', '2014/HOSArchive_Revised_Flatfiles_20141023/hvbp_tps_02_25_2014.csv'),
    ('2014', '07', '2014/HOSArchive_Revised_Flatfiles_20140717/hvbp_tps_02_25_2014.csv'),
    ('2014', '04', '2014/HOSArchive_Revised_Flatfiles_20140417/hvbp_tps_02_25_2014.csv'),
    ('2014', '01', '2014/HOSArchive_Revised_Flatfiles_20140101/hvbp_tps_10_28_2013.csv'),

    ('2013', '10', '2013/HOSArchive_Revised_Flatfiles_20131001/hvbp_tps_08_16_2013.csv'),
    ('2013', '07', '2013/HOSArchive_Revised_Flatfiles_20130701/hvbp_tps_05_28_2013.csv'),
    ('2013', '04', '2013/HOSArchive_Revised_Flatfiles_20130401/hvbp_tps_02_07_2013.csv'),
]

# Flat views of the table above, kept under their established names.
yrs = [year for year, _, _ in snapshots]
mos = [month for _, month, _ in snapshots]
subdirs = [path for _, _, path in snapshots]

df_list = []
lists = []  # column names of each snapshot, collected by process2

for yr, mo, subdir in snapshots:
    # Anything printed or displayed while the CSV is being read is captured
    # and discarded so it does not flood the cell output.
    with io.capture_output():
        df = pd.read_csv(main_dir + subdir, encoding="ISO-8859-1")
    print(subdir + ' :  (rows, columns) =', df.shape)
    df, lists = process2(df, lists, yr, mo)
    df_list.append(df)

# Stack all snapshots; columns missing from a snapshot are filled with NaN.
df = pd.concat(df_list)
print('df.shape:', df.shape)

del df_list
df.head()

2023/hospitals_01_2023/hvbp_tps.csv :  (rows, columns) = (2517, 17)
2023/hospitals_04_2023/hvbp_tps.csv :  (rows, columns) = (2517, 17)
2023/hospitals_07_2023/hvbp_tps.csv :  (rows, columns) = (2517, 17)
2023/hospitals_10_2023/hvbp_tps.csv :  (rows, columns) = (2517, 17)
2022/hospitals_01_2022/hvbp_tps.csv :  (rows, columns) = (2676, 17)
2022/hospitals_04_2022/hvbp_tps.csv :  (rows, columns) = (2236, 17)
2022/hospitals_07_2022/hvbp_tps.csv :  (rows, columns) = (2236, 17)
2022/hospitals_10_2022/hvbp_tps.csv :  (rows, columns) = (2236, 17)
2021/hospitals_01_2021/hvbp_tps.csv :  (rows, columns) = (2676, 17)
2021/hospitals_03_2021/hvbp_tps.csv :  (rows, columns) = (2676, 17)
2021/hospitals_04_2021/hvbp_tps.csv :  (rows, columns) = (2676, 17)
2021/hospitals_07_2021/hvbp_tps.csv :  (rows, columns) = (2676, 17)
2021/hospitals_10_2021/hvbp_tps.csv :  (rows, columns) = (2676, 17)
2020/hospitals_archive_10_2020/hvbp_tps_12_09_2019.csv :  (rows, columns) = (2731, 16)
2020/hospitals_archive_07_202

Unnamed: 0,Address,City,County Name,Facility ID,Facility Name,Fiscal Year,State,Total Performance Score,Unweighted Normalized Clinical Outcomes Domain Score,Unweighted Normalized Efficiency And Cost Reduction Domain Score,Unweighted Normalized Safety Domain Score,Unweighted Person And Community Engagement Domain Score,Weighted Efficiency And Cost Reduction Domain Score,Weighted Normalized Clinical Outcomes Domain Score,Weighted Person And Community Engagement Domain Score,Weighted Safety Domain Score,ZIP Code,file_month,file_year,Unweighted Normalized Efficiency and Cost Reduction Domain Score,Unweighted Person and Community Engagement Domain Score,Weighted Efficiency and Cost Reduction Domain Score,Weighted Person and Community Engagement Domain Score,Unweighted Normalized Clinical Care Domain Score,Weighted Normalized Clinical Care Domain Score,Unweighted Patient and Caregiver Centered Experience of Care/Care Coordination Domain Score,Weighted Patient and Caregiver Centered Experience of Care/Care Coordination Domain Score,Unweighted Normalized Clinical Care - Outcomes Domain Score,Unweighted Normalized Clinical Care - Process Domain Score,Weighted Clinical Care - Process Domain Score,Weighted Normalized Clinical Care - Outcomes Domain Score,Unweighted Normalized Clinical Process of Care Domain Score,Unweighted Normalized Efficiency Domain Score,Unweighted Normalized Outcome Domain Score,Unweighted Patient Experience of Care Domain Score,Weighted Clinical Process of Care Domain Score,Weighted Efficiency Domain Score,Weighted Outcome Domain Score,Weighted Patient Experience of Care Domain Score
0,1108 ROSS CLARK CIRCLE,DOTHAN,HOUSTON,10001,SOUTHEAST HEALTH MEDICAL CENTER,2023.0,AL,Not Available,52.0,0.0,Not Available,Not Available,0.0,13.0,Not Available,Not Available,36301,1,2023,,,,,,,,,,,,,,,,,,,,
1,2505 U S HIGHWAY 431 NORTH,BOAZ,MARSHALL,10005,MARSHALL MEDICAL CENTERS,2023.0,AL,Not Available,40.0,30.0,Not Available,Not Available,7.5,10.0,Not Available,Not Available,35957,1,2023,,,,,,,,,,,,,,,,,,,,
2,1701 VETERANS DRIVE,FLORENCE,LAUDERDALE,10006,NORTH ALABAMA MEDICAL CENTER,2023.0,AL,Not Available,12.0,10.0,Not Available,Not Available,2.5,3.0,Not Available,Not Available,35630,1,2023,,,,,,,,,,,,,,,,,,,,
3,702 N MAIN ST,OPP,COVINGTON,10007,MIZELL MEMORIAL HOSPITAL,2023.0,AL,Not Available,10.0,30.0,Not Available,Not Available,7.5,2.5,Not Available,Not Available,36467,1,2023,,,,,,,,,,,,,,,,,,,,
4,50 MEDICAL PARK EAST DRIVE,BIRMINGHAM,JEFFERSON,10011,ST. VINCENT'S EAST,2023.0,AL,Not Available,20.0,10.0,Not Available,Not Available,2.5,5.0,Not Available,Not Available,35235,1,2023,,,,,,,,,,,,,,,,,,,,


In [4]:
# Drop the location and fiscal-year columns; they are not kept in the scores table.
df = df.drop(columns=['Address', 'City', 'County Name',
                      'State', 'ZIP Code', 'Fiscal Year'])


def _columns_containing(frame, needles):
    """Return the non-numeric columns of `frame` that hold at least one string
    containing any of the substrings in `needles`."""
    def hit(value):
        # Non-string entries (e.g. NaN) can never match.
        return isinstance(value, str) and any(needle in value for needle in needles)

    return [name for name in frame.columns
            if not pd.api.types.is_numeric_dtype(frame[name])
            and frame[name].map(hit).any()]


# 'N out of 9' and 'N out of 10' point strings -> N. Both scales are translated
# for every matching column, so a column that mixes the two scales does not
# lose the values written in the other form.
points_lookup = {'{} out of {}'.format(points, top): points
                 for top in (9, 10) for points in range(top + 1)}

for col in _columns_containing(df, ['out of 9', 'out of 10']):
    as_points = df[col].map(
        lambda v: points_lookup.get(v, v) if isinstance(v, str) else v)
    # Anything that is still not a number (e.g. 'Not Available') becomes NaN.
    df[col] = pd.to_numeric(as_points, errors='coerce')


# Percentages such as '85%' -> 85. Only trailing percent signs are removed.
for col in _columns_containing(df, ['%']):
    without_sign = df[col].map(
        lambda v: v.rstrip('%') if isinstance(v, str) else v)
    df[col] = pd.to_numeric(without_sign, errors='coerce')


# Put the identifying columns first, leaving all other columns in their current order.
lead_cols = ['Facility ID', 'Facility Name', 'file_month', 'file_year']
df = df[lead_cols + [name for name in df.columns if name not in lead_cols]]

# Report the shape before and after each de-duplication pass.
print(df.shape)
df = df.drop_duplicates()
print(df.shape)
df = df.drop_duplicates(subset=['Facility ID', 'Facility Name', 'file_month', 'file_year'])
print(df.shape)
df.head()

(128423, 33)
(128423, 33)
(128423, 33)


Unnamed: 0,Facility ID,Facility Name,file_month,file_year,Total Performance Score,Unweighted Normalized Clinical Outcomes Domain Score,Unweighted Normalized Efficiency And Cost Reduction Domain Score,Unweighted Normalized Safety Domain Score,Unweighted Person And Community Engagement Domain Score,Weighted Efficiency And Cost Reduction Domain Score,Weighted Normalized Clinical Outcomes Domain Score,Weighted Person And Community Engagement Domain Score,Weighted Safety Domain Score,Unweighted Normalized Efficiency and Cost Reduction Domain Score,Unweighted Person and Community Engagement Domain Score,Weighted Efficiency and Cost Reduction Domain Score,Weighted Person and Community Engagement Domain Score,Unweighted Normalized Clinical Care Domain Score,Weighted Normalized Clinical Care Domain Score,Unweighted Patient and Caregiver Centered Experience of Care/Care Coordination Domain Score,Weighted Patient and Caregiver Centered Experience of Care/Care Coordination Domain Score,Unweighted Normalized Clinical Care - Outcomes Domain Score,Unweighted Normalized Clinical Care - Process Domain Score,Weighted Clinical Care - Process Domain Score,Weighted Normalized Clinical Care - Outcomes Domain Score,Unweighted Normalized Clinical Process of Care Domain Score,Unweighted Normalized Efficiency Domain Score,Unweighted Normalized Outcome Domain Score,Unweighted Patient Experience of Care Domain Score,Weighted Clinical Process of Care Domain Score,Weighted Efficiency Domain Score,Weighted Outcome Domain Score,Weighted Patient Experience of Care Domain Score
0,10001,SOUTHEAST HEALTH MEDICAL CENTER,1,2023,Not Available,52.0,0.0,Not Available,Not Available,0.0,13.0,Not Available,Not Available,,,,,,,,,,,,,,,,,,,,
1,10005,MARSHALL MEDICAL CENTERS,1,2023,Not Available,40.0,30.0,Not Available,Not Available,7.5,10.0,Not Available,Not Available,,,,,,,,,,,,,,,,,,,,
2,10006,NORTH ALABAMA MEDICAL CENTER,1,2023,Not Available,12.0,10.0,Not Available,Not Available,2.5,3.0,Not Available,Not Available,,,,,,,,,,,,,,,,,,,,
3,10007,MIZELL MEMORIAL HOSPITAL,1,2023,Not Available,10.0,30.0,Not Available,Not Available,7.5,2.5,Not Available,Not Available,,,,,,,,,,,,,,,,,,,,
4,10011,ST. VINCENT'S EAST,1,2023,Not Available,20.0,10.0,Not Available,Not Available,2.5,5.0,Not Available,Not Available,,,,,,,,,,,,,,,,,,,,


## Save dataframe

In [5]:
# Write the final table as a gzip-compressed pickle using pickle protocol 5.
df.to_pickle(
    '~/GitHub/hospitals-data-archive/dataframes/partial_dataframes/hvbp_total_performance_scores_df.pkl.gz',
    compression='gzip',
    protocol=5,
)
