# Generate HVBP HCAHPS dataframe

In [1]:
import pandas as pd
import numpy as np
import warnings
from IPython.utils import io
import sys
# Notebook-wide settings: silence warnings and show every row/column
# when a dataframe is displayed.
warnings.filterwarnings('ignore')
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

# Root folder holding the downloaded CMS hospital archive files.
main_dir = '~/Desktop/Rush/CMS_HospitalArchives/'

## Define Custom Functions

In [2]:
def curate(df):
    """Clean the identifier and text columns of one HVBP dataframe.

    Steps, each applied only when the relevant column exists:

    * drop rows whose 'Facility ID' or 'Facility Name' is missing;
    * store 'Facility ID' as a string left-padded with zeros to six
      characters (integer parsing strips the leading zero);
    * remove tab characters from every string cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose columns already use the harmonized names.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy; the input dataframe is not modified.
    """

    def _strip_tabs(value):
        # Only touch real strings so numbers/NaN in mixed columns survive.
        return value.replace("\t", "") if isinstance(value, str) else value

    # NaN never compares equal to anything (including itself), so a
    # `!= np.nan` test keeps every row; notna() is the working filter.
    keep = np.ones(len(df), dtype=bool)
    for required in ('Facility ID', 'Facility Name'):
        if required in df.columns:
            keep &= df[required].notna().to_numpy()
    df = df[keep].copy()

    if 'Facility ID' in df.columns:
        ids = df['Facility ID']
        if pd.api.types.is_float_dtype(ids):
            # A column parsed as float would stringify as '10001.0'.
            ids = ids.astype('int64')
        df['Facility ID'] = ids.astype(str).str.zfill(6)

    for c in df.columns:
        column = df[c]
        if (pd.api.types.is_object_dtype(column)
                or pd.api.types.is_string_dtype(column)):
            df[c] = column.map(_strip_tabs)

    return df


def rename_and_fill(df):
    """Rename legacy column headers to the harmonized names, in place.

    Several old spellings can map to the same harmonized name (for
    example 'Provider ID' and 'Provider Number' both become
    'Facility ID'). If a file contains more than one of them, the
    result has duplicated column names; in that case the duplicates are
    printed and the run is aborted.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe as read from one archive file. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same dataframe object, with renamed columns.

    Raises
    ------
    SystemExit
        With exit status 1 when renaming produces duplicate column names.
    """

    renames = {
        'Provider ID': 'Facility ID',
        "Measure Start Date": "Start Date",
        "Measure End Date": "End Date",
        'Hospital Name': 'Facility Name',
        'Provider Number': 'Facility ID',
        ' ZIP Code': 'ZIP Code',
        'Hospital_Name': 'Facility Name',
        'ZIP_Code': 'ZIP Code',
        'County_Name': 'County Name',
        'Communication With Nurses Achievement Points':
            'Communication with Nurses Achievement Points',
        'Communication With Nurses Improvement Points':
            'Communication with Nurses Improvement Points',
        'Communication With Nurses Dimension Score':
            'Communication with Nurses Dimension Score',
        'Communication With Doctors Achievement Points':
            'Communication with Doctors Achievement Points',
        'Communication With Doctors Improvement Points':
            'Communication with Doctors Improvement Points',
        'Communication With Doctors Dimension Score':
            'Communication with Doctors Dimension Score',
        'Communication With Nurses Floor':
            'Communication with Nurses Floor',
        'Responsiveness Of Hospital Staff Achievement Points':
            'Responsiveness of Hospital Staff Achievement Points',
        'Responsiveness Of Hospital Staff Improvement Points':
            'Responsiveness of Hospital Staff Improvement Points',
        'Responsiveness Of Hospital Staff Dimension Score':
            'Responsiveness of Hospital Staff Dimension Score',
        'Cleanliness And Quietness Of Hospital Environment Achievement Points':
            'Cleanliness and Quietness of Hospital Environment Achievement Points',
        'Cleanliness And Quietness Of Hospital Environment Improvement Points':
            'Cleanliness and Quietness of Hospital Environment Improvement Points',
        'Cleanliness And Quietness Of Hospital Environment Dimension Score':
            'Cleanliness and Quietness of Hospital Environment Dimension Score',
        'Overall Rating Of Hospital Achievement Points':
            'Overall Rating of Hospital Achievement Points',
        'Overall Rating Of Hospital Improvement Points':
            'Overall Rating of Hospital Improvement Points',
        'Overall Rating Of Hospital Dimension Score':
            'Overall Rating of Hospital Dimension Score',
        'Hcahps Base Score': 'HCAHPS Base Score',
        'Overall Rating Of Hospital Floor':
            'Overall Rating of Hospital Floor',
        'Hcahps Consistency Score': 'HCAHPS Consistency Score',
        'Communication About Medicines Floor':
            'Communication about Medicines Floor',
        'Communication About Medicines Achievement Threshold':
            'Communication about Medicines Achievement Threshold',
        'Communication About Medicines Benchmark':
            'Communication about Medicines Benchmark',
        'Communication About Medicines Baseline Rate':
            'Communication about Medicines Baseline Rate',
        'Communication About Medicines Performance Rate':
            'Communication about Medicines Performance Rate',
        'Communication About Medicines Achievement Points':
            'Communication about Medicines Achievement Points',
        'Communication About Medicines Improvement Points':
            'Communication about Medicines Improvement Points',
        'Communication About Medicines Dimension Score':
            'Communication about Medicines Dimension Score',
        'Communication With Nurses Achievement Threshold':
            'Communication with Nurses Achievement Threshold',
        'Communication With Nurses Benchmark':
            'Communication with Nurses Benchmark',
        'Communication With Nurses Baseline Rate':
            'Communication with Nurses Baseline Rate',
        'Communication With Nurses Performance Rate':
            'Communication with Nurses Performance Rate',
        'Communication With Doctors Floor':
            'Communication with Doctors Floor',
        'Communication With Doctors Achievement Threshold':
            'Communication with Doctors Achievement Threshold',
        'Communication With Doctors Benchmark':
            'Communication with Doctors Benchmark',
        'Communication With Doctors Baseline Rate':
            'Communication with Doctors Baseline Rate',
        'Communication With Doctors Performance Rate':
            'Communication with Doctors Performance Rate',
        'Responsiveness Of Hospital Staff Floor':
            'Responsiveness of Hospital Staff Floor',
        'Responsiveness Of Hospital Staff Achievement Threshold':
            'Responsiveness of Hospital Staff Achievement Threshold',
        'Responsiveness Of Hospital Staff Benchmark':
            'Responsiveness of Hospital Staff Benchmark',
        'Responsiveness Of Hospital Staff Baseline Rate':
            'Responsiveness of Hospital Staff Baseline Rate',
        'Responsiveness Of Hospital Staff Performance Rate':
            'Responsiveness of Hospital Staff Performance Rate',
        'Cleanliness And Quietness Of Hospital Environment Floor':
            'Cleanliness and Quietness of Hospital Environment Floor',
        'Cleanliness And Quietness Of Hospital Environment Achievement Threshold':
            'Cleanliness and Quietness of Hospital Environment Achievement Threshold',
        'Cleanliness And Quietness Of Hospital Environment Benchmark':
            'Cleanliness and Quietness of Hospital Environment Benchmark',
        'Cleanliness And Quietness Of Hospital Environment Baseline Rate':
            'Cleanliness and Quietness of Hospital Environment Baseline Rate',
        'Cleanliness And Quietness Of Hospital Environment Performance Rate':
            'Cleanliness and Quietness of Hospital Environment Performance Rate',
        'Overall Rating Of Hospital Achievement Threshold':
            'Overall Rating of Hospital Achievement Threshold',
        'Overall Rating Of Hospital Benchmark':
            'Overall Rating of Hospital Benchmark',
        'Overall Rating Of Hospital Baseline Rate':
            'Overall Rating of Hospital Baseline Rate',
        'Overall Rating Of Hospital Performance Rate':
            'Overall Rating of Hospital Performance Rate',
        'Address 1': 'Address',
        'City/Town': 'City',
        'County/Parish': 'County Name',
        'Telephone Number': 'Phone Number',
    }

    # No harmonized name is itself a legacy name, so one rename call is
    # equivalent to renaming the columns one at a time. Labels missing
    # from the dataframe are ignored by rename().
    df.rename(columns=renames, inplace=True)

    duplicated = df.columns[df.columns.duplicated()].unique().tolist()
    if duplicated:
        print('duplicates:', duplicated)
        # Non-zero status: a bare sys.exit() would report success.
        sys.exit(1)

    return df


def process2(df, lists, yr, mo):
    """Harmonize one archive file and tag it with its release date.

    The dataframe is passed through rename_and_fill() and curate(), its
    column names (before the tag columns are added) are appended to
    `lists`, and 'file_month' / 'file_year' columns holding `mo` / `yr`
    are added. Columns are returned in sorted order.

    Returns the processed dataframe and the (mutated) `lists`.
    """
    cleaned = curate(rename_and_fill(df))
    lists.append(cleaned.columns.tolist())

    n_rows = len(cleaned)
    cleaned['file_month'] = [mo] * n_rows
    cleaned['file_year'] = [yr] * n_rows

    ordered = sorted(cleaned.columns)
    return cleaned.reindex(columns=ordered), lists

## Load Files

In [3]:
df_list = []
lists = []

# One entry per archive file: (file_year, file_month, path below main_dir).
# Keeping the three values together prevents the labels drifting out of
# step with the paths. The 2019 July/October entries were previously
# stamped with each other's month; the load order itself is unchanged.
hvbp_files = [
    ('2023', '01', '2023/hospitals_01_2023/hvbp_person_and_community_engagement.csv'),
    ('2023', '04', '2023/hospitals_04_2023/hvbp_person_and_community_engagement.csv'),
    ('2023', '07', '2023/hospitals_07_2023/hvbp_person_and_community_engagement.csv'),
    ('2023', '10', '2023/hospitals_10_2023/hvbp_person_and_community_engagement.csv'),

    ('2022', '01', '2022/hospitals_01_2022/hvbp_person_and_community_engagement.csv'),
    ('2022', '04', '2022/hospitals_04_2022/hvbp_person_and_community_engagement.csv'),
    ('2022', '07', '2022/hospitals_07_2022/hvbp_person_and_community_engagement.csv'),
    ('2022', '10', '2022/hospitals_10_2022/hvbp_person_and_community_engagement.csv'),

    ('2021', '01', '2021/hospitals_01_2021/hvbp_person_and_community_engagement.csv'),
    ('2021', '03', '2021/hospitals_03_2021/hvbp_person_and_community_engagement.csv'),
    ('2021', '04', '2021/hospitals_04_2021/hvbp_person_and_community_engagement.csv'),
    ('2021', '07', '2021/hospitals_07_2021/hvbp_person_and_community_engagement.csv'),
    ('2021', '10', '2021/hospitals_10_2021/hvbp_person_and_community_engagement.csv'),

    ('2020', '10', '2020/hospitals_archive_10_2020/hvbp_hcahps_12_09_2019.csv'),
    ('2020', '07', '2020/hospitals_archive_07_2020/hvbp_hcahps.csv'),
    ('2020', '04', '2020/HOSArchive_Revised_Flatfiles_20200422/hvbp_hcahps_12_09_2019.csv'),
    ('2020', '01', '2020/HOSArchive_Revised_Flatfiles_20200129/hvbp_hcahps_12_09_2019.csv'),

    ('2019', '07', '2019/HOSArchive_Revised_Flatfiles_20190702/hvbp_hcahps_11_09_2018.csv'),
    ('2019', '10', '2019/HOSArchive_Revised_Flatfiles_20191030/hvbp_hcahps_11_09_2018.csv'),
    ('2019', '04', '2019/HOSArchive_Revised_FlatFiles_20190424/hvbp_hcahps_11_09_2018.csv'),
    ('2019', '03', '2019/HOSArchive_Revised_Flatfiles_20190321/hvbp_hcahps_11_09_2018.csv'),

    ('2018', '10', '2018/HOSArchive_Revised_FlatFiles_20181031/hvbp_hcahps_11_07_2017.csv'),
    ('2018', '07', '2018/HOSArchive_Revised_FlatFiles_20180725/hvbp_hcahps_11_07_2017.csv'),
    ('2018', '05', '2018/HOSArchive_Revised_FlatFiles_20180523/hvbp_hcahps_11_07_2017.csv'),
    ('2018', '01', '2018/HOSArchive_Revised_FlatFiles_20180126/hvbp_hcahps_11_07_2017.csv'),

    ('2017', '10', '2017/HOSArchive_Revised_FlatFiles_20171024/hvbp_hcahps_11_10_2016.csv'),
    ('2017', '07', '2017/HOSArchive_Revised_FlatFiles_20170726/hvbp_hcahps_11_10_2016.csv'),
    ('2017', '04', '2017/HOSArchive_Revised_Flatfiles_20170428/hvbp_hcahps_11_10_2016.csv'),

    ('2016', '12', '2016/HOSArchive_Revised_Flatfiles_20161219/hvbp_hcahps_11_10_2016.csv'),
    ('2016', '11', '2016/Hospital_Revised_FlatFiles_20161110/hvbp_hcahps_08_26_2016.csv'),
    ('2016', '08', '2016/HOSArchive_Revised_FlatFiles_20160810/hvbp_hcahps_06_08_2016.csv'),
    ('2016', '05', '2016/HOSArchive_Revised_FlatFiles_20160504/hvbp_hcahps_02_18_2016.csv'),

    ('2015', '12', '2015/HOSArchive_Revised_FlatFiles_20151210/hvbp_hcahps_10_28_2015.csv'),
    ('2015', '10', '2015/HOSArchive_Revised_FlatFiles_20151008/hvbp_hcahps_08_06_2015.csv'),
    ('2015', '07', '2015/HOSArchive_Revised_FlatFiles_20150716/hvbp_hcahps_05_28_2015.csv'),
    ('2015', '05', '2015/HOSArchive_Revised_Flatfiles_20150506/hvbp_hcahps_02_18_2015.csv'),
    ('2015', '04', '2015/HOSArchive_Revised_Flatfiles_20150416/hvbp_hcahps_02_18_2015.csv'),
    ('2015', '01', '2015/HOSArchive_Revised_Flatfiles_20150122/hvbp_hcahps_10_28_2014.csv'),

    ('2014', '12', '2014/HOSArchive_Revised_Flatfiles_20141218/hvbp_hcahps_10_28_2014.csv'),
    ('2014', '10', '2014/HOSArchive_Revised_Flatfiles_20141023/hvbp_hcahps_02_25_2014.csv'),
    ('2014', '07', '2014/HOSArchive_Revised_Flatfiles_20140717/hvbp_hcahps_02_25_2014.csv'),
    ('2014', '04', '2014/HOSArchive_Revised_Flatfiles_20140417/hvbp_hcahps_02_25_2014.csv'),
    ('2014', '01', '2014/HOSArchive_Revised_Flatfiles_20140101/hvbp_hcahps_10_28_2013.csv'),

    ('2013', '10', '2013/HOSArchive_Revised_Flatfiles_20131001/hvbp_hcahps_08_16_2013.csv'),
    ('2013', '07', '2013/HOSArchive_Revised_Flatfiles_20130701/hvbp_hcahps_05_28_2013.csv'),
    ('2013', '04', '2013/HOSArchive_Revised_Flatfiles_20130401/hvbp_hcahps_02_07_2013.csv'),
]

# Parallel views, kept so anything still using the old names keeps working.
yrs = [yr for yr, _, _ in hvbp_files]
mos = [mo for _, mo, _ in hvbp_files]
subdirs = [path for _, _, path in hvbp_files]

for yr, mo, subdir in hvbp_files:
    # Capture anything written to stdout/stderr while the file is parsed.
    with io.capture_output() as captured:
        df = pd.read_csv(main_dir + subdir, encoding="ISO-8859-1")
    print(subdir + ' :  (rows, columns) =', df.shape)
    df, lists = process2(df, lists, yr, mo)
    df_list.append(df)

# Stack all releases; columns missing from a release are filled with NaN.
df = pd.concat(df_list)
print('df.shape:', df.shape)

del df_list
df.head()

2023/hospitals_01_2023/hvbp_person_and_community_engagement.csv :  (rows, columns) = (2517, 74)
2023/hospitals_04_2023/hvbp_person_and_community_engagement.csv :  (rows, columns) = (2517, 74)
2023/hospitals_07_2023/hvbp_person_and_community_engagement.csv :  (rows, columns) = (2517, 74)
2023/hospitals_10_2023/hvbp_person_and_community_engagement.csv :  (rows, columns) = (2517, 74)
2022/hospitals_01_2022/hvbp_person_and_community_engagement.csv :  (rows, columns) = (2676, 74)
2022/hospitals_04_2022/hvbp_person_and_community_engagement.csv :  (rows, columns) = (2236, 74)
2022/hospitals_07_2022/hvbp_person_and_community_engagement.csv :  (rows, columns) = (2236, 74)
2022/hospitals_10_2022/hvbp_person_and_community_engagement.csv :  (rows, columns) = (2236, 74)
2021/hospitals_01_2021/hvbp_person_and_community_engagement.csv :  (rows, columns) = (2676, 74)
2021/hospitals_03_2021/hvbp_person_and_community_engagement.csv :  (rows, columns) = (2676, 74)
2021/hospitals_04_2021/hvbp_person_and_c

Unnamed: 0,Address,Care Transition Achievement Points,Care Transition Achievement Threshold,Care Transition Baseline Rate,Care Transition Benchmark,Care Transition Dimension Score,Care Transition Floor,Care Transition Improvement Points,Care Transition Performance Rate,City,Cleanliness and Quietness of Hospital Environment Achievement Points,Cleanliness and Quietness of Hospital Environment Achievement Threshold,Cleanliness and Quietness of Hospital Environment Baseline Rate,Cleanliness and Quietness of Hospital Environment Benchmark,Cleanliness and Quietness of Hospital Environment Dimension Score,Cleanliness and Quietness of Hospital Environment Floor,Cleanliness and Quietness of Hospital Environment Improvement Points,Cleanliness and Quietness of Hospital Environment Performance Rate,Communication about Medicines Achievement Points,Communication about Medicines Achievement Threshold,Communication about Medicines Baseline Rate,Communication about Medicines Benchmark,Communication about Medicines Dimension Score,Communication about Medicines Floor,Communication about Medicines Improvement Points,Communication about Medicines Performance Rate,Communication with Doctors Achievement Points,Communication with Doctors Achievement Threshold,Communication with Doctors Baseline Rate,Communication with Doctors Benchmark,Communication with Doctors Dimension Score,Communication with Doctors Floor,Communication with Doctors Improvement Points,Communication with Doctors Performance Rate,Communication with Nurses Achievement Points,Communication with Nurses Achievement Threshold,Communication with Nurses Baseline Rate,Communication with Nurses Benchmark,Communication with Nurses Dimension Score,Communication with Nurses Floor,Communication with Nurses Improvement Points,Communication with Nurses Performance Rate,County Name,Discharge Information Achievement Points,Discharge Information Achievement Threshold,Discharge Information Baseline Rate,Discharge Information 
Benchmark,Discharge Information Dimension Score,Discharge Information Floor,Discharge Information Improvement Points,Discharge Information Performance Rate,Facility ID,Facility Name,Fiscal Year,HCAHPS Base Score,HCAHPS Consistency Score,Overall Rating of Hospital Achievement Points,Overall Rating of Hospital Achievement Threshold,Overall Rating of Hospital Baseline Rate,Overall Rating of Hospital Benchmark,Overall Rating of Hospital Dimension Score,Overall Rating of Hospital Floor,Overall Rating of Hospital Improvement Points,Overall Rating of Hospital Performance Rate,Responsiveness of Hospital Staff Achievement Points,Responsiveness of Hospital Staff Achievement Threshold,Responsiveness of Hospital Staff Baseline Rate,Responsiveness of Hospital Staff Benchmark,Responsiveness of Hospital Staff Dimension Score,Responsiveness of Hospital Staff Floor,Responsiveness of Hospital Staff Improvement Points,Responsiveness of Hospital Staff Performance Rate,State,ZIP Code,file_month,file_year,Pain Management Achievement Points,Pain Management Achievement Threshold,Pain Management Baseline Rate,Pain Management Benchmark,Pain Management Dimension Score,Pain Management Floor,Pain Management Improvement Points,Pain Management Performance Rate
0,1108 ROSS CLARK CIRCLE,Not Available,51.84%,52.7031%,63.57%,Not Available,25.64%,Not Available,51.9870%,DOTHAN,Not Available,65.63%,68.9052%,79.64%,Not Available,45.94%,Not Available,67.8409%,Not Available,63.11%,65.2044%,74.05%,Not Available,39.82%,Not Available,61.0156%,Not Available,79.83%,80.2871%,87.97%,Not Available,62.41%,Not Available,78.6340%,Not Available,79.42%,77.4970%,87.71%,Not Available,53.50%,Not Available,76.9500%,HOUSTON,Not Available,87.23%,89.5346%,92.21%,Not Available,66.92%,Not Available,88.4028%,10001,SOUTHEAST HEALTH MEDICAL CENTER,2023.0,Not Available,Not Available,Not Available,71.66%,71.7905%,85.39%,Not Available,36.31%,Not Available,72.6047%,Not Available,65.52%,66.3995%,81.22%,Not Available,40.40%,Not Available,59.8681%,AL,36301,1,2023,,,,,,,,
1,2505 U S HIGHWAY 431 NORTH,Not Available,51.84%,47.8347%,63.57%,Not Available,25.64%,Not Available,51.0971%,BOAZ,Not Available,65.63%,68.1608%,79.64%,Not Available,45.94%,Not Available,66.8517%,Not Available,63.11%,68.3584%,74.05%,Not Available,39.82%,Not Available,59.4815%,Not Available,79.83%,84.9463%,87.97%,Not Available,62.41%,Not Available,82.2080%,Not Available,79.42%,78.3784%,87.71%,Not Available,53.50%,Not Available,78.5265%,MARSHALL,Not Available,87.23%,89.9947%,92.21%,Not Available,66.92%,Not Available,85.3044%,10005,MARSHALL MEDICAL CENTERS,2023.0,Not Available,Not Available,Not Available,71.66%,71.4240%,85.39%,Not Available,36.31%,Not Available,68.1886%,Not Available,65.52%,64.4797%,81.22%,Not Available,40.40%,Not Available,51.3138%,AL,35957,1,2023,,,,,,,,
2,1701 VETERANS DRIVE,Not Available,51.84%,48.0905%,63.57%,Not Available,25.64%,Not Available,42.8868%,FLORENCE,Not Available,65.63%,69.8706%,79.64%,Not Available,45.94%,Not Available,58.7678%,Not Available,63.11%,61.9801%,74.05%,Not Available,39.82%,Not Available,52.5234%,Not Available,79.83%,81.9974%,87.97%,Not Available,62.41%,Not Available,76.6977%,Not Available,79.42%,77.8473%,87.71%,Not Available,53.50%,Not Available,71.6615%,LAUDERDALE,Not Available,87.23%,84.3397%,92.21%,Not Available,66.92%,Not Available,84.9950%,10006,NORTH ALABAMA MEDICAL CENTER,2023.0,Not Available,Not Available,Not Available,71.66%,68.9040%,85.39%,Not Available,36.31%,Not Available,59.4745%,Not Available,65.52%,55.6354%,81.22%,Not Available,40.40%,Not Available,47.4206%,AL,35630,1,2023,,,,,,,,
3,702 N MAIN ST,Not Available,51.84%,56.0550%,63.57%,Not Available,25.64%,Not Available,57.9234%,OPP,Not Available,65.63%,72.9804%,79.64%,Not Available,45.94%,Not Available,70.3961%,Not Available,63.11%,68.6057%,74.05%,Not Available,39.82%,Not Available,69.3644%,Not Available,79.83%,85.5056%,87.97%,Not Available,62.41%,Not Available,87.0798%,Not Available,79.42%,79.7966%,87.71%,Not Available,53.50%,Not Available,80.8831%,COVINGTON,Not Available,87.23%,89.9143%,92.21%,Not Available,66.92%,Not Available,89.9733%,10007,MIZELL MEMORIAL HOSPITAL,2023.0,Not Available,Not Available,Not Available,71.66%,69.5987%,85.39%,Not Available,36.31%,Not Available,75.0379%,Not Available,65.52%,70.8823%,81.22%,Not Available,40.40%,Not Available,74.7619%,AL,36467,1,2023,,,,,,,,
4,50 MEDICAL PARK EAST DRIVE,Not Available,51.84%,52.4186%,63.57%,Not Available,25.64%,Not Available,51.1531%,BIRMINGHAM,Not Available,65.63%,65.2349%,79.64%,Not Available,45.94%,Not Available,63.7383%,Not Available,63.11%,63.7829%,74.05%,Not Available,39.82%,Not Available,63.3108%,Not Available,79.83%,80.9933%,87.97%,Not Available,62.41%,Not Available,79.7774%,Not Available,79.42%,77.8259%,87.71%,Not Available,53.50%,Not Available,78.4286%,JEFFERSON,Not Available,87.23%,87.7513%,92.21%,Not Available,66.92%,Not Available,86.3131%,10011,ST. VINCENT'S EAST,2023.0,Not Available,Not Available,Not Available,71.66%,72.0017%,85.39%,Not Available,36.31%,Not Available,71.1094%,Not Available,65.52%,69.4084%,81.22%,Not Available,40.40%,Not Available,66.3732%,AL,35235,1,2023,,,,,,,,


In [4]:
# Drop the location and fiscal-year columns from the combined dataframe.
df.drop(labels=['Address', 'City', 'County Name',
                'State', 'ZIP Code', 'Fiscal Year',
                ], axis=1, inplace=True)


def _score_scale(values):
    """Return 10 or 9 for the first 'out of 10' / 'out of 9' string found.

    Non-string cells (numbers, NaN) are skipped. Returns None when no
    cell contains either phrase.
    """
    for value in values:
        if not isinstance(value, str):
            continue
        if 'out of 10' in value:
            return 10
        if 'out of 9' in value:
            return 9
    return None


def _strip_percent(value):
    """Drop the last character of strings containing '%'; keep others as-is."""
    if isinstance(value, str) and '%' in value:
        return value[:-1]
    return value


# Columns holding points written as 'N out of 10' or 'N out of 9'.
ls_10 = []
ls_9 = []
for name in list(df):
    scale = _score_scale(df[name].tolist())
    if scale == 10:
        ls_10.append(name)
    elif scale == 9:
        ls_9.append(name)

# Turn 'N out of M' into the integer N. Anything left that is not
# numeric (e.g. 'Not Available') becomes NaN.
for top, names in ((9, ls_9), (10, ls_10)):
    phrases = [f'{k} out of {top}' for k in range(top + 1)]
    points = list(range(top + 1))
    for name in names:
        df[name] = df[name].replace(phrases, points)
        df[name] = pd.to_numeric(df[name], errors='coerce')

# Columns with at least one percentage string: strip the sign and convert
# to numbers, again coercing non-numeric text to NaN.
labs = [name for name in list(df)
        if any(isinstance(v, str) and '%' in v for v in df[name].tolist())]

for name in labs:
    df[name] = [_strip_percent(v) for v in df[name].tolist()]
    df[name] = pd.to_numeric(df[name], errors='coerce')

# NOTE(review): columns whose strings contain neither pattern are left
# untouched, so text such as 'Not Available' stays in them - confirm
# this is intended for the HCAHPS base/consistency score columns.

# Move the identifying columns to the front. Each insert goes to position
# 0, so the final order is the reverse of this list.
ls = ['file_year', 'file_month', 'Facility Name', 'Facility ID']
for l in ls:
    col = df.pop(l)
    df.insert(0, col.name, col)

# Report the shape before and after each de-duplication pass: first exact
# duplicate rows, then repeated facility/release combinations.
print(df.shape)
df.drop_duplicates(inplace=True)
print(df.shape)
df.drop_duplicates(subset=['Facility ID', 'Facility Name', 'file_month', 'file_year'], inplace=True)
print(df.shape)
df.head()

(128423, 78)
(128423, 78)
(128423, 78)


Unnamed: 0,Facility ID,Facility Name,file_month,file_year,Care Transition Achievement Points,Care Transition Achievement Threshold,Care Transition Baseline Rate,Care Transition Benchmark,Care Transition Dimension Score,Care Transition Floor,Care Transition Improvement Points,Care Transition Performance Rate,Cleanliness and Quietness of Hospital Environment Achievement Points,Cleanliness and Quietness of Hospital Environment Achievement Threshold,Cleanliness and Quietness of Hospital Environment Baseline Rate,Cleanliness and Quietness of Hospital Environment Benchmark,Cleanliness and Quietness of Hospital Environment Dimension Score,Cleanliness and Quietness of Hospital Environment Floor,Cleanliness and Quietness of Hospital Environment Improvement Points,Cleanliness and Quietness of Hospital Environment Performance Rate,Communication about Medicines Achievement Points,Communication about Medicines Achievement Threshold,Communication about Medicines Baseline Rate,Communication about Medicines Benchmark,Communication about Medicines Dimension Score,Communication about Medicines Floor,Communication about Medicines Improvement Points,Communication about Medicines Performance Rate,Communication with Doctors Achievement Points,Communication with Doctors Achievement Threshold,Communication with Doctors Baseline Rate,Communication with Doctors Benchmark,Communication with Doctors Dimension Score,Communication with Doctors Floor,Communication with Doctors Improvement Points,Communication with Doctors Performance Rate,Communication with Nurses Achievement Points,Communication with Nurses Achievement Threshold,Communication with Nurses Baseline Rate,Communication with Nurses Benchmark,Communication with Nurses Dimension Score,Communication with Nurses Floor,Communication with Nurses Improvement Points,Communication with Nurses Performance Rate,Discharge Information Achievement Points,Discharge Information Achievement Threshold,Discharge Information Baseline Rate,Discharge 
Information Benchmark,Discharge Information Dimension Score,Discharge Information Floor,Discharge Information Improvement Points,Discharge Information Performance Rate,HCAHPS Base Score,HCAHPS Consistency Score,Overall Rating of Hospital Achievement Points,Overall Rating of Hospital Achievement Threshold,Overall Rating of Hospital Baseline Rate,Overall Rating of Hospital Benchmark,Overall Rating of Hospital Dimension Score,Overall Rating of Hospital Floor,Overall Rating of Hospital Improvement Points,Overall Rating of Hospital Performance Rate,Responsiveness of Hospital Staff Achievement Points,Responsiveness of Hospital Staff Achievement Threshold,Responsiveness of Hospital Staff Baseline Rate,Responsiveness of Hospital Staff Benchmark,Responsiveness of Hospital Staff Dimension Score,Responsiveness of Hospital Staff Floor,Responsiveness of Hospital Staff Improvement Points,Responsiveness of Hospital Staff Performance Rate,Pain Management Achievement Points,Pain Management Achievement Threshold,Pain Management Baseline Rate,Pain Management Benchmark,Pain Management Dimension Score,Pain Management Floor,Pain Management Improvement Points,Pain Management Performance Rate
0,10001,SOUTHEAST HEALTH MEDICAL CENTER,1,2023,,51.84,52.7031,63.57,,25.64,,51.987,,65.63,68.9052,79.64,,45.94,,67.8409,,63.11,65.2044,74.05,,39.82,,61.0156,,79.83,80.2871,87.97,,62.41,,78.634,,79.42,77.497,87.71,,53.5,,76.95,,87.23,89.5346,92.21,,66.92,,88.4028,Not Available,Not Available,,71.66,71.7905,85.39,,36.31,,72.6047,,65.52,66.3995,81.22,,40.4,,59.8681,,,,,,,,
1,10005,MARSHALL MEDICAL CENTERS,1,2023,,51.84,47.8347,63.57,,25.64,,51.0971,,65.63,68.1608,79.64,,45.94,,66.8517,,63.11,68.3584,74.05,,39.82,,59.4815,,79.83,84.9463,87.97,,62.41,,82.208,,79.42,78.3784,87.71,,53.5,,78.5265,,87.23,89.9947,92.21,,66.92,,85.3044,Not Available,Not Available,,71.66,71.424,85.39,,36.31,,68.1886,,65.52,64.4797,81.22,,40.4,,51.3138,,,,,,,,
2,10006,NORTH ALABAMA MEDICAL CENTER,1,2023,,51.84,48.0905,63.57,,25.64,,42.8868,,65.63,69.8706,79.64,,45.94,,58.7678,,63.11,61.9801,74.05,,39.82,,52.5234,,79.83,81.9974,87.97,,62.41,,76.6977,,79.42,77.8473,87.71,,53.5,,71.6615,,87.23,84.3397,92.21,,66.92,,84.995,Not Available,Not Available,,71.66,68.904,85.39,,36.31,,59.4745,,65.52,55.6354,81.22,,40.4,,47.4206,,,,,,,,
3,10007,MIZELL MEMORIAL HOSPITAL,1,2023,,51.84,56.055,63.57,,25.64,,57.9234,,65.63,72.9804,79.64,,45.94,,70.3961,,63.11,68.6057,74.05,,39.82,,69.3644,,79.83,85.5056,87.97,,62.41,,87.0798,,79.42,79.7966,87.71,,53.5,,80.8831,,87.23,89.9143,92.21,,66.92,,89.9733,Not Available,Not Available,,71.66,69.5987,85.39,,36.31,,75.0379,,65.52,70.8823,81.22,,40.4,,74.7619,,,,,,,,
4,10011,ST. VINCENT'S EAST,1,2023,,51.84,52.4186,63.57,,25.64,,51.1531,,65.63,65.2349,79.64,,45.94,,63.7383,,63.11,63.7829,74.05,,39.82,,63.3108,,79.83,80.9933,87.97,,62.41,,79.7774,,79.42,77.8259,87.71,,53.5,,78.4286,,87.23,87.7513,92.21,,66.92,,86.3131,Not Available,Not Available,,71.66,72.0017,85.39,,36.31,,71.1094,,65.52,69.4084,81.22,,40.4,,66.3732,,,,,,,,


## Save dataframe

In [5]:
# Write the combined dataframe as a gzip-compressed pickle.
hvbp_pickle_path = '~/GitHub/hospitals-data-archive/dataframes/partial_dataframes/hvbp_hcahps_df.pkl.gz'
df.to_pickle(hvbp_pickle_path, compression='gzip', protocol=5)
