# Generate measure dates dataframe

In [1]:
import pandas as pd
import numpy as np
import warnings
from IPython.utils import io
import sys
import os

# Notebook-wide settings.
# NOTE: this silences every warning for the rest of the session, including
# pandas warnings raised by the cells below.
warnings.filterwarnings('ignore')

# Do not truncate columns or rows when a dataframe is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Root folder that the relative Measure Dates CSV paths are appended to.
# Expanded the same way as dir2 so the value is a real filesystem path and
# also works with consumers that do not expand '~' themselves.
main_dir = os.path.expanduser('~/Desktop/Rush/CMS_HospitalArchives/')

# Local checkout of the hospitals-data-archive repository.
dir2 = os.path.expanduser("~/GitHub/hospitals-data-archive/")


## Define Custom Functions

In [2]:
def curate(df):
    """Clean the identifier and text columns of a dataframe.

    Each step is skipped when the column it needs is absent:

    * rows whose 'Facility ID' is missing are dropped and the remaining IDs
      are converted to strings left-padded with zeros to 6 characters;
    * rows whose 'Facility Name' is missing are dropped;
    * tab characters are removed from every column that supports the
      ``.str`` accessor.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy; the frame passed in is left unchanged.
    """
    df = df.copy()

    if 'Facility ID' in df.columns:
        # NaN never compares equal to anything, so `col != np.nan` is True for
        # every row; missing values have to be detected with notna().
        df = df[df['Facility ID'].notna()].copy()
        # zfill pads to the full width, however many digits are missing.
        df['Facility ID'] = df['Facility ID'].astype(str).str.zfill(6)

    if 'Facility Name' in df.columns:
        df = df[df['Facility Name'].notna()].copy()

    for c in list(df):
        try:
            df[c] = df[c].str.replace("\t", "", regex=False)
        except AttributeError:
            # Column does not hold strings, so there is nothing to strip.
            continue

    return df


def rename_and_fill(df):
    """Rename legacy column headers to the current naming scheme.

    Only headers that are present in ``df`` are renamed. Despite the name, no
    values are filled in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns may use older header names.

    Returns
    -------
    pandas.DataFrame
        A renamed copy; the frame passed in is left unchanged.

    Raises
    ------
    ValueError
        If the resulting frame has duplicate column names, e.g. when two
        legacy headers that map to the same new name are both present.
    """
    legacy_to_current = {
        'Provider ID': 'Facility ID',
        'Measure Start Date': 'Start Date',
        'Measure End Date': 'End Date',
        'Hospital Name': 'Facility Name',
        'Provider Number': 'Facility ID',
        ' ZIP Code': 'ZIP Code',
        'Zip Code': 'ZIP Code',
        'Address 1': 'Address',
        'City/Town': 'City',
        'County/Parish': 'County Name',
        'Telephone Number': 'Phone Number',
    }

    present = {old: new for old, new in legacy_to_current.items()
               if old in df.columns}
    df = df.rename(columns=present)

    # Fail loudly instead of exiting the interpreter, so the caller gets a
    # normal traceback that names the offending columns.
    dupes = df.columns[df.columns.duplicated()].unique().tolist()
    if dupes:
        raise ValueError(f'duplicates: {dupes}')

    return df


def process2(df, lists, yr, mo):
    """Standardise one archive file and tag it with its release date.

    The frame is passed through rename_and_fill and curate, its column names
    (before the tags are added) are appended to ``lists``, and 'file_month' /
    'file_year' columns holding ``mo`` / ``yr`` are added.

    Returns a tuple ``(frame, lists)`` where the frame's columns are sorted
    by name and ``lists`` is the same list object that was passed in.
    """
    cleaned = curate(rename_and_fill(df))
    lists.append(cleaned.columns.tolist())

    n_rows = len(cleaned)
    cleaned['file_month'] = [mo] * n_rows
    cleaned['file_year'] = [yr] * n_rows

    ordered = cleaned.reindex(columns=sorted(cleaned.columns))
    return ordered, lists


## Load Files

In [3]:
# One entry per archived release: (file_year, file_month, path under main_dir).
# The three values are kept side by side so a label cannot drift away from its
# file; file_month follows the date embedded in the folder name.
measure_date_files = [
    ('2023', '01', '2023/hospitals_01_2023/Measure_Dates.csv'),
    ('2023', '04', '2023/hospitals_04_2023/Measure_Dates.csv'),
    ('2023', '07', '2023/hospitals_07_2023/Measure_Dates.csv'),
    ('2023', '10', '2023/hospitals_10_2023/Measure_Dates.csv'),

    ('2022', '01', '2022/hospitals_01_2022/Measure_Dates.csv'),
    ('2022', '04', '2022/hospitals_04_2022/Measure_Dates.csv'),
    ('2022', '07', '2022/hospitals_07_2022/Measure_Dates.csv'),
    ('2022', '10', '2022/hospitals_10_2022/Measure_Dates.csv'),

    ('2021', '01', '2021/hospitals_01_2021/Measure_Dates.csv'),
    ('2021', '03', '2021/hospitals_03_2021/Measure_Dates.csv'),
    ('2021', '04', '2021/hospitals_04_2021/Measure_Dates.csv'),
    ('2021', '07', '2021/hospitals_07_2021/Measure_Dates.csv'),
    ('2021', '10', '2021/hospitals_10_2021/Measure_Dates.csv'),

    ('2020', '10', '2020/hospitals_archive_10_2020/Measure_Dates.csv'),
    ('2020', '07', '2020/hospitals_archive_07_2020/Measure_Dates.csv'),
    ('2020', '04', '2020/HOSArchive_Revised_Flatfiles_20200422/Measure_Dates.csv'),
    ('2020', '01', '2020/HOSArchive_Revised_Flatfiles_20200129/Measure_Dates.csv'),

    # The 20190702 folder is the July release and 20191030 the October one.
    ('2019', '07', '2019/HOSArchive_Revised_Flatfiles_20190702/Measure_Dates.csv'),
    ('2019', '10', '2019/HOSArchive_Revised_Flatfiles_20191030/Measure_Dates.csv'),
    ('2019', '04', '2019/HOSArchive_Revised_FlatFiles_20190424/Measure_Dates.csv'),
    ('2019', '03', '2019/HOSArchive_Revised_Flatfiles_20190321/Measure_Dates.csv'),

    ('2018', '10', '2018/HOSArchive_Revised_FlatFiles_20181031/Measure Dates October 2018.csv'),
    ('2018', '07', '2018/HOSArchive_Revised_FlatFiles_20180725/Measure Dates.csv'),
    ('2018', '05', '2018/HOSArchive_Revised_FlatFiles_20180523/Measure Dates.csv'),
    ('2018', '01', '2018/HOSArchive_Revised_FlatFiles_20180126/Measure Dates.csv'),

    ('2017', '10', '2017/HOSArchive_Revised_FlatFiles_20171024/Measure Dates.csv'),
    ('2017', '07', '2017/HOSArchive_Revised_FlatFiles_20170726/Measure Dates.csv'),
    ('2017', '04', '2017/HOSArchive_Revised_Flatfiles_20170428/Measure Dates.csv'),

    ('2016', '12', '2016/HOSArchive_Revised_Flatfiles_20161219/Measure Dates.csv'),
    ('2016', '11', '2016/Hospital_Revised_FlatFiles_20161110/Measure Dates.csv'),
    ('2016', '08', '2016/HOSArchive_Revised_FlatFiles_20160810/Measure Dates.csv'),
    ('2016', '05', '2016/HOSArchive_Revised_FlatFiles_20160504/Measure Dates.csv'),

    ('2015', '12', '2015/HOSArchive_Revised_FlatFiles_20151210/Measure Dates.csv'),
    ('2015', '10', '2015/HOSArchive_Revised_FlatFiles_20151008/Measure Dates.csv'),
    ('2015', '07', '2015/HOSArchive_Revised_FlatFiles_20150716/Measure Dates.csv'),
    ('2015', '05', '2015/HOSArchive_Revised_Flatfiles_20150506/Measure Dates.csv'),
    ('2015', '04', '2015/HOSArchive_Revised_Flatfiles_20150416/Measure Dates.csv'),
    ('2015', '01', '2015/HOSArchive_Revised_Flatfiles_20150122/Measure Dates.csv'),

    ('2014', '12', '2014/HOSArchive_Revised_Flatfiles_20141218/Measure Dates.csv'),
    ('2014', '10', '2014/HOSArchive_Revised_Flatfiles_20141023/Measure Dates.csv'),
    ('2014', '07', '2014/HOSArchive_Revised_Flatfiles_20140717/Measure Dates.csv'),
    ('2014', '04', '2014/HOSArchive_Revised_Flatfiles_20140417/Measure Dates.csv'),
    ('2014', '01', '2014/HOSArchive_Revised_Flatfiles_20140101/Measure Dates.csv'),

    ('2013', '10', '2013/HOSArchive_Revised_Flatfiles_20131001/Measure Dates.csv'),
    ('2013', '07', '2013/HOSArchive_Revised_Flatfiles_20130701/Measure Dates.csv'),
    ('2013', '04', '2013/HOSArchive_Revised_Flatfiles_20130401/Measure Dates.csv'),
]

# Parallel views of the table above, kept under their previous names.
yrs = [entry[0] for entry in measure_date_files]
mos = [entry[1] for entry in measure_date_files]
subdirs = [entry[2] for entry in measure_date_files]

mdf_list = []
lists = []

for yr, mo, subdir in measure_date_files:
    # Anything printed while reading is captured instead of shown in the cell.
    with io.capture_output() as captured:
        mdf = pd.read_csv(main_dir + subdir, encoding="ISO-8859-1")
    mdf, lists = process2(mdf, lists, yr, mo)
    mdf_list.append(mdf)

# Stack every release and keep only the columns used downstream; filter()
# silently skips any listed column that is absent.
mdf = pd.concat(mdf_list)
mdf = mdf.filter(items=['file_month', 'file_year', 'Measure ID',
                        'Measure Name', 'Start Date', 'End Date'])

print('mdf.shape:', mdf.shape)
mdf.head()

mdf.shape: (6426, 6)


Unnamed: 0,file_month,file_year,Measure ID,Measure Name,Start Date,End Date
0,1,2023,ASC_11,Percentage of patients who had cataract surger...,01/01/2021,12/31/2021
1,1,2023,ASC_12,Facility 7-Day Risk Standardized Hospital Visi...,01/01/2019,12/31/2021
2,1,2023,ASC_13,Percentage of patients who received anesthesia...,01/01/2021,12/31/2021
3,1,2023,ASC_14,Percentage of cataract surgeries that had an u...,01/01/2021,12/31/2021
4,1,2023,ASC_17,Hospital Visits after Orthopedic Ambulatory Su...,07/01/2020,12/31/2021


In [4]:
# Measure IDs that mention HVBP. unique() may also yield non-string values
# (e.g. NaN for blank cells); those are skipped explicitly rather than by
# swallowing whatever exception the membership test raises.
ls = mdf['Measure ID'].unique()
ls2 = [l for l in ls if isinstance(l, str) and 'HVBP' in l]

print(ls2, '\n')

print(mdf.shape)
# .copy() so the column assignments below write to an independent frame.
mdf = mdf[mdf['Measure ID'].isin(ls2)].copy()
print(mdf.shape, '\n')

# IDs have the form '<measure>_HVBP_<period>', e.g. 'HAI_1_HVBP_Performance'.
IDs1 = mdf['Measure ID'].tolist()
id_parts = [s.split('_HVBP_') for s in IDs1]

# Period suffix ('Baseline' / 'Performance'), used only for the filter below.
mdf['Measure ID2'] = [parts[1] for parts in id_parts]

# Measure prefix with underscores turned into hyphens, e.g. 'HAI_1' -> 'HAI-1'.
IDs2 = [parts[0].replace('_', '-') for parts in id_parts]
mdf['Measure ID'] = IDs2

print(mdf.shape)
mdf = mdf[mdf['Measure ID2'] == 'Performance'].drop(columns=['Measure ID2'])
print(mdf.shape, '\n')

# IDs2 was built before the 'Performance' filter, so this lists every prefix.
print(sorted(list(set(IDs2))), '\n')

print(sorted(mdf['Measure Name'].unique()))
print(mdf.shape, '\n')
mdf.head()

['COMP_HIP_KNEE_HVBP_Baseline', 'COMP_HIP_KNEE_HVBP_Performance', 'HAI_1_HVBP_Baseline', 'HAI_1_HVBP_Performance', 'HAI_2_HVBP_Baseline', 'HAI_2_HVBP_Performance', 'HAI_3_HVBP_Baseline', 'HAI_3_HVBP_Performance', 'HAI_4_HVBP_Baseline', 'HAI_4_HVBP_Performance', 'HAI_5_HVBP_Baseline', 'HAI_5_HVBP_Performance', 'HAI_6_HVBP_Baseline', 'HAI_6_HVBP_Performance', 'HCAHPS_HVBP_Baseline', 'HCAHPS_HVBP_Performance', 'MORT_30_AMI_HVBP_Baseline', 'MORT_30_AMI_HVBP_Performance', 'MORT_30_CABG_HVBP_Baseline', 'MORT_30_CABG_HVBP_Performance', 'MORT_30_COPD_HVBP_Baseline', 'MORT_30_COPD_HVBP_Performance', 'MORT_30_HF_HVBP_Baseline', 'MORT_30_HF_HVBP_Performance', 'MORT_30_PN_HVBP_Baseline', 'MORT_30_PN_HVBP_Performance', 'MSPB_1_HVBP_Baseline', 'MSPB_1_HVBP_Performance', 'PC_01_HVBP_Baseline', 'PC_01_HVBP_Performance', 'PSI_90_HVBP_Baseline', 'PSI_90_HVBP_Performance', 'AMI_7a_HVBP_Baseline', 'AMI_7a_HVBP_Performance', 'Combined_SSI_Measure_Score_HVBP_Performance', 'IMM_2_HVBP_Baseline', 'IMM_2_HVBP_

Unnamed: 0,file_month,file_year,Measure ID,Measure Name,Start Date,End Date
10,1,2023,COMP-HIP-KNEE,Complication Rate Following Elective Primary T...,04/01/2018,03/31/2021
34,1,2023,HAI-1,Central Line Associated Bloodstream Infection,01/01/2021,12/31/2021
37,1,2023,HAI-2,Catheter Associated Urinary Tract Infections,01/01/2021,12/31/2021
40,1,2023,HAI-3,SSI - Colon Surgery,01/01/2021,12/31/2021
43,1,2023,HAI-4,SSI - Abdominal Hysterectomy,01/01/2021,12/31/2021


In [5]:
# Distinct (Measure ID, Measure Name) pairs found in the measure dates files.
tdf1 = (
    mdf.filter(items=['Measure ID', 'Measure Name'], axis=1)
       .drop_duplicates()
)

print(tdf1.shape)
tdf1.head(25)

(25, 2)


Unnamed: 0,Measure ID,Measure Name
10,COMP-HIP-KNEE,Complication Rate Following Elective Primary T...
34,HAI-1,Central Line Associated Bloodstream Infection
37,HAI-2,Catheter Associated Urinary Tract Infections
40,HAI-3,SSI - Colon Surgery
43,HAI-4,SSI - Abdominal Hysterectomy
46,HAI-5,MRSA
49,HAI-6,CDI
55,HCAHPS,HCAHPS Measures
64,MORT-30-AMI,Acute Myocardial Infarction (AMI) 30-Day Morta...
67,MORT-30-CABG,30-Day All-Cause Mortality Following Coronary ...


In [6]:
# Only the column headers of this file are used below.
tdf = pd.read_csv(dir2 + '/hospital_files/140119.csv')

# Metric-type fragments removed from a header to leave the bare measure ID.
ls = [' Achievement Threshold',
      ' Achievement Points',
      ' Improvement Points',
      ' Baseline Rate',
      ' Performance Rate',
      ' Benchmark',
      ' Measure Score',
      ' Performance_Rate',
     ]

# Substrings that send a header to one of the two grouped measure IDs.
total_markers = ('Weighted', 'Unweighted', 'Total')
hcahps_markers = ('HCAHPS', 'Care Transition', 'Cleanliness and Quietness',
                  'Communication', 'Responsiveness', 'Discharge', 'Overall',
                  'Pain Management')

org_lab = []  # header text after the ': ' separator
mod_lab = []  # measure ID derived from that text
for header in tdf.columns:
    if 'HVBP' not in header:
        continue

    label = header.split(': ')[1]

    stem = str(label)
    for fragment in ls:
        stem = stem.replace(fragment, '')

    if any(marker in stem for marker in total_markers):
        measure_id = 'Total Performance'
    elif any(marker in stem for marker in hcahps_markers):
        measure_id = 'HCAHPS'
    else:
        measure_id = stem

    mod_lab.append(measure_id)
    org_lab.append(label)

tdf2 = pd.DataFrame({'Measure ID': mod_lab, 'Data label': org_lab})

print(len(mod_lab))
mod_lab = sorted(set(mod_lab))
print(len(mod_lab))
print(mod_lab)

print(tdf2.shape)
tdf2.head()

259
18
['COMP-HIP-KNEE', 'Combined SSI', 'HAI-1', 'HAI-2', 'HAI-3', 'HAI-4', 'HAI-5', 'HAI-6', 'HCAHPS', 'MORT-30-AMI', 'MORT-30-CABG', 'MORT-30-COPD', 'MORT-30-HF', 'MORT-30-PN', 'MSPB-1', 'PC-01', 'PSI-90', 'Total Performance']
(259, 2)


Unnamed: 0,Measure ID,Data label
0,Total Performance,Total Performance Score
1,Total Performance,Unweighted Normalized Clinical Outcomes Domain...
2,Total Performance,Unweighted Normalized Efficiency And Cost Redu...
3,Total Performance,Unweighted Normalized Safety Domain Score
4,Total Performance,Unweighted Person And Community Engagement Dom...


In [7]:
print(mdf.shape)
print(tdf2.shape)

# Restrict both tables to the measure IDs they have in common.
in_labels = mdf['Measure ID'].isin(tdf2['Measure ID'].tolist())
mdf = mdf[in_labels]
in_dates = tdf2['Measure ID'].isin(mdf['Measure ID'].tolist())
tdf2 = tdf2[in_dates]

print(mdf.shape)
print(tdf2.shape)

# Attach the dates from the measure dates files to every data label of the
# same measure, then let the data label take over as the measure name.
tdf1 = (
    mdf.merge(tdf2, how='outer', on='Measure ID')
       .drop(columns=['Measure ID', 'Measure Name'])
       .rename(columns={'Data label': 'Measure Name'})
)

for date_col in ('Start Date', 'End Date'):
    tdf1[date_col] = pd.to_datetime(tdf1[date_col])

print(tdf1.shape)
tdf1.head()

(472, 6)
(259, 2)
(424, 6)
(228, 2)
(6628, 5)


Unnamed: 0,file_month,file_year,Start Date,End Date,Measure Name
0,1,2023,2018-04-01,2021-03-31,COMP-HIP-KNEE Achievement Points
1,1,2023,2018-04-01,2021-03-31,COMP-HIP-KNEE Achievement Threshold
2,1,2023,2018-04-01,2021-03-31,COMP-HIP-KNEE Baseline Rate
3,1,2023,2018-04-01,2021-03-31,COMP-HIP-KNEE Benchmark
4,1,2023,2018-04-01,2021-03-31,COMP-HIP-KNEE Improvement Points


## Save

In [8]:
# Build the target from dir2 rather than repeating the repository path, so the
# output follows the same checkout the hospital file was read from. The index
# is still written as the first, unnamed column.
tdf1.to_csv(os.path.join(dir2, 'measure_dates', 'hvbp_measure_dates.csv'))
