# Generate Outpatient Imaging Efficiency dataframe

In [1]:
import pandas as pd
import numpy as np
import warnings
from IPython.utils import io
import sys
import time

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

main_dir = '~/Desktop/Rush/CMS_HospitalArchives/'

## Define Custom Functions 

In [2]:

def curate(df):

    try:
        df = df[df['Facility ID'] != np.nan]
        df['Facility ID'] = df['Facility ID'].values.astype(str)
        
        ids = df['Facility ID'].tolist()
        ids2 = []
        for i in ids:
            if len(i) < 6:
                i = '0' + i
            ids2.append(i)
        df['Facility ID'] = ids2
        
    except:
        pass
    try:
        df = df[df['Facility Name'] != np.nan]
    except:
        pass
    
    for c in list(df):    
        try:
            df[c] = df[c].str.replace("\t","")
        except:
            pass

    return df


def rename_and_fill(df):
    
    cols1 = ['Provider ID', "Measure Start Date", "Measure End Date",
             'Hospital Name', 'Address 1', 'City/Town', 'County/Parish',
             'Telephone Number']
    
    cols2 = ['Facility ID', "Start Date", "End Date",
             'Facility Name', 'Address', 'City', 'County Name',
             'Phone Number']
    
    for i, col in enumerate(cols1):
        if col in list(df):
            df.rename(columns={col: cols2[i]}, inplace=True)
            
            
    l = list(df)
    l = list(set([x for x in l if l.count(x) > 1]))
    if len(l) > 0:
        print('duplicates:', l)
        sys.exit()
        
    return df


def process2(df, lists, yr, mo):
    df = rename_and_fill(df)
    df = curate(df)
    lists.append(list(df))
    df['file_month'] = [mo]* df.shape[0]
    df['file_year'] = [yr]* df.shape[0]
    df = df.reindex(sorted(df.columns), axis=1)
    return df, lists

## Load Files

In [3]:
df_list = []
lists = []

yrs = ['2023', '2023', '2023', '2023',
       '2022', '2022', '2022', '2022',
       '2021','2021','2021', '2021', '2021',
       '2020', '2020', '2020', '2020', 
       '2019', '2019', '2019', '2019', 
       '2018', '2018', '2018', '2018',
       '2017', '2017', '2017',
       '2016', '2016', '2016', '2016',
       '2015', '2015', '2015', '2015', '2015', '2015',
       '2014', '2014', '2014',
       ]

mos = ['01', '04', '07', '10',
       '01', '04', '07', '10',
       '01', '03', '04', '07', '10', 
       '10', '07', '04', '01', 
       '10', '07', '04', '03', 
       '10', '07', '05', '01',
       '10', '07', '04',
       '12', '11', '08', '05',
       '12', '10', '07', '05', '04', '01',
       '12', '10', '07',
       ]

subdirs = ['2023/hospitals_01_2023/Outpatient_Imaging_Efficiency-Hospital.csv', 
           '2023/hospitals_04_2023/Outpatient_Imaging_Efficiency-Hospital.csv',
           '2023/hospitals_07_2023/Outpatient_Imaging_Efficiency-Hospital.csv',
           '2023/hospitals_10_2023/Outpatient_Imaging_Efficiency-Hospital.csv',
           
           '2022/hospitals_01_2022/Outpatient_Imaging_Efficiency-Hospital.csv', 
           '2022/hospitals_04_2022/Outpatient_Imaging_Efficiency-Hospital.csv',
           '2022/hospitals_07_2022/Outpatient_Imaging_Efficiency-Hospital.csv',
           '2022/hospitals_10_2022/Outpatient_Imaging_Efficiency-Hospital.csv',
           
           '2021/hospitals_01_2021/Outpatient_Imaging_Efficiency-Hospital.csv',
           '2021/hospitals_03_2021/Outpatient_Imaging_Efficiency-Hospital.csv',
           '2021/hospitals_04_2021/Outpatient_Imaging_Efficiency-Hospital.csv',
           '2021/hospitals_07_2021/Outpatient_Imaging_Efficiency-Hospital.csv',
           '2021/hospitals_10_2021/Outpatient_Imaging_Efficiency-Hospital.csv',
           
           '2020/hospitals_archive_10_2020/Outpatient_Imaging_Efficiency_Hospital.csv',
           '2020/hospitals_archive_07_2020/Outpatient_Imaging_Efficiency_Hospital.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200422/Outpatient Imaging Efficiency - Hospital.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200129/Outpatient Imaging Efficiency - Hospital.csv',
           
           '2019/HOSArchive_Revised_Flatfiles_20191030/Outpatient Imaging Efficiency - Hospital.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190702/Outpatient Imaging Efficiency - Hospital.csv',
           '2019/HOSArchive_Revised_FlatFiles_20190424/Outpatient Imaging Efficiency - Hospital.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190321/Outpatient Imaging Efficiency - Hospital.csv',
           
           '2018/HOSArchive_Revised_FlatFiles_20181031/Outpatient Imaging Efficiency - Hospital.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180725/Outpatient Imaging Efficiency - Hospital.csv', 
           '2018/HOSArchive_Revised_FlatFiles_20180523/Outpatient Imaging Efficiency - Hospital.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180126/Outpatient Imaging Efficiency - Hospital.csv',
           
           '2017/HOSArchive_Revised_FlatFiles_20171024/Outpatient Imaging Efficiency - Hospital.csv',
           '2017/HOSArchive_Revised_FlatFiles_20170726/Outpatient Imaging Efficiency - Hospital.csv', 
           '2017/HOSArchive_Revised_Flatfiles_20170428/Outpatient Imaging Efficiency - Hospital.csv', 
           
           '2016/HOSArchive_Revised_Flatfiles_20161219/Outpatient Imaging Efficiency - Hospital.csv',
           '2016/Hospital_Revised_FlatFiles_20161110/Outpatient Imaging Efficiency - Hospital.csv', 
           '2016/HOSArchive_Revised_FlatFiles_20160810/Outpatient Imaging Efficiency - Hospital.csv',
           '2016/HOSArchive_Revised_FlatFiles_20160504/Outpatient Imaging Efficiency - Hospital.csv',
           
           '2015/HOSArchive_Revised_FlatFiles_20151210/Outpatient Imaging Efficiency - Hospital.csv',
           '2015/HOSArchive_Revised_FlatFiles_20151008/Outpatient Imaging Efficiency - Hospital.csv',
           '2015/HOSArchive_Revised_FlatFiles_20150716/Outpatient Imaging Efficiency - Hospital.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150506/Outpatient Imaging Efficiency - Hospital.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150416/Outpatient Imaging Efficiency - Hospital.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150122/Outpatient Imaging Efficiency - Hospital.csv',
           
           '2014/HOSArchive_Revised_Flatfiles_20141218/Outpatient Imaging Efficiency - Hospital.csv',
           '2014/HOSArchive_Revised_Flatfiles_20141023/Outpatient Imaging Efficiency - Hospital.csv',
           '2014/HOSArchive_Revised_Flatfiles_20140717/Outpatient Imaging Efficiency - Hospital.csv',
           ]

for i, subdir in enumerate(subdirs):
    with io.capture_output() as captured: df = pd.read_csv(main_dir + subdir, encoding = "ISO-8859-1")
    print(subdir + ' :  (rows, columns) =', df.shape)
    df, lists = process2(df, lists, yrs[i], mos[i])
    df_list.append(df)

df = pd.concat(df_list)

print('df.shape:', df.shape)
df = df[~df['Score'].isin([np.nan, float("NaN"), 'Not Available'])]
print('df.shape:', df.shape)

print(sorted(list(df)))
df.head()


2023/hospitals_01_2023/Outpatient_Imaging_Efficiency-Hospital.csv :  (rows, columns) = (18704, 14)
2023/hospitals_04_2023/Outpatient_Imaging_Efficiency-Hospital.csv :  (rows, columns) = (18692, 14)
2023/hospitals_07_2023/Outpatient_Imaging_Efficiency-Hospital.csv :  (rows, columns) = (18664, 14)
2023/hospitals_10_2023/Outpatient_Imaging_Efficiency-Hospital.csv :  (rows, columns) = (18664, 14)
2022/hospitals_01_2022/Outpatient_Imaging_Efficiency-Hospital.csv :  (rows, columns) = (14028, 14)
2022/hospitals_04_2022/Outpatient_Imaging_Efficiency-Hospital.csv :  (rows, columns) = (14028, 14)
2022/hospitals_07_2022/Outpatient_Imaging_Efficiency-Hospital.csv :  (rows, columns) = (18684, 14)
2022/hospitals_10_2022/Outpatient_Imaging_Efficiency-Hospital.csv :  (rows, columns) = (18692, 14)
2021/hospitals_01_2021/Outpatient_Imaging_Efficiency-Hospital.csv :  (rows, columns) = (14130, 14)
2021/hospitals_03_2021/Outpatient_Imaging_Efficiency-Hospital.csv :  (rows, columns) = (14130, 14)
2021/hospi

Unnamed: 0,Address,City,County Name,End Date,Facility ID,Facility Name,Footnote,Measure ID,Measure Name,Phone Number,Score,Start Date,State,ZIP Code,file_month,file_year
0,1108 ROSS CLARK CIRCLE,DOTHAN,HOUSTON,06/30/2021,10001,SOUTHEAST HEALTH MEDICAL CENTER,,OP-10,Abdomen CT Use of Contrast Material,(334) 793-8701,5.7,07/01/2020,AL,36301,1,2023
1,1108 ROSS CLARK CIRCLE,DOTHAN,HOUSTON,06/30/2021,10001,SOUTHEAST HEALTH MEDICAL CENTER,,OP-13,Outpatients who got cardiac imaging stress tes...,(334) 793-8701,6.8,07/01/2020,AL,36301,1,2023
2,1108 ROSS CLARK CIRCLE,DOTHAN,HOUSTON,06/30/2021,10001,SOUTHEAST HEALTH MEDICAL CENTER,,OP-39,Breast Cancer Screening Recall Rates,(334) 793-8701,5.5,07/01/2020,AL,36301,1,2023
3,1108 ROSS CLARK CIRCLE,DOTHAN,HOUSTON,06/30/2021,10001,SOUTHEAST HEALTH MEDICAL CENTER,,OP-8,MRI Lumbar Spine for Low Back Pain,(334) 793-8701,42.5,07/01/2020,AL,36301,1,2023
4,2505 U S HIGHWAY 431 NORTH,BOAZ,MARSHALL,06/30/2021,10005,MARSHALL MEDICAL CENTERS,,OP-10,Abdomen CT Use of Contrast Material,(256) 593-8310,13.8,07/01/2020,AL,35957,1,2023


In [4]:
d = {'OP_10': 'OP-10',
     'OP_11': 'OP-11', 
     'OP_13': 'OP-13',
     'OP_14': 'OP-14',
     'OP_8': 'OP-8', 
     'OP_9': 'OP-9',
     }
df['Measure ID'].replace(to_replace = d, inplace = True)

df['Measure Name'] = df['Measure Name'] + ' (' + df['Measure ID'] + ')'
df = df.filter(items=['Facility ID', 'Facility Name', 'file_month', 'file_year',
                      'Measure Name', 'Start Date', 'End Date',
                      'Score'], axis=1)


In [5]:
cols = ['Facility ID', 'Facility Name', 'file_month', 'file_year', 'Start Date', 'End Date']
main_df = pd.DataFrame(columns=cols)
Measures = sorted(df['Measure Name'].unique())

for i, mi in enumerate(Measures):    
    tdf = df[df['Measure Name'] == mi]
    measures = sorted(tdf['Measure Name'].unique())
    
    df2 = pd.DataFrame(columns=cols)

    for j, m in enumerate(measures):
        tdf2 = tdf[tdf['Measure Name'] == m]
        for n in list(tdf2):
            if n == 'Measure Name' or n in cols:
                continue
            else:
                tdf2[n] = pd.to_numeric(tdf2[n], errors='coerce')
                tdf2.rename(columns={n: m + ' (' + n + ')'}, inplace=True)
        
        tdf2.drop(labels=['Measure Name'], axis=1, inplace=True)
        
        df2 = df2.merge(tdf2, on=cols, how='outer')
    
    main_df = main_df.merge(df2, on=cols, how='outer')

tdf = main_df.copy(deep=True)
del df2, main_df

print(tdf.shape)
tdf = tdf.loc[:, ~tdf.T.duplicated(keep='first')]
tdf.dropna(how='all', axis=1, inplace=True)
print(tdf.shape)
tdf.drop_duplicates(inplace=True)
print(tdf.shape)
tdf.drop_duplicates(subset = cols, inplace=True)
print(tdf.shape)
tdf.head()


(162666, 13)
(162666, 13)
(162666, 13)
(162666, 13)


Unnamed: 0,Facility ID,Facility Name,file_month,file_year,Start Date,End Date,Abdomen CT Use of Contrast Material (OP-10) (Score),Breast Cancer Screening Recall Rates (OP-39) (Score),MRI Lumbar Spine for Low Back Pain (OP-8) (Score),Mammography Follow-up Rates (OP-9) (Score),Outpatients who got cardiac imaging stress tests before low-risk outpatient surgery (OP-13) (Score),Outpatients with brain CT scans who got a sinus CT scan at the same time (OP-14) (Score),Thorax CT Use of Contrast Material (OP-11) (Score)
0,10001,SOUTHEAST HEALTH MEDICAL CENTER,1,2023,07/01/2020,06/30/2021,5.7,5.5,42.5,,6.8,,
1,10005,MARSHALL MEDICAL CENTERS,1,2023,07/01/2020,06/30/2021,13.8,6.7,54.5,,3.8,,
2,10006,NORTH ALABAMA MEDICAL CENTER,1,2023,07/01/2020,06/30/2021,11.0,9.4,41.2,,1.7,,
3,10007,MIZELL MEMORIAL HOSPITAL,1,2023,07/01/2020,06/30/2021,5.9,25.8,,,,,
4,10008,CRENSHAW COMMUNITY HOSPITAL,1,2023,07/01/2020,06/30/2021,2.1,,,,,,


In [6]:
ttdf = tdf.drop(labels=['Start Date', 'End Date'], axis=1)
ttdf.drop_duplicates(inplace=True)
print(ttdf.shape)


(162666, 11)


In [7]:

start_time = time.time()
ttdf['marker'] = ttdf['Facility ID'] + ' | ' + ttdf['Facility Name']  + ' | ' + ttdf['file_month'] + ' | ' + ttdf['file_year']
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

start_time = time.time()
column = ttdf.pop('marker')
ttdf.insert(0, column.name, column)
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

start_time = time.time()
ttdf.drop_duplicates(inplace=True)
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

start_time = time.time()
collapsed_df = ttdf.groupby(ttdf.marker).apply(lambda group: group.ffill().bfill().head(1))
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

start_time = time.time()
ttdf = collapsed_df.reset_index(drop=True)
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

start_time = time.time()
ttdf.drop_duplicates(inplace=True)
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

ttdf.head()

Run time = 0.091 seconds
(162666, 12)
Run time = 0.002 seconds
(162666, 12)
Run time = 0.173 seconds
(162666, 12)
Run time = 3014.647 seconds
(162666, 12)
Run time = 0.045 seconds
(159402, 12)
Run time = 0.154 seconds
(159402, 12)


Unnamed: 0,marker,Facility ID,Facility Name,file_month,file_year,Abdomen CT Use of Contrast Material (OP-10) (Score),Breast Cancer Screening Recall Rates (OP-39) (Score),MRI Lumbar Spine for Low Back Pain (OP-8) (Score),Mammography Follow-up Rates (OP-9) (Score),Outpatients who got cardiac imaging stress tests before low-risk outpatient surgery (OP-13) (Score),Outpatients with brain CT scans who got a sinus CT scan at the same time (OP-14) (Score),Thorax CT Use of Contrast Material (OP-11) (Score)
0,010001 | SOUTHEAST ALABAMA MEDICAL CENTER | 01...,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1,2015,4.6,,29.3,5.0,8.0,2.9,2.5
1,010001 | SOUTHEAST ALABAMA MEDICAL CENTER | 01...,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1,2018,9.0,,35.1,4.6,5.9,1.6,1.5
2,010001 | SOUTHEAST ALABAMA MEDICAL CENTER | 01...,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1,2020,8.0,,35.2,5.8,6.1,1.7,1.8
3,010001 | SOUTHEAST ALABAMA MEDICAL CENTER | 01...,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1,2021,7.2,,38.9,,2.8,,
4,010001 | SOUTHEAST ALABAMA MEDICAL CENTER | 03...,10001,SOUTHEAST ALABAMA MEDICAL CENTER,3,2019,6.5,,36.1,5.5,7.5,0.7,1.1


In [8]:
ttdf.drop(labels=['marker'], axis=1, inplace=True)
ttdf.to_pickle('~/GitHub/hospitals-data-archive/dataframes/partial_dataframes/Outpatient_Imaging_Efficiency_df.pkl.gz', protocol=5, compression='gzip')


In [9]:
m1 = list(ttdf)
ls = ['Facility ID','Facility Name','file_month','file_year']
for l in ls: 
    m1.remove(l)

## Save measurement dates

In [10]:
# Columns to keep as is
id_cols = ['Facility ID', 'Facility Name', 'file_month', 'file_year', 'Start Date', 'End Date']

# Melt the specific columns and create the 'Measure' and 'Score' columns
measures_df = tdf.melt(id_vars=id_cols, var_name='Measure Name', value_name='Score')
measures_df.drop(labels=['Score', 'Facility ID', 'Facility Name'], axis=1, inplace=True)

print(measures_df.shape)
measures_df.drop_duplicates(inplace=True)
measures_df.reset_index(drop=True, inplace=True)
print(measures_df.shape)

measures_df['Start Date'] = pd.to_datetime(measures_df['Start Date'])
measures_df['End Date'] = pd.to_datetime(measures_df['End Date'])
measures_df.to_csv('~/GitHub/hospitals-data-archive/measure_dates/Outpatient_Imaging_Efficiency_df.csv')

measures_df.head()

(1138662, 5)
(294, 5)


Unnamed: 0,file_month,file_year,Start Date,End Date,Measure Name
0,1,2023,2020-07-01,2021-06-30,Abdomen CT Use of Contrast Material (OP-10) (S...
1,4,2023,2020-07-01,2021-06-30,Abdomen CT Use of Contrast Material (OP-10) (S...
2,7,2023,2021-07-01,2022-06-30,Abdomen CT Use of Contrast Material (OP-10) (S...
3,10,2023,2021-07-01,2022-06-30,Abdomen CT Use of Contrast Material (OP-10) (S...
4,1,2022,2019-07-01,2019-12-31,Abdomen CT Use of Contrast Material (OP-10) (S...


In [11]:
m2 = measures_df['Measure Name'].unique().tolist()
sorted(m1) == sorted(m2)

True