# Generate Hospital Information dataframe

In [1]:
import pandas as pd
import numpy as np
import warnings
from IPython.utils import io
import sys
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas diagnostics; re-enable when debugging.
warnings.filterwarnings('ignore')
# Remove the display limits so DataFrames are shown with all columns and rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Root directory of the archive; relative file paths are appended to it when
# the CSV files are read.
main_dir = '/Users/kenlocey/Desktop/Rush/CMS_HospitalArchives/'

## Custom Functions

In [2]:
def check_lists(lists):
    """Check that all lists share the same items; abort on the first mismatch.

    Every list is compared against every list (itself included). For the
    first item that is missing from one of the lists, the item and the list
    it is missing from are printed and the interpreter is stopped through
    ``sys.exit()`` (i.e. ``SystemExit`` is raised).

    Parameters
    ----------
    lists : list of list of str
        Collections of labels to compare (items are concatenated with a
        string when reported, so they are expected to be strings).

    Returns
    -------
    None
        Returned only when no mismatch is found.
    """
    # Lazily enumerate (item, list-missing-it) pairs in the order
    # source list -> compared list -> item of the source list.
    mismatches = (
        (item, other)
        for current in lists
        for other in lists
        for item in current
        if item not in other
    )

    # Only the first mismatch is ever reported because sys.exit() ends the run.
    for item, other in mismatches:
        print('\n')
        print(item + ': NOT FOUND IN')
        print(other)
        sys.exit()
                        
                        
def curate(df):
    """Drop rows without an ID/name, normalise IDs and strip tab characters.

    Steps, each applied only when the relevant column exists:

    1. Rows whose 'Facility ID' or 'Facility Name' is missing are removed.
    2. 'Facility ID' is converted to ``str``; an ID shorter than six
       characters gets a single ``'0'`` prepended.
    3. Tab characters are removed from the string values of every column.
       Non-string values in a column are left exactly as they were.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``df``.
    """
    # `x != np.nan` is True for every value (NaN never equals itself), so
    # missing values have to be detected with notna() to actually be dropped.
    for required in ('Facility ID', 'Facility Name'):
        if required in df.columns:
            df = df[df[required].notna()]

    # Explicit copy: the caller's frame stays untouched and the column
    # assignments below never write into a view of it.
    df = df.copy()

    if 'Facility ID' in df.columns:
        ids = df['Facility ID'].astype(str)
        # IDs parsed as integers lose their leading zero; put one back.
        df['Facility ID'] = [
            fid if len(fid) >= 6 else '0' + fid for fid in ids
        ]

    for col in df.columns:
        try:
            cleaned = df[col].str.replace("\t", "", regex=False)
        except AttributeError:
            # The .str accessor is unavailable: the column holds no text.
            continue
        # .str methods yield NaN for non-string elements; restore the
        # original value there so mixed-type columns do not lose data.
        df[col] = cleaned.where(cleaned.notna(), df[col])

    return df


def rename_and_fill(df):
    """Harmonise column names and add the expected columns that are missing.

    Older column labels are renamed to their current equivalents, then every
    expected column that the frame lacks is created and filled with NaN.
    If the frame ends up with repeated column labels, they are printed and
    the run is stopped through ``sys.exit()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to harmonise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, renamed and completed.
    """
    # Older label -> current label.
    legacy_to_current = {
        'Provider ID': 'Facility ID',
        'Hospital Name': 'Facility Name',
        'Provider Number': 'Facility ID',
        'County Name': 'County',
        'Mortality national comparison': 'Mortality National Comparison',
        'Mortality national comparison footnote':
            'Mortality National Comparison Footnote',
        'Safety of care national comparison':
            'Safety of Care National Comparison',
        'Safety of care national comparison footnote':
            'Safety of Care National Comparison Footnote',
        'Readmission national comparison': 'Readmission National Comparison',
        'Readmission national comparison footnote':
            'Readmission National Comparison Footnote',
        'Patient experience national comparison':
            'Patient Experience National Comparison',
        'Patient experience national comparison footnote':
            'Patient Experience National Comparison Footnote',
        'Effectiveness of care national comparison':
            'Effectiveness of Care National Comparison',
        'Effectiveness of care national comparison footnote':
            'Effectiveness of Care National Comparison Footnote',
        'Timeliness of care national comparison':
            'Timeliness of Care National Comparison',
        'Timeliness of care national comparison footnote':
            'Timeliness of Care National Comparison Footnote',
        'Efficient use of medical imaging national comparison':
            'Efficient Use of Medical Imaging National Comparison',
        'Efficient use of medical imaging national comparison footnote':
            'Efficient Use of Medical Imaging National Comparison Footnote',
        'Meets criteria for meaningful use of EHRs':
            'Meets Criteria for Meaningful Use of EHRs',
    }

    # Columns every frame must have, in the order they are appended.
    expected_columns = (
        'MORT Group Measure Count',
        'Count of Facility MORT Measures',
        'Count of MORT Measures Better',
        'Count of MORT Measures No Different',
        'Count of MORT Measures Worse',
        'MORT Group Footnote',
        'Safety Group Measure Count',
        'Count of Facility Safety Measures',
        'Count of Safety Measures Better',
        'Count of Safety Measures Worse',
        'Count of Safety Measures No Different',
        'Safety Group Footnote',
        'READM Group Measure Count',
        'Count of Facility READM Measures',
        'Count of READM Measures Better',
        'Count of READM Measures Worse',
        'Count of READM Measures No Different',
        'READM Group Footnote',
        'Pt Exp Group Measure Count',
        'Count of Facility Pt Exp Measures',
        'Pt Exp Group Footnote',
        'TE Group Measure Count',
        'Count of Facility TE Measures',
        'TE Group Footnote',
        'Meets criteria for promoting interoperability of EHRs',
        'Hospital overall rating',
        'Hospital overall rating footnote',
        'Address',
        'Mortality National Comparison',
        'Mortality National Comparison Footnote',
        'Safety of Care National Comparison',
        'Safety of Care National Comparison Footnote',
        'Readmission National Comparison',
        'Readmission National Comparison Footnote',
        'Patient Experience National Comparison',
        'Patient Experience National Comparison Footnote',
        'Effectiveness of Care National Comparison',
        'Effectiveness of Care National Comparison Footnote',
        'Timeliness of Care National Comparison',
        'Timeliness of Care National Comparison Footnote',
        'Efficient Use of Medical Imaging National Comparison',
        'Efficient Use of Medical Imaging National Comparison Footnote',
        'Meets Criteria for Meaningful Use of EHRs',
        'Address 1',
        'Address 2',
        'Address 3',
    )

    # Rename only the older labels that this frame actually carries.
    applicable = {
        old: new
        for old, new in legacy_to_current.items()
        if old in df.columns
    }
    df.rename(columns=applicable, inplace=True)

    # Append each absent expected column as an all-NaN column.
    for name in expected_columns:
        if name not in df.columns:
            df[name] = float('NaN')

    # Two older labels can map to the same current label; stop if that (or
    # anything else) left the frame with repeated column names.
    labels = list(df)
    repeated = list({label for label in labels if labels.count(label) > 1})
    if repeated:
        print('duplicates:', repeated)
        sys.exit()

    return df


def process2(df, lists, yr, mo):
    """Standardise one archive file and tag it with its release date.

    The frame is passed through ``rename_and_fill`` and then ``curate``.
    Its column names (before the date columns are added) are appended to
    ``lists``, constant 'file_month' and 'file_year' columns are added, and
    the columns are put in sorted order.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame read from one archive file.
    lists : list
        Accumulator of per-file column-name lists; appended to in place.
    yr, mo
        Values stored in the 'file_year' and 'file_month' columns.

    Returns
    -------
    tuple
        ``(frame, lists)``: the processed frame and the same ``lists`` object.
    """
    frame = curate(rename_and_fill(df))

    # Record the column set first so the date columns are not part of it.
    lists.append(list(frame))

    n_rows = frame.shape[0]
    frame['file_month'] = [mo] * n_rows
    frame['file_year'] = [yr] * n_rows

    ordered_columns = sorted(frame.columns)
    return frame.reindex(ordered_columns, axis=1), lists



## Load Files

In [3]:
#########################################################################################
######################  HOSPITAL GENERAL INFORMATION FILES  #############################
#########################################################################################

# Read every archived general-information CSV, standardise it with process2(),
# and stack the results into a single DataFrame (hos_df).

df_list = []  # one processed DataFrame per file
lists = []    # one list of column names per file (filled by process2)

# `yrs`, `mos` and `subdirs` are parallel lists: entry i of each describes the
# same file, so all three must keep the same length and ordering.
yrs = ['2023', '2023',
       '2022', '2022', '2022',
       '2021','2021','2021', '2021', '2021',
       '2020', '2020', '2020', '2020', 
       '2019', '2019', '2019', '2019', 
       '2018', '2018', '2018', '2018',
       '2017', '2017', '2017',
       '2016', '2016', '2016', '2016',
       '2015', '2015', '2015', '2015', '2015', '2015',
       '2014', '2014', '2014', '2014', '2014',
       '2013', '2013', '2013',
       ]

# Month of each file, aligned with `yrs` and `subdirs`.
mos = ['01', '04',
       '01', '04', '07',
       '01', '03', '04', '07', '10', 
       '10', '07', '04', '01', 
       '10', '07', '04', '03', 
       '10', '07', '05', '01',
       '10', '07', '04',
       '12', '11', '08', '05',
       '12', '10', '07', '05', '04', '01',
       '12', '10', '07', '04', '01',
       '10', '07' ,'04',
       ]

# File paths relative to `main_dir`. Directory and file naming differs between
# releases, so each path is spelled out in full.
subdirs = ['2023/hospitals_01_2023/Hospital_General_Information.csv', 
           '2023/hospitals_04_2023/Hospital_General_Information.csv',
           
           '2022/hospitals_01_2022/Hospital_General_Information.csv', 
           '2022/hospitals_04_2022/Hospital_General_Information.csv',
           '2022/hospitals_07_2022/Hospital_General_Information.csv',
           
           '2021/hospitals_01_2021/Hospital_General_Information.csv',
           '2021/hospitals_03_2021/Hospital_General_Information.csv',
           '2021/hospitals_04_2021/Hospital_General_Information.csv',
           '2021/hospitals_07_2021/Hospital_General_Information.csv',
           '2021/hospitals_10_2021/Hospital_General_Information.csv',
           
           '2020/hospitals_archive_10_2020/Hospital_General_Information.csv',
           '2020/hospitals_archive_07_2020/Hospital_General_Information.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200422/Hospital General Information.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200129/Hospital General Information.csv',
           
           '2019/HOSArchive_Revised_Flatfiles_20191030/Hospital General Information.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190702/Hospital General Information.csv',
           '2019/HOSArchive_Revised_FlatFiles_20190424/Hospital General Information.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190321/Hospital General Information.csv',
           
           '2018/HOSArchive_Revised_FlatFiles_20181031/Hospital General Information.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180725/Hospital General Information.csv', 
           '2018/HOSArchive_Revised_FlatFiles_20180523/Hospital General Information.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180126/Hospital General Information.csv',
           
           '2017/HOSArchive_Revised_FlatFiles_20171024/Hospital General Information.csv',
           '2017/HOSArchive_Revised_FlatFiles_20170726/Hospital General Information.csv', 
           '2017/HOSArchive_Revised_Flatfiles_20170428/Hospital General Information.csv', 
           
           '2016/HOSArchive_Revised_Flatfiles_20161219/Hospital General Information.csv',
           '2016/Hospital_Revised_FlatFiles_20161110/Hospital General Information.csv', 
           '2016/HOSArchive_Revised_FlatFiles_20160810/Hospital General Information.csv',
           '2016/HOSArchive_Revised_FlatFiles_20160504/Hospital General Information.csv',
           
           '2015/HOSArchive_Revised_FlatFiles_20151210/Hospital General Information.csv',
           '2015/HOSArchive_Revised_FlatFiles_20151008/Hospital General Information.csv',
           '2015/HOSArchive_Revised_FlatFiles_20150716/Hospital General Information.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150506/Hospital General Information.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150416/Hospital General Information.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150122/Hospital General Information.csv',
           
           '2014/HOSArchive_Revised_Flatfiles_20141218/Hospital General Information.csv',
           '2014/HOSArchive_Revised_Flatfiles_20141023/Hospital General Information.csv',
           '2014/HOSArchive_Revised_Flatfiles_20140717/Hospital General Information.csv',
           '2014/HOSArchive_Revised_Flatfiles_20140417/Hospital_Data.csv',
           '2014/HOSArchive_Revised_Flatfiles_20140101/Hospital_Data.csv',
           
           '2013/HOSArchive_Revised_Flatfiles_20131001/Hospital_Data.csv', 
           '2013/HOSArchive_Revised_Flatfiles_20130701/Hospital_Data.csv',
           '2013/HOSArchive_Revised_Flatfiles_20130401/Hospital_Data.csv',
           
           ]

for i, subdir in enumerate(subdirs):
    # capture_output() swallows anything printed while the CSV is parsed;
    # the captured text is not inspected afterwards.
    with io.capture_output() as captured: df = pd.read_csv(main_dir + subdir, encoding = "ISO-8859-1")
    print(subdir + ' :  (rows, columns) =', df.shape)
    # yrs[i] / mos[i] tag the rows with the release this file belongs to.
    df, lists = process2(df, lists, yrs[i], mos[i])
    df_list.append(df)

# Stop the run unless every processed file exposes the same set of columns.
check_lists(lists)
# NOTE(review): `subdir` is reassigned here but not read again in this notebook.
subdir = 'Hospital_Info/CombinedFiles_Hospital_Info'
# Stack all releases row-wise; the original per-file row indices are kept.
hos_df = pd.concat(df_list)
print('hos_df.shape:', hos_df.shape)


2023/hospitals_01_2023/Hospital_General_Information.csv :  (rows, columns) = (5317, 38)
2023/hospitals_04_2023/Hospital_General_Information.csv :  (rows, columns) = (5317, 38)
2022/hospitals_01_2022/Hospital_General_Information.csv :  (rows, columns) = (5306, 38)
2022/hospitals_04_2022/Hospital_General_Information.csv :  (rows, columns) = (5306, 38)
2022/hospitals_07_2022/Hospital_General_Information.csv :  (rows, columns) = (5299, 38)
2021/hospitals_01_2021/Hospital_General_Information.csv :  (rows, columns) = (5324, 28)
2021/hospitals_03_2021/Hospital_General_Information.csv :  (rows, columns) = (5324, 28)
2021/hospitals_04_2021/Hospital_General_Information.csv :  (rows, columns) = (5382, 38)
2021/hospitals_07_2021/Hospital_General_Information.csv :  (rows, columns) = (5336, 38)
2021/hospitals_10_2021/Hospital_General_Information.csv :  (rows, columns) = (5325, 38)
2020/hospitals_archive_10_2020/Hospital_General_Information.csv :  (rows, columns) = (5314, 28)
2020/hospitals_archive_0

In [4]:
# Remove the address-part, measure-count, footnote and phone-number columns.
# None of these labels appears in the `items` list of the filter() call below,
# so filter() alone would yield the same columns.
# NOTE(review): with pandas' default errors='raise', drop() fails with a
# KeyError if any listed label is absent, so this call also acts as an
# implicit check that all of these columns exist.
hos_df.drop(labels=['Address 1', 'Address 2', 'Address 3',
                    'Count of Facility MORT Measures', 'Count of Facility Pt Exp Measures', 
                    'Count of Facility READM Measures', 'Count of Facility Safety Measures',
                    'Count of Facility TE Measures', 'Count of MORT Measures Better',
                    'Count of MORT Measures No Different', 'Count of MORT Measures Worse', 
                    'Count of READM Measures Better', 'Count of READM Measures No Different',
                    'Count of READM Measures Worse', 'Count of Safety Measures Better',
                    'Count of Safety Measures No Different', 'Count of Safety Measures Worse',
                    'Hospital overall rating footnote', 'MORT Group Footnote', 'MORT Group Measure Count',
                    'Mortality National Comparison Footnote', 'Patient Experience National Comparison Footnote',
                    'Phone Number', 'Pt Exp Group Footnote', 'Pt Exp Group Measure Count',
                    'READM Group Footnote', 'READM Group Measure Count', 
                    'Readmission National Comparison Footnote', 'Safety Group Footnote', 
                    'Safety Group Measure Count', 'Safety of Care National Comparison Footnote',
                    'TE Group Footnote', 'TE Group Measure Count', 'Timeliness of Care National Comparison Footnote',
                    'Effectiveness of Care National Comparison Footnote', 
                    'Efficient Use of Medical Imaging National Comparison Footnote',
                    ], axis=1, inplace=True)

# Keep only the listed columns, in this order: the three key columns
# (Facility ID, file_month, file_year) first, then the descriptive fields.
hos_df = hos_df.filter(items=['Facility ID', 'file_month', 'file_year', 'Facility Name',
                              'Address', 'City', 'County', 'State', 'ZIP Code',
                              'Effectiveness of Care National Comparison', 
                              'Efficient Use of Medical Imaging National Comparison',
                              'Emergency Services', 'Hospital Ownership', 'Hospital Type',
                              'Hospital overall rating', 'Meets Criteria for Meaningful Use of EHRs',
                              'Meets criteria for promoting interoperability of EHRs', 
                              'Mortality National Comparison', 'Patient Experience National Comparison',
                              'Readmission National Comparison', 'Safety of Care National Comparison', 
                              'Timeliness of Care National Comparison',
                             ], axis=1)

# Report the resulting shape and preview the first rows.
print(hos_df.shape)
hos_df.head()

(215159, 22)


Unnamed: 0,Facility ID,file_month,file_year,Facility Name,Address,City,County,State,ZIP Code,Effectiveness of Care National Comparison,Efficient Use of Medical Imaging National Comparison,Emergency Services,Hospital Ownership,Hospital Type,Hospital overall rating,Meets Criteria for Meaningful Use of EHRs,Meets criteria for promoting interoperability of EHRs,Mortality National Comparison,Patient Experience National Comparison,Readmission National Comparison,Safety of Care National Comparison,Timeliness of Care National Comparison
0,10001,1,2023,SOUTHEAST HEALTH MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,HOUSTON,AL,36301,,,Yes,Government - Hospital District or Authority,Acute Care Hospitals,3,,Y,,,,,
1,10005,1,2023,MARSHALL MEDICAL CENTERS,2505 U S HIGHWAY 431 NORTH,BOAZ,MARSHALL,AL,35957,,,Yes,Government - Hospital District or Authority,Acute Care Hospitals,3,,Y,,,,,
2,10006,1,2023,NORTH ALABAMA MEDICAL CENTER,1701 VETERANS DRIVE,FLORENCE,LAUDERDALE,AL,35630,,,Yes,Proprietary,Acute Care Hospitals,2,,Y,,,,,
3,10007,1,2023,MIZELL MEMORIAL HOSPITAL,702 N MAIN ST,OPP,COVINGTON,AL,36467,,,Yes,Voluntary non-profit - Private,Acute Care Hospitals,3,,Y,,,,,
4,10008,1,2023,CRENSHAW COMMUNITY HOSPITAL,101 HOSPITAL CIRCLE,LUVERNE,CRENSHAW,AL,36049,,,Yes,Proprietary,Acute Care Hospitals,Not Available,,Y,,,,,


In [5]:
# Prefix every column except the three key columns with 'General Info: '.
# All labels are renamed in a single pass instead of one rename call per column.
hos_df.rename(
    columns=lambda label: (
        label
        if label in ['Facility ID', 'file_month', 'file_year']
        else 'General Info: ' + label
    ),
    inplace=True,
)

# Report the shape (renaming leaves it unchanged) and preview the first rows.
print(hos_df.shape)
hos_df.head()

(215159, 22)


Unnamed: 0,Facility ID,file_month,file_year,General Info: Facility Name,General Info: Address,General Info: City,General Info: County,General Info: State,General Info: ZIP Code,General Info: Effectiveness of Care National Comparison,General Info: Efficient Use of Medical Imaging National Comparison,General Info: Emergency Services,General Info: Hospital Ownership,General Info: Hospital Type,General Info: Hospital overall rating,General Info: Meets Criteria for Meaningful Use of EHRs,General Info: Meets criteria for promoting interoperability of EHRs,General Info: Mortality National Comparison,General Info: Patient Experience National Comparison,General Info: Readmission National Comparison,General Info: Safety of Care National Comparison,General Info: Timeliness of Care National Comparison
0,10001,1,2023,SOUTHEAST HEALTH MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,HOUSTON,AL,36301,,,Yes,Government - Hospital District or Authority,Acute Care Hospitals,3,,Y,,,,,
1,10005,1,2023,MARSHALL MEDICAL CENTERS,2505 U S HIGHWAY 431 NORTH,BOAZ,MARSHALL,AL,35957,,,Yes,Government - Hospital District or Authority,Acute Care Hospitals,3,,Y,,,,,
2,10006,1,2023,NORTH ALABAMA MEDICAL CENTER,1701 VETERANS DRIVE,FLORENCE,LAUDERDALE,AL,35630,,,Yes,Proprietary,Acute Care Hospitals,2,,Y,,,,,
3,10007,1,2023,MIZELL MEMORIAL HOSPITAL,702 N MAIN ST,OPP,COVINGTON,AL,36467,,,Yes,Voluntary non-profit - Private,Acute Care Hospitals,3,,Y,,,,,
4,10008,1,2023,CRENSHAW COMMUNITY HOSPITAL,101 HOSPITAL CIRCLE,LUVERNE,CRENSHAW,AL,36049,,,Yes,Proprietary,Acute Care Hospitals,Not Available,,Y,,,,,


In [6]:
# Persist the combined frame as a gzip-compressed pickle (pickle protocol 5).
# The path is relative to the working directory.
# NOTE(review): presumably the 'dataframes/' directory must already exist — confirm.
hos_df.to_pickle('dataframes/hospital_info_df.pkl.gz', protocol=5, compression='gzip')