# Generate Hospital Information dataframe

In [1]:
import pandas as pd
import numpy as np
import warnings
from IPython.utils import io
import sys
# --- Notebook-wide configuration -------------------------------------------
# Suppress warnings of every category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Remove pandas' display limits so frames are rendered without truncation.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Root folder; the per-release CSV paths are appended to this string.
main_dir = '~/Desktop/Rush/CMS_HospitalArchives/'

## Custom Functions

In [2]:
                        
def curate(df):
    """Clean the identifier and text columns of one hospital-information frame.

    Steps, each applied only when the relevant column is present:

    1. Drop rows whose 'Facility ID' or 'Facility Name' is missing.
    2. Convert 'Facility ID' to text and left-pad it with zeros to six
       characters, so IDs that were parsed as numbers regain their
       leading zeros.
    3. Remove tab characters from every text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified. Column labels are expected to be
        unique.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``df``.
    """
    id_col, name_col = 'Facility ID', 'Facility Name'

    # `x != np.nan` is True for every value (NaN never compares equal, not
    # even to itself), so missing values must be detected with notna().
    keep = np.ones(len(df), dtype=bool)
    for col in (id_col, name_col):
        if col in df.columns:
            keep &= df[col].notna().to_numpy()
    # Copy so the assignments below never write into the caller's frame.
    df = df.loc[keep].copy()

    if id_col in df.columns:
        ids = df[id_col]
        # A numeric column that contained NaN is parsed as float; cast whole
        # numbers back to int so 10001.0 becomes '10001' rather than '10001.0'.
        if pd.api.types.is_float_dtype(ids) and (ids % 1 == 0).all():
            ids = ids.astype('int64')
        # zfill pads to the full width instead of adding a single zero.
        df[id_col] = ids.astype(str).str.zfill(6)

    for col in df.columns:
        try:
            df[col] = df[col].str.replace("\t", "", regex=False)
        except AttributeError:
            # Non-text columns have no .str accessor; leave them untouched.
            continue

    return df


def rename_and_fill(df):
    """Rename older column headers to the current naming scheme, in place.

    Every (old, new) pair in the table below is applied to ``df`` in order,
    whenever the old header is present. If the renaming leaves two or more
    columns sharing a label, the clashing labels are printed and
    ``sys.exit()`` is called.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are rewritten (the object is mutated).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    legacy_to_current = (
        ('Provider ID', 'Facility ID'),
        ('Hospital Name', 'Facility Name'),
        ('Provider Number', 'Facility ID'),
        ('County Name', 'County'),
        ('Mortality national comparison',
         'Mortality National Comparison'),
        ('Mortality national comparison footnote',
         'Mortality National Comparison Footnote'),
        ('Safety of care national comparison',
         'Safety of Care National Comparison'),
        ('Safety of care national comparison footnote',
         'Safety of Care National Comparison Footnote'),
        ('Readmission national comparison',
         'Readmission National Comparison'),
        ('Readmission national comparison footnote',
         'Readmission National Comparison Footnote'),
        ('Patient experience national comparison',
         'Patient Experience National Comparison'),
        ('Patient experience national comparison footnote',
         'Patient Experience National Comparison Footnote'),
        ('Effectiveness of care national comparison',
         'Effectiveness of Care National Comparison'),
        ('Effectiveness of care national comparison footnote',
         'Effectiveness of Care National Comparison Footnote'),
        ('Timeliness of care national comparison',
         'Timeliness of Care National Comparison'),
        ('Timeliness of care national comparison footnote',
         'Timeliness of Care National Comparison Footnote'),
        ('Efficient use of medical imaging national comparison',
         'Efficient Use of Medical Imaging National Comparison'),
        ('Efficient use of medical imaging national comparison footnote',
         'Efficient Use of Medical Imaging National Comparison Footnote'),
        ('Meets criteria for meaningful use of EHRs',
         'Meets Criteria for Meaningful Use of EHRs'),
        ('Address 1', 'Address'),
        ('City/Town', 'City'),
        # NOTE(review): 'County Name' is itself renamed to 'County' above, so
        # this pair ends up under a different label than that one -- confirm
        # that 'County Name' (rather than 'County') is the intended target.
        ('County/Parish', 'County Name'),
        ('Telephone Number', 'Phone Number'),
    )

    # One rename per pair, in table order, so earlier renames are visible to
    # the membership test of later pairs.
    for old_name, new_name in legacy_to_current:
        if old_name in list(df):
            df.rename(columns={old_name: new_name}, inplace=True)

    # Two different old headers can map to the same new one; stop if so.
    labels = list(df)
    repeated = list({label for label in labels if labels.count(label) > 1})
    if repeated:
        print('duplicates:', repeated)
        sys.exit()

    return df


def process2(df, lists, yr, mo):
    """Standardise one release's frame and tag it with its release date.

    The frame is passed through ``rename_and_fill`` and then ``curate``. The
    resulting column names are appended to ``lists`` before the two tag
    columns are added. Constant 'file_month' and 'file_year' columns are then
    attached and the columns are put in sorted order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame read from one release file.
    lists : list
        Accumulator that receives this frame's column names (mutated).
    yr, mo
        Values written to every row of 'file_year' and 'file_month'.

    Returns
    -------
    tuple
        ``(frame, lists)``; ``lists`` is the same list object passed in.
    """
    cleaned = curate(rename_and_fill(df))
    lists.append(list(cleaned.columns))

    n_rows = len(cleaned)
    cleaned['file_month'] = [mo] * n_rows
    cleaned['file_year'] = [yr] * n_rows

    ordered = cleaned.reindex(columns=sorted(cleaned.columns))
    return ordered, lists


## Load Files

In [3]:

# Accumulators filled while the files are loaded: one processed frame per
# file (df_list) and the column names seen in each file (lists).
df_list = []
lists = []

# Release year of each archive file, as a string. Entry i here pairs with
# entry i of `mos` and `subdirs` (the three are indexed together when the
# files are loaded), so all three lists must keep the same length and order.
yrs = ['2023', '2023', '2023', '2023',
       '2022', '2022', '2022', '2022',
       '2021','2021','2021', '2021', '2021',
       '2020', '2020', '2020', '2020', 
       '2019', '2019', '2019', '2019', 
       '2018', '2018', '2018', '2018',
       '2017', '2017', '2017',
       '2016', '2016', '2016', '2016',
       '2015', '2015', '2015', '2015', '2015', '2015',
       '2014', '2014', '2014', '2014', '2014',
       '2013', '2013', '2013',
       ]

# Two-digit release month of each file, one row per year as in `yrs`.
# Within a year the order is ascending for 2021-2023 and descending for
# 2020 and earlier; it follows the order of the paths in `subdirs`.
mos = ['01', '04', '07', '10',
       '01', '04', '07', '10',
       '01', '03', '04', '07', '10', 
       '10', '07', '04', '01', 
       '10', '07', '04', '03', 
       '10', '07', '05', '01',
       '10', '07', '04',
       '12', '11', '08', '05',
       '12', '10', '07', '05', '04', '01',
       '12', '10', '07', '04', '01',
       '10', '07' ,'04',
       ]

# CSV path of each release, relative to `main_dir`. Folder and file naming
# is not uniform across releases, so every path is spelled out in full.
subdirs = ['2023/hospitals_01_2023/Hospital_General_Information.csv', 
           '2023/hospitals_04_2023/Hospital_General_Information.csv',
           '2023/hospitals_07_2023/Hospital_General_Information.csv',
           '2023/hospitals_10_2023/Hospital_General_Information.csv',
           
           '2022/hospitals_01_2022/Hospital_General_Information.csv', 
           '2022/hospitals_04_2022/Hospital_General_Information.csv',
           '2022/hospitals_07_2022/Hospital_General_Information.csv',
           '2022/hospitals_10_2022/Hospital_General_Information.csv',
           
           '2021/hospitals_01_2021/Hospital_General_Information.csv',
           '2021/hospitals_03_2021/Hospital_General_Information.csv',
           '2021/hospitals_04_2021/Hospital_General_Information.csv',
           '2021/hospitals_07_2021/Hospital_General_Information.csv',
           '2021/hospitals_10_2021/Hospital_General_Information.csv',
           
           '2020/hospitals_archive_10_2020/Hospital_General_Information.csv',
           '2020/hospitals_archive_07_2020/Hospital_General_Information.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200422/Hospital General Information.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200129/Hospital General Information.csv',
           
           '2019/HOSArchive_Revised_Flatfiles_20191030/Hospital General Information.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190702/Hospital General Information.csv',
           '2019/HOSArchive_Revised_FlatFiles_20190424/Hospital General Information.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190321/Hospital General Information.csv',
           
           '2018/HOSArchive_Revised_FlatFiles_20181031/Hospital General Information.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180725/Hospital General Information.csv', 
           '2018/HOSArchive_Revised_FlatFiles_20180523/Hospital General Information.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180126/Hospital General Information.csv',
           
           '2017/HOSArchive_Revised_FlatFiles_20171024/Hospital General Information.csv',
           '2017/HOSArchive_Revised_FlatFiles_20170726/Hospital General Information.csv', 
           '2017/HOSArchive_Revised_Flatfiles_20170428/Hospital General Information.csv', 
           
           '2016/HOSArchive_Revised_Flatfiles_20161219/Hospital General Information.csv',
           '2016/Hospital_Revised_FlatFiles_20161110/Hospital General Information.csv', 
           '2016/HOSArchive_Revised_FlatFiles_20160810/Hospital General Information.csv',
           '2016/HOSArchive_Revised_FlatFiles_20160504/Hospital General Information.csv',
           
           '2015/HOSArchive_Revised_FlatFiles_20151210/Hospital General Information.csv',
           '2015/HOSArchive_Revised_FlatFiles_20151008/Hospital General Information.csv',
           '2015/HOSArchive_Revised_FlatFiles_20150716/Hospital General Information.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150506/Hospital General Information.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150416/Hospital General Information.csv',
           '2015/HOSArchive_Revised_Flatfiles_20150122/Hospital General Information.csv',
           
           '2014/HOSArchive_Revised_Flatfiles_20141218/Hospital General Information.csv',
           '2014/HOSArchive_Revised_Flatfiles_20141023/Hospital General Information.csv',
           '2014/HOSArchive_Revised_Flatfiles_20140717/Hospital General Information.csv',
           '2014/HOSArchive_Revised_Flatfiles_20140417/Hospital_Data.csv',
           '2014/HOSArchive_Revised_Flatfiles_20140101/Hospital_Data.csv',
           
           '2013/HOSArchive_Revised_Flatfiles_20131001/Hospital_Data.csv', 
           '2013/HOSArchive_Revised_Flatfiles_20130701/Hospital_Data.csv',
           '2013/HOSArchive_Revised_Flatfiles_20130401/Hospital_Data.csv',
           
           ]

# Read every archived release, standardise it, and collect the results.
for subdir, yr, mo in zip(subdirs, yrs, mos):
    # Output emitted while the CSV is parsed is captured and never shown.
    with io.capture_output() as captured:
        df = pd.read_csv(main_dir + subdir, encoding="ISO-8859-1")
    print(subdir + ' :  (rows, columns) =', df.shape)
    df, lists = process2(df, lists, yr, mo)
    df_list.append(df)

# Stack all releases into one frame; each file's own row index is kept.
df = pd.concat(df_list)
print('hos_df.shape:', df.shape)

df.head()

2023/hospitals_01_2023/Hospital_General_Information.csv :  (rows, columns) = (5317, 38)
2023/hospitals_04_2023/Hospital_General_Information.csv :  (rows, columns) = (5317, 38)
2023/hospitals_07_2023/Hospital_General_Information.csv :  (rows, columns) = (5446, 38)
2023/hospitals_10_2023/Hospital_General_Information.csv :  (rows, columns) = (5446, 38)
2022/hospitals_01_2022/Hospital_General_Information.csv :  (rows, columns) = (5306, 38)
2022/hospitals_04_2022/Hospital_General_Information.csv :  (rows, columns) = (5306, 38)
2022/hospitals_07_2022/Hospital_General_Information.csv :  (rows, columns) = (5299, 38)
2022/hospitals_10_2022/Hospital_General_Information.csv :  (rows, columns) = (5307, 38)
2021/hospitals_01_2021/Hospital_General_Information.csv :  (rows, columns) = (5324, 28)
2021/hospitals_03_2021/Hospital_General_Information.csv :  (rows, columns) = (5324, 28)
2021/hospitals_04_2021/Hospital_General_Information.csv :  (rows, columns) = (5382, 38)
2021/hospitals_07_2021/Hospital_

Unnamed: 0,Address,City,Count of Facility MORT Measures,Count of Facility Pt Exp Measures,Count of Facility READM Measures,Count of Facility Safety Measures,Count of Facility TE Measures,Count of MORT Measures Better,Count of MORT Measures No Different,Count of MORT Measures Worse,Count of READM Measures Better,Count of READM Measures No Different,Count of READM Measures Worse,Count of Safety Measures Better,Count of Safety Measures No Different,Count of Safety Measures Worse,County,Emergency Services,Facility ID,Facility Name,Hospital Ownership,Hospital Type,Hospital overall rating,Hospital overall rating footnote,MORT Group Footnote,MORT Group Measure Count,Meets criteria for promoting interoperability of EHRs,Phone Number,Pt Exp Group Footnote,Pt Exp Group Measure Count,READM Group Footnote,READM Group Measure Count,Safety Group Footnote,Safety Group Measure Count,State,TE Group Footnote,TE Group Measure Count,ZIP Code,file_month,file_year,County Name,Effectiveness of Care National Comparison,Effectiveness of Care National Comparison Footnote,Efficient Use of Medical Imaging National Comparison,Efficient Use of Medical Imaging National Comparison Footnote,Mortality National Comparison,Mortality National Comparison Footnote,Patient Experience National Comparison,Patient Experience National Comparison Footnote,Readmission National Comparison,Readmission National Comparison Footnote,Safety of Care National Comparison,Safety of Care National Comparison Footnote,Timeliness of Care National Comparison,Timeliness of Care National Comparison Footnote,Meets Criteria for Meaningful Use of EHRs,Address 2,Address 3
0,1108 ROSS CLARK CIRCLE,DOTHAN,7,8,11,8,9,1,5,1,0,10,1,2,6,0,HOUSTON,Yes,10001,SOUTHEAST HEALTH MEDICAL CENTER,Government - Hospital District or Authority,Acute Care Hospitals,3,,,7,Y,(334) 793-8701,,8,,11,,8,AL,,12,36301,1,2023,,,,,,,,,,,,,,,,,,
1,2505 U S HIGHWAY 431 NORTH,BOAZ,6,8,10,7,11,0,5,1,0,10,0,0,7,0,MARSHALL,Yes,10005,MARSHALL MEDICAL CENTERS,Government - Hospital District or Authority,Acute Care Hospitals,3,,,7,Y,(256) 593-8310,,8,,11,,8,AL,,12,35957,1,2023,,,,,,,,,,,,,,,,,,
2,1701 VETERANS DRIVE,FLORENCE,7,8,9,7,9,0,6,1,0,8,1,2,5,0,LAUDERDALE,Yes,10006,NORTH ALABAMA MEDICAL CENTER,Proprietary,Acute Care Hospitals,2,,,7,Y,(256) 768-8400,,8,,11,,8,AL,,12,35630,1,2023,,,,,,,,,,,,,,,,,,
3,702 N MAIN ST,OPP,3,8,6,2,5,0,3,0,0,6,0,0,2,0,COVINGTON,Yes,10007,MIZELL MEMORIAL HOSPITAL,Voluntary non-profit - Private,Acute Care Hospitals,3,,,7,Y,(334) 493-3541,,8,,11,,8,AL,,12,36467,1,2023,,,,,,,,,,,,,,,,,,
4,101 HOSPITAL CIRCLE,LUVERNE,2,Not Available,4,1,5,0,2,0,0,4,0,0,1,0,CRENSHAW,Yes,10008,CRENSHAW COMMUNITY HOSPITAL,Proprietary,Acute Care Hospitals,Not Available,16.0,,7,Y,(334) 335-3374,5.0,8,,11,,8,AL,,12,36049,1,2023,,,,,,,,,,,,,,,,,,


In [4]:
# Keep only the columns used downstream, in this fixed order.
#
# filter(items=...) is a whitelist: every column not named here is discarded,
# and names that are absent from the frame are skipped. A separate in-place
# drop of the unwanted columns is therefore unnecessary. Leaving it out also
# makes this cell safe to re-run: an explicit drop raises KeyError once the
# columns are already gone.
df = df.filter(items=['Facility ID', 'file_month', 'file_year', 'Facility Name',
                      'Address', 'City', 'County', 'State', 'ZIP Code',
                      'Effectiveness of Care National Comparison',
                      'Efficient Use of Medical Imaging National Comparison',
                      'Emergency Services', 'Hospital Ownership', 'Hospital Type',
                      'Hospital overall rating', 'Meets Criteria for Meaningful Use of EHRs',
                      'Meets criteria for promoting interoperability of EHRs',
                      'Mortality National Comparison', 'Patient Experience National Comparison',
                      'Readmission National Comparison', 'Safety of Care National Comparison',
                      'Timeliness of Care National Comparison',
                      ], axis=1)

print(df.shape)
df.head()

(231358, 22)


Unnamed: 0,Facility ID,file_month,file_year,Facility Name,Address,City,County,State,ZIP Code,Effectiveness of Care National Comparison,Efficient Use of Medical Imaging National Comparison,Emergency Services,Hospital Ownership,Hospital Type,Hospital overall rating,Meets Criteria for Meaningful Use of EHRs,Meets criteria for promoting interoperability of EHRs,Mortality National Comparison,Patient Experience National Comparison,Readmission National Comparison,Safety of Care National Comparison,Timeliness of Care National Comparison
0,10001,1,2023,SOUTHEAST HEALTH MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,HOUSTON,AL,36301,,,Yes,Government - Hospital District or Authority,Acute Care Hospitals,3,,Y,,,,,
1,10005,1,2023,MARSHALL MEDICAL CENTERS,2505 U S HIGHWAY 431 NORTH,BOAZ,MARSHALL,AL,35957,,,Yes,Government - Hospital District or Authority,Acute Care Hospitals,3,,Y,,,,,
2,10006,1,2023,NORTH ALABAMA MEDICAL CENTER,1701 VETERANS DRIVE,FLORENCE,LAUDERDALE,AL,35630,,,Yes,Proprietary,Acute Care Hospitals,2,,Y,,,,,
3,10007,1,2023,MIZELL MEMORIAL HOSPITAL,702 N MAIN ST,OPP,COVINGTON,AL,36467,,,Yes,Voluntary non-profit - Private,Acute Care Hospitals,3,,Y,,,,,
4,10008,1,2023,CRENSHAW COMMUNITY HOSPITAL,101 HOSPITAL CIRCLE,LUVERNE,CRENSHAW,AL,36049,,,Yes,Proprietary,Acute Care Hospitals,Not Available,,Y,,,,,


In [5]:

# Identity and classification columns kept for the exported table.
ls = [
    'Facility ID', 'Facility Name',
    'file_month', 'file_year',
    'State', 'Hospital Ownership', 'Hospital Type',
]

# Project onto those columns, then discard rows that repeat an earlier row
# in every one of them.
tdf = df.filter(items=ls, axis=1).drop_duplicates()

print(tdf.shape)
tdf.head()


(231358, 7)


Unnamed: 0,Facility ID,Facility Name,file_month,file_year,State,Hospital Ownership,Hospital Type
0,10001,SOUTHEAST HEALTH MEDICAL CENTER,1,2023,AL,Government - Hospital District or Authority,Acute Care Hospitals
1,10005,MARSHALL MEDICAL CENTERS,1,2023,AL,Government - Hospital District or Authority,Acute Care Hospitals
2,10006,NORTH ALABAMA MEDICAL CENTER,1,2023,AL,Proprietary,Acute Care Hospitals
3,10007,MIZELL MEMORIAL HOSPITAL,1,2023,AL,Voluntary non-profit - Private,Acute Care Hospitals
4,10008,CRENSHAW COMMUNITY HOSPITAL,1,2023,AL,Proprietary,Acute Care Hospitals


In [6]:
# Write the table to disk as a gzip-compressed pickle (pickle protocol 5).
tdf.to_pickle(
    '~/GitHub/hospitals-data-archive/dataframes/partial_dataframes/hospital_info_df.pkl.gz',
    compression='gzip',
    protocol=5,
)
