# Generate Unplanned Visits dataframe

In [1]:
import pandas as pd
import numpy as np
import warnings
from IPython.utils import io
import sys
import time

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

main_dir = '~/Desktop/Rush/CMS_HospitalArchives/'

## Define Custom Functions

In [2]:
def curate(df):
    try:
        df = df[df['Facility ID'] != np.nan]
        df['Facility ID'] = df['Facility ID'].values.astype(str)
        
        ids = df['Facility ID'].tolist()
        ids2 = []
        for i in ids:
            if len(i) < 6:
                i = '0' + i
            ids2.append(i)
        df['Facility ID'] = ids2
        
    except:
        pass
    try:
        df = df[df['Facility Name'] != np.nan]
    except:
        pass
    
    for c in list(df):    
        try:
            df[c] = df[c].str.replace("\t","")
        except:
            pass

    return df


def rename_and_fill(df):
    
    cols1 = ['Provider ID', "Measure Start Date", "Measure End Date",
             'Hospital Name', 'Address 1', 'City/Town', 'County/Parish',
             'Telephone Number',
             ]
    
    
    cols2 = ['Facility ID', "Start Date", "End Date",
             'Facility Name', 'Address', 'City', 'County Name',
             'Phone Number',
             ]
    
    for i, col in enumerate(cols1):
        if col in list(df):
            df.rename(columns={col: cols2[i]}, inplace=True)
            
            
    cols = ['Number of Patients', 'Number of Patients Returned',
            ]
    
    for col in cols:
        if col not in list(df):
            df[col] = float('NaN')
    
    l = list(df)
    l = list(set([x for x in l if l.count(x) > 1]))
    if len(l) > 0:
        print('duplicates:', l)
        sys.exit()
        
    return df


def process2(df, lists, yr, mo):
    df = rename_and_fill(df)
    df = curate(df)
    lists.append(list(df))
    df['file_month'] = [mo]* df.shape[0]
    df['file_year'] = [yr]* df.shape[0]
    df = df.reindex(sorted(df.columns), axis=1)
    return df, lists

## Load Files

In [3]:
df_list = []
lists = []

yrs = ['2023', '2023', '2023', '2023',
       '2022', '2022', '2022', '2022',
       '2021','2021','2021', '2021', '2021',
       '2020', '2020', '2020', '2020', 
       '2019', '2019', '2019', '2019', 
       '2018', '2018', '2018', '2018',
       ]

mos = ['01', '04', '07', '10',
       '01', '04', '07', '10',
       '01', '03', '04', '07', '10', 
       '10', '07', '04', '01', 
       '10', '07', '04', '03', 
       '10', '07', '05', '01',
       ]

subdirs = ['2023/hospitals_01_2023/Unplanned_Hospital_Visits-Hospital.csv', 
           '2023/hospitals_04_2023/Unplanned_Hospital_Visits-Hospital.csv',
           '2023/hospitals_07_2023/Unplanned_Hospital_Visits-Hospital.csv',
           '2023/hospitals_10_2023/Unplanned_Hospital_Visits-Hospital.csv',
           
           '2022/hospitals_01_2022/Unplanned_Hospital_Visits-Hospital.csv', 
           '2022/hospitals_04_2022/Unplanned_Hospital_Visits-Hospital.csv',
           '2022/hospitals_07_2022/Unplanned_Hospital_Visits-Hospital.csv',
           '2022/hospitals_10_2022/Unplanned_Hospital_Visits-Hospital.csv',
           
           '2021/hospitals_01_2021/Unplanned_Hospital_Visits-Hospital.csv',
           '2021/hospitals_03_2021/Unplanned_Hospital_Visits-Hospital.csv',
           '2021/hospitals_04_2021/Unplanned_Hospital_Visits-Hospital.csv',
           '2021/hospitals_07_2021/Unplanned_Hospital_Visits-Hospital.csv',
           '2021/hospitals_10_2021/Unplanned_Hospital_Visits-Hospital.csv',
           
           '2020/hospitals_archive_10_2020/Unplanned_Hospital_Visits_Hospital.csv',
           '2020/hospitals_archive_07_2020/Unplanned_Hospital_Visits_Hospital.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200422/Unplanned Hospital Visits - Hospital.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200129/Unplanned Hospital Visits - Hospital.csv',
           
           '2019/HOSArchive_Revised_Flatfiles_20191030/Unplanned Hospital Visits - Hospital.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190702/Unplanned Hospital Visits - Hospital.csv',
           '2019/HOSArchive_Revised_FlatFiles_20190424/Unplanned Hospital Visits - Hospital.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190321/Unplanned Hospital Visits - Hospital.csv',
           
           '2018/HOSArchive_Revised_FlatFiles_20181031/Unplanned Hospital Visits - Hospital.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180725/Unplanned Hospital Visits - Hospital.csv', 
           '2018/HOSArchive_Revised_FlatFiles_20180523/Unplanned Hospital Visits - Hospital.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180126/Unplanned Hospital Visits - Hospital.csv',
           
           ]

for i, subdir in enumerate(subdirs):
    with io.capture_output() as captured: df = pd.read_csv(main_dir + subdir, encoding = "ISO-8859-1")
    print(subdir + ' :  (rows, columns) =', df.shape)
    df, lists = process2(df, lists, yrs[i], mos[i])
    df_list.append(df)

subdir = 'Unplanned_Visits/CombinedFiles_Unplanned_Visits'
df = pd.concat(df_list)

print('df.shape:', df.shape)
df = df[~df['Score'].isin([np.nan, float("NaN"), 'Not Available'])]
print('df.shape:', df.shape)

print(sorted(list(df)))

df['End Date'] = pd.to_datetime(df['End Date'])
df['Start Date'] = pd.to_datetime(df['Start Date'])

print(df['Measure ID'].unique(), '\n')
print(df['Measure Name'].unique(), '\n')

df.head()

2023/hospitals_01_2023/Unplanned_Hospital_Visits-Hospital.csv :  (rows, columns) = (67872, 20)
2023/hospitals_04_2023/Unplanned_Hospital_Visits-Hospital.csv :  (rows, columns) = (67830, 20)
2023/hospitals_07_2023/Unplanned_Hospital_Visits-Hospital.csv :  (rows, columns) = (67732, 20)
2023/hospitals_10_2023/Unplanned_Hospital_Visits-Hospital.csv :  (rows, columns) = (67732, 20)
2022/hospitals_01_2022/Unplanned_Hospital_Visits-Hospital.csv :  (rows, columns) = (67872, 20)
2022/hospitals_04_2022/Unplanned_Hospital_Visits-Hospital.csv :  (rows, columns) = (67872, 20)
2022/hospitals_07_2022/Unplanned_Hospital_Visits-Hospital.csv :  (rows, columns) = (67802, 20)
2022/hospitals_10_2022/Unplanned_Hospital_Visits-Hospital.csv :  (rows, columns) = (67830, 20)
2021/hospitals_01_2021/Unplanned_Hospital_Visits-Hospital.csv :  (rows, columns) = (68348, 20)
2021/hospitals_03_2021/Unplanned_Hospital_Visits-Hospital.csv :  (rows, columns) = (68348, 20)
2021/hospitals_04_2021/Unplanned_Hospital_Visits-H

Unnamed: 0,Address,City,Compared to National,County Name,Denominator,End Date,Facility ID,Facility Name,Footnote,Higher Estimate,Lower Estimate,Measure ID,Measure Name,Number of Patients,Number of Patients Returned,Phone Number,Score,Start Date,State,ZIP Code,file_month,file_year
0,1108 ROSS CLARK CIRCLE,DOTHAN,Average Days per 100 Discharges,HOUSTON,319,2021-06-30,10001,SOUTHEAST HEALTH MEDICAL CENTER,,19.6,-13.2,EDAC_30_AMI,Hospital return days for heart attack patients,305,81,(334) 793-8701,1.9,2018-07-01,AL,36301,1,2023
1,1108 ROSS CLARK CIRCLE,DOTHAN,More Days Than Average per 100 Discharges,HOUSTON,755,2021-06-30,10001,SOUTHEAST HEALTH MEDICAL CENTER,,39.6,4.8,EDAC_30_HF,Hospital return days for heart failure patients,618,211,(334) 793-8701,21.8,2018-07-01,AL,36301,1,2023
2,1108 ROSS CLARK CIRCLE,DOTHAN,Average Days per 100 Discharges,HOUSTON,436,2021-06-30,10001,SOUTHEAST HEALTH MEDICAL CENTER,,16.7,-17.3,EDAC_30_PN,Hospital return days for pneumonia patients,412,99,(334) 793-8701,-1.5,2018-07-01,AL,36301,1,2023
3,1108 ROSS CLARK CIRCLE,DOTHAN,No Different Than the National Rate,HOUSTON,254,2021-12-31,10001,SOUTHEAST HEALTH MEDICAL CENTER,,19.6,10.1,OP_32,Rate of unplanned hospital visits after colono...,Not Applicable,Not Applicable,(334) 793-8701,14.1,2019-01-01,AL,36301,1,2023
4,1108 ROSS CLARK CIRCLE,DOTHAN,No Different Than the National Rate,HOUSTON,214,2021-12-31,10001,SOUTHEAST HEALTH MEDICAL CENTER,,13.2,7.7,OP_35_ADM,Rate of inpatient admissions for patients rece...,Not Applicable,Not Applicable,(334) 793-8701,10.2,2021-01-01,AL,36301,1,2023


In [4]:
d = {
    'EDAC_30_AMI': 'EDAC-30 AMI', 
    'EDAC_30_HF': 'EDAC-30 HF',
    'EDAC_30_PN': 'EDAC-30 PN',
    'OP_32': 'OP-32',
    'OP_35_ADM': 'OP-35 ADM', 
    'OP_35_ED': 'OP-35 ED', 
    'OP_36': 'OP-36', 
    'READM_30_AMI': 'READM-30 AMI', 
    'READM_30_CABG': 'READM-30 CABG',
    'READM_30_COPD': 'READM-30 COPD', 
    'READM_30_HF': 'READM-30 HF', 
    'READM_30_HIP_KNEE': 'READM-30 HIP-KNEE', 
    'READM_30_HOSP_WIDE': 'READM-30 HOSP-WIDE', 
    'READM_30_PN': 'READM-30 PN', 
    'READM_30_STK': 'READM-30 STK',
}

df['Measure ID'].replace(to_replace=d, inplace=True)
print(len(df['Measure ID'].unique()))
print(df['Measure ID'].unique())


d = {
    'Hospital return days for heart attack patients': 'Hospital return days for AMI patients', 
    'Hospital return days for heart failure patients': 'Hospital return days for HF patients',
    'Hospital return days for pneumonia patients': 'Hospital return days for PN patients', 
    'Rate of unplanned hospital visits after colonoscopy (per 1,000 colonoscopies)': 'Rate of unplanned visits after colonoscopy (per 1K)', 
    'Rate of unplanned hospital visits after colonoscopy (per 1,000 colonoscopies)': 'Rate of unplanned visits after colonoscopy (per 1K)',
    
    'Rate of inpatient admissions for patients receiving outpatient chemotherapy': 'Inpatient admit rate for patients receiving outpatient chemo', 
    'Rate of emergency department (ED) visits for patients receiving outpatient chemotherapy': 'ED visit rate for patients receiving outpatient chemo', 
    'Ratio of unplanned hospital visits after hospital outpatient surgery': 'Ratio of unplanned visits after outpatient surgery', 
    'Acute Myocardial Infarction (AMI) 30-Day Readmission Rate': 'AMI 30-Day Readmission Rate', 
    'Rate of readmission for heart attack patients': 'AMI 30-Day Readmission Rate',
    'Rate of readmission for CABG': 'CABG 30-Day Readmission Rate',
    'Rate of readmission for CABG surgery patients': 'CABG 30-Day Readmission Rate',
    
    'Rate of readmission for chronic obstructive pulmonary disease (COPD) patients': '30-Day Readmission Rate', 
    
    'Heart failure (HF) 30-Day Readmission Rate': '30-Day HF Readmission Rate', 
    'Rate of readmission for heart failure patients': '30-Day HF Readmission Rate',
    
    'Rate of readmission after hip/knee replacement': '30-Day Readmission Rate after hip/knee replacement', 
    
    '30-Day Hospital-Wide All-Cause Unplanned Readmission Rate': '30-Day Hospital-Wide All-Cause Unplanned Readmission Rate', 
    'Rate of readmission after discharge from hospital (hospital-wide)': '30-Day Hospital-Wide All-Cause Unplanned Readmission Rate',
    
    'Pneumonia (PN) 30-Day Readmission Rate': '30-Day Pneumonia Readmission Rate', 
    'Rate of readmission for pneumonia patients': '30-Day Pneumonia Readmission Rate',
    
    'Rate of readmission for stroke patients': '30-Day Readmission Rate for stroke patients',
}


df['Measure Name'].replace(to_replace=d, inplace=True)
print(len(df['Measure Name'].unique()))
print(df['Measure Name'].unique())


ls = ['Facility ID', 'Facility Name',
      'file_month', 'file_year',
      'Measure ID', 'Measure Name',
      'Start Date', 'End Date', 
      'Denominator', 'Score',
      'Higher Estimate', 'Lower Estimate',
      'Number of Patients', 'Number of Patients Returned',
      'Compared to National', 
      'Footnote', 'Address', 'City',  
      'County Name', 'Phone Number', 
      'State', 'ZIP Code',
     ]

df = df.filter(items=ls)
print(df.shape)


15
['EDAC-30 AMI' 'EDAC-30 HF' 'EDAC-30 PN' 'OP-32' 'OP-35 ADM' 'OP-35 ED'
 'OP-36' 'READM-30 AMI' 'READM-30 CABG' 'READM-30 COPD' 'READM-30 HF'
 'READM-30 HIP-KNEE' 'READM-30 HOSP-WIDE' 'READM-30 PN' 'READM-30 STK']
15
['Hospital return days for AMI patients'
 'Hospital return days for HF patients'
 'Hospital return days for PN patients'
 'Rate of unplanned visits after colonoscopy (per 1K)'
 'Inpatient admit rate for patients receiving outpatient chemo'
 'ED visit rate for patients receiving outpatient chemo'
 'Ratio of unplanned visits after outpatient surgery'
 'AMI 30-Day Readmission Rate' 'CABG 30-Day Readmission Rate'
 '30-Day Readmission Rate' '30-Day HF Readmission Rate'
 '30-Day Readmission Rate after hip/knee replacement'
 '30-Day Hospital-Wide All-Cause Unplanned Readmission Rate'
 '30-Day Pneumonia Readmission Rate'
 '30-Day Readmission Rate for stroke patients']
(930286, 22)


In [5]:
df['Measure Name'] = df['Measure Name'] + ' (' + df['Measure ID'] + ')'

tdf = df.filter(items=['Measure Name'])
tdf.drop_duplicates(inplace=True)

print(tdf.shape)
print(len(df['Measure Name'].unique()), 'Measure Name, ID')

tdf.sort_values(by=['Measure Name'], ascending=True, inplace=True)
tdf.head(100)

(15, 1)
15 Measure Name, ID


Unnamed: 0,Measure Name
10,30-Day HF Readmission Rate (READM-30 HF)
12,30-Day Hospital-Wide All-Cause Unplanned Readm...
13,30-Day Pneumonia Readmission Rate (READM-30 PN)
9,30-Day Readmission Rate (READM-30 COPD)
11,30-Day Readmission Rate after hip/knee replace...
11,30-Day Readmission Rate for stroke patients (R...
7,AMI 30-Day Readmission Rate (READM-30 AMI)
8,CABG 30-Day Readmission Rate (READM-30 CABG)
5,ED visit rate for patients receiving outpatien...
0,Hospital return days for AMI patients (EDAC-30...


In [6]:
df = df.filter(items=['Facility ID', 'Facility Name',
                      'file_month', 'file_year',
                      'Measure Name',
                      'Denominator', 'Score',
                      'Higher Estimate', 'Lower Estimate',
                      'Number of Patients', 'Number of Patients Returned',
                      'Start Date', 'End Date'], axis=1)


In [7]:
cols = ['Facility ID', 'Facility Name', 'file_month', 'file_year', 'Start Date', 'End Date']
main_df = pd.DataFrame(columns=cols)
Measures = sorted(df['Measure Name'].unique())

for i, mi in enumerate(Measures):    
    tdf = df[df['Measure Name'] == mi]
    measures = sorted(tdf['Measure Name'].unique())
    
    df2 = pd.DataFrame(columns=cols)

    for j, m in enumerate(measures):
        tdf2 = tdf[tdf['Measure Name'] == m]
        for n in list(tdf2):
            if n == 'Measure Name' or n in cols:
                continue
            else:
                tdf2[n] = pd.to_numeric(tdf2[n], errors='coerce')
                tdf2.rename(columns={n: m + ' (' + n + ')'}, inplace=True)
        
        tdf2.drop(labels=['Measure Name'], axis=1, inplace=True)
        
        df2 = df2.merge(tdf2, on=cols, how='outer')
    
    main_df = main_df.merge(df2, on=cols, how='outer')

tdf = main_df.copy(deep=True)
del df2, main_df

print(tdf.shape)
tdf = tdf.loc[:, ~tdf.T.duplicated(keep='first')]
tdf.dropna(how='all', axis=1, inplace=True)
print(tdf.shape)
tdf.drop_duplicates(inplace=True)
print(tdf.shape)
tdf.drop_duplicates(subset = cols, inplace=True)
print(tdf.shape)
tdf.head()


(346283, 96)
(346283, 71)
(346283, 71)
(346283, 71)


Unnamed: 0,Facility ID,Facility Name,file_month,file_year,30-Day HF Readmission Rate (READM-30 HF) (Denominator),30-Day HF Readmission Rate (READM-30 HF) (Score),30-Day HF Readmission Rate (READM-30 HF) (Higher Estimate),30-Day HF Readmission Rate (READM-30 HF) (Lower Estimate),Start Date,End Date,30-Day Hospital-Wide All-Cause Unplanned Readmission Rate (READM-30 HOSP-WIDE) (Denominator),30-Day Hospital-Wide All-Cause Unplanned Readmission Rate (READM-30 HOSP-WIDE) (Score),30-Day Hospital-Wide All-Cause Unplanned Readmission Rate (READM-30 HOSP-WIDE) (Higher Estimate),30-Day Hospital-Wide All-Cause Unplanned Readmission Rate (READM-30 HOSP-WIDE) (Lower Estimate),30-Day Pneumonia Readmission Rate (READM-30 PN) (Denominator),30-Day Pneumonia Readmission Rate (READM-30 PN) (Score),30-Day Pneumonia Readmission Rate (READM-30 PN) (Higher Estimate),30-Day Pneumonia Readmission Rate (READM-30 PN) (Lower Estimate),30-Day Readmission Rate (READM-30 COPD) (Denominator),30-Day Readmission Rate (READM-30 COPD) (Score),30-Day Readmission Rate (READM-30 COPD) (Higher Estimate),30-Day Readmission Rate (READM-30 COPD) (Lower Estimate),30-Day Readmission Rate after hip/knee replacement (READM-30 HIP-KNEE) (Denominator),30-Day Readmission Rate after hip/knee replacement (READM-30 HIP-KNEE) (Score),30-Day Readmission Rate after hip/knee replacement (READM-30 HIP-KNEE) (Higher Estimate),30-Day Readmission Rate after hip/knee replacement (READM-30 HIP-KNEE) (Lower Estimate),30-Day Readmission Rate for stroke patients (READM-30 STK) (Denominator),30-Day Readmission Rate for stroke patients (READM-30 STK) (Score),30-Day Readmission Rate for stroke patients (READM-30 STK) (Higher Estimate),30-Day Readmission Rate for stroke patients (READM-30 STK) (Lower Estimate),AMI 30-Day Readmission Rate (READM-30 AMI) (Denominator),AMI 30-Day Readmission Rate (READM-30 AMI) (Score),AMI 30-Day Readmission Rate (READM-30 AMI) (Higher Estimate),AMI 30-Day Readmission Rate (READM-30 AMI) (Lower Estimate),CABG 30-Day Readmission Rate (READM-30 CABG) (Denominator),CABG 30-Day Readmission Rate (READM-30 CABG) (Score),CABG 30-Day Readmission Rate (READM-30 CABG) (Higher Estimate),CABG 30-Day Readmission Rate (READM-30 CABG) (Lower Estimate),ED visit rate for patients receiving outpatient chemo (OP-35 ED) (Denominator),ED visit rate for patients receiving outpatient chemo (OP-35 ED) (Score),ED visit rate for patients receiving outpatient chemo (OP-35 ED) (Higher Estimate),ED visit rate for patients receiving outpatient chemo (OP-35 ED) (Lower Estimate),Hospital return days for AMI patients (EDAC-30 AMI) (Denominator),Hospital return days for AMI patients (EDAC-30 AMI) (Score),Hospital return days for AMI patients (EDAC-30 AMI) (Higher Estimate),Hospital return days for AMI patients (EDAC-30 AMI) (Lower Estimate),Hospital return days for AMI patients (EDAC-30 AMI) (Number of Patients),Hospital return days for AMI patients (EDAC-30 AMI) (Number of Patients Returned),Hospital return days for HF patients (EDAC-30 HF) (Denominator),Hospital return days for HF patients (EDAC-30 HF) (Score),Hospital return days for HF patients (EDAC-30 HF) (Higher Estimate),Hospital return days for HF patients (EDAC-30 HF) (Lower Estimate),Hospital return days for HF patients (EDAC-30 HF) (Number of Patients),Hospital return days for HF patients (EDAC-30 HF) (Number of Patients Returned),Hospital return days for PN patients (EDAC-30 PN) (Denominator),Hospital return days for PN patients (EDAC-30 PN) (Score),Hospital return days for PN patients (EDAC-30 PN) (Higher Estimate),Hospital return days for PN patients (EDAC-30 PN) (Lower Estimate),Hospital return days for PN patients (EDAC-30 PN) (Number of Patients),Hospital return days for PN patients (EDAC-30 PN) (Number of Patients Returned),Inpatient admit rate for patients receiving outpatient chemo (OP-35 ADM) (Score),Inpatient admit rate for patients receiving outpatient chemo (OP-35 ADM) (Higher Estimate),Inpatient admit rate for patients receiving outpatient chemo (OP-35 ADM) (Lower Estimate),Rate of unplanned visits after colonoscopy (per 1K) (OP-32) (Denominator),Rate of unplanned visits after colonoscopy (per 1K) (OP-32) (Score),Rate of unplanned visits after colonoscopy (per 1K) (OP-32) (Higher Estimate),Rate of unplanned visits after colonoscopy (per 1K) (OP-32) (Lower Estimate),Ratio of unplanned visits after outpatient surgery (OP-36) (Denominator),Ratio of unplanned visits after outpatient surgery (OP-36) (Score),Ratio of unplanned visits after outpatient surgery (OP-36) (Higher Estimate),Ratio of unplanned visits after outpatient surgery (OP-36) (Lower Estimate)
0,10001,SOUTHEAST HEALTH MEDICAL CENTER,1,2023,755.0,22.4,25.0,20.1,2018-07-01,2021-06-30,,,,,436.0,16.4,18.9,14.2,202.0,19.9,23.5,16.8,98.0,4.2,6.1,2.8,,,,,319.0,14.9,17.7,12.5,165.0,11.7,15.4,8.8,,,,,319.0,1.9,19.6,-13.2,305.0,81.0,755.0,21.8,39.6,4.8,618.0,211.0,436.0,-1.5,16.7,-17.3,412.0,99.0,,,,,,,,,,,
1,10005,MARSHALL MEDICAL CENTERS,1,2023,157.0,21.4,25.4,17.8,2018-07-01,2021-06-30,,,,,361.0,16.7,19.5,14.3,234.0,17.6,21.1,14.8,178.0,4.0,5.8,2.7,,,,,38.0,14.4,17.8,11.4,,,,,,,,,38.0,4.7,49.5,-28.7,38.0,9.0,157.0,9.3,36.5,-15.3,145.0,48.0,361.0,2.2,19.1,-12.2,328.0,98.0,,,,,,,,,,,
2,10006,NORTH ALABAMA MEDICAL CENTER,1,2023,550.0,20.0,22.7,17.5,2018-07-01,2021-06-30,,,,,538.0,19.5,22.2,17.2,235.0,17.7,21.0,14.9,246.0,4.8,6.7,3.4,,,,,312.0,16.5,19.7,13.8,109.0,15.6,20.6,11.6,,,,,312.0,25.9,45.1,8.4,301.0,80.0,550.0,-2.3,16.8,-18.8,456.0,136.0,538.0,42.3,62.6,24.5,499.0,147.0,,,,,,,,,,,
3,10007,MIZELL MEMORIAL HOSPITAL,1,2023,51.0,23.7,28.3,19.1,2018-07-01,2021-06-30,,,,,99.0,16.4,19.8,13.2,72.0,19.7,23.8,15.7,,,,,,,,,,,,,,,,,,,,,,,,,,,51.0,36.3,75.3,1.9,44.0,22.0,99.0,-12.6,6.9,-29.0,91.0,23.0,,,,,,,,,,,
4,10011,ST. VINCENT'S EAST,1,2023,298.0,19.2,22.5,16.3,2018-07-01,2021-06-30,,,,,346.0,17.5,20.4,15.0,101.0,20.6,24.7,17.0,36.0,4.2,6.4,2.7,,,,,139.0,16.0,19.7,13.0,61.0,13.8,18.4,10.1,,,,,139.0,13.0,36.7,-5.3,134.0,40.0,298.0,17.9,45.6,-4.8,268.0,73.0,346.0,7.7,28.0,-9.5,317.0,85.0,,,,,,,,,,,


In [8]:
ttdf = tdf.drop(labels=['Start Date', 'End Date'], axis=1)
ttdf.drop_duplicates(inplace=True)
print(ttdf.shape)


(346283, 69)


In [9]:

start_time = time.time()
ttdf['marker'] = ttdf['Facility ID'] + ' | ' + ttdf['Facility Name']  + ' | ' + ttdf['file_month'] + ' | ' + ttdf['file_year']
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

start_time = time.time()
column = ttdf.pop('marker')
ttdf.insert(0, column.name, column)
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

start_time = time.time()
ttdf.drop_duplicates(inplace=True)
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

start_time = time.time()
collapsed_df = ttdf.groupby(ttdf.marker).apply(lambda group: group.ffill().bfill().head(1))
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

start_time = time.time()
ttdf = collapsed_df.reset_index(drop=True)
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

start_time = time.time()
ttdf.drop_duplicates(inplace=True)
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

ttdf.head()

Run time = 0.214 seconds
(346283, 70)
Run time = 0.003 seconds
(346283, 70)
Run time = 0.956 seconds
(346283, 70)
Run time = 1397.064 seconds
(346283, 70)
Run time = 0.078 seconds
(111463, 70)
Run time = 0.287 seconds
(111463, 70)


Unnamed: 0,marker,Facility ID,Facility Name,file_month,file_year,30-Day HF Readmission Rate (READM-30 HF) (Denominator),30-Day HF Readmission Rate (READM-30 HF) (Score),30-Day HF Readmission Rate (READM-30 HF) (Higher Estimate),30-Day HF Readmission Rate (READM-30 HF) (Lower Estimate),30-Day Hospital-Wide All-Cause Unplanned Readmission Rate (READM-30 HOSP-WIDE) (Denominator),30-Day Hospital-Wide All-Cause Unplanned Readmission Rate (READM-30 HOSP-WIDE) (Score),30-Day Hospital-Wide All-Cause Unplanned Readmission Rate (READM-30 HOSP-WIDE) (Higher Estimate),30-Day Hospital-Wide All-Cause Unplanned Readmission Rate (READM-30 HOSP-WIDE) (Lower Estimate),30-Day Pneumonia Readmission Rate (READM-30 PN) (Denominator),30-Day Pneumonia Readmission Rate (READM-30 PN) (Score),30-Day Pneumonia Readmission Rate (READM-30 PN) (Higher Estimate),30-Day Pneumonia Readmission Rate (READM-30 PN) (Lower Estimate),30-Day Readmission Rate (READM-30 COPD) (Denominator),30-Day Readmission Rate (READM-30 COPD) (Score),30-Day Readmission Rate (READM-30 COPD) (Higher Estimate),30-Day Readmission Rate (READM-30 COPD) (Lower Estimate),30-Day Readmission Rate after hip/knee replacement (READM-30 HIP-KNEE) (Denominator),30-Day Readmission Rate after hip/knee replacement (READM-30 HIP-KNEE) (Score),30-Day Readmission Rate after hip/knee replacement (READM-30 HIP-KNEE) (Higher Estimate),30-Day Readmission Rate after hip/knee replacement (READM-30 HIP-KNEE) (Lower Estimate),30-Day Readmission Rate for stroke patients (READM-30 STK) (Denominator),30-Day Readmission Rate for stroke patients (READM-30 STK) (Score),30-Day Readmission Rate for stroke patients (READM-30 STK) (Higher Estimate),30-Day Readmission Rate for stroke patients (READM-30 STK) (Lower Estimate),AMI 30-Day Readmission Rate (READM-30 AMI) (Denominator),AMI 30-Day Readmission Rate (READM-30 AMI) (Score),AMI 30-Day Readmission Rate (READM-30 AMI) (Higher Estimate),AMI 30-Day Readmission Rate (READM-30 AMI) (Lower Estimate),CABG 30-Day Readmission Rate (READM-30 CABG) (Denominator),CABG 30-Day Readmission Rate (READM-30 CABG) (Score),CABG 30-Day Readmission Rate (READM-30 CABG) (Higher Estimate),CABG 30-Day Readmission Rate (READM-30 CABG) (Lower Estimate),ED visit rate for patients receiving outpatient chemo (OP-35 ED) (Denominator),ED visit rate for patients receiving outpatient chemo (OP-35 ED) (Score),ED visit rate for patients receiving outpatient chemo (OP-35 ED) (Higher Estimate),ED visit rate for patients receiving outpatient chemo (OP-35 ED) (Lower Estimate),Hospital return days for AMI patients (EDAC-30 AMI) (Denominator),Hospital return days for AMI patients (EDAC-30 AMI) (Score),Hospital return days for AMI patients (EDAC-30 AMI) (Higher Estimate),Hospital return days for AMI patients (EDAC-30 AMI) (Lower Estimate),Hospital return days for AMI patients (EDAC-30 AMI) (Number of Patients),Hospital return days for AMI patients (EDAC-30 AMI) (Number of Patients Returned),Hospital return days for HF patients (EDAC-30 HF) (Denominator),Hospital return days for HF patients (EDAC-30 HF) (Score),Hospital return days for HF patients (EDAC-30 HF) (Higher Estimate),Hospital return days for HF patients (EDAC-30 HF) (Lower Estimate),Hospital return days for HF patients (EDAC-30 HF) (Number of Patients),Hospital return days for HF patients (EDAC-30 HF) (Number of Patients Returned),Hospital return days for PN patients (EDAC-30 PN) (Denominator),Hospital return days for PN patients (EDAC-30 PN) (Score),Hospital return days for PN patients (EDAC-30 PN) (Higher Estimate),Hospital return days for PN patients (EDAC-30 PN) (Lower Estimate),Hospital return days for PN patients (EDAC-30 PN) (Number of Patients),Hospital return days for PN patients (EDAC-30 PN) (Number of Patients Returned),Inpatient admit rate for patients receiving outpatient chemo (OP-35 ADM) (Score),Inpatient admit rate for patients receiving outpatient chemo (OP-35 ADM) (Higher Estimate),Inpatient admit rate for patients receiving outpatient chemo (OP-35 ADM) (Lower Estimate),Rate of unplanned visits after colonoscopy (per 1K) (OP-32) (Denominator),Rate of unplanned visits after colonoscopy (per 1K) (OP-32) (Score),Rate of unplanned visits after colonoscopy (per 1K) (OP-32) (Higher Estimate),Rate of unplanned visits after colonoscopy (per 1K) (OP-32) (Lower Estimate),Ratio of unplanned visits after outpatient surgery (OP-36) (Denominator),Ratio of unplanned visits after outpatient surgery (OP-36) (Score),Ratio of unplanned visits after outpatient surgery (OP-36) (Higher Estimate),Ratio of unplanned visits after outpatient surgery (OP-36) (Lower Estimate)
0,010001 | SOUTHEAST ALABAMA MEDICAL CENTER | 01...,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1,2018,1052.0,20.9,23.1,18.9,5411.0,15.6,17.3,14.2,656.0,18.5,21.2,16.1,615.0,19.7,22.3,17.3,342.0,4.7,6.4,3.5,568.0,12.2,14.5,10.2,810.0,16.1,18.2,14.2,275.0,14.3,17.7,11.5,,,,,810.0,-5.1,6.2,-15.5,,,1052.0,12.3,27.0,-1.0,,,,,,,,,,,,255.0,14.8,21.0,10.2,,,,
1,010001 | SOUTHEAST ALABAMA MEDICAL CENTER | 01...,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1,2020,1114.0,22.6,24.8,20.5,4659.0,15.4,17.0,13.9,604.0,16.8,19.2,14.6,493.0,19.8,22.7,17.4,301.0,4.7,6.3,3.5,,,,,742.0,16.2,18.5,14.0,279.0,15.6,19.3,12.5,190.0,7.1,9.9,5.0,742.0,-0.8,10.6,-11.6,,,1114.0,17.8,31.5,4.5,,,604.0,-7.8,6.0,-21.3,,,10.8,13.9,8.3,606.0,13.4,18.1,9.8,993.0,0.8,1.0,0.7
2,010001 | SOUTHEAST ALABAMA MEDICAL CENTER | 01...,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1,2021,1106.0,23.4,25.7,21.4,4474.0,15.6,17.4,14.0,594.0,15.4,17.7,13.3,443.0,20.7,23.7,18.1,258.0,4.6,6.3,3.4,,,,,620.0,16.3,18.8,14.1,268.0,14.9,18.6,11.9,192.0,6.7,9.4,4.6,620.0,7.0,21.7,-5.4,589.0,171.0,1106.0,21.6,36.3,7.8,842.0,310.0,594.0,-20.6,-7.8,-32.9,540.0,132.0,11.3,14.2,8.8,511.0,14.1,19.2,10.4,1003.0,0.9,1.0,0.7
3,010001 | SOUTHEAST ALABAMA MEDICAL CENTER | 03...,10001,SOUTHEAST ALABAMA MEDICAL CENTER,3,2019,1060.0,21.8,24.1,19.8,5232.0,15.4,17.0,13.9,603.0,17.4,20.0,15.0,568.0,19.4,22.1,17.0,335.0,4.3,5.8,3.2,610.0,12.4,14.7,10.5,798.0,15.9,18.2,13.8,290.0,14.9,18.3,12.0,,,,,798.0,-5.9,4.8,-15.9,,,1060.0,17.1,31.7,3.4,,,603.0,8.4,24.4,-5.9,,,,,,205.0,13.9,19.2,10.0,,,,
4,010001 | SOUTHEAST ALABAMA MEDICAL CENTER | 03...,10001,SOUTHEAST ALABAMA MEDICAL CENTER,3,2021,1106.0,23.4,25.7,21.4,4474.0,15.6,17.4,14.0,594.0,15.4,17.7,13.3,443.0,20.7,23.7,18.1,258.0,4.6,6.3,3.4,,,,,620.0,16.3,18.8,14.1,268.0,14.9,18.6,11.9,192.0,6.7,9.4,4.6,620.0,7.0,21.7,-5.4,589.0,171.0,1106.0,21.6,36.3,7.8,842.0,310.0,594.0,-20.6,-7.8,-32.9,540.0,132.0,11.3,14.2,8.8,511.0,14.1,19.2,10.4,1003.0,0.9,1.0,0.7


## Save dataframe

In [10]:
ttdf.drop(labels=['marker'], axis=1, inplace=True)
ttdf.to_pickle('~/GitHub/hospitals-data-archive/dataframes/partial_dataframes/unplanned_visits_df.pkl.gz', protocol=5, compression='gzip')


In [11]:
m1 = list(ttdf)
ls = ['Facility ID','Facility Name','file_month','file_year']
for l in ls: 
    m1.remove(l)

## Save measurement dates

In [12]:
# Columns to keep as is
id_cols = ['Facility ID', 'Facility Name', 'file_month', 'file_year', 'Start Date', 'End Date']

# Melt the specific columns and create the 'Measure' and 'Score' columns
measures_df = tdf.melt(id_vars=id_cols, var_name='Measure Name', value_name='Score')
measures_df.drop(labels=['Score', 'Facility ID', 'Facility Name'], axis=1, inplace=True)

print(measures_df.shape)
measures_df.drop_duplicates(inplace=True)
measures_df.reset_index(drop=True, inplace=True)
print(measures_df.shape)

measures_df['Start Date'] = pd.to_datetime(measures_df['Start Date'])
measures_df['End Date'] = pd.to_datetime(measures_df['End Date'])
measures_df.to_csv('~/GitHub/hospitals-data-archive/measure_dates/unplanned_visits_df.csv')

measures_df.head()

(22508395, 5)
(6240, 5)


Unnamed: 0,file_month,file_year,Start Date,End Date,Measure Name
0,1,2023,2018-07-01,2021-06-30,30-Day HF Readmission Rate (READM-30 HF) (Deno...
1,4,2023,2018-07-01,2021-06-30,30-Day HF Readmission Rate (READM-30 HF) (Deno...
2,7,2023,2019-07-01,2022-06-30,30-Day HF Readmission Rate (READM-30 HF) (Deno...
3,10,2023,2019-07-01,2022-06-30,30-Day HF Readmission Rate (READM-30 HF) (Deno...
4,1,2022,2017-07-01,2019-12-01,30-Day HF Readmission Rate (READM-30 HF) (Deno...


In [13]:
m2 = measures_df['Measure Name'].unique().tolist()
sorted(m1) == sorted(m2)

True