In [25]:
# Meta --------------------------------------------------------------------
## Title:         Summary of Changes table
## Author:        Wonjun Choi
## Date Created:  9/9/2022
## Date Edited:   9/9/2022

## Dependency: numpy, pandas, tabula
###########################################################################
import os
import numpy as np
import pandas as pd
import tabula

# Hardcoding part
pagecontent_2007 = {"Reg Del": [1,2], "Reg Add": [3], "Nonreg Del": [5,6], "Nonreg Add": [7,8,9], "Mergers and Acquisitions": [10]}
pagecontent_2008 = {"Reg Del": [1], "Reg Add": [2,3,4], "Nonreg Del": [6,7,8,9,10], "Nonreg Add": [11,12,13,14,15,16], "Mergers and Acquisitions": [17,18]}
pagecontent_2009 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2010 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2011 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2012 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2013 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2014 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2015 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2016 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2017 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2018 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2019 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}

pagecontent_all = {"2007": pagecontent_2007, "2008": pagecontent_2008, "2009": pagecontent_2009,
                  "2010": pagecontent_2010, "2011": pagecontent_2011, "2012": pagecontent_2012,
                  "2013": pagecontent_2013, "2014": pagecontent_2014, "2015": pagecontent_2015,
                  "2016": pagecontent_2016, "2017": pagecontent_2017, "2018": pagecontent_2018,
                  "2019": pagecontent_2019}

# functions
def tidy_nonreg_add(tab):
    """
    tidy tabula result of Nonregistered Additions
    
    the following algorithm can be written in more efficient way, but it works for now.
    
    CAVEAT: this code covers upto 3 line splits (2 lines of NA's)
    """
    t = tab.copy()
    t.columns = t.iloc[0]
    t = t[1:].reset_index(drop=True)

    names = t.columns
    for n in names:
        for i in [1,2]:
            t[n+str(i+1)] = t[n].shift(-i)  # create lead(1), lead(2) columns
    
    for r in range(len(t)):
        for n in ['HOSPITAL NAME', 'CITY', 'STATE']:       # e.g. if CITY2 = NA,
            t[n+str(2)] = t[n+str(2)].fillna(t[n+str(3)])  #      push CITY3 into CITY2

        if pd.isna(t.loc[r,"ID2"]):
            t.loc[r,'ADDITION2'] = str(t.loc[r, 'ADDITION2']) + ' ' + str(t.loc[r, 'ADDITION3'])
            t.loc[r,'ID2'] = t.loc[r,'ID3']  # e.g. if ID2 = NA, merge ADDITION2 and 3, push ID3->2.

    for r in range(len(t)):  # repeat above to (NAME2 -> NAME1) pairs
        for n in ['HOSPITAL NAME', 'CITY', 'STATE']:
            t[n] = t[n].fillna(t[n+str(2)])

        if pd.isna(t.loc[r,"ID"]):
            t.loc[r,'ADDITION'] = str(t.loc[r, 'ADDITION']) + ' ' + str(t.loc[r, 'ADDITION2'])
            t.loc[r,'ID'] = t.loc[r,'ID2']
        
    t = t[['ID', 'ADDITION', 'HOSPITAL NAME', 'CITY', 'STATE']]
    t = t.loc[~t.duplicated('ID')].reset_index().drop('index', axis=1)
    t.columns = ['ID', 'REASON FOR ADDITION', 'HOSPITAL NAME', 'CITY', 'STATE']
    
    return t

def tidy_merger(tab):
    t = tab.copy()
    t = t.loc[1:]

    t['Unnamed: 0'] = t['ID NAME'].apply(lambda idname: ' '.join(idname.split(' ')[1:]) if pd.isna(idname)==False else np.nan)
    t['ID NAME'] = t['ID NAME'].apply(lambda idname: idname.split(' ')[0] if pd.isna(idname)==False else np.nan)

    t['Unnamed: 1'] = t['MERGER MERGED NAME'].apply(lambda mmn: ' '.join(mmn.split(' ')[1:]) if pd.isna(mmn)==False else np.nan)
    t['MERGER MERGED NAME'] = t['MERGER MERGED NAME'].apply(lambda mmn: mmn.split(' ')[0] if pd.isna(mmn)==False else np.nan)

    t['MERGED STATE'] = t['MERGED CITY MERGED'].apply(lambda mcm: mcm.split(' ')[-1] if pd.isna(mcm)==False else np.nan)
    t['MERGED CITY MERGED'] = t['MERGED CITY MERGED'].apply(lambda mcm: ' '.join(mcm.split(' ')[:-1]) if pd.isna(mcm)==False else np.nan)

    t = t.rename(columns = {'ID NAME': 'ID', 'Unnamed: 0': 'NAME', 'MERGER MERGED NAME': 'MERGER RESULT ID',
                       'Unnamed: 1': 'MERGED NAME', 'MERGED CITY MERGED': 'MERGED CITY'})
    t = t.reset_index().drop(['index'],axis=1)
    
    # If na, copy the value from above
    # this code assumes the first row never contains NaN
    while pd.isna(t).values.any():
#        iterations = 1
#        print(iterations)
        for row in range(len(t)):
            if pd.isna(t.loc[row]).values.any():
                for name in ['MERGER RESULT ID', 'MERGED NAME', 'MERGED CITY', 'MERGED STATE']:
                    t.loc[row][name] = t.loc[row-1][name]
#        iterations += 1
    return t

##########################################################################
# make pdf tables into csv

dir_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# dir_root
# os.chdir(dir_root)
for year in range(2007,2009):
    file_pdf = os.path.join(dir_root,'data','input','AHA FY {}'.format(year),'DOC','{} Summary of Changes.pdf'.format(year))
    tables = tabula.read_pdf(file_pdf, pages='all', multiple_tables=True)
    
    pagecontent = pagecontent_all[str(year)]
    
    table_reg_del = pd.concat([tables[page] for page in pagecontent["Reg Del"]], ignore_index=True)
    table_reg_del = table_reg_del.dropna().astype({"ID": int})  # formatting
    
    table_reg_add = pd.concat([tables[page] for page in pagecontent["Reg Add"]], ignore_index=True)
    table_reg_add = table_reg_add.dropna().astype({"ID": int})
    
    table_nonreg_del = pd.concat([tables[page] for page in pagecontent["Nonreg Del"]], ignore_index=True)
    table_nonreg_del = table_nonreg_del.dropna().astype({"ID": int})  # formatting
    
    table_nonreg_add = pd.concat([tidy_nonreg_add(tables[page]) for page in pagecontent["Nonreg Add"]], ignore_index=True)
    table_nonreg_add = table_nonreg_add.dropna().astype({"ID": int})  # formatting
    
    table_merger = pd.concat([tidy_merger(tables[page]) for page in pagecontent["Mergers and Acquisitions"]], ignore_index=True)
    
    # save csv
    outfile_base = os.path.join(dir_root, 'data','temp')
    table_reg_del.to_csv(os.path.join(outfile_base,'change_reg_del_{}.csv'.format(year)), header=True, index=False)
    table_reg_add.to_csv(os.path.join(outfile_base,'change_reg_add_{}.csv'.format(year)), header=True, index=False)
    table_nonreg_del.to_csv(os.path.join(outfile_base,'change_nonreg_del_{}.csv'.format(year)), header=True, index=False)
    table_nonreg_add.to_csv(os.path.join(outfile_base,'change_nonreg_add_{}.csv'.format(year)), header=True, index=False)
    table_merger.to_csv(os.path.join(outfile_base,'change_merger_{}.csv'.format(year)), header=True, index=False)
    
    
print("I'm Done!!")

UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('int64'), dtype('<U1')) -> None

In [25]:
# Meta --------------------------------------------------------------------
## Title:         Summary of Changes table
## Author:        Wonjun Choi
## Date Created:  9/9/2022
## Date Edited:   9/9/2022

## Dependency: numpy, pandas, tabula
###########################################################################
import os
import numpy as np
import pandas as pd
import tabula

# Hardcoding part
pagecontent_2007 = {"Reg Del": [1,2], "Reg Add": [3], "Nonreg Del": [5,6], "Nonreg Add": [7,8,9], "Mergers and Acquisitions": [10]}
pagecontent_2008 = {"Reg Del": [1], "Reg Add": [2,3,4], "Nonreg Del": [6,7,8,9,10], "Nonreg Add": [11,12,13,14,15,16], "Mergers and Acquisitions": [17,18]}
pagecontent_2009 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2010 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2011 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2012 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2013 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2014 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2015 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2016 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2017 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2018 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}
pagecontent_2019 = {"Reg Del": [], "Reg Add": [], "Nonreg Del": [], "Nonreg Add": [], "Mergers and Acquisitions": []}

pagecontent_all = {"2007": pagecontent_2007, "2008": pagecontent_2008, "2009": pagecontent_2009,
                  "2010": pagecontent_2010, "2011": pagecontent_2011, "2012": pagecontent_2012,
                  "2013": pagecontent_2013, "2014": pagecontent_2014, "2015": pagecontent_2015,
                  "2016": pagecontent_2016, "2017": pagecontent_2017, "2018": pagecontent_2018,
                  "2019": pagecontent_2019}

# functions
def tidy_nonreg_del(tab):
    t = tab.copy()
    for name in ['REASON FOR DELETION', 'HOSPITAL NAME']:
        t[name] = t[name].apply(lambda text: text.replace("\r"," "))
        
    return t

def tidy_nonreg_add(tab):
    """
    tidy tabula result of Nonregistered Additions
    
    the following algorithm can be written in more efficient way, but it works for now.
    
    CAVEAT: this code covers upto 3 line splits (2 lines of NA's)
    """
    t = tab.copy()
    
    if year is 2007:  # year 2007 document sucks
        t.columns = t.iloc[0]
        t = t[1:].reset_index(drop=True)

        names = t.columns
        for n in names:
            for i in [1,2]:
                t[n+str(i+1)] = t[n].shift(-i)  # create lead(1), lead(2) columns

        for r in range(len(t)):
            for n in ['HOSPITAL NAME', 'CITY', 'STATE']:       # e.g. if CITY2 = NA,
                t[n+str(2)] = t[n+str(2)].fillna(t[n+str(3)])  #      push CITY3 into CITY2

            if pd.isna(t.loc[r,"ID2"]):
                t.loc[r,'ADDITION2'] = str(t.loc[r, 'ADDITION2']) + ' ' + str(t.loc[r, 'ADDITION3'])
                t.loc[r,'ID2'] = t.loc[r,'ID3']  # e.g. if ID2 = NA, merge ADDITION2 and 3, push ID3->2.

        for r in range(len(t)):  # repeat above to (NAME2 -> NAME1) pairs
            for n in ['HOSPITAL NAME', 'CITY', 'STATE']:
                t[n] = t[n].fillna(t[n+str(2)])

            if pd.isna(t.loc[r,"ID"]):
                t.loc[r,'ADDITION'] = str(t.loc[r, 'ADDITION']) + ' ' + str(t.loc[r, 'ADDITION2'])
                t.loc[r,'ID'] = t.loc[r,'ID2']

        t = t[['ID', 'ADDITION', 'HOSPITAL NAME', 'CITY', 'STATE']]
        t = t.loc[~t.duplicated('ID')].reset_index().drop('index', axis=1)
        t.columns = ['ID', 'REASON FOR ADDITION', 'HOSPITAL NAME', 'CITY', 'STATE']
    
    if year is not 2007:
        t.columns = [name.replace("\r"," ") for name in t.columns]
        t["REASON FOR ADDITION"] = t["REASON FOR ADDITION"].apply(lambda x: x.replace("\r"," "))
    
    return t

def tidy_merger(tab):
    t = tab.copy()
    t = t.loc[1:]

    t['Unnamed: 0'] = t['ID NAME'].apply(lambda idname: ' '.join(idname.split(' ')[1:]) if pd.isna(idname)==False else np.nan)
    t['ID NAME'] = t['ID NAME'].apply(lambda idname: idname.split(' ')[0] if pd.isna(idname)==False else np.nan)

    t['Unnamed: 1'] = t['MERGER MERGED NAME'].apply(lambda mmn: ' '.join(mmn.split(' ')[1:]) if pd.isna(mmn)==False else np.nan)
    t['MERGER MERGED NAME'] = t['MERGER MERGED NAME'].apply(lambda mmn: mmn.split(' ')[0] if pd.isna(mmn)==False else np.nan)

    t['MERGED STATE'] = t['MERGED CITY MERGED'].apply(lambda mcm: mcm.split(' ')[-1] if pd.isna(mcm)==False else np.nan)
    t['MERGED CITY MERGED'] = t['MERGED CITY MERGED'].apply(lambda mcm: ' '.join(mcm.split(' ')[:-1]) if pd.isna(mcm)==False else np.nan)

    t = t.rename(columns = {'ID NAME': 'ID', 'Unnamed: 0': 'NAME', 'MERGER MERGED NAME': 'MERGER RESULT ID',
                       'Unnamed: 1': 'MERGED NAME', 'MERGED CITY MERGED': 'MERGED CITY'})
    t = t.reset_index().drop(['index'],axis=1)
    
    # If na, copy the value from above
    # this code assumes the first row never contains NaN
    while pd.isna(t).values.any():
#        iterations = 1
#        print(iterations)
        for row in range(len(t)):
            if pd.isna(t.loc[row]).values.any():
                for name in ['MERGER RESULT ID', 'MERGED NAME', 'MERGED CITY', 'MERGED STATE']:
                    t.loc[row][name] = t.loc[row-1][name]
#        iterations += 1
    return t

##########################################################################
# make pdf tables into csv

dir_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# dir_root
# os.chdir(dir_root)
#for year in range(2007,2009):
year = 2008

file_pdf = os.path.join(dir_root,'data','input','AHA FY {}'.format(year),'DOC','{} Summary of Changes.pdf'.format(year))
tables = tabula.read_pdf(file_pdf, pages='all', multiple_tables=True)
    
pagecontent = pagecontent_all[str(year)]
    
table_reg_del = pd.concat([tables[page] for page in pagecontent["Reg Del"]], ignore_index=True)
table_reg_del = table_reg_del.dropna().astype({"ID": int})  # formatting
print("REG DEL")
    
table_reg_add = pd.concat([tables[page] for page in pagecontent["Reg Add"]], ignore_index=True)
table_reg_add = table_reg_add.dropna().astype({"ID": int})
print("REG ADD")    
    
table_nonreg_del = pd.concat([tidy_nonreg_del(tables[page]) for page in pagecontent["Nonreg Del"]], ignore_index=True)
table_nonreg_del = table_nonreg_del.dropna().astype({"ID": int})  # formatting
print("NON DEL")

table_nonreg_add = pd.concat([tidy_nonreg_add(tables[page]) for page in pagecontent["Nonreg Add"]], ignore_index=True)
table_nonreg_add = table_nonreg_add.dropna().astype({"ID": int})  # formatting
print("NON AGG")

table_merger = pd.concat([tidy_merger(tables[page]) for page in pagecontent["Mergers and Acquisitions"]], ignore_index=True)
print("MERGER")

print("I'm Done!!")

REG DEL
REG ADD
NON DEL
NON AGG


KeyError: 'ID NAME'

In [28]:
# table_reg_add[0:50]
# table_reg_add[51:100]
# table_reg_add[101:]

# table_nonreg_del[0:50]
# table_nonreg_del[51:100]
# table_nonreg_del[101:]

# table_nonreg_add[0:50]
# table_nonreg_add[51:100]
table_nonreg_add[101:]

Unnamed: 0,ID,REASON FOR ADDITION,HOSPITAL NAME,CITY,STATE
101,6850026,Newly added,Physicians Med Ctr of Santa Fe,Santa Fe,NM
102,6860042,Newly added,Gilbert Hospital,Gilbert,AZ
103,6860047,Newly added,Mountain Vista Medical Center,Mesa,AZ
104,6860233,Newly added,Aurora Behavioral Hlth Syst,Glendale,AZ
105,6870015,Newly added,Utah Valley Specialty Hospital,Provo,UT
106,6880034,Newly added,Vegas Valley Rehab Hospital,Las Vegas,NV
107,6880039,Newly added,Centennial Hills Hosp Med Ctr,Las Vegas,NV
108,6880041,Newly added,Spring Mountain Sahara,Las Vegas,NV
109,6910605,Formerly ambulatory care,Group Health Central Hospital,Seattle,WA
110,6920012,Newly added,Sacred Heart Medical Center,Springfield,OR


In [22]:
table_nonreg_add = pd.concat([tidy_nonreg_add(tables[page]) for page in pagecontent["Nonreg Add"]], ignore_index=True)
#table_nonreg_add = table_nonreg_add.dropna().astype({"ID": int})  # formatting


UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('int64'), dtype('<U1')) -> None

In [24]:
pagecontent["Nonreg Add"]
tables[11]

Unnamed: 0,ID,REASON FOR\rADDITION,HOSPITAL NAME,CITY,STATE
0,6140028,Newly Added,Kindred Hospital Park View,Rochdale,MA
1,6210060,Newly Added,Brooklyn Children's Psych Ctr,Brooklyn,NY
2,6210061,Newly Added,Eddy Cohoes Rehab Center,Cohoes,NY
3,6211135,Status changed\rfrom registered to,St Joseph Hospital,Cheektowaga,NY
4,6220028,nNoenwrleyg Aisdtedreedd,Kindred Hosp New Jersey-Rahway,Rahway,NJ
5,6220029,Newly Added,Kindred Hosp New Jersey-Wayne,Wayne,NJ
6,6220031,Newly Added,Specialty Hospital at Kimball,Lakewood,NJ
7,6220275,Newly added,South Jersey Healthcare-Elmer,Elmer,NJ
8,6230063,Newly Added,Good Shepherd Specialty Hosp,Bethlehem,PA
9,6230068,Newly Added,Triumph Hospital Easton,Easton,PA
