In [1]:
from collections import namedtuple
from pathlib import Path, PosixPath

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

from src import COMPLETED

pd.set_option('display.precision', 2)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [2]:
Hospital = namedtuple('Hospital', ['cms', 'location', 'fname', 'url'])

In [3]:
PRICE_COLS = ['cms_certification_num', 'payer', 'code', 'internal_revenue_code',
              #'units',
              'description', 'inpatient_outpatient', 'price', 'code_disambiguator']

PK_COLS = ['cms_certification_num', 'code', 'inpatient_outpatient', 'internal_revenue_code',
           'code_disambiguator', 'payer']

CHARGEMASTER_URL = "https://www.swedish.org/patient-visitor-info/pricing-transparency-overview"

In [4]:
RECORDS = [
    Hospital('500027', 'first_hill', '500027_SwedishFirstHill_StandardCharges.xlsx', 'https://www.swedish.org/locations/first-hill-campus'),
    Hospital('500025', 'cherry_hill', '500025_SwedishCherryHill_StandardCharges.xlsx', 'https://www.swedish.org/locations/cherry-hill-campus'),
    Hospital('500026', 'edmonds', '500026_SwedishEdmonds_StandardCharges.xlsx', 'https://www.swedish.org/locations/edmonds-campus'),
    Hospital('500152', 'issaquah', '500152_SwedishIssaquah_StandardCharges.xlsx', 'https://www.swedish.org/locations/issaquah-campus'),

    # not in hospitals table
    Hospital('500027B', 'ballard', '500027B_SwedishBallard_StandardCharges.xlsx', 'https://www.swedish.org/locations/ballard-campus'),
    Hospital('500027M', 'mill_creek', '500027M_SwedishMillCreek_StandardCharges.xlsx', 'https://www.swedish.org/locations/mill-creek-campus'),
    Hospital('500027R', 'redmond', '500027R_SwedishRedmond_StandardCharges.xlsx', 'https://www.swedish.org/locations/redmond-campus'),  # outpatient only
]

In [5]:
def load_sql_or_csv(fname: PosixPath) -> pd.DataFrame:
    """Either load saved csv or load from database and save as csv"""
    if fname.exists():
        print('loading csv')
        return pd.read_csv(fname, dtype={'cms_certification_num': str})
    else:
        print('loading from database')
        engine = create_engine("mysql+pymysql://root@localhost/hospital_price_transparency_v3")
        remaining_wa = pd.read_sql("select * from hospitals WHERE last_edited_by_username IS NULL AND state = 'WA'", engine)
        remaining_wa['cms_certification_num'] = remaining_wa['cms_certification_num'].astype(str)
        remaining_wa.to_csv('remaining_wa.csv', index=False)

        return remaining_wa

def create_hospital_row(df: pd.DataFrame, cms: str, homepage_url: str, charge_url: str) -> pd.DataFrame:
    t = df[df['cms_certification_num'] == cms].copy()
    t['homepage_url'] = homepage_url
    t['chargemaster_url'] = charge_url
    t['last_edited_by_username'] = 'joeeoj'

    return t

def add_prices(prices: pd.Series) -> float:
    """Convert to cents, sum, then convert back to float"""
    cents = [0 if pd.isnull(p) else int(p) * 100 for p in prices]
    return sum(cents)/100.0

In [6]:
remaining_wa = load_sql_or_csv(Path('remaining_wa.csv'))
remaining_wa = remaining_wa[~remaining_wa.isin(COMPLETED)]

remaining_wa[remaining_wa['name'].str.contains('SWEDISH')]

loading csv


Unnamed: 0,cms_certification_num,name,address,city,state,zip5,beds,phone_number,homepage_url,chargemaster_url,last_edited_by_username
19,500026,SWEDISH EDMONDS HOSPITAL,21601 76TH AVENUE WEST,EDMONDS,WA,98026,217,4256404000,,,
30,500152,SWEDISH ISSAQUAH,751 NE BLAKELY DR,ISSAQUAH,WA,98029,120,4253134000,,,
61,500025,SWEDISH MEDICAL CENTER / CHERRY HILL,500 17TH AVENUE,SEATTLE,WA,98122,376,2063202000,,,
62,500027,SWEDISH MEDICAL CENTER,747 BROADWAY,SEATTLE,WA,98122,697,2063866000,,,


## hospital

In [7]:
missing_from_table = set(['500027B', '500027M', '500027R'])
hospitals = []

for record in RECORDS:
    if record.cms not in missing_from_table:
        hospitals.append(create_hospital_row(remaining_wa, record.cms, record.url, CHARGEMASTER_URL))

hospital = pd.concat(hospitals)

hospital.to_csv('hospitals.csv', index=False)

hospital

Unnamed: 0,cms_certification_num,name,address,city,state,zip5,beds,phone_number,homepage_url,chargemaster_url,last_edited_by_username
62,500027,SWEDISH MEDICAL CENTER,747 BROADWAY,SEATTLE,WA,98122,697,2063866000,https://www.swedish.org/locations/first-hill-c...,https://www.swedish.org/patient-visitor-info/p...,joeeoj
61,500025,SWEDISH MEDICAL CENTER / CHERRY HILL,500 17TH AVENUE,SEATTLE,WA,98122,376,2063202000,https://www.swedish.org/locations/cherry-hill-...,https://www.swedish.org/patient-visitor-info/p...,joeeoj
19,500026,SWEDISH EDMONDS HOSPITAL,21601 76TH AVENUE WEST,EDMONDS,WA,98026,217,4256404000,https://www.swedish.org/locations/edmonds-campus,https://www.swedish.org/patient-visitor-info/p...,joeeoj
30,500152,SWEDISH ISSAQUAH,751 NE BLAKELY DR,ISSAQUAH,WA,98029,120,4253134000,https://www.swedish.org/locations/issaquah-campus,https://www.swedish.org/patient-visitor-info/p...,joeeoj


## prices

In [8]:
# cols are slightly different names in the different sheets but in the same order
COLS = ['internal_revenue_code', 'description', 'code', 'both_unit_price', 'both_base_price',
        'outpatient_unit_price', 'outpatient_base_price']

EDMOND_COLS = ['internal_revenue_code', 'description', 'code', 'both_unit_price', 'both_base_price',
               'outpatient_unit_price', 'outpatient_base_price', 'edmonds_only_both_price']

In [9]:
def open_or_load_parquet(fname: str) -> pd.DataFrame:
    """Open saved parquet, otherwise open given fname and save as parquet.
    Basically speed up reloads because reading Excel files is slow."""
    p = Path(fname)
    fout = p.with_suffix('.parquet')

    if not fout.exists():
        header = EDMOND_COLS if 'Edmonds' in fname else COLS

        gross = pd.read_excel(fname, sheet_name='Gross Charges', skiprows=[0,1,2,3])
        gross.columns = header
        gross['payer'] = 'GROSS CHARGE'
    
        cash = pd.read_excel(fname, sheet_name='Discount Cash Price - Gross', skiprows=[0,1,2,3])
        cash.columns = header
        cash['payer'] = 'CASH PRICE'

        df = pd.concat([gross, cash])
    
        round_cols = ['both_unit_price', 'both_base_price', 'outpatient_unit_price', 'outpatient_base_price']
        both_add_cols = ['both_unit_price', 'both_base_price']
        
        if 'Edmonds' in fname:
            round_cols += ['edmonds_only_both_price']
            both_add_cols += ['edmonds_only_both_price']

        for col in round_cols:
            df[col] = df[col].round(2)  # i hate it but it works

        df['both_total'] = df[both_add_cols].apply(add_prices, axis=1).round(2)
        df['outpatient_total'] = df[['outpatient_unit_price', 'outpatient_base_price']].apply(add_prices, axis=1).round(2)

        df.to_parquet(fout)

    return pd.read_parquet(fout)

In [10]:
def parse_swedish(hospital: Hospital) -> pd.DataFrame:
    orig = open_or_load_parquet(hospital.fname)

    # marked as IP / OP hence BOTH
    both = (orig[['internal_revenue_code', 'description', 'code', 'payer', 'both_total']]
            .rename(columns={'both_total': 'price'}))
    both['inpatient_outpatient'] = 'BOTH'

    outpatient = (orig[['internal_revenue_code', 'description', 'code', 'payer', 'outpatient_total']]
                  .rename(columns={'outpatient_total': 'price'}))
    outpatient['inpatient_outpatient'] = 'OUTPATIENT'

    df = pd.concat([both, outpatient])

    df['cms_certification_num'] = hospital.cms
    df['code_disambiguator'] = 'NONE'
    df['description'] = df['description'].str.strip()

    print(f'Dropping {df.duplicated(keep=False).sum():,} fully duplicated rows')
    df = df.drop_duplicates()

    # drop where price is null
    print(f"Dropping {df['price'].isnull().sum():,} rows where price is null")
    df = df.dropna(subset='price')

    # same for $0
    zero_dollars = (df['price'] == 0)
    print(f"Dropping {zero_dollars.sum():,} rows where price is $0")
    df = df[~zero_dollars]

    # fill in blank codes with NONE
    assert df['code'].isnull().sum() > 0
    df['code'] = df['code'].fillna('NONE')

    # dupe checks
    assert df.duplicated().sum() == 0
    assert df[PK_COLS].duplicated().sum() == 0

    # other checks
    assert df['price'].isnull().sum() == 0
    assert (df['price'] == 0).sum() == 0
    assert df['internal_revenue_code'].isnull().sum() == 0

    print(f"min price: {df['price'].min():,.2f}")
    print(f"max price: {df['price'].max():,.2f}")

    print(f'Total rows: {len(df):,}')

    return df

In [11]:
for hospital in RECORDS:
    print(hospital.cms, hospital.fname)
    fout = f'prices_{hospital.location}.csv'

    df = parse_swedish(hospital)
    df.to_csv(fout, index=False)

    print('-' * 80)

500027 500027_SwedishFirstHill_StandardCharges.xlsx
Dropping 0 fully duplicated rows
Dropping 0 rows where price is null
Dropping 9,075 rows where price is $0
min price: 1.00
max price: 391,706.00
Total rows: 16,829
--------------------------------------------------------------------------------
500025 500025_SwedishCherryHill_StandardCharges.xlsx
Dropping 0 fully duplicated rows
Dropping 0 rows where price is null
Dropping 9,075 rows where price is $0
min price: 1.00
max price: 391,706.00
Total rows: 16,829
--------------------------------------------------------------------------------
500026 500026_SwedishEdmonds_StandardCharges.xlsx
Dropping 15,544 fully duplicated rows
Dropping 0 rows where price is null
Dropping 9,904 rows where price is $0
min price: 1.00
max price: 391,706.00
Total rows: 17,676
--------------------------------------------------------------------------------
500152 500152_SwedishIssaquah_StandardCharges.xlsx
Dropping 0 fully duplicated rows
Dropping 0 rows where