# check for dupes and convert NaNs to NONEs

In [1]:
import pandas as pd

# Show up to 100 characters per cell when frames are displayed.
# Use the full option name: abbreviated names are resolved by pattern matching
# and can become ambiguous if pandas adds a similarly named option.
pd.set_option('display.max_colwidth', 100)

In [2]:
# Columns that together must uniquely identify a price row.
PK_COLS = [
    'cms_certification_num',
    'code',
    'inpatient_outpatient',
    'internal_revenue_code',
    'code_disambiguator',
    'payer',
]

In [3]:
def update_code_disambiguator(row: pd.Series) -> str:
    """Build the new ``code_disambiguator`` value for a single row.

    Args:
        row: Exactly two values, in order: the current disambiguator and the
            value to add to it.

    Returns:
        The stringified value on its own when the current disambiguator is
        null or the literal ``'NONE'``; otherwise the current disambiguator,
        a ``'-'``, and the stringified value.
    """
    current, suffix = row
    suffix_text = str(suffix)

    has_prefix = not (pd.isnull(current) or current == 'NONE')
    return (current + '-' + suffix_text) if has_prefix else suffix_text

Some rows (mostly insurance payers) share the same primary key because they have little `code` data and no `internal_revenue_code` information. These rows are kept and made unique by numbering them through `code_disambiguator`.

In [4]:
df = pd.read_csv('firstpass_prices.csv')

# keep=False flags every member of a duplicated key, not just the repeats.
pk_dupes = df[PK_COLS].duplicated(keep=False)
print(f'Duplicate rows across primary key: {pk_dupes.sum():,}')

# deal with pk dupe cols using code_disambiguator
if pk_dupes.any():
    # Number the rows inside each duplicated key 1..k, ordered by description
    # so the numbering is stable between runs.
    # dropna=False is essential: by default groupby discards keys containing
    # NaN, and `code` is still unfilled at this point, so those rows would
    # otherwise get no number and stay duplicated.
    dupe_rank = (
        df[pk_dupes]
        .sort_values(PK_COLS + ['description'])
        .groupby(PK_COLS, dropna=False)
        .cumcount()
        + 1
    ).astype(pd.Int64Dtype())

    # Only the duplicated rows are rewritten; assign() aligns dupe_rank on the index.
    df.loc[pk_dupes, 'code_disambiguator'] = (
        df.loc[pk_dupes, ['code_disambiguator']]
        .assign(add_val=dupe_rank)
        .apply(update_code_disambiguator, axis=1)
    )

assert df[PK_COLS].duplicated().sum() == 0
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks like
# a saved row index; if it is unique per row this full-row check can never
# fail. Confirm whether the file should be read with index_col=0.
assert df.duplicated().sum() == 0

# check prices
assert df['price'].isnull().sum() == 0
assert (df['price'] == 0).sum() == 0

# fill in null code
# Sanity check kept from the original: fails if the file has no missing codes.
assert df['code'].isnull().sum() > 0
df.loc[df['code'].isnull(), 'code'] = 'NONE'

# check other cols (these comparisons were previously evaluated but never asserted)
assert df['inpatient_outpatient'].isnull().sum() == 0
assert df['code_disambiguator'].isnull().sum() == 0

print(f"min price: {df['price'].min():,.2f}")
print(f"max price: {df['price'].max():,.2f}")

df.head(2)

Duplicate rows across primary key: 820
min price: 1.00
max price: 500,535.00


Unnamed: 0,cms_certification_num,payer,code,internal_revenue_code,description,inpatient_outpatient,price,code_disambiguator
0,20001,GROSS CHARGE,C1876,Px0000085419L,HC ICAST STENT 9X59X120,BOTH,12652.0,NONE
1,20001,GROSS CHARGE,C1876,Px0000085424L,HC ICAST STENT 9X38X120,BOTH,11742.0,NONE


In [5]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
df.to_csv('prices.csv', index=False)

In [6]:
# Total price and row count per payer, largest total first.
payer_totals = df.groupby('payer')['price'].agg(['sum', 'count'])
payer_totals = payer_totals.sort_values('sum', ascending=False)

# Render the totals with thousands separators and two decimals.
payer_totals.style.format({'sum': '{:,.2f}'})

Unnamed: 0_level_0,sum,count
payer,Unnamed: 1_level_1,Unnamed: 2_level_1
MAX,14455978.0,298
MIN,12803305.0,410
GROSS CHARGE,12230838.0,4294
Premera Preferred,10972404.0,327
CASH PRICE,9783105.0,4294
Aetna PPO,2373125.0,125
UnitedHealthcare,2145633.0,57
Public Education Health Trust,1856404.0,103
Aetna SOA - Retirees,1607764.0,70
Pacific Health Coalition,1218081.0,64


Also create the hospitals file.

In [7]:
# URLs recorded for the hospital row.
homepage = 'https://www.providence.org/'
charge_url = 'https://www.providence.org/obp/ak/pricing-transparency'

# Column order of hospitals.csv.
cols = [
    'cms_certification_num',
    'name',
    'address',
    'city',
    'state',
    'zip5',
    'beds',
    'phone_number',
    'homepage_url',
    'chargemaster_url',
    'last_edited_by_username',
]

# One row per hospital, with values in the same order as `cols`.
data = [
    [
        '020001',
        'PROVIDENCE ALASKA MEDICAL CENTER',
        '3200 PROVIDENCE DRIVE',
        'ANCHORAGE',
        'AK',
        '99508',
        401,
        '9075622211',
        homepage,
        charge_url,
        'joeeoj',
    ]
]

hospitals = pd.DataFrame(data, columns=cols)
hospitals.to_csv('hospitals.csv', index=False)