In [1]:
import pandas as pd

# Show up to 100 characters per cell so long descriptions stay readable.
# The full option name is used because abbreviated names are pattern-matched
# and can become ambiguous when pandas adds options.
pd.set_option('display.max_colwidth', 100)

In [2]:
# Columns written to prices.csv, in output order.
# The 'units' column is deliberately left out.
PRICE_COLS = [
    'cms_certification_num',
    'payer',
    'code',
    'internal_revenue_code',
    'description',
    'inpatient_outpatient',
    'price',
    'code_disambiguator',
]

# Columns that together must identify a price row uniquely.
PK_COLS = [
    'cms_certification_num',
    'code',
    'inpatient_outpatient',
    'internal_revenue_code',
    'code_disambiguator',
    'payer',
]

# Source-file column name -> name used in this notebook.
COL_MAPPING = dict([
    ('Code', 'code'),
    ('Description', 'description'),
    ('Charges', 'gross'),
    ('Self-Pay Rate', 'cash'),
])

# CMS certification number used to pick this hospital's row and tag its prices.
CMS_CODE = '503300'

## hospitals

In [3]:
# Read the hospital list; the CMS number is loaded as a string so it can be
# compared against the string CMS_CODE.
hospitals = pd.read_csv('../remaining_wa.csv', dtype={'cms_certification_num': str})

# .copy() detaches the filtered rows from the full frame, so the column
# assignments below write to an independent frame instead of a slice
# (which would raise SettingWithCopyWarning).
hospitals = hospitals[hospitals['cms_certification_num'] == CMS_CODE].copy()

# Fail loudly rather than writing an empty hospitals.csv.
assert not hospitals.empty, f'no row with cms_certification_num {CMS_CODE} in ../remaining_wa.csv'

hospitals['homepage_url'] = 'https://www.seattlechildrens.org/'
hospitals['chargemaster_url'] = 'https://www.seattlechildrens.org/clinics/paying-for-care/insurance/'
hospitals['last_edited_by_username'] = 'joeeoj'

hospitals.to_csv('hospitals.csv', index=False)

## prices

In [4]:
# Load the standard-charges file and rename the columns listed in COL_MAPPING.
orig = pd.read_csv('910564748_seattlechildrenshospital_standardcharges.csv').rename(COL_MAPPING, axis=1)
# this column is not used anywhere below
orig = orig.drop('LAST UPDATED: 6/10/2021', axis=1)

orig['cms_certification_num'] = CMS_CODE
orig['inpatient_outpatient'] = 'UNSPECIFIED'

# no internal billing code so we need to use these two cols to disambiguate
# (the result is null wherever either source column is null)
orig['code_disambiguator'] = orig['Category'] + '-' + orig['Code Type']
orig['internal_revenue_code'] = 'NONE'

cols_subset = ['cms_certification_num', 'code', 'internal_revenue_code', 'description', 'inpatient_outpatient', 'code_disambiguator']

# One frame per price type: the price column is renamed to 'price' and the
# frame is tagged with its payer label, then the two are stacked.
cash = orig[cols_subset + ['cash']].rename(columns={'cash': 'price'}).assign(payer='CASH PRICE')
gross = orig[cols_subset + ['gross']].rename(columns={'gross': 'price'}).assign(payer='GROSS CHARGE')

df = pd.concat([cash, gross])

# trim description just in case
df['description'] = df['description'].str.strip()

# report and drop rows without a price
print(f"Dropping {df['price'].isnull().sum():,} rows with null price")
df = df.dropna(subset=['price'])

# report and drop rows priced at exactly zero
zero_dollars = (df['price'] == 0)
print(f"Dropping {zero_dollars.sum():,} rows with $ 0.00 price")
# .copy() so the 'code' assignment below writes to an independent frame rather
# than to a slice of the unfiltered one (avoids SettingWithCopyWarning)
df = df[~zero_dollars].copy()
print()

# some rows have no code; the assert documents that expectation for this file
assert df['code'].isnull().sum() > 0
df['code'] = df['code'].fillna('NONE')

# drop complete row duplicates
df = df.drop_duplicates()
assert df.duplicated(keep=False).sum() == 0

print(f"min price: $ {df['price'].min():,.2f}")
print(f"max price: $ {df['price'].max():,.2f}")

df.head(2)

Dropping 3,652 rows with null price
Dropping 4 rows with $ 0.00 price

min price: $ 0.04
max price: $ 2,656,278.60


Unnamed: 0,cms_certification_num,code,internal_revenue_code,description,inpatient_outpatient,code_disambiguator,price,payer
2,503300,NONE,NONE,MR ANGIO PELVIS WITH CONTRAST,UNSPECIFIED,RADIOLOGY-CPTHCPCS,2871.38,CASH PRICE
3,503300,NONE,NONE,THERAPEUTIC BEHAVIORAL SERVICES PER DIEM,UNSPECIFIED,PSYCHIATRY-CPTHCPCS,743.25,CASH PRICE


In [5]:
# row count after the null / zero-price / full-duplicate clean-up
print(f'total rows: {df.shape[0]:,}')

total rows: 63,982


## duplicates

Even after removing fully duplicated rows (identical across all columns), some rows still share the same values in the primary-key (PK) columns.

In [6]:
# flag every row whose primary-key values occur more than once
# (keep=False marks all members of a duplicate group, not just the repeats)
pk_dupes = df.duplicated(subset=PK_COLS, keep=False)
print(f'Duplicate rows across primary key: {pk_dupes.sum():,}')

Duplicate rows across primary key: 37,082


### Duplicate rows across all columns except price

In [7]:
# every column except price, in frame order
all_but_price_col = [name for name in df.columns if name != 'price']

# descending sort with price as the last key, so within rows that match on
# everything else the highest price comes first
df = df.sort_values(by=[*all_but_price_col, 'price'], ascending=False)
len(df)

# uncomment to inspect rows that differ only in price:
# df[df[all_but_price_col].duplicated(keep=False)].head(10)

63982

Include the highest price and drop the rest

In [8]:
# keep the first row of each all-but-price group, then renumber the index
# from zero without keeping the old index as a column
df = (
    df.drop_duplicates(subset=all_but_price_col, keep='first')
      .reset_index(drop=True)
)

len(df)

54106

Remaining duplicates

In [9]:
# boolean mask of rows that still collide on the primary-key columns
remaining_dupes = df.duplicated(subset=PK_COLS, keep=False)

remaining_dupes.sum()

25398

## Add value to code disambiguator

Append an incrementing counter to `code_disambiguator` as a final tie-breaker for rows that still share the same primary key (for example, rows that are identical except for the description).

In [10]:
# Number the rows inside each primary-key group 1, 2, 3, ...
# dropna=False is required because code_disambiguator can be null: by default
# groupby excludes rows with a null key, so those rows would not get a proper
# per-group counter (and, depending on the pandas version, may come back as
# NaN, turning the whole column into floats).
df['add_val'] = df.sort_values(PK_COLS).groupby(PK_COLS, dropna=False).cumcount() + 1

df.head()

Unnamed: 0,cms_certification_num,code,internal_revenue_code,description,inpatient_outpatient,code_disambiguator,price,payer,add_val
0,503300,c1894,NONE,INTRODUCER CHECK FLO 5FR 0.038MM,UNSPECIFIED,RADIOLOGY-CPTHCPCS,275.0,GROSS CHARGE,1
1,503300,c1894,NONE,INTRODUCER CHECK FLO 5FR 0.038MM,UNSPECIFIED,RADIOLOGY-CPTHCPCS,206.25,CASH PRICE,1
2,503300,c1877,NONE,STENT INTRA LD 26X12MM,UNSPECIFIED,SURGICAL SERVICES-CPTHCPCS,5062.5,GROSS CHARGE,1
3,503300,c1877,NONE,STENT INTRA LD 26X12MM,UNSPECIFIED,SURGICAL SERVICES-CPTHCPCS,3796.88,CASH PRICE,1
4,503300,c1769,NONE,GUIDEWIRE HI-TORQUE 300CM,UNSPECIFIED,RADIOLOGY-CPTHCPCS,475.0,GROSS CHARGE,1


In [11]:
def update_code_disambiguator(row: pd.Series) -> str:
    """Build a disambiguator string from a two-element row.

    Args:
        row: a Series holding exactly two values, in order: the current
            disambiguator (may be null) and the value to append.

    Returns:
        The appended value as a string when the disambiguator is null,
        otherwise the disambiguator and the value joined with '-'.
    """
    disambiguator, counter = row
    suffix = str(counter)

    return suffix if pd.isnull(disambiguator) else disambiguator + '-' + suffix

In [12]:
# Build the suffixed disambiguator for every row, then write it back only on
# the rows flagged as remaining primary-key duplicates; other rows keep their
# current value.
df.loc[remaining_dupes, 'code_disambiguator'] = (
    df[['code_disambiguator', 'add_val']]
    .apply(update_code_disambiguator, axis=1)
)

In [13]:
# sanity check: rows that were not duplicates may still have no disambiguator
assert df['code_disambiguator'].isnull().sum() > 0

# replace the remaining nulls with the literal NONE
df['code_disambiguator'] = df['code_disambiguator'].fillna('NONE')

# the output must now be unique both on the exported columns and on the key
assert df.duplicated(subset=PRICE_COLS).sum() == 0
assert df.duplicated(subset=PK_COLS).sum() == 0

In [14]:
# final row count that will be written to prices.csv
print(f'total rows: {df.shape[0]:,}')

total rows: 54,106


In [15]:
# write only the PRICE_COLS columns, in that order, without the index
df.to_csv('prices.csv', columns=PRICE_COLS, index=False)