In [1]:
import pandas as pd

In [2]:
# Columns written to the exported price CSVs, in output order.
# 'units' is intentionally left out of the export.
PRICE_COLS = [
    'cms_certification_num',
    'payer',
    'code',
    'internal_revenue_code',
    'description',
    'inpatient_outpatient',
    'price',
    'code_disambiguator',
]

# Columns that together must identify a row uniquely (checked in parse_excel).
PK_COLS = [
    'cms_certification_num',
    'code',
    'inpatient_outpatient',
    'internal_revenue_code',
    'code_disambiguator',
    'payer',
]

# Spreadsheet header -> normalised column name.
COL_MAPPING = {
    'Procedure/Item ID': 'internal_revenue_code',
    'CPT/HCPCS': 'code',
    'Description': 'description',
    'Price': 'price',
}

In [3]:
def parse_excel(fname: str, cms_certification_num: str = '500064') -> pd.DataFrame:
    """Read a fee-schedule workbook and normalise it to the price-table layout.

    The sheet's headers are renamed via ``COL_MAPPING``, constant identifier
    columns are added, descriptions are whitespace-trimmed, and missing
    procedure codes are replaced by the placeholder ``'NONE'``.

    Args:
        fname: Path of the Excel file to read.
        cms_certification_num: Value written to the ``cms_certification_num``
            column of every row. Defaults to ``'500064'``, the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The normalised DataFrame (renamed spreadsheet columns plus the added
        constant columns).

    Raises:
        AssertionError: if any ``price`` or ``internal_revenue_code`` is null,
            or if two rows share the same values across ``PK_COLS``.
            (Assertions are skipped when Python runs with ``-O``.)
    """
    df = pd.read_excel(fname).rename(COL_MAPPING, axis=1)

    # Constant columns: the sheet itself carries no facility / payer / setting info.
    df['cms_certification_num'] = cms_certification_num
    df['payer'] = 'GROSS CHARGE'
    df['inpatient_outpatient'] = 'UNSPECIFIED'
    df['code_disambiguator'] = 'NONE'

    # Trim description just in case.
    df['description'] = df['description'].str.strip()

    # Every row must carry a price.
    null_prices = int(df['price'].isnull().sum())
    assert null_prices == 0, f"{fname}: {null_prices} row(s) have a null price"

    # Every row must carry an internal revenue code.
    null_irc = int(df['internal_revenue_code'].isnull().sum())
    assert null_irc == 0, f"{fname}: {null_irc} row(s) have a null internal_revenue_code"

    # Rows without a CPT/HCPCS code get a placeholder. This is a no-op when no
    # code is missing, so a fully populated sheet is accepted as well.
    df['code'] = df['code'].fillna('NONE')

    # No duplicate rows across the primary-key columns.
    dupes = int(df[PK_COLS].duplicated().sum())
    assert dupes == 0, f"{fname}: {dupes} duplicate row(s) across {PK_COLS}"

    print(f"min price: {df['price'].min()}")
    print(f"max price: {df['price'].max()}")

    return df

In [4]:
# Parse the HMC fee schedule, then export only the price columns.
hmc = parse_excel('HMC_Fees_20211001.xlsx')
hmc.to_csv('prices_hmc.csv', columns=PRICE_COLS, index=False)

min price: 0.01
max price: 820600.0


In [5]:
# Parse the UWMC fee schedule, then export only the price columns.
# NOTE(review): parse_excel is called here without a facility-specific
# certification number, so these rows get the same cms_certification_num
# ('500064') as the HMC file - confirm that is intended for UWMC.
uwmc = parse_excel('UWMC_Fees_20211001.xlsx')
uwmc.to_csv('prices_uwmc.csv', columns=PRICE_COLS, index=False)

min price: 0.01
max price: 820600.0
