In [None]:
from pathlib import Path, PosixPath

import pandas as pd
from sqlalchemy import create_engine

from src import COMPLETED

In [None]:
# Column names for the price rows (not referenced by any cell visible here).
PRICE_COLS = [
    'cms_certification_num',
    'payer',
    'code',
    'internal_revenue_code',
    #'units',
    'description',
    'inpatient_outpatient',
    'price',
    'code_disambiguator',
]

# Columns that must be jointly unique; the prices cell checks for duplicates across them.
PK_COLS = [
    'cms_certification_num',
    'code',
    'inpatient_outpatient',
    'internal_revenue_code',
    'code_disambiguator',
    'payer',
]

# Passed to DataFrame.rename(..., axis=1) in the prices cell, which maps key -> value.
# NOTE(review): the None values look like per-hospital placeholders to fill in
# after generating the notebook — confirm the intended key/value direction.
COL_MAPPING = {
    'internal_revenue_code': None,
    'code': None,
    'description': None,
    'price': None,
}

# Values substituted by cookiecutter when the notebook is generated.
CMS_CODE = "{{ cookiecutter.cms }}"
HOMEPAGE_URL = "{{ cookiecutter.homepage_url }}"
CHARGEMASTER_URL = "{{ cookiecutter.chargemaster_url}}"

In [None]:
def load_sql_or_csv(fname: PosixPath) -> pd.DataFrame:
    """Load the remaining WA hospitals, using ``fname`` as a CSV cache.

    If ``fname`` exists it is read with ``cms_certification_num`` kept as a
    string. Otherwise the ``hospitals`` table is queried for WA rows whose
    ``last_edited_by_username`` is NULL, ``cms_certification_num`` is cast to
    ``str``, and the result is written to ``fname`` (without the index) so the
    next call takes the CSV branch.

    Args:
        fname: Path of the CSV cache file to read, or to create on a miss.

    Returns:
        The hospitals frame, from the cache or freshly queried.
    """
    if fname.exists():
        print('loading csv')
        return pd.read_csv(fname, dtype={'cms_certification_num': str})

    print('loading from database')
    engine = create_engine("mysql+pymysql://root@localhost/hospital_price_transparency_v3")
    remaining_wa = pd.read_sql(
        "select * from hospitals WHERE last_edited_by_username IS NULL AND state = 'WA'",
        engine,
    )
    remaining_wa['cms_certification_num'] = remaining_wa['cms_certification_num'].astype(str)
    # Write to the same path that was checked above; a hard-coded file name
    # would leave any other `fname` permanently missing and re-query every time.
    remaining_wa.to_csv(fname, index=False)

    return remaining_wa

def create_hospital_row(df: pd.DataFrame, cms: str, homepage_url: str, charge_url: str) -> pd.DataFrame:
    """Return the rows of ``df`` for one hospital with the edit fields filled in.

    Rows whose ``cms_certification_num`` equals ``cms`` are selected, and
    ``homepage_url``, ``chargemaster_url`` and ``last_edited_by_username``
    (always ``'joeeoj'``) are set on the result. ``df`` itself is not modified.

    Args:
        df: Hospitals frame containing a ``cms_certification_num`` column.
        cms: Certification number to select.
        homepage_url: Value for the ``homepage_url`` column.
        charge_url: Value for the ``chargemaster_url`` column.

    Returns:
        A new frame with the matching rows (empty if none match).
    """
    is_target = df['cms_certification_num'] == cms
    return df.loc[is_target].assign(
        homepage_url=homepage_url,
        chargemaster_url=charge_url,
        last_edited_by_username='joeeoj',
    )

In [None]:
remaining_wa = load_sql_or_csv(Path('remaining_wa.csv'))

# Drop hospitals that are already done. The mask must be a row-wise Series:
# indexing with `~remaining_wa.isin(COMPLETED)` (a boolean DataFrame) only
# blanks matching cells to NaN and keeps every row.
# NOTE(review): assumes COMPLETED is a collection of CMS certification numbers
# stored as strings — confirm against `src`.
remaining_wa = remaining_wa[~remaining_wa['cms_certification_num'].isin(COMPLETED)]

remaining_wa.sort_values('city').head(5)

## hospital

In [None]:
# Build the edited hospital record from the values configured at the top of the notebook.
hospital = create_hospital_row(
    df=remaining_wa,
    cms=CMS_CODE,
    homepage_url=HOMEPAGE_URL,
    charge_url=CHARGEMASTER_URL,
)

hospital

## prices

In [None]:
fname = "{{ cookiecutter.fname }}"

# Read the raw chargemaster with the reader that matches its extension.
if fname.endswith('.csv'):
    raw_prices = pd.read_csv(fname)
elif fname.endswith('.xlsx'):
    raw_prices = pd.read_excel(fname)
else:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(f"unsupported chargemaster file type: {fname!r}")

df = raw_prices.rename(COL_MAPPING, axis=1)

# Constant columns for this hospital's gross-charge rows.
df['cms_certification_num'] = CMS_CODE
df['payer'] = 'GROSS CHARGE'
df['inpatient_outpatient'] = 'UNSPECIFIED'
df['code_disambiguator'] = 'NONE'

# trim description just in case
df['description'] = df['description'].str.strip()

# no null prices
assert df['price'].notnull().all(), "found null prices"

# no nulls
assert df['internal_revenue_code'].notnull().all(), "found null internal_revenue_code values"

# Missing codes become the 'NONE' sentinel; this is a no-op when none are missing,
# so the cell also works for files that have a code on every row.
df['code'] = df['code'].fillna('NONE')
assert df['code'].notnull().all(), "found null codes after fill"

# no dupe rows across PK columns
assert not df[PK_COLS].duplicated().any(), "duplicate rows across PK_COLS"

print(f"min price: {df['price'].min()}")
print(f"max price: {df['price'].max()}")

# A bare `return` is a SyntaxError at cell level; end on an expression to display a preview.
df.head()