In [97]:
# import libraries
import os
import pandas as pd
import numpy as np
import datetime as dt

In [98]:
# Load the reference ontology that every procedure code must match.
# In the final version this file will live under /opt/data/dim/.
concept = pd.read_csv(
    'volumes/data/dim/CONCEPT.csv.gz',
    sep='\t',
    header=0,
    low_memory=False,
)

In [99]:
# Only the code and its concept id are needed for the lookup.
concept_columns = ['concept_code', 'concept_id']
concept = concept[concept_columns]

In [100]:
# Remove repeated (concept_code, concept_id) pairs.
concept = concept.drop_duplicates()

In [101]:
# Call the code column 'cpt' so it can serve as the join key for the price tables.
concept = concept.rename(columns={'concept_code': 'cpt'})

In [102]:
# Build the control table: one row per raw hospital file to iterate through.
# In the final version the raw files will live under /opt/data/raw/.
path = 'volumes/data/raw/'
files = os.listdir(path)
# Note: despite the name, csv_files holds the gzipped JSON files.
csv_files = [name for name in files if name.endswith(".json.gz")]

# The hospital id is the file name without its '.json.gz' suffix (8 characters).
hospital_ids = [int(name[:-len('.json.gz')]) for name in csv_files]
control = pd.DataFrame({'file': csv_files, 'hospital_id': hospital_ids})

In [103]:
# Cross-reference the control table with the static hospital dimension table.
# In the final version this file will live under /opt/data/dim/.
dim = pd.read_csv(
    'volumes/data/dim/hospital.csv',
    usecols=['hospital_id', 'affiliation'],
)
# Left join so every raw file keeps its row even without a dimension match,
# then order by hospital id with a fresh 0..n-1 index.
control = (
    control
    .merge(dim, how='left', on='hospital_id')
    .sort_values(by='hospital_id', ignore_index=True)
)

In [105]:
## loop (PENDING)

In [106]:
# Log which hospital is being parsed right now, prefixed with a timestamp.
timestamp = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f"{timestamp} - parsing hospital - {control.loc[0, 'hospital_id']}")

2021-09-20 13:13:24 - parsing hospital - 24


In [107]:
# Read the raw data of the current hospital file.
# TODO: replace index 0 with i once this runs inside the loop.
df = pd.read_json(f"volumes/data/raw/{control.loc[0, 'file']}")

In [108]:
# Show every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

In [110]:
# Cash price: rows of the 'Hosp Discounted Cash Price' tab, reshaped into
# (hospital_id, cpt, price, amount) records with exact duplicates removed.
# TODO: replace index 0 with i once this runs inside the loop.
cash = df[df['TabName'] == 'Hosp Discounted Cash Price']
cash = pd.DataFrame(
    {
        'hospital_id': control.loc[0, 'hospital_id'],
        'cpt': cash['Code'],
        'price': 'cash',
        'amount': cash[' Inpatient Discounted Charge '],
    }
).drop_duplicates()

In [112]:
# Normalise the raw charge text so it can be parsed as a number:
# trim surrounding whitespace, then drop thousands separators and dollar signs.
# The '$' is removed as a literal character; the earlier pattern '[$]' combined
# with regex=False only matched the three-character text "[$]" and therefore
# never stripped a dollar sign.
cash['amount'] = (
    cash['amount']
    .str.strip()
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
)

# Treat empty strings as missing, as is done for the min/max tables, so the
# float conversion does not fail on blank cells.
cash.loc[cash['amount'] == '', 'amount'] = np.nan

# Convert to float; missing values stay NaN.
cash['amount'] = cash['amount'].astype(float)

In [113]:
# Normalise the procedure code: upper-case it, keep only A-Z and 0-9,
# then remove any literal 'CPT' marker.
cash['cpt'] = (
    cash['cpt']
    .str.upper()
    .replace('[^A-Z0-9]', '', regex=True)
    .str.replace('CPT', '', regex=False)
)

In [114]:
# Inner join on the code: rows whose code is not in the concept table are dropped.
cash = pd.merge(cash, concept, how='inner', on='cpt', sort=True)

In [115]:
# The raw code is no longer needed once concept_id has been attached.
cash = cash.drop(columns='cpt')

In [116]:
# Keep only rows with a non-missing, strictly positive amount.
cash = (
    cash
    .dropna(subset=['amount'])
    .loc[lambda frame: frame['amount'] > 0]
)

In [117]:
# Put the columns in the order written to the flat file:
# hospital_id, concept_id, price, amount.
# Selecting by name instead of by position keeps this cell idempotent: a
# positional reorder would shuffle the columns again if the cell were re-run,
# and the CSV is written without a header, so that would go unnoticed.
cash = cash[['hospital_id', 'concept_id', 'price', 'amount']]

In [133]:
# Maximum price: 'MAX' rows of the 'Hosp Deidentified Payor MinMax ' tab,
# reshaped into (hospital_id, cpt, price, amount) records with exact
# duplicates removed.
# TODO: replace index 0 with i once this runs inside the loop.
maxi = df[(df['TabName'] == 'Hosp Deidentified Payor MinMax ') & (df['Min /Max'] == 'MAX')]
maxi = pd.DataFrame(
    {
        'hospital_id': control.loc[0, 'hospital_id'],
        'cpt': maxi['Code'],
        'price': 'max',
        'amount': maxi[' Inpatient Negotiated Charge '],
    }
).drop_duplicates()

In [135]:
# Normalise the raw charge text so it can be parsed as a number:
# trim surrounding whitespace, then drop thousands separators and dollar signs.
# The '$' is removed as a literal character; the earlier pattern '[$]' combined
# with regex=False only matched the three-character text "[$]" and therefore
# never stripped a dollar sign.
maxi['amount'] = (
    maxi['amount']
    .str.strip()
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
)

# some values ended up being an empty string; treat them as missing
maxi.loc[maxi['amount'] == '', 'amount'] = np.nan

# convert to float; missing values stay NaN
maxi['amount'] = maxi['amount'].astype(float)

In [136]:
# Normalise the procedure code: upper-case it, keep only A-Z and 0-9,
# then remove any literal 'CPT' marker.
maxi['cpt'] = (
    maxi['cpt']
    .str.upper()
    .replace('[^A-Z0-9]', '', regex=True)
    .str.replace('CPT', '', regex=False)
)

In [137]:
# Inner join on the code: rows whose code is not in the concept table are dropped.
maxi = pd.merge(maxi, concept, how='inner', on='cpt', sort=True)

In [138]:
# The raw code is no longer needed once concept_id has been attached.
maxi = maxi.drop(columns='cpt')

In [139]:
# Keep only rows with a non-missing, strictly positive amount.
maxi = (
    maxi
    .dropna(subset=['amount'])
    .loc[lambda frame: frame['amount'] > 0]
)

In [140]:
# Put the columns in the order written to the flat file:
# hospital_id, concept_id, price, amount.
# Selecting by name instead of by position keeps this cell idempotent: a
# positional reorder would shuffle the columns again if the cell were re-run,
# and the CSV is written without a header, so that would go unnoticed.
maxi = maxi[['hospital_id', 'concept_id', 'price', 'amount']]

In [142]:
# Minimum price: 'MIN' rows of the 'Hosp Deidentified Payor MinMax ' tab,
# reshaped into (hospital_id, cpt, price, amount) records with exact
# duplicates removed.
# TODO: replace index 0 with i once this runs inside the loop.
mini = df[(df['TabName'] == 'Hosp Deidentified Payor MinMax ') & (df['Min /Max'] == 'MIN')]
mini = pd.DataFrame(
    {
        'hospital_id': control.loc[0, 'hospital_id'],
        'cpt': mini['Code'],
        'price': 'min',
        'amount': mini[' Inpatient Negotiated Charge '],
    }
).drop_duplicates()

In [143]:
# Normalise the raw charge text so it can be parsed as a number:
# trim surrounding whitespace, then drop thousands separators and dollar signs.
# The '$' is removed as a literal character; the earlier pattern '[$]' combined
# with regex=False only matched the three-character text "[$]" and therefore
# never stripped a dollar sign.
mini['amount'] = (
    mini['amount']
    .str.strip()
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
)

# some values ended up being an empty string; treat them as missing
mini.loc[mini['amount'] == '', 'amount'] = np.nan

# convert to float; missing values stay NaN
mini['amount'] = mini['amount'].astype(float)

In [144]:
# Normalise the procedure code: upper-case it, keep only A-Z and 0-9,
# then remove any literal 'CPT' marker.
mini['cpt'] = (
    mini['cpt']
    .str.upper()
    .replace('[^A-Z0-9]', '', regex=True)
    .str.replace('CPT', '', regex=False)
)

In [145]:
# Inner join on the code (unmatched codes are dropped), then discard the raw
# code column now that concept_id has been attached.
mini = (
    pd.merge(mini, concept, how='inner', on='cpt', sort=True)
    .drop(columns='cpt')
)

In [146]:
# Keep only rows with a non-missing, strictly positive amount.
mini = (
    mini
    .dropna(subset=['amount'])
    .loc[lambda frame: frame['amount'] > 0]
)

In [147]:
# Put the columns in the order written to the flat file:
# hospital_id, concept_id, price, amount.
# Selecting by name instead of by position keeps this cell idempotent: a
# positional reorder would shuffle the columns again if the cell were re-run,
# and the CSV is written without a header, so that would go unnoticed.
mini = mini[['hospital_id', 'concept_id', 'price', 'amount']]

In [148]:
# Stack the three price tables row-wise, in the order min, max, cash.
# NOTE(review): the recorded outputs of the following cells list cash rows
# first, which does not match this order - they look stale; confirm by re-running.
out = pd.concat([mini, maxi, cash], axis=0)

In [149]:
# Preview the first five rows of the combined table.
out.head(n=5)

Unnamed: 0,hospital_id,concept_id,price,amount
0,24,766231,cash,22.5
1,24,766232,cash,33.75
2,24,766233,cash,22.5
3,24,766234,cash,33.75
4,24,2100780,cash,764.58


In [150]:
# (rows, columns) of the combined table.
out.shape

(37411, 4)

In [151]:
# Shapes of the three parts, in the order min, max, cash.
print(*(frame.shape for frame in (mini, maxi, cash)))

(12422, 4) (12425, 4) (12564, 4)


In [152]:
# Column dtypes of the combined table, checked before writing the flat file.
out.dtypes

hospital_id      int64
concept_id       int64
price           object
amount         float64
dtype: object

In [153]:
# inside the loop: only write the output file when `out` has at least one row

In [154]:
# Write the combined table to a header-less, index-less flat file for postgres.
# In the final version the target directory will be /opt/data/transformed/.
# TODO: replace index 0 with i once this runs inside the loop.
if len(out) > 0:
    out_path = f"volumes/data/transformed/{control.loc[0, 'hospital_id']}.csv"
    out.to_csv(out_path, header=False, index=False)

In [155]:
# clear variables from last iteration (inside the loop)
# del out

In [156]:
# Read the transformed file for hospital 24 back in; it has no header row,
# so the column names are supplied here.
col_names = [
    'hospital_id',
    'concept_id',
    'variable',
    'value',
]
dfr = pd.read_csv(
    'volumes/data/transformed/24.csv',
    names=col_names,
    header=None,
)

In [157]:
# Preview the first five rows of the file read back.
dfr.head(n=5)

Unnamed: 0,hospital_id,concept_id,variable,value
0,24,766231,cash,22.5
1,24,766232,cash,33.75
2,24,766233,cash,22.5
3,24,766234,cash,33.75
4,24,2100780,cash,764.58


In [158]:
# Load the transformed file for hospital 24 from the NC checkout for comparison.
# NOTE(review): this relative path points outside the current project and
# presumably only resolves on the author's machine - confirm.
dfn = pd.read_csv(
    '../../NC/hospital-price-transparency/volumes/data/transformed/24.csv',
    names=col_names,
    header=None,
)

In [159]:
# Preview the first five rows of the comparison file.
dfn.head(n=5)

Unnamed: 0,hospital_id,concept_id,variable,value
0,24,2100780,min,605.54
1,24,2100954,min,965.31
2,24,2100955,min,1585.58
3,24,709849,min,318.15
4,24,709849,min,239.7


In [163]:
# Spot check: all rows for concept 2100955 in the comparison file.
dfn.loc[dfn['concept_id'] == 2100955]

Unnamed: 0,hospital_id,concept_id,variable,value
2,24,2100955,min,1585.58
12424,24,2100955,max,4004.0
24853,24,2100955,cash,2002.0


In [164]:
# Spot check: all rows for concept 2100955 in the file written by this notebook.
dfr.loc[dfr['concept_id'] == 2100955]

Unnamed: 0,hospital_id,concept_id,variable,value
6,24,2100955,cash,2002.0
12566,24,2100955,max,4004.0
24991,24,2100955,min,1585.58


In [162]:
# Both files should have the same number of rows and columns.
print(*(frame.shape for frame in (dfn, dfr)))

(37411, 4) (37411, 4)
