In [6]:
# Load duckdb, which lets us efficiently load large files
import duckdb

# Load pandas, which lets us manipulate dataframes
import pandas as pd

# Import jupysql Jupyter extension to create SQL cells
%load_ext sql

# Set configrations on jupysql to directly output data to Pandas and to simplify the output that is printed to the notebook.
%config SqlMagic.autopandas = True

%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

# Connect jupysql to DuckDB using a SQLAlchemy-style connection string. Either connect to an in memory DuckDB, or a file backed db.
%sql duckdb:///:memory:

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [7]:
%%sql 

SELECT * FROM '~/data/ahrq.gov/syhdr_commercial_outpatient_2016.parquet' LIMIT 10;

Unnamed: 0,PERSON_ID,PERSON_WGHT,FACILITY_ID,CLM_CNTL_NUM,AT_SPCLTY,SRVC_BEG_DATE,SRVC_END_DATE,LOS,ADMSN_TYPE,TOB_CD,...,CAST(CPT_PRCDR_CD_28 AS VARCHAR),CAST(CPT_PRCDR_CD_29 AS VARCHAR),CAST(CPT_PRCDR_CD_30 AS VARCHAR),CAST(CPT_PRCDR_CD_31 AS VARCHAR),CAST(CPT_PRCDR_CD_32 AS VARCHAR),CAST(CPT_PRCDR_CD_33 AS VARCHAR),CAST(CPT_PRCDR_CD_34 AS VARCHAR),CPT_PRCDR_CD_35,PLAN_PMT_AMT,TOT_CHRG_AMT
0,106555013,15.35,,162348524.0,A0,2016-11-03,2016-11-03,1,,13,...,,,,,,,,,50.27,
1,101322073,14.15,130002074.0,165390008.0,A3,2016-12-06,2016-12-06,1,,13,...,,,,,,,,,35.080002,
2,107712347,20.29,130020567.0,166984365.0,A0,2016-01-13,2016-01-13,1,,13,...,,,,,,,,,22.1,
3,103556891,24.28,130021019.0,160229186.0,A0,2016-11-17,2016-11-17,1,,13,...,,,,,,,,,244.520004,
4,100690283,19.03,130005583.0,163555025.0,A0,2016-09-19,2016-09-19,1,,13,...,,,,,,,,,95.029999,
5,100142835,25.2,130016338.0,164945423.0,99,2016-03-25,2016-03-25,1,,13,...,,,,,,,,,146.619995,
6,105694677,24.02,130013147.0,166224246.0,A0,2016-12-05,2016-12-05,1,,13,...,,,,,,,,,195.940002,
7,107129256,21.67,,161621816.0,99,2016-10-07,2016-10-07,1,,13,...,,,,,,,,,49.439999,
8,109445091,33.76,130018705.0,164473414.0,A0,2016-07-28,2016-07-28,1,,13,...,,,,,,,,,31.360001,
9,103781315,22.37,,160051590.0,87,2016-03-03,2016-03-03,1,,13,...,,,,,,,,,305.570007,


In [52]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, save_npz


In [53]:

# Read the parquet file
df = pd.read_parquet('~/data/ahrq.gov/syhdr_commercial_outpatient_2016.parquet')

In [54]:

# Select CPT procedure code columns
cpt_columns = [col for col in df.columns if col.startswith('CAST(CPT_PRCDR_CD_')]


In [55]:
cpt_columns

['CAST(CPT_PRCDR_CD_2 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_4 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_5 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_6 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_7 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_8 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_9 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_10 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_11 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_12 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_13 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_14 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_15 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_16 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_17 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_18 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_19 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_20 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_21 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_22 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_23 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_24 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_25 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_26 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_27 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_28 AS VARCHAR)',
 'CAST(CPT_PRCDR_CD_29 AS VARCHAR)',
 'CAST(C

In [56]:

# Melt the dataframe to create a long format
melted_df = df.melt(id_vars=['CLM_CNTL_NUM'], value_vars=cpt_columns, var_name='CPT_Column', value_name='CPT_Code')

# Remove rows with null or empty CPT codes
melted_df = melted_df[melted_df['CPT_Code'].notna() & (melted_df['CPT_Code'] != '')]

# Create a unique identifier for each claim
melted_df['claim_id'] = melted_df['CLM_CNTL_NUM'].astype('category').cat.codes

# Create a unique identifier for each CPT code
melted_df['cpt_id'] = melted_df['CPT_Code'].astype('category').cat.codes


In [57]:
melted_df

Unnamed: 0,CLM_CNTL_NUM,CPT_Column,CPT_Code,claim_id,cpt_id
2,166984365.000,CAST(CPT_PRCDR_CD_2 AS VARCHAR),80053,4489445,4765
6,166224246.000,CAST(CPT_PRCDR_CD_2 AS VARCHAR),G0202,4001174,7898
7,161621816.000,CAST(CPT_PRCDR_CD_2 AS VARCHAR),88175,1041759,5994
9,160051590.000,CAST(CPT_PRCDR_CD_2 AS VARCHAR),86922,33165,5750
10,160273695.000,CAST(CPT_PRCDR_CD_2 AS VARCHAR),80061,175879,4768
...,...,...,...,...,...
253061808,163146094.000,CAST(CPT_PRCDR_CD_34 AS VARCHAR),77470,2021893,4632
253061920,165271883.000,CAST(CPT_PRCDR_CD_34 AS VARCHAR),82746,3388522,5178
253063593,161422834.000,CAST(CPT_PRCDR_CD_34 AS VARCHAR),90715,913998,6186
253065173,160570639.000,CAST(CPT_PRCDR_CD_34 AS VARCHAR),71010,366350,4207


In [58]:

# Create the sparse matrix
row = melted_df['claim_id']
col = melted_df['cpt_id']
data = np.ones(len(melted_df))

mat = csr_matrix((data, (row, col)), shape=(row.max() + 1, col.max() + 1))

print(f"Sparse CSR matrix shape: {mat.shape}")
print(f"Number of non-zero elements: {mat.nnz}")


Sparse CSR matrix shape: (5083353, 9536)
Number of non-zero elements: 18516331


In [59]:

save_npz('/Users/me/data/ahrq.gov/claim_matrix.npz', mat)

In [61]:

# Example: Get the CPT codes for the first claim
first_claim_cpt_indices = mat.getrow(0).nonzero()[1]
first_claim_cpt_codes = melted_df[melted_df['cpt_id'].isin(first_claim_cpt_indices)]['CPT_Code'].unique()
print(first_claim_cpt_indices)
print(f"CPT codes for the first claim: {first_claim_cpt_codes}")


[2199 5158 5520 5550]
CPT codes for the first claim: ['36415' '85610' '86140' '82652']


In [64]:
import json

index_to_cpt_code = dict(enumerate(melted_df['CPT_Code'].unique()))
json.dump(index_to_cpt_code, open('/Users/me/data/ahrq.gov/index_to_cpt_code.json', 'w'))

In [41]:
import json
with open('/Users/me/data/ahrq.gov/cpt_code_to_index_mapping.json', 'w') as f:
    json.dump(cpt_code_to_index, f)
    
index_to_cpt_code = {v: k for k, v in cpt_code_to_index.items()}

with open('/Users/me/data/ahrq.gov/index_to_cpt_code_mapping.json', 'w') as f:
    json.dump(index_to_cpt_code, f)
    
import json
# load the file we just saved
with open('/Users/me/data/ahrq.gov/index_to_cpt_code_mapping.json', 'r') as f:
    index_to_cpt_code = json.load(f)

In [51]:

# Create a mapping of CPT codes to their column indices
cpt_code_to_index = dict(zip(melted_df['CPT_Code'], melted_df['cpt_id']))

# Save the mapping for future reference
import json
with open('/Users/me/data/ahrq.gov/cpt_code_to_index_mapping.json', 'w') as f:
    json.dump(cpt_code_to_index, f)

print("CPT code to index mapping saved to 'cpt_code_to_index_mapping.json'")

NameError: name 'melted_df' is not defined

In [37]:
melted_df['claim_id'].unique()

array([4489445, 4001174, 1041759, ..., 4783842, 4145288, 3296749],
      dtype=int32)

In [38]:
mat

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 18516331 stored elements and shape (5083353, 9536)>

In [1]:
from scipy.sparse import load_npz

In [2]:
mat = load_npz('/Users/me/data/ahrq.gov/claim_matrix.npz')

In [3]:
import json
with open('/Users/me/data/ahrq.gov/cpt_code_to_index_mapping.json', 'r') as f:
    cpt_code_to_index = json.load(f)

In [4]:
mat

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 18516331 stored elements and shape (5083353, 9536)>

In [45]:
len(cpt_code_to_index)

9536

In [29]:
import pandas as pd

cpt_dict = pd.read_excel('/Users/me/data/ahrq.gov/PPRRVU16_V0804.xlsx', usecols=[0, 2], header=None, dtype=str).set_index(0)[2].to_dict()

In [30]:
len(cpt_dict)

14321

In [32]:
n = 0
for key in cpt_code_to_index:
    if key not in cpt_dict:
        n += 1
print(n)

859


In [43]:
mat[0].nonzero()

(array([0, 0, 0, 0], dtype=int32),
 array([2199, 5158, 5520, 5550], dtype=int32))

In [50]:
mat.getrow(0).nonzero()[1]

array([2199, 5158, 5520, 5550], dtype=int32)

In [68]:
index_to_cpt_code[2199], index_to_cpt_code[5158], index_to_cpt_code[5520], index_to_cpt_code[5550]

('78071', '67332', '67110', '81260')