In [1]:
import pandas as pd
import numpy as np
import gensim
import os

## Load Medicare Part B HCPCS

In [3]:
# Read only the HCPCS column of the Medicare Part B file; the file location is
# taken from the CMS_PARTB_PATH environment variable.
# The column is forced to str: with pandas' default chunked type inference an
# all-numeric chunk can be parsed as integers, which would drop leading zeros
# and leave mixed int/str codes that never match the string codes parsed from
# the embeddings file.
partb = pd.read_csv(
  os.environ['CMS_PARTB_PATH'],
  usecols=['hcpcs_code'],
  dtype={'hcpcs_code': str},
)
unique_hcpcs = partb['hcpcs_code'].unique()
# Code -> True lookup table, used to keep only embeddings whose code occurs in
# the Medicare data.
hcpcs_map = { code: True for code in unique_hcpcs }

## Load Choi et al. Embeddings

The pre-trained claims-code embeddings file (`claims_codes_hs_300.txt`) was downloaded from https://github.com/clinicalml/embeddings

In [12]:
# Rows are kept only when they start with this prefix; the prefix itself is
# dropped so the remaining first token is the bare code.
code_prefix = 'C_'

with open('claims_codes_hs_300.txt', 'r') as fin:
  # The first two lines of the file are never considered
  # (presumably header rows -- TODO confirm against the file).
  for _ in range(2):
    next(fin, None)
  # Stream the rest of the file instead of materialising every line first.
  # Slice off only the leading prefix: str.replace would also delete any
  # later 'C_' occurrence inside the line.
  lines = [
    line[len(code_prefix):].strip()
    for line in fin
    if line.startswith(code_prefix)
  ]

In [14]:
# Maps each code that occurs in the Medicare data to its embedding vector
# (a list of floats).
embeddings = {}

for line in lines:
  # First token is the code, the remaining tokens are the vector components.
  # split() without an argument tolerates repeated whitespace, which
  # split(' ') would turn into empty tokens that float() rejects.
  code, *components = line.split()
  # Test membership before converting, so no float parsing is spent on
  # codes that are discarded anyway.
  if code not in hcpcs_map:
    continue
  embeddings[code] = [float(x) for x in components]

## Stats

In [17]:
# Summary statistics: vector length, number of Medicare codes, number of
# codes with an embedding, and the resulting coverage percentage.
if not embeddings:
  raise ValueError('No embeddings matched any Medicare HCPCS code')

# Take the vector length from any stored embedding. Indexing by the first
# Medicare code raises KeyError whenever that particular code has no
# embedding, which is possible because coverage is below 100%.
embedding_size = len(next(iter(embeddings.values())))
print(f'Embedding size: {embedding_size}')

hcpcs_count = len(unique_hcpcs)
print(f'Medicare data contains {hcpcs_count} unique HCPCS codes')

embedding_count = len(embeddings)
print(f'Embeddings contain {embedding_count} unique HCPCS code embeddings')

# Share of Medicare codes that have an embedding, as a percentage.
coverage = embedding_count / hcpcs_count * 100
print(f'HCPCS embedding coverage: {coverage}%')

Embedding size: 300
Medicare data contains 7527 unique HCPCS codes
Embeddings contain 6367 unique HCPCS code embeddings
HCPCS embedding coverage: 84.58881360435765%


## Save Embeddings

The embeddings are saved as gensim KeyedVectors to stay consistent with how the other embedding types are stored.

In [21]:
# Build a gensim KeyedVectors container holding one vector per HCPCS code and
# write it to disk.
model = gensim.models.keyedvectors.Word2VecKeyedVectors(embedding_size)

codes = list(embeddings.keys())
vectors = np.array(list(embeddings.values()))

# Populate the model through gensim's bulk-add API rather than assigning
# `vectors` and `vocab` by hand: gensim expects the vocabulary to be a
# key -> entry mapping plus an index -> key list, so a bare array of keys
# leaves the saved model unable to look a vector up by code.
# NOTE(review): the bulk-add method is `add` in gensim 3.x and `add_vectors`
# in gensim 4 -- confirm against the installed version.
add_vectors = getattr(model, 'add_vectors', None) or model.add
add_vectors(codes, vectors)

model.save('../embeddings/choi-mcemc-e300.kv')