In [29]:
import pandas as pd
import numpy as np
import pickle
import os
import sys
import math
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
Keras = tf.keras
make_sampling_table = Keras.preprocessing.sequence.make_sampling_table
skipgrams = Keras.preprocessing.sequence.skipgrams
Model = Keras.models.Model
Dense, Dot = Keras.layers.Dense, Keras.layers.dot
Embedding, Reshape, Input = Keras.layers.Embedding, Keras.layers.Reshape, Keras.layers.Input



In [30]:
proj_dir = '/Users/jujohnson/git/Hcpcs2Vec/'
data_dir = os.environ['CMS_RAW']

## Load Data

Using a sample of 20K Medicare Part B records from 2012.

In [80]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 7311433 to 5246454
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   npi       500000 non-null  int64  
 1   hcpcs     500000 non-null  object 
 2   count     500000 non-null  float64
 3   hcpcs_id  500000 non-null  int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 19.1+ MB


In [32]:
data_file = os.path.join(
    data_dir, 
    '2012', 
    'Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_CY2012.csv.gz')


# we only need the NPI and HCPCS Columns
columns = {
    'National Provider Identifier': 'npi',
    'HCPCS Code': 'hcpcs',
    'Number of Services': 'count',
}

data = pd.read_csv(data_file, usecols=list(columns.keys()))
data.rename(columns=columns, inplace=True)
data = data.sample(500000)

## Create HCPCS <--> ID Mapping

Generatea a unique identifier for each HCPCS code.

Saves the encoder for future mapping of codes <--> IDS.

In [33]:
%%time

le = LabelEncoder()

data['hcpcs_id'] = le.fit_transform(data['hcpcs'])

# save label encoder results to enable inverse transform later
hcpcsIdFile = os.path.join(proj_dir, 'data', 'hcpcs-labelencoding.pickle')
with open(hcpcsIdFile, 'wb') as fout:
    pickle.dump(le.classes_, fout)

print(f'Saved HCPCS label encoded classes to {hcpcsIdFile}')

Saved HCPCS label encoded classes to /Users/jujohnson/git/Hcpcs2Vec/data/hcpcs-labelencoding.pickle
CPU times: user 168 ms, sys: 13 ms, total: 181 ms
Wall time: 181 ms


## Extract HCPCS Contexts

Creates a corpus of HCPCS contexts, or sets.

Each set is a group of HCPCS procedure codes that occur in the same context.

This context is defined by the procedures performed by a doctor over a given year.

We sort each context by the frequency of HCPCS occurrences.

In [34]:
%%time

corpus = []

for npi, group in data.groupby(by='npi'):
    group.sort_values(by='count', inplace=True)
    hcpcs_set = np.asarray(group['hcpcs_id'], dtype='int16')
    corpus.append(hcpcs_set)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


CPU times: user 2min 57s, sys: 684 ms, total: 2min 58s
Wall time: 2min 58s


In [35]:
print(f'Corpus length: {len(corpus)}')

Corpus length: 309026


## Remove Long Contexts

We can reduce the longest sequence from 600+ to 50 by removing the largets 2% of HCPCS sets.

In [36]:
lengths = np.array(list(map(lambda x: len(x), corpus)))
max_seq_length = np.quantile(lengths, 0.98)
corpus = np.array(list(filter(lambda x: len(x) <= max_seq_length, corpus)))
print(f'Corpus length: {len(corpus)}')

Corpus length: 304555


  This is separate from the ipykernel package so we can avoid doing imports until


## Skipgram Config

In [37]:
vocab_size = data['hcpcs_id'].nunique()
window_size = 5
embedding_size = 300

In [38]:
%%time

counter = 0

sampling_table = make_sampling_table(vocab_size)

x, y = [], []

for seq in corpus:
  couples, labels = skipgrams(seq, vocab_size, window_size=window_size, sampling_table=sampling_table)
  x.extend(couples)
  y.extend(labels)
  if counter % 50000 == 0:
    print(counter)
  counter += 1

x = np.array(x, dtype='int16')
word_target, word_context = x[:, 0], x[:, 1]
y = np.array(y, dtype='int8')

0
50000
100000
150000
200000
250000
300000
CPU times: user 6.94 s, sys: 108 ms, total: 7.05 s
Wall time: 7.05 s


## Skipgram Model

In [78]:
# define input and embedding layers
input_target = Input((1,))
input_context = Input((1,))
embedding = Embedding(vocab_size, embedding_size, input_length=1, name='embedding')

# embed target and context
target = embedding(input_target)
context = embedding(input_context)

# get similarity of two embeddings via dot product
dot_product = Dot([target, context], axes=1)

# add the sigmoid output layer
output = Dense(1, activation='sigmoid')(dot_product)

model = Keras.Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam')

## Training

In [79]:
history = model.fit(
  x=[word_target, word_context], y=y,
  epochs=10, batch_size=128,
  verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
model.save()