In [14]:
import pandas as pd
import os
import sys
import numpy as np

# Project root, relative to the notebook's working directory.
# load_dmepos_hcpcs_corpus() stores its corpus cache under <proj_dir>/data/.
proj_dir = '../'

def load_dmepos_hcpcs_corpus(sample_size=None):
    """Build, or load from a disk cache, the corpus of HCPCS code sequences.

    Each sequence is the list of ``hcpcs_code`` values that share the same
    ``(year, npi)`` pair in the Medicare DMEPOS file, ordered by ascending
    ``number_of_supplier_claims``. Rows missing either the code or the claim
    count are discarded, and sequences longer than the 99th-percentile
    sequence length are dropped.

    The result is cached as a ``.npy`` file under ``<proj_dir>/data``. The
    full corpus (``sample_size=None``) is cached as ``corpus.npy``; a sampled
    corpus is cached under a name that includes ``sample_size`` so that a
    sampled run can never be mistaken for the full corpus (or vice versa).

    Parameters
    ----------
    sample_size : int or None, optional
        Number of CSV rows to read (forwarded to ``pandas.read_csv`` as
        ``nrows``). ``None`` reads the whole file.

    Returns
    -------
    numpy.ndarray
        1-D object array whose elements are lists of HCPCS codes.
    """
    # NOTE(review): machine-specific absolute path; the function only works
    # where this file exists unless a cached corpus is already on disk.
    dmepos_file = '/Users/jujohnson/cms-data/raw/medicare-dmepos-2013-2018.csv.gz'
    dmepos_cols = ['npi', 'year', 'hcpcs_code', 'number_of_supplier_claims']

    # Key the cache on sample_size: previously a sampled run and a full run
    # shared one file, so whichever ran first was returned for both.
    if sample_size is None:
        corpus_name = 'corpus.npy'
    else:
        corpus_name = f'corpus-sample-{sample_size}.npy'
    corpus_file = os.path.join(proj_dir, 'data', corpus_name)

    # load corpus from disk if exists
    if os.path.isfile(corpus_file):
        print(f'Loading corpus from disk {corpus_file}')
        # The array holds Python lists, so it is stored pickled. Unpickling
        # can execute arbitrary code: only load cache files you created.
        return np.load(corpus_file, allow_pickle=True)

    # load Medicare Data
    data = pd.read_csv(dmepos_file, usecols=dmepos_cols, nrows=sample_size)
    print('Loaded data')

    # clean missing values (without mutating the loaded frame in place)
    data = data.dropna(subset=['hcpcs_code', 'number_of_supplier_claims'])

    # generate sequences of HCPCS codes that occur in the same context;
    # groupby keeps the within-group row order produced by sort_values
    sequences = (
        data
        .sort_values(by='number_of_supplier_claims')
        .groupby(by=['year', 'npi'])['hcpcs_code']
        .agg(list)
    )
    print('Generated hcpcs sequences')

    # drop top 1 percent longest sequences
    quantile = 0.99
    # map(len) is the explicit element-wise form; relying on Series.agg(len)
    # to fall back to element-wise application is deprecated in recent pandas.
    seq_lengths = sequences.map(len).astype('int64')
    max_seq_length = seq_lengths.quantile(quantile)
    sequences = sequences.loc[seq_lengths <= max_seq_length]
    print(f'Removed sequences longer than {max_seq_length}')

    # save corpus; create the cache directory first so the expensive work
    # above is not lost to a missing-directory error
    corpus = sequences.values
    os.makedirs(os.path.dirname(corpus_file), exist_ok=True)
    np.save(corpus_file, corpus)

    return corpus

In [15]:
# Build the full corpus, or reload it from the on-disk cache when one exists.
corpus = load_dmepos_hcpcs_corpus()

Loaded data
Generated hcpcs sequences
Removed sequences longer than 32.0


In [16]:
# Inspect the corpus: an object array whose elements are lists of HCPCS codes
# (numpy abbreviates the middle of the repr with "...").
corpus

array([list(['E0431', 'E1390']), list(['G0333', 'J7613', 'E0570']),
       list(['E1038', 'J7620', 'J7613', 'E0570', 'E0607', 'Q0513', 'A4258', 'A4256', 'A4259', 'A4253']),
       ...,
       list(['J7626', 'J7606', 'E0443', 'E0140', 'A7003', 'A4258', 'E1392', 'A4256', 'K0001', 'J7613', 'E0431', 'A4259', 'Q0513', 'E1390', 'E0570', 'A4253']),
       list(['A4604', 'E0431', 'E0601', 'A4253']),
       list(['L3170', 'L3916', 'L1971', 'L0650', 'L3960', 'L1851', 'L2397'])],
      dtype=object)