In [90]:
from sklearn.preprocessing import OneHotEncoder
from pymongo import MongoClient
from zipfile import ZipFile, ZIP_DEFLATED
from sets import Set
import numpy as np
import json, sys, os, time, re, datetime

In [91]:
# Choose the MongoDB deployment: True -> localhost, False -> the shared AWS host.
local = False
if local:
    client = MongoClient('localhost', 27017)
    plan_col = client.aca.plan
    drug_col = client.aca.drug
    # The local branch has no provider/facility collections. Bind the names
    # explicitly so that re-running this cell after a remote session cannot leave
    # stale AWS collection handles in the kernel.
    prov_col = None
    faci_col = None
else:
    client = MongoClient('ec2-54-153-83-172.us-west-1.compute.amazonaws.com', 27017)
    plan_col = client.plans.plans
    drug_col = client.formularies.drugs
    prov_col = client.providers.providers
    faci_col = client.providers.facilities

# Distinct plan ids / drug ids referenced by the drug collection.
all_plan = drug_col.distinct('plans.plan_id')
all_drug = drug_col.distinct('rxnorm_id')

print('Using %s Mongo, total drug: %d, total plan: %d' % (
    'local' if local else 'aws', len(all_drug), len(all_plan)))
# client.formularies.collection_names()
# client.providers.collection_names()

Using aws Mongo, total drug: 46206, total plan: 6035


### Load encode definition

In [92]:
def getEncodeFields(encode_def, rtn, path=''):
    ''' extract the selected fields from the encode json definition

    Walks the nested definition and, for every node carrying ``"encode": 1``,
    stores ``rtn[dotted_path] = node['type']``.  A ``properties`` key is
    transparent: it is descended into without adding a path component.

    encode_def -- current node of the definition (the whole definition on the first call)
    rtn        -- dict collecting the result; filled in place and also returned
    path       -- dotted path of the current node, with a leading '.'

    Nodes that have an ``encode`` key with any other value are skipped together
    with everything below them.  Values that are not dicts (for instance the
    string in ``"type": "object"`` next to ``properties``) describe no field and
    are ignored instead of being recursed into.
    '''
    if not isinstance(encode_def, dict):
        # plain metadata value, nothing to collect below it
        return rtn

    if 'encode' in encode_def:
        # leaf description: record it only when it is switched on
        if encode_def['encode'] == 1:
            rtn[path[1:]] = encode_def['type']
        return rtn

    for f in encode_def:
        getEncodeFields(encode_def[f], rtn, path + ('' if f == 'properties' else '.' + f))
    return rtn

def logTime():
    ''' current local date and time as text, used to prefix progress messages '''
    now = datetime.datetime.now()
    # isoformat with a blank separator is exactly what str() yields for a datetime
    return now.isoformat(' ')

# Load the encode definition and flatten it to {dotted field path: type}.
# The file is read inside a `with` block so the handle is closed as soon as
# the JSON has been parsed.
with open('encode2.json') as encode_file:
    encode_list = getEncodeFields(json.load(encode_file), {})
encode_list

{u'drug.plans.drug_tier': u'string',
 u'drug.plans.prior_authorization': u'boolean',
 u'drug.plans.quantity_limit': u'boolean',
 u'drug.plans.step_therapy': u'boolean',
 u'plan.formulary.cost_sharing.coinsurance_opt': u'string',
 u'plan.formulary.cost_sharing.coinsurance_rate': u'float',
 u'plan.formulary.cost_sharing.copay_amount': u'float',
 u'plan.formulary.cost_sharing.copay_opt': u'string',
 u'plan.formulary.cost_sharing.pharmacy_type': u'string',
 u'plan.formulary.drug_tier': u'string',
 u'plan.formulary.mail_order': u'boolean',
 u'plan.network.network_tier': u'string',
 u'plan.plan_id_type': u'string'}

### Data uniformity check

In [93]:
def _repeated_ids(col, id_field):
    ''' one 1 per `id_field` value that is shared by more than one document of `col` '''
    pipeline = [{"$group": {"_id": "$" + id_field, "count": {"$sum": 1}}}]
    flags = []
    for group in col.aggregate(pipeline):
        if group['count'] > 1:
            flags.append(1)
    return flags

# document totals against the number of distinct ids
print('%s: plan document: %d' % (logTime(), plan_col.count()))
print('%s: drug document: %d' % (logTime(), drug_col.count()))
print('%s: unique plan_id: %d' % (logTime(), len(all_plan)))
print('%s: unique rxnorm_id: %d' % (logTime(), len(all_drug)))

# ids that are spread over several documents
multi_plan = _repeated_ids(plan_col, 'plan_id')
print('%s: plans with multiple documents: %d' % (logTime(), sum(multi_plan)))

multi_drug = _repeated_ids(drug_col, 'rxnorm_id')
print('%s: drugs with multiple documents: %d' % (logTime(), sum(multi_drug)))

# characters 5:7 of every plan id are collected as the state code
state_id = np.unique([plan_id[5:7] for plan_id in all_plan])
print('%s: states in the plan: %s' % (logTime(), ', '.join(state_id)))

2016-07-04 20:51:05.524491: plan document: 12242
2016-07-04 20:51:05.607379: drug document: 1540473
2016-07-04 20:51:05.690348: unique plan_id: 6035
2016-07-04 20:51:05.690473: unique rxnorm_id: 46206
2016-07-04 20:51:06.574344: plans with multiple documents: 1259
2016-07-04 20:51:47.001284: drugs with multiple documents: 12807
2016-07-04 20:51:47.003948: states in the plan: AK, AL, AR, AZ, CO, DE, FL, GA, HI, IA, IL, IN, KS, KY, LA, MA, ME, MI, MN, MO, MS, MT, NC, ND, NE, NH, NJ, NM, NV, OH, OK, OR, PA, SC, SD, TN, TX, UT, VA, WA, WI, WV, WY


### Get feature space

In [94]:
state = 'PA' # set to None to include all (very slow process for all)
# The uniformity check reads the state code from characters 5:7 of the plan id;
# compare that same slice here instead of searching the whole id for a substring.
ex_id = all_plan if not state else [pid for pid in all_plan if pid[5:7] == state]
print('processing %d plans for %s ...' % (len(ex_id), 'all' if not state else state))

# For every string-typed field of the encode definition, collect the distinct
# values found in the selected plans. The position of a value in these lists is
# later used as its integer code.
feature_space = {}
for field, field_type in encode_list.items():
    if field_type != 'string':
        continue
    is_plan_field = field.startswith('plan')
    col = plan_col if is_plan_field else drug_col
    id_key = 'plan_id' if is_plan_field else 'plans.plan_id'
    # strip the leading 'plan.' / 'drug.' to get the path inside the document
    doc_path = field[field.index('.')+1:]
    feature_space[field] = col.find({id_key: {'$in': ex_id}}).distinct(doc_path)

feature_space

processing 270 plans for PA ...


{u'drug.plans.drug_tier': [u'SPECIALTY-DRUGS',
  u'NON-PREFERRED-BRAND',
  u'GENERIC',
  u'PREFERRED-BRAND',
  u'ZERO-COST-SHARE-PREVENTIVE-DRUGS',
  u'PREFERRED-GENERIC',
  u'NON-PREFERRED-GENERIC',
  u'SPECIALTY',
  u'BRAND',
  u'TIER-TWO',
  u'TIER-THREE',
  u'TIER-ONE',
  u'TIER-FOUR',
  u'PREFERRED-BRAND-SPECIALTY-DRUGS',
  u'NON-PREFERRED-BRAND-SPECIATLY-DRUGS',
  u'GENERIC-SPECIALTY-DRUGS',
  u'NON-PREFERRED-GENERIC-NON-PREFERRED-BRAND',
  u'NON-PREFERRED-BRAND-SPECIALTY-DRUGS',
  u'PREFERRED-GENERIC-PREFERRED-BRAND'],
 u'plan.formulary.cost_sharing.coinsurance_opt': [u'AFTER-DEDUCTIBLE',
  u'NO-CHARGE',
  None,
  u'NO-CHARGE-AFTER-DEDUCTIBLE'],
 u'plan.formulary.cost_sharing.copay_opt': [u'AFTER-DEDUCTIBLE',
  u'NO-CHARGE',
  None],
 u'plan.formulary.cost_sharing.pharmacy_type': [u'1-MONTH-IN-RETAIL',
  u'1-MONTH-IN-MAIL',
  u'1-MONTH-OUT-RETAIL',
  u'3-MONTH-IN-MAIL'],
 u'plan.formulary.drug_tier': [u'PREFERRED-GENERIC',
  u'NON-PREFERRED-GENERIC',
  u'PREFERRED-BRAND',
  u'NO

### Get common drugs between plans
- MongoDB query operator reference: [link](https://docs.mongodb.com/manual/reference/operator/query/)

In [95]:
# get rxnorm_id group for each plan (one lazy cursor per plan id)
plan_drug = [drug_col.find({'plans.plan_id':pid}) for pid in ex_id]
n_plan = len(ex_id)

# find the first plan with non-zero drug association; every plan before it is
# counted as empty. Stop explicitly when no plan has any drug instead of
# running the index past the end of plan_drug.
cnt, i = [], -1
while len(cnt) == 0 and i + 1 < n_plan:
    i += 1
    cnt = plan_drug[i].distinct('rxnorm_id')
if len(cnt) == 0:
    raise ValueError('none of the %d selected plans has any drug, cannot build the common drug list' % n_plan)

common_drug, n_empty = Set(cnt), i

print('%s: checking common drugs among %d %s plans ...' % (
    logTime(), n_plan - i, 'all' if not state else state))
# intersect with the drug list of every remaining plan; plans without drugs are
# only counted, they do not empty the intersection
for i, cursor in enumerate(plan_drug[i+1:], i+1):
    rx = cursor.distinct('rxnorm_id')
    if len(rx) > 0:
        common_drug.intersection_update(rx)
    else:
        n_empty += 1
    if i%30 == 0:
        print('%s: finishing %d plans, %d plan without drug, %d common drugs ...' % (
            logTime(), i, n_empty, len(common_drug)))

print('%s: plan without drug: %d' % (logTime(), n_empty))

# common_drug and drug_attr are turned into lists so the order of the
# drug/attribute combinations stays fixed for the rest of the run
common_drug = list(common_drug)
n_drug = len(common_drug)
# one [attribute name, type, full encode path] entry per encoded drug field
drug_attr = [[k.split('.')[-1],v,k] for k,v in encode_list.items() if k.startswith ('drug')]
# categorical flag per drug feature slot: the attribute pattern repeated for every drug
drug_cat_index = [k[1]=='string' for k in drug_attr]*n_drug

print('%s: there are %d common drug between %d plans!' % (logTime(), n_drug, n_plan-n_empty))

2016-07-04 20:51:55.152638: checking common drugs among 270 PA plans ...
2016-07-04 20:51:59.380357: finishing 30 plans, 0 plan without drug, 2475 common drugs ...
2016-07-04 20:52:03.553435: finishing 60 plans, 0 plan without drug, 2475 common drugs ...
2016-07-04 20:52:07.981472: finishing 90 plans, 0 plan without drug, 2461 common drugs ...
2016-07-04 20:52:12.699088: finishing 120 plans, 0 plan without drug, 2333 common drugs ...
2016-07-04 20:52:17.000407: finishing 150 plans, 0 plan without drug, 47 common drugs ...
2016-07-04 20:52:21.532955: finishing 180 plans, 0 plan without drug, 47 common drugs ...
2016-07-04 20:52:26.216717: finishing 210 plans, 0 plan without drug, 47 common drugs ...
2016-07-04 20:52:30.529520: finishing 240 plans, 0 plan without drug, 42 common drugs ...
2016-07-04 20:52:34.734001: plan without drug: 0
2016-07-04 20:52:34.734954: there are 28 common drug between 270 plans!


### Get pharmacy_type space (over all extracted plan IDs) for each drug_tier

In [96]:
# Note: issue with distinct in the query
# tier_pharm = {tier : plan_col.find(
#         {'plan_id':{'$in':ex_id}, 'formulary.drug_tier':tier, 'formulary.0.cost_sharing.0':{'$exists':True}},
#         {'_id':0, 'formulary':{'$elemMatch':{'drug_tier':tier}}}
#     ).distinct('formulary.cost_sharing.pharmacy_type')
#     for tier in feature_space['plan.formulary.drug_tier']
# }

# For every drug tier, collect the pharmacy types that appear in the cost_sharing
# entries of that tier across the selected plans.
tier_pharm = {}
for tier in feature_space['plan.formulary.drug_tier']:
    query = plan_col.find(
        {
            'plan_id':{'$in':ex_id},
            'formulary.drug_tier':tier,
            'formulary.0.cost_sharing.0':{'$exists':True},
        },
        # NOTE: when formulary is a dict instead of an array, this projection won't return content
        {'_id':0, 'formulary':{'$elemMatch':{'drug_tier':tier}}}
    )
    pharm_types = Set()
    for p in query:
        for f in p['formulary']:
            # The filter above only checks cost_sharing on the FIRST formulary
            # entry; the entry projected for this tier can be a different one
            # without cost_sharing, so treat a missing/empty value as no entries.
            for cs in f.get('cost_sharing') or []:
                pharm_types.add(cs['pharmacy_type'])
    tier_pharm[tier] = list(pharm_types)

# simple query to double check results
# for p in plan_col.find({'plan_id':{'$in':ex_id}}):
#     if 'formulary' not in p:
#         continue
#     if type(p['formulary']) is dict:
#         p['formulary'] = [p['formulary']]
#     for f in p['formulary']:        
#         if f['drug_tier'] not in tier_pharm:
#             tier_pharm[f['drug_tier']]=[]        
#         if 'cost_sharing' not in f:
#             continue
#         for cs in f['cost_sharing']:
#             tier_pharm[f['drug_tier']].append(cs['pharmacy_type'])
# tier_pharm = {k:list(set(v)) for k,v in tier_pharm.items()}

tier_pharm

{u'BRAND': [u'1-MONTH-IN-RETAIL',
  u'1-MONTH-IN-MAIL',
  u'1-MONTH-OUT-RETAIL',
  u'3-MONTH-IN-MAIL'],
 u'GENERIC': [u'1-MONTH-IN-RETAIL',
  u'1-MONTH-OUT-RETAIL',
  u'3-MONTH-IN-MAIL',
  u'1-MONTH-IN-MAIL'],
 u'NON-PREFERRED-BRAND': [u'1-MONTH-IN-RETAIL',
  u'1-MONTH-IN-MAIL',
  u'1-MONTH-OUT-RETAIL',
  u'3-MONTH-IN-MAIL'],
 u'NON-PREFERRED-BRAND-SPECIALTY-DRUGS': [u'1-MONTH-IN-RETAIL'],
 u'NON-PREFERRED-GENERIC': [],
 u'NON-PREFERRED-GENERIC-NON-PREFERRED-BRAND': [u'1-MONTH-IN-RETAIL',
  u'3-MONTH-IN-MAIL'],
 u'PREFERRED-BRAND': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-MAIL'],
 u'PREFERRED-BRAND-SPECIALTY-DRUGS': [u'1-MONTH-IN-RETAIL'],
 u'PREFERRED-GENERIC': [u'1-MONTH-IN-RETAIL',
  u'1-MONTH-OUT-RETAIL',
  u'3-MONTH-IN-MAIL',
  u'1-MONTH-IN-MAIL'],
 u'PREFERRED-GENERIC-PREFERRED-BRAND': [u'1-MONTH-IN-RETAIL',
  u'3-MONTH-IN-MAIL'],
 u'SPECIALTY': [u'1-MONTH-IN-RETAIL'],
 u'SPECIALTY-DRUGS': [u'1-MONTH-IN-RETAIL'],
 u'TIER-FOUR': [],
 u'TIER-ONE': [],
 u'TIER-THREE': [],
 u'TIER-TWO': [

### Get unique pharmacy_type for each drug_tier

In [97]:
# freeze the tier names in a list: feature extraction addresses tiers by position
tiers = list(tier_pharm.keys())

# cost_sharing attributes as [attribute name, type, full encode path]; pharmacy_type
# is left out because it is expressed by the position inside the feature vector
cost_attr = []
for enc_path, enc_type in encode_list.items():
    if 'cost_sharing' in enc_path and 'pharmacy_type' not in enc_path:
        cost_attr.append([enc_path.split('.')[-1], enc_type, enc_path])

# per tier: one leading non-categorical slot (the extraction cell stores mail_order
# there) followed by the cost attribute flags repeated for each pharmacy type
is_string_attr = [attr[1] == 'string' for attr in cost_attr]
cat2d = []
for t in tiers:
    cat2d.append([False] + is_string_attr * len(tier_pharm[t]))

# flatten the per-tier rows into one vector covering all tiers
cost_cat_index = []
for tier_row in cat2d:
    cost_cat_index.extend(tier_row)

# # plan level attributes
# plan_attr = [[k.split('.')[-1],v,k] for k,v in encode_list.items() if k.startswith('plan') and 'formulary' not in k]
# plan_cat_index = [a[1]=='string' for a in plan_attr]

# total categorical index - must match the order of feature concatenation in the plan vector
cat_index = cost_cat_index + drug_cat_index
# positions of the categorical (string-typed) features
catagorical_var = [pos for pos, is_cat in enumerate(cat_index) if is_cat]

### Extract integer features for each plan
- get plan feature --> cost_sharing/pharmacy type
- get drug feature --> drug_tier

In [None]:
# for each plan, get int features (plan level & combined formulary level)
#
# Result: plan_int_feature[plan_id] = formulary_feature + drug_feature, where
#   formulary_feature -- per tier (in `tiers` order): [mail_order] followed by the
#                        cost attributes of every pharmacy type of that tier
#   drug_feature      -- the drug attributes of every common drug (in `common_drug` order)
# String-typed values are replaced by their position in feature_space.
# A plan is skipped (and counted in `skip`) when its vector length does not match
# cost_cat_index, when it lacks one of the common drugs, or when a drug value
# cannot be encoded.
plan_int_feature, i, skip = {}, 0, 0

for pid in ex_id:
    i += 1
    # debugging guard, never taken as written; the trailing comment shows the
    # condition that was used to single out one plan
    if False: # i!=112:
        continue
    print '%s: processing plan \'%s\' (%d/%d) ...' %(logTime(), pid, i, len(ex_id))
    # initialize feature vector (one slot per tier, filled below)
    tier_feature = [None]*len(tiers)
    
    # for each plan document, assemble normalized tier info
    for tier, pharm in tier_pharm.items():               
        # documents of this plan that mention the tier, projected to the first
        # formulary entry of that tier
        doc_cur = plan_col.find(
            {'plan_id':pid, 'formulary.drug_tier':tier},
            {'_id':0, 'formulary':{'$elemMatch':{'drug_tier':tier}}}        
        )
        
        # no data for this tier, fill in the space with None
        # (1 slot for mail_order + one block of cost attributes per pharmacy type)
        n_doc = doc_cur.count()
        if n_doc == 0:            
            tier_feature[tiers.index(tier)] = [None] * (1 + len(pharm) * len(cost_attr))
            continue
            
        # parse tier info        
        # NOTE: the outer list repeats the SAME inner list object; that is harmless
        # here because rows are only ever replaced as a whole below, never mutated
        pharm_feature = [[None]*len(cost_attr)]*len(pharm)
        for doc in doc_cur:              
            ##### query issue: doc is empty when formulary is not an array in the plan document
            if not doc:
                print '\tWARNING, formulary is not array',pid,tier,n_doc
                fml = plan_col.find_one({'plan_id':pid, 'formulary.drug_tier':tier})['formulary']
            else:
                fml = doc['formulary'][0]
                
            # reset on every document: with several documents the last one's
            # mail_order is the value that remains
            tier_feature[tiers.index(tier)] = [fml['mail_order']]                    
            
            # no pharmacy type, only put mail_order (from one doc)
            if len(pharm) == 0 or 'cost_sharing' not in fml:                
                continue
                        
            # put pharmacy types  
            # any failure (missing key, value or pharmacy type not in the known
            # lists) is reported and ends the loop over this document's
            # cost_sharing entries; rows collected so far are kept
            try:
                for cs in fml['cost_sharing']:
                    cost_feature = [cs[a[0]] if a[1]!='string' 
                                else feature_space[a[2]].index(cs[a[0]]) 
                                for a in cost_attr]
                    pharm_feature[pharm.index(cs['pharmacy_type'])] = cost_feature
            except Exception as ex:
                print '\tERROR parsing cost_sharing value', ex
#                 print '\tWARNING: cost_sharing data:', cs
        
        # append the flattened per-pharmacy rows behind mail_order
        tier_feature[tiers.index(tier)] += [y for x in pharm_feature for y in x]
        
    # flatten the vector for the plan from hierarchy: tier-cost-pharmacy
    formulary_feature = [y for x in tier_feature for y in x]
    if len(formulary_feature) != len(cost_cat_index):
        skip += 1
        print 'Error: plan feature dimension mismatch for %s' %pid
        continue
        
    # get the list of drug attributes for a plan
    # (only common drugs, projected to the first `plans` entry of this plan)
    drug_cur = drug_col.find(
        {'plans.plan_id':pid, 'rxnorm_id':{'$in':common_drug}},# 'plans.0':{'$exists':True}}, 
        {'_id':0, 'rxnorm_id':1, 'plans':{ '$elemMatch':{'plan_id':pid} }}
    )

    # keyed by rxnorm_id: when several documents share an id, the one read last wins
    drug_dict = {d['rxnorm_id']:d['plans'][0] for d in drug_cur} #TODO: choose the plan matches the tier

    # drop the plan if it doesn't match with common drug list
    if len(drug_dict) != n_drug:
        print 'plan %s drug list doesn\'t have all common drug (n=%d), skip' %(pid, len(drug_dict))
        skip += 1
        continue

    # flatten the drug attributes for all common drugs        
    try:
        drug_feature = [drug_dict[rx][attr[0]] if attr[1]!='string' 
                             else feature_space[attr[2]].index(drug_dict[rx][attr[0]])
                             for rx in common_drug for attr in drug_attr]
    except Exception as ex:
        skip += 1
        print '\tERROR (%s) parsing drug info, skip plan %s' %(ex, pid)
        continue
    
    # combine for plan feature - must match the categorical index concatenation order
    plan_int_feature[pid] = formulary_feature + drug_feature

print '%s: completed processing plan, %d skipped due to parsing issue.' %(logTime(), skip)

2016-07-04 20:53:02.456811: processing plan '53789PA0070011' (1/270) ...
2016-07-04 20:53:03.978473: processing plan '33709PA0690001' (2/270) ...
2016-07-04 20:53:05.832715: processing plan '33709PA0690002' (3/270) ...
2016-07-04 20:53:07.678862: processing plan '33709PA0690003' (4/270) ...
2016-07-04 20:53:09.523453: processing plan '16322PA0040006' (5/270) ...
2016-07-04 20:53:11.041075: processing plan '16322PA0040007' (6/270) ...
2016-07-04 20:53:12.562449: processing plan '16322PA0040008' (7/270) ...
2016-07-04 20:53:14.072910: processing plan '16322PA0040010' (8/270) ...
2016-07-04 20:53:15.619767: processing plan '16322PA0040012' (9/270) ...
2016-07-04 20:53:17.123230: processing plan '16322PA0040024' (10/270) ...
2016-07-04 20:53:18.628060: processing plan '16322PA0040025' (11/270) ...
2016-07-04 20:53:20.156837: processing plan '16322PA0040026' (12/270) ...
2016-07-04 20:53:21.686673: processing plan '16322PA0050029' (13/270) ...
2016-07-04 20:53:23.197773: processing plan '16

In [109]:
# Close the MongoDB client opened in the connection cell.
# NOTE(review): the cells after this one still query drug_col / plan_col, and their
# execution counts (56, 84, 222) are out of sequence with this cell (109) --
# confirm where this cell belongs before relying on Restart & Run All.
client.close()

In [56]:
# time how long it takes to list the distinct drugs of one hard-coded plan id
t0 = time.time()
n_found = len(drug_col.find({'plans.plan_id':'67577MI0390012'}).distinct('rxnorm_id'))
print('drugs: %d' % n_found)
print("--- %s seconds ---" % (time.time() - t0))

drugs: 2710
--- 19.570182085 seconds ---


In [84]:
# regex filtering: case-insensitive lookup of one drug by name.
# Each '.' in the pattern stands for exactly one character, so "combigan" must
# have a character on both sides to match.
regx = re.compile(".combigan.", flags=re.IGNORECASE)
wanted_fields = {'_id':0, 'drug_name':1, 'rxnorm_id':1}
drug_col.find_one({'drug_name':regx}, wanted_fields)

{u'drug_name': u'Brimonidine tartrate 2 MG/ML / Timolol 5 MG/ML Ophthalmic Solution [Combigan]',
 u'rxnorm_id': u'861637'}

In [222]:
# check if tier is defined in multiple doc:
# print every (plan id, tier, document count) where more than one document of
# the plan mentions the tier. The stray scratch expression that used to sit
# above the loop had no effect and was dropped.
for pid in ex_id:
    for tier in tier_pharm:
        cnt = plan_col.find(
            {'plan_id':pid, 'formulary.drug_tier':tier},
            {'_id':0, 'formulary':{'$elemMatch':{'drug_tier':tier}}}
        ).count()
        if cnt>1:
            # single formatted string: same "pid tier cnt" line as before
            print('%s %s %s' % (pid, tier, cnt))

In [88]:
# %reset 