In [1]:
from sklearn.preprocessing import OneHotEncoder
from pymongo import MongoClient
from zipfile import ZipFile, ZIP_DEFLATED
from sets import Set
import numpy as np
import json, sys, os, time, re, datetime

In [110]:
# Pick the MongoDB deployment to read from and bind the collection handles.
local = False
if not local:
    # AWS-hosted instance: plans, formularies and providers are separate databases
    client = MongoClient('ec2-54-153-83-172.us-west-1.compute.amazonaws.com', 27017)
    plan_col = client.plans.plans
    drug_col = client.formularies.drugs
    prov_col = client.providers.providers
    faci_col = client.providers.facilities
else:
    # local instance: only the plan and drug collections are bound here
    client = MongoClient('localhost', 27017)
    plan_col = client.aca.plan
    drug_col = client.aca.drug

# distinct plan IDs referenced from drug documents, and distinct rxnorm IDs
all_plan = drug_col.distinct('plans.plan_id')
all_drug = drug_col.distinct('rxnorm_id')

mongo_label = 'local' if local else 'aws'
print('Using %s Mongo, total drug: %d, total plan: %d' % (mongo_label, len(all_drug), len(all_plan)))
# client.formularies.collection_names()
# client.providers.collection_names()

Using aws Mongo, total drug: 46206, total plan: 6035


### Load the encode definition

In [3]:
def getEncodeFields(encode_def, rtn, path=''):
    ''' Extract the selected fields from the encode json definition.

    Walks the nested definition recursively. A node that carries an
    'encode' key is a leaf: when its 'encode' value equals 1 the field is
    recorded, otherwise it is ignored. A node without 'encode' is a
    container whose children are visited; a 'properties' key is transparent
    (it adds no segment to the dotted path).

    encode_def -- the (sub-)definition being visited
    rtn        -- dict that collects the result; updated in place
    path       -- dotted path accumulated so far, with a leading '.'

    Returns rtn, mapping dotted field path -> the leaf's 'type' value.
    '''
    # Scalars and lists (e.g. a sibling "type": "object" entry next to
    # "properties") describe no fields; descending into them would fail.
    if not isinstance(encode_def, dict):
        return rtn

    if 'encode' not in encode_def:
        for f in encode_def:
            child_path = path if f == 'properties' else path + '.' + f
            getEncodeFields(encode_def[f], rtn, child_path)
    elif encode_def['encode'] == 1:
        # drop the leading '.' added by the first path segment
        rtn[path[1:]] = encode_def['type']
    return rtn

def logTime():
    ''' Return the current local date and time as a string, used to prefix log lines. '''
    now = datetime.datetime.now()
    return str(now)

# Parse the encode definition and keep only the fields flagged for encoding.
# The file is opened in a `with` block so the handle is closed after parsing.
with open('encode2.json') as encode_file:
    encode_list = getEncodeFields(json.load(encode_file), {})
encode_list

{u'drug.plans.drug_tier': u'string',
 u'drug.plans.prior_authorization': u'boolean',
 u'drug.plans.quantity_limit': u'boolean',
 u'drug.plans.step_therapy': u'boolean',
 u'plan.formulary.cost_sharing.coinsurance_opt': u'string',
 u'plan.formulary.cost_sharing.coinsurance_rate': u'float',
 u'plan.formulary.cost_sharing.copay_amount': u'float',
 u'plan.formulary.cost_sharing.copay_opt': u'string',
 u'plan.formulary.cost_sharing.pharmacy_type': u'string',
 u'plan.formulary.drug_tier': u'string',
 u'plan.formulary.mail_order': u'boolean',
 u'plan.network.network_tier': u'string',
 u'plan.plan_id_type': u'string'}

### Data uniformity check

In [104]:
def _duplicate_flags(collection, group_key):
    ''' Return a list holding one 1 per `group_key` value shared by more than one document. '''
    pipeline = [{"$group": {"_id": group_key, "count": {"$sum": 1}}}]
    return [1 for grp in collection.aggregate(pipeline) if grp['count'] > 1]

# document and identifier totals
print('%s: plan document: %d' % (logTime(), plan_col.count()))
print('%s: drug document: %d' % (logTime(), drug_col.count()))
print('%s: unique plan_id: %d' % (logTime(), len(all_plan)))
print('%s: unique rxnorm_id: %d' % (logTime(), len(all_drug)))

# identifiers that are spread over several documents
multi_plan = _duplicate_flags(plan_col, "$plan_id")
print('%s: plans with multiple documents: %d' % (logTime(), sum(multi_plan)))

multi_drug = _duplicate_flags(drug_col, "$rxnorm_id")
print('%s: drugs with multiple documents: %d' % (logTime(), sum(multi_drug)))

# characters 5-6 of a plan ID are taken as the state code
state_id = np.unique([plan_id[5:7] for plan_id in all_plan])
print('%s: states in the plan: %s' % (logTime(), ', '.join(state_id)))

2016-07-04 12:28:11.725141: plan document: 12242
2016-07-04 12:28:11.809816: drug document: 1540473
2016-07-04 12:28:11.893561: unique plan_id: 6035
2016-07-04 12:28:11.893687: unique rxnorm_id: 46206
2016-07-04 12:28:12.530894: plans with multiple documents: 1259
2016-07-04 12:28:15.130023: drugs with multiple documents: 12807
2016-07-04 12:28:15.134036: states in the plan: AK,AL,AR,AZ,CO,DE,FL,GA,HI,IA,IL,IN,KS,KY,LA,MA,ME,MI,MN,MO,MS,MT,NC,ND,NE,NH,NJ,NM,NV,OH,OK,OR,PA,SC,SD,TN,TX,UT,VA,WA,WI,WV,WY


### Get the feature space

In [111]:
state = 'OR' # set to None to include all (very slow process for all)
# Select plans by the state code held in characters 5-6 of the plan ID,
# rather than by a substring test that could match anywhere in the ID.
ex_id = all_plan if not state else [plan_id for plan_id in all_plan if plan_id[5:7] == state]
print('processing %d plans for %s ...' % (len(ex_id), 'all' if not state else state))

# For every string-typed encode field, collect the distinct values seen in
# the selected plans; these lists define the integer code of each category.
feature_space = {}
for k, v in encode_list.items():
    if v != 'string':
        continue
    is_plan_field = k.startswith('plan')
    collection = plan_col if is_plan_field else drug_col
    id_field = 'plan_id' if is_plan_field else 'plans.plan_id'
    # strip the leading 'plan.' / 'drug.' prefix to get the document path
    doc_path = k[k.index('.')+1:]
    feature_space[k] = collection.find({id_field: {'$in': ex_id}}).distinct(doc_path)

feature_space

processing 190 plans for OR ...


{u'drug.plans.drug_tier': [u'PREFERRED-BRAND',
  u'NON-PREFERRED-BRAND',
  u'GENERIC',
  u'SPECIALTY',
  u'ZERO-COST-SHARE-PREVENTIVE',
  u'NON-PREFERRED-GENERIC',
  u'PREFERRED',
  u'BRAND',
  u'SELECT',
  u'PREVENTIVE',
  u'VALUE',
  u'MEDICAL-SERVICE-DRUGS',
  u'MEDICAL-SERVICE',
  u'NON-PREFERRED',
  u'FORMULARY-DRUGS',
  u'NONPREFERRED-BRAND',
  u'PREVENTIVE-ACA'],
 u'plan.formulary.cost_sharing.coinsurance_opt': [u'NO-CHARGE',
  None,
  u'AFTER-DEDUCTIBLE'],
 u'plan.formulary.cost_sharing.copay_opt': [u'AFTER-DEDUCTIBLE',
  u'NO-CHARGE-AFTER-DEDUCTIBLE',
  u'BEFORE-DEDUCTIBLE',
  u'NO-CHARGE'],
 u'plan.formulary.cost_sharing.pharmacy_type': [u'1-MONTH-IN-RETAIL',
  u'3-MONTH-IN-MAIL',
  u'1-MONTH-OUT-RETAIL',
  u'3-MONTH-IN-RETAIL'],
 u'plan.formulary.drug_tier': [u'GENERIC',
  u'NON-PREFERRED-BRAND',
  u'PREFERRED-BRAND',
  u'SPECIALTY',
  u'ZERO-COST-SHARE-PREVENTIVE',
  u'BRAND',
  u'MEDICAL-SERVICE-DRUGS',
  u'PREFERRED',
  u'PREVENTIVE',
  u'SELECT',
  u'VALUE',
  u'MEDICAL-

### Get the drugs common to all plans
- MongoDB query operator reference: [link](https://docs.mongodb.com/manual/reference/operator/query/)

In [112]:
# get rxnorm_id group for each plan (one cursor per plan, in ex_id order)
plan_drug = [drug_col.find({'plans.plan_id': plan_id}) for plan_id in ex_id]
n_plan = len(ex_id)

# find the first plan with non-zero drug association,
# otherwise 'i' is out-of-bound of plan_drug and will stop 'naturally'
cnt, i = [], -1
while len(cnt) == 0:
    i += 1
    cnt = plan_drug[i].distinct('rxnorm_id')

# Built-in set replaces the deprecated sets.Set; the i plans skipped above
# had no drugs, so they seed the empty-plan counter.
common_drug, n_empty = set(cnt), i

print('%s: checking common drugs among %d %s plans ...' % (logTime(), n_plan - i, state))
for i, cursor in enumerate(plan_drug[i+1:], i+1):
    rx = cursor.distinct('rxnorm_id')
    if len(rx) > 0:
        common_drug.intersection_update(rx)
    else:
        # a plan without drugs must not wipe out the intersection
        n_empty += 1
    if i%30 == 0:
        print('%s: finishing %d plans, %d plan without drug, %d common drugs ...' % (logTime(), i, n_empty, len(common_drug)))

print('%s: plan without drug: %d' % (logTime(), n_empty))

# common_drug and drug_attr list will ensure the order of drug/attribute combination remains unchanged
common_drug = list(common_drug)
n_drug = len(common_drug)
# each entry: [attribute name, declared type, full encode key]
drug_attr = [[k.split('.')[-1], v, k] for k, v in encode_list.items() if k.startswith('drug')]
# categorical mask for the drug part: the attribute pattern repeated once per common drug
drug_cat_index = [attr[1] == 'string' for attr in drug_attr] * n_drug

print('%s: there are %d common drug between %d plans!' % (logTime(), n_drug, n_plan-n_empty))

2016-07-04 12:51:41.326089: checking common drugs among 190 OR plans ...
2016-07-04 12:51:46.596693: finishing 30 plans, 0 plan without drug, 2525 common drugs ...
2016-07-04 12:51:50.465489: finishing 60 plans, 0 plan without drug, 2525 common drugs ...
2016-07-04 12:51:54.349755: finishing 90 plans, 0 plan without drug, 2525 common drugs ...
2016-07-04 12:51:58.336120: finishing 120 plans, 0 plan without drug, 2430 common drugs ...
2016-07-04 12:52:07.675412: finishing 150 plans, 0 plan without drug, 1997 common drugs ...
2016-07-04 12:52:11.567414: finishing 180 plans, 0 plan without drug, 1965 common drugs ...
2016-07-04 12:52:13.022325: plan without drug: 0
2016-07-04 12:52:13.023578: there are 1965 common drug between 190 plans!


### Get the pharmacy_type space (over all extracted plan IDs) for each drug_tier

In [211]:
# Note: a per-tier find(...).distinct('formulary.cost_sharing.pharmacy_type')
# query had issues, so the pharmacy types are gathered in Python instead.
#
# The formulary documents of the selected plans are scanned once. A per-tier
# query filtered on 'formulary.0.cost_sharing.0' only inspects the first
# formulary entry and skips documents whose formulary or cost_sharing is a
# single dict rather than a list, which can leave pharmacy types out.

# start from every known tier so tiers without cost_sharing map to []
tier_pharm_sets = {tier: set() for tier in feature_space['plan.formulary.drug_tier']}

plan_cur = plan_col.find(
    {'plan_id': {'$in': ex_id}, 'formulary': {'$exists': True}},
    {'_id': 0, 'formulary': 1}
)
for p in plan_cur:
    formulary = p['formulary']
    # some documents store a single formulary entry as a dict
    if isinstance(formulary, dict):
        formulary = [formulary]
    for f in formulary:
        pharm_types = tier_pharm_sets.setdefault(f['drug_tier'], set())
        cost_sharing = f.get('cost_sharing', [])
        # same irregularity one level down
        if isinstance(cost_sharing, dict):
            cost_sharing = [cost_sharing]
        for cs in cost_sharing:
            pharm_types.add(cs['pharmacy_type'])

# lists, so every pharmacy type gets a stable position within its tier
tier_pharm = {tier: list(pharm_types) for tier, pharm_types in tier_pharm_sets.items()}

tier_pharm

{u'BRAND': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-MAIL', u'1-MONTH-OUT-RETAIL'],
 u'FORMULARY-DRUGS': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-MAIL'],
 u'GENERIC': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-MAIL'],
 u'MEDICAL-SERVICE': [],
 u'MEDICAL-SERVICE-DRUGS': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-RETAIL'],
 u'NON-PREFERRED BRAND': [],
 u'NON-PREFERRED-BRAND': [],
 u'NON-PREFERRED-GENERIC-PREFERRED-BRAND': [],
 u'NONPREFERRED-BRAND': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-MAIL'],
 u'PREFERRED': [u'1-MONTH-IN-RETAIL',
  u'3-MONTH-IN-MAIL',
  u'1-MONTH-OUT-RETAIL'],
 u'PREFERRED BRAND': [],
 u'PREFERRED-BRAND': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-MAIL'],
 u'PREFERRED-GENERIC': [],
 u'PREVENTIVE': [u'1-MONTH-IN-RETAIL'],
 u'PREVENTIVE-ACA': [u'1-MONTH-IN-RETAIL'],
 u'SELECT': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-MAIL', u'1-MONTH-OUT-RETAIL'],
 u'SPECIALTY': [u'1-MONTH-IN-RETAIL'],
 u'SPECIALTY DRUGS': [],
 u'VALUE': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-MAIL', u'1-MONTH-OUT-RETAIL'],
 u'ZERO-COST-SHARE-PR

### Get the unique pharmacy_type values for each drug_tier

In [7]:
# put tier names into list so the order is fixed for feature extraction
tiers = list(tier_pharm.keys())
# we build pharmacy type into the order of feature vector, so no need to include
# each entry: [attribute name, declared type, full encode key]
cost_attr = [[k.split('.')[-1], v, k] for k, v in encode_list.items()
             if 'cost_sharing' in k and 'pharmacy_type' not in k]
# categorical flag of each cost attribute, in cost_attr order
cost_is_cat = [attr[1] == 'string' for attr in cost_attr]
# flatten the vector to combine all tiers (cost attributes only)
cost_cat_index = [flag for t in tiers for flag in cost_is_cat * len(tier_pharm[t])]
# Mask for the whole formulary part of a plan vector. Each tier contributes
# one mail_order slot (boolean, hence not categorical) followed by the cost
# attributes of each of its pharmacy types, which is the order in which the
# plan feature vectors are assembled.
formulary_cat_index = [flag for t in tiers
                       for flag in [False] + cost_is_cat * len(tier_pharm[t])]
# plan level attributes
plan_attr = [[k.split('.')[-1], v, k] for k, v in encode_list.items()
             if k.startswith('plan') and 'formulary' not in k]
plan_cat_index = [attr[1] == 'string' for attr in plan_attr]
# total feature categorical index - must match the order of feature concatenation in plan
cat_index = formulary_cat_index + drug_cat_index

### Extract integer features for each plan
- get plan feature --> cost_sharing/pharmacy type
- get drug feature --> drug_tier

In [9]:
# for each plan, get int features (plan level & combined formulary level)
plan_int_feature = {}
# TODO: generate index for categorical variable

# Every tier owns a fixed-width segment of the formulary vector:
#   [mail_order] + one group of cost attributes per pharmacy type of the tier.
# Slots without data stay None, so every plan yields a vector of the same
# length and flattening never meets a missing tier or a bare scalar.
n_cost_attr = len(cost_attr)
tier_pos = {t: idx for idx, t in enumerate(tiers)}
tier_width = [1 + len(tier_pharm[t]) * n_cost_attr for t in tiers]

for pid in ex_id:
    print('%s: processing plan \'%s\' ...' % (logTime(), pid))
    # initialize feature vector: one None-filled segment per tier
    tier_feature = [[None] * width for width in tier_width]

    # for each plan document (a plan_id can have several documents; a later
    # document overwrites the segment of a tier it repeats)
    for plan_data in plan_col.find({'plan_id': pid}, {'_id': 0}):
        # TODO: check if network tier is necessary
        # plan_data['network'][0]['network_tier']

        if 'formulary' not in plan_data:
            print('%s: skip a plan doc without formulary for %s' % (logTime(), pid))
            continue

        # handle format irregularity, put formulary as list for some plan doc
        formulary = plan_data['formulary']
        if type(formulary) is dict:
            formulary = [formulary]

        # build feature for each drug_tier in formulary
        for tier in formulary:
            t_name = tier['drug_tier']
            pharm_types = tier_pharm[t_name]
            segment = [None] * tier_width[tier_pos[t_name]]
            # add mail order feature for the tier
            segment[0] = tier['mail_order']

            # cost slots are filled only when the tier has pharmacy types
            # and this entry carries cost_sharing; otherwise they stay None
            if pharm_types and 'cost_sharing' in tier:
                cost_sharing = tier['cost_sharing']
                if type(cost_sharing) is dict:
                    cost_sharing = [cost_sharing]

                # for each drug_tier, expand cost_sharing for all pharmacy_type
                for cs in cost_sharing:
                    start = 1 + pharm_types.index(cs['pharmacy_type']) * n_cost_attr
                    # string attributes are stored as their index in feature_space
                    segment[start:start + n_cost_attr] = [
                        cs[a[0]] if a[1] != 'string'
                        else feature_space[a[2]].index(cs[a[0]])
                        for a in cost_attr]

            tier_feature[tier_pos[t_name]] = segment

    # flatten the vector for the plan from hierarchy: tier-cost-pharmacy
    formulary_feature = [value for segment in tier_feature for value in segment]

    # get the list of drug attributes for a plan
    drug_cur = drug_col.find(
        {'plans.plan_id': pid, 'rxnorm_id': {'$in': common_drug}},# 'plans.0':{'$exists':True}},
        {'_id': 0, 'rxnorm_id': 1, 'plans': {'$elemMatch': {'plan_id': pid}}}
    )

    drug_dict = {d['rxnorm_id']: d['plans'][0] for d in drug_cur} #TODO: choose the plan matches the tier

    # drop the plan if it doesn't match with common drug list
    if len(drug_dict) != n_drug:
        print('plan %s drug list doesn\'t have all common drug (n=%d), skip' % (pid, len(drug_dict)))
        continue

    # flatten the drug attributes for all common drugs
    drug_feature = [drug_dict[rx][attr[0]] if attr[1] != 'string'
                    else feature_space[attr[2]].index(drug_dict[rx][attr[0]])
                    for rx in common_drug for attr in drug_attr]

    # combine for plan feature - must match with categorical index concatenation order
    plan_int_feature[pid] = formulary_feature + drug_feature


plan 21989AK0030001 doesn't have complete Drug Tier info: 0 tiers
plan 21989AK0050001 doesn't have complete Drug Tier info: 0 tiers
plan 21989AK0050002 doesn't have complete Drug Tier info: 0 tiers
plan 21989AK0070001 doesn't have complete Drug Tier info: 0 tiers
plan 28415OR0210001 doesn't have complete Drug Tier info: 0 tiers
plan 28415OR0010001 doesn't have complete Drug Tier info: 0 tiers
plan 28415OR0010002 doesn't have complete Drug Tier info: 0 tiers
plan 28415OR0050001 doesn't have complete Drug Tier info: 0 tiers


In [194]:
# Debug probe: look at the raw formulary of plans that carry a given tier.
pid = '30969OR0120001'
tt = 'NON-PREFERRED-BRAND'

# number of plan documents stored under this plan ID
n_doc = plan_col.find({'plan_id': pid}).count()
print('%s \n' % n_doc)

# note: this filter spans all of ex_id, not only `pid`
tier_filter = {
    'plan_id': {'$in': ex_id},
    'formulary.drug_tier': tt,
    # 'formulary.0.cost_sharing.0':{'$exists':True},
}
# optional projection, currently disabled:
#   {'_id':0, 'formulary':{'$elemMatch':{'drug_tier':tt}}}
for p in plan_col.find(tier_filter).limit(5):
    print(p['formulary'])

1 

[{u'drug_tier': u'GENERIC', u'mail_order': True}, {u'drug_tier': u'NON-PREFERRED-BRAND', u'mail_order': True}, {u'drug_tier': u'PREFERRED-BRAND', u'mail_order': True}, {u'drug_tier': u'SPECIALTY', u'mail_order': True}, {u'drug_tier': u'ZERO-COST-SHARE-PREVENTIVE', u'mail_order': True}]
[{u'drug_tier': u'GENERIC', u'mail_order': True}, {u'drug_tier': u'NON-PREFERRED-BRAND', u'mail_order': True}, {u'drug_tier': u'PREFERRED-BRAND', u'mail_order': True}, {u'drug_tier': u'SPECIALTY', u'mail_order': True}, {u'drug_tier': u'ZERO-COST-SHARE-PREVENTIVE', u'mail_order': True}]
[{u'drug_tier': u'GENERIC', u'mail_order': True}, {u'drug_tier': u'NON-PREFERRED-BRAND', u'mail_order': True}, {u'drug_tier': u'PREFERRED-BRAND', u'mail_order': True}, {u'drug_tier': u'SPECIALTY', u'mail_order': True}, {u'drug_tier': u'ZERO-COST-SHARE-PREVENTIVE', u'mail_order': True}]
[{u'drug_tier': u'GENERIC', u'mail_order': True}, {u'drug_tier': u'NON-PREFERRED-BRAND', u'mail_order': True}, {u'drug_tier': u'PREFERR

In [163]:
# display the plan IDs selected for extraction
ex_id

[u'56707OR0850005',
 u'56707OR0850006',
 u'56707OR0850008',
 u'56707OR0870002',
 u'56707OR0880002',
 u'56707OR0900008',
 u'56707OR0900009',
 u'56707OR0900011',
 u'56707OR0910008',
 u'56707OR0910009',
 u'56707OR0910011',
 u'56707OR0990002',
 u'56707OR1000002',
 u'63474OR0030003',
 u'39424OR1240001',
 u'39424OR1240002',
 u'39424OR1260001',
 u'39424OR1260002',
 u'39424OR1310001',
 u'39424OR1320001',
 u'39424OR1460001',
 u'39424OR1460002',
 u'39424OR1460003',
 u'39424OR1460004',
 u'39424OR1470001',
 u'39424OR1480001',
 u'39424OR1480002',
 u'39424OR1480003',
 u'39424OR1490001',
 u'39424OR1500001',
 u'10091OR0360004',
 u'10091OR0360005',
 u'10091OR0360006',
 u'10091OR0360007',
 u'10091OR0360008',
 u'10091OR0360009',
 u'10091OR0360010',
 u'10091OR0360011',
 u'10091OR0360012',
 u'10091OR0370002',
 u'10091OR0370003',
 u'10091OR0370004',
 u'10091OR0380003',
 u'10091OR0380004',
 u'10091OR0380005',
 u'10091OR0380006',
 u'10091OR0380007',
 u'10091OR0380008',
 u'10091OR0380009',
 u'10091OR0380010',


In [109]:
# close the MongoDB client created in the connection cell
client.close()

In [56]:
# time how long it takes to list the distinct drugs of a single plan
t0 = time.time()
plan_rx = drug_col.find({'plans.plan_id':'67577MI0390012'}).distinct('rxnorm_id')
print('drugs: %d' % len(plan_rx))
elapsed = time.time() - t0
print("--- %s seconds ---" % elapsed)

drugs: 2710
--- 19.570182085 seconds ---


In [84]:
# regex filtering
# Case-insensitive pattern; each '.' is a regex wildcard, so one character
# is required on both sides of "combigan".
regx = re.compile(".combigan.", re.I)
name_fields = {'_id':0, 'drug_name':1, 'rxnorm_id':1}
drug_col.find_one({'drug_name':regx}, name_fields)

{u'drug_name': u'Brimonidine tartrate 2 MG/ML / Timolol 5 MG/ML Ophthalmic Solution [Combigan]',
 u'rxnorm_id': u'861637'}