In [1]:
from sklearn.preprocessing import OneHotEncoder
from pymongo import MongoClient
from sets import Set
import numpy as np
import json

In [2]:
# Connect to the local MongoDB server and keep handles to the two ACA collections.
client = MongoClient(host='localhost', port=27017)
plan_col = client['aca']['plan']
drug_col = client['aca']['drug']

# Data-load statements, left disabled:
# plan_col.insert_many(json.load(open('ak_plan.json')))
# plan_col.insert_many(json.load(open('or_plan.json')))
# drug_col.insert_many(json.load(open('ak_drug.json')))
# drug_col.insert_many(json.load(open('or_drug.json')))


### Load encode definition

In [3]:
def getEncodeFields(encode_def, rtn, path=''):
    ''' Collect the fields flagged for encoding from a nested encode definition.

    encode_def -- nested dict; a node carrying an 'encode' key is treated as a
                  leaf and is selected only when encode == 1 (its 'type' is recorded)
    rtn        -- dict that receives {dotted_field_path: type}; updated in place
    path       -- dotted path accumulated so far (with a leading '.'); 'properties'
                  keys are traversed but not added to the path

    Returns rtn. Raises KeyError if a selected node has no 'type' entry.
    '''

    # Scalar metadata on an intermediate node (e.g. a "type": "object" string)
    # describes no fields; recursing into it would iterate its characters and fail.
    if not isinstance(encode_def, dict):
        return rtn

    if 'encode' in encode_def:
        # leaf node: keep it only when it is switched on, never look below it
        if encode_def['encode'] == 1:
            # path always starts with '.', strip it for the stored key
            rtn[path[1:]] = encode_def['type']
        return rtn

    for f in encode_def:
        child_path = path if f == 'properties' else path + '.' + f
        getEncodeFields(encode_def[f], rtn, child_path)
    return rtn

# Parse the encode definition inside a with-block so the file handle is closed
# as soon as the JSON has been read.
with open('encode2.json') as encode_file:
    encode_list = getEncodeFields(json.load(encode_file), {})
encode_list

{u'drug.plans.drug_tier': u'string',
 u'drug.plans.prior_authorization': u'boolean',
 u'drug.plans.quantity_limit': u'boolean',
 u'drug.plans.step_therapy': u'boolean',
 u'plan.formulary.cost_sharing.coinsurance_opt': u'string',
 u'plan.formulary.cost_sharing.coinsurance_rate': u'float',
 u'plan.formulary.cost_sharing.copay_amount': u'float',
 u'plan.formulary.cost_sharing.copay_opt': u'string',
 u'plan.formulary.cost_sharing.pharmacy_type': u'string',
 u'plan.formulary.drug_tier': u'string',
 u'plan.formulary.mail_order': u'boolean',
 u'plan.network.network_tier': u'string',
 u'plan.plan_id_type': u'string'}

### Get feature space

In [4]:
# For every string-typed field, ask MongoDB for the distinct values it takes.
# The part of the key before the first dot names the collection; the rest is
# the field path queried inside that collection.
feature_space = {}
for field_key, field_type in encode_list.items():
    if field_type != 'string':
        continue
    dot_pos = field_key.index('.')
    collection = client.aca[field_key[:dot_pos]]
    feature_space[field_key] = collection.distinct(field_key[dot_pos+1:])
feature_space

{u'drug.plans.drug_tier': [u'SELECT',
  u'BRAND',
  u'PREFERRED',
  u'VALUE',
  u'SPECIALTY',
  u'PREVENTIVE',
  u'MEDICAL-SERVICE-DRUGS'],
 u'plan.formulary.cost_sharing.coinsurance_opt': [None, u'AFTER-DEDUCTIBLE'],
 u'plan.formulary.cost_sharing.copay_opt': [u'BEFORE-DEDUCTIBLE',
  u'NO-CHARGE'],
 u'plan.formulary.cost_sharing.pharmacy_type': [u'1-MONTH-IN-RETAIL',
  u'3-MONTH-IN-MAIL',
  u'3-MONTH-IN-RETAIL',
  u'3-MONTH-OUT-RETAIL',
  u'1-MONTH-OUT-RETAIL'],
 u'plan.formulary.drug_tier': [u'BRAND',
  u'MEDICAL-SERVICE-DRUGS',
  u'PREFERRED',
  u'PREVENTIVE',
  u'SELECT',
  u'SPECIALTY',
  u'VALUE'],
 u'plan.network.network_tier': [u'PREFERRED'],
 u'plan.plan_id_type': [u'HIOS-PLAN-ID']}

### Get common drugs between plans

In [5]:
# get rxnorm_id group for each plan
all_plans = plan_col.distinct('plan_id')
plan_drug = [drug_col.find({'plans.plan_id':pid}, {'_id':0, 'rxnorm_id':1}) for pid in all_plans]

# Intersect the drug sets of all plans that list at least one drug.
# Plans without any drug are ignored uniformly -- including the first one, which
# would otherwise seed the intersection with an empty set and wipe out the result.
common_drug = None
for plan_cursor in plan_drug:
    rx = set(x['rxnorm_id'] for x in plan_cursor)
    if not rx:
        continue
    if common_drug is None:
        common_drug = rx
    else:
        common_drug.intersection_update(rx)

# common_drug and drug_attr are kept as lists so the order of the drug/attribute
# combinations stays fixed for the rest of the notebook
common_drug = list(common_drug) if common_drug is not None else []
n_drug = len(common_drug)
# each entry is [attribute name, type, full dotted key]
drug_attr = [[k.split('.')[-1],v,k] for k,v in encode_list.items() if k.startswith('drug')]
# one categorical flag per (drug, attribute) pair, in the same order as the drug features
drug_cat_index = [k[1]=='string' for k in drug_attr]*n_drug

### Get pharmacy_type space (over all plans) for each drug_tier

In [6]:
# NOTE: there seems to be a bug when combining distinct() with a sub-array $elemMatch projection; the query below doesn't work
# tier_pharm = {tier : plan_col.find(
#         {'formulary.drug_tier':tier},
#         {'_id':0, 'formulary':{'$elemMatch':{'drug_tier':tier}}, 'formulary.cost_sharing.pharmacy_type':1}
#     ).distinct('formulary.cost_sharing.pharmacy_type')
#     for tier in feature_space['drug.plans.drug_tier']}

# Collect the pharmacy types per drug tier with plain loops instead of distinct()
tier_pharm = {}
for tier_name in feature_space['drug.plans.drug_tier']:
    tier_filter = {'formulary.drug_tier':tier_name}
    tier_projection = {'_id':0, 'formulary':{'$elemMatch':{'drug_tier':tier_name}}, 'formulary.cost_sharing.pharmacy_type':1}
    pharmacy_types = Set()
    for plan_doc in plan_col.find(tier_filter, tier_projection):
        # the $elemMatch projection leaves the matching formulary entry at index 0
        for cost_share in plan_doc['formulary'][0]['cost_sharing']:
            pharmacy_types.add(cost_share['pharmacy_type'])
    tier_pharm[tier_name] = list(pharmacy_types)

tier_pharm
# # SELECT, u'PREFERRED', u'BRAND', u'SPECIALTY', u'PREVENTIVE', u'VALUE', u'MEDICAL-SERVICE-DRUGS

# # check how pharmacy_type spreads over the plan
# for p in plan_col.find({},{'_id':-1, 'network':1, 'formulary':1}):
#     if len(p['formulary'])>0:
#         print [y['pharmacy_type'] for x in p['formulary'] for y in x['cost_sharing'] if x['drug_tier']=='SELECT']
# #         print '\n'

{u'BRAND': [u'1-MONTH-OUT-RETAIL',
  u'3-MONTH-IN-MAIL',
  u'3-MONTH-IN-RETAIL',
  u'1-MONTH-IN-RETAIL',
  u'3-MONTH-OUT-RETAIL'],
 u'MEDICAL-SERVICE-DRUGS': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-RETAIL'],
 u'PREFERRED': [u'1-MONTH-OUT-RETAIL',
  u'3-MONTH-IN-MAIL',
  u'3-MONTH-IN-RETAIL',
  u'1-MONTH-IN-RETAIL',
  u'3-MONTH-OUT-RETAIL'],
 u'PREVENTIVE': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-RETAIL'],
 u'SELECT': [u'1-MONTH-OUT-RETAIL',
  u'3-MONTH-IN-MAIL',
  u'3-MONTH-IN-RETAIL',
  u'1-MONTH-IN-RETAIL',
  u'3-MONTH-OUT-RETAIL'],
 u'SPECIALTY': [u'1-MONTH-IN-RETAIL'],
 u'VALUE': [u'1-MONTH-OUT-RETAIL',
  u'3-MONTH-IN-MAIL',
  u'3-MONTH-IN-RETAIL',
  u'1-MONTH-IN-RETAIL',
  u'3-MONTH-OUT-RETAIL']}

In [7]:
# freeze the tier names in a list so their order is fixed for feature extraction
tiers = list(tier_pharm)

# cost-sharing attributes as [name, type, full key]; pharmacy type is left out
# because it is encoded by position in the feature vector
cost_attr = []
for field_key, field_type in encode_list.items():
    if 'cost_sharing' in field_key and 'pharmacy_type' not in field_key:
        cost_attr.append([field_key.split('.')[-1], field_type, field_key])

# categorical flags of the cost attributes, repeated once per pharmacy type of
# every tier and laid out flat in tier order
cost_is_categorical = [attr[1] == 'string' for attr in cost_attr]
cost_cat_index = []
for tier_name in tiers:
    cost_cat_index.extend(cost_is_categorical * len(tier_pharm[tier_name]))

# plan level attributes (everything under 'plan' that is not part of the formulary)
plan_attr = []
for field_key, field_type in encode_list.items():
    if field_key.startswith('plan') and 'formulary' not in field_key:
        plan_attr.append([field_key.split('.')[-1], field_type, field_key])
plan_cat_index = [attr[1] == 'string' for attr in plan_attr]

# total categorical index - must match the order in which the plan features are concatenated
cat_index = cost_cat_index + drug_cat_index

### Extract integer features for each plan
- get plan feature --> cost_sharing/drug_tier
- get drug feature --> pharmacy type

In [9]:
def _encode_attr(record, attr):
    ''' Return record's value for attr ([name, type, full key]); string values
    are replaced by their position in feature_space[full key]. '''
    value = record[attr[0]]
    if attr[1] != 'string':
        return value
    return feature_space[attr[2]].index(value)


# for each plan, get int features (combined formulary level + drug level)
plan_int_feature = {}
expected_tiers = set(tiers)
for pid in all_plans:
    # extract plan feature
    plan_data = plan_col.find_one({'plan_id':pid})

    # TODO: check if network tier is necessary
    # plan_data['network'][0]['network_tier']

    # Drop the plan unless its formulary lists every known drug tier exactly once.
    # Comparing only the number of entries would accept a plan with a duplicated or
    # unknown tier, which leaves a None slot in tier_feature (or an unknown key) and
    # breaks the feature construction below.
    plan_tiers = [t['drug_tier'] for t in plan_data['formulary']]
    if len(plan_tiers) != len(tier_pharm) or set(plan_tiers) != expected_tiers:
        print('plan %s doesn\'t have complete Drug Tier info: %d tiers' % (pid, len(plan_data['formulary'])))
        continue

    # for the formulary, build feature for each drug_tier
    tier_feature = [None]*len(tiers)
    for tier in plan_data['formulary']:
        # for each drug_tier, expand cost_sharing over all pharmacy types of that tier
        pharmacy_types = tier_pharm[tier['drug_tier']]
        cost_feature = [None]*len(pharmacy_types)
        for cs in tier['cost_sharing']:
            ph_idx = pharmacy_types.index(cs['pharmacy_type'])
            cost_feature[ph_idx] = [_encode_attr(cs, a) for a in cost_attr]
        # pharmacy types the plan does not offer get a row of None placeholders
        tier_feature[tiers.index(tier['drug_tier'])] = [c if c else [None]*len(cost_attr) for c in cost_feature]
    # flatten the vector for the plan from hierarchy: tier-pharmacy-cost attribute
    formulary_feature = [z for x in tier_feature for y in x for z in y]

    # get the list of drug attributes for a plan
    drug_cur = drug_col.find(
        {'plans.plan_id':pid, 'rxnorm_id':{'$in':common_drug}},
        {'_id':0, 'rxnorm_id':1, 'plans':{ '$elemMatch':{'plan_id':pid} }}
    )

    # the $elemMatch projection leaves this plan's entry at index 0
    drug_dict = {d['rxnorm_id']:d['plans'][0] for d in drug_cur}

    # drop the plan if it doesn't match with common drug list
    if len(drug_dict) != n_drug:
        print('plan %s drug list is nonnormal (n=%d), skip' % (pid, len(drug_dict)))
        continue

    # flatten the drug attributes for all common drugs
    drug_feature = [_encode_attr(drug_dict[rx], attr) for rx in common_drug for attr in drug_attr]

    # combine for plan feature - must match the categorical index concatenation order
    plan_int_feature[pid] = formulary_feature + drug_feature


plan 21989AK0030001 doesn't have complete Drug Tier info: 0 tiers
plan 21989AK0050001 doesn't have complete Drug Tier info: 0 tiers
plan 21989AK0050002 doesn't have complete Drug Tier info: 0 tiers
plan 21989AK0070001 doesn't have complete Drug Tier info: 0 tiers
plan 28415OR0210001 doesn't have complete Drug Tier info: 0 tiers
plan 28415OR0010001 doesn't have complete Drug Tier info: 0 tiers
plan 28415OR0010002 doesn't have complete Drug Tier info: 0 tiers
plan 28415OR0050001 doesn't have complete Drug Tier info: 0 tiers


In [10]:
# Sanity check: every plan's feature vector must be exactly as long as the
# categorical index. Only the last expression of a cell is displayed, so the
# per-plan lengths are asserted here instead of being computed and discarded.
n_feature = len(cost_cat_index + drug_cat_index)
feature_lengths = [len(x) for x in plan_int_feature.values()]
assert all(n == n_feature for n in feature_lengths), \
    'feature vector length does not match categorical index: %s' % sorted(set(feature_lengths))
n_feature

35336

In [238]:
# Close the MongoDB client created at the top of the notebook.
client.close()