In [2]:
from sklearn.preprocessing import OneHotEncoder
from pymongo import MongoClient
from sets import Set
import numpy as np
import json

In [241]:
# Connect to the local MongoDB server and keep handles on the two
# collections of the `aca` database that the rest of the notebook queries.
client = MongoClient(host='localhost', port=27017)
plan_col = client['aca']['plan']
drug_col = client['aca']['drug']

# plan_col.insert_many(json.load(open('ak_plan.json')))
# plan_col.insert_many(json.load(open('or_plan.json')))
# drug_col.insert_many(json.load(open('ak_drug.json')))
# drug_col.insert_many(json.load(open('or_drug.json')))


### Load encode definition

In [232]:
def getEncodeFields(encode_def, rtn=None, path=''):
    '''Extract the selected fields from the encode json definition.

    Walks the nested definition and records every node flagged with
    ``"encode": 1`` under its dotted path (``properties`` levels are
    transparent and do not appear in the path).

    Parameters
    ----------
    encode_def : dict (or any JSON value)
        The (sub)definition to inspect. Non-dict values are ignored.
    rtn : dict, optional
        Accumulator that is filled in place; a new dict is created when
        omitted.
    path : str
        Dotted path of ``encode_def`` relative to the root, with a leading
        '.' (empty for the root). Used by the recursion.

    Returns
    -------
    dict
        ``rtn``, mapping dotted field path -> the node's ``type`` value.

    Raises
    ------
    KeyError
        If a node flagged with ``"encode": 1`` has no ``type`` entry.
    '''
    if rtn is None:
        rtn = {}

    # Scalar annotations (e.g. "type": "object" sitting next to "properties")
    # have no nested fields; recursing into them would index a string/number.
    if not isinstance(encode_def, dict):
        return rtn

    if 'encode' in encode_def:
        # A node carrying an "encode" flag is a leaf: it is either selected
        # (== 1) or skipped, and is never descended into.
        if encode_def['encode'] == 1:
            rtn[path[1:]] = encode_def['type']  # path[1:] drops the leading '.'
        return rtn

    for f in encode_def:
        child_path = path if f == 'properties' else path + '.' + f
        getEncodeFields(encode_def[f], rtn, child_path)
    return rtn

# Read the encode definition and collect the fields selected for encoding
# (dotted field path -> declared type). The file is opened in a `with` block
# so the handle is closed as soon as the JSON has been parsed.
with open('encode2.json') as encode_file:
    encode_list = getEncodeFields(json.load(encode_file), {})
encode_list

{u'drug.plans.drug_tier': u'string',
 u'drug.plans.prior_authorization': u'boolean',
 u'drug.plans.quantity_limit': u'boolean',
 u'drug.plans.step_therapy': u'boolean',
 u'plan.formulary.cost_sharing.coinsurance_opt': u'string',
 u'plan.formulary.cost_sharing.coinsurance_rate': u'float',
 u'plan.formulary.cost_sharing.copay_amount': u'float',
 u'plan.formulary.cost_sharing.copay_opt': u'string',
 u'plan.formulary.cost_sharing.pharmacy_type': u'string',
 u'plan.formulary.drug_tier': u'string',
 u'plan.formulary.mail_order': u'boolean',
 u'plan.network.network_tier': u'string',
 u'plan.plan_id_type': u'string'}

### Get feature space

In [196]:
# For every string-typed encode field, collect the distinct values stored in
# MongoDB. The first path component names the collection inside `aca`; the
# remainder is the attribute path queried within that collection.
feature_space = {}
for field, field_type in encode_list.items():
    if field_type != 'string':
        continue
    collection_name, attribute_path = field.split('.', 1)
    feature_space[field] = client.aca[collection_name].distinct(attribute_path)
feature_space

{u'drug.plans.drug_tier': [u'SELECT',
  u'BRAND',
  u'PREFERRED',
  u'VALUE',
  u'SPECIALTY',
  u'PREVENTIVE',
  u'MEDICAL-SERVICE-DRUGS'],
 u'plan.formulary.cost_sharing.coinsurance_opt': [None, u'AFTER-DEDUCTIBLE'],
 u'plan.formulary.cost_sharing.copay_opt': [u'BEFORE-DEDUCTIBLE',
  u'NO-CHARGE'],
 u'plan.formulary.cost_sharing.pharmacy_type': [u'1-MONTH-IN-RETAIL',
  u'3-MONTH-IN-MAIL',
  u'3-MONTH-IN-RETAIL',
  u'3-MONTH-OUT-RETAIL',
  u'1-MONTH-OUT-RETAIL'],
 u'plan.formulary.drug_tier': [u'BRAND',
  u'MEDICAL-SERVICE-DRUGS',
  u'PREFERRED',
  u'PREVENTIVE',
  u'SELECT',
  u'SPECIALTY',
  u'VALUE'],
 u'plan.network.network_tier': [u'PREFERRED'],
 u'plan.plan_id_type': [u'HIOS-PLAN-ID']}

### Get common drugs between plans

In [207]:
# get rxnorm_id group for each plan
# get rxnorm_id group for each plan
all_plans = plan_col.distinct('plan_id')
plan_drug = [drug_col.find({'plans.plan_id':pid}, {'_id':0, 'rxnorm_id':1}) for pid in all_plans]

# Intersect the rxnorm_id sets of every plan that lists at least one drug.
# Plans without any drug record are ignored (including the first one), so an
# empty plan cannot wipe out the intersection. The built-in `set` replaces the
# deprecated `sets.Set`.
common_drug = None
for drug_cursor in plan_drug:
    rx = set(x['rxnorm_id'] for x in drug_cursor)
    if not rx:
        continue
    if common_drug is None:
        common_drug = rx
    else:
        common_drug.intersection_update(rx)

# common_drug and drug_attr fix the order of the drug/attribute combinations;
# both are sorted so that the order is the same on every run instead of
# depending on set/dict iteration order.
common_drug = sorted(common_drug) if common_drug else []
n_drug = len(common_drug)
# Each entry is [attribute name, declared type, full encode key].
drug_attr = [[k.split('.')[-1],v,k] for k,v in sorted(encode_list.items()) if k.startswith('drug')]
# True where the flattened (drug, attribute) position holds a categorical value.
drug_cat_index = [k[1]=='string' for k in drug_attr]*n_drug

### Get pharmacy_type space (over all plans) for each drug_tier

In [307]:
# Map each drug tier to the distinct pharmacy types found in the plans whose
# formulary mentions that tier.
# NOTE(review): the filter selects whole plan documents, so the distinct list
# presumably also contains pharmacy types attached to other tiers of the same
# plan -- confirm whether a per-tier restriction is intended.
tier_pharm = {}
for drug_tier in feature_space['drug.plans.drug_tier']:
    plans_with_tier = plan_col.find({'formulary.drug_tier': drug_tier})
    tier_pharm[drug_tier] = plans_with_tier.distinct('formulary.cost_sharing.pharmacy_type')

# # SELECT, u'PREFERRED', u'BRAND', u'SPECIALTY', u'PREVENTIVE', u'VALUE', u'MEDICAL-SERVICE-DRUGS

# # check how pharmacy_type spreads over the plan
# for p in plan_col.find({},{'_id':-1, 'network':1, 'formulary':1}):
#     if len(p['formulary'])>0:
#         print [y['pharmacy_type'] for x in p['formulary'] for y in x['cost_sharing'] if x['drug_tier']=='SELECT']
# #         print '\n'

In [315]:
# Inspect a single plan (everything except Mongo's _id).
plan_data = plan_col.find_one({'plan_id':'73836AK0750001' }, {'_id':0})

# for the formulary, build feature for each drug_tier
if len(plan_data['formulary']) != len(tier_pharm):
    # Report the actual plan id (the message previously printed the literal 'pid').
    print('plan %s doesn\'t have complete Drug Tier info' % plan_data['plan_id'])

# for each drug_tier, expand cost_sharing for all pharmacy_type
for tier in tier_pharm:
    print(tier)

BRAND
SPECIALTY
VALUE
MEDICAL-SERVICE-DRUGS
PREFERRED
PREVENTIVE
SELECT


### Extract integer features for each plan
- get plan feature --> cost_sharing/drug_tier
- get drug feature --> pharmacy type

In [222]:
# Map plan_id -> flat feature list covering every (common drug, drug attribute)
# pair, in the order given by common_drug x drug_attr.
plan_int_feature = {}
for pid in all_plans:
    # extract plan feature
    # NOTE(review): plan_data is fetched here but not used further in this cell.
    plan_data = plan_col.find_one({'plan_id':pid})

    # get the list of drug attributes for a plan; the $elemMatch projection
    # keeps only the first `plans` entry whose plan_id matches this plan
    drug_cur = drug_col.find(
        {'plans.plan_id':pid, 'rxnorm_id':{'$in':common_drug}},
        {'_id':0, 'rxnorm_id':1, 'plans':{ '$elemMatch':{'plan_id':pid} }}
    )

    drug_dict = {d['rxnorm_id']:d['plans'][0] for d in drug_cur}

    # drop the plan if it doesn't match with common drug list
    if len(drug_dict) != n_drug:
        # Fixed: a stray '%' before len(...) made this line a syntax error.
        print('plan %s drug list is nonnormal (n=%d), skip' % (pid, len(drug_dict)))
        continue

    # flatten the drug attributes for all common drugs: non-string attributes
    # are stored as-is, string attributes are replaced by their position in
    # the matching feature_space list (ValueError if the value is not listed)
    plan_int_feature[pid] = [drug_dict[rx][attr[0]] if attr[1]!='string'
                             else feature_space[attr[2]].index(drug_dict[rx][attr[0]])
                             for rx in common_drug for attr in drug_attr]


plan 21989AK0030001 drug list is nonnormal, skip
plan 21989AK0050001 drug list is nonnormal, skip
plan 21989AK0050002 drug list is nonnormal, skip
plan 21989AK0070001 drug list is nonnormal, skip
plan 28415OR0210001 drug list is nonnormal, skip
plan 28415OR0010001 drug list is nonnormal, skip
plan 28415OR0010002 drug list is nonnormal, skip
plan 28415OR0050001 drug list is nonnormal, skip


In [308]:
# Display every distinct plan_id stored in the plan collection.
plan_col.distinct('plan_id')

[u'73836AK0750001',
 u'73836AK0750002',
 u'73836AK0750003',
 u'73836AK0750004',
 u'73836AK0840001',
 u'73836AK0850001',
 u'73836AK0850002',
 u'73836AK0860001',
 u'73836AK0860002',
 u'73836AK0790001',
 u'73836AK0790002',
 u'21989AK0030001',
 u'21989AK0050001',
 u'21989AK0050002',
 u'21989AK0070001',
 u'39424OR1240001',
 u'39424OR1260001',
 u'39424OR1460001',
 u'39424OR1460002',
 u'39424OR1240002',
 u'39424OR1260002',
 u'39424OR1460003',
 u'39424OR1460004',
 u'39424OR1470001',
 u'39424OR1310001',
 u'39424OR1480001',
 u'39424OR1480003',
 u'39424OR1490001',
 u'39424OR1500001',
 u'39424OR1320001',
 u'28415OR0210001',
 u'28415OR0010001',
 u'28415OR0010002',
 u'28415OR0050001',
 u'39424OR1480002']

In [238]:
# Close the MongoDB client opened at the top of the notebook.
client.close()