In [1]:
from sklearn.preprocessing import OneHotEncoder
from pymongo import MongoClient
from zipfile import ZipFile, ZIP_DEFLATED
from sets import Set
import numpy as np
import json, sys, os, time, re, datetime

In [2]:
# Pick the MongoDB deployment to read from; set `local` to True for a local server.
local = False
if not local:
    # Remote (AWS-hosted) deployment: plans, formularies and providers live in separate databases.
    client = MongoClient('ec2-54-153-83-172.us-west-1.compute.amazonaws.com', 27017)
    plan_col = client.plans.plans
    drug_col = client.formularies.drugs
    prov_col = client.providers.providers
    faci_col = client.providers.facilities
else:
    # Local deployment: a single `aca` database.
    # NOTE: prov_col / faci_col are not defined in this branch.
    client = MongoClient('localhost', 27017)
    plan_col = client.aca.plan
    drug_col = client.aca.drug

# Distinct ids referenced by the drug collection.
all_plan = drug_col.distinct('plans.plan_id')
all_drug = drug_col.distinct('rxnorm_id')

mongo_label = 'local' if local else 'aws'
print('Using %s Mongo, total drug: %d, total plan: %d' % (mongo_label, len(all_drug), len(all_plan)))
# Handy when exploring the databases:
# client.formularies.collection_names()
# client.providers.collection_names()

Using aws Mongo, total drug: 46206, total plan: 6035


### Load encode definition

In [3]:
def getEncodeFields(encode_def, rtn, path=''):
    ''' Extract the selected fields from the encode json definition.

    Walks the nested definition and records every node flagged with
    ``"encode": 1`` under its dotted path. ``properties`` levels are
    transparent, i.e. they do not add a path component.

    Args:
        encode_def: the (sub-)definition to walk, normally a dict.
        rtn: dict that receives ``{dotted_path: type}``; mutated in place.
        path: dotted path accumulated so far, carrying a leading '.'.

    Returns:
        The same ``rtn`` dict, so the call can be used as an expression.

    Raises:
        KeyError: if a node has ``encode == 1`` but no ``type`` entry.
    '''
    # Only dict nodes can carry an 'encode' flag or children. Scalars and
    # lists (e.g. a sibling "type": "object" entry) are skipped; without this
    # guard they raise TypeError during the membership test or the lookup.
    if not isinstance(encode_def, dict):
        return rtn

    if 'encode' in encode_def:
        # A flagged node is a leaf: record it when selected, never descend.
        if encode_def['encode'] == 1:
            rtn[path[1:]] = encode_def['type']
        return rtn

    for f in encode_def:
        getEncodeFields(encode_def[f], rtn, path + ('' if f == 'properties' else '.' + f))
    return rtn

def logTime():
    ''' Return the current local date and time as text, used to prefix log lines. '''
    now = datetime.datetime.now()
    return str(now)

# Read the encode definition and keep only the fields flagged for encoding.
# The `with` block closes the file as soon as the JSON has been parsed.
with open('encode2.json') as encode_file:
    encode_list = getEncodeFields(json.load(encode_file), {})
encode_list

{u'drug.plans.drug_tier': u'string',
 u'drug.plans.prior_authorization': u'boolean',
 u'drug.plans.quantity_limit': u'boolean',
 u'drug.plans.step_therapy': u'boolean',
 u'plan.formulary.cost_sharing.coinsurance_opt': u'string',
 u'plan.formulary.cost_sharing.coinsurance_rate': u'float',
 u'plan.formulary.cost_sharing.copay_amount': u'float',
 u'plan.formulary.cost_sharing.copay_opt': u'string',
 u'plan.formulary.cost_sharing.pharmacy_type': u'string',
 u'plan.formulary.drug_tier': u'string',
 u'plan.formulary.mail_order': u'boolean',
 u'plan.network.network_tier': u'string',
 u'plan.plan_id_type': u'string'}

### Data uniformity check

In [4]:
# Collection sizes versus the number of distinct ids.
print('%s: plan document: %d' % (logTime(), plan_col.count()))
print('%s: drug document: %d' % (logTime(), drug_col.count()))
print('%s: unique plan_id: %d' % (logTime(), len(all_plan)))
print('%s: unique rxnorm_id: %d' % (logTime(), len(all_drug)))

# One flag per plan_id that is shared by more than one plan document.
plan_groups = plan_col.aggregate([{"$group": {"_id": "$plan_id", "count": {"$sum": 1}}}])
multi_plan = [1 for grp in plan_groups if grp['count'] > 1]
print('%s: plans with multiple documents: %d' % (logTime(), sum(multi_plan)))

# Same check for rxnorm_id in the drug collection.
drug_groups = drug_col.aggregate([{"$group": {"_id": "$rxnorm_id", "count": {"$sum": 1}}}])
multi_drug = [1 for grp in drug_groups if grp['count'] > 1]
print('%s: drugs with multiple documents: %d' % (logTime(), sum(multi_drug)))

# Characters [5:7] of each plan id, reported below as the states in the plans.
state_id = np.unique([pid[5:7] for pid in all_plan])
print('%s: states in the plan: %s' % (logTime(), ', '.join(state_id)))

2016-07-05 20:31:18.759817: plan document: 12136
2016-07-05 20:31:18.842571: drug document: 1540473
2016-07-05 20:31:18.938232: unique plan_id: 6035
2016-07-05 20:31:18.938358: unique rxnorm_id: 46206
2016-07-05 20:31:19.884878: plans with multiple documents: 1148
2016-07-05 20:33:07.686372: drugs with multiple documents: 12807
2016-07-05 20:33:07.688559: states in the plan: AK, AL, AR, AZ, CO, DE, FL, GA, HI, IA, IL, IN, KS, KY, LA, MA, ME, MI, MN, MO, MS, MT, NC, ND, NE, NH, NJ, NM, NV, OH, OK, OR, PA, SC, SD, TN, TX, UT, VA, WA, WI, WV, WY


### Get feature space

In [5]:
state = 'OR' # set to None to include all (very slow process for all)
# Select plans by the state code at its fixed position in the plan id. This is
# the same [5:7] slice the uniformity check uses; a plain substring test could
# match the letters anywhere in the id.
ex_id = all_plan if not state else [pid for pid in all_plan if pid[5:7] == state]
print('processing %d plans for %s ...' % (len(ex_id), 'all' if not state else state))

# For every string-typed encoded field, collect the distinct values found in
# the selected plans; these lists define the categorical feature space.
feature_space = {}
for enc_key, enc_type in encode_list.items():
    if enc_type != 'string':
        continue
    is_plan_field = enc_key.startswith('plan')
    source_col = plan_col if is_plan_field else drug_col
    id_field = 'plan_id' if is_plan_field else 'plans.plan_id'
    # Drop the leading 'plan.' / 'drug.' prefix to get the path inside the document.
    doc_path = enc_key[enc_key.index('.') + 1:]
    feature_space[enc_key] = source_col.find({id_field: {'$in': ex_id}}).distinct(doc_path)

feature_space

processing 190 plans for OR ...


{u'drug.plans.drug_tier': [u'PREFERRED-BRAND',
  u'NON-PREFERRED-BRAND',
  u'GENERIC',
  u'SPECIALTY',
  u'ZERO-COST-SHARE-PREVENTIVE',
  u'NON-PREFERRED-GENERIC',
  u'PREFERRED',
  u'BRAND',
  u'SELECT',
  u'PREVENTIVE',
  u'VALUE',
  u'MEDICAL-SERVICE-DRUGS',
  u'MEDICAL-SERVICE',
  u'NON-PREFERRED',
  u'FORMULARY-DRUGS',
  u'NONPREFERRED-BRAND',
  u'PREVENTIVE-ACA'],
 u'plan.formulary.cost_sharing.coinsurance_opt': [u'AFTER-DEDUCTIBLE',
  u'NO-CHARGE',
  None],
 u'plan.formulary.cost_sharing.copay_opt': [u'NO-CHARGE',
  u'AFTER-DEDUCTIBLE',
  u'NO-CHARGE-AFTER-DEDUCTIBLE',
  u'BEFORE-DEDUCTIBLE'],
 u'plan.formulary.cost_sharing.pharmacy_type': [u'1-MONTH-IN-RETAIL',
  u'3-MONTH-IN-MAIL',
  u'1-MONTH-OUT-RETAIL',
  u'3-MONTH-IN-RETAIL'],
 u'plan.formulary.drug_tier': [u'GENERIC',
  u'NONPREFERRED-BRAND',
  u'PREFERRED-BRAND',
  u'PREVENTIVE-ACA',
  u'SPECIALTY',
  u'FORMULARY-DRUGS',
  u'NON-PREFERRED BRAND',
  u'PREFERRED BRAND',
  u'SPECIALTY DRUGS',
  u'NON-PREFERRED-BRAND',
  u'Z

### Get common drugs between plans
- operator reference [link](https://docs.mongodb.com/manual/reference/operator/query/)

In [6]:
# rxnorm ids of every drug listed by at least one of the selected plans
selected_plan_filter = {'plans.plan_id': {'$in': ex_id}}
common_drug = drug_col.find(selected_plan_filter).distinct('rxnorm_id')
n_drug = len(common_drug)

# [short field name, declared type, full encode key] for each drug-level attribute
drug_attr = []
for enc_key, enc_type in encode_list.items():
    if enc_key.startswith('drug'):
        drug_attr.append([enc_key.split('.')[-1], enc_type, enc_key])

# Per drug: one leading non-categorical slot, then one slot per attribute
# (True where the attribute is string-typed); repeated for every common drug.
per_drug_flags = [False] + [attr[1] == 'string' for attr in drug_attr]
drug_cat_index = per_drug_flags * n_drug
print('%s: there are %d drugs for %d plans!' % (logTime(), n_drug, len(ex_id)))

2016-07-05 20:33:52.285964: there are 10632 drugs for 190 plans!


### Get unique pharmacy_type for each drug_tier

In [10]:
# Note: issue with distinct in the query
# tier_pharm = {tier : plan_col.find(
#         {'plan_id':{'$in':ex_id}, 'formulary.drug_tier':tier}, #'formulary.0.cost_sharing.0':{'$exists':True}},
#         {'_id':0, 'formulary':{'$elemMatch':{'drug_tier':tier}}}
#     ).distinct('formulary.cost_sharing.pharmacy_type')
#     for tier in feature_space['plan.formulary.drug_tier']
# }

# traditional way to check
# Map every drug tier seen in the selected plans to the pharmacy types of its
# cost-sharing entries.
tier_pharm = {}
# NOTE: a per-tier find() with an {'$elemMatch': {'drug_tier': tier}} projection
# was tried first, but that projection returns no content when `formulary` is
# stored as a dict instead of an array, so the documents are walked directly.
for plan_doc in plan_col.find({'plan_id': {'$in': ex_id}}):
    if 'formulary' not in plan_doc:
        continue
    formulary = plan_doc['formulary']
    # some documents hold a single formulary dict rather than a list of them
    if type(formulary) is dict:
        formulary = [formulary]
    for entry in formulary:
        # register the tier even when it has no cost_sharing information
        pharmacy_types = tier_pharm.setdefault(entry['drug_tier'], [])
        if 'cost_sharing' in entry:
            for cs in entry['cost_sharing']:
                pharmacy_types.append(cs['pharmacy_type'])

# de-duplicate the pharmacy types of each tier
tier_pharm = {tier: list(set(types)) for tier, types in tier_pharm.items()}

tier_pharm

{u'BRAND': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-MAIL', u'1-MONTH-OUT-RETAIL'],
 u'FORMULARY-DRUGS': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-MAIL'],
 u'GENERIC': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-MAIL'],
 u'MEDICAL-SERVICE': [],
 u'MEDICAL-SERVICE-DRUGS': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-RETAIL'],
 u'NON-PREFERRED BRAND': [],
 u'NON-PREFERRED-BRAND': [],
 u'NON-PREFERRED-GENERIC-PREFERRED-BRAND': [],
 u'NONPREFERRED-BRAND': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-MAIL'],
 u'PREFERRED': [u'1-MONTH-IN-RETAIL',
  u'3-MONTH-IN-MAIL',
  u'1-MONTH-OUT-RETAIL'],
 u'PREFERRED BRAND': [],
 u'PREFERRED-BRAND': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-MAIL'],
 u'PREFERRED-GENERIC': [],
 u'PREVENTIVE': [u'1-MONTH-IN-RETAIL'],
 u'PREVENTIVE-ACA': [u'1-MONTH-IN-RETAIL'],
 u'SELECT': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-MAIL', u'1-MONTH-OUT-RETAIL'],
 u'SPECIALTY': [u'1-MONTH-IN-RETAIL'],
 u'SPECIALTY DRUGS': [],
 u'VALUE': [u'1-MONTH-IN-RETAIL', u'3-MONTH-IN-MAIL', u'1-MONTH-OUT-RETAIL'],
 u'ZERO-COST-SHARE-PR

### Evaluate feature dimension and variable index

In [11]:
# Freeze the tier names in a list so their order is fixed for feature extraction.
tiers = list(tier_pharm.keys())

# Cost-sharing attributes as [short name, declared type, full encode key].
# pharmacy_type is left out because it is encoded by position in the vector.
cost_attr = []
for enc_key, enc_type in encode_list.items():
    if 'cost_sharing' in enc_key and 'pharmacy_type' not in enc_key:
        cost_attr.append([enc_key.split('.')[-1], enc_type, enc_key])

# Per tier: one leading non-categorical slot, then the cost attributes
# repeated once for each pharmacy type of that tier.
cost_is_categorical = [attr[1] == 'string' for attr in cost_attr]
cat2d = [[False] + cost_is_categorical * len(tier_pharm[tier]) for tier in tiers]

# Flatten the per-tier rows into one vector covering all tiers.
cost_cat_index = []
for tier_flags in cat2d:
    cost_cat_index.extend(tier_flags)

# Plan-level attributes (encode keys starting with 'plan' but outside
# 'formulary') are not part of the feature vector yet.

# Full categorical mask - must follow the same order in which the per-plan
# features are concatenated (formulary part first, then drug part).
cat_index = cost_cat_index + drug_cat_index
catagorical_var = [pos for pos, is_cat in enumerate(cat_index) if is_cat]
print('%s: feature dimension before encode: %d' % (logTime(), len(cat_index)))

2016-07-05 20:38:24.598636: feature dimension before encode: 53280


### Extract integer features for each plan
- get plan feature --> cost_sharing/pharmacy type
- get drug feature --> drug_tier

In [12]:
# For each plan, build its integer feature vector (combined formulary level
# followed by drug level). Results are stored in plan_int_feature keyed by
# plan id; `i` counts processed plans and `skip` counts plans dropped because
# of a dimension mismatch or a drug parsing error.
plan_int_feature, i, skip = {}, 0, 0

for pid in ex_id:
    i += 1    
    # progress message every 10 plans
    if i%10==0:
        print '%s: processing plans %d/%d ...' %(logTime(), i, len(ex_id))
    # initialize feature vector: one slot per tier, in the fixed order of `tiers`
    tier_feature = [None]*len(tiers)
    
    # for each plan document, assemble normalized tier info
    for tier, pharm in tier_pharm.items():               
        # fetch only the formulary entry of this tier for this plan
        doc_cur = plan_col.find(
            {'plan_id':pid, 'formulary.drug_tier':tier},
            {'_id':0, 'formulary':{'$elemMatch':{'drug_tier':tier}}}        
        )
        
        # no data for this tier, fill in the space with None
        # (1 slot for mail_order + one group of cost attributes per pharmacy type)
        n_doc = doc_cur.count()
        if n_doc == 0:            
            tier_feature[tiers.index(tier)] = [None] * (1 + len(pharm) * len(cost_attr))
            continue
            
        if n_doc > 1:
            print '\tWARNING: plan with multiple document:', pid,tier
            
        # parse tier info
        # The rows initially alias the same inner list; this is harmless here
        # because rows are only ever replaced as a whole below, never mutated.
        pharm_feature = [[None]*len(cost_attr)]*len(pharm)
        for doc in doc_cur:              
            # Query issue: doc is empty when formulary is not an array in the
            # plan document, so fall back to reading the formulary dict directly.
            if not doc:
                print '\tWARNING, formulary is not array',pid,tier,n_doc
                fml = plan_col.find_one({'plan_id':pid, 'formulary.drug_tier':tier})['formulary']
            else:
                fml = doc['formulary'][0]
                
            # the tier slot is (re)set for every document, so with several
            # documents the mail_order value of the last one is kept
            tier_feature[tiers.index(tier)] = [fml['mail_order']]                    
            
            # no pharmacy type, only put mail_order (from one doc)
            if len(pharm) == 0 or 'cost_sharing' not in fml:                
                continue
                        
            # put pharmacy types: string attributes become their index in
            # feature_space, other attributes are stored as-is
            try:
                for cs in fml['cost_sharing']:
                    cost_feature = [cs[a[0]] if a[1]!='string' 
                                else feature_space[a[2]].index(cs[a[0]]) 
                                for a in cost_attr]
                    pharm_feature[pharm.index(cs['pharmacy_type'])] = cost_feature
            except Exception as ex:
                # best effort: report and keep whatever was parsed so far
                print '\tERROR parsing cost_sharing value',ex,pid,tier
        # attach pharmacy info to tier feature
        tier_feature[tiers.index(tier)] += [y for x in pharm_feature for y in x]
        
    # flatten the vector for the plan from hierarchy: tier-cost-pharmacy
    formulary_feature = [y for x in tier_feature for y in x]
    # the layout must agree with the categorical mask built earlier
    if len(formulary_feature) != len(cost_cat_index):
        skip += 1
        print 'Error: plan feature dimension mismatch for %s' %pid
        continue
        
    # get the list of drug attributes for a plan
    # (projection keeps only the `plans` entry matching this plan id)
    drug_cur = drug_col.find(
        {'plans.plan_id':pid, 'rxnorm_id':{'$in':common_drug}},# 'plans.0':{'$exists':True}}, 
        {'_id':0, 'rxnorm_id':1, 'plans':{ '$elemMatch':{'plan_id':pid} }}
    )

    # rxnorm_id -> the plan-specific drug entry
    drug_dict = {d['rxnorm_id']:d['plans'][0] for d in drug_cur}

    # flatten the drug attributes for all common drugs
    # Each drug contributes a presence flag followed by its attributes
    # (None placeholders when the plan does not list the drug).
    drug_feature = []
    try:
        for rx in common_drug:
            if rx not in drug_dict:
                drug_feature += ([False]+[None]*len(drug_attr))
            else:                
                drug_feature += ([True]+[drug_dict[rx][attr[0]] if attr[1]!='string' 
                                 else feature_space[attr[2]].index(drug_dict[rx][attr[0]])
                                 for attr in drug_attr])
    except Exception as ex:
        # any missing attribute or unknown value drops the whole plan
        skip += 1
        print '\tERROR parsing drug info, skip plan',ex,pid
        continue
    
    # combine for plan feature - must match the categorical index concatenation order
    plan_int_feature[pid] = formulary_feature + drug_feature

print '%s: completed processing %s plan, %d skipped due to parsing issue.' %(logTime(), state, skip)

2016-07-05 20:39:34.498790: processing plans 10/190 ...
2016-07-05 20:40:28.743518: processing plans 20/190 ...
2016-07-05 20:41:23.356078: processing plans 30/190 ...
2016-07-05 20:42:10.717855: processing plans 40/190 ...
2016-07-05 20:42:59.714045: processing plans 50/190 ...
2016-07-05 20:43:49.045349: processing plans 60/190 ...
2016-07-05 20:44:36.268172: processing plans 70/190 ...
2016-07-05 20:45:24.525161: processing plans 80/190 ...
2016-07-05 20:46:13.077918: processing plans 90/190 ...
2016-07-05 20:47:01.709539: processing plans 100/190 ...
2016-07-05 20:47:46.222877: processing plans 110/190 ...
2016-07-05 20:48:32.488614: processing plans 120/190 ...
	ERROR parsing drug info, skip plan u'quantity_limit' 30969OR0050001
	ERROR parsing drug info, skip plan u'quantity_limit' 30969OR0060001
	ERROR parsing drug info, skip plan u'quantity_limit' 30969OR0070001
	ERROR parsing drug info, skip plan u'quantity_limit' 30969OR0080001
	ERROR parsing drug info, skip plan u'quantity_li

### Provider & Facility dev

In [84]:
# find all specialities the plans cover
# specs = prov_col.find({'plans.plan_id':{'$in':ex_id}}).distinct('speciality')
# getProviderSpeciality(prov_col, '30969OR0040001', 'speciality', specs)

# find language
# Distinct languages over the providers of all selected plans, then the
# per-language provider counts for one sample plan.
# NOTE: getProviderAttrCount is defined in a later cell of this notebook;
# that cell has to be run before this one.
lang = prov_col.find({'plans.plan_id':{'$in':ex_id}}).distinct('languages')
getProviderAttrCount(prov_col, '30969OR0040001', 'languages', lang)

In [89]:
# Distinct values of the `accepting` field over the providers of the selected plans.
# NOTE: getProviderUniqueAttr is defined in a later cell of this notebook.
getProviderUniqueAttr(prov_col, 'accepting', ex_id)

#TODO: add another feature: number/count of locations

[u'accepting', u'not accepting', u'accepting in some locations']

In [87]:
# build 
def getProviderUniqueAttr(provider_col, attr, plan_id):
    ''' Return the distinct values of `attr` among providers linked to the given plans.

    provider_col -- collection to query
    attr         -- field whose distinct values are wanted
    plan_id      -- list of plan ids; a provider matches when it lists any of them
    '''
    plan_filter = {'plans.plan_id': {'$in': plan_id}}
    matching_providers = provider_col.find(plan_filter)
    return matching_providers.distinct(attr)

def getProviderAttrCount(provider_col, plan_id, attr, all_specs):
    ''' Count the providers of one plan per value of a list-valued attribute.

    Providers are grouped by their full `attr` value and each group is
    credited to the FIRST element of that value.

    Args:
        provider_col: collection to aggregate over.
        plan_id: single plan id the providers must be linked to.
        attr: name of the list-valued provider field (e.g. 'languages').
        all_specs: list of all known values; fixes the order of the result.

    Returns:
        A list of counts aligned with `all_specs`.

    Raises:
        ValueError: if a group's first value is not in `all_specs`.
    '''
    spec_count = [0]*len(all_specs)
    pipeline = [
        {'$match':{'plans.plan_id':plan_id}},
        {'$group':{'_id':'$'+attr, 'cnt':{'$sum':1}}},
        {'$project':{'spec':'$_id', 'count':'$cnt', '_id':0}}
    ]
    # Query the collection passed in, not the notebook-level `prov_col`.
    for sp in provider_col.aggregate(pipeline):
        spec = sp.get('spec')
        # Providers without the attribute (null or empty list) have no first value.
        if not spec:
            continue
        # Different lists can share the same first value (e.g. ['English'] and
        # ['English', 'Spanish']), so accumulate instead of overwriting.
        spec_count[all_specs.index(spec[0])] += sp['count']

    return spec_count

    
# Peek at one provider document to see the available fields.
prov_col.find_one()

{u'_id': ObjectId('5775ff81c421d272dcd681f5'),
 u'accepting': u'accepting',
 u'addresses': [{u'address': u'8100 WESTCOTT DRIVE',
   u'address_2': u'SUITE 101',
   u'city': u'FLEMINGTON',
   u'phone': u'9087820600',
   u'state': u'NJ',
   u'zip': u'08822'},
  {u'address': u'8100 WESTCOTT DRIVE',
   u'address_2': u'SUITE 101',
   u'city': u'FLEMINGTON',
   u'phone': u'9087820600',
   u'state': u'NJ',
   u'zip': u'08822'},
  {u'address': u'8100 WESTCOTT DRIVE',
   u'address_2': u'SUITE 101',
   u'city': u'FLEMINGTON',
   u'phone': u'9087820600',
   u'state': u'NJ',
   u'zip': u'08822'},
  {u'address': u'8100 WESTCOTT DRIVE',
   u'address_2': u'SUITE 101',
   u'city': u'FLEMINGTON',
   u'phone': u'9087820600',
   u'state': u'NJ',
   u'zip': u'08822'},
  {u'address': u'8100 WESTCOTT DRIVE',
   u'address_2': u'SUITE 101',
   u'city': u'FLEMINGTON',
   u'phone': u'9087820600',
   u'state': u'NJ',
   u'zip': u'08822'}],
 u'languages': [u'English'],
 u'last_updated_on': u'2015-10-06',
 u'name':

In [None]:
# Distinct facility types among the facilities linked to one sample plan.
faci_col.find({'plans.plan_id':'10191NJ0070002'}).distinct('facility_type')

### Reset

In [105]:
# Close the Mongo connection, then clear the interactive namespace with the
# IPython %reset magic. Every name defined above (client, collections,
# features) is gone afterwards, so the setup cells must be re-run.
client.close()
%reset 

In [56]:
# Time how long it takes to list the distinct drugs of one plan.
t0 = time.time()
plan_drug_ids = drug_col.find({'plans.plan_id': '67577MI0390012'}).distinct('rxnorm_id')
print('drugs: %d' % len(plan_drug_ids))
elapsed = time.time() - t0
print("--- %s seconds ---" % elapsed)

drugs: 2710
--- 19.570182085 seconds ---


In [84]:
# regex filtering
# Case-insensitive lookup of a drug by name. The unescaped '.' on each side
# matches any single character, so the name must have at least one character
# before and after "combigan".

regx = re.compile(".combigan.", re.IGNORECASE)
drug_col.find_one({'drug_name':regx}, {'_id':0, 'drug_name':1, 'rxnorm_id':1})

{u'drug_name': u'Brimonidine tartrate 2 MG/ML / Timolol 5 MG/ML Ophthalmic Solution [Combigan]',
 u'rxnorm_id': u'861637'}

In [155]:
# check if tier is defined in multiple doc

# scratch expression: a 3-row list of 4 None placeholders (the rows alias the
# same inner list); its value is not displayed because it is not the last
# expression of the cell
[[None]*4]*3+[]


# Fetch the whole plan document for one plan id; the per-tier filter and the
# $elemMatch projection are commented out.
# for p in 
plan_col.find_one(
            {'plan_id':'30969OR0050001'} #, 'formulary.drug_tier':tier},
#             {'_id':0, 'formulary':{'$elemMatch':{'drug_tier':tier}}}        
        )
#     print p,'\t'
        

{u'_id': ObjectId('577873392ccfd955ecdf4abc'),
 u'benefits': [],
 u'formulary': {u'cost_sharing': [{u'coinsurance_opt': u'NO-CHARGE',
    u'coinsurance_rate': 0.0,
    u'copay_amount': 10.0,
    u'copay_opt': u'NO-CHARGE-AFTER-DEDUCTIBLE',
    u'pharmacy_type': u'1-MONTH-IN-RETAIL'},
   {u'coinsurance_opt': u'NO-CHARGE',
    u'coinsurance_rate': 0.0,
    u'copay_amount': 30.0,
    u'copay_opt': u'NO-CHARGE-AFTER-DEDUCTIBLE',
    u'pharmacy_type': u'3-MONTH-IN-MAIL'}],
  u'drug_tier': u'GENERIC',
  u'mail_order': True},
 u'last_updated_on': u'2015-11-09',
 u'marketing_name': u'Zoom Health Plan Zoom Oregon Standard Gold Plan ',
 u'marketing_url': u'https://www.zoomcare.com/plusyou',
 u'network': [{u'network_tier': u'PREFERRED'}],
 u'plan_contact': u'athayer@zoomcare.com',
 u'plan_id': u'30969OR0050001',
 u'plan_id_type': u'HIOS-PLAN-ID',
 u'summary_url': u'https://www.zoomcare.com/sbc/isgd01.pdf'}