In [1]:
from sklearn.preprocessing import OneHotEncoder
from pymongo import MongoClient
from zipfile import ZipFile, ZIP_DEFLATED
from sets import Set
import numpy as np
import json, sys, os, time, re, datetime

In [2]:
local = True
if local:
    client = MongoClient('fc8iasm01', 27017)
    plan_col = client.aca.plan
    drug_col = client.aca.drug
else:
    client = MongoClient('ec2-54-153-83-172.us-west-1.compute.amazonaws.com', 27017)
    plan_col = client.plans.plans
    drug_col = client.formularies.drugs
    prov_col = client.providers.providers
    faci_col = client.providers.facilities

all_plan = drug_col.distinct('plans.plan_id')
all_drug = drug_col.distinct('rxnorm_id')

print 'Using %s Mongo, total drug: %d, total plan: %d' %('local' if local else 'aws', len(all_drug), len(all_plan))
# client.formularies.scollection_names()
# client.providers.collection_names()

Using local Mongo, total drug: 8968, total plan: 27


###Load encode definition

In [3]:
def getEncodeFields(encode_def, rtn, path=''):
    ''' extract the selected fields from the encode json definition '''
    
    if 'encode' in encode_def and encode_def['encode'] == 1:
        rtn[path[1:]] = encode_def['type']
    elif 'encode' not in encode_def:
        for f in encode_def:
            getEncodeFields(encode_def[f], rtn, path + ('' if f=='properties' else '.'+f))
    return rtn

def logTime():
    return str(datetime.datetime.now())

encode_list = getEncodeFields(json.load(open('encode2.json')), {})
encode_list

{u'drug.plans.drug_tier': u'string',
 u'drug.plans.prior_authorization': u'boolean',
 u'drug.plans.quantity_limit': u'boolean',
 u'drug.plans.step_therapy': u'boolean',
 u'plan.formulary.cost_sharing.coinsurance_opt': u'string',
 u'plan.formulary.cost_sharing.coinsurance_rate': u'float',
 u'plan.formulary.cost_sharing.copay_amount': u'float',
 u'plan.formulary.cost_sharing.copay_opt': u'string',
 u'plan.formulary.cost_sharing.pharmacy_type': u'string',
 u'plan.formulary.drug_tier': u'string',
 u'plan.formulary.mail_order': u'boolean',
 u'plan.network.network_tier': u'string',
 u'plan.plan_id_type': u'string'}

###Data uniformity check

In [4]:
print '%s: plan document: %d' %(logTime(), plan_col.count())
print '%s: drug document: %d' %(logTime(), drug_col.count())
print '%s: unique plan_id: %d' %(logTime(), len(all_plan))
print '%s: unique rxnorm_id: %d' %(logTime(), len(all_drug))

multi_plan = [1 for p in plan_col.aggregate([{"$group": {"_id":"$plan_id", "count":{"$sum":1}}}]) if p['count']>1]
print '%s: plans with multiple documents: %d' %(logTime(), sum(multi_plan))

multi_drug = [1 for p in drug_col.aggregate([{"$group": {"_id":"$rxnorm_id", "count":{"$sum":1}}}]) if p['count']>1]
print '%s: drugs with multiple documents: %d' %(logTime(), sum(multi_drug))

state_id = np.unique([i[5:7] for i in all_plan])
print '%s: states in the plan: %s' %(logTime(), ', '.join(state_id))

2016-07-06 16:03:38.685000: plan document: 35
2016-07-06 16:03:38.687000: drug document: 17777
2016-07-06 16:03:38.689000: unique plan_id: 27
2016-07-06 16:03:38.689000: unique rxnorm_id: 8968
2016-07-06 16:03:38.693000: plans with multiple documents: 0
2016-07-06 16:03:39.081000: drugs with multiple documents: 8809
2016-07-06 16:03:39.092000: states in the plan: AK, OR


###Get feature space

In [6]:
state = 'AK' # set to None to include all (very slow process for all)
ex_id = all_plan if not state else [i for i in all_plan if state in i]
print 'processing %d plans for %s ...' %(len(ex_id), 'all' if not state else state)

feature_space = {
    k : (plan_col if k.startswith('plan') else drug_col).find(
        { ('plan_id' if k.startswith('plan') else 'plans.plan_id') : {'$in':ex_id} }
    ).distinct(k[k.index('.')+1:]) 
    for k,v in encode_list.items() if v=='string'
}

feature_space

processing 11 plans for AK ...


{u'drug.plans.drug_tier': [u'SELECT',
  u'BRAND',
  u'PREFERRED',
  u'VALUE',
  u'SPECIALTY',
  u'PREVENTIVE',
  u'MEDICAL-SERVICE-DRUGS'],
 u'plan.formulary.cost_sharing.coinsurance_opt': [None, u'AFTER-DEDUCTIBLE'],
 u'plan.formulary.cost_sharing.copay_opt': [u'BEFORE-DEDUCTIBLE',
  u'NO-CHARGE'],
 u'plan.formulary.cost_sharing.pharmacy_type': [u'1-MONTH-IN-RETAIL',
  u'3-MONTH-IN-MAIL',
  u'3-MONTH-IN-RETAIL',
  u'3-MONTH-OUT-RETAIL'],
 u'plan.formulary.drug_tier': [u'BRAND',
  u'MEDICAL-SERVICE-DRUGS',
  u'PREFERRED',
  u'PREVENTIVE',
  u'SELECT',
  u'SPECIALTY',
  u'VALUE'],
 u'plan.network.network_tier': [u'PREFERRED'],
 u'plan.plan_id_type': [u'HIOS-PLAN-ID']}

###Get common drugs between plans
- operator reference [link](https://docs.mongodb.com/manual/reference/operator/query/)

In [7]:
# get rxnorm_id group for each plan
common_drug = drug_col.find({'plans.plan_id':{'$in':ex_id}}).distinct('rxnorm_id')
n_drug = len(common_drug)
drug_attr = [[k.split('.')[-1],v,k] for k,v in encode_list.items() if k.startswith ('drug')]
drug_cat_index = ([False]+[k[1]=='string' for k in drug_attr])*n_drug
print '%s: there are %d drugs for %d plans!' %(logTime(), n_drug, len(ex_id))

2016-07-06 16:04:25.293000: there are 8809 drugs for 11 plans!


###Get unique pharmacy_type for each drug_tier

In [8]:
# Note: issue with distinct in the query
# tier_pharm = {tier : plan_col.find(
#         {'plan_id':{'$in':ex_id}, 'formulary.drug_tier':tier}, #'formulary.0.cost_sharing.0':{'$exists':True}},
#         {'_id':0, 'formulary':{'$elemMatch':{'drug_tier':tier}}}
#     ).distinct('formulary.cost_sharing.pharmacy_type')
#     for tier in feature_space['plan.formulary.drug_tier']
# }

# traditional way to check
tier_pharm = {}
# for tier in feature_space['plan.formulary.drug_tier']:
#     query = plan_col.find(
#         {
#             'plan_id':{'$in':ex_id}, 
#             'formulary.drug_tier':tier,            
#             'formulary.0.cost_sharing.0':{'$exists':True},             
#         },
#         # NOTE: when fomulary is a dict instead of arrary, this projection won't return content
#         {'_id':0, 'formulary':{'$elemMatch':{'drug_tier':tier}}}
#     )
#     tier_pharm[tier] = list(Set(cs['pharmacy_type'] for p in query for f in p['formulary'] for cs in f['cost_sharing']))

# simple query to double check results
for p in plan_col.find({'plan_id':{'$in':ex_id}}):
    if 'formulary' not in p:
        continue
    if type(p['formulary']) is dict:
        p['formulary'] = [p['formulary']]
    for f in p['formulary']:        
        if f['drug_tier'] not in tier_pharm:
            tier_pharm[f['drug_tier']]=[]        
        if 'cost_sharing' not in f:
            continue
        for cs in f['cost_sharing']:
            tier_pharm[f['drug_tier']].append(cs['pharmacy_type'])
tier_pharm = {k:list(set(v)) for k,v in tier_pharm.items()}

tier_pharm

{u'BRAND': [u'3-MONTH-IN-MAIL', u'3-MONTH-IN-RETAIL', u'3-MONTH-OUT-RETAIL'],
 u'MEDICAL-SERVICE-DRUGS': [u'3-MONTH-IN-RETAIL'],
 u'PREFERRED': [u'3-MONTH-IN-MAIL',
  u'3-MONTH-IN-RETAIL',
  u'3-MONTH-OUT-RETAIL'],
 u'PREVENTIVE': [u'3-MONTH-IN-RETAIL'],
 u'SELECT': [u'3-MONTH-IN-MAIL', u'3-MONTH-IN-RETAIL', u'3-MONTH-OUT-RETAIL'],
 u'SPECIALTY': [u'1-MONTH-IN-RETAIL'],
 u'VALUE': [u'3-MONTH-IN-MAIL', u'3-MONTH-IN-RETAIL', u'3-MONTH-OUT-RETAIL']}

###Check provider feature space

In [13]:
prov_attr = ['speciality', 'languages', 'type', 'accepting']
prov_attr_space = {f:getProviderUniqueAttr(prov_col, f, ex_id) for f in prov_attr}
prov_cat_index = [False]*sum(len(v) for v in prov_attr_space.values())

NameError: global name 'getProviderUniqueAttr' is not defined

###Evaluate feature dimension and variable index

In [9]:
# put tier names into list so the order is fixed for feature extraction
tiers = tier_pharm.keys()
# we build pharmacy type into the order of feature vector, so no need to include
cost_attr = [[k.split('.')[-1],v,k] for k,v in encode_list.items() 
             if 'cost_sharing' in k and 'pharmacy_type' not in k]
# flatten the vector to combine all tiers
cat2d = [[False] + [k[1]=='string' for k in cost_attr]*len(tier_pharm[t]) for t in tiers]
cost_cat_index = [y for x in cat2d for y in x]
# # plan level attributes
# plan_attr = [[k.split('.')[-1],v,k] for k,v in encode_list.items() if k.startswith('plan') and 'formulary' not in k]
# plan_cat_index = [a[1]=='string' for a in plan_attr]

# total feature catagrical index - must match with the order of feature canconnation in plan
cat_index = cost_cat_index + drug_cat_index + prov_cat_index
catagorical_var = [i for i,v in zip(range(len(cat_index)),cat_index) if v]
print '%s: feature dimension before encode: %d' %(logTime(), len(cat_index))

2016-07-06 16:04:41.817000: feature dimension before encode: 44112


###Extraction integer features for each plan
- get plan feature --> cost_sharing/pharmacy type
- get drug feature --> drug_tier

In [None]:
def getPharmacyFeatureFromPlan(pid, tier_pharmacy, plan_collection, all_tier, pharm_attr, fea_space):
    # initialize feature vector
    tier_feature = [None]*len(tiers)    
    # for each tier, assemble pharmacy info
    for tier, pharm in tier_pharmacy.items():         
        doc_cur = plan_collection.find(
            {'plan_id':pid, 'formulary.drug_tier':tier},
            {'_id':0, 'formulary':{'$elemMatch':{'drug_tier':tier}}}        
        )   
        # assemble pharmacy feature for this tier                
        n_doc, mail_order, pharm_feature = 0, None, [[None]*len(pharm_attr)]*len(pharm)        
        for doc in doc_cur:              
            n_doc += 1
            fml = doc['formulary'][0]
            mail_order = fml['mail_order']
            # no pharmarcy type on the tier, only put mail_order (from last doc)
            if len(pharm) == 0 or 'cost_sharing' not in fml:                
                continue                        
            # loop pharmarcy types  
            try:
                for cs in fml['cost_sharing']:
                    cost_feature = [cs[a[0]] if a[1]!='string' else fea_space[a[2]].index(cs[a[0]]) 
                                    for a in pharm_attr]
                    pharm_feature[pharm.index(cs['pharmacy_type'])] = cost_feature
            except Exception as ex:
                print '\tERROR parsing cost_sharing value',ex,pid,tier        
        # attach pharmacy info to tier feature
        tier_feature[all_tier.index(tier)] = [mail_order] + [y for x in pharm_feature for y in x]
        if n_doc > 1:
            print '\tWARNING: plan tier with multiple document:', pid,tier        
    # flaten the vector for the plan from hierarchy: tier-cost-pharmacy
    return [y for x in tier_feature for y in x]

In [12]:
def getDrugFeatureForPlan(pid, all_rxnorms, ex_attr, fea_space):
    # get the available drug attributes for a plan
    drug_cur = drug_col.find(
        {'plans.plan_id':pid, 'rxnorm_id':{'$in':all_rxnorms}},# 'plans.0':{'$exists':True}}, 
        {'_id':0, 'rxnorm_id':1, 'plans':{ '$elemMatch':{'plan_id':pid} }}
    )

    # extract the drug attributes for all common drugs    
    drug_fea = [[False]+[None]*len(ex_attr)]*len(all_rxnorms)
    try:
        for drug in drug_cur:
            fea, rid = drug['plans'][0], all_rxnorms.index(drug['rxnorm_id'])
            drug_fea[rid] = [True]+[fea[a[0]] if a[1]!='string' 
                                    else fea_space[a[2]].index(fea[a[0]])
                                    for a in ex_attr]
        # flatten features and return
        return [f for d in drug_fea for f in d]
    except Exception as ex:
        print '\tError parsing %s drug, skipping: %s' %(pid, str(ex))
        return None    

In [None]:
def getProviderUniqueAttr(provider_col, attr, all_plan_id):
    return provider_col.find({'plans.plan_id':{'$in':all_plan_id}}).distinct(attr)

def getProviderAttrCount(provider_col, plan_id, attr, all_attr):    
    attr_count = [0]*len(all_attr)
    for at in provider_col.aggregate(
        [
            {'$match':{'plans.plan_id':plan_id}},
            {'$group':{'_id':'$'+attr, 'cnt':{'$sum':1}}},
            {'$project':{attr:'$_id', 'count':'$cnt', '_id':0}}   
        ]
    ):
#         print at
        k = '' if not at[attr] else (at[attr][0] if type(at[attr]) is list else at[attr])
        if k in all_attr:
            attr_count[all_attr.index(k)] = at['count']        
    return attr_count

def getProviderFeatureForPlan(pid, features, provider_col, fea_list):
    # find all specialities the plans cover        
    rtn = []
    for f in fea_list:
        rtn += getProviderAttrCount(provider_col, pid, f, features[f]) 
    return rtn
    

In [10]:
plan_int_feature, i, skip = {}, 0, 0
# for each plan, get int features (plan level & combined fomulary level)
for pid in ex_id:
    i += 1    
    if i%10==0:
        print '%s: processing plans %d/%d ...' %(logTime(), i, len(ex_id))    
    # feature from plan data
    plan_feature = getPharmacyFeatureFromPlan(pid, tier_pharm, plan_col, tiers, cost_attr, feature_space)
    if len(plan_feature) != len(cost_cat_index):
        skip += 1
        print '\nError: plan feature dimension mismatch for %s' %pid
        continue
    
    # feature from drug data
    drug_feature = getDrugFeatureForPlan(pid, common_drug, drug_attr, feature_space)
    if not drug_feature:
        skip += 1
        continue
    
    # feature from provider data    
    prov_feature = getPharmacyFeatureFromPlan(pid, prov_attr_space, prov_col, prov_attr)
    if len(prov_feature) != len(prov_cat_index):
        skip += 1
        print '\nError: provider feature dimension mismatch for %s' %pid
    
    # combine for plan feature - must match with catagroical index concannation order    
    plan_int_feature[pid] = plan_feature + drug_feature + prov_feature

print '%s: completed processing %s plan, %d skipped due to parsing issue.' %(logTime(), state, skip)

2016-07-06 16:05:02.049000: processing plans 10/11 ...
2016-07-06 16:05:04.664000: completed processing AK plan, 0 skipped due to parsing issue.


###Provider & Facility dev

In [84]:
# find all specialities the plans cover
# specs = prov_col.find({'plans.plan_id':{'$in':ex_id}}).distinct('speciality')
# getProviderSpeciality(prov_col, '30969OR0040001', 'speciality', specs)

# find langurage
lang = prov_col.find({'plans.plan_id':{'$in':ex_id}}).distinct('languages')
getProviderAttrCount(prov_col, '30969OR0040001', 'languages', lang)

In [89]:
getProviderUniqueAttr(prov_col, 'accepting', ex_id)

#TODO: add another feature: number/count of locations

[u'accepting', u'not accepting', u'accepting in some locations']

In [87]:
# build 
def getProviderUniqueAttr(provider_col, attr, plan_id):
    return provider_col.find({'plans.plan_id':{'$in':plan_id}}).distinct(attr)

def getProviderAttrCount(provider_col, plan_id, attr, all_specs):
    spec_count = [0]*len(all_specs)
    for sp in prov_col.aggregate(
        [
            {'$match':{'plans.plan_id':plan_id}},
            {'$group':{'_id':'$'+attr, 'cnt':{'$sum':1}}},
            {'$project':{'spec':'$_id', 'count':'$cnt', '_id':0}}   
        ]
    ):
        spec_count[all_specs.index(sp['spec'][0])] = sp['count']        
        
    return spec_count

    
prov_col.find_one()

{u'_id': ObjectId('5775ff81c421d272dcd681f5'),
 u'accepting': u'accepting',
 u'addresses': [{u'address': u'8100 WESTCOTT DRIVE',
   u'address_2': u'SUITE 101',
   u'city': u'FLEMINGTON',
   u'phone': u'9087820600',
   u'state': u'NJ',
   u'zip': u'08822'},
  {u'address': u'8100 WESTCOTT DRIVE',
   u'address_2': u'SUITE 101',
   u'city': u'FLEMINGTON',
   u'phone': u'9087820600',
   u'state': u'NJ',
   u'zip': u'08822'},
  {u'address': u'8100 WESTCOTT DRIVE',
   u'address_2': u'SUITE 101',
   u'city': u'FLEMINGTON',
   u'phone': u'9087820600',
   u'state': u'NJ',
   u'zip': u'08822'},
  {u'address': u'8100 WESTCOTT DRIVE',
   u'address_2': u'SUITE 101',
   u'city': u'FLEMINGTON',
   u'phone': u'9087820600',
   u'state': u'NJ',
   u'zip': u'08822'},
  {u'address': u'8100 WESTCOTT DRIVE',
   u'address_2': u'SUITE 101',
   u'city': u'FLEMINGTON',
   u'phone': u'9087820600',
   u'state': u'NJ',
   u'zip': u'08822'}],
 u'languages': [u'English'],
 u'last_updated_on': u'2015-10-06',
 u'name':

In [None]:
faci_col.find({'plans.plan_id':'10191NJ0070002'}).distinct('facility_type')

###Reset

In [105]:
client.close()
%reset 

In [56]:
t0 = time.time()
print 'drugs: %d' %len(drug_col.find({'plans.plan_id':'67577MI0390012'}).distinct('rxnorm_id'))
print("--- %s seconds ---" % (time.time() - t0))

drugs: 2710
--- 19.570182085 seconds ---


In [84]:
# regex filtering

regx = re.compile(".combigan.", re.IGNORECASE)
drug_col.find_one({'drug_name':regx}, {'_id':0, 'drug_name':1, 'rxnorm_id':1})

{u'drug_name': u'Brimonidine tartrate 2 MG/ML / Timolol 5 MG/ML Ophthalmic Solution [Combigan]',
 u'rxnorm_id': u'861637'}

In [155]:
# check if tier is defined in multiple doc

[[None]*4]*3+[]


# for p in 
plan_col.find_one(
            {'plan_id':'30969OR0050001'} #, 'formulary.drug_tier':tier},
#             {'_id':0, 'formulary':{'$elemMatch':{'drug_tier':tier}}}        
        )
#     print p,'\t'
        

{u'_id': ObjectId('577873392ccfd955ecdf4abc'),
 u'benefits': [],
 u'formulary': {u'cost_sharing': [{u'coinsurance_opt': u'NO-CHARGE',
    u'coinsurance_rate': 0.0,
    u'copay_amount': 10.0,
    u'copay_opt': u'NO-CHARGE-AFTER-DEDUCTIBLE',
    u'pharmacy_type': u'1-MONTH-IN-RETAIL'},
   {u'coinsurance_opt': u'NO-CHARGE',
    u'coinsurance_rate': 0.0,
    u'copay_amount': 30.0,
    u'copay_opt': u'NO-CHARGE-AFTER-DEDUCTIBLE',
    u'pharmacy_type': u'3-MONTH-IN-MAIL'}],
  u'drug_tier': u'GENERIC',
  u'mail_order': True},
 u'last_updated_on': u'2015-11-09',
 u'marketing_name': u'Zoom Health Plan Zoom Oregon Standard Gold Plan ',
 u'marketing_url': u'https://www.zoomcare.com/plusyou',
 u'network': [{u'network_tier': u'PREFERRED'}],
 u'plan_contact': u'athayer@zoomcare.com',
 u'plan_id': u'30969OR0050001',
 u'plan_id_type': u'HIOS-PLAN-ID',
 u'summary_url': u'https://www.zoomcare.com/sbc/isgd01.pdf'}