In [1]:
from sklearn.preprocessing import OneHotEncoder
from pymongo import MongoClient
from zipfile import ZipFile, ZIP_DEFLATED
from scipy.sparse import *
from sets import Set
import numpy as np
import json, sys, os, time, re, datetime

def logTime():
    return str(datetime.datetime.now())

In [92]:
%reload_ext autoreload
%autoreload 2
from aca_drug_feature import *
from aca_plan_feature import *
from aca_provider_feature import *

In [3]:
local = False
if local:
    client = MongoClient('fc8iasm01', 27017)
    plan_col = client.aca.plan
    drug_col = client.aca.drug
else:
    client = MongoClient('ec2-54-153-83-172.us-west-1.compute.amazonaws.com', 27017)
    plan_col = client.plans.plans
    drug_col = client.formularies.drugs
    prov_col = client.providers.providers
    faci_col = client.providers.facilities

all_plan = drug_col.distinct('plans.plan_id')
all_drug = drug_col.distinct('rxnorm_id')

print '%s: using %s Mongo, total drug: %d, total plan: %d, total provider: %d' %(
    logTime(), 'local' if local else 'aws', len(all_drug), len(all_plan), prov_col.count())
# client.formularies.scollection_names()
# client.providers.collection_names()

Using aws Mongo, total drug: 46206, total plan: 6035


In [5]:
print '%s: plan document: %d' %(logTime(), plan_col.count())
print '%s: drug document: %d' %(logTime(), drug_col.count())
print '%s: provider document: %d' %(logTime(), prov_col.count())
print '%s: facility document: %d' %(logTime(), faci_col.count())
print '%s: unique plan_id: %d' %(logTime(), len(all_plan))
print '%s: unique rxnorm_id: %d' %(logTime(), len(all_drug))

# multi_plan = [1 for p in plan_col.aggregate([{"$group": {"_id":"$plan_id", "count":{"$sum":1}}}]) if p['count']>1]
# print '%s: plans with multiple documents: %d' %(logTime(), sum(multi_plan))

# multi_drug = [1 for p in drug_col.aggregate([{"$group": {"_id":"$rxnorm_id", "count":{"$sum":1}}}]) if p['count']>1]
# print '%s: drugs with multiple documents: %d' %(logTime(), sum(multi_drug))

state_id = np.unique([i[5:7] for i in all_plan])
print '%s: states in the plan: %s' %(logTime(), ', '.join(state_id))

2016-07-09 09:32:28.660514: plan document: 12136
2016-07-09 09:32:28.744694: drug document: 1540473
2016-07-09 09:32:28.832037: provider document: 8799098
2016-07-09 09:32:28.931804: facility document: 4815321
2016-07-09 09:32:29.014277: unique plan_id: 6035
2016-07-09 09:32:29.014387: unique rxnorm_id: 46206
2016-07-09 09:32:29.016927: states in the plan: AK, AL, AR, AZ, CO, DE, FL, GA, HI, IA, IL, IN, KS, KY, LA, MA, ME, MI, MN, MO, MS, MT, NC, ND, NE, NH, NJ, NM, NV, OH, OK, OR, PA, SC, SD, TN, TX, UT, VA, WA, WI, WV, WY


###Main program

In [None]:
state = 'OR' # set to None to include all (very slow process for all)
ex_id = all_plan if not state else [i for i in all_plan if state in i]
n_plan = len(ex_id)
print '%s: processing %d plans for %s' %(logTime(), len(ex_id), 'all' if not state else state)

print '%s: 1/11 get formulary state space for all plans' %logTime()
all_plan_states = getFormularyAllStates1(plan_col, ex_id) + \
                  getFormularyAllStates2(plan_col, ex_id) + \
                  getFormularyAllStates3(plan_col, ex_id) 
print '%s: total plan states: %d' %(logTime(), len(all_plan_states))

print '%s: 2/11 extract formulary states for each plan' %logTime()
plan_feature = lil_matrix((n_plan, len(all_plan_states)))
i=0

for p in getFormularyStatesForPlan1(plan_col, ex_id):
    i+=1
    r_id = ex_id.index(p['_id'])
    for s in p['plan_states']:
        plan_feature[r_id, all_plan_states.index(s)] = 1
        
for p in getFormularyStatesForPlan2(plan_col, ex_id):
    i+=1
    r_id = ex_id.index(p['_id'])
    for s in p['plan_states']:
        plan_feature[r_id, all_plan_states.index(s)] = 1
        
for p in getFormularyStatesForPlan3(plan_col, ex_id):
    i+=1
    r_id = ex_id.index(p['_id'])
    for s in p['plan_states']:
        plan_feature[r_id, all_plan_states.index(s)] = 1
        
print '%s: complete for %d plans' %(logTime(), i)

print '%s: 3/11 get summary feature for each plan' %logTime()
plan_sumstat = [[0]*3]*n_plan
i=0
for p in getFormularyAggregate(plan_col, ex_id):
    i+=1
    r_id = ex_id.index(p['plan'])
    plan_sumstat[r_id] = [p['avg_copay'],p['avg_ci_rate'],p['count']]
print '%s: complete for %d plans' %(logTime(), i)
    
print '%s: 4/11 get all drugs covered by all plans' %logTime()
all_rxnorm = drug_col.find({'plans.plan_id':{'$in':ex_id}}).distinct('rxnorm_id')
print '%s: total rx: %d' %(logTime(), len(all_rxnorm))

print '%s: 5/11 check drug coverage for each plan' %logTime()
drug_coverage = lil_matrix((n_plan, len(all_rxnorm)))
i=0
for p in getDrugListForPlans(drug_col, ex_id):
    i+=1
    r_id = ex_id.index(p['plan'])
    for r in p['drug']:
        drug_coverage[r_id, all_rxnorm.index(r)] = 1
print '%s: complete for %d plans' %(logTime(), i)

print '%s: 6/11 get summary feature for drug' %logTime()
all_drug_states = getDrugAggregateAllStates(drug_col, ex_id)
print '%s: total drug states: %d' %(logTime(), len(all_drug_states))

print '%s: 7/11 extract drug sumstat for each plan' %logTime()
drug_sumstat = lil_matrix((n_plan, len(all_drug_states)))
i=0
for p in getDrugAggregateCountForPlans(drug_col, ex_id):
    i+=1
    r_id = ex_id.index(p['plan'])
    for d in p['drug_state']:
        drug_sumstat[r_id, all_drug_states.index(d['key'])] = d['cnt']
print '%s: complete for %d plans' %(logTime(), i)

print '%s: 8/11 get provider under the plans' %logTime()
all_npi = prov_col.find({'plans.plan_id':{'$in':ex_id}}).distinct('npi')
print '%s: total providers: %d' %(logTime(), len(all_npi))

print '%s: 9/11 check provider coverage for each plan' %logTime() ##### slow #####
provider_coverage = lil_matrix((n_plan, len(all_npi)))
i=0
for p in getProviderListForPlans(prov_col, ex_id):
    i+=1
    r_id = ex_id.index(p['plan'])
    for npi in p['npi']:
        provider_coverage[r_id, all_npi.index(npi)] = 1
print '%s: complete for %d plans' %(logTime(), i)

print '%s: 10/11 get summary feature for provider' %logTime()
all_provider_states = getProviderAllStates(prov_col, ex_id)
print '%s: total provider summary: %d' %(logTime(), len(all_provider_states))

print '%s: 11/11 extract provider sumstat for each plan' %logTime()
provider_sumstat = lil_matrix((n_plan, len(all_provider_states)))
for p in getProviderStateForPlans(prov_col, ex_id):
    r_id = ex_id.index(p['_id'])
    for d in p['plan_states']:
        provider_sumstat[r_id, all_provider_states.index(d['key'])] = d['count'] #[d['count'], d['location']]
print '%s: complete for %d plans' %(logTime(), i)

2016-07-09 12:12:50.161518: processing 190 plans for OR
2016-07-09 12:12:50.161597: 1/11 get formulary state space for all plans
2016-07-09 12:12:50.499005: total plan states: 107
2016-07-09 12:12:50.499760: 2/11 extract formulary states for each plan
2016-07-09 12:12:51.123879: complete for 172 plans
2016-07-09 12:12:51.124474: 3/11 get summary feature for each plan
2016-07-09 12:12:51.232876: complete for 47 plans

In [48]:
# i=0
# ids = ex_id
# for p in getFormularyStatesForPlan(plan_col, ex_id):
#     i+=1
#     print p['_id'],len(p['plan_states'])
#     ids.remove(p['_id'])
    
# print i

# plan_col.count({'$or':[{'formulary.cost_sharing':{'$exists':True}},{'formulary.costSharing':{'$exists':True}}]})

# plan_col.find_one({'plan_id':'33931OH0030004'})['formulary'][3]

len(plan_col.find({'formulary.cost_sharing':{'$exists':False}}).distinct('plan_id'))

3449

In [35]:
ex_id

[u'61671NC0070001',
 u'61671NC0070002',
 u'61671NC0070003',
 u'61671NC0070004',
 u'61671NC0070005',
 u'61671NC0070006',
 u'61671NC0070007',
 u'61671NC0070008',
 u'61671NC0070009',
 u'61671NC0070010',
 u'61671NC0070011',
 u'61671NC0070012',
 u'61671NC0070013',
 u'61671NC0070014',
 u'61671NC0070015',
 u'61671NC0070016',
 u'61671NC0070017',
 u'61671NC0070018',
 u'61671NC0080001',
 u'61671NC0080002',
 u'61671NC0080003',
 u'61671NC0080007',
 u'61671NC0080008',
 u'61671NC0080009',
 u'61671NC0080011',
 u'61671NC0080012',
 u'61671NC0080013',
 u'11512NC0060017',
 u'11512NC0060018',
 u'11512NC0060019',
 u'11512NC0060020',
 u'11512NC0060024',
 u'11512NC0060026',
 u'11512NC0060027',
 u'11512NC0060028',
 u'11512NC0100021',
 u'11512NC0100022',
 u'11512NC0100023',
 u'11512NC0100024',
 u'11512NC0100028',
 u'11512NC0100030',
 u'11512NC0100031',
 u'11512NC0100032',
 u'11512NC0100033',
 u'11512NC0100034',
 u'11512NC0100035',
 u'11512NC0100036',
 u'11512NC0100040',
 u'11512NC0100042',
 u'11512NC0100043',


In [86]:
i=0
cs='cost_sharing'
for p in plan_col.aggregate([
        {'$match':{'plan_id':{'$in':all_plan}, 'formulary.cost_sharing':{'$exists':False},
                  'formulary.costSharing':{'$exists':False}}},
        {'$unwind':'$formulary'},
        {'$unwind':'$network'},        
#         {'$unwind':'$formulary.cost_sharing'}
        
    ]):    
    i+=1
    print p #['costSharing']
    if i==3:
        break
    
    

{u'marketing_name': u'Bronze 5', u'network': {u'network_tier': u'PREFERRED'}, u'formulary': {u'drug_tier': u'RETAIL-GENERIC', u'mail_order': False}, u'plan_id_type': u'HIOS-PLAN-ID', u'last_updated_on': u'2015-09-14', u'summary_url': u'http://prominencehealthplan.com/individual-and-family-plans/nevada-health-link-individual-and-family-plans/', u'plan_id': u'37392TX0010001', u'plan_contact': u'http://prominencehealthplan.com/about-prominence-health-plan/contact-prominence-health-plan/', u'_id': ObjectId('577c268486395f67704fe3e0'), u'marketing_url': u'http://prominencehealthplan.com/individual-and-family-plans/nevada-health-link-individual-and-family-plans/'}
{u'marketing_name': u'Bronze 5', u'network': {u'network_tier': u'PREFERRED'}, u'formulary': {u'drug_tier': u'RETAIL BRAND', u'mail_order': False}, u'plan_id_type': u'HIOS-PLAN-ID', u'last_updated_on': u'2015-09-14', u'summary_url': u'http://prominencehealthplan.com/individual-and-family-plans/nevada-health-link-individual-and-famil

In [78]:
i=0
for p in getFormularyStatesForPlan(plan_col, all_plan, 'costSharing'):
    print p
    i+=1
    if i==15:
        break

{u'count': [3], u'plan_states': [None], u'_id': u'48396ME0940005'}
{u'count': [2], u'plan_states': [None], u'_id': u'48396ME0710016'}
{u'count': [3], u'plan_states': [None], u'_id': u'48396ME0720031'}
{u'count': [2], u'plan_states': [None], u'_id': u'17575IN0700031'}
{u'count': [2], u'plan_states': [None], u'_id': u'79475WI0340016'}
{u'count': [2], u'plan_states': [None], u'_id': u'79475WI0340014'}
{u'count': [3], u'plan_states': [None], u'_id': u'48396ME0720028'}
{u'count': [2], u'plan_states': [None], u'_id': u'79475WI0500003'}
{u'count': [3], u'plan_states': [None], u'_id': u'48396ME0720036'}
{u'count': [3], u'plan_states': [None], u'_id': u'48396ME0720026'}
{u'count': [2], u'plan_states': [None], u'_id': u'49046GA0410021'}
{u'count': [2], u'plan_states': [None], u'_id': u'48396ME0710015'}
{u'count': [2], u'plan_states': [None], u'_id': u'49046GA0410022'}
{u'count': [2], u'plan_states': [None], u'_id': u'48396ME0730004'}
{u'count': [3], u'plan_states': [None], u'_id': u'48396ME07200

In [53]:
# get plan state - combinations of drug_tier/pharmacy_type/copay_opt/coinsurance_opt for one plan
def getFormularyStatesForPlan(plan_collection, plans, strCostSharing):
    if type(plans) is str:
        plans = [plans]
    # strCostSharing = 'cost_sharing'
    return plan_collection.aggregate(
        [
            {'$match':{'plan_id':{'$in':plans}, 'formulary.'+strCostSharing:{'$exists':True}}},
            {'$unwind':'$formulary'},
            {'$unwind':'$formulary.'+strCostSharing}, # cost_sharing'},
            {'$unwind':'$network'},
            # still use a group stage so the context can collapse into key
            {'$group':{
                    '_id':{
                        'pl':'$plan_id',
                        'ti':'$formulary.drug_tier',
                        'ph':'$formulary.'+strCostSharing+'.pharmacy_type',
                        'cp':'$formulary.'+strCostSharing+'.copay_opt',
                        'ci':'$formulary.'+strCostSharing+'.coinsurance_opt',
                        'nt':'$network.network_tier',
                    },
                    'cnt':{'$sum':1},
                }
            },
            {'$project':{
                    '_id':0,
                    'plan':'$_id.pl',
                    'plan_state':{
                        '$concat':[
                            {'$cond':[{'$or':[{'$eq':['$_id.ti',None]},{'$eq':['$_id.ti','']}]},'NA','$_id.ti']},'|',
                            {'$cond':[{'$or':[{'$eq':['$_id.ph',None]},{'$eq':['$_id.ph','']}]},'NA','$_id.ph']},'|',
                            {'$cond':[{'$or':[{'$eq':['$_id.cp',None]},{'$eq':['$_id.cp','']}]},'NA','$_id.cp']},'|',
                            {'$cond':[{'$or':[{'$eq':['$_id.ci',None]},{'$eq':['$_id.ci','']}]},'NA','$_id.ci']},'|',
                            {'$cond':[{'$or':[{'$eq':['$_id.nt',None]},{'$eq':['$_id.nt','']}]},'NA','$_id.nt']},
                        ]
                    },
                    'count':'$cnt',
                }
            },
            {'$group':{'_id':'$plan', 'count':{'$addToSet':'$count'}, 'plan_states':{'$addToSet':'$plan_state'}}},
            {'$sort':{'plan':1}}
        ]
    )

# get plan state - unique combinations of drug_tier/pharmacy_type/copay_opt/coinsurance_opt from all plans
def getFormularyAllStates(plan_collection, plans):
    if type(plans) is str:
        plans = [plans]
    for p in plan_collection.aggregate(
        [
            {'$match':{'plan_id':{'$in':plans}}},
            {'$unwind':'$formulary'},
            {'$unwind':'$formulary.cost_sharing'},
            # {'$unwind':'$formulary.costSharing'},
            {'$unwind':'$network'},
            # group context from all plans
            {'$group':{
                    '_id':{
                        'ti':'$formulary.drug_tier',
                        'ph':'$formulary.cost_sharing.pharmacy_type',
                        'cp':'$formulary.cost_sharing.copay_opt',
                        'ci':'$formulary.cost_sharing.coinsurance_opt',
                        'nt':'$network.network_tier',
                    },
                }
            },
            {'$project':{
                    '_id':0,
                    'plan_state':{
                        '$concat':[
                            {'$cond':[{'$or':[{'$eq':['$_id.ti',None]},{'$eq':['$_id.ti','']}]},'NA','$_id.ti']},'|',
                            {'$cond':[{'$or':[{'$eq':['$_id.ph',None]},{'$eq':['$_id.ph','']}]},'NA','$_id.ph']},'|',
                            {'$cond':[{'$or':[{'$eq':['$_id.cp',None]},{'$eq':['$_id.cp','']}]},'NA','$_id.cp']},'|',
                            {'$cond':[{'$or':[{'$eq':['$_id.ci',None]},{'$eq':['$_id.ci','']}]},'NA','$_id.ci']},'|',
                            {'$cond':[{'$or':[{'$eq':['$_id.nt',None]},{'$eq':['$_id.nt','']}]},'NA','$_id.nt']},
                        ]
                    },
                }
            },
            {'$group':{'_id':None, 'count':{'$addToSet':'$count'}, 'all_states':{'$addToSet':'$plan_state'}}},
        ]
    ):
        states = p['all_states']
    return states


# get the mean value of copay_amout and coinsurance_rate (over all tier/pharmacy/copay/coinsurance options) for a plan
def getFormularyAggregate(plan_collection, plans):
    if type(plans) is str:
        plans = [plans]
    return plan_collection.aggregate(
        [
            {'$match':{'plan_id':{'$in':plans}}},
            {'$unwind':'$formulary'},
            {'$unwind':'$formulary.cost_sharing'},
            {'$group':{
                '_id':{'plan':'$plan_id'},
                'a_cp':{'$avg':'$formulary.cost_sharing.copay_amount'},
                'a_in':{'$avg':'$formulary.cost_sharing.coinsurance_rate'},
                'cnt':{'$sum':1},
                }
            },
            {'$project':{
                '_id':0,
                'plan':'$_id.plan',
                'avg_copay':'$a_cp',
                'avg_ci_rate':'$a_in',
                'count':'$cnt',
                }
            },
            {'$sort':{'plan':1}}
        ]
    )


###sparse matrix manual mode

In [63]:
# initialize as lil
# test = lil_matrix((3,18))
# test[2,5] = 3.14
test

<3x18 sparse matrix of type '<type 'numpy.float64'>'
	with 1 stored elements in LInked List format>

In [59]:
# convert to csr
t2=csr_matrix(test)

In [11]:
client.close()
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
