In [12]:
from sklearn.preprocessing import OneHotEncoder
from pymongo import MongoClient
from zipfile import ZipFile, ZIP_DEFLATED
from scipy.sparse import *
from sets import Set
import numpy as np
import json, sys, os, time, re, datetime

def logTime():
    """Return the current local date and time as a string, used to prefix log lines."""
    now = datetime.datetime.now()
    # isoformat(' ') yields the same text as str() on a datetime.
    return now.isoformat(' ')

In [13]:
%reload_ext autoreload
%autoreload 2
from aca_drug_feature import *
from aca_plan_feature import *
from aca_provider_feature import *

In [14]:
# Pick the MongoDB deployment and bind the collections used by the cells below.
local = False
if not local:
    client = MongoClient('ec2-54-153-83-172.us-west-1.compute.amazonaws.com', 27017)
    plan_col = client.plans.plans
    drug_col = client.formularies.drugs
    prov_col = client.providers.providers
    faci_col = client.providers.facilities
else:
    client = MongoClient('fc8iasm01', 27017)
    plan_col = client.aca.plan
    drug_col = client.aca.drug
    # NOTE(review): prov_col / faci_col are not bound on this branch, so the
    # provider cells cannot run from a fresh kernel when local is True.

# Distinct plan and drug identifiers found in the drug collection.
all_plan = drug_col.distinct('plans.plan_id')
all_drug = drug_col.distinct('rxnorm_id')

mongo_label = 'local' if local else 'aws'
print('Using %s Mongo, total drug: %d, total plan: %d' % (mongo_label, len(all_drug), len(all_plan)))
# client.formularies.collection_names()
# client.providers.collection_names()

Using aws Mongo, total drug: 46206, total plan: 6035


In [15]:
# Collection sizes and distinct ID counts; every line is time-stamped before
# its value is fetched, so the stamps show how long each lookup took.
for label, fetch in (
        ('plan document', plan_col.count),
        ('drug document', drug_col.count),
        ('unique plan_id', lambda: len(all_plan)),
        ('unique rxnorm_id', lambda: len(all_drug)),
):
    print('%s: %s: %d' % (logTime(), label, fetch()))

# multi_plan = [1 for p in plan_col.aggregate([{"$group": {"_id":"$plan_id", "count":{"$sum":1}}}]) if p['count']>1]
# print '%s: plans with multiple documents: %d' %(logTime(), sum(multi_plan))

# multi_drug = [1 for p in drug_col.aggregate([{"$group": {"_id":"$rxnorm_id", "count":{"$sum":1}}}]) if p['count']>1]
# print '%s: drugs with multiple documents: %d' %(logTime(), sum(multi_drug))

# Characters 5-6 of each plan_id are collected as the state code.
state_id = np.unique([plan_id[5:7] for plan_id in all_plan])
state_list = ', '.join(state_id)
print('%s: states in the plan: %s' % (logTime(), state_list))

2016-07-08 23:08:27.667732: plan document: 12136
2016-07-08 23:08:27.797582: drug document: 1540473
2016-07-08 23:08:27.911088: unique plan_id: 6035
2016-07-08 23:08:27.911235: unique rxnorm_id: 46206
2016-07-08 23:08:27.914224: states in the plan: AK, AL, AR, AZ, CO, DE, FL, GA, HI, IA, IL, IN, KS, KY, LA, MA, ME, MI, MN, MO, MS, MT, NC, ND, NE, NH, NJ, NM, NV, OH, OK, OR, PA, SC, SD, TN, TX, UT, VA, WA, WI, WV, WY
2016-07-08 23:08:27.916281: processing 20 plans for AK


### Main program

In [None]:
# Build the per-plan feature matrices (one row per plan in ex_id):
#   plan_feature      - indicator of formulary states per plan
#   plan_sumstat      - [avg_copay, avg_ci_rate, count] per plan
#   drug_coverage     - indicator of covered rxnorm_ids per plan
#   drug_sumstat      - count per drug state per plan
#   provider_coverage - indicator of in-network NPIs per plan
#   provider_sumstat  - count per provider state per plan

def _first_index(values):
    """Map each value to the position of its first occurrence in `values`.

    `lookup[v]` gives the same answer as `values.index(v)` but in O(1) per
    lookup instead of a linear scan. Values must be hashable.
    """
    lookup = {}
    for pos, value in enumerate(values):
        lookup.setdefault(value, pos)
    return lookup


state = 'OR' # set to None to include all (very slow process for all)
# Match the state on characters 5-6 of the plan_id (the slice this notebook
# uses to derive state codes) rather than on a substring anywhere in the ID.
ex_id = all_plan if not state else [i for i in all_plan if i[5:7] == state]
n_plan = len(ex_id)
plan_row = _first_index(ex_id)  # plan_id -> matrix row
print('%s: processing %d plans for %s' % (logTime(), n_plan, 'all' if not state else state))

print('%s: get formulary state space for all plans' % logTime())
all_plan_states = getFormularyAllStates(plan_col, ex_id)
print('%s: total plan states: %d' % (logTime(), len(all_plan_states)))

print('%s: extract formulary states for each plan' % logTime())
plan_feature = lil_matrix((n_plan, len(all_plan_states)))
n_done = 0
for p in getFormularyStatesForPlan(plan_col, ex_id):
    n_done += 1
    r_id = plan_row[p['_id']]
    for s in p['plan_states']:
        # .index() is kept for the state lists: their element type comes from
        # the aca_*_feature helpers and may not be hashable - TODO confirm.
        plan_feature[r_id, all_plan_states.index(s)] = 1
print('%s: complete for %d plans' % (logTime(), n_done))

print('%s: get summary feature for each plan' % logTime())
# One independent zero row per plan (no shared inner list between rows);
# plans missing from the aggregate keep the zeros.
plan_sumstat = [[0, 0, 0] for _row in range(n_plan)]
n_done = 0
for p in getFormularyAggregate(plan_col, ex_id):
    n_done += 1
    plan_sumstat[plan_row[p['plan']]] = [p['avg_copay'], p['avg_ci_rate'], p['count']]
print('%s: complete for %d plans' % (logTime(), n_done))

print('%s: get all drugs covered by all plans' % logTime())
all_rxnorm = drug_col.find({'plans.plan_id':{'$in':ex_id}}).distinct('rxnorm_id')
print('%s: total rx: %d' % (logTime(), len(all_rxnorm)))

print('%s: check drug coverage for each plan' % logTime())
drug_coverage = lil_matrix((n_plan, len(all_rxnorm)))
rx_col = _first_index(all_rxnorm)  # rxnorm_id -> matrix column
n_done = 0
for p in getDrugListForPlans(drug_col, ex_id):
    n_done += 1
    r_id = plan_row[p['plan']]
    for r in p['drug']:
        drug_coverage[r_id, rx_col[r]] = 1
print('%s: complete for %d plans' % (logTime(), n_done))

print('%s: get summary feature for drug' % logTime())
all_drug_states = getDrugAggregateAllStates(drug_col, ex_id)
print('%s: total drug states: %d' % (logTime(), len(all_drug_states)))

print('%s: extract drug sumstat for each plan' % logTime())
drug_sumstat = lil_matrix((n_plan, len(all_drug_states)))
n_done = 0
for p in getDrugAggregateCountForPlans(drug_col, ex_id):
    n_done += 1
    r_id = plan_row[p['plan']]
    for d in p['drug_state']:
        drug_sumstat[r_id, all_drug_states.index(d['key'])] = d['cnt']
print('%s: complete for %d plans' % (logTime(), n_done))

print('%s: get provider under the plans' % logTime())
all_npi = prov_col.find({'plans.plan_id':{'$in':ex_id}}).distinct('npi')
print('%s: total providers: %d' % (logTime(), len(all_npi)))

print('%s: check provider coverage for each plan' % logTime()) ##### slow #####
provider_coverage = lil_matrix((n_plan, len(all_npi)))
npi_col = _first_index(all_npi)  # npi -> matrix column
n_done = 0
for p in getProviderListForPlans(prov_col, ex_id):
    n_done += 1
    r_id = plan_row[p['plan']]
    for npi in p['npi']:
        provider_coverage[r_id, npi_col[npi]] = 1
print('%s: complete for %d plans' % (logTime(), n_done))

print('%s: get summary feature for provider' % logTime())
all_provider_states = getProviderAllStates(prov_col, ex_id)
print('%s: total provider summary: %d' % (logTime(), len(all_provider_states)))

print('%s: extract provider sumstat for each plan' % logTime())
provider_sumstat = lil_matrix((n_plan, len(all_provider_states)))
# Count the plans handled by this loop so the completion line does not reuse
# the counter left over from the provider-coverage loop.
n_done = 0
for p in getProviderStateForPlans(prov_col, ex_id):
    n_done += 1
    r_id = plan_row[p['_id']]
    for d in p['plan_states']:
        provider_sumstat[r_id, all_provider_states.index(d['key'])] = d['count'] #[d['count'], d['location']]
print('%s: complete for %d plans' % (logTime(), n_done))

2016-07-09 01:17:08.135890: processing 190 plans for OR
2016-07-09 01:17:08.135977: get formulary state space for all plans
2016-07-09 01:17:08.669133: extract formulary states for each plan
2016-07-09 01:17:08.875045: complete for 47 plans
2016-07-09 01:17:08.875637: get summary feature for each plan
2016-07-09 01:17:08.983673: complete for 47 plans
2016-07-09 01:17:08.984398: get all drugs covered by all plans
2016-07-09 01:17:12.387492: total rx: 10632
2016-07-09 01:17:12.388235: check drug coverage for each plan
2016-07-09 01:18:34.382130: complete for 190 plans
2016-07-09 01:18:34.382727: get summary feature for drug
2016-07-09 01:18:46.066163: total drug states: 82
2016-07-09 01:18:46.066909: extract drug sumstat for each plan
2016-07-09 01:18:59.114246: complete for 190 plans
2016-07-09 01:18:59.114705: get provider states
2016-07-09 01:19:02.021983: total providers: 41031

In [38]:
# Fetch the provider state list for the selected plans on its own.
started = logTime()
print('%s: get summary feature for provider' % started)
all_provider_states = getProviderAllStates(prov_col, ex_id)

2016-07-09 00:07:41.266737: get summary feature for provider


In [44]:
# Dict variant of the provider summary: plan _id -> flat list holding a
# (count, location) pair for every entry of all_provider_states.
print('%s: extract provider sumstat for each plan' % logTime())
provider_sumstat = {}
for p in getProviderStateForPlans(prov_col, ex_id):
#     print p['_id'],len(p['plan_states'])
    # Start from a zero pair for every provider state; states the plan
    # reports replace their pair below.
    pairs = [[0, 0] for _k in range(len(all_provider_states))]
    for d in p['plan_states']:
        slot = all_provider_states.index(d['key'])
        pairs[slot] = [d['count'], d['location']]
    # Flatten the pairs into one row, in state order.
    flat_row = []
    for pair in pairs:
        flat_row.extend(pair)
    provider_sumstat[p['_id']] = flat_row

2016-07-09 00:15:20.325784: extract provider sumstat for each plan


In [49]:
# Scratch checks kept for reference; only the final expression is evaluated.
# prov_col.count()
# prov_col.find_one({'facility_name':{'$exists':True}})
# provider_sumstat.values()[0]
# Number of plan IDs currently in ex_id.
len(ex_id)

190

In [53]:
# Timing experiment: build one dense boolean mask over all_npi per plan
# (plan -> list of flags), logging a timestamp for every plan processed.
print(logTime())
# Position of the first occurrence of each NPI in all_npi, built once so the
# inner loop does not rescan the whole list with all_npi.index() per provider.
npi_pos = {}
for pos, npi in enumerate(all_npi):
    npi_pos.setdefault(npi, pos)
test = {}
for p in getProviderListForPlans(prov_col, ex_id):
    print('%s %s' % (logTime(), p['plan']))
    m1 = [False]*len(all_npi)
    for npi in p['npi']:
        m1[npi_pos[npi]] = True
    test[p['plan']] = m1
print(logTime())

### Sparse matrix manual mode

In [63]:
# initialize as lil
# test = lil_matrix((3,18))
# test[2,5] = 3.14
# NOTE(review): with the initialization above commented out, this displays
# whatever object is currently bound to `test` in the kernel.
test

<3x18 sparse matrix of type '<type 'numpy.float64'>'
	with 1 stored elements in LInked List format>

In [59]:
# convert to csr
# csr_matrix comes from the `from scipy.sparse import *` import; the result
# is kept in t2 and `test` itself is left untouched.
t2=csr_matrix(test)

In [11]:
# Close the MongoDB connection, then clear the interactive namespace.
# %reset asks for confirmation before deleting the variables.
client.close()
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
