In [12]:
from sklearn.preprocessing import OneHotEncoder
from pymongo import MongoClient
from zipfile import ZipFile, ZIP_DEFLATED
from sets import Set
import numpy as np
import json, sys, os, time, re, datetime

def logTime():
    return str(datetime.datetime.now())

In [13]:
%reload_ext autoreload
%autoreload 2
from aca_drug_feature import *
from aca_plan_feature import *
from aca_provider_feature import *

In [14]:
local = False
if local:
    client = MongoClient('fc8iasm01', 27017)
    plan_col = client.aca.plan
    drug_col = client.aca.drug
else:
    client = MongoClient('ec2-54-153-83-172.us-west-1.compute.amazonaws.com', 27017)
    plan_col = client.plans.plans
    drug_col = client.formularies.drugs
    prov_col = client.providers.providers
    faci_col = client.providers.facilities

all_plan = drug_col.distinct('plans.plan_id')
all_drug = drug_col.distinct('rxnorm_id')

print 'Using %s Mongo, total drug: %d, total plan: %d' %('local' if local else 'aws', len(all_drug), len(all_plan))
# client.formularies.scollection_names()
# client.providers.collection_names()

Using aws Mongo, total drug: 46206, total plan: 6035


In [15]:
print '%s: plan document: %d' %(logTime(), plan_col.count())
print '%s: drug document: %d' %(logTime(), drug_col.count())
print '%s: unique plan_id: %d' %(logTime(), len(all_plan))
print '%s: unique rxnorm_id: %d' %(logTime(), len(all_drug))

# multi_plan = [1 for p in plan_col.aggregate([{"$group": {"_id":"$plan_id", "count":{"$sum":1}}}]) if p['count']>1]
# print '%s: plans with multiple documents: %d' %(logTime(), sum(multi_plan))

# multi_drug = [1 for p in drug_col.aggregate([{"$group": {"_id":"$rxnorm_id", "count":{"$sum":1}}}]) if p['count']>1]
# print '%s: drugs with multiple documents: %d' %(logTime(), sum(multi_drug))

state_id = np.unique([i[5:7] for i in all_plan])
print '%s: states in the plan: %s' %(logTime(), ', '.join(state_id))

2016-07-08 23:08:27.667732: plan document: 12136
2016-07-08 23:08:27.797582: drug document: 1540473
2016-07-08 23:08:27.911088: unique plan_id: 6035
2016-07-08 23:08:27.911235: unique rxnorm_id: 46206
2016-07-08 23:08:27.914224: states in the plan: AK, AL, AR, AZ, CO, DE, FL, GA, HI, IA, IL, IN, KS, KY, LA, MA, ME, MI, MN, MO, MS, MT, NC, ND, NE, NH, NJ, NM, NV, OH, OK, OR, PA, SC, SD, TN, TX, UT, VA, WA, WI, WV, WY
2016-07-08 23:08:27.916281: processing 20 plans for AK


In [20]:
state = 'OR' # set to None to include all (very slow process for all)
ex_id = all_plan if not state else [i for i in all_plan if state in i]
print '%s: processing %d plans for %s' %(logTime(), len(ex_id), 'all' if not state else state)

2016-07-08 23:15:18.092371: processing 190 plans for OR


###Main program

In [21]:
print '%s: get formulary state space for all plans' %logTime()
all_plan_states = getFormularyAllStates(plan_col, ex_id) 

print '%s: extract formulary states for each plan' %logTime()
plan_feature = {p['_id']:[s in p['plan_states'] for s in all_plan_states] 
                for p in getFormularyStatesForPlan(plan_col, ex_id)}

print '%s: get summary feature for each plan' %logTime()
plan_sumstat = {p['plan']:[p['avg_copay'],p['avg_ci_rate'],p['count']] 
                for p in getFormularyAggregate(plan_col, ex_id)}

print '%s: get all drugs covered by all plans' %logTime()
all_rxnorm = drug_col.find({'plans.plan_id':{'$in':ex_id}}).distinct('rxnorm_id')

print '%s: check drug coverage for each plan' %logTime()
drug_coverage = {p['plan']:[rx in p['drug'] for rx in all_rxnorm] 
                 for p in getDrugListForPlans(drug_col, ex_id)}

print '%s: get summary feature for drug' %logTime()
all_drug_states = getDrugAggregateAllStates(drug_col, ex_id)

print '%s: extract drug sumstat for each plan' %logTime()
drug_sumstat = {}
for p in getDrugAggregateCountForPlans(drug_col, ex_id):
    state_count = [0]*len(all_drug_states)
    for d in p['drug_state']:
        state_count[all_drug_states.index(d['key'])] = d['cnt']
    drug_sumstat[p['plan']] = state_count

print '%s: get provider states' %logTime()
all_npi = prov_col.find({'plans.plan_id':{'$in':ex_id}}).distinct('npi')

print '%s: check provider coverage for each plan' %logTime() # slow
provider_coverage = {p['plan']:[npi in p['npi'] for npi in all_npi] 
                     for p in getProviderListForPlans(prov_col, ex_id)}

print '%s: get summary feature for provider' %logTime()
all_provider_states = getProviderAllStates(prov_col, ex_id)

print '%s: extract provider sumstat for each plan' %logTime()
provider_sumstat = {}
for p in getProviderStateForPlans(prov_col, ex_id):
    p_count = [None]*len(all_provider_states)
    for d in p['plan_states']:
        p_count[all_provider_states.index(d['key'])] = [d['count'], d['location']]
    provider_sumstat[p['_id']] = [y for x in p_count for y in x]

2016-07-08 23:15:25.746746: get formulary state space for all plans
2016-07-08 23:15:25.981393: extract formulary states for each plan
2016-07-08 23:15:26.249072: get summary feature for each plan
2016-07-08 23:15:26.415742: get all drugs covered by all plans
2016-07-08 23:17:36.315615: check drug coverage for each plan
2016-07-08 23:21:53.579294: get summary feature for drug
2016-07-08 23:22:52.562991: extract drug sumstat for each plan
2016-07-08 23:23:41.758832: get provider states
2016-07-08 23:24:28.504206: check provider coverage for each plan


KeyboardInterrupt: 

In [38]:
print '%s: get summary feature for provider' %logTime()
all_provider_states = getProviderAllStates(prov_col, ex_id)

2016-07-09 00:07:41.266737: get summary feature for provider


In [39]:
all_provider_states

[u'Marriage and Family|INDIVIDUAL|accepting|Urdu|EPO',
 u'Pediatric Urology|INDIVIDUAL|accepting|English|TIER-ONE',
 u'Pediatrics & Endocrinology|INDIVIDUAL|accepting|English|TIER-ONE',
 u'Neurology|INDIVIDUAL|accepting|Spanish|TIER-ONE',
 u'Cardiology|INDIVIDUAL|accepting|ENGLISH|LIFEWISE',
 u'Acupuncture|INDIVIDUAL|accepting|Farsi|LIFEWISE',
 u'Obstetrics & Gynecology|INDIVIDUAL|accepting|Swedish|OREGON-EPO',
 u'Podiatry|INDIVIDUAL|accepting|English|OREGON-EPO',
 u'Orthopedic Surgery|INDIVIDUAL|not accepting|English|TIER-ONE',
 u'Internal Medicine & Gastroenterology|INDIVIDUAL|not accepting|English|TIER-ONE',
 u'Cognitive Behavioral Therapy|INDIVIDUAL|accepting|Norwegian|EPO',
 u'Podiatry|INDIVIDUAL|accepting|Finnish|OREGON-EPO',
 u'Nephrology|INDIVIDUAL|accepting|English|TIER-ONE',
 u'Infectious Disease|INDIVIDUAL|accepting|ENGLISH|LIFEWISE',
 u'ONCOLOGIST/HEMATOLOGIST|INDIVIDUAL|accepting|FARSI|PREFERRED',
 u'Unknown Specialty|INDIVIDUAL|accepting|NA|PREFERRED',
 u'INTERNAL MEDICIN

In [36]:
print '%s: extract provider sumstat for each plan' %logTime()
provider_sumstat = {}
for p in getProviderStateForPlans(prov_col, ex_id):
#     print p['_id'],len(p['plan_states'])
    p_count = [None]*len(all_provider_states)
    for d in p['plan_states']:
        p_count[all_provider_states.index(d['key'])] = [d['count'], d['location']]
    provider_sumstat[p['_id']] = [y for x in p_count for y in x]

2016-07-09 00:00:52.859314: extract provider sumstat for each plan


TypeError: 'NoneType' object is not iterable

In [33]:
# prov_col.count()
# prov_col.find_one({'facility_name':{'$exists':True}})
len(all_provider_states)

4864

In [27]:
faci_col.find_one() #{'npi':'1215961677'})

{u'_id': ObjectId('5775ff89c421d272dcd6877c'),
 u'addresses': [{u'address': u'4909 W DIVISION',
   u'address_2': u'',
   u'city': u'CHICAGO',
   u'phone': u'7732871200',
   u'state': u'IL',
   u'zip': u'60651'}],
 u'facility_name': u'RISHI PHARMACY INC',
 u'facility_type': [u'Pharmacy'],
 u'last_updated_on': u'2015-09-30',
 u'npi': u'1316955651',
 u'plans': [{u'name': u'Health Republic Full Access Core Platinum',
   u'network_tier': u'PREFERRED',
   u'plan_id': u'10191NJ0050003',
   u'plan_id_type': u'HIOS-PLAN-ID'},
  {u'name': u'Health Republic Full Access Core Gold',
   u'network_tier': u'PREFERRED',
   u'plan_id': u'10191NJ0050002',
   u'plan_id_type': u'HIOS-PLAN-ID'},
  {u'name': u'Health Republic Full Access Core Silver',
   u'network_tier': u'PREFERRED',
   u'plan_id': u'10191NJ0050001',
   u'plan_id_type': u'HIOS-PLAN-ID'},
  {u'name': u'Health Republic Monmouth County Community Plan Bronze',
   u'network_tier': u'PREFERRED',
   u'plan_id': u'10191NJ0180001',
   u'plan_id_type

In [11]:
client.close()
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
