In [64]:
from sklearn.preprocessing import OneHotEncoder
from pymongo import MongoClient, DESCENDING
from zipfile import ZipFile, ZIP_DEFLATED
from scipy.sparse import *
from scipy import stats
from sklearn import svm
from sets import Set
import numpy as np
import json, sys, os, time, re, datetime, itertools, pickle

def logTime():
    return str(datetime.datetime.now())

In [3]:
%reload_ext autoreload
%autoreload 2
from aca_drug_feature import *
from aca_plan_feature import *
from aca_provider_feature import *

In [4]:
local = False
if local:
    client = MongoClient('fc8iasm01', 27017)
    plan_col = client.aca.plan
    drug_col = client.aca.drug
else:
    client = MongoClient('ec2-54-153-83-172.us-west-1.compute.amazonaws.com', 27017)
    plan_col = client.plans.plans
    drug_col = client.formularies.drugs
    prov_col = client.providers.providers
    faci_col = client.providers.facilities

all_plan = drug_col.distinct('plans.plan_id')
all_drug = drug_col.distinct('rxnorm_id')

print '%s: using %s Mongo, total drug: %d, total plan: %d, total provider: %d' %(
    logTime(), 'local' if local else 'aws', len(all_drug), len(all_plan), prov_col.count())
# client.formularies.scollection_names()
# client.providers.collection_names()

2016-07-10 14:18:20.846091: using aws Mongo, total drug: 46206, total plan: 6035, total provider: 8799098


In [5]:
print '%s: plan document: %d' %(logTime(), plan_col.count())
print '%s: drug document: %d' %(logTime(), drug_col.count())
print '%s: provider document: %d' %(logTime(), prov_col.count())
print '%s: facility document: %d' %(logTime(), faci_col.count())
print '%s: unique plan_id: %d' %(logTime(), len(all_plan))
print '%s: unique rxnorm_id: %d' %(logTime(), len(all_drug))

# multi_plan = [1 for p in plan_col.aggregate([{"$group": {"_id":"$plan_id", "count":{"$sum":1}}}]) if p['count']>1]
# print '%s: plans with multiple documents: %d' %(logTime(), sum(multi_plan))

# multi_drug = [1 for p in drug_col.aggregate([{"$group": {"_id":"$rxnorm_id", "count":{"$sum":1}}}]) if p['count']>1]
# print '%s: drugs with multiple documents: %d' %(logTime(), sum(multi_drug))

state_id = np.unique([i[5:7] for i in all_plan])
print '%s: states in the plan: %s' %(logTime(), ', '.join(state_id))

2016-07-09 14:07:38.909204: plan document: 12136
2016-07-09 14:07:38.998114: drug document: 1540473
2016-07-09 14:07:39.084850: provider document: 8799098
2016-07-09 14:07:39.171320: facility document: 4815321
2016-07-09 14:07:39.264177: unique plan_id: 6035
2016-07-09 14:07:39.264318: unique rxnorm_id: 46206
2016-07-09 14:07:39.267385: states in the plan: AK, AL, AR, AZ, CO, DE, FL, GA, HI, IA, IL, IN, KS, KY, LA, MA, ME, MI, MN, MO, MS, MT, NC, ND, NE, NH, NJ, NM, NV, OH, OK, OR, PA, SC, SD, TN, TX, UT, VA, WA, WI, WV, WY


In [149]:
state = 'UT' # set to None to include all (very slow process for all)
ex_id = all_plan if not state else [i for i in all_plan if state in i]
n_plan = len(ex_id)
print '%s: processing %d plans for %s' %(logTime(), len(ex_id), 'all' if not state else state)

2016-07-10 17:27:55.363123: processing 74 plans for UT


###Main Process

In [151]:
state = 'UT' # set to None to include all (very slow process for all)
ex_id = all_plan if not state else [i for i in all_plan if state in i]
n_plan = len(ex_id)
print '%s: processing %d plans for %s' %(logTime(), len(ex_id), 'all' if not state else state)

print '%s: 1/11 get formulary state space for all plans' %logTime()
all_plan_states = getFormularyAllStates1(plan_col, ex_id) + \
                  getFormularyAllStates2(plan_col, ex_id) + \
                  getFormularyAllStates3(plan_col, ex_id) 
print '%s: total formulary states: %d' %(logTime(), len(all_plan_states))

print '%s: 2/11 extract formulary states for each plan' %logTime()
plan_feature = lil_matrix((n_plan, len(all_plan_states)))
valid_plan1 = []
for f in [getFormularyStatesForPlan1,getFormularyStatesForPlan2,getFormularyStatesForPlan3]:
    for p in f(plan_col, ex_id):
        r_id = ex_id.index(p['_id'])
        valid_plan1.append(p['_id'])
        for s in p['plan_states']:
            plan_feature[r_id, all_plan_states.index(s)] = 1        
print '%s: complete for %d plans' %(logTime(), len(valid_plan1))

print '%s: 3/11 get formulary summary feature for each plan' %logTime()
plan_sumstat = [[0]*3]*len(valid_plan1)
for p in getFormularyAggregate(plan_col, valid_plan1):
    r_id = ex_id.index(p['plan'])
    plan_sumstat[r_id] = [p['avg_copay'],p['avg_ci_rate'],p['count']]
print '%s: complete for %d plans' %(logTime(), len(valid_plan1))
    
print '%s: 4/11 get all drugs covered by all plans' %logTime()
all_rxnorm = drug_col.find({'plans.plan_id':{'$in':valid_plan1}}).distinct('rxnorm_id')
print '%s: total rx: %d' %(logTime(), len(all_rxnorm))

print '%s: 5/11 check drug coverage for each plan' %logTime()
drug_coverage = lil_matrix((n_plan, len(all_rxnorm)))
valid_plan2 = []
for p in getDrugListForPlans(drug_col, valid_plan1):
    valid_plan2.append(p['plan'])
    r_id = ex_id.index(p['plan'])
    for r in p['drug']:
        drug_coverage[r_id, all_rxnorm.index(r)] = 1
print '%s: complete for %d plans' %(logTime(), len(valid_plan2))

print '%s: 6/11 get summary feature for drug' %logTime()
all_drug_states = getDrugAggregateAllStates(drug_col, valid_plan2)
print '%s: total drug states: %d' %(logTime(), len(all_drug_states))

print '%s: 7/11 extract drug sumstat for each plan' %logTime()
drug_sumstat = lil_matrix((n_plan, len(all_drug_states)))
valid_plan3 = []
for p in getDrugAggregateCountForPlans(drug_col, valid_plan2):
    valid_plan3.append(p['plan'])
    r_id = ex_id.index(p['plan'])
    for d in p['drug_state']:
        drug_sumstat[r_id, all_drug_states.index(d['key'])] = d['cnt']
print '%s: complete for %d plans' %(logTime(), len(valid_plan3))

print '%s: 8/11 get provider under the plans' %logTime()
all_npi = prov_col.find({'plans.plan_id':{'$in':valid_plan3}}).distinct('npi')
print '%s: total providers: %d' %(logTime(), len(all_npi))

print '%s: 9/11 check provider coverage for each plan' %logTime() ##### slow #####
provider_coverage = lil_matrix((n_plan, len(all_npi)))
valid_plan4 = []
for p in getProviderListForPlans(prov_col, valid_plan3):
    valid_plan4.append(p['plan'])
    r_id = ex_id.index(p['plan'])
    for npi in p['npi']:
        provider_coverage[r_id, all_npi.index(npi)] = 1
print '%s: complete for %d plans' %(logTime(), len(valid_plan4))

print '%s: 10/11 get summary feature for provider' %logTime()
all_provider_states = getProviderAllStates(prov_col, valid_plan4)
print '%s: total provider summary: %d' %(logTime(), len(all_provider_states))

print '%s: 11/11 extract provider sumstat for each plan' %logTime()
provider_sumstat = lil_matrix((n_plan, len(all_provider_states)))
valid_plan5 = []
for p in getProviderStateForPlans(prov_col, valid_plan4):
    r_id = ex_id.index(p['_id'])
    valid_plan5.append(p['_id'])
    for d in p['plan_states']:
        provider_sumstat[r_id, all_provider_states.index(d['key'])] = d['count'] #[d['count'], d['location']]
print '%s: complete for %d plans' %(logTime(), len(valid_plan5))

2016-07-10 17:29:51.997403: processing 74 plans for UT
2016-07-10 17:29:51.997520: 1/11 get formulary state space for all plans
2016-07-10 17:29:52.317457: total formulary states: 77
2016-07-10 17:29:52.318247: 2/11 extract formulary states for each plan
2016-07-10 17:29:52.835207: complete for 74 plans
2016-07-10 17:29:52.835771: 3/11 get formulary summary feature for each plan
2016-07-10 17:29:52.941727: complete for 74 plans
2016-07-10 17:29:52.942321: 4/11 get all drugs covered by all plans
2016-07-10 17:29:53.445309: total rx: 6990
2016-07-10 17:29:53.445854: 5/11 check drug coverage for each plan
2016-07-10 17:30:23.566267: complete for 74 plans
2016-07-10 17:30:23.566872: 6/11 get summary feature for drug
2016-07-10 17:30:26.199454: total drug states: 56
2016-07-10 17:30:26.200027: 7/11 extract drug sumstat for each plan
2016-07-10 17:30:29.222912: complete for 74 plans
2016-07-10 17:30:29.223427: 8/11 get provider under the plans
2016-07-10 17:30:29.735360: total providers: 104

In [153]:
# combine features
plan_sumstat = lil_matrix(plan_sumstat)
feature_mat = [plan_feature, drug_coverage, drug_sumstat, provider_coverage, provider_sumstat, plan_sumstat]
n_fea = sum(m.shape[1] for m in feature_mat)
total_feature = lil_matrix((len(valid_plan5), n_fea))
for i in range(len(valid_plan5)):
    r_id = ex_id.index(valid_plan5[i])
    total_feature[i] = hstack([m.getrow(r_id) for m in feature_mat])
print '%s: feature dimension: %s' %(logTime(), total_feature.shape)    

saveName = '%s_%d_%d.pickle' %(state, len(valid_plan5), n_fea)
with open(saveName, 'w') as f:
    pickle.dump([total_feature, valid_plan5], f)
print '%s: feature matrix saved as %s' %(logTime(), saveName)

del plan_feature
del plan_sumstat
del drug_coverage
del drug_sumstat
del provider_coverage 
del provider_sumstat

2016-07-10 17:33:20.075173: feature dimension: (74, 19243)
2016-07-10 17:33:22.425142: feature matrix saved as UT_74_19243.pickle


###Load saved feature data (feature matrix and plan IDs)

In [154]:
# Getting back the objects:
savedData = 'UT_74_19243.pickle'
with open(savedData) as f: 
    total_feature, plans = pickle.load(f)
print '%s: data loaded: %s' %(logTime(), total_feature.shape)

2016-07-10 17:34:18.042346: data loaded: (74, 19243)


###Query to rank plans based on number of specialities in providers the plan covers

In [155]:
i=0
provider_rank = []
for p in prov_col.aggregate(
        [
            {'$match':{'plans.plan_id':{'$in':plans}, 'facility_name':{'$exists':False}}},
            {'$unwind':'$plans'},
            {'$match':{'plans.plan_id':{'$in':plans}}},
            {'$unwind':'$speciality'},
            {'$unwind':'$languages'},
            {'$group':{
                    '_id':{
                        'pl':'$plans.plan_id',
                        'sp':'$speciality',
                    },
                    'cnt':{'$sum':1},
                    'loc':{'$sum':{'$size':'$addresses'}}
                }
            },
            {'$project':{'_id':0, 'plan':'$_id.pl', 'speciality':'$_id.sp', 'count':'$cnt'}},
            {'$group':{'_id':'$plan', 'speciality_cnt':{'$sum':1}, 'provider_cnt':{'$sum':'$count'} }},
            {'$sort':{'speciality_cnt':-1, 'provider_cnt':-1}}        
        ], #allowDiskUse=True
    ):
    i+=1
    provider_rank.append(p['_id'])
    print p
    
print i

{u'speciality_cnt': 119, u'_id': u'18167UT0010001', u'provider_cnt': 7861}
{u'speciality_cnt': 119, u'_id': u'18167UT0010003', u'provider_cnt': 7861}
{u'speciality_cnt': 119, u'_id': u'18167UT0010002', u'provider_cnt': 7861}
{u'speciality_cnt': 114, u'_id': u'68781UT0030014', u'provider_cnt': 8818}
{u'speciality_cnt': 114, u'_id': u'68781UT0030016', u'provider_cnt': 8818}
{u'speciality_cnt': 114, u'_id': u'68781UT0030010', u'provider_cnt': 8818}
{u'speciality_cnt': 114, u'_id': u'68781UT0030002', u'provider_cnt': 8818}
{u'speciality_cnt': 114, u'_id': u'68781UT0030011', u'provider_cnt': 8818}
{u'speciality_cnt': 114, u'_id': u'68781UT0140005', u'provider_cnt': 8818}
{u'speciality_cnt': 114, u'_id': u'68781UT0030005', u'provider_cnt': 8818}
{u'speciality_cnt': 114, u'_id': u'68781UT0030001', u'provider_cnt': 8818}
{u'speciality_cnt': 114, u'_id': u'68781UT0030009', u'provider_cnt': 8818}
{u'speciality_cnt': 114, u'_id': u'68781UT0030006', u'provider_cnt': 8818}
{u'speciality_cnt': 114, 

In [156]:
train_rank = provider_rank[0::2]
test_rank = provider_rank[1::2]
rank_index = [plans.index(x) for x in train_rank]

pair_fea, pair_diff = [], []

for rank in [rank_index]:
    k = 0
    for i,j in itertools.combinations(range(len(rank)), 2):
        if k%2 == 0:
            pair_fea.append(total_feature.getrow(rank[i]) - total_feature.getrow(rank[j]))
        else:
            pair_fea.append(total_feature.getrow(rank[j]) - total_feature.getrow(rank[i]))            
        pair_diff.append((-1)**k)        
        k += 1

print '%s: total training sample: %d' %(logTime(), len(pair_diff))

2016-07-10 17:34:55.952765: total training sample: 666


In [157]:
clf = svm.SVC(kernel='linear', C=.1)
clf.fit(vstack(pair_fea), pair_diff)
coef = clf.coef_.toarray()[0]

In [158]:
test_weight = [np.dot(coef, total_feature.getrow(plans.index(p)).toarray()[0]) for p in test_rank]
letor_rank_ind = np.argsort(test_weight)[::-1]
letor_rank = [test_rank[i] for i in letor_rank_ind]
k_tau = stats.kendalltau(letor_rank,test_rank)
print 'Kendall\'s tau is: %.4f' %k_tau[0]

Kendall's tau is: 0.7207


###Query to rank plans based on drug tier policies

In [168]:
raw = []

for p in plan_col.aggregate(
    [
        {'$match':{'plan_id':{'$in':plans}}},
        {'$unwind':'$formulary'},
        {'$unwind':'$formulary.cost_sharing'},
        {'$group':{'_id':'$plan_id','n_drug':{'$sum':1}, 
                   'avg_copay':{'$avg':'$formulary.cost_sharing.copay_amount'},
                   'avg_coinsure':{'$avg':'$formulary.cost_sharing.coinsurance_rate'}}},
        {'$sort':{'n_drug':-1}},
    ]
):
    raw.append((str(p['_id']), int(p['n_drug']), float(p['avg_copay']), float(p['avg_coinsure'])))

# pymongo doesn't sort aggregation for multiple field, so do it manually
vtype = [('plan', str), ('s1', int), ('s2', float), ('s3', float)]
plan_rank_index = [i for i in np.argsort(np.array(raw,dtype=vtype), order=['s3', 's2', 's1'])][::-1]
plan_rank =[raw[i][0] for i in plan_rank_index]
for i in plan_rank_index:
    print raw[i]

('18167UT0010003', 9, 26.666666666666668, 11.88888888888889)
('18167UT0010002', 9, 21.666666666666668, 8.88888888888889)
('18167UT0010001', 9, 21.666666666666668, 5.888888888888889)
('42261UT0050003', 12, 0.0, 0.625)
('68781UT0130005', 9, 2.2222222222222223, 0.2222222222222222)
('68781UT0020005', 9, 2.2222222222222223, 0.2222222222222222)
('68781UT0030005', 9, 2.2222222222222223, 0.2222222222222222)
('68781UT0120005', 9, 2.2222222222222223, 0.2222222222222222)
('68781UT0140005', 9, 2.2222222222222223, 0.2222222222222222)
('68781UT0010005', 9, 2.2222222222222223, 0.2222222222222222)
('68781UT0030016', 9, 2.2222222222222223, 0.21111111111111114)
('68781UT0010016', 9, 2.2222222222222223, 0.21111111111111114)
('68781UT0020016', 9, 2.2222222222222223, 0.21111111111111114)
('68781UT0020011', 9, 3.3333333333333335, 0.18888888888888888)
('68781UT0010011', 9, 3.3333333333333335, 0.18888888888888888)
('68781UT0030011', 9, 3.3333333333333335, 0.18888888888888888)
('68781UT0010001', 9, 2.222222222

In [169]:
train_rank = plan_rank[0::2]
test_rank =  plan_rank[1::2]
rank_index = [plans.index(x) for x in train_rank]

pair_fea, pair_diff = [], []

for rank in [rank_index]:
    k = 0
    for i,j in itertools.combinations(range(len(rank)), 2):
        if k%2 == 0:
            pair_fea.append(total_feature.getrow(rank[i]) - total_feature.getrow(rank[j]))
        else:
            pair_fea.append(total_feature.getrow(rank[j]) - total_feature.getrow(rank[i]))            
        pair_diff.append((-1)**k)        
        k += 1

print '%s: total training sample: %d' %(logTime(), len(pair_diff))

2016-07-10 17:46:52.365654: total training sample: 666


In [170]:
clf = svm.SVC(kernel='linear', C=.1)
clf.fit(vstack(pair_fea), pair_diff)
coef = clf.coef_.toarray()[0]

In [171]:
test_weight = [np.dot(coef, total_feature.getrow(plans.index(p)).toarray()[0]) for p in test_rank]
letor_rank_ind = np.argsort(test_weight)[::-1]
letor_rank = [test_rank[i] for i in letor_rank_ind]
k_tau = stats.kendalltau(letor_rank,test_rank)
print 'Kendall\'s tau is: %.4f' %k_tau[0]

Kendall's tau is: 0.3694


###Combine both rank for training

In [163]:
train_rank = [plan_rank[0::2], provider_rank[0::2]]
test_rank = [plan_rank[1::2], provider_rank[1::2]]
rank_index = [[plans.index(x) for x in t] for t in train_rank]

pair_fea, pair_diff = [], []

for rank in rank_index:
    k = 0
    for i,j in itertools.combinations(range(len(rank)), 2):
        if k%2 == 0:
            pair_fea.append(total_feature.getrow(rank[i]) - total_feature.getrow(rank[j]))
        else:
            pair_fea.append(total_feature.getrow(rank[j]) - total_feature.getrow(rank[i]))            
        pair_diff.append((-1)**k)        
        k += 1

print '%s: total training sample: %d' %(logTime(), len(pair_diff))

2016-07-10 17:36:10.505711: total training sample: 1332


In [164]:
clf = svm.SVC(kernel='linear', C=.1)
clf.fit(vstack(pair_fea), pair_diff)
coef = clf.coef_.toarray()[0]

In [165]:
test_weight = [[np.dot(coef, total_feature.getrow(plans.index(p)).toarray()[0]) for p in t] for t in test_rank]
letor_rank_ind = [np.argsort(t)[::-1] for t in test_weight]
letor_rank = [[tr[i] for i in t] for t, tr in zip(letor_rank_ind, test_rank)]
k_tau = [stats.kendalltau(l,t)[0] for l,t in zip(letor_rank, test_rank)]
print 'Kendall\'s tau is: %s' %k_tau

Kendall's tau is: [0.075075075075075062, 0.23723723723723719]


In [11]:
client.close()
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
