In [1]:
import numpy as np
from sets import Set
import json

### Generate encode definition from mappings.json

In [171]:
# Load the raw mapping definition; the context manager closes the file
# handle as soon as parsing is done instead of leaving it to the GC.
with open('mappings.json') as mapping_file:
    mappings = json.load(mapping_file)

def addEncode(fields):
    ''' Add an 'encode' flag (initialised to 0) to every leaf field of a mapping.

    fields -- nested dict loaded from the mapping JSON; it is modified in place.

    A dict is treated as a leaf field definition when it has a 'type' entry
    whose value does not itself contain 'type' (a field that is merely *named*
    'type' holds a nested definition and is descended into instead).
    Values that are not dicts (e.g. a plain string attribute next to
    'properties') are skipped rather than recursed into.
    Returns None.
    '''
    if not isinstance(fields, dict):
        # scalar attribute of the mapping: nothing to tag, nothing to descend into
        return
    if ('type' in fields) and ('type' not in fields['type']):
        fields['encode'] = 0
    else:
        for v in fields.values():
            addEncode(v)
        
# Tag every leaf field of the loaded mapping in place ...
addEncode(mappings)

# ... then persist only the 'mappings' subtree as the encode definition file.
with open('encode2.json', 'w') as outfile:
    outfile.write(json.dumps(mappings['mappings']))

### Load encode definition

In [41]:

def getEncodeFields(encode_def, rtn, path=''):
    ''' Collect the fields selected for encoding from the encode JSON definition.

    encode_def -- nested dict; a dict carrying an 'encode' entry is a leaf field
    rtn        -- dict that receives the result and is also returned
    path       -- '/'-joined keys walked so far (used by the recursion)

    Every leaf with encode == 1 is stored as rtn[<path without leading '/'>] = <its 'type'>.
    Leaves with any other 'encode' value are ignored, and non-dict values
    (plain attributes of the definition) are skipped instead of being iterated.
    '''
    if not isinstance(encode_def, dict):
        # scalar attribute: neither a field definition nor a container of fields
        return rtn

    if 'encode' not in encode_def:
        for f in encode_def:
            getEncodeFields(encode_def[f], rtn, path + '/' + f)
    elif encode_def['encode'] == 1:
        # path always starts with '/', drop it for the stored key
        rtn[path[1:]] = encode_def['type']
    return rtn


### Retrieve unique values

In [42]:
# get unique values for each string encode field (prepare for one-hot-encode)

def getFieldValue(data, key, val, add_call):
    '''
    Walk the json tree `data` along the key path `key` and hand every value
    found at the end of the path to add_call(val, value).

    When `data` is a list, the current key component is consumed without a
    lookup and the remaining path is applied to each element; otherwise the
    component is used to index `data`. The accumulator `val` is returned.
    '''
    if len(key) == 1:
        # final component: collect the value itself
        add_call(val, data[key[0]])
        return val

    tail = key[1:]
    if type(data) is list:
        children = data
    else:
        children = [data[key[0]]]

    for child in children:
        getFieldValue(child, tail, val, add_call)
    return val
        
def add_set(vSet, val):
    ''' Add `val` to the set `vSet` in place; passed to getFieldValue as its
    add_call when unique values are wanted. Returns None. '''
    vSet.add(val)
    
def add_list(vList, val):
    ''' Append `val` to the list `vList` in place; passed to getFieldValue as
    its add_call when every value (in traversal order) is wanted. Returns None. '''
    vList.append(val)
        

### Extract encode variable list and generate feature space

In [3]:
# Load both raw data sets; the context managers close the files right after parsing.
with open('plan.json') as plan_file:
    plan = json.load(plan_file)
with open('drug.json') as drug_file:
    drug = json.load(drug_file)

# Single tree whose top-level keys match the first component of the encode paths
drug_plan_data = {'plan': plan, 'drug': drug}

In [43]:
# read file
# read the encode definition (closing the file once parsed)
with open('encode2.json') as encode_file:
    encode = json.load(encode_file)

# path -> type of every field flagged with encode == 1
encode_list = getEncodeFields(encode, {})

# path -> list of the distinct values found in the data.
# The position of a value in its list is later used as its integer code;
# the order comes from set iteration and is therefore arbitrary.
feature_space = {k: list(getFieldValue(drug_plan_data, k.split('/'), set(), add_set))
                 for k in encode_list}

feature_space

{u'drug/properties/plans/properties/drug_tier': [u'BRAND',
  u'SPECIALTY',
  u'VALUE',
  u'MEDICAL-SERVICE-DRUGS',
  u'PREFERRED',
  u'PREVENTIVE',
  u'SELECT'],
 u'drug/properties/plans/properties/plan_id': [u'73836AK0840001',
  u'73836AK0850001',
  u'73836AK0850002',
  u'73836AK0790001',
  u'73836AK0790002',
  u'73836AK0860001',
  u'73836AK0750004',
  u'73836AK0750001',
  u'73836AK0860002',
  u'73836AK0750003',
  u'73836AK0750002'],
 u'drug/properties/plans/properties/prior_authorization': [False, True],
 u'drug/properties/plans/properties/quantity_limit': [False, True],
 u'drug/properties/plans/properties/step_therapy': [False, True],
 u'plan/properties/formulary/properties/cost_sharing/properties/coinsurance_opt': [u'AFTER-DEDUCTIBLE',
  None],
 u'plan/properties/formulary/properties/cost_sharing/properties/coinsurance_rate': [0.0,
  0.4,
  0.3,
  0.15,
  0.45,
  0.35],
 u'plan/properties/formulary/properties/cost_sharing/properties/copay_amount': [0.0,
  6.0,
  135.0,
  45.0,
  18

### Investigate cost sharing policy for each drug tier

In [6]:
# For every plan, list the cost-sharing entries of its 'VALUE' drug tier as
# pharmacy_type/coinsurance_opt/copay_opt (one printed list per matching formulary).
for p in drug_plan_data['plan']:
    for f in p['formulary']:
        if f['drug_tier'] == 'VALUE':
            # print(...) with a single argument behaves the same on Python 2 and 3
            print(['/'.join((c['pharmacy_type'], str(c['coinsurance_opt']), c['copay_opt']))
                   for c in f['cost_sharing']])


[u'3-MONTH-IN-MAIL/None/BEFORE-DEDUCTIBLE', u'3-MONTH-IN-RETAIL/None/BEFORE-DEDUCTIBLE', u'3-MONTH-OUT-RETAIL/None/BEFORE-DEDUCTIBLE']
[u'3-MONTH-IN-MAIL/None/BEFORE-DEDUCTIBLE', u'3-MONTH-IN-RETAIL/None/BEFORE-DEDUCTIBLE', u'3-MONTH-OUT-RETAIL/None/BEFORE-DEDUCTIBLE']
[u'3-MONTH-IN-MAIL/None/BEFORE-DEDUCTIBLE', u'3-MONTH-IN-RETAIL/None/BEFORE-DEDUCTIBLE', u'3-MONTH-OUT-RETAIL/None/BEFORE-DEDUCTIBLE']
[u'3-MONTH-IN-MAIL/None/BEFORE-DEDUCTIBLE', u'3-MONTH-IN-RETAIL/None/BEFORE-DEDUCTIBLE', u'3-MONTH-OUT-RETAIL/None/BEFORE-DEDUCTIBLE']
[u'3-MONTH-IN-MAIL/None/BEFORE-DEDUCTIBLE', u'3-MONTH-IN-RETAIL/None/BEFORE-DEDUCTIBLE', u'3-MONTH-OUT-RETAIL/None/BEFORE-DEDUCTIBLE']
[u'3-MONTH-IN-MAIL/None/NO-CHARGE', u'3-MONTH-IN-RETAIL/None/BEFORE-DEDUCTIBLE', u'3-MONTH-OUT-RETAIL/None/BEFORE-DEDUCTIBLE']
[u'3-MONTH-IN-MAIL/None/NO-CHARGE', u'3-MONTH-IN-RETAIL/None/BEFORE-DEDUCTIBLE', u'3-MONTH-OUT-RETAIL/None/BEFORE-DEDUCTIBLE']
[u'3-MONTH-IN-MAIL/None/BEFORE-DEDUCTIBLE', u'3-MONTH-IN-RETAIL/None/BE

### Generate integer feature vector for plan data

In [47]:
def getIntegerFeature(dataJsonList, dataType, featureSpace, encodeDefinition):
    ''' Extract the features of one data type from a list of json records.

    dataJsonList     -- list of records (e.g. all plans or all drugs)
    dataType         -- key prefix selecting the features to extract ('plan', 'drug', ...)
    featureSpace     -- dict: feature path -> list of its distinct values
    encodeDefinition -- dict: feature path -> type name

    Returns a dict: feature path -> list with one entry per record, each entry
    being the list of values found in that record. Values of 'string' features
    are replaced by their index in featureSpace[path]; values of any other
    type are kept as they are. Each processed path is printed.
    '''
    features = {}

    for k, v in featureSpace.items():
        if not k.startswith(dataType):
            continue
        print(k)
        # we want the feature for each record (plan, drug etc.), so drop the
        # first 2 key components (data type and 'properties')
        path = k.split('/')[2:]
        # integer representation for categorical variables, raw value for bool and float
        is_categorical = encodeDefinition[k] == 'string'

        per_record = []
        for r in dataJsonList:
            values = getFieldValue(r, path, [], add_list)
            if is_categorical:
                values = [v.index(x) for x in values]
            per_record.append(values)
        features[k] = per_record

    return features

# Encode every 'plan...' feature of the plan records; the call prints each processed key.
plan_int_features = getIntegerFeature(drug_plan_data['plan'], 'plan', feature_space, encode_list)

plan/properties/formulary/properties/cost_sharing/properties/pharmacy_type
plan/properties/formulary/properties/drug_tier
plan/properties/network/properties/network_tier
plan/properties/formulary/properties/cost_sharing/properties/coinsurance_rate
plan/properties/plan_id
plan/properties/plan_id_type
plan/properties/formulary/properties/mail_order
plan/properties/formulary/properties/cost_sharing/properties/coinsurance_opt
plan/properties/formulary/properties/cost_sharing/properties/copay_opt
plan/properties/formulary/properties/cost_sharing/properties/copay_amount


In [246]:
# Inspect one encoded feature (one list of indices per plan record).
# The commented-out lines are alternative keys to inspect the same way.
# plan_int_features['plan/properties/plan_id']
plan_int_features['plan/properties/formulary/properties/drug_tier']
# plan_int_features['plan/properties/formulary/properties/cost_sharing/properties/copay_amount']

[[6, 4, 0, 1, 5, 2, 3],
 [6, 4, 0, 1, 5, 2, 3],
 [6, 4, 0, 1, 5, 2, 3],
 [6, 4, 0, 1, 5, 2, 3],
 [6, 4, 0, 1, 5, 2, 3],
 [6, 4, 0, 1, 5, 2, 3],
 [6, 4, 0, 1, 5, 2, 3],
 [6, 4, 0, 1, 5, 2, 3],
 [6, 4, 0, 1, 5, 2, 3],
 [6, 4, 0, 1, 5, 2, 3],
 [6, 4, 0, 1, 5, 2, 3],
 [],
 [],
 [],
 []]

### Assemble plan feature

In [340]:
def assemblePlanFeature(intFea, feaSpace, encodeDef, planIdKey = 'plan/properties/plan_id'):
    ''' Concatenate the per-feature value lists into one flat vector per plan.

    intFea    -- dict: feature key -> list (one entry per plan record) of value lists
    feaSpace  -- dict: feature key -> list of distinct values; only
                 feaSpace[planIdKey] is read, to map a plan-id index to its name
    encodeDef -- dict: feature key -> type name; 'string' marks a categorical feature
    planIdKey -- key of the plan-id feature (used as row label, not as a feature)

    Returns (plan_fea, cat_index):
      plan_fea  -- dict: plan name -> flat list of that plan's feature values
      cat_index -- positions inside the flat vector occupied by 'string' features
    '''
    plan_names = feaSpace[planIdKey]
    plan_ids = intFea[planIdKey]
    plan_fea = {plan_names[x[0]]: [] for x in plan_ids}
    cat_index = []
    for k, v in intFea.items():
        if k == planIdKey:
            continue
        if encodeDef[k] == 'string':
            # widest value list of this feature = number of columns it occupies
            width = max(len(y) for y in v)
            if width:
                # columns start after the longest vector assembled so far
                offset = max(len(z) for z in plan_fea.values())
                cat_index += [offset + x for x in range(width)]
        for p, f in zip(plan_ids, v):
            plan_fea[plan_names[p[0]]] += f

    return plan_fea, cat_index
    
# One flat feature vector per plan, plus the positions of its categorical entries.
plan_feature, plan_cat = assemblePlanFeature(plan_int_features, feature_space, encode_list)

### Generate integer feature vector for drug

In [7]:
# check size of feature vector of each plan
# [len(x) for x in features]

# Index of the first drug whose 'plans' list has 22 entries.
# Original note: drug 266 is a special case where the drug is available on 2 tiers
# (presumably 22 = 2 tiers x 11 plans -- TODO confirm against the data).
j = [len(d['plans']) for d in drug_plan_data['drug']].index(22)
# drug_plan_data['drug'][j]['plans']

In [280]:
# Only the last expression of the cell is displayed: the bare `encode_list`
# line shows nothing, the output is the plan-id index of each plan record.
encode_list
plan_int_features['plan/properties/plan_id']

[[11],
 [14],
 [13],
 [8],
 [1],
 [2],
 [3],
 [6],
 [12],
 [4],
 [5],
 [10],
 [7],
 [9],
 [0]]

In [51]:
# plan_int_features[1]
# Encode every 'drug...' feature of the drug records; the call prints each processed key.
drug_int_features = getIntegerFeature(drug_plan_data['drug'], 'drug', feature_space, encode_list)


drug/properties/plans/properties/prior_authorization
drug/properties/plans/properties/plan_id
drug/properties/plans/properties/step_therapy
drug/properties/plans/properties/quantity_limit
drug/properties/plans/properties/drug_tier


In [150]:
# Scratch cell: try out the per-drug column stacking.
# Each scalar is wrapped in its own list so that every feature becomes a
# column (one row per plan entry of the drug).
# drug_int_features['drug/properties/plans/properties/drug_tier']
a=[[[y] for y in x] for x in drug_int_features['drug/properties/plans/properties/quantity_limit']]
b=[[[y] for y in x] for x in drug_int_features['drug/properties/plans/properties/drug_tier']]
c=[[[y] for y in x] for x in drug_int_features['drug/properties/plans/properties/plan_id']]
d=[[[y] for y in x] for x in drug_int_features['drug/properties/plans/properties/step_therapy']]
# NOTE: `c` is rebound here -- afterwards it holds one array per drug with the
# columns plan_id, quantity_limit, drug_tier, step_therapy (in that order).
c=[np.hstack(t) for t in zip(c,a,b,d)]

In [203]:
# Scratch cell: sort the rows of the first stacked array by its first column,
# drop that column, and append the rest to 11 initially empty rows.
# NOTE(review): `xyz` is not defined in any visible cell -- this cell relies on
# leftover kernel state and fails on a fresh kernel; TODO confirm what it was.
# 11 is presumably the number of plans in the drug data -- verify.
drug_fea = [[]] * 11
df=xyz[0][xyz[0][:,0].argsort(),1:]
np.hstack((drug_fea,df))

array([[ 0.,  0.,  0.,  6.],
       [ 0.,  0.,  0.,  6.],
       [ 0.,  0.,  0.,  6.],
       [ 0.,  0.,  0.,  6.],
       [ 0.,  0.,  0.,  6.],
       [ 0.,  0.,  0.,  6.],
       [ 0.,  0.,  0.,  6.],
       [ 0.,  0.,  0.,  6.],
       [ 0.,  0.,  0.,  6.],
       [ 0.,  0.,  0.,  6.],
       [ 0.,  0.,  0.,  6.]])

### Assemble drug feature

In [318]:
def assembleDrugFeature(intFea, feaSpace, encodeDef, planIdKey = 'drug/properties/plans/properties/plan_id'):
    ''' Assemble the integer features of all drugs into one vector per plan.

    intFea    -- dict: feature key -> list (one entry per drug) of value lists,
                 each value list holding one value per plan entry of that drug
    feaSpace  -- dict: feature key -> list of distinct values; only
                 feaSpace[planIdKey] is read, to map a plan index to its name
    encodeDef -- dict: feature key -> type name; 'string' marks a categorical feature
    planIdKey -- key of the plan-id feature (used to align rows, not as a feature)

    Returns (drug_fea, cat_index):
      drug_fea  -- dict: plan name -> 1-D float array with the features of
                   every drug laid out drug after drug
      cat_index -- list of the column positions holding 'string' features

    A drug must list every plan index at least once (IndexError otherwise);
    when its number of entries differs from the number of plans, the first
    entry of each plan is used.
    '''
    plan_ids = intFea[planIdKey]
    n_plan = max(max(x) for x in plan_ids) + 1

    # Fix the feature order once so the column layout and cat_index always agree.
    fea_keys = [k for k in intFea if k != planIdKey]
    n_fea = len(fea_keys)
    cat_offsets = [i for i, k in enumerate(fea_keys) if encodeDef[k] == 'string']

    # Start with a zero-width float block so the stacked result is always float.
    blocks = [np.empty((n_plan, 0))]
    for columns in zip(plan_ids, *[intFea[k] for k in fea_keys]):
        # one row per plan entry of this drug; column 0 is the plan index
        f = np.column_stack(columns)
        if len(f) != n_plan:
            index = [np.where(f[:, 0] == i)[0][0] for i in range(n_plan)]
        else:
            index = f[:, 0].argsort()
        blocks.append(f[index, 1:])

    # single concatenation instead of growing the array drug by drug
    drug_fea = np.hstack(blocks)

    n_drug = len(blocks) - 1
    cat_index = [d * n_fea + off for d in range(n_drug) for off in cat_offsets]

    plan_names = feaSpace[planIdKey]
    return {plan_names[i]: drug_fea[i] for i in range(len(drug_fea))}, cat_index
    


In [327]:
# Build the per-plan drug vectors here so the following cells do not depend
# on variables left over from an earlier kernel session.
drug_feature, drug_cat = assembleDrugFeature(drug_int_features, feature_space, encode_list)

# Peek at the categorical columns of one plan's vector
# (list(...) keeps the indexing valid when values() is a view rather than a list).
list(drug_feature.values())[0][drug_cat]

[ 6.  0.  4. ...,  0.  0.  6.]


### Combine plan and drug feature for common plan ID

In [341]:
# One combined row (plan features followed by drug features) per plan id
# present in drug_feature.
feature = []
for p, df in drug_feature.items():
    # Build a new list: extending plan_feature[p] in place would make the
    # vectors grow again every time this cell is re-run.
    feature.append(plan_feature[p] + list(df))

# The drug columns start right after the plan columns, so the offset must be
# the plan-vector length of the plans actually combined (plan_feature may
# also hold other plans whose vectors have a different length).
plan_lengths = set(len(plan_feature[p]) for p in drug_feature)
assert len(plan_lengths) <= 1, 'plan feature vectors differ in length: %s' % sorted(plan_lengths)
n_pf = plan_lengths.pop() if plan_lengths else 0

cat_index = plan_cat + [x + n_pf for x in drug_cat]

[35327, 35327, 35327, 35327, 35327, 35327, 35327, 35327, 35327, 35327, 35327]

### One-hot-encode

In [348]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the columns listed in cat_index.
# NOTE(review): the `categorical_features` argument was deprecated in
# scikit-learn 0.20 and removed in 0.22 -- on newer versions this call needs
# ColumnTransformer instead; confirm the scikit-learn version this runs on.
enc = OneHotEncoder(categorical_features=cat_index)
encode_feature = enc.fit_transform(feature)  

In [349]:
# Display the summary of the encoded matrix (shape, dtype, stored elements).
encode_feature

<11x35345 sparse matrix of type '<type 'numpy.float64'>'
	with 130289 stored elements in COOrdinate format>