In [1]:
from sklearn.preprocessing import OneHotEncoder
from sets import Set
import numpy as np
import json

### Generate encode definition from mappings.json
- add an _encode_ field to each mapping variable

In [171]:
# Load the raw field mappings. The context manager closes the file handle as
# soon as parsing is done instead of leaving it to the garbage collector.
with open('mappings.json') as mappings_file:
    mappings = json.load(mappings_file)

def addEncode(fields):
    ''' add 'encode' field on the mapping definition

    Walks the nested mapping dictionary and modifies it in place.

    A dictionary is treated as a leaf field definition when it has a 'type'
    key whose value is not itself a dictionary (e.g. {'type': 'string'});
    such a leaf gets 'encode' set to 0.  Every other dictionary is treated
    as a container and its values are visited recursively.  When 'type'
    maps to a dictionary it is taken to be a field that happens to be named
    "type", so the node is a container.

    Values that are not dictionaries (strings, numbers, lists, ...) are
    skipped, so metadata stored next to the field definitions no longer
    aborts the walk.

    fields -- node of the mapping tree (normally a dict)
    Returns None.
    '''
    if not isinstance(fields, dict):
        # scalar or list metadata: nothing to tag and nothing to descend into
        return

    # Decide on the type of the value; a substring test on the type name
    # would misclassify any type string that contains the text 'type'.
    if 'type' in fields and not isinstance(fields['type'], dict):
        fields['encode'] = 0
        return

    for child in fields.values():
        addEncode(child)
        
# Tag every leaf field of the loaded mappings (addEncode edits the dict in place).
addEncode(mappings)

# Only the subtree stored under the top-level 'mappings' key is written out;
# encode2.json is the file that is read back later as the encode definition.
with open('encode2.json', 'w') as encode_file:
    encode_file.write(json.dumps(mappings['mappings']))

### Load encode definition
- set _encode_ to 1 in the encode definition file to select a field

In [2]:

def getEncodeFields(encode_def, rtn, path=''):
    ''' extract the selected fields from the encode json definition

    encode_def -- node of the encode definition tree
    rtn        -- dict that collects the result; it is filled in place
    path       -- '/'-joined keys leading to encode_def, with a leading '/'
                  (leave at the default for the root call)

    A dictionary carrying an 'encode' key is a leaf: when the value equals 1
    its 'type' is stored in rtn under the path (leading '/' removed); any
    other value leaves the field unselected.  Leaves are never descended
    into.  Dictionaries without 'encode' are containers whose entries are
    visited recursively.  Nodes that are not dictionaries are ignored, so
    string or list metadata in the definition does not raise.

    Returns rtn.
    Raises KeyError when a selected leaf has no 'type' entry.
    '''
    if not isinstance(encode_def, dict):
        # metadata value sitting beside the field definitions
        return rtn

    if 'encode' in encode_def:
        if encode_def['encode'] == 1:
            rtn[path[1:]] = encode_def['type']
        return rtn

    for name, child in encode_def.items():
        getEncodeFields(child, rtn, path + '/' + name)
    return rtn


### Retrieve values of a selected variable
- pass a set to get the unique values for encoding
- pass a list to get all values for feature extraction

In [3]:
# get the unique values of each string-encoded field (preparation for one-hot encoding)

def getFieldValue(data, key, val, add_call):
    '''
    Follow the path given by key through the json tree and pass every value
    found at the end of the path to add_call(val, value).

    data     -- current node of the json tree
    key      -- sequence of path components still to be resolved
    val      -- collector (set, list, ...) handed to add_call
    add_call -- callable(collector, value) that stores one value

    When a list node is met before the last component, the current component
    is dropped without being looked up and every item of the list is visited
    with the rest of the path.  Any other node is indexed with the current
    component.  Returns val.
    '''
    # last component: read the value straight from the current node
    if len(key) == 1:
        add_call(val, data[key[0]])
        return val

    remaining = key[1:]

    if type(data) is not list:
        # ordinary node: descend into the child named by this component
        getFieldValue(data[key[0]], remaining, val, add_call)
        return val

    # list node: fan out over the items
    for item in data:
        getFieldValue(item, remaining, val, add_call)
    return val
        
def add_set(vSet, val):
    ''' add an element to a set

    Has the (collector, value) argument order that getFieldValue uses when it
    invokes its add_call argument; used to gather the distinct values of a
    field.

    vSet -- collector object providing an add() method
    val  -- value to add
    Returns None.
    '''
    vSet.add(val)
    
def add_list(vList, val):
    ''' append an element to a list

    Has the (collector, value) argument order that getFieldValue uses when it
    invokes its add_call argument; keeps every value, duplicates included, in
    the order they are visited.

    vList -- collector object providing an append() method
    val   -- value to append
    Returns None.
    '''
    vList.append(val)
        

### Extract encode variable list and generate feature space

In [4]:
# load file
!curl -o plan.json http://get-moda.com/json/plans-AK.json
!curl -o drug.json http://get-moda.com/json/formulary-AK.json

plan = json.load(open('plan.json'))   
drug = json.load(open('drug.json'))
drug_plan_data={'plan':plan,'drug':drug}

In [5]:
# read file
with open('encode2.json') as encode_file:
    encode = json.load(encode_file)

# field path -> declared type, for every field whose 'encode' flag is 1
encode_list = getEncodeFields(encode, {})

# encode_list
# Distinct values of every selected field; the position of a value in its list
# is used later as the integer code of that value.  The builtin set replaces
# the deprecated sets.Set collector (add_set only needs an add() method).
# To restrict this to string fields, add: if encode_list[k] == 'string'
feature_space = {k: list(getFieldValue(drug_plan_data, k.split('/'), set(), add_set))
                 for k in encode_list}

# feature_space

### Generate integer feature vector for plan data

In [6]:
def getIntegerFeature(dataJsonList, dataType, featureSpace, encodeDefinition):
    ''' extract the features from json list
    and convert to integer (for string) based on feature space index

    dataJsonList     -- list of json records (one per plan, drug, ...)
    dataType         -- leading path component that selects the keys of
                        featureSpace to process, e.g. 'plan' or 'drug'
    featureSpace     -- dict: field path -> list of known values
    encodeDefinition -- dict: field path -> type name; 'string' marks a
                        field whose values are replaced by their position
                        in the featureSpace list

    Returns a dict: field path -> list with one entry per record, each entry
    being the list of values found for that field in the record (positions
    for 'string' fields, raw values otherwise).

    Raises ValueError when a 'string' value is missing from its featureSpace
    list and KeyError when a processed path is missing from encodeDefinition.
    Each processed field path is printed.
    '''
    features = {}
    # Compare whole path components so that 'plan' does not also select a
    # key such as 'plans/...'.
    prefix = dataType if dataType.endswith('/') else dataType + '/'

    for k, v in featureSpace.items():
        if k != dataType and not k.startswith(prefix):
            continue
        # parenthesised form prints the same text under Python 2 and Python 3
        print(k)
        # we want feature for each record (plan, drug etc.), so exclude the first 2 key component
        path = k.split('/')[2:]
        # use integer representation for categorical variable, and keep raw value for bool and float
        is_categorical = encodeDefinition[k] == 'string'
        per_record = []
        for r in dataJsonList:
            values = getFieldValue(r, path, [], add_list)
            if is_categorical:
                values = [v.index(x) for x in values]
            per_record.append(values)
        features[k] = per_record

    return features

# Per-record features for every selected field path that starts with 'plan'.
plan_int_features = getIntegerFeature(drug_plan_data['plan'], 'plan', feature_space, encode_list)

plan/properties/formulary/properties/cost_sharing/properties/pharmacy_type
plan/properties/formulary/properties/drug_tier
plan/properties/network/properties/network_tier
plan/properties/formulary/properties/cost_sharing/properties/coinsurance_rate
plan/properties/plan_id
plan/properties/plan_id_type
plan/properties/formulary/properties/mail_order
plan/properties/formulary/properties/cost_sharing/properties/coinsurance_opt
plan/properties/formulary/properties/cost_sharing/properties/copay_opt
plan/properties/formulary/properties/cost_sharing/properties/copay_amount


### Assemble plan feature

In [7]:
def assemblePlanFeature(intFea, feaSpace, encodeDef, planIdKey = 'plan/properties/plan_id'):
    ''' combine feature fields for each plan,
        and generate index for categorical features

    intFea    -- dict: field path -> list with one entry per plan record,
                 each entry being a list of feature values
    feaSpace  -- dict: field path -> list of known values; the list stored
                 under planIdKey translates plan id codes to plan names
    encodeDef -- dict: field path -> type name; 'string' marks categorical
                 fields
    planIdKey -- field path holding the plan id

    Returns (plan_fea, cat_index):
      plan_fea  -- dict: plan name -> flat list with the values of all
                   fields other than planIdKey, appended in the iteration
                   order of intFea
      cat_index -- positions inside those flat lists that hold values of
                   'string' fields

    The field types are taken from the encodeDef argument, so the function
    does not depend on any notebook-level variable.  Empty inputs yield
    ({}, []) instead of failing inside max().
    '''
    plan_names = feaSpace[planIdKey]
    plan_ids = intFea[planIdKey]
    plan_fea = {plan_names[x[0]]: [] for x in plan_ids}
    cat_index = []
    for k, v in intFea.items():
        if k == planIdKey:
            continue
        if encodeDef[k] == 'string':
            # The new columns start after the longest vector built so far and
            # span the widest entry of this field; the [0] fallbacks keep
            # max() defined when there are no plans or no records.
            offset = max([len(z) for z in plan_fea.values()] or [0])
            width = max([len(y) for y in v] or [0])
            cat_index += [offset + x for x in range(width)]
        for p, f in zip(plan_ids, v):
            plan_fea[plan_names[p[0]]].extend(f)

    return plan_fea, cat_index
    
# One flat feature list per plan name, plus the positions of the categorical columns.
plan_feature, plan_cat = assemblePlanFeature(plan_int_features, feature_space, encode_list)

### Generate integer feature vector for drug data

In [8]:
# NOTE: 266 is a special case where the drug is available on 2 tiers
# (presumably 266 is the position of that record in drug_plan_data['drug'] -- verify).
# The two commented-out lines below look up the first drug record that has
# 22 'plans' entries and display those entries:
# j = [len(d['plans']) for d in drug_plan_data['drug']].index(22)
# drug_plan_data['drug'][j]['plans']

# Per-record features for every selected field path that starts with 'drug'.
drug_int_features = getIntegerFeature(drug_plan_data['drug'], 'drug', feature_space, encode_list)


drug/properties/plans/properties/prior_authorization
drug/properties/plans/properties/plan_id
drug/properties/plans/properties/step_therapy
drug/properties/plans/properties/quantity_limit
drug/properties/plans/properties/drug_tier


### Assemble drug feature

In [9]:
def assembleDrugFeature(intFea, feaSpace, encodeDef, planIdKey = 'drug/properties/plans/properties/plan_id'):
    ''' assemble integer feature together for each plan,
        for each plan, the vector shows features for all drugs

    intFea    -- dict: field path -> list with one entry per drug, each entry
                 listing the values of that field for the drug's plan entries
                 (all fields of a drug must list the same number of entries)
    feaSpace  -- dict: field path -> list of known values; the list stored
                 under planIdKey translates plan id codes to plan names
    encodeDef -- dict: field path -> type name; 'string' marks categorical
                 fields
    planIdKey -- field path holding the plan id code of each plan entry

    Returns (drug_fea, cat_index):
      drug_fea  -- dict: plan name -> 1-D float array holding, drug after
                   drug, the values of every field other than planIdKey (in
                   the iteration order of intFea)
      cat_index -- list of the column positions in those arrays that hold
                   values of 'string' fields

    The categorical positions are derived from encodeDef and the actual
    number of fields, so they stay correct for any number or ordering of
    fields.  With no drugs at all the result is ({}, []).

    When a drug does not have exactly one entry per plan, the first entry
    found for each plan code is used; a plan code without any entry raises
    IndexError.
    '''
    p_id = intFea[planIdKey]
    if not p_id:
        return {}, []

    n_plan = max(max(x) for x in p_id) + 1

    # Fix the column order once so that the values and the categorical
    # offsets are guaranteed to refer to the same layout.
    fea_keys = [k for k in intFea if k != planIdKey]
    n_fea = len(fea_keys)
    cat_offsets = [i for i, k in enumerate(fea_keys) if encodeDef[k] == 'string']

    # The empty float block fixes the row count and keeps the result a float
    # array even when every field value is an integer or a boolean.
    blocks = [np.empty((n_plan, 0))]
    for d, plan_codes in enumerate(p_id):
        # rows = plan entries of this drug, column 0 = plan code, rest = fields
        f = np.column_stack([plan_codes] + [intFea[k][d] for k in fea_keys])
        plan_col = f[:, 0]
        if len(f) != n_plan:
            # duplicated plan entries: keep the first row of every plan code
            index = [np.where(plan_col == i)[0][0] for i in range(n_plan)]
        else:
            index = plan_col.argsort()
        blocks.append(f[index, 1:])

    # single concatenation instead of re-copying the growing matrix per drug
    drug_fea = np.hstack(blocks)
    cat_index = [d * n_fea + off for d in range(len(p_id)) for off in cat_offsets]

    plan_names = feaSpace[planIdKey]
    return {plan_names[i]: drug_fea[i] for i in range(n_plan)}, cat_index
    
# One feature vector per plan name covering all drugs, plus the positions of the categorical columns.
drug_feature, drug_cat = assembleDrugFeature(drug_int_features, feature_space, encode_list)

### Combine plan and drug feature for common plan ID

In [10]:
# Build one row per plan id that occurs in BOTH plan_feature and drug_feature:
# the plan's own features followed by its drug features.
feature = []
n_pf = 0   # length of the plan part of a row; stays 0 when nothing is combined

for plan_id, drug_vec in drug_feature.items():
    if plan_id not in plan_feature:
        # plan is known to the drug data only: nothing to combine it with
        continue
    plan_vec = plan_feature[plan_id]
    n_pf = len(plan_vec)
    # Concatenate into a new list so plan_feature is left untouched and the
    # cell gives the same result every time it is run.
    feature.append(list(plan_vec) + list(drug_vec))

# Drug columns come after the plan columns, so shift their positions by the
# length of the plan part.
cat_index = plan_cat + [x + n_pf for x in drug_cat]

### One-hot encode

In [11]:
# One-hot encode the columns whose positions are listed in cat_index.
# NOTE(review): the `categorical_features` argument appears to have been
# deprecated in scikit-learn 0.20 and removed in 0.22 (ColumnTransformer is
# the documented replacement) -- confirm against the installed version.
enc = OneHotEncoder(categorical_features=cat_index)
encode_feature = enc.fit_transform(feature)  

In [12]:
# Bare expression: displays the encoded feature matrix as the cell output.
encode_feature

<11x35345 sparse matrix of type '<type 'numpy.float64'>'
	with 130289 stored elements in COOrdinate format>