# LinUCB with real rewards and low risk analysis

In [44]:
import pandas as pd
import pickle
import numpy as np
import math
# Patient feature table; each row is later converted to a float feature vector.
warf = pd.read_pickle('../data/cleaned_warfarin.pkl')

# NOTE(review): pickle.load executes arbitrary code from the file; only load
# artifacts produced by this project.
# Per-row dosage values used when computing the bandit reward.
with open('../data/true_dosage.pkl', 'rb') as dosage_file:
    true_dosage = pickle.load(dosage_file)

# Per-row arm labels that the chosen arm is compared against.
with open('../data/true_labels.pkl', 'rb') as labels_file:
    true_values = pickle.load(labels_file)

In [45]:
# Number of rows in the feature table.
warf.shape[0]

5528

In [46]:
# Number of feature columns; this is the dimension of each arm's linear model.
len(warf.columns)

94

In [47]:
# 80/20 split sizes: the training share is rounded up, the remainder is held out.
train_len = math.ceil(0.8 * len(true_values))
test_len = len(true_values) - train_len

In [48]:
# LinUCB with one independent linear model per arm, evaluated over repeated
# random train/test splits. Each trial learns online on the first `train_len`
# shuffled rows, then is scored (without further updates) on the remaining rows.
arms = ["low", "medium", "high"]
n_trials = 10

# (center, scale) used to turn a wrong choice into a negative reward:
# r = -|true_dosage - center| / scale for the arm that was chosen.
# NOTE(review): presumably the mean and standard deviation of the dosage within
# each bucket -- TODO confirm where these constants come from.
dose_reference = {
    "low": (14.4478, 4.031703103381195),
    "medium": (31.9729, 7.785291417550073),
    "high": (63.7695, 18.35817976658349),
}
# Reward assigned when the choice is at the opposite extreme of the true label.
severe_error_reward = -100


def _select_arm(feature, arm_names, A_inv_map, theta_map):
    """Return the arm with the highest upper-confidence score for `feature`.

    The score of an arm is ``theta . x + sqrt(x^T A^-1 x)``. Ties, and scores
    that never compare greater than -inf (e.g. NaN), fall back to the first
    arm in `arm_names`.
    """
    best_arm = arm_names[0]
    best_score = -np.inf
    for arm in arm_names:
        score = theta_map[arm] @ feature + np.sqrt(feature @ A_inv_map[arm] @ feature)
        if score > best_score:
            best_score = score
            best_arm = arm
    return best_arm


def _is_severe_error(chosen, truth):
    """True when 'high' was chosen for a 'low' row or 'low' for a 'high' row."""
    return (chosen == 'high' and truth == 'low') or (chosen == 'low' and truth == 'high')


accuracy = []    # held-out accuracy, one entry per trial
risk_rates = []  # held-out severe-error rate, one entry per trial
for trial in range(n_trials):
    # Per-trial learning curves; after the loop these hold the LAST trial only.
    accuracy_list = []
    regret_list = []
    regret = 0
    risk = 0

    n_features = len(warf.columns)
    A_map = {}
    b_map = {}
    A_inv_map = {}
    theta_map = {}
    for arm in arms:
        A_map[arm] = np.identity(n_features)
        b_map[arm] = np.zeros(n_features)
        # Cache the inverse and the coefficient vector so they are recomputed
        # only for the arm whose statistics actually change.
        A_inv_map[arm] = np.linalg.inv(A_map[arm])
        theta_map[arm] = A_inv_map[arm] @ b_map[arm]

    indices = np.random.permutation(len(warf))

    # ---- online training ----
    # `step` is the 1-based number of rows seen so far. The running accuracy must
    # be computed from it, not from the shuffled row index `i`.
    for step, i in enumerate(indices[:train_len], start=1):
        feature = np.array(warf.iloc[i], dtype=float)
        chosen = _select_arm(feature, arms, A_inv_map, theta_map)
        truth = true_values[i]

        # Observe the reward: zero when correct, otherwise a scaled dosage error.
        if truth == chosen:
            r = 0
        else:
            center, scale = dose_reference[chosen]
            r = -np.abs((true_dosage[i] - center) / scale)
        # Opposite-extreme mistakes override the scaled error with a large penalty.
        if _is_severe_error(chosen, truth):
            r = severe_error_reward

        regret += 0 if truth == chosen else 1

        # Update only the chosen arm, then refresh its cached inverse and theta.
        A_map[chosen] += np.outer(feature, feature)
        b_map[chosen] += r * feature
        A_inv_map[chosen] = np.linalg.inv(A_map[chosen])
        theta_map[chosen] = A_inv_map[chosen] @ b_map[chosen]

        accuracy_list.append((step - regret) / step)
        regret_list.append(regret)

    # ---- held-out evaluation (models are frozen) ----
    correct_count = 0
    count = 0
    for i in indices[train_len:]:
        count += 1
        feature = np.array(warf.iloc[i], dtype=float)
        chosen = _select_arm(feature, arms, A_inv_map, theta_map)
        truth = true_values[i]
        if chosen == truth:
            correct_count += 1
        if _is_severe_error(chosen, truth):
            risk += 1

    # Report both rates per trial: `risk` is reset every trial, so printing it
    # once after the loop would only describe the final trial.
    print(risk / count)
    print(correct_count / count)
    risk_rates.append(risk / count)
    accuracy.append(correct_count / count)

0.0018099547511312218
0.6407239819004525
0.0027149321266968325
0.5882352941176471
0.0018099547511312218
0.6190045248868778
0.0009049773755656109
0.6262443438914027
0.0
0.6561085972850679
0.0036199095022624436
0.611764705882353
0.0
0.6289592760180995
0.0009049773755656109
0.5963800904977375
0.0009049773755656109
0.6280542986425339
0.0018099547511312218
0.6262443438914027


In [49]:
# Held-out accuracy averaged over all trials.
np.asarray(accuracy).mean()

0.6221719457013574

In [50]:
# 0.6622624434389142
# NOTE(review): the value above is unexplained; presumably a figure noted from
# an earlier run -- TODO confirm what it measures or remove it.
# Cumulative regret (number of wrong arm choices) at the last training step.
# regret_list is re-created for every trial, so this is the final trial's value.
regret_list[train_len-1]

1665

In [51]:
# Persist the training curves. Both lists are re-created at the start of every
# trial, so what is saved here comes from the last trial only.
with open('../results/linucb_real_regret.pkl', 'wb') as regret_file:
    pickle.dump(regret_list, regret_file)

with open('../results/linucb_real_accuracy.pkl', 'wb') as accuracy_file:
    pickle.dump(accuracy_list, accuracy_file)