## Example: Self-Exciting Multi-Armed Bandit

In [31]:
%load_ext autoreload
%autoreload 2
import numpy as np
import copy 
import random
import pandas as pd
from tqdm import tqdm
from scipy.stats import binom
from scipy.optimize import minimize
import matplotlib.pyplot as plt
from Q_learning import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Setting

In [32]:
max_invest = 5
# State space: every pair (v, arm) with v a non-zero integer in
# [-max_invest, max_invest] and arm in {0, 1}.
X = np.array([[level, arm]
              for level in range(-max_invest, max_invest + 1) if level != 0
              for arm in (0, 1)])
# Action space: every pair (stake, arm) with stake in 1..max_invest and arm in {0, 1}.
A = np.array([[stake, arm]
              for stake in range(1, max_invest + 1)
              for arm in (0, 1)])
# Baseline success probability of each arm (indexed by the arm component of an action).
p = [0.4,0.6]
# Size of the probability shift applied in P_0 / p_0 when the chosen arm equals
# the arm stored in the current state; its sign follows the sign of x[0].
lam_p = 0.1
def c(x,y):
    """Cost between two states: the norm of their difference.

    With NumPy's default settings this is the Euclidean distance for 1-D inputs.
    """
    difference = x - y
    return np.linalg.norm(difference)

def r(x,a,y):
    """Reward of a transition: the first component of the next state y.

    The current state x and the action a are accepted but not used.
    """
    reward = y[0]
    return reward

def P_0(x,a):
    """Simulate the next state given the current state x and the action a.

    A single Bernoulli draw with success probability
    p[a[1]] + lam_p*sign(x[0])*(x[1]==a[1]) is mapped to -1/+1 and multiplied
    by a[0]; the second component of the returned state is a[1].
    Reads the module-level `p` and `lam_p`.
    """
    stake, arm = a[0], a[1]
    # The shift only applies when the chosen arm equals the arm stored in x.
    win_prob = p[arm] + lam_p*np.sign(x[0])*(x[1]==arm)
    outcome = 2*binom.rvs(1, win_prob) - 1
    return np.concatenate([[stake*outcome], [arm]])
    
def p_0(k,x,a):
    """Probability mass of the next state k given the current state x and action a.

    k[0]/a[0] is mapped from -1/+1 to 0/1 and evaluated under a Bernoulli pmf
    with success probability p[a[1]] + lam_p*sign(x[0])*(x[1]==a[1]); the
    result is multiplied by the indicator k[1]==a[1].
    Reads the module-level `p` and `lam_p`.
    """
    stake, arm = a[0], a[1]
    win_prob = p[arm] + lam_p*np.sign(x[0])*(x[1]==arm)
    bernoulli_outcome = 0.5*(k[0]/(stake)+1)
    return binom.pmf(bernoulli_outcome, 1, win_prob)*(k[1]==arm)
   
epsilon = 0.5 # Radius of the Wasserstein ball
alpha = 0.45 # Discount factor
# Seed every random source so that "Restart Kernel -> Run All" reproduces the
# same run: `rng` draws the initial state, NumPy's global state is what
# scipy's binom.rvs uses when no random_state is passed, and the stdlib
# `random` module is seeded in case the imported Q_learning helpers draw from
# it (not visible in this notebook).
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
rng = np.random.default_rng(SEED)
x_0 = rng.choice(X) # Initial state, drawn uniformly from the rows of X

### Training

In [36]:
# Train both Q-tables with the same exploration rate, iteration count,
# step-size schedule and all-ones initial table. Each call receives its own
# freshly built Q_0 array.
# NOTE: in the recorded run the robust learner took roughly 14.5 hours, the
# non-robust one a few seconds.
Q_opt_robust = robust_q_learning(
    X,
    A,
    r,
    c,
    P_0,  # Simulation of the next state in dependence of x and a
    p_0,  # The probability mass function
    epsilon,
    alpha,
    x_0,
    eps_greedy=0.1,
    Nr_iter=50000,
    q=1,
    gamma_t_tilde=lambda t: 1/(t+1),
    Q_0=np.ones([len(X), len(A)]),
)
Q_opt_nonrobust = q_learning(
    X,
    A,
    r,
    P_0,  # Simulation of the next state in dependence of x and a
    alpha,
    x_0,
    eps_greedy=0.1,
    Nr_iter=50000,
    gamma_t_tilde=lambda t: 1/(t+1),
    Q_0=np.ones([len(X), len(A)]),
)

100%|██████████| 50000/50000 [14:28:14<00:00,  1.04s/it]   
100%|██████████| 50000/50000 [00:06<00:00, 7287.11it/s]


### Determine the optimal strategies

In [37]:
# Row-wise views of the action and state spaces: a space that is already
# multi-dimensional is used as is, otherwise every element is wrapped in its
# own one-element row so that the index helpers can compare whole rows.
A_list = A if np.ndim(A) > 1 else np.array([[a] for a in A])
X_list = X if np.ndim(X) > 1 else np.array([[x] for x in X])
def a_index(a):
    """Return the position of the first row of A_list that equals the action a."""
    row_matches = (a == A_list).all(axis=1)
    return np.flatnonzero(row_matches)[0]
def x_index(x):
    """Return the position of the first row of X_list that equals the state x."""
    row_matches = (x == X_list).all(axis=1)
    return np.flatnonzero(row_matches)[0]
def a_opt_robust(x):
    """Greedy action for state x under the robust Q-table Q_opt_robust."""
    q_values = Q_opt_robust[x_index(x), :]
    best_column = np.argmax(q_values)
    return A[best_column]
def a_opt_nonrobust(x):
    """Greedy action for state x under the non-robust Q-table Q_opt_nonrobust."""
    q_values = Q_opt_nonrobust[x_index(x), :]
    best_column = np.argmax(q_values)
    return A[best_column]

### Take a look at the strategies

In [38]:
# Greedy action of each learned Q-table in every state, rendered as text.
rob_strat = [str(a_opt_robust(x)) for x in X]
nonrob_strat = [str(a_opt_nonrobust(x)) for x in X]
# One row per strategy, one column per state.
Strats = pd.DataFrame(np.array([rob_strat,nonrob_strat]))
Strats.columns = [str(x) for x in X]
# Row labels. A LaTeX variant of these labels used to be assigned first and
# was overwritten immediately; only the effective plain-text labels are kept.
Strats["x"]=["Robust","Non-Robust"]
display(Strats)

Unnamed: 0,[-5 0],[-5 1],[-4 0],[-4 1],[-3 0],[-3 1],[-2 0],[-2 1],[-1 0],[-1 1],...,[1 1],[2 0],[2 1],[3 0],[3 1],[4 0],[4 1],[5 0],[5 1],x
0,[4 1],[2 1],[4 1],[1 1],[2 1],[1 1],[4 1],[1 1],[3 1],[2 1],...,[5 1],[1 0],[5 1],[3 1],[3 1],[1 1],[5 1],[2 1],[5 1],Robust
1,[5 1],[2 1],[1 1],[1 1],[5 1],[1 1],[3 1],[2 1],[5 1],[3 1],...,[4 1],[4 1],[5 1],[2 0],[5 1],[4 0],[3 1],[5 1],[5 1],Non-Robust


In [42]:
# Full, untruncated listing of both strategies (robust first), one per line.
print(rob_strat, nonrob_strat, sep="\n")

['[4 1]', '[2 1]', '[4 1]', '[1 1]', '[2 1]', '[1 1]', '[4 1]', '[1 1]', '[3 1]', '[2 1]', '[2 1]', '[5 1]', '[1 0]', '[5 1]', '[3 1]', '[3 1]', '[1 1]', '[5 1]', '[2 1]', '[5 1]']
['[5 1]', '[2 1]', '[1 1]', '[1 1]', '[5 1]', '[1 1]', '[3 1]', '[2 1]', '[5 1]', '[3 1]', '[1 1]', '[4 1]', '[4 1]', '[5 1]', '[2 0]', '[5 1]', '[4 0]', '[3 1]', '[5 1]', '[5 1]']


In [43]:
# Mean of the first action component chosen by the robust strategy over all states.
np.mean(np.array([a_opt_robust(x) for x in X])[:, 0])

2.8

In [44]:
# Mean of the first action component chosen by the non-robust strategy over all states.
np.mean(np.array([a_opt_nonrobust(x) for x in X])[:, 0])

3.3

### Evaluation

In [39]:
def cumulated_rewards(N_iter,a,X_0):
    """Simulate a strategy for N_iter steps and return the summed reward.

    Parameters
    ----------
    N_iter : int
        Number of transitions to simulate.
    a : callable
        Strategy mapping a state to an action.
    X_0 : state
        State the simulation starts from.

    Returns
    -------
    The sum (np.sum) of r(state, action, next_state) over all simulated
    transitions, with next states drawn from P_0.
    """
    state = X_0
    rewards = []
    for _ in range(N_iter):
        # Evaluate the strategy once per step: the same action is then used
        # for both the transition and the reward (this matters if `a` is
        # randomised) and the lookup is not repeated.
        action = a(state)
        next_state = P_0(state, action)
        rewards.append(r(state, action, next_state))
        state = next_state
    return np.sum(rewards)

In [40]:
N = 100000
robust_rewards = []
non_robust_rewards = []
# Different scenarios: each entry replaces the per-arm success probabilities `p`
Probs = [[0.4,0.6],[0.45,0.5],[0.45,0.55],[0.6,0.4],[0.5,0.5],[0.55,0.45]]
rng = np.random.default_rng()
x_0 = rng.choice(X) # Initial state shared by all scenarios and both strategies
# P_0 reads the module-level `p`, so a scenario is activated by rebinding that
# global in the loop below. Remember the current value and restore it
# afterwards; otherwise `p` stays at the last scenario and any cell that is
# re-run later (e.g. training) silently uses the wrong model.
p_before_evaluation = p
try:
    for p in Probs:
        robust_rewards.append(cumulated_rewards(N,a_opt_robust,x_0))
        non_robust_rewards.append(cumulated_rewards(N,a_opt_nonrobust,x_0))
finally:
    p = p_before_evaluation

In [41]:
# Cumulated rewards: one row per strategy, one column per scenario in Probs.
Results = pd.DataFrame(
    np.array([robust_rewards, non_robust_rewards]),
    columns=[str(p) for p in Probs],
)
Results["p"] = ["Robust", "Non-Robust"]
Results

Unnamed: 0,"[0.4, 0.6]","[0.45, 0.5]","[0.45, 0.55]","[0.6, 0.4]","[0.5, 0.5]","[0.55, 0.45]",p
0,125290,31045,77086,-38322,33982,-5880,Robust
1,122514,30898,74690,-47601,30127,-11515,Non-Robust
