# Robust Q Learning, Example: Unfair Coin Toss

Load everything

In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import copy 
import random
import pandas as pd
from tqdm import tqdm
from scipy.stats import binom
from scipy.optimize import minimize
import matplotlib.pyplot as plt
from Q_learning import *

Fix the Setting

In [2]:
# Number of coins tossed in every round.
nr_coins = 10

# State space: the possible outcomes 0, 1, ..., nr_coins (stored as floats).
X = np.linspace(0, nr_coins, num=nr_coins + 1)

# Action space: the three admissible actions -1, 0 and 1.
A = np.array([-1, 0, 1])

def c(x,y):
    """Return the absolute difference ``|x - y|`` between two states.

    Works elementwise when ``x`` and/or ``y`` are numpy arrays.
    """
    difference = x - y
    return np.abs(difference)

def r(x,a,y):
    """Reward of playing action ``a`` when the state moves from ``x`` to ``y``.

    The reward is ``+a`` if ``y > x``, ``-a`` if ``y < x`` and ``-|a|`` if the
    state does not change (``x == y``).  All operations are elementwise, so
    numpy arrays are accepted as well as scalars.
    """
    moved_up = y > x
    moved_down = y < x
    stayed = x == y
    return a * moved_up - a * moved_down - np.abs(a) * stayed

def P_0(x,a):
    """Draw one sample from Binomial(nr_coins, 0.5).

    ``x`` and ``a`` are accepted for interface compatibility but do not
    influence the draw.
    """
    fair_probability = 0.5
    return binom.rvs(n=nr_coins, p=fair_probability)

def p_0(k,x,a):
    """Probability mass of Binomial(nr_coins, 0.5) at ``k``.

    ``x`` and ``a`` are accepted for interface compatibility but do not
    influence the result.
    """
    fair_probability = 0.5
    return binom.pmf(k, n=nr_coins, p=fair_probability)

alpha = 0.45  # Discount factor
x_0 = 5  # Initial state

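Compute the first-order Wasserstein distance (with cost $c(x,y)=|x-y|$) between two binomial distributions that share the same number of trials $n$. It has the closed form

$$
W\big(\operatorname{Bin}(n,p_1),\operatorname{Bin}(n,p_2)\big)= n\cdot|p_1-p_2|
$$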

In [3]:
def Wasserstein_distance(p1,p2,n=None):
    """First-order Wasserstein distance between Bin(n, p1) and Bin(n, p2).

    The distance is evaluated as the sum over k = 0, ..., n of the absolute
    difference of the two cumulative distribution functions.

    Parameters
    ----------
    p1, p2 : float
        Success probabilities of the two binomial distributions.
    n : int, optional
        Number of trials shared by both distributions.  Defaults to the
        notebook-level ``nr_coins`` when omitted.

    Returns
    -------
    float
        The summed absolute CDF difference.
    """
    if n is None:
        n = nr_coins
    support = np.arange(n + 1)
    # One vectorised CDF evaluation per distribution instead of a Python loop.
    cdf_gap = binom.cdf(support, n, p1) - binom.cdf(support, n, p2)
    return np.sum(np.abs(cdf_gap))

## Training

In [4]:
Nr_iter = 50000


def _shared_learner_options():
    """Build the keyword arguments that every training run below has in common.

    A new dict (with a new step-size lambda and a new all-ones initial
    Q-table) is created on every call, so no run shares objects with another.
    """
    return dict(
        eps_greedy=0.1,
        Nr_iter=Nr_iter,
        gamma_t_tilde=lambda t: 1/(t+1),
        Q_0=np.ones([len(X), len(A)]),
    )


def _fit_robust(epsilon):
    """Run ``robust_q_learning`` on the notebook's setting for one ``epsilon``."""
    return robust_q_learning(
        X,
        A,
        r,
        c,
        P_0,  # Simulation of next state in dependence of x and a
        p_0,  # The probability mass function
        epsilon,
        alpha,
        x_0,
        q=1,
        **_shared_learner_options(),
    )


# The runs are executed in the same order as before: eps = 2, 1, 0.5 and
# finally the non-robust learner.
Q_opt_robust_eps2 = _fit_robust(2)

Q_opt_robust_eps1 = _fit_robust(1)

Q_opt_robust_eps05 = _fit_robust(0.5)

Q_opt_nonrobust = q_learning(
    X,
    A,
    r,
    P_0,  # Simulation of next state in dependence of x and a
    alpha,
    x_0,
    **_shared_learner_options(),
)

100%|██████████| 50000/50000 [2:46:05<00:00,  5.02it/s]  
100%|██████████| 50000/50000 [3:23:58<00:00,  4.09it/s]  
100%|██████████| 50000/50000 [4:43:03<00:00,  2.94it/s]  
100%|██████████| 50000/50000 [00:08<00:00, 5882.34it/s]


Determine the optimal strategies

In [5]:
# Store actions and states as 2-D arrays with one row per element, so that
# a_index / x_index can locate an element by comparing whole rows.
A_list = A if np.ndim(A) > 1 else np.array([[action] for action in A])
X_list = X if np.ndim(X) > 1 else np.array([[state] for state in X])
def a_index(a):
    """Return the position of the first row of ``A_list`` that equals ``a``."""
    row_matches = (a == A_list).all(axis=1)
    return np.flatnonzero(row_matches)[0]
def x_index(x):
    """Return the position of the first row of ``X_list`` that equals ``x``."""
    row_matches = (x == X_list).all(axis=1)
    return np.flatnonzero(row_matches)[0]
def _greedy_action(Q, x):
    """Return the action in ``A`` with the largest entry in row ``x`` of ``Q``."""
    q_row = Q[x_index(x), :]
    return A[np.argmax(q_row)]

def a_opt_robust_eps05(x):
    """Greedy action in state ``x`` under ``Q_opt_robust_eps05``."""
    return _greedy_action(Q_opt_robust_eps05, x)

def a_opt_robust_eps1(x):
    """Greedy action in state ``x`` under ``Q_opt_robust_eps1``."""
    return _greedy_action(Q_opt_robust_eps1, x)

def a_opt_robust_eps2(x):
    """Greedy action in state ``x`` under ``Q_opt_robust_eps2``."""
    return _greedy_action(Q_opt_robust_eps2, x)

def a_opt_nonrobust(x):
    """Greedy action in state ``x`` under ``Q_opt_nonrobust``."""
    return _greedy_action(Q_opt_nonrobust, x)

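Print out the strategies: each row is one learned policy, each numbered column is a state $x \in \{0,\dots,10\}$, and each entry is the action chosen in that state.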

In [6]:
# One row per learned policy, one column per state in X; entries are the
# greedy actions.
strategy_labels = ["Robust, eps = 0.5","Robust, eps = 1","Robust, eps = 2","Non-Robust"]
policies = [a_opt_robust_eps05, a_opt_robust_eps1, a_opt_robust_eps2, a_opt_nonrobust]

action_table = np.array([[policy(x) for x in X] for policy in policies])
df = pd.DataFrame(action_table)
# Put the label column in front of the per-state columns.
df.insert(0, "State", strategy_labels)
df

Unnamed: 0,State,0,1,2,3,4,5,6,7,8,9,10
0,"Robust, eps = 0.5",1,1,1,0,0,0,0,0,-1,-1,-1
1,"Robust, eps = 1",1,1,0,0,0,0,0,0,0,-1,-1
2,"Robust, eps = 2",0,0,0,0,0,0,0,0,0,0,0
3,Non-Robust,1,1,1,1,1,0,-1,-1,-1,-1,-1


## Evaluation

Simulate games and compare the performance

In [7]:
N = 100000
Probs = np.arange(0.1,0.95,0.05)

robust_rewards_eps05 = []
robust_rewards_eps1 = []
robust_rewards_eps2 = []
non_robust_rewards = []

# Every policy depends only on the current state, so tabulate it once over all
# possible outcomes 0, ..., nr_coins instead of calling it for each of the
# N-1 simulated transitions (assumes scalar actions, as with the 1-D ``A``
# used in this notebook).
possible_states = range(nr_coins + 1)
evaluated = [
    (a_opt_robust_eps05, robust_rewards_eps05),
    (a_opt_robust_eps1, robust_rewards_eps1),
    (a_opt_robust_eps2, robust_rewards_eps2),
    (a_opt_nonrobust, non_robust_rewards),
]
action_tables = [
    (np.array([policy(state) for state in possible_states]), totals)
    for policy, totals in evaluated
]

for p in Probs:
    # Simulated path of N independent Binomial(nr_coins, p) outcomes.
    X_t = np.random.binomial(nr_coins,p,N)
    current_states, next_states = X_t[:-1], X_t[1:]
    for action_table, totals in action_tables:
        # Look up the action for every visited state, then sum the rewards of
        # all N-1 transitions in one vectorised call to r.
        chosen_actions = action_table[current_states]
        totals.append(np.sum(r(current_states, chosen_actions, next_states)))

Results = pd.DataFrame(np.array([robust_rewards_eps05,robust_rewards_eps1,robust_rewards_eps2,non_robust_rewards]))
Results.columns = np.round(Probs,2)
# Label column first, followed by one column per success probability.
Results.insert(0, "p", ["Robust, eps = 0.5","Robust, eps = 1","Robust, eps = 2","Non-Robust"])
Results

Unnamed: 0,p,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9
0,"Robust, eps = 0.5",-24728,-8653,4554,12946,16491,16148,13323,11087,9920,10881,13170,16199,16825,13492,4451,-8434,-24427
1,"Robust, eps = 1",-8174,8992,15201,14512,11091,7365,4387,2652,2050,2649,4373,7352,11139,14645,15276,9106,-7611
2,"Robust, eps = 2",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Non-Robust,-31386,-24384,-18438,-11001,-1567,11175,22892,32213,35082,32161,22956,10742,-656,-10745,-18374,-23930,-31091
