In [1]:
#!pip install torch torchvision torchaudio
#!pip install pytorch-lightning
#!pip install scikit-learn

In [2]:
import numpy as np
import torch
from torch import nn 
from torchvision.datasets import FashionMNIST, MNIST, EMNIST, CIFAR100
from torchvision import transforms
from pytorch_lightning import Trainer
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from models import *
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.optimize import minimize
from joblib import Parallel, delayed
from tqdm import tqdm
from collections import defaultdict
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
import tensorflow.keras as K


import matplotlib as mpl
# Figure styling: start from matplotlib's "classic" style sheet, then apply
# the overrides below (small figures, thin lines, embeddable fonts).
mpl.style.use("classic")
mpl.rcParams.update({
    # Figure geometry and background
    "figure.figsize": [5, 3],
    "axes.linewidth": 0.75,
    "figure.facecolor": "w",
    # Line widths and tick sizes
    "grid.linewidth": 0.75,
    "lines.linewidth": 0.75,
    "patch.linewidth": 0.75,
    "xtick.major.size": 3,
    "ytick.major.size": 3,
    # Font embedding for PDF/PS output and text sizes
    "pdf.fonttype": 42,
    "ps.fonttype": 42,
    "font.size": 9,
    "axes.titlesize": "medium",
    "legend.fontsize": "medium",
})


import os
import warnings
# Silence every Python warning for the rest of the notebook.
warnings.filterwarnings("ignore")


# Reproducibility: fix the seeds of the PyTorch and NumPy global generators.
seed = 0
torch.manual_seed(seed)
np.random.seed(seed)

# Run on the first CUDA device when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print('The available device is :', device)

  return _bootstrap._gcd_import(name[level:], package, level)


The available device is : cuda:0


In [3]:
# Lookup table: dataset name -> torchvision dataset class.
# Note that the 'CIFAR' entry refers to CIFAR100.
dict_datasets = dict(
    MNIST=MNIST,
    FashionMNIST=FashionMNIST,
    EMNIST=EMNIST,
    CIFAR=CIFAR100,
)

In [4]:
# Choose the benchmark to run; `name` must be one of the keys of `dict_datasets`.
name = 'MNIST'
dataset = dict_datasets[name]
print('dataset used is :', name)

dataset used is : MNIST


In [5]:
# Build the context matrices (X_train, X_test) and label vectors (Y_train, Y_test).
if name != 'CIFAR': #For all datasets, except CIFAR, we do not use any pre-trained model to get features
    # EMNIST is the only dataset here that is given a `split`; every other
    # argument is shared, so the two constructor calls are written once.
    split_kwargs = {'split': 'balanced'} if name == 'EMNIST' else {}
    mnist_train = dataset(os.getcwd(), train=True, download=True, transform=transforms.ToTensor(), **split_kwargs)
    mnist_test = dataset(os.getcwd(), train=False, download=True, transform=transforms.ToTensor(), **split_kwargs)

    # Flatten every raw image into a single float vector. `as_tensor` avoids
    # copy-constructing a tensor from something that is already a tensor
    # (the pattern `torch.tensor(tensor)` is discouraged by PyTorch); the labels
    # are cloned so that they never alias the dataset's own storage.
    X_train = torch.as_tensor(mnist_train.data).float().reshape(len(mnist_train.data), -1)
    Y_train = torch.as_tensor(mnist_train.targets).clone()
    X_test = torch.as_tensor(mnist_test.data).float().reshape(len(mnist_test.data), -1)
    Y_test = torch.as_tensor(mnist_test.targets).clone()

else: #For CIFAR, we use a pre-trained ResNet50 to get features
    cifar_train = dataset(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())
    cifar_test = dataset(os.getcwd(), train=False, download=True, transform=transforms.ToTensor())

    # ImageNet-pretrained ResNet50 truncated at its second-to-last layer:
    # the output of that layer is used as the feature vector of each image.
    resnet_model = ResNet50(weights='imagenet', input_tensor=K.Input(shape=(32, 32, 3)))
    ablation_model = K.Model(inputs=resnet_model.inputs, outputs=resnet_model.layers[-2].output)

    X_train = ablation_model.predict(preprocess_input(cifar_train.data, data_format=None))
    X_test = ablation_model.predict(preprocess_input(cifar_test.data, data_format=None))

    X_train, Y_train = torch.tensor(X_train), torch.tensor(cifar_train.targets)
    X_test, Y_test = torch.tensor(X_test), torch.tensor(cifar_test.targets)

# Rescaling the contexts to unit L2 norm (row-wise).
# `normalize` divides by max(norm, eps), so an all-zero context stays a zero
# vector instead of turning into NaN as it would with a plain division.
X_train = nn.functional.normalize(X_train, p=2, dim=-1)
X_test = nn.functional.normalize(X_test, p=2, dim=-1)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to /mnt/nfs/home/i.aouali/aistats_2024/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting /mnt/nfs/home/i.aouali/aistats_2024/MNIST/raw/train-images-idx3-ubyte.gz to /mnt/nfs/home/i.aouali/aistats_2024/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to /mnt/nfs/home/i.aouali/aistats_2024/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting /mnt/nfs/home/i.aouali/aistats_2024/MNIST/raw/train-labels-idx1-ubyte.gz to /mnt/nfs/home/i.aouali/aistats_2024/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to /mnt/nfs/home/i.aouali/aistats_2024/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting /mnt/nfs/home/i.aouali/aistats_2024/MNIST/raw/t10k-images-idx3-ubyte.gz to /mnt/nfs/home/i.aouali/aistats_2024/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to /mnt/nfs/home/i.aouali/aistats_2024/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting /mnt/nfs/home/i.aouali/aistats_2024/MNIST/raw/t10k-labels-idx1-ubyte.gz to /mnt/nfs/home/i.aouali/aistats_2024/MNIST/raw



In [6]:
# Data subsampling to learn a logging policy.
# 5% of the training set (x0, y0) is kept as a fully supervised subsample;
# the remaining 95% (X_log, Y_log) forms the logging split.
# `random_state` pins the split to the notebook seed, so re-running this cell
# alone yields the same partition instead of depending on how much of the
# global NumPy random stream has already been consumed.
x0, X_log, y0, Y_log = train_test_split(X_train, Y_train, train_size = 0.05, random_state = seed)

N = len(X_log)        # size of the logging split
N_test = len(X_test)  # size of the test set

# The "dimension of Y" now reports the labels themselves (the original printed
# the length of X_train under that label).
print('Train : dimension of X is :', X_train.shape, 'dimension of Y is :', len(Y_train))
print('Test : dimension of X_test is :', X_test.shape, 'dimension of Y is :', len(Y_test))
context_dim = X_log.shape[1]
num_actions = len(np.unique(Y_log))  # one action per distinct label in the logging split
print('num_actions: ', num_actions)

# supervised subsample
subsample_pt = TensorDataset(x0, y0)
subsample_dataloader = DataLoader(subsample_pt, batch_size=128, shuffle=True)

# create the logging split
logging_split = TensorDataset(X_log, Y_log)
logging_split_dataloader = DataLoader(logging_split, batch_size=128, shuffle=True)

Train : dimension of X is : torch.Size([60000, 784]) dimension of Y is : 60000
Test : dimension of X_test is : torch.Size([10000, 784]) dimension of Y is : 10000
num_actions:  10


In [7]:
# Training a logging policy

# Inverse temperature parameters, 10 values evenly spaced in [0, 1]
# (the higher eta the better the performance of the logging policy)
etas = np.round(np.linspace(0, 1, 10), 2)

dict_results = defaultdict(list)
epochs_logging = 10 # logging policy is trained using 10 epochs
epochs = 20 # learning policies are trained using 20 epochs

# Ask the Trainer for a GPU only when `device` is a CUDA device. Hard-coding
# gpus=1 makes the Trainer fail on CPU-only machines even though `device`
# already falls back to "cpu".
n_gpus = int(device.type == 'cuda')

logging_policy = SupervisedPolicy(n_actions=num_actions, context_dim=context_dim, softmax = True, reg=1e-6, device = device)
trainer = Trainer(max_epochs=epochs_logging, gpus=n_gpus, checkpoint_callback=False, weights_summary=None, logger=None)
trainer.fit(logging_policy, subsample_dataloader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Training: 0it [00:00, ?it/s]

In [None]:
# Main experiment: for every inverse temperature `eta`, evaluate the logging
# policy, collect a bandit dataset with it, then train and evaluate each
# learning method on that dataset. One row per eta is stored in `dict_results`.

# Start from an empty container so that re-running this cell never appends to
# the rows of a previous (possibly interrupted) run.
dict_results = defaultdict(list)

# Create the output folder up front: a missing folder would otherwise only
# surface in `to_csv`, after every model has been trained.
os.makedirs('results', exist_ok=True)

# Ask the Trainer for a GPU only when `device` is a CUDA device instead of
# always requesting one.
n_gpus = int(device.type == 'cuda')

# N^(-1/4), shared by all methods: used as `tau`, and as `beta = 1 - N^(-1/4)` for "Ours".
rate = 1/(N**(1/4))

# One entry per learning method:
# (key in dict_results, name used in the printed report, model class, method-specific arguments)
learning_methods = [
    ('ours, gaussian', 'Ours with Gaussian', OurGaussian, dict(beta=1 - rate)),
    ('ours, mixed-logit', 'Ours with MixedLogit', OurMixedLogit, dict(beta=1 - rate)),
    ('london, gaussian', 'London with Gaussian', LondonGaussian, dict(tau=rate)),
    ('london, mixed-logit', 'London with MixedLogit', LondonMixedLogit, dict(tau=rate)),
    ('sakhi1, gaussian', 'Sakhi et al. 1 with Gaussian', CatoniGaussian, dict(tau=rate)),
    ('sakhi1, mixed-logit', 'Sakhi et al. 1 with MixedLogit', CatoniMixedLogit, dict(tau=rate)),
    ('sakhi2, gaussian', 'Sakhi et al. 2 with Gaussian', BernsteinGaussian, dict(tau=rate, num_p=100, rc=1, xi=0)),
    ('sakhi2, mixed-logit', 'Sakhi et al. 2 with MixedLogit', BernsteinMixedLogit, dict(tau=rate, num_p=100, rc=1, xi=0)),
]

for eta in etas:
    logging_policy = logging_policy.to(device)
    print('eta parameter : ', eta)
    logging_policy.alpha = eta
    risk_logging = test_risk_exact_probit(X_test, Y_test, logging_policy)
    print('The reward of the logging policy: ', -risk_logging)

    dict_results['eta'].append(eta)
    dict_results['logging_reward'].append(-risk_logging)

    # Collect a bandit dataset
    f, a, p, c = build_bandit_dataset(logging_split_dataloader, logging_policy, replay_count = 1)

    print('max', p.max(dim = 0)[0].mean().item())
    print('min', p.min(dim = 0)[0].mean().item())

    bandit_train_posterior = TensorDataset(f, a, p, c)
    bandit_train_posterior_dataloader = DataLoader(bandit_train_posterior, batch_size=128, shuffle=True)

    # Logging-policy weights scaled by eta, passed to every model as `loc_weight`.
    mu_0 = eta * logging_policy.linear.weight.data

    # Same train / evaluate / report procedure for every method, in the listed order.
    for result_key, report_name, model_class, method_kwargs in learning_methods:
        model = model_class(n_actions=num_actions, context_dim=context_dim, N = N, loc_weight = mu_0, device = device, **method_kwargs)

        trainer = Trainer(max_epochs=epochs, gpus=n_gpus, checkpoint_callback=False, weights_summary=None, logger=None)
        trainer.fit(model, bandit_train_posterior_dataloader)

        model = model.to(device)
        with torch.no_grad():
            risk_after_train = test_risk_exact_probit(X_test, Y_test, model)

        # The reported reward is the negated test risk.
        print('Reward of ' + report_name + ' policies after training  :', -risk_after_train)
        print('################################################################################')

        dict_results[result_key].append(-risk_after_train)

    print(dict_results)

df = pd.DataFrame(dict_results)
print(df)
df.to_csv('results/results_' + name +'.csv', index = False)

eta parameter :  0.0


79it [00:00, 682.16it/s]


The reward of the logging policy:  0.10000000149011612


100%|██████████| 446/446 [00:00<00:00, 590.49it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


max 0.10000000894069672
min 0.10000000894069672


Training: 0it [00:00, ?it/s]

79it [00:01, 41.54it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of Ours with Gaussian policies after training  : 0.40582422676086427
################################################################################


Training: 0it [00:00, ?it/s]

79it [00:01, 41.73it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of Ours with MixedLogit policies after training  : 0.3040118197441101
################################################################################


Training: 0it [00:00, ?it/s]

79it [00:01, 42.89it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of London with Gaussian policies after training  : 0.14937721660137177
################################################################################


Training: 0it [00:00, ?it/s]

79it [00:01, 43.53it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of London with MixedLogit policies after training  : 0.10942757939100266
################################################################################


Training: 0it [00:00, ?it/s]

79it [00:01, 43.33it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of Sakhi et al. 1 with Gaussian policies after training  : 0.6834229459762573
################################################################################


Training: 0it [00:00, ?it/s]

79it [00:01, 42.14it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of Sakhi et al. 1 with MixedLogit policies after training  : 0.5687774660110474
################################################################################
S_lmbd is defined by these two bounds 0.0011664327636970212 0.12943784736956893


Training: 0it [00:00, ?it/s]

79it [00:01, 42.81it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of Sakhi et al. 2 with Gaussian policies after training  : 0.6170005302429199
################################################################################
S_lmbd is defined by these two bounds 0.0011664327636970212 0.12943784736956893


Training: 0it [00:00, ?it/s]

79it [00:01, 43.03it/s]


Reward of Sakhi et al. 2 with MixedLogit policies after training  : 0.4607139356613159
################################################################################
defaultdict(<class 'list'>, {'eta': [0.0], 'logging_reward': [0.10000000149011612], 'ours, gaussian': [0.40582422676086427], 'ours, mixed-logit': [0.3040118197441101], 'london, gaussian': [0.14937721660137177], 'london, mixed-logit': [0.10942757939100266], 'sakhi1, gaussian': [0.6834229459762573], 'sakhi1, mixed-logit': [0.5687774660110474], 'sakhi2, gaussian': [0.6170005302429199], 'sakhi2, mixed-logit': [0.4607139356613159]})
eta parameter :  0.11


79it [00:00, 116.30it/s]


The reward of the logging policy:  0.2255584443092346


100%|██████████| 446/446 [00:00<00:00, 603.69it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


max 0.3826219439506531
min 0.019424449652433395


Training: 0it [00:00, ?it/s]

79it [00:01, 41.74it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of Ours with Gaussian policies after training  : 0.7974009990692139
################################################################################


Training: 0it [00:00, ?it/s]

79it [00:01, 45.62it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of Ours with MixedLogit policies after training  : 0.7586897585868836
################################################################################


Training: 0it [00:00, ?it/s]

79it [00:01, 43.34it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of London with Gaussian policies after training  : 0.4066402594566345
################################################################################


Training: 0it [00:00, ?it/s]

79it [00:01, 42.61it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of London with MixedLogit policies after training  : 0.2667170904159546
################################################################################


Training: 0it [00:00, ?it/s]

79it [00:01, 68.91it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of Sakhi et al. 1 with Gaussian policies after training  : 0.6970359573364258
################################################################################


Training: 0it [00:00, ?it/s]

79it [00:00, 96.42it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of Sakhi et al. 1 with MixedLogit policies after training  : 0.6345921165466308
################################################################################
S_lmbd is defined by these two bounds 0.0011664327636970212 0.12943784736956893


Training: 0it [00:00, ?it/s]

79it [00:01, 75.78it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of Sakhi et al. 2 with Gaussian policies after training  : 0.7053758906364441
################################################################################
S_lmbd is defined by these two bounds 0.0011664327636970212 0.12943784736956893


Training: 0it [00:00, ?it/s]

79it [00:01, 73.15it/s]


Reward of Sakhi et al. 2 with MixedLogit policies after training  : 0.621946343421936
################################################################################
defaultdict(<class 'list'>, {'eta': [0.0, 0.11], 'logging_reward': [0.10000000149011612, 0.2255584443092346], 'ours, gaussian': [0.40582422676086427, 0.7974009990692139], 'ours, mixed-logit': [0.3040118197441101, 0.7586897585868836], 'london, gaussian': [0.14937721660137177, 0.4066402594566345], 'london, mixed-logit': [0.10942757939100266, 0.2667170904159546], 'sakhi1, gaussian': [0.6834229459762573, 0.6970359573364258], 'sakhi1, mixed-logit': [0.5687774660110474, 0.6345921165466308], 'sakhi2, gaussian': [0.6170005302429199, 0.7053758906364441], 'sakhi2, mixed-logit': [0.4607139356613159, 0.621946343421936]})
eta parameter :  0.22


79it [00:00, 2515.30it/s]


The reward of the logging policy:  0.3974212783336639


100%|██████████| 446/446 [00:00<00:00, 590.80it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


max 0.7379595637321472
min 0.0029253759421408176


Training: 0it [00:00, ?it/s]

79it [00:01, 70.63it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of Ours with Gaussian policies after training  : 0.8395170715332031
################################################################################


Training: 0it [00:00, ?it/s]

79it [00:00, 93.13it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of Ours with MixedLogit policies after training  : 0.8263850328445435
################################################################################


Training: 0it [00:00, ?it/s]

79it [00:01, 72.40it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of London with Gaussian policies after training  : 0.5898578350067138
################################################################################


Training: 0it [00:00, ?it/s]

79it [00:01, 74.11it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of London with MixedLogit policies after training  : 0.4446976410865784
################################################################################


Training: 0it [00:00, ?it/s]

79it [00:01, 77.40it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of Sakhi et al. 1 with Gaussian policies after training  : 0.7149958414077758
################################################################################


Training: 0it [00:00, ?it/s]

79it [00:00, 96.69it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Reward of Sakhi et al. 1 with MixedLogit policies after training  : 0.648596229839325
################################################################################
S_lmbd is defined by these two bounds 0.0011664327636970212 0.12943784736956893


Training: 0it [00:00, ?it/s]