Ce notebook permet de comparer les résultats des algorithmes **CAC**, **CACLA** et **CACLA+VAR** avec une stratégie d'exploration gaussienne et $\epsilon$-greedy sur l'environnement **Tracking** du papier https://www.researchgate.net/publication/4249966_Reinforcement_Learning_in_Continuous_Action_Spaces/link/0912f5093a214c7f1b000000/download

Fait par Jérémy DUFOURMANTELLE et Ethan ABITBOL

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import torch
import sys
import numpy as np

sys.path.insert(0, '../')
from utils.Tracking import Tracking
from utils.Critic import CriticNetwork
from utils.Actor import ActorNetwork
from utils.CAC import CAC
from utils.CACLA import CACLA
from utils.CACLAVAR import CACLAVAR

from tqdm import tqdm

In [None]:
# Number of independent training runs per experiment; every benchmark cell
# below copies this value into `nb_tests` before looping.
nb_tests_global = 20

# Gaussian Exploration

### CAC with gaussian exploration

In [None]:
# Benchmark CAC with gaussian exploration: train `nb_tests` independent agents,
# keep their logged reward / iteration curves, then roll out each agent's
# `best_model` once and classify the run as a success or a failure.
nb_tests = nb_tests_global
success_cac, fails_cac = 0, 0

# One entry per run, read from the agent after `learning()` returns.
matrice_simulation_rewards_cac = []
matrice_simulation_iteration_cac = []

for i in tqdm(range(nb_tests)):
    # Fresh environment and fresh networks for every run.
    env = Tracking()
    actor_network = ActorNetwork(
        nb_neurons=12,
        action_space=env.action_space,
        observation_space=env.observation_space,
    )
    critic_network = CriticNetwork(
        nb_neurons=12,
        observation_space=env.observation_space,
    )
    cac = CAC(
        env=env,
        actor_network=actor_network,
        critic_network=critic_network,
        exploration_strategy="gaussian",
        learning_rate_critic=0.01,
        learning_rate_actor=0.01,
        discount_factor=0.9,
        epsilon=1.0,
        epsilon_min=0.01,
        epsilon_decay=0.01,
        sigma=0.1,
        nb_episode=500,
        nb_tests=3,
        test_frequency=1,
        verbose_mode=False,
    )
    cac.learning()
    matrice_simulation_rewards_cac.append(cac.list_rewards_mean)
    matrice_simulation_iteration_cac.append(cac.list_iteration)

    # Single evaluation episode driven by the agent's `best_model`.
    state, done, nb_iter = env.reset(), False, 0
    while True:
        state_t = torch.as_tensor(state, dtype=torch.float32)
        action = cac.best_model(state_t).detach().numpy()
        new_state, reward, done = env.step(action)
        state = new_state
        nb_iter += 1
        if done:
            break

    # An episode that lasted exactly `env.max_iteration` steps counts as a
    # failure (presumably the environment's step cap -- confirm in Tracking).
    if nb_iter != env.max_iteration:
        success_cac += 1
    else:
        fails_cac += 1

### CACLA with gaussian exploration

In [None]:
# Benchmark CACLA with gaussian exploration: train `nb_tests` independent
# agents, keep their logged reward / iteration curves, then roll out each
# agent's `best_model` once and classify the run as a success or a failure.
nb_tests = nb_tests_global
success_cacla, fails_cacla = 0, 0

# One entry per run, read from the agent after `learning()` returns.
matrice_simulation_rewards_cacla = []
matrice_simulation_iteration_cacla = []

for i in tqdm(range(nb_tests)):
    # Fresh environment and fresh networks for every run.
    env = Tracking()
    actor_network = ActorNetwork(
        nb_neurons=12,
        action_space=env.action_space,
        observation_space=env.observation_space,
    )
    critic_network = CriticNetwork(
        nb_neurons=12,
        observation_space=env.observation_space,
    )
    cacla = CACLA(
        env=env,
        actor_network=actor_network,
        critic_network=critic_network,
        exploration_strategy="gaussian",
        learning_rate_critic=0.01,
        learning_rate_actor=0.01,
        discount_factor=0.95,
        epsilon=1.0,
        epsilon_min=0.01,
        epsilon_decay=0.01,
        sigma=0.1,
        nb_episode=500,
        nb_tests=3,
        test_frequency=1,
        verbose_mode=False,
    )

    cacla.learning()
    matrice_simulation_rewards_cacla.append(cacla.list_rewards_mean)
    matrice_simulation_iteration_cacla.append(cacla.list_iteration)

    # Single evaluation episode driven by the agent's `best_model`.
    state, done, nb_iter = env.reset(), False, 0
    while True:
        state_t = torch.as_tensor(state, dtype=torch.float32)
        action = cacla.best_model(state_t).detach().numpy()
        new_state, reward, done = env.step(action)
        state = new_state
        nb_iter += 1
        if done:
            break

    # An episode that lasted exactly `env.max_iteration` steps counts as a
    # failure (presumably the environment's step cap -- confirm in Tracking).
    if nb_iter != env.max_iteration:
        success_cacla += 1
    else:
        fails_cacla += 1

### CACLA+VAR with gaussian exploration

In [None]:
# Benchmark CACLA+VAR with gaussian exploration: train `nb_tests` independent
# agents, keep their logged reward / iteration curves, then roll out each
# agent's `best_model` once and classify the run as a success or a failure.
nb_tests = nb_tests_global
success_caclavar, fails_caclavar = 0, 0

# One entry per run, read from the agent after `learning()` returns.
matrice_simulation_rewards_caclavar = []
matrice_simulation_iteration_caclavar = []

for i in tqdm(range(nb_tests)):
    # Fresh environment and fresh networks for every run.
    env = Tracking()
    actor_network = ActorNetwork(
        nb_neurons=12,
        action_space=env.action_space,
        observation_space=env.observation_space,
    )
    critic_network = CriticNetwork(
        nb_neurons=12,
        observation_space=env.observation_space,
    )
    caclavar = CACLAVAR(
        env=env,
        actor_network=actor_network,
        critic_network=critic_network,
        exploration_strategy="gaussian",
        learning_rate_critic=0.01,
        learning_rate_actor=0.01,
        discount_factor=0.8,
        epsilon=1.0,
        epsilon_min=0.01,
        epsilon_decay=0.01,
        sigma=0.1,
        nb_episode=500,
        nb_tests=3,
        test_frequency=1,
        verbose_mode=False,
    )
    caclavar.learning()
    matrice_simulation_rewards_caclavar.append(caclavar.list_rewards_mean)
    matrice_simulation_iteration_caclavar.append(caclavar.list_iteration)

    # Single evaluation episode driven by the agent's `best_model`.
    state, done, nb_iter = env.reset(), False, 0
    while True:
        state_t = torch.as_tensor(state, dtype=torch.float32)
        action = caclavar.best_model(state_t).detach().numpy()
        new_state, reward, done = env.step(action)
        state = new_state
        nb_iter += 1
        if done:
            break

    # An episode that lasted exactly `env.max_iteration` steps counts as a
    # failure (presumably the environment's step cap -- confirm in Tracking).
    if nb_iter != env.max_iteration:
        success_caclavar += 1
    else:
        fails_caclavar += 1

In [None]:
# Success / failure report for the gaussian-exploration runs: the number of
# runs, then for each algorithm its success count, failure count and success
# percentage. The lines are built first and printed in order.
summary_lines = (
    f"[gaussian] Nombre de tests : {nb_tests}",
    "------------------------------------",
    f"[gaussian] Nombre de succes CAC: {success_cac}",
    f"[gaussian] Nombre d'echecs CAC: {fails_cac}",
    f"[gaussian] Ratio de succes pour CAC: {success_cac/(success_cac+fails_cac)*100}%",
    "------------------------------------",
    f"[gaussian] Nombre de succes CACLA: {success_cacla}",
    f"[gaussian] Nombre d'echecs CACLA: {fails_cacla}",
    f"[gaussian] Ratio de succes pour CACLA: {success_cacla/(success_cacla+fails_cacla)*100}%",
    "------------------------------------",
    f"[gaussian] Nombre de succes CACLAVAR: {success_caclavar}",
    f"[gaussian] Nombre d'echecs CACLAVAR: {fails_caclavar}",
    f"[gaussian] Ratio de succes pour CACLAVAR: {success_caclavar/(success_caclavar+fails_caclavar)*100}%",
)
for summary_line in summary_lines:
    print(summary_line)

In [None]:
def rewards_normalization(matrice_simulation_rewards, dist_max=-200, dist_min=0):
    """Rescale reward curves to [0, 1] and average them over simulations.

    A reward equal to ``dist_max`` maps to 0 and a reward equal to
    ``dist_min`` maps to 1; values in between are interpolated linearly.
    The rescaled values are then averaged along axis 0.

    Args:
        matrice_simulation_rewards: Array-like of rewards; axis 0 is averaged
            out (expected layout: one row per simulation -- confirm against
            the caller).
        dist_max: Reward value mapped to 0. Defaults to -200 (presumably the
            reward obtained at the largest distance -- confirm in Tracking).
        dist_min: Reward value mapped to 1. Defaults to 0.

    Returns:
        The normalized rewards averaged over axis 0.

    Raises:
        ValueError: If ``dist_max`` equals ``dist_min`` (the rescaling would
            divide by zero).
    """
    if dist_max == dist_min:
        raise ValueError("dist_max and dist_min must be different")
    arr = np.array(matrice_simulation_rewards)
    # Subtracting dist_min keeps the mapping correct when the best reward is
    # not 0; with the default bounds this is exactly 1 - arr / -200.
    return 1 - ((arr - dist_min) / (dist_max - dist_min)).mean(axis=0)

In [None]:
# Averaged, normalized reward curve of each algorithm (gaussian exploration).
l_cac = rewards_normalization(
    matrice_simulation_rewards=matrice_simulation_rewards_cac
)
l_cacla = rewards_normalization(
    matrice_simulation_rewards=matrice_simulation_rewards_cacla
)
l_caclavar = rewards_normalization(
    matrice_simulation_rewards=matrice_simulation_rewards_caclavar
)

In [None]:
# Stack the per-run iteration logs into arrays, then average them over the
# runs (axis 0); the averages are used as x-coordinates by the plotting cell.
m_iter_caclavar = np.array(matrice_simulation_iteration_caclavar)
m_iter_cacla = np.array(matrice_simulation_iteration_cacla)

x_iter_caclavar = np.mean(m_iter_caclavar, axis=0)
x_iter_cacla = np.mean(m_iter_cacla, axis=0)

In [None]:
# Three figures for the gaussian-exploration runs; all share one title and
# one y-axis label.
figure_title = f"evolution of the rewards with {nb_tests} simulations and gaussian exploration on Tracking"
reward_axis_label = "rewards normalized between 0 and 1"

# Figure 1: the three algorithms, plotted against the episode index.
plt.figure()
plt.title(figure_title)
plt.xlabel("episode")
plt.ylabel(reward_axis_label)
for curve, curve_label in (
    (l_cac, "rewards CAC"),
    (l_cacla, "rewards CACLA"),
    (l_caclavar, "rewards CACLAVAR"),
):
    plt.plot(curve, label=curve_label)
plt.legend()
plt.show()

# Figure 2: same axes, CACLA and CACLAVAR only.
plt.figure()
plt.title(figure_title)
plt.xlabel("episode")
plt.ylabel(reward_axis_label)
for curve, curve_label in (
    (l_cacla, "rewards CACLA"),
    (l_caclavar, "rewards CACLAVAR"),
):
    plt.plot(curve, label=curve_label)
plt.legend()
plt.show()

# Figure 3: CACLA and CACLAVAR against the averaged iteration values.
plt.figure(figsize=(5, 11))
plt.title(figure_title)
plt.xlabel("iteration")
plt.ylabel(reward_axis_label)
for x_values, curve, curve_label in (
    (x_iter_cacla, l_cacla, "rewards CACLA"),
    (x_iter_caclavar, l_caclavar, "rewards CACLAVAR"),
):
    plt.plot(x_values, curve, label=curve_label)
# NOTE(review): tick positions are hard-coded -- confirm they match the
# iteration range actually reached.
plt.xticks([0, 51200, 102400])
plt.legend()
plt.show()

In [None]:
# Mean and standard deviation of each averaged, normalized reward curve
# (gaussian exploration).
print("Resultats : ")
for caption, value in (
    ("[gaussian] CAC : mean rewards -> ", l_cac.mean()),
    ("[gaussian] CAC : std rewards -> ", l_cac.std()),
    ("[gaussian] CACLA : mean rewards -> ", l_cacla.mean()),
    ("[gaussian] CACLA : std rewards -> ", l_cacla.std()),
    ("[gaussian] CACLAVAR : mean rewards -> ", l_caclavar.mean()),
    ("[gaussian] CACLAVAR : std rewards -> ", l_caclavar.std()),
):
    print(caption, value)

# $\epsilon$-greedy Exploration

### CAC with $\epsilon$-greedy exploration

In [None]:
# Benchmark CAC with epsilon-greedy exploration: train `nb_tests` independent
# agents, keep their logged reward / iteration curves, then roll out each
# agent's `best_model` once and classify the run as a success or a failure.
nb_tests = nb_tests_global
success_cac, fails_cac = 0, 0

# One entry per run, read from the agent after `learning()` returns.
matrice_simulation_rewards_cac = []
matrice_simulation_iteration_cac = []

for i in tqdm(range(nb_tests)):
    # Fresh environment and fresh networks for every run.
    env = Tracking()
    actor_network = ActorNetwork(
        nb_neurons=12,
        action_space=env.action_space,
        observation_space=env.observation_space,
    )
    critic_network = CriticNetwork(
        nb_neurons=12,
        observation_space=env.observation_space,
    )
    cac = CAC(
        env=env,
        actor_network=actor_network,
        critic_network=critic_network,
        exploration_strategy="egreedy",
        learning_rate_critic=0.01,
        learning_rate_actor=0.01,
        discount_factor=0.9,
        epsilon=1.0,
        epsilon_min=0.01,
        epsilon_decay=0.01,
        sigma=0.1,
        nb_episode=500,
        nb_tests=3,
        test_frequency=1,
        verbose_mode=False,
    )
    cac.learning()
    matrice_simulation_rewards_cac.append(cac.list_rewards_mean)
    matrice_simulation_iteration_cac.append(cac.list_iteration)

    # Single evaluation episode driven by the agent's `best_model`.
    state, done, nb_iter = env.reset(), False, 0
    while True:
        state_t = torch.as_tensor(state, dtype=torch.float32)
        action = cac.best_model(state_t).detach().numpy()
        new_state, reward, done = env.step(action)
        state = new_state
        nb_iter += 1
        if done:
            break

    # An episode that lasted exactly `env.max_iteration` steps counts as a
    # failure (presumably the environment's step cap -- confirm in Tracking).
    if nb_iter != env.max_iteration:
        success_cac += 1
    else:
        fails_cac += 1

### CACLA with $\epsilon$-greedy exploration

In [None]:
# Benchmark CACLA with epsilon-greedy exploration: train `nb_tests`
# independent agents, keep their logged reward / iteration curves, then roll
# out each agent's `best_model` once and classify the run as a success or a
# failure.
nb_tests = nb_tests_global
success_cacla, fails_cacla = 0, 0

# One entry per run, read from the agent after `learning()` returns.
matrice_simulation_rewards_cacla = []
matrice_simulation_iteration_cacla = []

for i in tqdm(range(nb_tests)):
    # Fresh environment and fresh networks for every run.
    env = Tracking()
    actor_network = ActorNetwork(
        nb_neurons=12,
        action_space=env.action_space,
        observation_space=env.observation_space,
    )
    critic_network = CriticNetwork(
        nb_neurons=12,
        observation_space=env.observation_space,
    )
    cacla = CACLA(
        env=env,
        actor_network=actor_network,
        critic_network=critic_network,
        exploration_strategy="egreedy",
        learning_rate_critic=0.01,
        learning_rate_actor=0.01,
        discount_factor=0.95,
        epsilon=1.0,
        epsilon_min=0.01,
        epsilon_decay=0.01,
        sigma=0.1,
        nb_episode=500,
        nb_tests=3,
        test_frequency=1,
        verbose_mode=False,
    )
    cacla.learning()
    matrice_simulation_rewards_cacla.append(cacla.list_rewards_mean)
    matrice_simulation_iteration_cacla.append(cacla.list_iteration)

    # Single evaluation episode driven by the agent's `best_model`.
    state, done, nb_iter = env.reset(), False, 0
    while True:
        state_t = torch.as_tensor(state, dtype=torch.float32)
        action = cacla.best_model(state_t).detach().numpy()
        new_state, reward, done = env.step(action)
        state = new_state
        nb_iter += 1
        if done:
            break

    # An episode that lasted exactly `env.max_iteration` steps counts as a
    # failure (presumably the environment's step cap -- confirm in Tracking).
    if nb_iter != env.max_iteration:
        success_cacla += 1
    else:
        fails_cacla += 1

### CACLA+VAR with $\epsilon$-greedy exploration

In [None]:
# Benchmark CACLA+VAR with epsilon-greedy exploration: train `nb_tests`
# independent agents, keep their logged reward / iteration curves, then roll
# out each agent's `best_model` once and classify the run as a success or a
# failure.
nb_tests = nb_tests_global
success_caclavar, fails_caclavar = 0, 0

# One entry per run, read from the agent after `learning()` returns.
matrice_simulation_rewards_caclavar = []
matrice_simulation_iteration_caclavar = []

for i in tqdm(range(nb_tests)):
    # Fresh environment and fresh networks for every run.
    env = Tracking()
    actor_network = ActorNetwork(
        nb_neurons=12,
        action_space=env.action_space,
        observation_space=env.observation_space,
    )
    critic_network = CriticNetwork(
        nb_neurons=12,
        observation_space=env.observation_space,
    )
    caclavar = CACLAVAR(
        env=env,
        actor_network=actor_network,
        critic_network=critic_network,
        exploration_strategy="egreedy",
        learning_rate_critic=0.01,
        learning_rate_actor=0.01,
        discount_factor=0.9,
        epsilon=1.0,
        epsilon_min=0.01,
        epsilon_decay=0.01,
        sigma=0.1,
        nb_episode=500,
        nb_tests=3,
        test_frequency=1,
        verbose_mode=False,
    )
    caclavar.learning()
    matrice_simulation_rewards_caclavar.append(caclavar.list_rewards_mean)
    matrice_simulation_iteration_caclavar.append(caclavar.list_iteration)

    # Single evaluation episode driven by the agent's `best_model`.
    state, done, nb_iter = env.reset(), False, 0
    while True:
        state_t = torch.as_tensor(state, dtype=torch.float32)
        action = caclavar.best_model(state_t).detach().numpy()
        new_state, reward, done = env.step(action)
        state = new_state
        nb_iter += 1
        if done:
            break

    # An episode that lasted exactly `env.max_iteration` steps counts as a
    # failure (presumably the environment's step cap -- confirm in Tracking).
    if nb_iter != env.max_iteration:
        success_caclavar += 1
    else:
        fails_caclavar += 1

In [None]:
# Success / failure report for the epsilon-greedy runs: the number of runs,
# then for each algorithm its success count, failure count and success
# percentage. The lines are built first and printed in order.
summary_lines = (
    f"[egreedy] Nombre de tests : {nb_tests}",
    "------------------------------------",
    f"[egreedy] Nombre de succes CAC: {success_cac}",
    f"[egreedy] Nombre d'echecs CAC: {fails_cac}",
    f"[egreedy] Ratio de succes pour CAC: {success_cac/(success_cac+fails_cac)*100}%",
    "------------------------------------",
    f"[egreedy] Nombre de succes CACLA: {success_cacla}",
    f"[egreedy] Nombre d'echecs CACLA: {fails_cacla}",
    f"[egreedy] Ratio de succes pour CACLA: {success_cacla/(success_cacla+fails_cacla)*100}%",
    "------------------------------------",
    f"[egreedy] Nombre de succes CACLAVAR: {success_caclavar}",
    f"[egreedy] Nombre d'echecs CACLAVAR: {fails_caclavar}",
    f"[egreedy] Ratio de succes pour CACLAVAR: {success_caclavar/(success_caclavar+fails_caclavar)*100}%",
)
for summary_line in summary_lines:
    print(summary_line)

In [None]:
def rewards_normalization(matrice_simulation_rewards, dist_max=-200, dist_min=0):
    """Rescale reward curves to [0, 1] and average them over simulations.

    A reward equal to ``dist_max`` maps to 0 and a reward equal to
    ``dist_min`` maps to 1; values in between are interpolated linearly.
    The rescaled values are then averaged along axis 0.

    Args:
        matrice_simulation_rewards: Array-like of rewards; axis 0 is averaged
            out (expected layout: one row per simulation -- confirm against
            the caller).
        dist_max: Reward value mapped to 0. Defaults to -200 (presumably the
            reward obtained at the largest distance -- confirm in Tracking).
        dist_min: Reward value mapped to 1. Defaults to 0.

    Returns:
        The normalized rewards averaged over axis 0.

    Raises:
        ValueError: If ``dist_max`` equals ``dist_min`` (the rescaling would
            divide by zero).
    """
    if dist_max == dist_min:
        raise ValueError("dist_max and dist_min must be different")
    arr = np.array(matrice_simulation_rewards)
    # Subtracting dist_min keeps the mapping correct when the best reward is
    # not 0; with the default bounds this is exactly 1 - arr / -200.
    return 1 - ((arr - dist_min) / (dist_max - dist_min)).mean(axis=0)

In [None]:
# Averaged, normalized reward curve of each algorithm (epsilon-greedy
# exploration).
l_cac = rewards_normalization(
    matrice_simulation_rewards=matrice_simulation_rewards_cac
)
l_cacla = rewards_normalization(
    matrice_simulation_rewards=matrice_simulation_rewards_cacla
)
l_caclavar = rewards_normalization(
    matrice_simulation_rewards=matrice_simulation_rewards_caclavar
)

In [None]:
# Stack the per-run iteration logs into arrays, then average them over the
# runs (axis 0); the averages are used as x-coordinates by the plotting cell.
m_iter_caclavar = np.array(matrice_simulation_iteration_caclavar)
m_iter_cacla = np.array(matrice_simulation_iteration_cacla)

x_iter_caclavar = np.mean(m_iter_caclavar, axis=0)
x_iter_cacla = np.mean(m_iter_cacla, axis=0)

In [None]:
# Three figures for the epsilon-greedy runs; all share one title and one
# y-axis label.
figure_title = f"evolution of the rewards with {nb_tests} simulations and egreedy exploration on Tracking"
reward_axis_label = "rewards normalized between 0 and 1"

# Figure 1: the three algorithms, plotted against the episode index.
plt.figure()
plt.title(figure_title)
plt.xlabel("episode")
plt.ylabel(reward_axis_label)
for curve, curve_label in (
    (l_cac, "rewards CAC"),
    (l_cacla, "rewards CACLA"),
    (l_caclavar, "rewards CACLAVAR"),
):
    plt.plot(curve, label=curve_label)
plt.legend()
plt.show()

# Figure 2: same axes, CACLA and CACLAVAR only.
plt.figure()
plt.title(figure_title)
plt.xlabel("episode")
plt.ylabel(reward_axis_label)
for curve, curve_label in (
    (l_cacla, "rewards CACLA"),
    (l_caclavar, "rewards CACLAVAR"),
):
    plt.plot(curve, label=curve_label)
plt.legend()
plt.show()

# Figure 3: CACLA and CACLAVAR against the averaged iteration values.
plt.figure(figsize=(5, 11))
plt.title(figure_title)
plt.xlabel("iteration")
plt.ylabel(reward_axis_label)
for x_values, curve, curve_label in (
    (x_iter_cacla, l_cacla, "rewards CACLA"),
    (x_iter_caclavar, l_caclavar, "rewards CACLAVAR"),
):
    plt.plot(x_values, curve, label=curve_label)
# NOTE(review): tick positions are hard-coded -- confirm they match the
# iteration range actually reached.
plt.xticks([0, 51200, 102400])
plt.legend()
plt.show()

In [None]:
# Mean and standard deviation of each averaged, normalized reward curve
# (epsilon-greedy exploration).
print("Resultats : ")
for caption, value in (
    ("[egreedy] CAC : mean rewards -> ", l_cac.mean()),
    ("[egreedy] CAC : std rewards -> ", l_cac.std()),
    ("[egreedy] CACLA : mean rewards -> ", l_cacla.mean()),
    ("[egreedy] CACLA : std rewards -> ", l_cacla.std()),
    ("[egreedy] CACLAVAR : mean rewards -> ", l_caclavar.mean()),
    ("[egreedy] CACLAVAR : std rewards -> ", l_caclavar.std()),
):
    print(caption, value)