<a href="https://colab.research.google.com/github/jbbonice2/PROJET-RL/blob/main/algoComparatifGreedyUcbGrandiant1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


## **Comparaison entre Greedy, Epsilon-Greedy, UCB et Gradient Bandit**



In [None]:
import numpy as np
import matplotlib.pyplot as plt

class RowGridWorld:
    """One-row grid world whose agent moves left or right.

    The row has ``n_cols`` cells indexed ``0 .. n_cols - 1``. An episode ends
    as soon as the agent lands on either border cell (index ``0`` or
    ``n_cols - 1``).

    Args:
        n_cols: Number of cells in the row.
        goal_states: Cells that yield ``reward`` when entered. Defaults to
            ``[1, 4, 9]``.
        neutral_states: Cells that yield ``0`` when entered. Defaults to
            ``[2, 3, 5]``.
        reward: Reward for entering a goal cell.
        step_penalty: Reward for entering any other cell.
    """

    def __init__(self, n_cols=10, goal_states=None, neutral_states=None, reward=1, step_penalty=-0.1):
        self.n_cols = n_cols
        # Build a fresh list per instance: a list literal used as a default
        # argument would be shared (and mutated) across every environment.
        self.goal_states = [1, 4, 9] if goal_states is None else list(goal_states)
        self.neutral_states = [2, 3, 5] if neutral_states is None else list(neutral_states)
        self.reward = reward
        self.step_penalty = step_penalty
        self.state = 0
        self.done = False

    def reset(self):
        """Start a new episode from a uniformly random cell and return it."""
        self.state = np.random.randint(0, self.n_cols)
        self.done = False
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        ``action == 0`` moves left and ``action == 1`` moves right, both
        clamped to the row; any other value leaves the position unchanged.
        Once the episode is over, further calls return the current state with
        reward ``0`` and ``done=True``.
        """
        if self.done:
            return self.state, 0, True

        if action == 0:  # move left
            self.state = max(0, self.state - 1)
        elif action == 1:  # move right
            self.state = min(self.n_cols - 1, self.state + 1)

        # Landing on a border cell terminates the episode. Borders are only
        # checked against goal_states; neutral_states is ignored here.
        if self.state == 0 or self.state == self.n_cols - 1:
            self.done = True
            reward = self.reward if self.state in self.goal_states else self.step_penalty
            return self.state, reward, self.done

        # Interior cell: goal > neutral > plain step penalty.
        if self.state in self.goal_states:
            reward = self.reward
        elif self.state in self.neutral_states:
            reward = 0
        else:
            reward = self.step_penalty

        return self.state, reward, self.done

    def render(self):
        """Print the row with ``A`` at the agent's cell and ``-`` elsewhere."""
        grid = ['-' for _ in range(self.n_cols)]
        grid[self.state] = 'A'
        print(" ".join(grid))


# Algorithmes
def greedy(env, n_episodes):
    """Run a purely greedy two-action bandit agent on ``env``.

    A single pair of action values (left, right) is shared by all states and
    updated as a sample average of the rewards observed for each action.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)`` returning
            ``(state, reward, done)``.
        n_episodes: Number of episodes to run.

    Returns:
        Array with the total reward collected in each episode.
    """
    value_estimates = np.zeros(2)  # sample-average value of each action
    pull_counts = np.zeros(2)  # how many times each action was taken
    episode_returns = []
    for _ in range(n_episodes):
        env.reset()
        episode_return = 0
        while True:
            # Always exploit: ties resolve to the lowest index (left).
            action = np.argmax(value_estimates)
            _, reward, finished = env.step(action)
            pull_counts[action] += 1
            # Incremental sample-average update.
            value_estimates[action] += (reward - value_estimates[action]) / pull_counts[action]
            episode_return += reward
            if finished:
                break
        episode_returns.append(episode_return)
    return np.array(episode_returns)

def epsilon_greedy(env, n_episodes, epsilon=0.3):
    """Run an epsilon-greedy two-action bandit agent on ``env``.

    With probability ``epsilon`` a uniformly random action is taken;
    otherwise the action with the highest sample-average value is taken.
    The action values are shared by all states.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)`` returning
            ``(state, reward, done)``.
        n_episodes: Number of episodes to run.
        epsilon: Exploration probability.

    Returns:
        Array with the total reward collected in each episode.
    """
    value_estimates = np.zeros(2)
    pull_counts = np.zeros(2)
    episode_returns = []
    for _ in range(n_episodes):
        env.reset()
        episode_return = 0
        finished = False
        while not finished:
            # Explore with probability epsilon, otherwise exploit.
            action = (
                np.random.choice([0, 1])
                if np.random.rand() < epsilon
                else np.argmax(value_estimates)
            )
            _, reward, finished = env.step(action)
            pull_counts[action] += 1
            value_estimates[action] += (reward - value_estimates[action]) / pull_counts[action]
            episode_return += reward
        episode_returns.append(episode_return)
    return np.array(episode_returns)

def ucb(env, n_episodes, c=2):
    """Run an upper-confidence-bound two-action bandit agent on ``env``.

    Each step picks the action maximising the sample-average value plus an
    exploration bonus ``c * sqrt(log(t + 1) / (N + 1e-5))``, where ``t`` is
    the global step counter and ``N`` the per-action pull counts.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)`` returning
            ``(state, reward, done)``.
        n_episodes: Number of episodes to run.
        c: Weight of the exploration bonus.

    Returns:
        Array with the total reward collected in each episode.
    """
    value_estimates = np.zeros(2)
    pull_counts = np.zeros(2)
    step_count = 0  # steps taken across all episodes
    episode_returns = []
    for _ in range(n_episodes):
        env.reset()
        episode_return = 0
        finished = False
        while not finished:
            step_count += 1
            # The small constant keeps the bonus finite for untried actions
            # while still making it very large, so they are tried first.
            bonus = c * np.sqrt(np.log(step_count + 1) / (pull_counts + 1e-5))
            action = np.argmax(value_estimates + bonus)
            _, reward, finished = env.step(action)
            pull_counts[action] += 1
            value_estimates[action] += (reward - value_estimates[action]) / pull_counts[action]
            episode_return += reward
        episode_returns.append(episode_return)
    return np.array(episode_returns)

def gradient_bandit(env, n_episodes, alpha=0.3):
    """Run a gradient-bandit (softmax preference) agent on ``env``.

    Actions are sampled from a softmax over two preferences shared by all
    states. After each step every preference is moved along the softmax
    policy gradient, using the running average of previously observed
    rewards as the baseline:

        H[a] += alpha * (R - baseline) * (1[a == A] - pi[a])

    Args:
        env: Environment exposing ``reset()`` and ``step(action)`` returning
            ``(state, reward, done)``.
        n_episodes: Number of episodes to run.
        alpha: Step size of the preference update.

    Returns:
        Array with the total reward collected in each episode.
    """
    rewards = []
    H = np.zeros(2)  # preference of each action (left, right)
    baseline = 0.0  # running mean of all rewards seen so far
    n_steps = 0  # number of rewards folded into the baseline
    for _ in range(n_episodes):
        env.reset()
        done = False
        total_reward = 0
        while not done:
            # Softmax policy; subtracting the max keeps exp() from overflowing
            # without changing the resulting probabilities.
            exp = np.exp(H - np.max(H))
            probs = exp / np.sum(exp)
            action = np.random.choice([0, 1], p=probs)

            _, reward, done = env.step(action)

            # The baseline must be the average of PAST rewards. Using the
            # current reward itself makes the advantage identically zero and
            # the preferences would never move.
            advantage = reward - baseline if n_steps else 0.0
            one_hot = np.zeros_like(H)
            one_hot[action] = 1.0
            H += alpha * advantage * (one_hot - probs)

            n_steps += 1
            baseline += (reward - baseline) / n_steps

            total_reward += reward
        rewards.append(total_reward)
    return np.array(rewards)

# Environment setup
env = RowGridWorld(n_cols=10, goal_states=[1, 4, 9], neutral_states=[2, 3, 5], reward=1, step_penalty=-0.1)
n_episodes = 900

# Run every method on the same environment instance
greedy_rewards = greedy(env, n_episodes)
epsilon_greedy_rewards = epsilon_greedy(env, n_episodes)
ucb_rewards = ucb(env, n_episodes)
gradient_bandit_rewards = gradient_bandit(env, n_episodes)

# Plot the running mean of the per-episode reward for each method
episode_counts = np.arange(n_episodes) + 1
curves = (
    ('Greedy', greedy_rewards),
    ('Epsilon-Greedy', epsilon_greedy_rewards),
    ('UCB', ucb_rewards),
    ('Gradient Bandit', gradient_bandit_rewards),
)

plt.figure(figsize=(10, 6))
for curve_label, episode_rewards in curves:
    plt.plot(np.cumsum(episode_rewards) / episode_counts, label=curve_label)

# Figure decoration
plt.xlabel('Épisodes')
plt.ylabel('Récompense moyenne cumulative')
plt.title('Comparaison entre Greedy, Epsilon-Greedy, UCB et Gradient Bandit')
plt.legend()
plt.grid(True)
plt.show()


## **Comparaison entre Monte Carlo Prediction, SARSA et n-Step Prediction**

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Classe RowGridWorld
class RowGridWorld:
    """One-row grid world whose agent moves left or right.

    The row has ``n_cols`` cells indexed ``0 .. n_cols - 1``. An episode ends
    as soon as the agent lands on either border cell (index ``0`` or
    ``n_cols - 1``).

    Args:
        n_cols: Number of cells in the row.
        goal_states: Cells that yield ``reward`` when entered. Defaults to
            ``[1, 4, 9]``.
        neutral_states: Cells that yield ``0`` when entered. Defaults to
            ``[2, 3, 5]``.
        reward: Reward for entering a goal cell.
        step_penalty: Reward for entering any other cell.
    """

    def __init__(self, n_cols=10, goal_states=None, neutral_states=None, reward=1, step_penalty=-0.1):
        self.n_cols = n_cols
        # Build a fresh list per instance: a list literal used as a default
        # argument would be shared (and mutated) across every environment.
        self.goal_states = [1, 4, 9] if goal_states is None else list(goal_states)
        self.neutral_states = [2, 3, 5] if neutral_states is None else list(neutral_states)
        self.reward = reward
        self.step_penalty = step_penalty
        self.state = 0
        self.done = False

    def reset(self):
        """Start a new episode from a uniformly random cell and return it."""
        self.state = np.random.randint(0, self.n_cols)
        self.done = False
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        ``action == 0`` moves left and ``action == 1`` moves right, both
        clamped to the row; any other value leaves the position unchanged.
        Once the episode is over, further calls return the current state with
        reward ``0`` and ``done=True``.
        """
        if self.done:
            return self.state, 0, True

        if action == 0:  # move left
            self.state = max(0, self.state - 1)
        elif action == 1:  # move right
            self.state = min(self.n_cols - 1, self.state + 1)

        # Landing on a border cell terminates the episode. Borders are only
        # checked against goal_states; neutral_states is ignored here.
        if self.state == 0 or self.state == self.n_cols - 1:
            self.done = True
            reward = self.reward if self.state in self.goal_states else self.step_penalty
            return self.state, reward, self.done

        # Interior cell: goal > neutral > plain step penalty.
        if self.state in self.goal_states:
            reward = self.reward
        elif self.state in self.neutral_states:
            reward = 0
        else:
            reward = self.step_penalty

        return self.state, reward, self.done

    def render(self):
        """Print the row with ``A`` at the agent's cell and ``-`` elsewhere."""
        grid = ['-' for _ in range(self.n_cols)]
        grid[self.state] = 'A'
        print(" ".join(grid))


# Monte Carlo Prediction
def monte_carlo_prediction(env, n_episodes, gamma=0.9):
    """Estimate state values of the uniform random policy with first-visit MC.

    Each episode is generated by choosing left/right uniformly at random.
    For every state, the discounted return following its FIRST occurrence in
    the episode is averaged into the value estimate.

    Args:
        env: Environment exposing ``n_cols``, ``reset()`` and ``step(action)``
            returning ``(state, reward, done)``.
        n_episodes: Number of episodes to sample.
        gamma: Discount factor.

    Returns:
        Tuple ``(V, rewards_per_episode)``: the value estimate per state and
        an array with the undiscounted total reward of each episode.
    """
    V = np.zeros(env.n_cols)
    visit_counts = np.zeros(env.n_cols)  # number of returns averaged per state
    rewards_per_episode = []

    for _ in range(n_episodes):
        state = env.reset()
        done = False
        episode = []  # (state the action was taken from, reward received)

        while not done:
            action = np.random.choice([0, 1])  # uniform random policy
            next_state, reward, done = env.step(action)
            episode.append((state, reward))
            state = next_state

        # Time step at which each state first appears in this episode.
        first_visit = {}
        for t, (visited_state, _reward) in enumerate(episode):
            first_visit.setdefault(visited_state, t)

        # Walk backwards to accumulate returns, but only credit a state at its
        # first occurrence. Crediting the first state met in the backward scan
        # would use the LAST visit, whose return is conditioned on the state
        # never being revisited and is therefore biased.
        G = 0
        for t in range(len(episode) - 1, -1, -1):
            visited_state, reward = episode[t]
            G = gamma * G + reward
            if first_visit[visited_state] == t:
                visit_counts[visited_state] += 1
                # Incremental mean avoids storing every return.
                V[visited_state] += (G - V[visited_state]) / visit_counts[visited_state]

        rewards_per_episode.append(sum(r for _, r in episode))
    return V, np.array(rewards_per_episode)


# SARSA
def sarsa(env, n_episodes, alpha=0.1, gamma=0.9, epsilon=0.1):
    """Learn action values with one-step SARSA and an epsilon-greedy policy.

    Args:
        env: Environment exposing ``n_cols``, ``reset()`` and ``step(action)``
            returning ``(state, reward, done)``.
        n_episodes: Number of episodes to run.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Exploration probability of the behaviour policy.

    Returns:
        Tuple ``(Q, rewards_per_episode)``: the ``(n_cols, 2)`` action-value
        table and an array with the total reward of each episode.
    """
    Q = np.zeros((env.n_cols, 2))
    rewards_per_episode = []

    for _ in range(n_episodes):
        state = env.reset()
        action = np.random.choice([0, 1]) if np.random.rand() < epsilon else np.argmax(Q[state])
        done = False
        total_reward = 0

        while not done:
            next_state, reward, done = env.step(action)

            if done:
                # Nothing follows a terminal transition, so the target is the
                # reward alone. The table entry of the terminal cell may be
                # non-zero (episodes can start there) and must not leak in.
                next_action = action
                target = reward
            else:
                next_action = (
                    np.random.choice([0, 1])
                    if np.random.rand() < epsilon
                    else np.argmax(Q[next_state])
                )
                target = reward + gamma * Q[next_state, next_action]

            Q[state, action] += alpha * (target - Q[state, action])

            state = next_state
            action = next_action
            total_reward += reward

        rewards_per_episode.append(total_reward)
    return Q, np.array(rewards_per_episode)


# n-Step Prediction
def n_step_prediction(env, n_episodes, n=3, alpha=0.1, gamma=0.9):
    """Estimate state values of the uniform random policy with n-step TD.

    Args:
        env: Environment exposing ``n_cols``, ``reset()`` and ``step(action)``
            returning ``(state, reward, done)``.
        n_episodes: Number of episodes to sample.
        n: Number of real rewards used before bootstrapping.
        alpha: Learning rate.
        gamma: Discount factor.

    Returns:
        Tuple ``(V, rewards_per_episode)``: the value estimate per state and
        an array with the undiscounted total reward of each episode.
    """
    V = np.zeros(env.n_cols)
    rewards_per_episode = []

    for _ in range(n_episodes):
        state = env.reset()
        states = [state]  # states[t] is the state at time t
        rewards = []  # rewards[t] is the reward for the step taken at time t
        T = float('inf')  # episode length, unknown until termination

        t = 0
        while True:
            if t < T:
                action = np.random.choice([0, 1])  # uniform random policy
                next_state, reward, done = env.step(action)
                rewards.append(reward)
                states.append(next_state)
                if done:
                    T = t + 1

            # tau is the time step whose state estimate is updated now.
            tau = t - n + 1
            if tau >= 0:
                G = sum(gamma ** (i - tau) * rewards[i] for i in range(tau, min(tau + n, T)))
                # Bootstrap only if the n-step window ends before termination.
                if tau + n < T:
                    G += gamma ** n * V[states[tau + n]]
                V[states[tau]] += alpha * (G - V[states[tau]])

            if tau == T - 1:
                break
            t += 1

        # Each reward counts exactly once; summing inside the loop would add
        # the running total again on every iteration.
        rewards_per_episode.append(sum(rewards))
    return V, np.array(rewards_per_episode)


# Environment setup
env = RowGridWorld(n_cols=10, goal_states=[1, 4, 9], neutral_states=[2, 3, 5], reward=1, step_penalty=-0.1)
n_episodes = 500

# Run every method; only the per-episode rewards are kept
_, mc_rewards = monte_carlo_prediction(env, n_episodes)
_, sarsa_rewards = sarsa(env, n_episodes)
_, n_step_rewards = n_step_prediction(env, n_episodes)

# Plot the running mean of the per-episode reward for each method
episode_counts = np.arange(n_episodes) + 1
curves = (
    ('Monte Carlo Prediction', mc_rewards),
    ('SARSA', sarsa_rewards),
    ('n-Step Prediction', n_step_rewards),
)

plt.figure(figsize=(10, 6))
for curve_label, episode_rewards in curves:
    plt.plot(np.cumsum(episode_rewards) / episode_counts, label=curve_label)

# Figure decoration
plt.xlabel('Épisodes')
plt.ylabel('Récompense moyenne cumulative')
plt.title('Comparaison entre Monte Carlo Prediction, SARSA et n-Step Prediction')
plt.legend()
plt.grid(True)
plt.show()


## **Comparaison entre Q-Learning, SARSA et n-Step SARSA**

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Classe RowGridWorld
class RowGridWorld:
    """One-row grid world whose agent moves left or right.

    The row has ``n_cols`` cells indexed ``0 .. n_cols - 1``. An episode ends
    as soon as the agent lands on either border cell (index ``0`` or
    ``n_cols - 1``).

    Args:
        n_cols: Number of cells in the row.
        goal_states: Cells that yield ``reward`` when entered. Defaults to
            ``[1, 4, 9]``.
        neutral_states: Cells that yield ``0`` when entered. Defaults to
            ``[2, 3, 5]``.
        reward: Reward for entering a goal cell.
        step_penalty: Reward for entering any other cell.
    """

    def __init__(self, n_cols=10, goal_states=None, neutral_states=None, reward=1, step_penalty=-0.1):
        self.n_cols = n_cols
        # Build a fresh list per instance: a list literal used as a default
        # argument would be shared (and mutated) across every environment.
        self.goal_states = [1, 4, 9] if goal_states is None else list(goal_states)
        self.neutral_states = [2, 3, 5] if neutral_states is None else list(neutral_states)
        self.reward = reward
        self.step_penalty = step_penalty
        self.state = 0
        self.done = False

    def reset(self):
        """Start a new episode from a uniformly random cell and return it."""
        self.state = np.random.randint(0, self.n_cols)
        self.done = False
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        ``action == 0`` moves left and ``action == 1`` moves right, both
        clamped to the row; any other value leaves the position unchanged.
        Once the episode is over, further calls return the current state with
        reward ``0`` and ``done=True``.
        """
        if self.done:
            return self.state, 0, True

        if action == 0:  # move left
            self.state = max(0, self.state - 1)
        elif action == 1:  # move right
            self.state = min(self.n_cols - 1, self.state + 1)

        # Landing on a border cell terminates the episode. Borders are only
        # checked against goal_states; neutral_states is ignored here.
        if self.state == 0 or self.state == self.n_cols - 1:
            self.done = True
            reward = self.reward if self.state in self.goal_states else self.step_penalty
            return self.state, reward, self.done

        # Interior cell: goal > neutral > plain step penalty.
        if self.state in self.goal_states:
            reward = self.reward
        elif self.state in self.neutral_states:
            reward = 0
        else:
            reward = self.step_penalty

        return self.state, reward, self.done

    def render(self):
        """Print the row with ``A`` at the agent's cell and ``-`` elsewhere."""
        grid = ['-' for _ in range(self.n_cols)]
        grid[self.state] = 'A'
        print(" ".join(grid))


# Q-Learning
def q_learning(env, n_episodes, alpha=0.1, gamma=0.9, epsilon=0.1):
    """Learn action values with one-step Q-learning (epsilon-greedy behaviour).

    Args:
        env: Environment exposing ``n_cols``, ``reset()`` and ``step(action)``
            returning ``(state, reward, done)``.
        n_episodes: Number of episodes to run.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Exploration probability of the behaviour policy.

    Returns:
        Tuple ``(Q, rewards_per_episode)``: the ``(n_cols, 2)`` action-value
        table and an array with the total reward of each episode.
    """
    Q = np.zeros((env.n_cols, 2))
    rewards_per_episode = []

    for _ in range(n_episodes):
        state = env.reset()
        done = False
        total_reward = 0

        while not done:
            action = np.random.choice([0, 1]) if np.random.rand() < epsilon else np.argmax(Q[state])
            next_state, reward, done = env.step(action)
            # Nothing follows a terminal transition, so do not bootstrap from
            # it. The table row of the terminal cell may be non-zero because
            # episodes can start there.
            bootstrap = 0.0 if done else gamma * np.max(Q[next_state])
            Q[state, action] += alpha * (reward + bootstrap - Q[state, action])
            state = next_state
            total_reward += reward

        rewards_per_episode.append(total_reward)
    return Q, np.array(rewards_per_episode)


# SARSA
def sarsa(env, n_episodes, alpha=0.1, gamma=0.9, epsilon=0.1):
    """Learn action values with one-step SARSA and an epsilon-greedy policy.

    Args:
        env: Environment exposing ``n_cols``, ``reset()`` and ``step(action)``
            returning ``(state, reward, done)``.
        n_episodes: Number of episodes to run.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Exploration probability of the behaviour policy.

    Returns:
        Tuple ``(Q, rewards_per_episode)``: the ``(n_cols, 2)`` action-value
        table and an array with the total reward of each episode.
    """
    Q = np.zeros((env.n_cols, 2))
    rewards_per_episode = []

    for _ in range(n_episodes):
        state = env.reset()
        action = np.random.choice([0, 1]) if np.random.rand() < epsilon else np.argmax(Q[state])
        done = False
        total_reward = 0

        while not done:
            next_state, reward, done = env.step(action)

            if done:
                # Nothing follows a terminal transition, so the target is the
                # reward alone. The table entry of the terminal cell may be
                # non-zero (episodes can start there) and must not leak in.
                next_action = action
                target = reward
            else:
                next_action = (
                    np.random.choice([0, 1])
                    if np.random.rand() < epsilon
                    else np.argmax(Q[next_state])
                )
                target = reward + gamma * Q[next_state, next_action]

            Q[state, action] += alpha * (target - Q[state, action])
            state, action = next_state, next_action
            total_reward += reward

        rewards_per_episode.append(total_reward)
    return Q, np.array(rewards_per_episode)


# n-Step SARSA
def n_step_sarsa(env, n_episodes, n=3, alpha=0.1, gamma=0.9, epsilon=0.1):
    """Learn action values with n-step SARSA and an epsilon-greedy policy.

    Args:
        env: Environment exposing ``n_cols``, ``reset()`` and ``step(action)``
            returning ``(state, reward, done)``.
        n_episodes: Number of episodes to run.
        n: Number of real rewards used before bootstrapping.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Exploration probability of the behaviour policy.

    Returns:
        Tuple ``(Q, rewards_per_episode)``: the ``(n_cols, 2)`` action-value
        table and an array with the total reward of each episode.
    """
    Q = np.zeros((env.n_cols, 2))
    rewards_per_episode = []

    for _ in range(n_episodes):
        state = env.reset()
        action = np.random.choice([0, 1]) if np.random.rand() < epsilon else np.argmax(Q[state])
        states = [state]  # states[t] is the state at time t
        actions = [action]  # actions[t] is the action taken at time t
        rewards = []  # rewards[t] is the reward for the step taken at time t
        T = float('inf')  # episode length, unknown until termination

        t = 0
        while True:
            if t < T:
                next_state, reward, done = env.step(action)
                rewards.append(reward)
                if done:
                    T = t + 1
                else:
                    # The freshly selected action must be the one executed on
                    # the next step; otherwise the first action of the episode
                    # would be repeated until termination.
                    action = (
                        np.random.choice([0, 1])
                        if np.random.rand() < epsilon
                        else np.argmax(Q[next_state])
                    )
                    states.append(next_state)
                    actions.append(action)

            # tau is the time step whose (state, action) pair is updated now.
            tau = t - n + 1
            if tau >= 0:
                G = sum(gamma ** (i - tau) * rewards[i] for i in range(tau, min(tau + n, T)))
                # Bootstrap only if the n-step window ends before termination.
                if tau + n < T:
                    G += gamma ** n * Q[states[tau + n], actions[tau + n]]
                Q[states[tau], actions[tau]] += alpha * (G - Q[states[tau], actions[tau]])

            if tau == T - 1:
                break
            t += 1

        # Each reward counts exactly once; summing inside the loop would add
        # the running total again on every iteration.
        rewards_per_episode.append(sum(rewards))
    return Q, np.array(rewards_per_episode)


# Environment setup
env = RowGridWorld(n_cols=10, goal_states=[1, 4, 9], neutral_states=[2, 3, 5], reward=1, step_penalty=-0.1)
n_episodes = 500

# Run every method; only the per-episode rewards are kept
_, q_learning_rewards = q_learning(env, n_episodes)
_, sarsa_rewards = sarsa(env, n_episodes)
_, n_step_sarsa_rewards = n_step_sarsa(env, n_episodes)

# Plot the running mean of the per-episode reward for each method
episode_counts = np.arange(n_episodes) + 1
curves = (
    ('Q-Learning', q_learning_rewards),
    ('SARSA', sarsa_rewards),
    ('n-Step SARSA', n_step_sarsa_rewards),
)

plt.figure(figsize=(10, 6))
for curve_label, episode_rewards in curves:
    plt.plot(np.cumsum(episode_rewards) / episode_counts, label=curve_label)

# Figure decoration
plt.xlabel('Épisodes')
plt.ylabel('Récompense moyenne cumulative')
plt.title('Comparaison entre Q-Learning, SARSA et n-Step SARSA')
plt.legend()
plt.grid(True)
plt.show()
