In [None]:
import gymnasium as gym
import numpy as np


# Fix the random seeds so that a Restart & Run All reproduces the same run:
# - NumPy's global generator is what `np.random.choice` draws from when
#   sampling actions;
# - passing a seed to the first reset seeds the environment's own generator,
#   which later unseeded `reset()` calls keep using.
SEED = 42
np.random.seed(SEED)

env = gym.make("Taxi-v3")
_ = env.reset(seed=SEED)

In [2]:
# Sizes of the discrete state and action spaces reported by the environment.
num_states, num_actions = env.observation_space.n, env.action_space.n

print(
    f"le nombre des états est {num_states}",
    f"le nombre des actions est {num_actions}",
    sep="\n",
)

le nombre des états est 500
le nombre des actions est 6


In [3]:

# Hyperparameters of the training run.
gamma = 0.99          # discount factor applied to future rewards
learning_rate = 0.1   # step size shared by the policy and value updates
clip_epsilon = 0.2    # bound used when clipping the advantage
episodes = 5000       # number of training episodes

# Uniform initial policy: one row per state, one probability per action.
policy_table = np.full((num_states, num_actions), 1.0 / num_actions)

# State-value estimates, all starting at zero.
value_table = np.zeros(num_states, dtype=float)


Fonction pour choisir une action selon la politique

In [4]:

def choisir_action(state):
    """Sample an action index for `state` from the current tabular policy.

    The row `policy_table[state]` is used as the probability vector over the
    action indices `0 .. num_actions - 1`.

    Args:
        state: Index of the row to read in `policy_table`.

    Returns:
        The sampled action index.
    """
    action_ids = np.arange(num_actions)
    return np.random.choice(action_ids, p=policy_table[state])


Fonction pour calculer les récompenses cumulées

In [5]:
def calcul_recompenses_cumulees(rewards, gamma):
    """Compute the discounted return that follows each step of an episode.

    The return at step t is `rewards[t] + gamma * return[t + 1]`, with the
    return after the last step taken as 0.

    Args:
        rewards: Sequence of per-step rewards, in chronological order.
        gamma: Discount factor applied to the return of the following step.

    Returns:
        A NumPy array with the same length as `rewards` holding the
        discounted return from each step onward.
    """
    horizon = len(rewards)
    returns = np.zeros(horizon)
    running = 0
    # Walk backwards so each return can reuse the one computed just after it.
    for idx in range(horizon - 1, -1, -1):
        running = rewards[idx] + gamma * running
        returns[idx] = running
    return returns


Boucle d'entraînement PPO

In [None]:
# Training loop: for every episode, roll out the current policy, then update
# the policy and the value table from that episode's returns.
# The rollout and the update steps are kept in a single loop body (one cell):
# when they sat in separate cells, the update steps were not part of the
# `for episode` loop.
for episode in range(episodes):
    state, _ = env.reset()
    done = False

    episode_states = []
    episode_actions = []
    episode_rewards = []

    # --- Rollout with the current policy ---
    while not done:
        action = choisir_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode is over when the environment reports either a terminal
        # state or a truncation; stepping past either signal is not supported.
        done = terminated or truncated

        episode_states.append(state)
        episode_actions.append(action)
        episode_rewards.append(reward)

        state = next_state

    # --- Discounted returns ---
    discounted_rewards = calcul_recompenses_cumulees(episode_rewards, gamma)

    # --- Advantage: return minus the current value estimate of the state ---
    advantages = discounted_rewards - value_table[episode_states]

    # --- Policy update with clipping ---
    # NOTE(review): this clips the advantage itself to
    # [-clip_epsilon, clip_epsilon]; it does not clip a probability ratio.
    for s, a, A in zip(episode_states, episode_actions, advantages):
        clipped_A = np.clip(A, -clip_epsilon, clip_epsilon)

        policy_table[s] *= (1 - learning_rate)
        policy_table[s][a] += learning_rate * clipped_A

        # Keep every entry strictly positive, then renormalise the row so it
        # remains a valid probability vector for np.random.choice.
        policy_table[s] = np.clip(policy_table[s], 1e-5, 1.0)
        policy_table[s] /= np.sum(policy_table[s])

    # --- Value update: move V(s) toward the observed return ---
    for s, R in zip(episode_states, discounted_rewards):
        value_table[s] += learning_rate * (R - value_table[s])

    if episode % 500 == 0:
        print(f"✅ Épisode {episode} terminé")

# Release the training environment.
# NOTE(review): code that steps an environment after this point should create
# a new one rather than reuse `env`.
env.close()


Initialiser la Q-table

In [None]:
# Zero-filled table with one row per state and one column per action.
# NOTE(review): nothing in the visible notebook writes to this table afterwards,
# so it stays all zeros — confirm whether a Q-learning update is missing.
q_table = np.zeros(shape=(env.observation_space.n, env.action_space.n))

Boucle d'évaluation

In [None]:

# Greedy evaluation of the learned policy: in each state, play the action
# with the highest probability in `policy_table`.
test_episodes = 20
successes = 0

# Use a fresh environment: the training environment was closed above.
eval_env = gym.make("Taxi-v3")

for episode in range(test_episodes):
    state, _ = eval_env.reset()
    done = False
    terminated = False
    total_reward = 0
    steps = 0

    while not done:
        action = int(np.argmax(policy_table[state]))
        state, reward, terminated, truncated, _ = eval_env.step(action)
        # Stop on truncation as well: a greedy policy can loop forever in a
        # state it never leaves, and only truncation ends such an episode.
        done = terminated or truncated
        total_reward += reward
        steps += 1

    # Per the Taxi-v3 documentation, an episode terminates only when the
    # passenger is dropped off, and every other step costs at least -1, so the
    # total reward of a successful episode stays below 20. Success is therefore
    # read from `terminated` rather than from a reward threshold.
    success = terminated
    if success:
        successes += 1

    # Computed outside the f-string: reusing the enclosing quote character
    # inside an f-string is a SyntaxError before Python 3.12.
    outcome = "yes" if success else "no"
    print(f"Épisode {episode+1}: Récompense totale = {total_reward}, Succès = {outcome}")


success_rate = successes / test_episodes
print(f"\n Taux de réussite de l'agent sur {test_episodes} épisodes : {success_rate*100:.2f}%")

eval_env.close()
