# In [None]:
#---------exercice1
import gymnasium as gym
import numpy as np

# Build the Taxi-v3 environment shared by the exercises below.
env = gym.make("Taxi-v3")

# Number of discrete states and actions exposed by the environment.
state_size = env.observation_space.n
action_size = env.action_space.n

print(f"Nombre d'états : {state_size}")
print(f"Nombre d'actions : {action_size}")

# Tabular policy: each row starts as a uniform distribution over the actions.
policy_table = np.full((state_size, action_size), 1.0 / action_size)
# Tabular state-value estimates, all initialised to zero.
value_table = np.zeros(state_size)

print("Premières lignes de policy table:")
print(policy_table[:7])

print("Premières valeurs de value table :")
print(value_table[:7])

# Exercise 2: run 20 episodes with uniformly random actions, printing every
# action/reward pair and the total reward collected in each episode.
for episode in range(20):
    state, _ = env.reset()
    done = False
    total_reward = 0

    print(f"Épisode {episode + 1} :")

    while not done:
        action = env.action_space.sample()
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode is over when the task terminates OR when the environment
        # truncates it (e.g. a step limit). Ignoring `truncated` would keep the
        # random rollout stepping a finished episode until it happens to solve
        # the task.
        done = terminated or truncated
        total_reward += reward
        print(f"  Action : {action}, Récompense : {reward}")

    print(f"Récompense totale : {total_reward}\n")

# ------- Exercise 3
# Discount factor passed to compute_discounted_rewards when turning an
# episode's rewards into discounted returns.
gamma = 0.99

def compute_discounted_rewards(rewards, gamma):
    """Return the discounted return for every step of an episode.

    Walks the reward sequence from the last step to the first, accumulating
    ``G_t = rewards[t] + gamma * G_{t+1}`` (with the return after the final
    step taken as 0).

    Args:
        rewards: Sequence of per-step rewards, in time order.
        gamma: Discount factor applied to the following step's return.

    Returns:
        A float32 NumPy array shaped like ``rewards`` holding ``G_t`` per step.
    """
    returns = np.zeros_like(rewards, dtype=np.float32)
    running_return = 0
    for step in range(len(rewards) - 1, -1, -1):
        running_return = rewards[step] + gamma * running_return
        returns[step] = running_return
    return returns

def compute_advantage(discounted_rewards, value_table, episode_states):
    """Return the per-step advantage: discounted return minus the value estimate.

    Args:
        discounted_rewards: Array of discounted returns, one per episode step.
        value_table: Array of value estimates indexed by state.
        episode_states: States visited during the episode, in step order; used
            to index ``value_table``.

    Returns:
        Array with ``discounted_rewards[t] - value_table[episode_states[t]]``
        for every step ``t``.
    """
    baselines = value_table[episode_states]
    return discounted_rewards - baselines

def update_policy(policy_table, episode_states, episode_actions, advantage, lr_policy=0.1, clip_epsilon=0.2):
    """Update the tabular policy in place from one episode of experience.

    For every visited (state, action) pair, the action's probability is moved
    by ``lr_policy * advantage``. The move is bounded so the new probability
    stays within ``[1 - clip_epsilon, 1 + clip_epsilon]`` times the old one
    (and inside ``[0, 1]``), after which the whole row for that state is
    renormalised so it remains a probability distribution.

    Args:
        policy_table: 2-D array indexed as ``[state, action]``; modified in place.
        episode_states: States visited during the episode, in step order.
        episode_actions: Action taken at each step.
        advantage: Advantage estimate for each step.
        lr_policy: Step size applied to the advantage.
        clip_epsilon: Maximum relative change of a probability per update.

    Returns:
        None. ``policy_table`` is updated in place.
    """
    for state, action, adv in zip(episode_states, episode_actions, advantage):
        old_prob = policy_table[state, action]
        proposed = old_prob + lr_policy * adv

        # Bound the change relative to the old probability. Without this a
        # single large negative advantage drives the entry straight to 0, and
        # once every action of a state is 0 the row can no longer be sampled.
        lower = old_prob * (1.0 - clip_epsilon)
        upper = old_prob * (1.0 + clip_epsilon)
        new_prob = np.clip(proposed, lower, upper)
        new_prob = np.clip(new_prob, 0.0, 1.0)
        policy_table[state, action] = new_prob

        # Renormalise the full row (not just the updated entry) so it sums to 1.
        row = policy_table[state]  # view into the table; edits are in place
        row_sum = row.sum()
        if row_sum > 0:
            row /= row_sum
        else:
            # A row with no positive mass cannot be normalised; fall back to a
            # uniform distribution rather than producing NaNs.
            row[:] = 1.0 / row.size

def update_value_function(value_table, episode_states, discounted_rewards, lr_value=0.1):
    """Move each visited state's value estimate toward its observed return.

    Steps are processed in order, so a state visited several times is updated
    once per visit, each time starting from its latest estimate.

    Args:
        value_table: Array of value estimates indexed by state; modified in place.
        episode_states: States visited during the episode, in step order.
        discounted_rewards: Discounted return observed at each step.
        lr_value: Fraction of the error applied on each update.

    Returns:
        None. ``value_table`` is updated in place.
    """
    for step, visited in enumerate(episode_states):
        error = discounted_rewards[step] - value_table[visited]
        value_table[visited] += lr_value * error

# Training hyperparameters.
lr_policy = 0.1
clip_epsilon = 0.2
num_training_episodes = 500

# Train the tabular policy and value function one full episode at a time.
for episode in range(num_training_episodes):
    state, _ = env.reset()
    done = False
    episode_states = []
    episode_actions = []
    episode_rewards = []

    while not done:
        # Sample an action from the (re-normalised) policy row for this state.
        action_probs = policy_table[state] / np.sum(policy_table[state])
        action = np.random.choice(action_size, p=action_probs)
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        next_state, reward, terminated, truncated, _ = env.step(action)
        # End the episode on truncation as well as termination; ignoring
        # `truncated` lets an episode run on until the task happens to finish.
        done = terminated or truncated

        episode_states.append(state)
        episode_actions.append(action)
        episode_rewards.append(reward)

        state = next_state

    # Turn the collected rewards into returns and advantages, then update both
    # tables from this episode.
    discounted_rewards = compute_discounted_rewards(episode_rewards, gamma)
    advantage = compute_advantage(discounted_rewards, value_table, episode_states)

    update_policy(policy_table, episode_states, episode_actions, advantage, lr_policy, clip_epsilon)
    update_value_function(value_table, episode_states, discounted_rewards)

# Evaluate the trained policy greedily and report the mean episode reward.
num_eval_episodes = 20
total_rewards = []

for ep in range(num_eval_episodes):
    state, _ = env.reset()
    done = False
    total_reward = 0

    while not done:
        # Greedy action: the most probable action for the current state.
        action = np.argmax(policy_table[state])
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        next_state, reward, terminated, truncated, _ = env.step(action)
        # A deterministic greedy policy that never completes the task would
        # loop forever if only `terminated` were checked; stop on truncation too.
        done = terminated or truncated
        total_reward += reward
        state = next_state

    total_rewards.append(total_reward)

print(f"Récompenses moyennes après entraînement : {np.mean(total_rewards)}")
