<a href="https://colab.research.google.com/github/latifaja/tes/blob/master/Untitled4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import random
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Game and training hyper-parameters
GRID_SIZE = 4
STATE_SIZE = GRID_SIZE ** 2  # one input per grid cell
ACTION_SIZE = 4  # up, down, left, right
GAMMA = 0.9  # discount factor
LEARNING_RATE = 0.01
EPSILON = 1.0  # initial exploration rate
EPSILON_MIN = 0.01
EPSILON_DECAY = 0.995
BATCH_SIZE = 32
MEMORY_SIZE = 2000
EPISODES = 1000
TARGET_UPDATE_FREQ = 10  # how often (in episodes) the target network is refreshed

# Position offsets keyed by action index: 0 = up, 1 = down, 2 = left, 3 = right
MOVES = dict(enumerate([
    (-1, 0),  # up
    (1, 0),   # down
    (0, -1),  # left
    (0, 1),   # right
]))

class GridWorld:
    """Square grid-world environment (4x4 with the default settings).

    The agent starts at position (0, 0) and must reach the goal in the
    opposite corner. A single fixed obstacle cell gives a larger penalty
    when entered, but neither blocks the move nor ends the episode.

    Args:
        grid_size: Side length of the grid. When omitted, the module-level
            ``GRID_SIZE`` is used, so ``GridWorld()`` behaves as before.
    """

    def __init__(self, grid_size=None):
        # Resolve the default at call time and use the stored value
        # everywhere, so the size, the goal and the state vector always agree.
        self.grid_size = GRID_SIZE if grid_size is None else grid_size
        self.reset()

    def reset(self):
        """Move the agent back to the start cell and return the initial state."""
        self.agent_pos = (0, 0)
        # The goal is always the corner opposite the start, whatever the size.
        last = self.grid_size - 1
        self.goal_pos = (last, last)
        self.obstacle_pos = (1, 1)
        return self.get_state()

    def get_state(self):
        """Return the state as a flattened one-hot vector of the agent position."""
        state = np.zeros((self.grid_size, self.grid_size))
        state[self.agent_pos] = 1
        return state.flatten()

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        Args:
            action: Key into ``MOVES`` selecting the position offset.

        Returns:
            A tuple of the new flattened state, the reward (10 on the goal,
            -5 on the obstacle, -1 otherwise) and a flag that is True only
            when the goal has been reached.

        Raises:
            KeyError: If ``action`` is not a key of ``MOVES``.
        """
        x, y = self.agent_pos
        dx, dy = MOVES[action]
        new_x, new_y = x + dx, y + dy

        # A move that would leave the grid is ignored: the agent stays put
        # but still receives the reward of the cell it is standing on.
        if 0 <= new_x < self.grid_size and 0 <= new_y < self.grid_size:
            self.agent_pos = (new_x, new_y)

        if self.agent_pos == self.goal_pos:
            return self.get_state(), 10, True
        if self.agent_pos == self.obstacle_pos:
            return self.get_state(), -5, False
        return self.get_state(), -1, False

class DoubleDQNAgent:
    """Double DQN agent with an online network and a target network.

    The online network (``model``) is trained and used to pick actions; the
    target network (``target_model``) is a periodically refreshed copy used
    to evaluate the value of the action picked for the next state.
    """

    def __init__(self):
        self.state_size = STATE_SIZE
        self.action_size = ACTION_SIZE
        # Bounded replay buffer: once full, the oldest experience is dropped.
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.epsilon = EPSILON
        self.model = self._build_model()  # online network
        self.target_model = self._build_model()  # target network
        self.update_target_model()  # start with both networks identical

    def _build_model(self):
        """Build and compile the Q-network (state vector in, one Q-value per action out)."""
        model = Sequential([
            Dense(24, activation='relu', input_shape=(self.state_size,)),
            Dense(24, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=LEARNING_RATE))
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` experience to memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action with an epsilon-greedy policy.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the highest Q-value predicted by the online network.
        """
        if random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(np.array([state]), verbose=0)
        # Return a plain int so both branches yield the same type.
        return int(np.argmax(q_values[0]))

    def replay(self):
        """Train the online network on one random mini-batch from memory.

        Does nothing until at least ``BATCH_SIZE`` experiences are stored.
        After the training step, ``epsilon`` is decayed while it is still
        above ``EPSILON_MIN``.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        batch = random.sample(self.memory, BATCH_SIZE)
        states, actions, rewards, next_states, dones = zip(*batch)
        states = np.array(states, dtype=np.float64)
        actions = np.array(actions, dtype=np.intp)
        rewards = np.array(rewards, dtype=np.float64)
        next_states = np.array(next_states, dtype=np.float64)
        dones = np.array(dones, dtype=bool)

        # Three batched forward passes instead of up to three single-sample
        # predict calls per experience: same values, far fewer Keras calls.
        targets = np.array(self.model.predict(states, verbose=0), dtype=np.float64)
        online_next_q = self.model.predict(next_states, verbose=0)
        target_next_q = self.target_model.predict(next_states, verbose=0)

        rows = np.arange(BATCH_SIZE)
        # Double DQN: the online network selects the next action ...
        best_next_actions = np.argmax(online_next_q, axis=1)
        # ... and the target network evaluates it.
        bootstrap = target_next_q[rows, best_next_actions]
        # Terminal transitions have no future value, so their target is the
        # bare reward. Only the Q-value of the action actually taken changes;
        # the other outputs keep the current prediction (zero loss for them).
        targets[rows, actions] = np.where(
            dones, rewards, rewards + GAMMA * bootstrap
        )

        self.model.fit(states, targets, epochs=1, verbose=0)

        if self.epsilon > EPSILON_MIN:
            self.epsilon *= EPSILON_DECAY



In [2]:
# Train the agent
env = GridWorld()
agent = DoubleDQNAgent()

for episode in range(EPISODES):
    state, total_reward = env.reset(), 0

    # Each episode is capped at 50 moves
    for step in range(50):
        action = agent.act(state)
        next_state, reward, done = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        total_reward += reward
        state = next_state

        if done:
            break

    # One training pass on a replay mini-batch per episode
    agent.replay()

    # Periodically sync the target network with the online network
    if not episode % TARGET_UPDATE_FREQ:
        agent.update_target_model()

    print(f"Episode {episode+1}/{EPISODES}, Score: {total_reward}, Epsilon: {agent.epsilon:.4f}")

# Save the trained online model
agent.model.save("double_dqn_model.keras")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Episode 1/1000, Score: -74, Epsilon: 0.9950
Episode 2/1000, Score: -54, Epsilon: 0.9900
Episode 3/1000, Score: 4, Epsilon: 0.9851
Episode 4/1000, Score: -37, Epsilon: 0.9801
Episode 5/1000, Score: -74, Epsilon: 0.9752
Episode 6/1000, Score: -12, Epsilon: 0.9704
Episode 7/1000, Score: -23, Epsilon: 0.9655
Episode 8/1000, Score: -11, Epsilon: 0.9607
Episode 9/1000, Score: -16, Epsilon: 0.9559
Episode 10/1000, Score: -58, Epsilon: 0.9511
Episode 11/1000, Score: -54, Epsilon: 0.9464
Episode 12/1000, Score: -58, Epsilon: 0.9416
Episode 13/1000, Score: -54, Epsilon: 0.9369
Episode 14/1000, Score: -28, Epsilon: 0.9322
Episode 15/1000, Score: -58, Epsilon: 0.9276
Episode 16/1000, Score: -66, Epsilon: 0.9229
Episode 17/1000, Score: -1, Epsilon: 0.9183
Episode 18/1000, Score: -58, Epsilon: 0.9137
Episode 19/1000, Score: -21, Epsilon: 0.9092
Episode 20/1000, Score: 0, Epsilon: 0.9046
Episode 21/1000, Score: -58, Epsilon: 0.9001
Episode 22/1000, Score: -9, Epsilon: 0.8956
Episode 23/1000, Score: -