In [None]:
import gym
import numpy as np
import tensorflow as tf
from collections import deque
import random

# Hyperparameters (module-level constants shared by the agent and the training loop)
EPISODES = 1_000       # Number of training episodes
ALPHA = 1e-3           # Learning rate passed to the Adam optimizer
GAMMA = 0.99           # Discount factor applied to the bootstrapped next-state value
EPSILON = 1.0          # Initial exploration rate (decayed by the training loop)
EPSILON_MIN = 0.01     # Once EPSILON is at or below this value it is no longer decayed
EPSILON_DECAY = 0.995  # Multiplicative decay applied to EPSILON after each episode
BATCH_SIZE = 32        # Number of transitions sampled per replay call
MEMORY_SIZE = 2_000    # Maximum number of transitions kept in the replay buffer

# DQL Agent
class DQLAgent:
    """Deep Q-learning agent for a single continuous action in [-1, 1].

    Q-learning needs one Q-value per candidate action so that it can take an
    arg-max. The continuous range [-1, 1] is therefore discretised into
    ``num_action_bins`` evenly spaced levels and the network predicts one
    Q-value per level. ``act`` still returns a plain float in [-1, 1], so
    callers that wrap the result in an array and pass it to the environment
    keep working unchanged.

    The exploration rate is read from the module-level ``EPSILON`` so that the
    training loop can decay it.
    """

    def __init__(self, state_size, action_size, num_action_bins=11):
        """Create the replay buffer and the Q-network.

        Args:
            state_size: Number of features in one observation.
            action_size: Dimensionality of the environment's action vector.
                Stored for callers; only a single scalar action is produced.
            num_action_bins: Number of discrete action levels spread evenly
                over [-1, 1]. Must be at least 2.

        Raises:
            ValueError: If ``num_action_bins`` is smaller than 2.
        """
        if num_action_bins < 2:
            raise ValueError("num_action_bins must be at least 2")
        self.state_size = state_size
        self.action_size = action_size
        # Candidate actions; index i of the network output is Q(s, action_values[i]).
        self.action_values = np.linspace(-1.0, 1.0, num_action_bins)
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.model = self.build_model()

    def build_model(self):
        """Build and compile the Q-network (state -> one Q-value per action level)."""
        model = tf.keras.Sequential()
        model.add(tf.keras.Input(shape=(self.state_size,)))  # Input shape for state
        model.add(tf.keras.layers.Dense(24, activation='relu'))
        model.add(tf.keras.layers.Dense(24, activation='relu'))
        # Linear head: unbounded Q-value estimate for every discrete action level.
        model.add(tf.keras.layers.Dense(len(self.action_values), activation='linear'))
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=ALPHA))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer (oldest entries are evicted)."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Observation shaped ``(1, state_size)``.

        Returns:
            A float in [-1, 1]: a random action level with probability
            ``EPSILON``, otherwise the level with the highest predicted Q-value.
        """
        if np.random.rand() <= EPSILON:
            return float(np.random.choice(self.action_values))  # Explore
        # verbose=0 stops Keras from printing a progress bar on every step.
        q_values = self.model.predict(state, verbose=0)[0]
        return float(self.action_values[int(np.argmax(q_values))])  # Exploit

    def replay(self):
        """Train the Q-network on one random minibatch from the replay buffer.

        Does nothing until at least ``BATCH_SIZE`` transitions are stored.
        The whole minibatch is evaluated with two batched predictions and
        fitted with a single gradient step instead of per-sample Keras calls.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        minibatch = random.sample(self.memory, BATCH_SIZE)  # Sample from memory

        states = np.vstack([np.reshape(t[0], (1, -1)) for t in minibatch])
        # Actions may have been stored as scalars or 1-element arrays.
        actions = np.array([np.ravel(t[1])[0] for t in minibatch], dtype=float)
        rewards = np.array([t[2] for t in minibatch], dtype=float)
        next_states = np.vstack([np.reshape(t[3], (1, -1)) for t in minibatch])
        dones = np.array([bool(t[4]) for t in minibatch], dtype=bool)

        q_current = self.model.predict(states, verbose=0)
        q_next = self.model.predict(next_states, verbose=0)

        # Bellman target: r for terminal transitions, r + gamma * max_a' Q(s', a') otherwise.
        targets = np.where(dones, rewards, rewards + GAMMA * np.max(q_next, axis=1))

        # Map each stored action back to the nearest discrete level, then
        # overwrite only that level's Q-value; the other outputs keep their
        # current prediction and so contribute zero loss.
        action_idx = np.abs(actions[:, None] - self.action_values[None, :]).argmin(axis=1)
        q_current[np.arange(len(minibatch)), action_idx] = targets

        self.model.fit(states, q_current, batch_size=BATCH_SIZE, epochs=1, verbose=0)

# Main function
if __name__ == "__main__":
    # Train a DQLAgent on MountainCarContinuous for EPISODES episodes.
    env = gym.make('MountainCarContinuous-v0')  # Create MountainCarContinuous environment
    agent = DQLAgent(state_size=2, action_size=1)  # Initialize agent with state and action sizes

    try:
        for e in range(EPISODES):
            # gym >= 0.26 returns (observation, info) from reset(); older
            # versions return the observation alone. Accept both.
            reset_result = env.reset()
            state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
            state = np.reshape(state, [1, agent.state_size])  # Reshape for model input
            done = False

            # "step" rather than "time" so the stdlib module name is not shadowed.
            for step in range(500):
                action = agent.act(state)  # Select action
                action = np.array([action])  # The environment expects a 1-D action array

                # gym >= 0.26 returns (obs, reward, terminated, truncated, info);
                # older versions return (obs, reward, done, info).
                step_result = env.step(action)
                if len(step_result) == 5:
                    next_state, reward, terminated, truncated, _ = step_result
                    terminal = bool(terminated)
                    done = bool(terminated or truncated)
                else:
                    next_state, reward, done, _ = step_result
                    terminal = done = bool(done)

                # The environment reward is used as-is: in MountainCarContinuous an
                # episode terminates when the car reaches the goal, so replacing the
                # terminal reward with a penalty would punish success.
                next_state = np.reshape(next_state, [1, agent.state_size])  # Reshape next state
                # Store the true terminal flag so time-limit truncation does not
                # suppress bootstrapping from the next state.
                agent.remember(state, action, reward, next_state, terminal)
                state = next_state  # Transition to next state

                if done:
                    print(f"Episode: {e}/{EPISODES}, Score: {step}")
                    break

            # NOTE(review): replay runs once per episode, as before; calling it
            # every step is the usual DQN schedule but costs far more compute.
            agent.replay()  # Train the agent using experience replay
            if EPSILON > EPSILON_MIN:
                EPSILON *= EPSILON_DECAY  # Decay epsilon for exploration
    finally:
        # Always release the environment, including on KeyboardInterrupt.
        env.close()


  if not isinstance(terminated, (bool, np.bool8)):


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step

KeyboardInterrupt: 