In [None]:
import gym
import numpy as np
import tensorflow as tf
from collections import deque
import random

# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------

# Training schedule
EPISODES = 1_000
BATCH_SIZE = 32
MEMORY_SIZE = 2_000

# Optimisation
ALPHA = 1e-3  # Learning rate passed to the Adam optimizer
GAMMA = 0.99  # Discount factor applied to the bootstrapped next-state value

# Epsilon-greedy exploration: EPSILON is multiplied by EPSILON_DECAY by the
# training loop while it is still above EPSILON_MIN.
EPSILON = 1.0
EPSILON_MIN = 0.01
EPSILON_DECAY = 0.995

# DQL Agent
class DQLAgent:
    """Deep Q-learning agent with an experience-replay buffer.

    The agent approximates Q(s, a) with a small fully connected Keras network
    and picks actions epsilon-greedily using the module-level ``EPSILON``.

    Attributes:
        state_size: Number of features in one observation.
        action_size: Number of discrete actions (size of the network output).
        memory: Bounded deque of ``(state, action, reward, next_state, done)``
            tuples; the oldest transitions are dropped past ``MEMORY_SIZE``.
        model: Compiled Keras model mapping a batch of states to Q-values.
    """

    def __init__(self, state_size, action_size):
        """Create the replay buffer and build the Q-network."""
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.model = self.build_model()

    def build_model(self):
        """Build and compile the Q-network.

        Returns:
            A ``tf.keras.Sequential`` model with two hidden ReLU layers of 24
            units and a linear output of ``action_size`` Q-values, compiled
            with MSE loss and Adam at learning rate ``ALPHA``.
        """
        model = tf.keras.Sequential()
        model.add(tf.keras.Input(shape=(self.state_size,)))  # Input shape for state
        model.add(tf.keras.layers.Dense(24, activation='relu'))
        model.add(tf.keras.layers.Dense(24, activation='relu'))
        model.add(tf.keras.layers.Dense(self.action_size, activation='linear'))  # Output size equal to action space
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=ALPHA))  # Adam optimizer
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Observation shaped ``(1, state_size)``.

        Returns:
            The chosen action index as a plain ``int``.
        """
        if np.random.rand() <= EPSILON:
            return random.randrange(self.action_size)  # Explore action space
        # verbose=0: predict() otherwise prints a progress bar on every call.
        act_values = self.model.predict(state, verbose=0)
        return int(np.argmax(act_values[0]))  # Action with the highest Q-value

    def replay(self):
        """Train the Q-network on one random minibatch from the buffer.

        Does nothing until at least ``BATCH_SIZE`` transitions are stored.
        All Q-values for the minibatch are computed with two batched forward
        passes, and the network is updated with a single minibatch gradient
        step, instead of running several Keras calls per sampled transition.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        minibatch = random.sample(self.memory, BATCH_SIZE)  # Sample from memory
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Stored states are (1, state_size) rows; stack them into a batch.
        states = np.vstack(states)
        next_states = np.vstack(next_states)
        actions = np.asarray(actions, dtype=np.intp)
        rewards = np.asarray(rewards, dtype=np.float32)
        dones = np.asarray(dones, dtype=bool)

        # Copy so the prediction buffer can be edited in place as the target.
        targets = np.array(self.model.predict(states, verbose=0))
        next_q_max = self.model.predict(next_states, verbose=0).max(axis=1)

        # Bellman target: r for terminal transitions, r + gamma * max_a' Q(s', a')
        # otherwise. Only the taken action's Q-value is changed, so the loss
        # on the other outputs is zero.
        targets[np.arange(BATCH_SIZE), actions] = np.where(
            dones, rewards, rewards + GAMMA * next_q_max
        )
        self.model.fit(states, targets, batch_size=BATCH_SIZE, epochs=1, verbose=0)  # Train model

# Main function
if __name__ == "__main__":
    # Train a DQLAgent on Acrobot-v1 for EPISODES episodes.
    env = gym.make('Acrobot-v1')  # Create Acrobot environment

    # Read the sizes from the environment rather than hard-coding them, so
    # the script keeps working if the environment id is changed.
    agent = DQLAgent(
        state_size=int(env.observation_space.shape[0]),
        action_size=int(env.action_space.n),
    )

    try:
        for e in range(EPISODES):
            # Newer gym/gymnasium returns (observation, info); legacy gym
            # returns the observation alone.
            reset_result = env.reset()
            state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
            state = np.reshape(state, [1, agent.state_size])  # Reshape for model input

            for step in range(500):
                action = agent.act(state)  # Select action
                step_result = env.step(action)  # Execute action

                if len(step_result) == 5:
                    # Current API: (obs, reward, terminated, truncated, info)
                    next_state, reward, terminated, truncated, _ = step_result
                elif len(step_result) == 4:
                    # Legacy API: (obs, reward, done, info); a time-limit
                    # cut-off is flagged through info["TimeLimit.truncated"].
                    next_state, reward, done, info = step_result
                    truncated = bool(info.get("TimeLimit.truncated", False))
                    terminated = bool(done) and not truncated
                else:
                    raise ValueError(f"Unexpected number of values returned: {len(step_result)}")

                # The environment reward is stored unchanged. In Acrobot,
                # termination means the goal was reached, so it must not be
                # penalised.
                next_state = np.reshape(next_state, [1, agent.state_size])  # Reshape next state
                # Store `terminated`, not `truncated`: a time-limit cut-off is
                # not a terminal state, so its target should still bootstrap.
                agent.remember(state, action, reward, next_state, terminated)
                state = next_state  # Transition to next state

                if terminated or truncated:
                    break

            # Report every episode, including those that hit the step limit.
            print(f"Episode: {e}/{EPISODES}, Score: {step}")

            agent.replay()  # Train the agent using experience replay
            if EPSILON > EPSILON_MIN:
                EPSILON *= EPSILON_DECAY  # Decay epsilon for exploration
    finally:
        # Release the environment even if training is interrupted.
        env.close()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        2.0079925 ], dtype=float32), -1.0, False, False, {})
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Next state info: (array([ 0.62928325, -0.777176  ,  0.51082194,  0.85968655, -0.97432214,
        0.86910814], dtype=float32), -1.0, False, False, {})
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Next state info: (array([ 0.5514802 , -0.834188  ,  0.45927086,  0.88829625,  0.01962749,
       -0.27292424], dtype=float32), -1.0, False, False, {})
Next state info: (array([ 0.64375556, -0.7652312 ,  0.6236167 ,  0.7817303 ,  1.1226202 ,
       -1.6906481 ], dtype=float32), -1.0, False, False, {})
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Next state info: (array([ 0.8473724 , -0.530999  ,  0.89897215,  0.4380058 ,  1.9426038 ,
       -2.6890743 ], dtype=float32), -1.0, False, False, {})
Next state info: (array([ 0.99489814, -0.10088461,  0.

KeyboardInterrupt: 