In [None]:
import math
import random
from collections import deque
from itertools import count

import gymnasium as gym
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Environment the agent interacts with.
env = gym.make("CartPole-v1")

# Matplotlib setup: when the active backend name contains "inline",
# IPython's display module is imported as well.
is_ipython = "inline" in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Switch pyplot to interactive mode.
plt.ion()

In [None]:
def DQN(n_inputs=None, n_outputs=None, n_hidden=128):
    """Build the Q-network: one hidden ReLU layer followed by a linear head.

    The output layer is linear on purpose. Each output is the estimated
    action value Q(s, a), which is trained towards targets of the form
    `reward + gamma * max_a' Q(s', a')`. A softmax head would confine the
    outputs to (0, 1) and make those targets unreachable.

    Args:
        n_inputs: Size of the observation vector. Defaults to the
            module-level `n_observations` when omitted.
        n_outputs: Number of discrete actions (one Q-value per action).
            Defaults to the module-level `n_actions` when omitted.
        n_hidden: Width of the hidden layer.

    Returns:
        An uncompiled `keras.Model` mapping a batch of observations with
        shape `(batch, n_inputs)` to Q-values with shape `(batch, n_outputs)`.
    """
    # Fall back to the notebook globals so existing `DQN()` calls keep working.
    if n_inputs is None:
        n_inputs = n_observations
    if n_outputs is None:
        n_outputs = n_actions

    inputs = layers.Input(shape=(n_inputs,))
    hidden = layers.Dense(n_hidden, activation='relu')(inputs)
    # Linear activation: Q-values are unbounded regression outputs.
    outputs = layers.Dense(n_outputs, activation='linear')(hidden)

    return keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Reset once to get a sample observation, then read the two sizes the
# Q-network needs: observation length and number of discrete actions.
state, info = env.reset()
n_observations, n_actions = len(state), env.action_space.n

In [None]:
# --- Hyperparameters ---------------------------------------------------------
batch_size = 32
gamma = 0.99                 # discount factor for future rewards
epsilon = 0.9                # initial probability of taking a random action
epsilon_min = 0.05           # exploration floor; epsilon never decays below this
epsilon_decay = 0.99         # multiplicative decay applied after every step
learning_rate_adam = 1e-4
memory_capacity = 10_000     # max transitions kept in the replay buffer
max_episodes = 500           # bounds training so the cell terminates
max_steps_per_episode = 1000

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate_adam, amsgrad=True)
loss_function = keras.losses.Huber()

model = DQN()
# Bounded replay buffer: once full, the oldest transition is dropped
# automatically on append.
memory = deque(maxlen=memory_capacity)
episode_rewards = []         # total reward per finished episode

for episode in range(1, max_episodes + 1):
    state = env.reset()[0]
    episode_reward = 0

    for timestep in range(1, max_steps_per_episode):
        # Epsilon-greedy action selection.
        if epsilon > np.random.rand():
            action = int(np.random.choice(n_actions))
        else:
            state_tensor = tf.convert_to_tensor(state, dtype=tf.float32)
            state_tensor = tf.expand_dims(state_tensor, 0)
            q_estimates = model(state_tensor, training=False)
            action = int(tf.argmax(q_estimates[0]).numpy())

        epsilon = max(epsilon_min, epsilon * epsilon_decay)

        # Gymnasium reports two separate end-of-episode signals:
        # `terminated` (the MDP reached a terminal state) and `truncated`
        # (e.g. a time limit was hit). Either one ends the episode.
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Store `terminated`, not `done`: a truncated episode was cut off
        # rather than finished, so its last transition should still bootstrap.
        memory.append((state, next_state, action, reward, terminated))
        state = np.array(next_state)
        episode_reward += reward

        if len(memory) >= batch_size:
            # Sample a minibatch of distinct transitions.
            indices = np.random.choice(len(memory), size=batch_size, replace=False)
            memory_sample = [memory[i] for i in indices]

            state_batch = np.array([mem[0] for mem in memory_sample], dtype=np.float32)
            next_state_batch = np.array([mem[1] for mem in memory_sample], dtype=np.float32)
            action_batch = np.array([mem[2] for mem in memory_sample], dtype=np.int32)
            reward_batch = tf.convert_to_tensor(
                [mem[3] for mem in memory_sample], dtype=tf.float32
            )
            terminal_batch = tf.convert_to_tensor(
                [float(mem[4]) for mem in memory_sample], dtype=tf.float32
            )

            # TD target, computed outside the tape so no gradient flows through
            # it. Terminal transitions get no bootstrapped future value.
            # Calling the model directly avoids `predict`'s per-call overhead.
            future_q = model(next_state_batch, training=False)
            updated_q_values = reward_batch + gamma * tf.reduce_max(
                future_q, axis=1
            ) * (1.0 - terminal_batch)

            # One-hot mask selecting the Q-value of the action actually taken.
            masks = tf.one_hot(action_batch, n_actions)

            with tf.GradientTape() as tape:
                q_values = model(state_batch, training=True)
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)

            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if done:
            break

    episode_rewards.append(episode_reward)
    print(episode_reward, 'done')
    