In [13]:
import gymnasium as gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
import numpy as np

env = gym.make('CartPole-v1')

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [14]:
epsilon = 0.99
epsilon_decay = 0.99

gamma = 0.7

num_observations = 4
num_hidden = 128
num_actions = 2

batch_size = 128
max_memory_length = 1000

def create_q_model():
    observations = layers.Input(shape=(num_observations,))
    hidden = layers.Dense(num_hidden, activation='relu')(observations)
    action = layers.Dense(num_actions, activation='softmax')(hidden)
    
    return keras.Model(inputs=observations, outputs=action)

policy_net = create_q_model()
target_net = create_q_model()

In [15]:
action_history = []
state_history = []
state_next_history = []
rewards_history = []
done_history = []
episode_reward_history = []
running_reward = 0
episode_count = 0

optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)
loss_function = keras.losses.Huber()

In [None]:
for i in range(100): 
    state = np.array(env.reset()[0])
    episode_reward = 0

    for timestep in range(1, 1000):
        if epsilon > np.random.rand(1)[0]:
            action = np.random.choice(num_actions)
        else:
            state_tensor = tf.convert_to_tensor(state)
            state_tensor = tf.expand_dims(state_tensor, 0)
            action_probs = policy_net(state_tensor, training=False)
            action = tf.argmax(action_probs[0]).numpy()

        state_next, reward, done, _, _ = env.step(action)
        state_next = np.array(state_next)
            
        epsilon *= epsilon_decay
        episode_reward += reward

        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        done_history.append(done)
        rewards_history.append(reward)
        state = state_next

        if len(done_history) > batch_size:

            indices = np.random.choice(range(len(done_history)), size=batch_size)

            state_sample = np.array([state_history[i] for i in indices])
            state_next_sample = np.array([state_next_history[i] for i in indices])
            rewards_sample = [rewards_history[i] for i in indices]
            action_sample = [action_history[i] for i in indices]
            done_sample = tf.convert_to_tensor(
                [float(done_history[i]) for i in indices]
            )

            future_rewards = target_net.predict(state_next_sample)
            updated_q_values = rewards_sample + gamma * tf.reduce_max(
                future_rewards, axis=1
            )

            updated_q_values = updated_q_values * (1 - done_sample) - done_sample
            print(updated_q_values)
            masks = tf.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                q_values = policy_net(state_sample)
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)

            grads = tape.gradient(loss, policy_net.trainable_variables)
            optimizer.apply_gradients(zip(grads, policy_net.trainable_variables))

        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        if done:
            break
            
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    episode_count += 1

    print('Running reward: ' + str(running_reward), 'Episode reward: ', str(episode_reward))

    if running_reward > 40:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break

tf.Tensor(
[ 1.4900886  1.4426911  1.4948645  1.3864279  1.366649   1.4910412
  1.3671664  1.3662602  1.36698    1.3686876 -1.         1.400507
  1.4377506  1.4911051  1.364918   1.369628   1.4224218  1.4175632
  1.3998951 -1.         1.4153161  1.3877199  1.3652467  1.4229398
  1.4726344  1.4043983  1.4156219  1.4961861  1.3877199  1.4715548
  1.4400971  1.507839   1.422745  -1.         1.4028072  1.3851107
  1.4011304  1.509972   1.3679314  1.4912407 -1.         1.4189386
  1.4432662  1.3662602  1.4544164  1.4021497  1.4758928  1.3850571
  1.364918   1.4200125 -1.         1.475091   1.4216025  1.4432662
  1.4066635  1.4249456  1.4528916  1.3657373 -1.        -1.
  1.43567    1.3848495  1.405277   1.4701781  1.4149067  1.475091
  1.3685797  1.4224218  1.492044   1.4699278 -1.         1.4175546
  1.4527692  1.4013834  1.4759367  1.3691261  1.4242532  1.4398009
  1.3835313  1.395894   1.3815461  1.4060144  1.3658928  1.455249
  1.4013834  1.4749389  1.4549538  1.4181136  1.367128   1.36