In [1]:
import gymnasium as gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
import numpy as np

# Create the CartPole-v1 environment that every later cell steps through.
env = gym.make('CartPole-v1')

# Set up matplotlib: detect an inline (notebook) backend by looking for
# 'inline' in the backend name; only then is IPython's display module imported.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Turn on matplotlib's interactive mode.
plt.ion()

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
def create_model(num_observations, num_hidden, num_actions, output_activation='softmax'):
    """Build a one-hidden-layer fully connected network over the observation vector.

    Args:
        num_observations: Length of the flat observation vector fed to the model.
        num_hidden: Number of units in the single ReLU hidden layer.
        num_actions: Number of output units (one per discrete action).
        output_activation: Activation applied to the output layer. Defaults to
            'softmax' (the original behaviour), which forces the outputs to be
            positive and sum to 1. When the outputs are used as Q-value
            estimates with a regression loss, pass 'linear' (or None) so the
            network can represent unbounded returns.

    Returns:
        An uncompiled ``keras.Model`` mapping a ``(batch, num_observations)``
        input to a ``(batch, num_actions)`` output.
    """
    observations = layers.Input(shape=(num_observations,))
    hidden = layers.Dense(num_hidden, activation='relu')(observations)
    action = layers.Dense(num_actions, activation=output_activation)(hidden)

    return keras.Model(inputs=observations, outputs=action)

In [3]:
# Size the network from the environment instead of hard-coding 4 and 2, so the
# model stays consistent with whatever `env` was created above. The values are
# cast to built-in int because the space attributes may be NumPy integers.
model = create_model(
    num_observations=int(env.observation_space.shape[0]),
    num_hidden=128,
    num_actions=int(env.action_space.n),
)

In [4]:
# Begin an episode; reset() returns a pair and only its first element
# (the observation) is kept as the current state.
state = np.array(env.reset()[0])

# Insert a leading batch axis so the single observation has shape (1, n),
# then run one inference-mode forward pass to sanity-check the model output.
state_tensor = tf.expand_dims(tf.convert_to_tensor(state), axis=0)
action_probs = model(state_tensor, training=False)
action_probs

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.4966027, 0.5033973]], dtype=float32)>

In [5]:
# Experience storage kept as parallel lists: index i across all five lists
# describes one environment transition recorded by the rollout cell.
state_history = []
action_history = []
rewards_history = []
state_next_history = []
done_history = []

# Hyperparameters.
gamma = 0.7            # discount applied to the next-state value in the target
alpha = 0.7            # not referenced by the cells visible in this notebook section
epsilon = 0.9          # exploration rate; multiplied by epsilon_decay every step
epsilon_decay = 0.99

In [6]:
# Collect up to 100 transitions from the current episode into the history lists.
done = False

for timestep in range(100):
    # Epsilon-greedy action selection: explore with probability `epsilon`,
    # otherwise act greedily on the model's output. (Previously epsilon was
    # decayed but never consulted, so the policy never explored.)
    if random.random() < epsilon:
        action = int(env.action_space.sample())
    else:
        # Add a batch axis so the single observation matches the model input.
        state_tensor = tf.expand_dims(tf.convert_to_tensor(state), 0)
        action_probs = model(state_tensor, training=False)
        action = int(tf.argmax(action_probs[0]).numpy())

    # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
    # `done` holds only true termination; `truncated` (time limit) is tracked
    # separately so it can end the loop without being recorded as terminal.
    state_next, reward, done, truncated, _ = env.step(action)
    state_next = np.array(state_next)

    epsilon *= epsilon_decay

    action_history.append(action)
    state_history.append(state)
    state_next_history.append(state_next)
    done_history.append(done)
    rewards_history.append(reward)

    state = state_next

    # Stop on either end-of-episode signal; stepping an environment that has
    # already finished is not valid until it is reset.
    if done or truncated:
        break

In [11]:
# Huber loss for comparing predicted Q-values against their targets.
loss_function = keras.losses.Huber()

# Adam with a small step size; clipnorm caps each gradient's norm at 1.0.
optimizer = keras.optimizers.Adam(
    learning_rate=0.00025,
    clipnorm=1.0,
)

In [16]:
# Draw a batch of 128 transition indices. np.random.choice samples with
# replacement by default, so this works even with fewer than 128 transitions.
indices = np.random.choice(range(len(done_history)), size=128)

state_sample = np.array([state_history[i] for i in indices])
state_next_sample = np.array([state_next_history[i] for i in indices])
rewards_sample = [rewards_history[i] for i in indices]
action_sample = [action_history[i] for i in indices]
# 1.0 where the sampled transition terminated the episode, 0.0 otherwise.
done_sample = tf.constant([float(done_history[i]) for i in indices], dtype=tf.float32)

# Bellman target: r + gamma * max_a' Q(s', a'). The bootstrap term is zeroed
# for terminal transitions, because there is no future return after the
# episode has ended (previously `done` was ignored here).
next_q_values = model.predict(state_next_sample)
next_q_values = (
    tf.constant(rewards_sample, dtype=tf.float32)
    + gamma * tf.reduce_max(next_q_values, axis=1) * (1.0 - done_sample)
)
masks = tf.one_hot(action_sample, 2)

# Q(s, a) for the action actually taken: the one-hot mask keeps that single
# column and the sum collapses the action axis.
# NOTE: model.predict returns NumPy arrays, so this loss is for inspection
# only; a gradient step would need a forward pass under tf.GradientTape.
predicted_q_values = model.predict(state_sample)
predicted_q_values = tf.reduce_sum(tf.multiply(predicted_q_values, masks), axis=1)
loss = loss_function(next_q_values, predicted_q_values)
print(loss)

tf.Tensor(0.36065894, shape=(), dtype=float32)
