In [1]:
import gymnasium as gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
import numpy as np

env = gym.make('CartPole-v1')

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
n_observations = 4
n_actions = 2
n_hidden = 128
batch_size = 32


def create_q_model():
    observations = layers.Input(shape=(n_observations,))
    hidden = layers.Dense(n_hidden, activation='relu')(observations)
    action = layers.Dense(n_actions, activation='softmax')(hidden)
    
    return keras.Model(inputs=observations, outputs=action)

policy_network = create_q_model()
target_network = create_q_model()

In [3]:
state = np.array(env.reset()[0])

state_tensor = tf.convert_to_tensor(state)
state_tensor = tf.expand_dims(state_tensor, 0)

action_probs = policy_network(state_tensor, training=False)
action = tf.argmax(action_probs[0]).numpy()

In [18]:
action_history = []
rewards_history = []
state_history = []
state_next_history = []
done_history = []

epsilon = 1.0 
gamma = 0.7
state = np.array(env.reset()[0])

num_actions = 2
max_memory_length = 100000
update_target_network = 10000

loss_function = keras.losses.Huber()
optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)


done = False

for timestep in range(10000):

    if epsilon > np.random.rand(1)[0]:
        action = np.random.choice(2)
    else:
        state_tensor = tf.convert_to_tensor(state)
        state_tensor = tf.expand_dims(state_tensor, 0)
        action_probs = policy_network(state_tensor, training=False)
        action = tf.argmax(action_probs[0]).numpy()

    epsilon -= epsilon*0.99
    
    state_next, reward, done, _, _= env.step(action)
    
    action_history.append(action)
    rewards_history.append(reward)
    state_history.append(state)
    state_next_history.append(state_next)
    done_history.append(done)
        
    state_next = np.array(state_next)
    state = state_next
    
    if len(done_history) > batch_size:

        indices = np.random.choice(range(len(done_history)), size=batch_size)
        state_sample = np.array([state_history[i] for i in indices])
        state_next_sample = np.array([state_next_history[i] for i in indices])
        rewards_sample = [rewards_history[i] for i in indices]
        action_sample = [action_history[i] for i in indices]
        done_sample = tf.convert_to_tensor(
            [float(done_history[i]) for i in indices]
        )
        
        future_rewards = target_network.predict(state_next_sample)
        updated_q_values = rewards_sample + gamma * tf.reduce_max(
                future_rewards, axis=1
            )
        
        updated_q_values = updated_q_values * (1 - done_sample) - done_sample
        masks = tf.one_hot(action_sample, num_actions)
        
        with tf.GradientTape() as tape:
            q_values = policy_network(state_sample)
            q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
            loss = loss_function(updated_q_values, q_action)

        grads = tape.gradient(loss, policy_network.trainable_variables)
        optimizer.apply_gradients(zip(grads, policy_network.trainable_variables))
        
        if timestep % update_target_network == 0:
            target_network.set_weights(policy_network.get_weights())

        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]
            
        if done:
            break



In [23]:
policy_network.get_weights()

[array([[-0.14800662, -0.15856116,  0.14252976,  0.02196858, -0.17713419,
          0.05471009, -0.10806739, -0.01163385, -0.00756484,  0.04246236,
         -0.07758716, -0.06213327,  0.05366201, -0.05964801,  0.05698507,
         -0.18172869,  0.08051647,  0.11657345, -0.07985903, -0.00487107,
          0.11643071, -0.1356546 ,  0.09790315,  0.01744771, -0.01523813,
          0.17080674,  0.13955276,  0.20731242, -0.1507261 ,  0.0984832 ,
         -0.11024596, -0.13327256,  0.00333306,  0.16209884,  0.11939374,
          0.01386365, -0.14474888,  0.05461216,  0.1072421 ,  0.16714491,
         -0.04076625, -0.06449553, -0.00034814, -0.01932207, -0.21160628,
          0.20487715, -0.0401553 ,  0.01767941,  0.19894987, -0.0713774 ,
          0.02201256,  0.05908144,  0.02109844, -0.17946829,  0.13101083,
          0.16677898, -0.06233062, -0.02162753, -0.12669596, -0.07646756,
         -0.03090625, -0.19394931,  0.11550583,  0.15573555, -0.07949964,
         -0.2018384 , -0.15503952, -0.

In [24]:
target_network.get_weights()

[array([[ 0.20735282, -0.11326644, -0.12472573,  0.09752351, -0.08947554,
          0.03657231,  0.19655615, -0.17036848, -0.00183612,  0.05763641,
         -0.13210136,  0.15010506,  0.11341664, -0.07386585,  0.0412688 ,
          0.04253253,  0.10497403,  0.14371184, -0.06977929, -0.20022082,
          0.07939738, -0.17446475, -0.00844145, -0.0146102 ,  0.1757429 ,
          0.06846237,  0.11212939,  0.14826486,  0.02834977,  0.19552729,
         -0.14745823, -0.16388275,  0.0995318 , -0.01864009,  0.18501979,
         -0.07529959,  0.20887494,  0.09749326,  0.15676469,  0.08306327,
         -0.06651598, -0.0944457 ,  0.10942945,  0.02807422,  0.18166816,
          0.14498922,  0.04617926,  0.0589779 , -0.06303152, -0.04784174,
         -0.00156727,  0.1546072 , -0.17999485, -0.17964014, -0.20151457,
          0.17279595,  0.07392704,  0.15513214, -0.15284616,  0.07875717,
          0.10903105,  0.00067382, -0.09538943, -0.0774719 , -0.09637346,
         -0.20625512,  0.20872104,  0.

In [25]:
masks

<tf.Tensor: shape=(32, 2), dtype=float32, numpy=
array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)>

In [26]:
q_values

<tf.Tensor: shape=(32, 2), dtype=float32, numpy=
array([[0.4772376 , 0.5227624 ],
       [0.46027455, 0.5397254 ],
       [0.4678169 , 0.5321832 ],
       [0.4678169 , 0.5321832 ],
       [0.44670767, 0.55329233],
       [0.43334833, 0.5666516 ],
       [0.44468504, 0.55531496],
       [0.48925003, 0.51075   ],
       [0.4493823 , 0.5506177 ],
       [0.4772376 , 0.5227624 ],
       [0.48634693, 0.51365304],
       [0.45549333, 0.5445066 ],
       [0.44028473, 0.55971533],
       [0.49209467, 0.50790536],
       [0.49209467, 0.50790536],
       [0.42135358, 0.57864636],
       [0.46027455, 0.5397254 ],
       [0.4532178 , 0.5467822 ],
       [0.46027455, 0.5397254 ],
       [0.45105362, 0.5489463 ],
       [0.48634693, 0.51365304],
       [0.4772376 , 0.5227624 ],
       [0.42135358, 0.57864636],
       [0.4803359 , 0.5196641 ],
       [0.49891892, 0.501081  ],
       [0.44670767, 0.55329233],
       [0.47093344, 0.52906656],
       [0.4803359 , 0.5196641 ],
       [0.48337203, 0.51662

In [27]:
q_action

<tf.Tensor: shape=(32,), dtype=float32, numpy=
array([0.5227624 , 0.5397254 , 0.5321832 , 0.5321832 , 0.55329233,
       0.5666516 , 0.55531496, 0.51075   , 0.5506177 , 0.5227624 ,
       0.51365304, 0.5445066 , 0.55971533, 0.50790536, 0.50790536,
       0.57864636, 0.5397254 , 0.5467822 , 0.5397254 , 0.5489463 ,
       0.51365304, 0.5227624 , 0.57864636, 0.5196641 , 0.49891892,
       0.55329233, 0.52906656, 0.5196641 , 0.5166279 , 0.5856449 ,
       0.5227624 , 0.51365304], dtype=float32)>