In [15]:
import gymnasium as gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
import numpy as np

env = gym.make('CartPole-v1')

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [16]:
# create an update network using only tensorflow and no keras models
# do updates and backprop by doing dot products, etc. only

In [17]:
n_observations = 4
n_actions = 2
n_hidden = 128
batch_size = 32


def create_q_model():
    observations = layers.Input(shape=(n_observations,))
    hidden = layers.Dense(n_hidden, activation='relu')(observations)
    action = layers.Dense(n_actions, activation='softmax')(hidden)
    
    return keras.Model(inputs=observations, outputs=action)

policy_network = create_q_model()
target_network = create_q_model()

In [18]:
state = np.array(env.reset()[0])

state_tensor = tf.convert_to_tensor(state)
state_tensor = tf.expand_dims(state_tensor, 0)

action_probs = policy_network(state_tensor, training=False)
action = tf.argmax(action_probs[0]).numpy()

In [29]:
action_history = []
rewards_history = []
state_history = []
state_next_history = []
done_history = []

epsilon = 1.0 
gamma = 0.7
state = np.array(env.reset()[0])

num_actions = 2
max_memory_length = 100000
update_target_network = 10000

loss_function = keras.losses.Huber()
optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)


done = False

for timestep in range(10000):

    if epsilon > np.random.rand(1)[0]:
        action = np.random.choice(2)
    else:
        state_tensor = tf.convert_to_tensor(state)
        state_tensor = tf.expand_dims(state_tensor, 0)
        action_probs = policy_network(state_tensor, training=False)
        action = tf.argmax(action_probs[0]).numpy()

    epsilon = epsilon*0.99
    
    state_next, reward, done, _, _= env.step(action)
    
    action_history.append(action)
    rewards_history.append(reward)
    state_history.append(state)
    state_next_history.append(state_next)
    done_history.append(done)
        
    state_next = np.array(state_next)
    state = state_next
    
    if len(done_history) > batch_size:

        indices = np.random.choice(range(len(done_history)), size=batch_size)
        state_sample = np.array([state_history[i] for i in indices])
        state_next_sample = np.array([state_next_history[i] for i in indices])
        rewards_sample = [rewards_history[i] for i in indices]
        action_sample = [action_history[i] for i in indices]
        done_sample = tf.convert_to_tensor(
            [float(done_history[i]) for i in indices]
        )
        
        future_rewards = target_network.predict(state_next_sample)
        updated_q_values = rewards_sample + gamma * tf.reduce_max(
                future_rewards, axis=1
            )
        
        updated_q_values = updated_q_values * (1 - done_sample) - done_sample
        masks = tf.one_hot(action_sample, num_actions)
        
        with tf.GradientTape() as tape:
            q_values = policy_network(state_sample)
            q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
            loss = loss_function(updated_q_values, q_action)

        grads = tape.gradient(loss, policy_network.trainable_variables)
        optimizer.apply_gradients(zip(grads, policy_network.trainable_variables))
        
        if timestep % update_target_network == 0:
            target_network.set_weights(policy_network.get_weights())

        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]
            
        if done:
            break



In [20]:
policy_network.get_weights()

[array([[-0.08054093, -0.10679916,  0.09303557,  0.07345929,  0.08540739,
         -0.13461973,  0.20585506, -0.1213766 ,  0.02370906, -0.06059588,
         -0.01338914,  0.18600085,  0.20497458, -0.05617824,  0.08727563,
          0.03648237,  0.17719765, -0.0904631 ,  0.20548807,  0.12485288,
          0.03346802,  0.20937054, -0.03251931,  0.15582845, -0.00209483,
         -0.17751658,  0.18478467, -0.2023637 , -0.15599991, -0.05771455,
          0.01233106,  0.2115377 ,  0.14761916,  0.0442236 , -0.05824048,
         -0.09190679, -0.02034608, -0.1291775 ,  0.12956683, -0.1560924 ,
          0.14620261,  0.01922181, -0.03012272, -0.08641136, -0.15416922,
          0.1485579 ,  0.01368504, -0.07369842, -0.04668625,  0.16357404,
         -0.03365701,  0.0641554 ,  0.20025   , -0.10029116,  0.00849628,
         -0.10039532, -0.19507504,  0.07191797, -0.14772998,  0.00945104,
         -0.09381081, -0.13287684, -0.10673073,  0.08451787,  0.01826963,
          0.01712282, -0.15773962, -0.

In [21]:
target_network.get_weights()

[array([[-0.12219755,  0.12803578,  0.10557327,  0.17891341, -0.05431172,
          0.07612738, -0.05225667, -0.157843  ,  0.17460284, -0.1641401 ,
         -0.06134826,  0.15135777,  0.07257697,  0.1121197 ,  0.09401038,
          0.1789709 , -0.1080802 ,  0.14203557,  0.06907845, -0.09351564,
         -0.00663532,  0.05767271, -0.06406157, -0.18289796,  0.1977461 ,
          0.09954995,  0.08143169, -0.19440697, -0.18912944,  0.10471559,
         -0.20717373,  0.04324082, -0.2032533 , -0.0378997 , -0.02821559,
         -0.16905043, -0.01965341,  0.13054034, -0.11699952,  0.12968907,
          0.08606428,  0.03354684,  0.00860894,  0.01290569, -0.16149257,
          0.18543783, -0.20574991,  0.07587123,  0.07207853,  0.11954275,
          0.02799554, -0.01351129, -0.0066272 ,  0.04793364,  0.09448561,
         -0.16572583, -0.16642937, -0.05914615, -0.07061037, -0.01778373,
          0.13921347, -0.0540234 ,  0.08150047,  0.12592205, -0.19527663,
          0.07562375, -0.01730949, -0.

In [22]:
masks

<tf.Tensor: shape=(32, 2), dtype=float32, numpy=
array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.]], dtype=float32)>

In [23]:
q_values

<tf.Tensor: shape=(32, 2), dtype=float32, numpy=
array([[0.5707026 , 0.42929742],
       [0.60831803, 0.39168197],
       [0.5089172 , 0.4910828 ],
       [0.51317763, 0.48682234],
       [0.5924672 , 0.40753284],
       [0.5469819 , 0.4530181 ],
       [0.5072931 , 0.4927069 ],
       [0.52802473, 0.47197524],
       [0.5095931 , 0.4904069 ],
       [0.5924672 , 0.40753284],
       [0.55010545, 0.4498945 ],
       [0.51901877, 0.4809812 ],
       [0.5147486 , 0.48525134],
       [0.5089172 , 0.4910828 ],
       [0.5451519 , 0.45484802],
       [0.52900815, 0.4709918 ],
       [0.58298224, 0.41701785],
       [0.5036247 , 0.4963752 ],
       [0.51343143, 0.48656854],
       [0.51667756, 0.4833225 ],
       [0.5147486 , 0.48525134],
       [0.563411  , 0.436589  ],
       [0.49688452, 0.50311553],
       [0.60831803, 0.39168197],
       [0.56126094, 0.43873903],
       [0.56664497, 0.43335494],
       [0.5297262 , 0.4702737 ],
       [0.55010545, 0.4498945 ],
       [0.5707026 , 0.42929

In [24]:
q_action

<tf.Tensor: shape=(32,), dtype=float32, numpy=
array([0.5707026 , 0.39168197, 0.4910828 , 0.51317763, 0.5924672 ,
       0.5469819 , 0.5072931 , 0.47197524, 0.5095931 , 0.5924672 ,
       0.55010545, 0.4809812 , 0.48525134, 0.4910828 , 0.5451519 ,
       0.4709918 , 0.41701785, 0.5036247 , 0.51343143, 0.4833225 ,
       0.48525134, 0.436589  , 0.49688452, 0.39168197, 0.43873903,
       0.56664497, 0.4702737 , 0.55010545, 0.5707026 , 0.4709918 ,
       0.5018183 , 0.4910828 ], dtype=float32)>

In [25]:
tf.one_hot(action_sample, num_actions)

<tf.Tensor: shape=(32, 2), dtype=float32, numpy=
array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.]], dtype=float32)>

In [26]:
tf.multiply(q_values, masks)

<tf.Tensor: shape=(32, 2), dtype=float32, numpy=
array([[0.5707026 , 0.        ],
       [0.        , 0.39168197],
       [0.        , 0.4910828 ],
       [0.51317763, 0.        ],
       [0.5924672 , 0.        ],
       [0.5469819 , 0.        ],
       [0.5072931 , 0.        ],
       [0.        , 0.47197524],
       [0.5095931 , 0.        ],
       [0.5924672 , 0.        ],
       [0.55010545, 0.        ],
       [0.        , 0.4809812 ],
       [0.        , 0.48525134],
       [0.        , 0.4910828 ],
       [0.5451519 , 0.        ],
       [0.        , 0.4709918 ],
       [0.        , 0.41701785],
       [0.5036247 , 0.        ],
       [0.51343143, 0.        ],
       [0.        , 0.4833225 ],
       [0.        , 0.48525134],
       [0.        , 0.436589  ],
       [0.49688452, 0.        ],
       [0.        , 0.39168197],
       [0.        , 0.43873903],
       [0.56664497, 0.        ],
       [0.        , 0.4702737 ],
       [0.55010545, 0.        ],
       [0.5707026 , 0.     

In [30]:
action_sample

[0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0]