In [1]:
!pip install gymnasium
!pip install tensorflow

Collecting tensorflow
  Obtaining dependency information for tensorflow from https://files.pythonhosted.org/packages/ba/7c/b971f2485155917ecdcebb210e021e36a6b65457394590be01cc61515310/tensorflow-2.13.0-cp310-cp310-win_amd64.whl.metadata
  Using cached tensorflow-2.13.0-cp310-cp310-win_amd64.whl.metadata (2.6 kB)
Collecting tensorflow-intel==2.13.0 (from tensorflow)
  Obtaining dependency information for tensorflow-intel==2.13.0 from https://files.pythonhosted.org/packages/40/fa/98115f6fe4d92e1962f549917be2dc8e369853b7e404191996fedaaf4dd6/tensorflow_intel-2.13.0-cp310-cp310-win_amd64.whl.metadata
  Using cached tensorflow_intel-2.13.0-cp310-cp310-win_amd64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.13.0->tensorflow)
  Using cached absl_py-1.4.0-py3-none-any.whl (126 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.13.0->tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting flatbuffers>=23.1.21 (from tensorflow-intel=

In [2]:
import gymnasium as gym
import numpy as np
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from itertools import count

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

env = gym.make("CartPole-v1")

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

<contextlib.ExitStack at 0x1c27b083d30>

In [3]:
def DQN():
    n_hidden = 128

    inputs = layers.Input(shape=(n_observations,))
    hidden = layers.Dense(n_hidden, activation='relu')(inputs)
    outputs = layers.Dense(n_actions, activation='softmax')(hidden)

    return keras.Model(inputs=inputs, outputs=outputs)

In [4]:
state, info = env.reset()
n_observations = len(state)
n_actions = env.action_space.n

In [12]:
batch_size = 32
gamma = 0.99
epsilon = 0.9
epsilon_decay = 0.99
learning_rate_adam = 1e-4

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate_adam, amsgrad=True)

In [14]:
model = DQN()
memory = []

while True:  
    state = env.reset()[0]
    print(state)
    episode_reward = 0

    for timestep in range(1, 1000):
        if epsilon > np.random.rand():
            action = np.random.choice(n_actions)
        else:
            state_tensor = tf.convert_to_tensor(state, dtype=tf.float32)
            state_tensor = tf.expand_dims(state_tensor, 0)
            action_probs = model(state_tensor, training=False)
            action = tf.argmax(action_probs[0]).numpy()
            print(action_probs[0], action)

        epsilon = epsilon * epsilon_decay

        state_next, reward, done, _, _ = env.step(action)
        memory.append([state, state_next, action, reward, done])
        state = np.array(state_next)

        episode_reward += reward    
        print(episode_reward)
        
    if done:
        break

[-0.02068522  0.02288801  0.03195376  0.02976369]
1.0
2.0
3.0
4.0
5.0
6.0
tf.Tensor([0.46108356 0.5389164 ], shape=(2,), dtype=float32) 1
7.0
8.0
tf.Tensor([0.46033287 0.53966707], shape=(2,), dtype=float32) 1
9.0
10.0
tf.Tensor([0.43732944 0.56267047], shape=(2,), dtype=float32) 1
11.0
12.0
13.0
tf.Tensor([0.42193127 0.5780687 ], shape=(2,), dtype=float32) 1
14.0
14.0
14.0
tf.Tensor([0.40258083 0.59741914], shape=(2,), dtype=float32) 1
14.0
14.0
tf.Tensor([0.3933522 0.6066478], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0.37958032 0.6204197 ], shape=(2,), dtype=float32) 1
14.0
14.0
14.0
14.0
14.0
14.0
tf.Tensor([0.33651245 0.6634875 ], shape=(2,), dtype=float32) 1
14.0
14.0
tf.Tensor([0.3171696 0.6828304], shape=(2,), dtype=float32) 1
14.0
14.0
14.0
tf.Tensor([0.28608465 0.71391535], shape=(2,), dtype=float32) 1
14.0
14.0
14.0
tf.Tensor([0.2557343 0.7442657], shape=(2,), dtype=float32) 1
14.0
14.0
tf.Tensor([0.24478345 0.75521654], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0.2409

tf.Tensor([3.322525e-16 1.000000e+00], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([2.6463723e-16 1.0000000e+00], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([2.1116435e-16 1.0000000e+00], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([1.6881406e-16 1.0000000e+00], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([1.3519648e-16 1.0000000e+00], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([1.08430143e-16 1.00000000e+00], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([8.702874e-17 1.000000e+00], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([6.925261e-17 1.000000e+00], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([5.421577e-17 1.000000e+00], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([4.2297924e-17 1.0000000e+00], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([3.2842657e-17 1.0000000e+00], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([2.5338307e-17 1.0000000e+00], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([1.93912e-17 1.00000e+00], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([1.4706758e-17 1.0000000e

tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 

tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 

tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 1
14.0
tf.Tensor([0. 1.], shape=(2,), dtype=float32) 

In [17]:
indices = np.random.choice(range(len(memory)), size=batch_size)
memory_sample = [memory[i] for i in indices]

In [19]:
len(memory)

999

In [20]:
len(memory_sample)

32

In [31]:
type(memory)

list