In [2]:
import gym
import numpy as np

#
# Environment
#
# Build the CartPole task, take one initial observation and draw one random
# action so the shapes of both spaces can be inspected below.
env = gym.make('CartPole-v1')
state = env.reset()
action = env.action_space.sample()

# Each (label, value) pair is printed exactly like a two-argument print call.
for label, value in (
    ('State space: ', env.observation_space),
    ('Initial state: ', state),
    ('\nAction space: ', env.action_space),
    ('Random action: ', action),
):
    print(label, value)

State space:  Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
Initial state:  [-0.03554296  0.00394534  0.03405297  0.02208636]

Action space:  Discrete(2)
Random action:  1


In [3]:
# DQN Modeling
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

# Network input/output sizes are read from the environment's spaces:
# one input per observation component, one output (Q-value) per action.
num_state = env.observation_space.shape[0]
num_action = env.action_space.n

# Q-network: two hidden ReLU layers of 24 units followed by a linear layer
# with one unit per action, trained with MSE loss and the Adam optimizer.
model = Sequential([
    Dense(24, input_dim=num_state, activation='relu'),
    Dense(24, activation='relu'),
    Dense(num_action, activation='linear'),
])
model.compile(loss='mse', optimizer="adam")


  import imp
  'nearest': pil_image.NEAREST,
  'bilinear': pil_image.BILINEAR,
  'bicubic': pil_image.BICUBIC,
  'hamming': pil_image.HAMMING,
  'box': pil_image.BOX,
  'lanczos': pil_image.LANCZOS,
2022-06-03 22:50:16.225802: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
import random
from collections import deque
from tqdm import tqdm

# DQN training: epsilon-greedy rollouts that fill a replay buffer, followed by
# one experience-replay pass (batch_size single-sample updates) per episode.
num_episode = 10
memory = deque(maxlen=2000)  # replay buffer; oldest transitions drop out once full

# Hyper parameter
epsilon = 0.3     # probability of picking a random (exploratory) action
gamma = 0.95      # discount factor applied to the bootstrapped next-state value
batch_size = 32   # number of transitions sampled for each replay pass

# DQN Learning
for episode in tqdm(range(num_episode)):
    state = env.reset()
    done = False
    while not done:
        # Epsilon-greedy action selection.
        if np.random.uniform() < epsilon:
            action = env.action_space.sample()
        else:
            # verbose=0: predict is called once per step; without it, recent
            # Keras versions print a progress bar for every single call.
            q_value = model.predict(state.reshape(1, num_state), verbose=0)
            action = np.argmax(q_value[0])
        next_state, reward, done, info = env.step(action)

        # With the 4-tuple step API, gym's TimeLimit wrapper reports an episode
        # cut off by the step limit as done=True and flags it in `info`.
        # Such a transition is not a real terminal state, so it must still
        # bootstrap from next_state during replay. If the key is absent this
        # reduces to the plain `done` flag.
        terminal = done and not info.get('TimeLimit.truncated', False)

        # Memory
        memory.append((state, action, reward, next_state, terminal))

        state = next_state

    # Replay
    # `>=` because a buffer holding exactly batch_size items can be sampled.
    if len(memory) >= batch_size:
        mini_batch = random.sample(memory, batch_size)
        # Dedicated names here so the episode-level variables above are not
        # overwritten by the sampled transitions.
        for b_state, b_action, b_reward, b_next_state, b_terminal in mini_batch:
            state_in = b_state.reshape(1, num_state)
            if b_terminal:
                # No future return after a true terminal state.
                target = b_reward
            else:
                next_q = model.predict(b_next_state.reshape(1, num_state), verbose=0)
                target = b_reward + gamma * np.max(next_q[0])
            # Only the taken action's Q-value is moved towards the target; the
            # other outputs keep their current predictions (zero error).
            q_value = model.predict(state_in, verbose=0)
            q_value[0][b_action] = target
            model.fit(state_in, q_value, epochs=1, verbose=0)

env.close()

  0%|          | 0/10 [00:00<?, ?it/s]



 10%|█         | 1/10 [00:00<00:03,  2.25it/s]



 20%|██        | 2/10 [00:00<00:02,  2.89it/s]



 30%|███       | 3/10 [00:04<00:14,  2.08s/it]



 40%|████      | 4/10 [00:08<00:17,  2.85s/it]



 50%|█████     | 5/10 [00:13<00:16,  3.34s/it]



 60%|██████    | 6/10 [00:17<00:14,  3.60s/it]



 70%|███████   | 7/10 [00:21<00:11,  3.87s/it]



 80%|████████  | 8/10 [00:25<00:07,  3.93s/it]



 90%|█████████ | 9/10 [00:30<00:04,  4.08s/it]



100%|██████████| 10/10 [00:34<00:00,  3.41s/it]


In [6]:
import os

# Save model and weights
# The file is written into the current working directory under a fixed name.
model_name = 'keras_dqn_trained_model.h5'
save_dir = os.getcwd()
model_path = os.path.join(save_dir, model_name)

model.save(model_path)


# https://github.com/skettee/notebooks/blob/master/deep_q_network.ipynb