In [1]:
import numpy as np
import random
from collections import deque
import gym

In [2]:
from tensorflow.keras.models import Sequential, clone_model
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam

In [3]:
env_name = "CartPole-v1"
env = gym.make(env_name)

In [4]:
env.reset()

for step in range(1000):
    env.render(mode="human")
    random_action = env.action_space.sample()
    env.step(random_action)

env.close()



In [5]:
num_observations = env.observation_space.shape[0]

In [6]:
num_actions = env.action_space.n

In [9]:
model = Sequential()
model.add(Dense(16, input_shape=(1, num_observations)))
model.add(Activation('relu'))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dense(num_actions))
model.add(Activation('linear'))

In [10]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1, 16)             80        
_________________________________________________________________
activation (Activation)      (None, 1, 16)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1, 32)             544       
_________________________________________________________________
activation_1 (Activation)    (None, 1, 32)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1, 2)              66        
_________________________________________________________________
activation_2 (Activation)    (None, 1, 2)              0         
Total params: 690
Trainable params: 690
Non-trainable params: 0
__________________________________________________________

In [11]:
target_model = clone_model(model)

In [26]:
EPOCHS = 1000
epsilon = 1.0
EPSILON_REDUCE = 0.995
BATCH_SIZE = 32
LEARNING_RATE = 0.001
GAMMA = 0.95

In [13]:
def epsilon_greedy_action_selection(model, epsilon, observation):
    if np.random.random() > epsilon:
        prediction = model.predict(observation)
        action = np.argmax(prediction)
    else:
        action = np.random.randint(0, env.action_space.n)
    
    return action

In [15]:
replay_buffer = deque(maxlen=20000)
update_target_model = 10

In [16]:
test_tuple = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]

In [17]:
zipped_list = list(zip(*test_tuple))

In [18]:
a, b, c = zipped_list

In [19]:
a

(1, 4, 7)

In [20]:
b

(2, 5, 8)

In [21]:
c

(3, 6, 9)

In [31]:
def replay(replay_buffer, batch_size, model, target_model):
    if len(replay_buffer) < batch_size:
        return
    samples = random.sample(replay_buffer, batch_size)
    target_batch = []
    
    zipped_samples = list(zip(*samples))
    
    states, actions, rewards, new_states, dones = zipped_samples
    
    targets = target_model.predict(np.array(states))
    q_values = model.predict(np.array(new_states))
    
    for i in range(batch_size):
        q_value = max(q_values[i][0])
        target = targets[i].copy()
        
        if dones[i]:
            target[0][actions[i]] = rewards[i]
        else:
            target[0][actions[i]] = rewards[i] + q_value * GAMMA
        
        target_batch.append(target)
        
    model.fit(np.array(states), np.array(target_batch), epochs=1, verbose=0)
    

In [32]:
def update_model_handler(epoch, update_target_model, model, target_model):
    if epoch > 0 and (epoch % update_target_model == 0):
        target_model.set_weights(model.get_weights()) 

In [33]:
model.compile(loss="mse", optimizer=(Adam(learning_rate=LEARNING_RATE)))

In [34]:
best_so_far = 0

for epoch in range(EPOCHS):
    observation = env.reset()
    
    observation = observation.reshape([1, 4])
    done = False
    points = 0
    
    while not done:
        action = epsilon_greedy_action_selection(model, epsilon, observation)
        next_observation, reward, done, info = env.step(action)
        next_observation = next_observation.reshape([1, 4])
        replay_buffer.append((observation, action, reward, next_observation, done))
        observation = next_observation
        points += 1
        replay(replay_buffer, BATCH_SIZE, model, target_model)
    
    epsilon *= EPSILON_REDUCE
    
    update_model_handler(epoch, update_target_model, model, target_model)
    
    if points > best_so_far:
        best_so_far = points
    if epoch % 25 == 0:
        print(f"{epoch}:  POINTS: {points} eps: {epsilon} BSF: {best_so_far}")

0:  POINTS: 27 eps: 0.990025 BSF: 27
25:  POINTS: 23 eps: 0.8734200960253871 BSF: 51
50:  POINTS: 23 eps: 0.7705488893118823 BSF: 102
75:  POINTS: 32 eps: 0.6797938283326578 BSF: 102
100:  POINTS: 86 eps: 0.5997278763867329 BSF: 150
125:  POINTS: 78 eps: 0.5290920728090721 BSF: 150
150:  POINTS: 161 eps: 0.46677573701590436 BSF: 166
175:  POINTS: 133 eps: 0.4117990041127769 BSF: 167
200:  POINTS: 32 eps: 0.3632974174544486 BSF: 209
225:  POINTS: 140 eps: 0.32050833588933575 BSF: 209
250:  POINTS: 154 eps: 0.2827589419554058 BSF: 209
275:  POINTS: 139 eps: 0.2494556624678441 BSF: 209
300:  POINTS: 154 eps: 0.22007483514733558 BSF: 209
325:  POINTS: 152 eps: 0.19415447453059972 BSF: 209
350:  POINTS: 134 eps: 0.1712870076899825 BSF: 209
375:  POINTS: 133 eps: 0.15111286553822956 BSF: 209
400:  POINTS: 128 eps: 0.13331482894782642 BSF: 209
425:  POINTS: 148 eps: 0.1176130407830293 BSF: 209
450:  POINTS: 158 eps: 0.10376060541355137 BSF: 209
475:  POINTS: 150 eps: 0.09153970651645797 BSF: 

In [37]:
observation = env.reset()

for step in range(500):
    env.render()
    action = np.argmax(model.predict(observation.reshape([1, 4])))
    observation, reward, done, info = env.step(action)
    if done:
        break
env.close()