In [1]:
import numpy as np
import random
from collections import deque
import gym

In [2]:
from tensorflow.keras.models import Sequential, clone_model
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam

In [3]:
# Gym environment id and the environment instance used by the cells below.
env_name = "CartPole-v1"
env = gym.make(env_name)

In [4]:
# Smoke-test the environment by driving it with uniformly random actions.
env.reset()

try:
    for step in range(1000):
        env.render(mode="human")
        random_action = env.action_space.sample()
        # The third element of the step result is the episode-end flag
        # (`done` in the 4-tuple API, `terminated` in the 5-tuple API).
        _, _, done, *_ = env.step(random_action)
        if done:
            # Stepping a finished episode is undefined behaviour in gym,
            # so start a fresh episode before continuing.
            env.reset()
finally:
    # Always release the render window, even if a step or render call fails.
    env.close()



In [5]:
# Length of the first axis of the observation space (4 here, consistent with
# the 80 parameters reported for the first Dense(16) layer in the summary).
num_observations = env.observation_space.shape[0]

In [6]:
# Number of discrete actions (2 here, matching the width of the final Dense
# layer shown in the model summary).
num_actions = env.action_space.n

In [9]:
# Q-network: two hidden ReLU layers (16 and 32 units) followed by a linear
# head with one output per action. The leading 1 in the input shape is kept
# in every layer's output, so predictions have shape (batch, 1, num_actions).
model = Sequential([
    Dense(16, input_shape=(1, num_observations)),
    Activation('relu'),
    Dense(32),
    Activation('relu'),
    Dense(num_actions),
    Activation('linear'),
])

In [10]:
# Print the layer-by-layer output shapes and parameter counts as a sanity check.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1, 16)             80        
_________________________________________________________________
activation (Activation)      (None, 1, 16)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1, 32)             544       
_________________________________________________________________
activation_1 (Activation)    (None, 1, 32)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1, 2)              66        
_________________________________________________________________
activation_2 (Activation)    (None, 1, 2)              0         
Total params: 690
Trainable params: 690
Non-trainable params: 0
__________________________________________________________

In [11]:
# clone_model reproduces the architecture but creates freshly initialised
# weights, so copy the online network's weights explicitly to make the target
# network start out as an exact replica.
target_model = clone_model(model)
target_model.set_weights(model.get_weights())

In [12]:
# Training hyperparameters.
EPOCHS = 1000  # presumably the number of training episodes — TODO confirm against the training loop
epsilon = 1.0  # presumably the initial exploration rate, decayed during training — TODO confirm
EPSILON_REDUCE = 0.995  # presumably the multiplicative decay applied to epsilon — TODO confirm
LEARNING_RATE = 0.001  # presumably the optimizer step size; not used in the visible cells — TODO confirm
GAMMA = 0.95  # discount multiplier applied to the next-state value inside replay()

In [13]:
def epsilon_greedy_action_selection(model, epsilon, observation):
    """Choose an action with an epsilon-greedy policy.

    A uniform random number is drawn first. If it exceeds ``epsilon`` the
    action with the highest value predicted by ``model`` is returned;
    otherwise a uniformly random action index is returned.

    Args:
        model: Object exposing ``predict(observation)``.
        epsilon: Exploration threshold compared against a draw from [0, 1).
        observation: Passed unchanged to ``model.predict``
            (assumed to already carry the batch axis — TODO confirm at the call site).

    Returns:
        The selected action index.
    """
    exploit = np.random.random() > epsilon
    if exploit:
        # Greedy branch: index of the largest predicted value.
        return np.argmax(model.predict(observation))

    # Exploration branch: sample among the actions of the global environment.
    return np.random.randint(0, env.action_space.n)

In [15]:
# Bounded experience store: a deque with maxlen discards its oldest entry
# whenever a new one is appended to a full buffer.
replay_buffer = deque(maxlen=20000)
# presumably the epoch interval handed to update_model_handler — TODO confirm at the call site
update_target_model = 10

In [16]:
# Toy data illustrating how zip(*...) regroups a list of tuples by position.
test_tuple = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]

In [17]:
# Transpose the tuples: the result is [(1, 4, 7), (2, 5, 8), (3, 6, 9)].
zipped_list = list(zip(*test_tuple))

In [18]:
# Unpack the three regrouped tuples into separate names.
a, b, c = zipped_list

In [19]:
a

(1, 4, 7)

In [20]:
b

(2, 5, 8)

In [21]:
c

(3, 6, 9)

In [22]:
def replay(replay_buffer, batch_size, model, target_model):
    """Fit ``model`` on one random minibatch of stored transitions.

    Every buffer entry is unpacked as
    ``(state, action, reward, new_state, done)``. For each sampled transition
    the prediction vector for ``state`` is copied and the entry of the taken
    action is overwritten with ``reward`` (terminal transition) or
    ``reward + GAMMA * max(Q(new_state))`` (non-terminal transition).

    Args:
        replay_buffer: Sequence of transitions to sample from.
        batch_size: Number of transitions drawn without replacement.
        model: Network that is fitted; also used to evaluate ``new_state``.
        target_model: Network whose predictions for ``state`` form the
            baseline target vectors.

    Returns:
        None. Returns without training while the buffer holds fewer than
        ``batch_size`` transitions.
    """
    # Not enough experience collected yet to draw a full minibatch.
    if len(replay_buffer) < batch_size:
        return

    samples = random.sample(replay_buffer, batch_size)

    # Regroup the list of transitions into one tuple per field.
    states, actions, rewards, new_states, dones = zip(*samples)
    states = np.array(states)

    # NOTE(review): standard DQN bootstraps the next-state value from the
    # target network and takes the baseline vector from the online network;
    # the roles here are the other way round. Kept as written — confirm intent.
    targets = target_model.predict(states)
    q_values = model.predict(np.array(new_states))

    target_batch = []
    for i in range(batch_size):
        # Predictions are indexed as [sample][0][action], i.e. each sample
        # carries a singleton axis before the action values.
        target = targets[i].copy()

        if dones[i]:
            # Terminal transition: no future value to bootstrap from.
            target[0][actions[i]] = rewards[i]
        else:
            # Discounted best next-state value for THIS sample only.
            q_value = max(q_values[i][0])
            target[0][actions[i]] = rewards[i] + q_value * GAMMA

        target_batch.append(target)

    model.fit(states, np.array(target_batch), epochs=1, verbose=0)
    

In [None]:
def update_model_handler(epoch, update_target_model, model, target_model):
    """Copy the weights of ``model`` into ``target_model`` on a fixed schedule.

    The copy happens only when ``epoch`` is positive and an exact multiple of
    ``update_target_model``; on every other epoch the call does nothing.

    Args:
        epoch: Current epoch counter.
        update_target_model: Interval, in epochs, between weight copies.
        model: Source network exposing ``get_weights()``.
        target_model: Destination network exposing ``set_weights()``.
    """
    if not epoch > 0:
        return
    if epoch % update_target_model != 0:
        return

    current_weights = model.get_weights()
    target_model.set_weights(current_weights)