# Preliminary
- Experience replay
- Fixed Target Q-Network
- Reward clipping
- Huber loss

In [5]:
import gym
import numpy as np

import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.losses import huber_loss

from collections import deque

import easydict

# 0. Parameters

In [21]:
# Hyperparameters for the whole notebook, exposed with attribute access
# (e.g. ``args.gamma``).
args = easydict.EasyDict(
    n_episodes=500,              # upper bound on the number of training episodes
    max_steps=200,               # upper bound on the steps taken within one episode
    success_step_criterion=190,  # an episode ending at or after this step counts as a success
    gamma=0.99,                  # discount factor applied to the target network's max Q-value
    warmup=10,                   # transitions from the first `warmup` steps of an episode are not stored
    epsilon_start=1.0,           # exploration rate at total_step == 0
    epsilon_end=0.01,            # asymptotic exploration rate
    epsilon_decay=0.001,         # exponential decay rate of epsilon per total step
    memory_size=10000,           # maximum number of transitions kept in the replay memory
    batch_size=32,               # number of transitions sampled per training update
    learning_rate=0.001,         # learning rate handed to the Adam optimizer
)

# 1. Action value function (Q)

In [14]:
class QNetwork:
    """Action-value (Q) function approximator built as a small Keras MLP.

    The model takes a state vector with ``state_size`` features, passes it
    through three 16-unit ReLU hidden layers and produces one linear output
    per action (``action_size`` outputs). It is compiled with the Huber loss
    and an Adam optimizer.

    Args:
        state_size: number of input features (passed as ``input_dim``).
        action_size: number of output units, one per action.
        learning_rate: learning rate given to the Adam optimizer.

    Attributes:
        model: the compiled ``Sequential`` model.
    """

    def __init__(self, state_size, action_size, learning_rate):
        hidden_units = 16
        layers = [
            Dense(hidden_units, activation='relu', input_dim=state_size),
            Dense(hidden_units, activation='relu'),
            Dense(hidden_units, activation='relu'),
            # Linear head: raw Q-value estimates, one per action.
            Dense(action_size, activation='linear'),
        ]
        self.model = Sequential(layers)

        optimizer = Adam(lr=learning_rate)
        self.model.compile(optimizer=optimizer, loss=huber_loss)

# 2. Experience Memory

In [9]:
class Memory():
    """Bounded experience-replay buffer.

    Experiences are kept in a ``deque`` with ``maxlen=memory_size``, so once
    the buffer is full each new experience evicts the oldest one.
    """

    def __init__(self, memory_size):
        """Create an empty buffer holding at most ``memory_size`` experiences."""
        self.buffer = deque(maxlen=memory_size)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.buffer)

    def add(self, experience):
        """Append one experience to the newest end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences, chosen uniformly.

        Sampling is without replacement, so ``np.random.choice`` raises
        ``ValueError`` when ``batch_size`` exceeds the number of stored
        experiences.
        """
        positions = np.arange(len(self.buffer))
        chosen = np.random.choice(positions, size=batch_size, replace=False)

        picked = []
        for pos in chosen:
            picked.append(self.buffer[pos])
        return picked

# 3. OpenAI Gym Environment

In [24]:
# Build the environment and read the problem dimensions from its spaces.
env = gym.make('CartPole-v0')
state_size, action_size = env.observation_space.shape[0], env.action_space.n

# Two identically shaped networks: the main one is trained, the target one
# only ever receives copies of the main network's weights during training.
main_QNet = QNetwork(
    state_size=state_size,
    action_size=action_size,
    learning_rate=args.learning_rate,
)
target_QNet = QNetwork(
    state_size=state_size,
    action_size=action_size,
    learning_rate=args.learning_rate,
)

# Replay memory shared by all episodes.
memory = Memory(memory_size=args.memory_size)

# 4. Training

In [27]:
# DQN training loop.
#
# Per step: pick an epsilon-greedy action, advance the environment, store the
# transition in the replay memory (after the per-episode warm-up), and fit the
# main network on one sampled mini-batch. Training stops early after five
# consecutive successful episodes.
#
# NOTE(review): this relies on the classic gym API (reset() returning only the
# observation, step() returning a 4-tuple) -- confirm against the installed
# gym version.
state = env.reset()
state = np.reshape(state, [1, state_size])

total_step = 0
success_count = 0

for episode in range(1, args.n_episodes+1):
    # Fixed target Q-network: sync it with the main network once per episode.
    target_QNet.model.set_weights(main_QNet.model.get_weights())

    step = 0
    for step in range(1, args.max_steps+1):
        total_step += 1

        # 1. Choose an action (epsilon-greedy, epsilon decays with total_step).
        epsilon = args.epsilon_end + (args.epsilon_start - args.epsilon_end) * np.exp(-args.epsilon_decay * total_step)
        if epsilon > np.random.rand():
            action = env.action_space.sample()
        else:
            # Greedy action: this must be an assignment; the previous
            # `action - np.argmax(...)` discarded the result, so the greedy
            # branch never actually selected an action.
            action = int(np.argmax(main_QNet.model.predict(state)[0]))

        # 2. Get the next state (the environment's own reward is discarded;
        #    a custom reward is assigned below).
        next_state, _, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])

        # 3. Assign the reward: 1 only when the episode ends at or after the
        #    success criterion, 0 otherwise.
        if done:
            if step >= args.success_step_criterion:
                success_count += 1
                reward = 1
            else:
                success_count = 0
                reward = 0

            # An all-zero next state marks a terminal transition in memory.
            next_state = np.zeros(state.shape)
        else:
            reward = 0

        # 4. Store the transition, skipping the first `warmup` steps of the episode.
        if step > args.warmup:
            memory.add((state, action, reward, next_state))

        if not done:
            state = next_state

        # 5. Experience replay: fit the main network on one sampled mini-batch.
        if len(memory) >= args.batch_size:
            mini_batch = memory.sample(args.batch_size)

            # Stack the stored (1, state_size) rows into (batch, state_size)
            # arrays so each network is queried once per update instead of
            # once per sample.
            states_b = np.concatenate([exp[0] for exp in mini_batch], axis=0)
            actions_b = np.array([exp[1] for exp in mini_batch], dtype=int)
            rewards_b = np.array([exp[2] for exp in mini_batch], dtype=float)
            next_states_b = np.concatenate([exp[3] for exp in mini_batch], axis=0)

            # Terminal transitions were stored with an all-zero next state and
            # must not bootstrap from the target network.
            terminal_b = (next_states_b == 0).all(axis=1)
            next_q_max = np.amax(target_QNet.model.predict(next_states_b), axis=1)
            td_targets = np.where(terminal_b, rewards_b, rewards_b + args.gamma * next_q_max)

            # Start from the main network's current predictions so that only
            # the taken action's output contributes to the loss.
            targets = np.array(main_QNet.model.predict(states_b))
            targets[np.arange(len(mini_batch)), actions_b] = td_targets

            main_QNet.model.fit(states_b, targets, epochs=1, verbose=0)

        # Finish the episode.
        if done:
            break

    print("Episode: {:02d}, Step: {:03d}, Epsilon: {:.4f}".format(episode, step, epsilon))

    # Stop once five consecutive episodes met the success criterion.
    if success_count >= 5:
        break

    state = env.reset()
    state = np.reshape(state, [1, state_size])

Episode: 01, Step: 014, Epsilon: 0.9862
Episode: 02, Step: 012, Epsilon: 0.9746
Episode: 03, Step: 017, Epsilon: 0.9583
Episode: 04, Step: 021, Epsilon: 0.9386
Episode: 05, Step: 013, Epsilon: 0.9266
Episode: 06, Step: 016, Epsilon: 0.9121
Episode: 07, Step: 015, Epsilon: 0.8987
Episode: 08, Step: 012, Epsilon: 0.8881
Episode: 09, Step: 014, Epsilon: 0.8758
Episode: 10, Step: 015, Epsilon: 0.8630
Episode: 11, Step: 009, Epsilon: 0.8553
Episode: 12, Step: 016, Epsilon: 0.8419
Episode: 13, Step: 018, Epsilon: 0.8271
Episode: 14, Step: 020, Epsilon: 0.8109
Episode: 15, Step: 014, Epsilon: 0.7997
Episode: 16, Step: 022, Epsilon: 0.7826
Episode: 17, Step: 011, Epsilon: 0.7741
Episode: 18, Step: 026, Epsilon: 0.7545
Episode: 19, Step: 015, Epsilon: 0.7434
Episode: 20, Step: 017, Epsilon: 0.7310
Episode: 21, Step: 014, Epsilon: 0.7210
Episode: 22, Step: 018, Epsilon: 0.7083
Episode: 23, Step: 012, Epsilon: 0.7000
Episode: 24, Step: 017, Epsilon: 0.6884
Episode: 25, Step: 021, Epsilon: 0.6743
