## Partial Codes

In [1]:
import numpy as np
import random
from collections import deque
import matplotlib.pyplot as plt 
import gym

# The Keras and TensorFlow versions must match, otherwise tensorflow and keras do not interoperate correctly.
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense #, Input
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

In [2]:
def create_q_model(num_states, num_actions):
    """Build a small fully-connected Q-network.

    The network maps a state vector of length `num_states` through two ReLU
    hidden layers (32 then 16 units) to `num_actions` linear outputs, one
    Q-value per action.

    Args:
        num_states: Length of the state vector fed to the network.
        num_actions: Number of output units.

    Returns:
        An uncompiled `tf.keras.Model`.
    """
    keras_layers = tf.keras.layers
    state_in = tf.keras.Input(shape=(num_states,))
    hidden = state_in
    # Hidden stack: same widths and activation as before, created in order.
    for width in (32, 16):
        hidden = keras_layers.Dense(width, activation="relu")(hidden)
    q_values = keras_layers.Dense(num_actions, activation="linear")(hidden)
    return tf.keras.Model(inputs=state_in, outputs=q_values)

# Build a Q-network with 4 inputs and 2 outputs and print its layer summary.
model = create_q_model(4,2)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 4)]               0         
_________________________________________________________________
dense (Dense)                (None, 32)                160       
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
Total params: 722
Trainable params: 722
Non-trainable params: 0
_________________________________________________________________


In [3]:
def train(model):
    """Smoke-test `model` with one forward pass under a gradient tape.

    A batch of 10 all-zero states is pushed through the model. No loss is
    computed and no weights are updated; the function only confirms that the
    model can be called inside `tf.GradientTape`.

    Args:
        model: Keras model with a single input; its `input_shape` is expected
            to be `(None, state_size)`.
    """
    # Take the state width from the model instead of hard-coding CartPole's 4,
    # so the check also works for networks built for other environments.
    state_size = model.input_shape[-1]
    states = np.zeros((10, state_size), dtype=np.float32)
    with tf.GradientTape():
        model(states)
    print('Simple training is completed!')

def get_env_model():
    """Create the CartPole-v1 environment and a Q-network sized to it.

    Returns:
        A `(env, model)` pair: the gym environment and a model built by
        `create_q_model` from the environment's observation length and
        number of discrete actions.
    """
    cartpole = gym.make('CartPole-v1')
    obs_dim = cartpole.observation_space.shape[0]
    n_actions = cartpole.action_space.n
    return cartpole, create_q_model(obs_dim, n_actions)


# Build the environment with its matching Q-network, show the network, then
# run the forward-pass smoke test.
env, model = get_env_model()
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()
train(model)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 4)]               0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                160       
_________________________________________________________________
dense_4 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 34        
Total params: 722
Trainable params: 722
Non-trainable params: 0
_________________________________________________________________
None
Simple training is completed!


In [4]:
class World_00:
    """Minimal CartPole world that owns a gym environment and a Q-network.

    Attributes set by `get_env_model`:
        env: The `CartPole-v1` gym environment.
        num_states: Length of the observation vector.
        num_actions: Number of discrete actions.
        model: Q-network built by `create_q_model`.
    """

    def __init__(self):
        self.get_env_model()

    def get_env_model(self):
        """Create the environment and a Q-network sized from its spaces."""
        self.env = gym.make('CartPole-v1')
        # Read the spaces from this instance's own environment. Using a
        # notebook-global `env` here only works if an earlier cell happened
        # to define one, and would silently use that other environment.
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.n
        self.model = create_q_model(self.num_states, self.num_actions)

    def train(self):
        """Smoke-test the model with one forward pass under a gradient tape.

        Feeds 10 all-zero states through `self.model`; no loss is computed
        and no weights are updated.
        """
        states = np.zeros((10, self.num_states), dtype=np.float32)
        with tf.GradientTape():
            self.model(states)
        print('Completed!')

# Instantiate the minimal world and run its forward-pass smoke test.
new_world = World_00()
new_world.train()

Completed!


In [5]:
def env_test_model_memory(memory, env, model, n_episodes=1000, 
        flag_render=False):
    """Run greedy episodes with `model` and record every transition.

    For each step the action with the largest predicted Q-value is taken and
    `[s, a, r, next_s, done]` is appended to `memory`. The episode score
    (sum of rewards) is printed after each episode.

    Args:
        memory: Container with an `append` method that receives transitions.
        env: Environment whose `reset()` returns an observation and whose
            `step(a)` returns `(next_s, r, done, info)`.
        model: Object whose `predict` maps a `(1, state_dim)` array to a
            batch of per-action Q-values.
        n_episodes: Number of episodes to play.
        flag_render: When true, call `env.render()` after every step.
    """
    for e in range(n_episodes):
        done = False
        score = 0
        s = env.reset()
        while not done:
            s_array = np.array(s).reshape((1,-1))
            Qsa = model.predict(s_array)[0]
            a = np.argmax(Qsa)
            next_s, r, done, _ = env.step(a)
            if flag_render:
                env.render()
            score += r
            memory.append([s,a,r,next_s,done])
            # Advance to the new observation. Without this, every action is
            # chosen from the reset state and every stored transition carries
            # a stale `s`.
            s = next_s
        print(f'Episode: {e:5d} -->  Score: {score:3.1f}')
    print('Notice that the max score is set to 500.0 in CartPole-v1')

def list_rotate(l):
    """Transpose a sequence of records into a list of per-field tuples.

    `[[a1, b1], [a2, b2]]` becomes `[(a1, a2), (b1, b2)]`. Records are
    truncated to the shortest one, as with `zip`.
    """
    return [*zip(*l)]

class World_01(World_00):
    """World_00 plus a replay memory, a target network and one DQN update step.

    Attributes added here:
        memory: Bounded deque (maxlen 2000) of `[s, a, r, next_s, done]`.
        N_batch: Mini-batch size sampled from `memory`.
        t_model: Target network with the same architecture as `model`.
        discount_factor: Discount applied to the bootstrapped next-state value.
        learning_rate: Step size passed to the Adam optimizer.
        optimizer: Adam optimizer that updates `model`.
    """

    def __init__(self):
        World_00.__init__(self)
        self.memory = deque(maxlen=2000)
        self.N_batch = 64
        self.t_model = create_q_model(self.num_states, self.num_actions)
        # Start the target network as an exact copy of the online network;
        # otherwise the first targets come from unrelated random weights.
        self.t_model.set_weights(self.model.get_weights())
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        # `lr` is a deprecated alias; `learning_rate` is the supported keyword.
        self.optimizer = Adam(learning_rate=self.learning_rate)

    def trial(self, flag_render=False):
        """Play 10 greedy episodes, storing transitions, then print the memory size."""
        env_test_model_memory(self.memory, self.env,
            self.model, n_episodes=10, flag_render=flag_render)
        print(len(self.memory))

    def train_memory(self):
        """Run one gradient step on a random mini-batch from the replay memory.

        Does nothing until at least `N_batch` transitions are stored. The
        regression target is `r + (1 - done) * discount_factor * max_a' Q_t(s', a')`
        with `Q_t` taken from the target network; the loss is the mean squared
        error against the online network's Q-value for the action taken.
        """
        if len(self.memory) < self.N_batch:
            return

        memory_batch = random.sample(self.memory, self.N_batch)
        s_l,a_l,r_l,next_s_l,done_l = [np.array(x) for x in list_rotate(memory_batch)]

        # Make every operand float32 explicitly rather than relying on
        # implicit float64/bool -> float32 conversion when mixing NumPy
        # arrays with tensors.
        states = s_l.astype(np.float32)
        next_states = next_s_l.astype(np.float32)
        rewards = tf.convert_to_tensor(r_l.astype(np.float32))
        not_done = tf.convert_to_tensor(1.0 - done_l.astype(np.float32))

        # The target needs no gradient, so compute it outside the tape and
        # keep it in TensorFlow (tf.reduce_max instead of np.amax on a tensor).
        max_Q_next_s_a_l = tf.reduce_max(self.t_model(next_states), axis=-1)
        Qs_a_l = tf.stop_gradient(
            rewards + not_done * self.discount_factor * max_Q_next_s_a_l)

        model_w = self.model.trainable_variables
        with tf.GradientTape() as tape:
            Qsa_pred_l = self.model(states)
            # Select the Q-value of the action actually taken in each row.
            a_l_onehot = tf.one_hot(a_l, self.num_actions)
            Qs_a_pred_l = tf.reduce_sum(a_l_onehot * Qsa_pred_l, axis=1)
            loss = tf.reduce_mean(tf.square(Qs_a_l - Qs_a_pred_l))
        grads = tape.gradient(loss, model_w)
        self.optimizer.apply_gradients(zip(grads, model_w))

# Collect 10 greedy episodes with rendering, attempt one replay-memory update
# (it only runs once at least N_batch transitions are stored), then close the
# environment.
new_world = World_01()
new_world.trial(flag_render=True)
new_world.train_memory()
new_world.env.close()
print('Completed!')

Episode:     0 -->  Score: 9.0
Episode:     1 -->  Score: 9.0
Episode:     2 -->  Score: 9.0
Episode:     3 -->  Score: 10.0
Episode:     4 -->  Score: 9.0
Episode:     5 -->  Score: 9.0
Episode:     6 -->  Score: 9.0
Episode:     7 -->  Score: 10.0
Episode:     8 -->  Score: 10.0
Episode:     9 -->  Score: 10.0
Notice that the max score is set to 500.0 in CartPole-v1
94
Completed!


In [None]:
class World_02(World_01):
    """World_01 extended with epsilon-greedy action selection and a training loop."""

    def __init__(self):
        World_01.__init__(self)
        self.epsilon = 0.2

    def update_t_model(self):
        """Copy the online network's weights into the target network."""
        online_weights = self.model.get_weights()
        self.t_model.set_weights(online_weights)

    def best_action(self, s):
        """Choose an action for state `s` epsilon-greedily.

        With probability `epsilon` a uniformly random action index is
        returned; otherwise the index of the largest predicted Q-value.
        """
        explore = random.random() <= self.epsilon
        if explore:
            return random.randrange(self.num_actions)
        q_values = self.model.predict(np.array(s).reshape((1,-1)))[0]
        return np.argmax(q_values)

    def trials(self, n_episodes=100, flag_render=False):
        """Play `n_episodes` episodes, training after every environment step.

        Each transition is appended to the replay memory, followed by one
        `train_memory` call. The target network is synchronised at the end
        of every episode and the episode score is printed.
        """
        for e in range(n_episodes):
            score = 0
            s = self.env.reset()
            done = False
            while not done:
                a = self.best_action(s)
                next_s, r, done, _ = self.env.step(a)
                if flag_render:
                    self.env.render()
                score += r
                self.memory.append([s,a,r,next_s,done])
                s = next_s
                self.train_memory()
            self.update_t_model()
            print(f'Episode: {e:5d} -->  Score: {score:3.1f}')

# Train for 200 episodes with epsilon-greedy exploration, then close the
# environment.
new_world = World_02()
new_world.trials(n_episodes=200)
# %%
new_world.env.close()
print('Completed!')

Episode:     0 -->  Score: 9.0
Episode:     1 -->  Score: 10.0
Episode:     2 -->  Score: 14.0
Episode:     3 -->  Score: 16.0
Episode:     4 -->  Score: 19.0
Episode:     5 -->  Score: 10.0
Episode:     6 -->  Score: 10.0
Episode:     7 -->  Score: 10.0
Episode:     8 -->  Score: 9.0
Episode:     9 -->  Score: 11.0
Episode:    10 -->  Score: 11.0
Episode:    11 -->  Score: 12.0
Episode:    12 -->  Score: 11.0
Episode:    13 -->  Score: 12.0
Episode:    14 -->  Score: 12.0
Episode:    15 -->  Score: 14.0
Episode:    16 -->  Score: 12.0
Episode:    17 -->  Score: 12.0
Episode:    18 -->  Score: 11.0
Episode:    19 -->  Score: 13.0
Episode:    20 -->  Score: 12.0
Episode:    21 -->  Score: 12.0
Episode:    22 -->  Score: 9.0
Episode:    23 -->  Score: 9.0
Episode:    24 -->  Score: 9.0
Episode:    25 -->  Score: 9.0
Episode:    26 -->  Score: 16.0
Episode:    27 -->  Score: 18.0
Episode:    28 -->  Score: 27.0
Episode:    29 -->  Score: 13.0
Episode:    30 -->  Score: 12.0
Episode:    31

## Full Codes