In [64]:
import random
import math

class Game:
    # Actions
    # 0 = player bergerak ke kiri
    # 1 = player bergerak ke kanan
    num_actions = 2
    
    # State
    # 0 = player position (1 on state space)
    # 1 = target position (2 on state space)
    state = {
        0 : None,
        1 : None
    }
    
    MAX_MOVEMENT = 5
    current_movement = 5
    
    bot_idx = 0
    up_idx = 9
    
    def __init__(self) -> None:
        player = random.randint(self.bot_idx, self.up_idx)
        target = player
        while target == player:
            x = random.randint(- self.MAX_MOVEMENT, self.MAX_MOVEMENT)
            target = (x + player if x + player >= self.bot_idx and x + player <= self.up_idx else player)
        self.state = [player, target]
    
    def print_state(self):
        print(f"Target Position : {self.state[1]}")
        print(f"Player Position : {self.state[0]}")
    
    def get_state(self):
        state = [0 for _ in range(10)]
        state[self.state[0]] = 1 # player assign
        state[self.state[1]] = 2 # target assign
        return state
    
    def execute_action(self, action_num):
        rewards = None
        
        state_before = self.state.copy()
        if action_num == 0:
            if self.state[0] - 1 >= self.bot_idx:
                self.state[0] -= 1
        else:
            if self.state[0] + 1 <= self.up_idx:
                self.state[0] += 1
            
        rewards = self.__calculate_rewards__(self.state, state_before)
        
        self.current_movement -= 1
        
        return self.get_state(), rewards, self.__check_game_end__()
    
    def __calculate_rewards__(self, current_state, previous_state):
        # Calculate the distance between player and target and normalize it
        current = abs(current_state[0] - current_state[1])
        before = abs(previous_state[0] - previous_state[1])
        if current < before:
            return 1
        else:
            return -1
    
    def __check_game_end__(self):
        if self.current_movement == 0:
            return 1
        
        if self.state[0] == self.state[1]:
            return 1
        else:
            return 0

In [65]:
# 1 samples must be a = [state, action, rewards, next_state, is_done]
# is_done is for determining a terminal or non-terminal state

import random
import tensorflow as tf
import numpy as np

class ReplayMemory:
    main_memory = []
    max_reply = 0
    num_batch = 0
    def __init__(self, max_replay: int, mini_batch_num: int):
        self.max_reply = max_replay
        self.num_batch = mini_batch_num

class DeepQAgent:
    replay:ReplayMemory = None
    num_actions: int = None
    eval_model = None
    target_model = None
    gamma:float = None
    epsilon:float = None
    epsilon_min: float = None
    epsilon_decay: float = None
    
    # counter for updating model weight
    learn_counter: int = 0
    update_weight_on: int = 0
    
    def __init__(self, num_actions: int, max_replay: int, mini_batch_num: int, 
                 weight_update: int, epsilon: float, epsilon_min: float, 
                 epsilon_decay:float, gamma:float):
        self.replay = ReplayMemory(max_replay, mini_batch_num)
        self.eval_model, self.target_model = self.create_model()
        self.num_actions = int(num_actions)
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.update_weight_on = weight_update
        
    def create_model(self):
        # Create your own model and return the sequential model.
        # Need to watchout your input is need to be a state shape
        # And your output need to be your action shape
        model = tf.keras.models.Sequential([
            tf.keras.layers.Dense(1024, input_shape=(None,10), activation="relu"),
            tf.keras.layers.Dense(512, activation="relu"),
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(2, activation='linear'),
        ])
        
        model.compile(optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.00025),
                      loss = 'mean_squared_error',
                      metrics= ['mse'],
                      )
        
        return model, model
        
    def store_memory(self, state, action, rewards, next_state, is_done):
        if len(self.replay.main_memory) == self.replay.max_reply:
            self.replay.main_memory.pop(0)
        self.replay.main_memory.append([state, action, rewards, 
                                        next_state, is_done])
        
    def pick_action(self, state, epsilon = None):
        if epsilon == None:
            epsilon = self.epsilon
        action = None
        if random.random() > epsilon:
            if type(state) != list:
                state = state.tolist()
            prediction = self.eval_model.predict([state])[0]
            action = np.argmax(prediction)
        else:
            action = random.randint(0, self.num_actions - 1)
            
        return action
    
    def learn(self):
        if len(self.replay.main_memory) < self.replay.num_batch:
            return
        samples = self.__sample_mini_batch__()
        X_current = [x[0] for x in samples]
        X_current = np.array(X_current)
        X_next = [x[3] for x in samples]
        X_next = np.array(X_next)
        
        prediction = self.eval_model.predict(X_current)
        target_prediction = self.target_model.predict(X_next)
        for i in range(len(samples)):
            if samples[i][4]: # if is_done
                # For terminal next state
                prediction[i][samples[i][1]] = samples[i][2]
            else:
                # For non-terminal next state
                target = self.gamma * target_prediction[i][samples[i][1]]
                prediction[i][samples[i][1]] = samples[i][2] + target
                
        X_train = [i[0] for i in samples]
        X_train = np.array(X_train)
        
        # Tensorboard thingy
        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs/", histogram_freq=1)
        
        self.eval_model.fit(X_train, prediction, verbose=0, epochs=10, callbacks=[tensorboard_callback])
        print(self.epsilon)
        if self.learn_counter % self.update_weight_on == 0:
            self.__update_target_models__()
        
        # Post Learn
        self.learn_counter += 1
        epsilon_after_decay = self.epsilon * self.epsilon_decay
        if  epsilon_after_decay < self.epsilon_min:
            self.epsilon = self.epsilon_min
        else:
            self.epsilon = epsilon_after_decay
            
    def load_model(self,path:str):
        self.target_model = tf.keras.models.load_model(path)
        self.eval_model = tf.keras.models.load_model(path)
        print("Model Loaded")
        
    def save_model(self,path:str):
        self.eval_model.save(path)
        print("Model saved")
            
    def __sample_mini_batch__(self):
        return random.sample(self.replay.main_memory, self.replay.num_batch)

    def __update_target_models__(self):
        self.target_model.set_weights(self.eval_model.get_weights())

In [66]:
import matplotlib.pyplot as plt 
import time
%matplotlib qt

class Environment:
    game: Game
    agent: DeepQAgent
    step_counter: int = 1
    
    def __init__(self):
        self.agent = DeepQAgent(2, 2000, 500, 5, 1, 0.001, 0.99, 0.95)
        
    def train(self, num_episodes: int):

        # For Drawing Purposes
        # x_axis = []
        # y_axis = []  
        # figure, ax = plt.subplots(figsize=(10, 8))
        # line1, = ax.plot(x_axis, y_axis)
        # WINDOW_RATIO = 0.1
        # WINDOW_LIMIT = 0.85
        # plt.title("Error Margin of Target and Player", fontsize=20)
        # plt.xlabel("Number of Episodes")
        # plt.ylabel("Error Margin")
        # figure.canvas.draw()
        # plt.show(block=False)
        
        # Tensorboard
        summary_writer1 = tf.summary.create_file_writer("./logs/")
              
        for i in range(self.step_counter, self.step_counter+num_episodes):
            # print(f"Episodes {i}")
            self.game = Game()
            game_end = False
            while not game_end:
                state = self.game.get_state()
                action = self.agent.pick_action(state)
                next_state, rewards, game_end = self.game.execute_action(action)
                self.agent.store_memory(state, action, rewards,
                                        next_state, game_end)
            # For Model Learning Purposes
            if i % 100 == 0:
                self.agent.learn()
                
            # For Drawing of Error Margin of Target and Player
            with summary_writer1.as_default():
                y_temp = self.game.state
                error_margin = ((y_temp[0] - y_temp[1])**2)**0.5
                tf.summary.scalar(name="abs-error-margin", data=error_margin, step=i)
            summary_writer1.flush()
            
            # For drawing purposes
            # x_axis.append(i)
            # y_axis.append(((y_temp[0] - y_temp[1])**2)**0.5)
            # line1.set_xdata(x_axis)
            # line1.set_ydata(self.calculate_window(WINDOW_RATIO, WINDOW_LIMIT, y_axis))
            # plt.title(f"Error Margin of Target and Player\nEpsilon:{self.agent.epsilon}"
            #           , fontsize=20)
            
            # ax.relim() 
            # ax.autoscale_view(True,True,True) 

            # figure.canvas.draw()
            
            # plt.pause(0.005)
        
        self.step_counter += num_episodes
        
    def play(self, num_episode: int = 100, print_state = False):
        errors = []
        for i in range(num_episode):
            self.game = Game()
            game_end = False
            counter = 0
            while not game_end:
                state = self.game.get_state()
                action = self.agent.pick_action(state)
                _, _, game_end = self.game.execute_action(action)
                if print_state:
                    print(f"Turn num {counter}",self.game.get_state())
                    counter += 1
            
            y_temp = self.game.get_state()
            error_margin = ((y_temp[0] - y_temp[1])**2)**0.5
            errors.append(error_margin)
            
            if i % 50 == 0:
                avg_err = sum(errors) / len(errors)
                print(f"Current error on episode {i}: {avg_err}")
            
    def calculate_window(self, win_ratio, win_limit, series):
        copy_series = series.copy()
        series_length = len(copy_series)
        window_steps = int(series_length * win_ratio)
        steps_boundary = int(series_length * win_limit)
        if series_length < 100 + window_steps:
            return copy_series
        for i in range(steps_boundary):
            copy_series[i] = np.average(copy_series[i : i+window_steps])
        
        return copy_series

In [38]:
tf.keras.backend.clear_session()
envir = Environment()

In [39]:
envir.agent.eval_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, None, 1024)        11264     
                                                                 
 dense_1 (Dense)             (None, None, 512)         524800    
                                                                 
 dense_2 (Dense)             (None, None, 256)         131328    
                                                                 
 dense_3 (Dense)             (None, None, 128)         32896     
                                                                 
 dense_4 (Dense)             (None, None, 64)          8256      
                                                                 
 dense_5 (Dense)             (None, None, 2)           130       
                                                                 
Total params: 708,674
Trainable params: 708,674
Non-trai

In [40]:
envir.train(30000)

1
0.99
0.9801
0.9702989999999999
0.96059601
0.9509900498999999
0.9414801494009999
0.9320653479069899
0.92274469442792
0.9135172474836407
0.9043820750088043
0.8953382542587163
0.8863848717161291
0.8775210229989678
0.8687458127689781
0.8600583546412883
0.8514577710948754
0.8429431933839266
0.8345137614500874
0.8261686238355865
0.8179069375972307
0.8097278682212583
0.8016305895390458
0.7936142836436553
0.7856781408072188
0.7778213593991465
0.7700431458051551
0.7623427143471035
0.7547192872036325
0.7471720943315961
0.7397003733882802
0.7323033696543974
0.7249803359578534
0.7177305325982748
0.7105532272722921
0.7034476949995692
0.6964132180495735
0.6894490858690777
0.682554595010387
0.6757290490602831
0.6689717585696803
0.6622820409839835
0.6556592205741436
0.6491026283684022
0.6426116020847181
0.6361854860638709
0.6298236312032323
0.6235253948912
0.617290140942288
0.6111172395328651
0.6050060671375365
0.5989560064661611
0.5929664464014994
0.5870367819374844
0.5811664141181095
0.57535474997

In [41]:
# envir.agent.save_model("./model/own-game.h5")

Model saved


In [60]:
envir = Environment()
envir.agent.load_model("./model/own-game.h5")

Model Loaded
