#### Implementation of DQN paper for 1-dimensional games, such as Cartpole.
- https://www.nature.com/articles/nature14236
- https://arxiv.org/pdf/1312.5602.pdf

<br>

    The detailed implementations of the Q-Network, the state representation, and the ReplayBuffer differ from the original paper, because this notebook aims to solve a "simple 1-dimensional" game.
    Please see the notebook named "..._2dim" for a more rigorous implementation of the paper.

#### Please NOTE,
    The lines that differ from vanilla DQN are marked with '*/*/*/'.
    Search for '*/*/*/' to find them.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim 
import torch.nn.functional as F 

import gym
import numpy as np
import time
import os
import cv2
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [2]:
class QNetwork(nn.Module):
    """Fully connected Q-network mapping a state vector to one Q-value per action.

    The model is a plain three-layer MLP (input -> 256 -> 128 -> actions), so
    its architecture differs from the convolutional model in the DQN paper.
    """

    def __init__(self, input_feature: int, action_dim: int):
        """Build the layers.

        Parameters
        ----------
        input_feature: int
            Width of the state vector fed to the first linear layer.
        action_dim: int
            Number of actions, i.e. the width of the output layer.
        """
        super().__init__()
        self.action_dim = action_dim

        # Attribute names double as state_dict keys, so they are kept stable.
        self.linear1 = nn.Linear(input_feature, 256)
        self.linear2 = nn.Linear(256, 128)
        self.linear3 = nn.Linear(128, action_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the Q-values for `x`; ReLU follows the first two layers only."""
        hidden = self.linear1(x)
        hidden = self.relu(hidden)
        hidden = self.linear2(hidden)
        hidden = self.relu(hidden)
        return self.linear3(hidden)

In [3]:
# */*/*/
# The following SegmentTree Classes come from OpenAI Source
import operator

class SegmentTree(object):
    def __init__(self, capacity, operation, neutral_element):
        """Build a Segment Tree data structure.
        https://en.wikipedia.org/wiki/Segment_tree
        Can be used as regular array, but with two
        important differences:
            a) setting item's value is slightly slower.
               It is O(lg capacity) instead of O(1).
            b) user has access to an efficient ( O(log segment size) )
               `reduce` operation which reduces `operation` over
               a contiguous subsequence of items in the array.
        Parameters
        ----------
        capacity: int
            Total size of the array - must be a power of two.
        operation: lambda obj, obj -> obj
            an operation for combining elements (eg. sum, max);
            it must be associative.
        neutral_element: obj
            neutral element for the operation above. eg. float('-inf')
            for max and 0 for sum.
        """
        assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2."
        self._capacity = capacity
        # Kept so that reducing an empty range can return the identity.
        self._neutral_element = neutral_element
        # Heap layout: node 1 is the root, node i has children 2i and 2i+1,
        # and the leaves occupy indices [capacity, 2 * capacity).
        self._value = [neutral_element for _ in range(2 * capacity)]
        self._operation = operation

    def _reduce_helper(self, start, end, node, node_start, node_end):
        """Reduce the INCLUSIVE range [start, end] inside the subtree rooted at `node`.

        `node` covers the inclusive leaf range [node_start, node_end]; the
        requested range must be non-empty and lie within it.
        """
        if start == node_start and end == node_end:
            return self._value[node]
        mid = (node_start + node_end) // 2
        if end <= mid:
            # Entire query lies in the left child.
            return self._reduce_helper(start, end, 2 * node, node_start, mid)
        else:
            if mid + 1 <= start:
                # Entire query lies in the right child.
                return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end)
            else:
                # Query straddles the midpoint: combine both halves, left first.
                return self._operation(
                    self._reduce_helper(start, mid, 2 * node, node_start, mid),
                    self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end)
                )

    def reduce(self, start=0, end=None):
        """Returns result of applying `self.operation`
        to a contiguous subsequence of the array.
            self.operation(arr[start], operation(arr[start+1], operation(... arr[end - 1])))
        Parameters
        ----------
        start: int
            beginning of the subsequence (inclusive)
        end: int
            end of the subsequence (EXCLUSIVE). `None` means the full
            capacity; a negative value counts back from the capacity.
        Returns
        -------
        reduced: obj
            result of reducing self.operation over the specified range of array
            elements, or the neutral element when the range is empty.
        Raises
        ------
        IndexError
            if the range does not satisfy 0 <= start <= end <= capacity.
        """
        if end is None:
            end = self._capacity
        if end < 0:
            end += self._capacity
        if not 0 <= start <= end <= self._capacity:
            raise IndexError(
                "invalid range [{}, {}) for segment tree of capacity {}".format(start, end, self._capacity)
            )
        if start == end:
            # An empty range used to recurse forever in _reduce_helper; the
            # reduction over nothing is the identity of the operation.
            return self._neutral_element
        end -= 1  # the helper works on inclusive bounds
        return self._reduce_helper(start, end, 1, 0, self._capacity - 1)

    def __setitem__(self, idx, val):
        """Set leaf `idx` to `val` and refresh every ancestor up to the root.

        Raises IndexError when `idx` is outside [0, capacity); without this
        check a negative index would silently overwrite an internal node.
        """
        if not 0 <= idx < self._capacity:
            raise IndexError(
                "index {} out of range for segment tree of capacity {}".format(idx, self._capacity)
            )
        # index of the leaf
        idx += self._capacity
        self._value[idx] = val
        idx //= 2
        while idx >= 1:
            self._value[idx] = self._operation(
                self._value[2 * idx],
                self._value[2 * idx + 1]
            )
            idx //= 2

    def __getitem__(self, idx):
        """Return the value stored in leaf `idx`."""
        assert 0 <= idx < self._capacity
        return self._value[self._capacity + idx]

class SumSegmentTree(SegmentTree):
    """Segment tree specialised for addition: range sums and prefix-sum search."""

    def __init__(self, capacity):
        # 0.0 is the identity for addition, so unset leaves contribute nothing.
        super().__init__(capacity=capacity, operation=operator.add, neutral_element=0.0)

    def sum(self, start=0, end=None):
        """Returns arr[start] + ... + arr[end - 1] (`end` is exclusive, as in `reduce`)."""
        return super().reduce(start, end)

    def find_prefixsum_idx(self, prefixsum):
        """Find the highest index `i` in the array such that
            arr[0] + arr[1] + ... + arr[i - 1] <= prefixsum
        If the array values are (unnormalised) probabilities, this allows
        sampling indexes according to the discrete distribution efficiently.
        Parameters
        ----------
        prefixsum: float
            upper bound on the sum of the array prefix
        Returns
        -------
        idx: int
            highest index satisfying the prefixsum constraint
        """
        assert 0 <= prefixsum <= self.sum() + 1e-5
        node = 1
        remaining = prefixsum
        # Walk from the root down to a leaf (leaves start at index capacity).
        while node < self._capacity:
            left = 2 * node
            left_mass = self._value[left]
            if left_mass > remaining:
                # The target falls inside the left subtree.
                node = left
            else:
                # Skip the whole left subtree and continue on the right.
                remaining -= left_mass
                node = left + 1
        return node - self._capacity

class MinSegmentTree(SegmentTree):
    """Segment tree specialised for `min`: range-minimum queries."""

    def __init__(self, capacity):
        # +inf is the identity for min, so unset leaves never win a comparison.
        super().__init__(capacity=capacity, operation=min, neutral_element=float('inf'))

    def min(self, start=0, end=None):
        """Returns min(arr[start], ..., arr[end - 1]) (`end` is exclusive, as in `reduce`)."""
        return super().reduce(start, end)
# */*/*/

In [4]:
# Naive ReplayBuffer
class ReplayBuffer:
    """Fixed-size circular store of transitions, sampled uniformly at random.

    Transitions are kept in five parallel NumPy arrays (state, action, reward,
    next state, done flag); once `buffer_size` entries have been written the
    oldest slots are overwritten.
    """

    def __init__(self,
                 buffer_size: int,
                 input_dim: tuple,
                 batch_size: int,
                 input_type: str):
        """Allocate the storage arrays.

        Parameters
        ----------
        buffer_size: int
            Number of transitions the buffer can hold.
        input_dim: int or tuple
            For `input_type == '1-dim'` the length of the state vector;
            otherwise a 3-element sequence whose entries become the trailing
            state dimensions.
        batch_size: int
            Number of transitions returned by `batch_load`.
        input_type: str
            '1-dim' selects float32 vector states; any other value selects
            uint8 3-dimensional states.
        """
        if input_type == '3-dim':
            assert len(input_dim)==3, "The state dimension should be 3-dim! (Channel x Width x Height). Please check if input_dim is right"

        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.save_count = 0    # next slot to write
        self.current_size = 0  # number of valid transitions

        if input_type == '1-dim':
            state_shape = (buffer_size, input_dim)
            state_dtype = np.float32
        else:
            # WARN: uint8 storage, so only integer values 0~255 survive intact.
            state_shape = (buffer_size, input_dim[0], input_dim[1], input_dim[2])
            state_dtype = np.uint8

        self.state_buffer = np.ones(state_shape, dtype=state_dtype)
        self.action_buffer = np.ones(buffer_size, dtype=np.uint8)
        self.reward_buffer = np.ones(buffer_size, dtype=np.float32)
        self.next_state_buffer = np.ones(state_shape, dtype=state_dtype)
        self.done_buffer = np.ones(buffer_size, dtype=np.uint8)

    def __len__(self):
        """Number of transitions currently stored."""
        return self.current_size

    def store(self,
              state: np.ndarray,
              action: int,
              reward: float,
              next_state: np.ndarray,
              done: int):
        """Write one transition into the current slot and advance the cursor."""
        slot = self.save_count
        self.state_buffer[slot] = state
        self.action_buffer[slot] = action
        self.reward_buffer[slot] = reward
        self.next_state_buffer[slot] = next_state
        self.done_buffer[slot] = done

        # Wrap around once the end of the arrays is reached.
        self.save_count = (slot + 1) % self.buffer_size
        self.current_size = min(self.current_size + 1, self.buffer_size)

    def batch_load(self):
        """Return `batch_size` transitions drawn uniformly (with replacement) as a dict of arrays."""
        picked = np.random.randint(self.current_size, size=self.batch_size)
        return {
            'states': self.state_buffer[picked],
            'actions': self.action_buffer[picked],
            'rewards': self.reward_buffer[picked],
            'next_states': self.next_state_buffer[picked],
            'dones': self.done_buffer[picked],
        }

# */*/*/
# ReplayBuffer for Prioritized Experience Replay. 
class PrioritizedReplayBuffer(ReplayBuffer):
    """Replay buffer with proportional prioritization (Prioritized Experience Replay).

    Each stored transition has a priority p; it is sampled with probability
    p**alpha / sum(p**alpha).  A sum tree holds the scaled priorities for
    sampling and a min tree tracks the smallest one, which is needed to
    normalise the importance-sampling weights.
    """

    def __init__(self, buffer_size, input_dim, batch_size, alpha, input_type):
        """Create the underlying buffer and the two priority trees.

        Parameters
        ----------
        buffer_size, input_dim, batch_size, input_type:
            Forwarded unchanged to `ReplayBuffer`.
        alpha: float
            Exponent applied to priorities before they are stored in the trees.
        """
        super(PrioritizedReplayBuffer, self).__init__(buffer_size, input_dim, batch_size, input_type)

        # New transitions get the largest priority seen so far.
        self.max_priority, self.tree_idx = 1.0, 0
        self.alpha = alpha

        # The segment trees require a power-of-two capacity.
        tree_capacity = 1
        while tree_capacity < self.buffer_size:
            tree_capacity *= 2

        self.sum_tree = SumSegmentTree(tree_capacity)
        self.min_tree = MinSegmentTree(tree_capacity)

    def store(self,
              state: np.ndarray,
              action: int,
              reward: float,
              next_state: np.ndarray,
              done: int):
        """Store a transition and give it the current maximum priority."""
        super().store(state, action, reward, next_state, done)

        # Assign the maximum priority so every new transition is likely to be
        # replayed at least once before its priority is refined.
        self.sum_tree[self.tree_idx] = self.max_priority ** self.alpha
        self.min_tree[self.tree_idx] = self.max_priority ** self.alpha
        self.tree_idx = (self.tree_idx + 1) % self.buffer_size

    def batch_load(self, beta):
        """Sample a prioritized batch.

        Parameters
        ----------
        beta: float
            Exponent of the importance-sampling correction.

        Returns
        -------
        dict with the same keys as `ReplayBuffer.batch_load` plus
        `weights` (importance-sampling weights, normalised by the largest
        possible weight) and `indices` (buffer slots of the samples, to be
        passed back to `update_priorities`).

        Raises
        ------
        ValueError
            if the buffer is empty.
        """
        if len(self) == 0:
            raise ValueError("Cannot sample from an empty replay buffer.")

        indices = self._load_batch_indices()

        # The largest weight belongs to the least likely transition; dividing by
        # it keeps every weight in (0, 1].
        p_total = self.sum_tree.sum()
        p_min = self.min_tree.min() / p_total
        max_weight = (p_min*len(self)) ** (-beta)
        weights = np.array([self._calculate_weight(idx, beta, max_weight, p_total) for idx in indices])

        return dict(
                states=self.state_buffer[indices],
                actions=self.action_buffer[indices],
                rewards=self.reward_buffer[indices],
                next_states=self.next_state_buffer[indices],
                dones=self.done_buffer[indices],
                weights=weights,
                indices=indices)

    def update_priorities(self, indices, priorities):
        """Replace the priorities of the transitions at `indices`.

        Raises
        ------
        ValueError
            if a priority is not strictly positive (a zero priority would make
            the importance-sampling weight undefined).
        IndexError
            if an index does not refer to a stored transition.
        """
        for idx, priority in zip(indices, priorities):
            if priority <= 0:
                raise ValueError("priority must be positive, got {}".format(priority))
            if not 0 <= idx < len(self):
                raise IndexError("index {} does not refer to a stored transition".format(idx))
            scaled = priority ** self.alpha
            self.sum_tree[idx] = scaled
            self.min_tree[idx] = scaled
            self.max_priority = max(self.max_priority, priority)

    def _load_batch_indices(self):
        """Draw `batch_size` slots by stratified sampling over the priority mass."""
        indices = []
        size = len(self)
        # `sum`'s end bound is exclusive, so this covers every stored transition.
        p_total = self.sum_tree.sum(0, size)
        # Split the total mass into batch_size equal segments, one sample each.
        segment = p_total / self.batch_size

        for i in range(self.batch_size):
            a = segment * i
            b = segment * (i+1)
            sample = np.random.uniform(a, b) # sample a value uniformly within the i-th segment
            idx = self.sum_tree.find_prefixsum_idx(sample) # slot whose cumulative priority covers the sample
            # Floating-point rounding can push the search one step past the
            # last filled slot; clamp so only stored transitions are returned.
            indices.append(min(idx, size - 1))

        return indices

    def _calculate_weight(self, idx, beta, max_weight, p_total=None):
        """Return the normalised importance-sampling weight of slot `idx`.

        `p_total` is the sum of all scaled priorities; it is looked up from the
        sum tree when not supplied.
        """
        if p_total is None:
            p_total = self.sum_tree.sum()
        p_sample = self.sum_tree[idx] / p_total
        weight = (p_sample*len(self)) ** (-beta)
        weight = weight / max_weight

        return weight
# */*/*/

if __name__ == '__main__':
    # Smoke test: fill half of a small prioritized buffer with identical
    # transitions, then print the shape of every field of a sampled batch.
    buffer_size, state_dim, batch_size = 100, 4, 16
    alpha, beta = 0.6, 0.4
    input_type = '1-dim'
    buffer = PrioritizedReplayBuffer(buffer_size, state_dim, batch_size, alpha, input_type)

    for i in range(50):
        state, next_state = np.ones(state_dim), np.ones(state_dim)
        action, reward, done = 1, 1, 1
        buffer.store(state, action, reward, next_state, done)

    print(buffer.alpha)
    print(buffer.max_priority)
    # A fresh batch is drawn for every printed field.
    for _field in ('states', 'actions', 'rewards', 'next_states', 'dones', 'weights'):
        print(buffer.batch_load(beta)[_field].shape)
    print(len(buffer.batch_load(beta)['indices']))

0.6
1.0
(16, 4)
(16,)
(16,)
(16, 4)
(16,)
(16,)
16


In [4]:
class Agent:
    """DQN agent with Prioritized Experience Replay (PER) for vector-state environments.

    A behavior network is trained from a prioritized replay buffer while a
    target network, refreshed by hard or soft updates, provides the bootstrap
    values.  Exploration is epsilon-greedy with a linearly decaying epsilon.
    """

    def __init__(self,
                 env: 'Environment',
                 input_dim: int,
                 training_frames: int,
                 eps_decay: float,
                 gamma: float,
                 target_update_freq: int,
                 update_type: str = 'hard',
                 soft_update_tau: float = None,
                 batch_size: int = 32,
                 buffer_size: int = 1000000,
                 # */*/*/
                 alpha: float = 0.5,
                 beta: float = 0.5,
                 epsilon_for_priority: float = 1e-6,
                 # */*/*/
                 update_start_buffer_size: int = 50000,
                 learning_rate: float = 0.0004,
                 eps_min: float = 0.1,
                 eps_max: float = 1.0,
                 device_num: int = 0,
                 rand_seed: int = None,
                 plot_option=False,
                 model_path: str = './',
                 trained_model_path: str = ''):
        """Set up networks, optimizer and replay memory.

        Parameters
        ----------
        env: environment exposing `action_space`, `reset()` and `step(action)`.
        input_dim: length of the state vector (input width of the Q-network).
        training_frames: total number of training frames.
        eps_decay: amount subtracted from epsilon after every training frame.
        gamma: discount factor.
        target_update_freq: hard-update period of the target network, in frames.
        update_type: 'hard' or 'soft' target-network update.
        soft_update_tau: mixing ratio used by the soft update.
        batch_size: number of transitions per gradient step.
        buffer_size: replay buffer capacity.
        alpha: PER exponent controlling how strongly priorities skew sampling.
        beta: initial PER importance-sampling exponent, annealed linearly to 1.0
            over `training_frames`.
        epsilon_for_priority: small constant added to |TD error| so that no
            transition ends up with zero priority.
        update_start_buffer_size: transitions collected before learning starts.
        learning_rate: Adam learning rate.
        eps_min / eps_max: final and initial epsilon.
        device_num: CUDA device index (used only when CUDA is available).
        rand_seed: stored as `self.seed`; not otherwise used by this class.
        plot_option: 'inline' to plot after every episode, anything else prints scores.
        model_path: directory prefix for saved checkpoints.
        trained_model_path: optional checkpoint to load into the behavior network.
        """
        self.action_dim = env.action_space.n
        self.device = torch.device(f'cuda:{device_num}' if torch.cuda.is_available() else 'cpu')
        self.model_path = model_path

        self.env = env
        self.input_dim = input_dim
        self.training_frames = training_frames
        self.epsilon = eps_max
        self.eps_decay = eps_decay
        self.eps_min = eps_min
        self.gamma = gamma
        self.target_update_freq = target_update_freq
        self.update_cnt = 0
        self.update_type = update_type
        self.tau = soft_update_tau
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.update_start = update_start_buffer_size
        self.seed = rand_seed
        self.plot_option = plot_option

        # */*/*/
        self.alpha = alpha
        self.beta = beta
        # Per-frame increment that brings beta to 1.0 at the end of training.
        self.beta_step = (1.0 - beta) / self.training_frames
        self.epsilon_for_priority = epsilon_for_priority
        # */*/*/

        self.q_behave = QNetwork(self.input_dim, self.action_dim).to(self.device)
        self.q_target = QNetwork(self.input_dim, self.action_dim).to(self.device)
        if trained_model_path: # load a trained model if existing
            # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
            self.q_behave.load_state_dict(torch.load(trained_model_path, map_location=self.device))
            print("Trained model is loaded successfully.")

        # Initialize target network parameters with behavior network parameters
        self.q_target.load_state_dict(self.q_behave.state_dict())
        self.q_target.eval()
        self.optimizer = optim.Adam(self.q_behave.parameters(), lr=learning_rate)

        # */*/*/
        # Prioritized buffer over 1-dimensional states (the plain ReplayBuffer
        # call here was missing its required `input_type` argument).
        self.memory = PrioritizedReplayBuffer(self.buffer_size, self.input_dim, self.batch_size, self.alpha, '1-dim')
        # */*/*/

    def select_action(self, state: np.ndarray):
        """Pick an action epsilon-greedily.

        `state` must be pre-processed the same way as the states used in
        `_compute_loss`.  Returns `(q_values, action)`; for a random action the
        Q-values are reported as zeros.  The Q-values are only returned for
        inspection and are not needed by the algorithm.
        """
        if np.random.random() < self.epsilon:
            return np.zeros(self.action_dim), self.env.action_space.sample()
        else:
            # with no_grad to compute faster
            with torch.no_grad():
                state = torch.FloatTensor(state).to(self.device)
                Qs = self.q_behave(state)
                # take an action of a maximum Q-value
                action = Qs.argmax()

            return Qs.detach().cpu().numpy(), action.detach().item()

    def get_init_state(self):
        """Reset the environment and return the initial state."""
        init_state = self.env.reset()
        for _ in range(0): # loop for a random initial starting point. range(0) means the same initial point.
            action = self.env.action_space.sample()
            init_state, _, _, _ = self.env.step(action)
        return init_state

    def get_state(self, state, action):
        """Step the environment with `action`; returns `(reward, next_state, done)`."""
        next_state, reward, done, _ = self.env.step(action)
        return reward, next_state, done

    def store(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.store(state, action, reward, next_state, done)

    def update_behavior_q_net(self):
        """Run one prioritized gradient step on the behavior network; returns the loss value."""
        # */*/*/
        batch = self.memory.batch_load(self.beta)
        elementwise_loss, td_errors = self._compute_elementwise_loss(batch)
        # Importance-sampling weights correct the bias of non-uniform sampling.
        weights = torch.FloatTensor(batch['weights'].reshape(-1, 1)).to(self.device)
        loss = torch.mean(elementwise_loss * weights)
        # */*/*/

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # */*/*/
        # New priority = |TD error| + small constant (kept strictly positive).
        new_priorities = td_errors.abs().cpu().numpy().reshape(-1).astype(np.float64) + self.epsilon_for_priority
        self.memory.update_priorities(batch['indices'], new_priorities)
        # */*/*/

        return loss.item()

    def target_soft_update(self):
        ''' target network is updated with Soft Update. tau is a hyperparameter for the updating ratio between target and behavior network  '''
        for target_param, current_param in zip(self.q_target.parameters(), self.q_behave.parameters()):
            target_param.data.copy_(self.tau*current_param.data + (1.0-self.tau)*target_param.data)

    def target_hard_update(self):
        ''' target network is updated with Hard Update '''
        self.update_cnt = (self.update_cnt+1) % self.target_update_freq
        if self.update_cnt==0:
            self.q_target.load_state_dict(self.q_behave.state_dict())

    def train(self):
        """Fill the buffer with `update_start` transitions, then train for `training_frames` frames.

        A checkpoint is saved whenever the mean of the last 10 episode scores
        exceeds every previous such mean.
        """
        tic = time.time()
        losses = []
        scores = []
        epsilons = []
        avg_scores = [-10000] # As an initial score, set an arbitrary score of an episode.

        score = 0

        print("Storing initial buffer..")
        state = self.get_init_state()
        for frame_idx in range(1, self.update_start+1):
            # Store transitions into the buffer until the number of 'self.update_start' transitions is stored
            _, action = self.select_action(state)
            reward, next_state, done = self.get_state(state, action)
            self.store(state, action, reward, next_state, done)
            state = next_state
            if done: state = self.get_init_state()

        print("Done. Start learning..")
        history_store = []
        for frame_idx in range(1, self.training_frames+1):
            Qs, action = self.select_action(state)
            reward, next_state, done = self.get_state(state, action)
            self.store(state, action, reward, next_state, done)
            history_store.append([state, Qs, action, reward, next_state, done]) # history_store is for checking an episode later. Not must-needed.
            loss = self.update_behavior_q_net()

            if self.update_type=='hard':   self.target_hard_update()
            elif self.update_type=='soft': self.target_soft_update()

            score += reward
            losses.append(loss)

            if done:
                # For saving and plotting when an episode is done.
                scores.append(score)
                recent_mean = np.mean(scores[-10:])
                if recent_mean > max(avg_scores):
                    torch.save(self.q_behave.state_dict(), self.model_path+'{}_Score:{}.pt'.format(frame_idx, recent_mean))
                    training_time = round((time.time()-tic)/3600, 1)
                    # Each record mixes arrays of different sizes with scalars, so
                    # an object array is required (ragged input is rejected otherwise).
                    np.save(self.model_path+'{}_history_Score_{}_{}hrs.npy'.format(frame_idx, score, training_time), np.array(history_store, dtype=object))
                    print("          | Model saved. Recent scores: {}, Training time: {}hrs".format(scores[-10:], training_time), ' /'.join(os.getcwd().split('/')[-3:]))
                avg_scores.append(recent_mean)

                if self.plot_option=='inline':
                    # The score was already appended above; appending it again
                    # here would count every episode twice.
                    epsilons.append(self.epsilon)
                    self._plot(frame_idx, scores, losses, epsilons)
                else:
                    print(score, end='\r')

                score=0
                state = self.get_init_state()
                history_store = []
            else: state = next_state

            self._epsilon_step()
            # */*/*/
            self._beta_step()
            # */*/*/

        print("Total training time: {}(hrs)".format((time.time()-tic)/3600))

    def _epsilon_step(self):
        """Decay epsilon by `eps_decay`, never going below `eps_min`."""
        self.epsilon = max(self.epsilon-self.eps_decay, self.eps_min)

    def _beta_step(self):
        """Anneal the importance-sampling exponent towards 1.0."""
        self.beta = min(1.0, self.beta + self.beta_step)

    def _compute_elementwise_loss(self, batch: "Dictionary (S, A, R', S', Dones)"):
        """Return `(per-sample Huber loss, per-sample TD error)`, both shaped (batch, 1).

        The TD error is detached from the graph; it is only used as a priority.
        """
        states = torch.FloatTensor(batch['states']).to(self.device)
        next_states = torch.FloatTensor(batch['next_states']).to(self.device)
        actions = torch.LongTensor(batch['actions'].reshape(-1, 1)).to(self.device)
        rewards = torch.FloatTensor(batch['rewards'].reshape(-1, 1)).to(self.device)
        dones = torch.FloatTensor(batch['dones'].reshape(-1, 1)).to(self.device)

        current_q = self.q_behave(states).gather(1, actions)

        # target value (no gradient flows through the target network)
        next_q = self.q_target(next_states).max(dim=1, keepdim=True)[0].detach()
        mask = 1 - dones  # terminal transitions do not bootstrap
        target = (rewards + (mask * self.gamma * next_q)).to(self.device)

        # Use smooth l1 loss for clipping loss between -1 to 1 as in DQN paper.
        elementwise_loss = F.smooth_l1_loss(current_q, target, reduction='none')
        td_errors = (target - current_q).detach()
        return elementwise_loss, td_errors

    def _compute_loss(self, batch: "Dictionary (S, A, R', S', Dones)"):
        """Return the scalar loss for `batch`.

        When the batch carries importance-sampling `weights` the weighted mean
        is returned; otherwise the plain mean of the per-sample Huber loss.
        """
        elementwise_loss, _ = self._compute_elementwise_loss(batch)
        if 'weights' in batch:
            weights = torch.FloatTensor(batch['weights'].reshape(-1, 1)).to(self.device)
            return torch.mean(elementwise_loss * weights)
        return torch.mean(elementwise_loss)

    def _plot(self, frame_idx, scores, losses, epsilons):
        """Redraw the score / loss / epsilon curves in the notebook output."""
        clear_output(True)
        plt.figure(figsize=(20, 5), facecolor='w')
        plt.subplot(131)
        plt.title('frame %s. score: %s' % (frame_idx, np.mean(scores[-10:])))
        plt.plot(scores)
        plt.subplot(132)
        plt.title('loss')
        plt.plot(losses)
        plt.subplot(133)
        plt.title('epsilons')
        plt.plot(epsilons)
        plt.show()

#### Configurations

![image](https://drive.google.com/uc?id=1P_PgreL2VnMTFTQAbM1wxANka8pu7bx0)

In [5]:
# Environments this notebook can be pointed at; pick one by key below.
env_list = {
    0: "CartPole-v0",
    1: "CartPole-v1",  # Gym registers CartPole-v0 and -v1; there is no -v2
    2: "LunarLander-v2",
}

env_name = env_list[0]
env = gym.make(env_name)

# Length of the 1-dimensional observation vector (input width of the Q-network).
input_dim = env.observation_space.shape[0]
print("env_name", env_name)
update_start_buffer_size = 200
training_frames = 20000
eps_max = 1.0
eps_min = 0.1
eps_decay = 1/2000
gamma = 0.99

buffer_size = int(2e3)
batch_size = 32
update_type = 'hard'
soft_update_tau = 0.002
learning_rate = 0.001
target_update_freq = 100

device_num = 0
rand_seed = None
rand_name = ('').join(map(str, np.random.randint(10, size=(3,))))
folder_name = os.getcwd().split('/')[-1]

model_name = 'Test'
model_save_path = f'./model_save/{model_name}/'
# Creates './model_save/' and the model sub-folder in one call; a no-op if present.
os.makedirs(model_save_path, exist_ok=True)
print("model_save_path:", model_save_path)

trained_model_path = ''

#for PER
alpha = 0.5
beta = 0.6
epsilon_for_priority = 1e-6

plot_options = {1: 'inline', 2: False}
plot_option = plot_options[2]

env_name CartPole-v0
model_save_path: ./model_save/Test/


In [None]:
# Arguments are passed by keyword: the PER parameters (alpha, beta,
# epsilon_for_priority) sit in the middle of Agent's signature, so a positional
# call would shift every later value into the wrong parameter.
agent = Agent(
    env=env,
    input_dim=input_dim,
    training_frames=training_frames,
    eps_decay=eps_decay,
    gamma=gamma,
    target_update_freq=target_update_freq,
    update_type=update_type,
    soft_update_tau=soft_update_tau,
    batch_size=batch_size,
    buffer_size=buffer_size,
    alpha=alpha,
    beta=beta,
    epsilon_for_priority=epsilon_for_priority,
    update_start_buffer_size=update_start_buffer_size,
    learning_rate=learning_rate,
    eps_min=eps_min,
    eps_max=eps_max,
    device_num=device_num,
    rand_seed=rand_seed,
    plot_option=plot_option,
    model_path=model_save_path,
    trained_model_path=trained_model_path,
)

agent.train()

#### An example of results

    Storing initial buffer..
    Done. Start learning..
              | Model saved. Recent scores: [31.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [20.0, 30.0, 16.0, 42.0, 49.0, 22.0, 23.0, 24.0, 13.0, 84.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [30.0, 16.0, 42.0, 49.0, 22.0, 23.0, 24.0, 13.0, 84.0, 50.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [42.0, 49.0, 22.0, 23.0, 24.0, 13.0, 84.0, 50.0, 24.0, 126.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [84.0, 50.0, 24.0, 126.0, 13.0, 22.0, 35.0, 21.0, 49.0, 76.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [126.0, 13.0, 22.0, 35.0, 21.0, 49.0, 76.0, 33.0, 95.0, 145.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [13.0, 22.0, 35.0, 21.0, 49.0, 76.0, 33.0, 95.0, 145.0, 145.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [22.0, 35.0, 21.0, 49.0, 76.0, 33.0, 95.0, 145.0, 145.0, 126.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [35.0, 21.0, 49.0, 76.0, 33.0, 95.0, 145.0, 145.0, 126.0, 112.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [21.0, 49.0, 76.0, 33.0, 95.0, 145.0, 145.0, 126.0, 112.0, 161.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [49.0, 76.0, 33.0, 95.0, 145.0, 145.0, 126.0, 112.0, 161.0, 193.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [76.0, 33.0, 95.0, 145.0, 145.0, 126.0, 112.0, 161.0, 193.0, 200.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [33.0, 95.0, 145.0, 145.0, 126.0, 112.0, 161.0, 193.0, 200.0, 175.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [95.0, 145.0, 145.0, 126.0, 112.0, 161.0, 193.0, 200.0, 175.0, 139.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [145.0, 145.0, 126.0, 112.0, 161.0, 193.0, 200.0, 175.0, 139.0, 150.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [145.0, 126.0, 112.0, 161.0, 193.0, 200.0, 175.0, 139.0, 150.0, 168.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [126.0, 112.0, 161.0, 193.0, 200.0, 175.0, 139.0, 150.0, 168.0, 187.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [112.0, 161.0, 193.0, 200.0, 175.0, 139.0, 150.0, 168.0, 187.0, 200.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [161.0, 193.0, 200.0, 175.0, 139.0, 150.0, 168.0, 187.0, 200.0, 200.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [193.0, 200.0, 175.0, 139.0, 150.0, 168.0, 187.0, 200.0, 200.0, 200.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [200.0, 175.0, 139.0, 150.0, 168.0, 187.0, 200.0, 200.0, 200.0, 195.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [150.0, 168.0, 187.0, 200.0, 200.0, 200.0, 195.0, 155.0, 200.0, 178.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [168.0, 187.0, 200.0, 200.0, 200.0, 195.0, 155.0, 200.0, 178.0, 200.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [187.0, 200.0, 200.0, 200.0, 195.0, 155.0, 200.0, 178.0, 200.0, 169.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [200.0, 200.0, 200.0, 195.0, 155.0, 200.0, 178.0, 200.0, 169.0, 200.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [200.0, 183.0, 192.0, 200.0, 200.0, 157.0, 176.0, 200.0, 193.0, 200.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN