In [None]:
import os
import random
import gym
import pylab
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
import cv2
import threading
from threading import Thread, Lock
import time
import tensorflow_probability as tfp
from typing import Any, List, Sequence, Tuple
from gym.spaces import Dict, Discrete, Box, Tuple
from parametric_distribution import get_parametric_distribution_for_action_space
from collections import deque

# Hide every CUDA device from TensorFlow so the whole script runs on the CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
# Alternative kept for reference: use the first GPU with a capped memory limit.
#gpus = tf.config.experimental.list_physical_devices('GPU')
#tf.config.experimental.set_virtual_device_configuration(gpus[0],
#          [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4000)])

# Short alias for the TensorFlow Probability distributions module.
tfd = tfp.distributions

class OurModel(tf.keras.Model):
    """Small actor-critic network with a shared hidden layer and two heads.

    The input is flattened, passed through one 512-unit ReLU layer, and then
    fed to a linear policy head (``action_space`` outputs) and a linear value
    head (one output).
    """

    def __init__(self, input_shape, action_space):
        """Create the layers.

        Args:
            input_shape: accepted for interface compatibility; not used here.
            action_space: number of outputs of the policy head.
        """
        super().__init__()

        self.flatten = Flatten()
        self.dense_0 = Dense(512, activation='relu')
        self.dense_1 = Dense(action_space)
        self.dense_2 = Dense(1)

    def call(self, X_input):
        """Return ``(action_logit, value)`` for a batch of inputs."""
        features = self.dense_0(self.flatten(X_input))
        return self.dense_1(features), self.dense_2(features)


def safe_log(x):
    """Return log(x) element-wise, with 0 wherever x is exactly zero.

    Values are clamped to at least 1e-12 before the logarithm is taken.
    """
    is_zero = tf.math.equal(x, 0)
    clamped_log = tf.math.log(tf.math.maximum(1e-12, x))
    return tf.where(is_zero, tf.zeros_like(x), clamped_log)


def take_vector_elements(vectors, indices):
    """
    Pick one component out of every vector in a batch.
    Args:
      vectors: a [batch x dims] Tensor.
      indices: an int32 Tensor with `batch` entries; entry i selects the
        component taken from row i.
    Returns:
      A Tensor with `batch` entries, one for each vector.
    """
    row_ids = tf.range(tf.shape(vectors)[0])
    # Pair every row number with the requested column: [[0, i0], [1, i1], ...]
    coordinates = tf.stack([row_ids, indices], axis=1)
    return tf.gather_nd(vectors, coordinates)


# Keras loss objects. None of the three is referenced by the code visible in
# this file; the agent's update() computes its losses by hand.
huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
mse_loss = tf.keras.losses.MeanSquaredError()

# Distribution helper used by IMPALA_Agent.update() to turn logits plus actions
# into log-probabilities. The action count is hard-coded to 6 here.
# NOTE(review): this should match env.action_space.n of the environment in use - confirm.
parametric_action_distribution = get_parametric_distribution_for_action_space(Discrete(6))

class IMPALA_Agent:
    # IMPALA Main Optimization Algorithm
    def __init__(self, env_name):
        """Create the environment, the rollout buffers and the network.

        Args:
            env_name: gym environment id passed to ``gym.make``.
        """
        # Initialization Environment and parameters
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.action_size = self.env.action_space.n
        self.EPISODES, self.episode, self.max_average = 2000000, 0, -21.0 # specific for pong

        self.memory_size = 25000
        self.memory_1 = []
        self.memory_2 = []

        self.batch_size = 4
        self.unroll_length = 101

        self.ROWS = 80
        self.COLS = 80
        self.REM_STEP = 4
        self.state_size = (self.COLS, self.ROWS, self.REM_STEP)

        # One rollout buffer set per actor thread, stored as <name>_1 .. <name>_4
        # because remember() addresses them by those attribute names.
        # The boolean buffers use np.bool_: the old np.bool alias was
        # deprecated in NumPy 1.20 and removed in 1.24.
        buffer_rows = self.unroll_length + 1
        for worker in range(1, 5):
            suffix = '_{}'.format(worker)
            setattr(self, 'memory_index' + suffix, 1)
            setattr(self, 'states' + suffix,
                    np.zeros((buffer_rows, *self.state_size), dtype=np.float32))
            setattr(self, 'actions' + suffix,
                    np.zeros((buffer_rows,), dtype=np.int32))
            setattr(self, 'policies' + suffix,
                    np.zeros((buffer_rows, self.action_size), dtype=np.float32))
            setattr(self, 'rewards' + suffix,
                    np.zeros((buffer_rows,), dtype=np.float32))
            setattr(self, 'dones' + suffix,
                    np.zeros((buffer_rows,), dtype=np.bool_))

        # Queue of finished unrolls handed from the actors to the learner.
        self.deq = deque(maxlen=10)

        batch_shape = (self.batch_size, self.unroll_length)
        self.states = np.zeros((*batch_shape, *self.state_size), dtype=np.float32)
        self.actions = np.zeros(batch_shape, dtype=np.int32)
        self.policies = np.zeros((*batch_shape, self.action_size), dtype=np.float32)
        self.rewards = np.zeros(batch_shape, dtype=np.float32)
        self.dones = np.zeros(batch_shape, dtype=np.bool_)

        self.lock = Lock()
        self.lr = 0.0001

        # Stack of the most recent REM_STEP pre-processed frames.
        self.image_memory = np.zeros(self.state_size)

        # Instantiate plot memory
        self.scores, self.episodes, self.average = [], [], []

        self.Save_Path = 'Models'
        os.makedirs(self.Save_Path, exist_ok=True)
        self.path = '{}_IMPALA_{}'.format(self.env_name, self.lr)
        self.model_name = os.path.join(self.Save_Path, self.path)

        # Create Actor-Critic network model
        self.model = OurModel(input_shape=self.state_size, action_space=self.action_size)

        self.optimizer = tf.keras.optimizers.Adam(self.lr)

    def remember(self, state, action, policy, reward, done, thread):
        if thread == 0:
            self.states_1[self.memory_index_1] = state
            self.actions_1[self.memory_index_1] = action
            self.policies_1[self.memory_index_1] = policy
            self.rewards_1[self.memory_index_1] = reward
            self.dones_1[self.memory_index_1] = done

            if self.memory_index_1 == 100:
                self.deq.append([self.states_1, self.actions_1, self.policies_1, 
                                 self.rewards_1, self.dones_1])
                
                self.states_1[0] = self.states_1[self.memory_index_1]
                self.actions_1[0] = self.actions_1[self.memory_index_1]
                self.policies_1[0] = self.policies_1[self.memory_index_1]
                self.rewards_1[0] = self.rewards_1[self.memory_index_1]
                self.dones_1[0] = self.dones_1[self.memory_index_1]
 
                self.memory_index_1 = 1
                
            self.memory_index_1 += 1
        elif thread == 1:
            self.states_2[self.memory_index_2] = state
            self.actions_2[self.memory_index_2] = action
            self.policies_2[self.memory_index_2] = policy
            self.rewards_2[self.memory_index_2] = reward
            self.dones_2[self.memory_index_2] = done

            if self.memory_index_2 == 100:
                self.deq.append([self.states_2, self.actions_2, self.policies_2, 
                                 self.rewards_2, self.dones_2])
                    
                self.states_2[0] = self.states_2[self.memory_index_2]
                self.actions_2[0] = self.actions_2[self.memory_index_2]
                self.policies_2[0] = self.policies_2[self.memory_index_2]
                self.rewards_2[0] = self.rewards_2[self.memory_index_2]
                self.dones_2[0] = self.dones_2[self.memory_index_2]
                    
                self.memory_index_2 = 1

            self.memory_index_2 += 1
        elif thread == 2:
            self.states_3[self.memory_index_3] = state
            self.actions_3[self.memory_index_3] = action
            self.policies_3[self.memory_index_3] = policy
            self.rewards_3[self.memory_index_3] = reward
            self.dones_3[self.memory_index_3] = done

            if self.memory_index_3 == 100:
                self.deq.append([self.states_3, self.actions_3, self.policies_3, 
                                 self.rewards_3, self.dones_3])
                    
                self.states_3[0] = self.states_3[self.memory_index_3]
                self.actions_3[0] = self.actions_3[self.memory_index_3]
                self.policies_3[0] = self.policies_3[self.memory_index_3]
                self.rewards_3[0] = self.rewards_3[self.memory_index_3]
                self.dones_3[0] = self.dones_3[self.memory_index_3]
                    
                self.memory_index_3 = 1

            self.memory_index_3 += 1
        elif thread == 3:
            self.states_4[self.memory_index_4] = state
            self.actions_4[self.memory_index_4] = action
            self.policies_4[self.memory_index_4] = policy
            self.rewards_4[self.memory_index_4] = reward
            self.dones_4[self.memory_index_4] = done

            if self.memory_index_4 == 100:
                self.deq.append([self.states_4, self.actions_4, self.policies_4, 
                                 self.rewards_4, self.dones_4])
                    
                self.states_4[0] = self.states_4[self.memory_index_4]
                self.actions_4[0] = self.actions_4[self.memory_index_4]
                self.policies_4[0] = self.policies_4[self.memory_index_4]
                self.rewards_4[0] = self.rewards_4[self.memory_index_4]
                self.dones_4[0] = self.dones_4[self.memory_index_4]
                    
                self.memory_index_4 = 1

            self.memory_index_4 += 1
    
    def act(self, state):
        """Sample an action for ``state`` from the current policy.

        Returns:
            ``(action, policy)``: the sampled action of the first batch entry
            as a Python int, and the first output of the model (the logits
            the categorical distribution is built from).
        """
        model_output = self.model(state, training=False)
        policy = model_output[0]
        sampled = tfd.Categorical(logits=policy).sample()

        return int(sampled[0]), policy

    def update(self, states, actions, agent_policies, rewards, dones):
        """Run one V-trace actor-critic gradient step on a batch of unrolls.

        All inputs are batch-major (``[batch, time, ...]``) and are transposed
        to time-major here. Row ``t`` of every input must describe the same
        environment step: the state the actor saw, the action it took, the
        behaviour logits, and the reward / done flag the environment returned
        for that action. This is the layout written by ``remember``. The last
        time step contributes only its state, which bootstraps the value
        target.

        Args:
            states: ``[batch, time, rows, cols, frames]`` observations.
            actions: ``[batch, time]`` integer actions.
            agent_policies: ``[batch, time, n_actions]`` behaviour logits.
            rewards: ``[batch, time]`` rewards.
            dones: ``[batch, time]`` boolean episode-end flags.
        """
        discounting = 0.99
        lambda_ = 1.0
        clip_rho_threshold = 1.0
        clip_pg_rho_threshold = 1.0
        baseline_cost = 0.5

        # Time-major layout: slicing the first axis now walks along the unroll.
        states = tf.transpose(states, perm=[1, 0, 2, 3, 4])
        actions = tf.transpose(actions, perm=[1, 0])
        agent_policies = tf.transpose(agent_policies, perm=[1, 0, 2])
        rewards = tf.transpose(rewards, perm=[1, 0])
        dones = tf.transpose(dones, perm=[1, 0])

        unroll_steps, batch = states.shape[0], states.shape[1]

        # Transition t is (s_t, a_t, r_t, done_t) -> s_{t+1}. Reward and done
        # are stored at the index of the action that produced them, so they
        # are sliced exactly like the actions.
        behaviour_logits = agent_policies[:-1]
        actions = tf.cast(actions[:-1], tf.int32)
        rewards = tf.cast(rewards[:-1], tf.float32)
        # A finished episode must not bootstrap from the next episode's state.
        discounts = tf.cast(tf.logical_not(dones[:-1]), tf.float32) * discounting

        with tf.GradientTape() as tape:
            # Fold time into the batch axis for a single forward pass.
            states_folded = tf.reshape(
                states,
                [unroll_steps * batch, states.shape[2], states.shape[3], states.shape[4]])

            learner_output = self.model(states_folded, training=True)
            learner_logits = tf.reshape(learner_output[0], [unroll_steps, batch, -1])
            learner_values = tf.reshape(learner_output[1], [unroll_steps, batch])

            bootstrap_value = learner_values[-1]
            values = learner_values[:-1]

            target_action_log_probs = parametric_action_distribution.log_prob(
                learner_logits[:-1], actions)
            behaviour_action_log_probs = parametric_action_distribution.log_prob(
                behaviour_logits, actions)

            # Importance weights between the learner and the behaviour policy.
            log_rhos = tf.cast(target_action_log_probs - behaviour_action_log_probs, tf.float32)
            rhos = tf.math.exp(log_rhos)
            clipped_rhos = tf.minimum(clip_rho_threshold, rhos)
            cs = lambda_ * tf.minimum(1.0, rhos)

            values_t_plus_1 = tf.concat([values[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)
            deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

            # Backward recursion: acc_t = delta_t + discount_t * c_t * acc_{t+1}
            acc = tf.zeros_like(bootstrap_value)
            vs_minus_v_xs = []
            for t in reversed(range(unroll_steps - 1)):
                acc = deltas[t] + discounts[t] * cs[t] * acc
                vs_minus_v_xs.append(acc)
            vs_minus_v_xs = tf.stack(vs_minus_v_xs[::-1], axis=0)

            vs = vs_minus_v_xs + values
            vs_t_plus_1 = tf.concat([vs[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)
            clipped_pg_rhos = tf.minimum(clip_pg_rho_threshold, rhos)
            pg_advantages = clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values)

            # Targets are constants for the gradient: only the log-probs and
            # the value estimates are differentiated.
            vs = tf.stop_gradient(vs)
            pg_advantages = tf.stop_gradient(pg_advantages)

            actor_loss = -tf.reduce_mean(target_action_log_probs * pg_advantages)
            critic_loss = baseline_cost * 0.5 * tf.reduce_mean(tf.square(values - vs))

            total_loss = actor_loss + critic_loss

        grads = tape.gradient(total_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
    
    def replay(self):
        state_list, action_list, policy_list, reward_list, done_list = [], [], [], [], []
        for i in range(0, 4):
            state, action, policy, reward, done = self.deq.popleft()
            
            state_list.append(state)
            action_list.append(action)
            policy_list.append(policy)
            reward_list.append(reward)
            done_list.append(done)
            
        state_array = np.array(state_list)
        action_array = np.array(action_list)
        policy_array = np.array(policy_list)
        reward_array = np.array(reward_list)
        done_array = np.array(done_list)
        
        self.update(state_array, action_array, policy_array, reward_array, done_array)
           
    def load(self, model_name):
        """Replace ``self.model`` with the model stored at ``model_name``.

        The model is loaded with ``compile=False``.
        """
        restored = load_model(model_name, compile=False)
        self.model = restored

    def save(self):
        """Write ``self.model`` to the path held in ``self.model_name``."""
        destination = self.model_name
        self.model.save(destination)

    pylab.figure(figsize=(18, 9))
    def PlotModel(self, score, episode):
        self.scores.append(score)
        self.episodes.append(episode)
        self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
        if str(episode)[-2:] == "00":# much faster than episode % 100
            pylab.plot(self.episodes, self.scores, 'b')
            pylab.plot(self.episodes, self.average, 'r')
            pylab.ylabel('Score', fontsize=18)
            pylab.xlabel('Steps', fontsize=18)
            try:
                pylab.savefig(self.path + ".png")
            except OSError:
                pass

        return self.average[-1]
    
    def imshow(self, image, rem_step=0):
        """Show channel ``rem_step`` of ``image`` in an OpenCV window.

        The window is named ``"pong<rem_step>"``. All OpenCV windows are
        closed if "q" is pressed within the 25 ms wait.
        """
        window_title = "pong" + str(rem_step)
        cv2.imshow(window_title, image[:, :, rem_step])

        pressed_key = cv2.waitKey(25) & 0xFF
        if pressed_key == ord("q"):
            cv2.destroyAllWindows()

    def GetImage(self, frame):
        #print("frame.shape: ", frame.shape)
        
        # croping frame to 80x80 size
        frame_cropped = frame[35:195:2, ::2,:]
        if frame_cropped.shape[0] != self.COLS or frame_cropped.shape[1] != self.ROWS:
            # OpenCV resize function 
            frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC)
        
        # converting to RGB (numpy way)
        frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2]
        
        # converting to Gray (OpenCV way)
        #frame_gray = cv2.cvtColor(frame_cropped, cv2.COLOR_BGR2GRAY)     
        #print("frame_gray.shape: ", frame_gray.shape)
        
        frame_rgb[frame_rgb < 100] = 0
        frame_rgb[frame_rgb >= 100] = 255
        # dividing by 255 we expresses value to 0-1 representation
        new_frame = np.array(frame_rgb).astype(np.float32) / 255.0

        # push our data by 1 frame, similar as deq() function work
        self.image_memory = np.roll(self.image_memory, 1, axis=2)

        # inserting new frame to free space
        self.image_memory[:,:,0] = new_frame

        # show image frame   
        #self.imshow(self.image_memory, 0)
        #self.imshow(self.image_memory, 1)
        #self.imshow(self.image_memory, 2)
        #self.imshow(self.image_memory, 3)

        return np.expand_dims(self.image_memory, axis=0)
        
    def reset(self, env):
        frame = env.reset()
        for i in range(self.REM_STEP):
            state = self.GetImage(frame)

        return state

    def step(self, action, env):
        next_state, reward, done, info = env.step(action)
        next_state = self.GetImage(next_state)
        
        return next_state, reward, done, info
    
    def train(self, n_threads):
        """Start ``n_threads`` daemon worker threads and wait for them.

        The agent's own environment is closed and each worker gets a fresh
        one. Workers run ``train_threading`` with their position in the list
        as thread index. Starts are spaced 2 seconds apart, and each join is
        preceded by a 10 second sleep.
        """
        self.env.close()

        # Instantiate one environment per thread
        envs = []
        for _ in range(n_threads):
            envs.append(gym.make(self.env_name))

        # Create threads
        workers = []
        for index, worker_env in enumerate(envs):
            workers.append(threading.Thread(
                target=self.train_threading,
                daemon=True,
                args=(self, worker_env, index)))

        for worker in workers:
            time.sleep(2)
            worker.start()

        for worker in workers:
            time.sleep(10)
            worker.join()
    
    def render(self, obs):
        """Show ``obs`` in the OpenCV window named 'obs'."""
        window_title = 'obs'
        cv2.imshow(window_title, obs)
        # A 1 ms wait, as in the usual OpenCV display loop.
        cv2.waitKey(1)
    
    def train_threading(self, agent, env, thread):
        """Worker loop executed by every thread started in ``train``.

        Each worker plays up to ``self.EPISODES`` episodes in ``env`` with
        the shared model. A worker whose index is not 4 stores every
        transition through ``remember`` (reward divided by 20) and, after
        each episode, updates the shared score statistics under
        ``self.lock``. The worker with index 4 stores nothing; after each
        non-terminal step it calls ``replay`` whenever more than 4 unrolls
        are queued.

        Args:
            agent: not used by this method.
            env: the environment this worker steps.
            thread: index of the worker.
        """
        is_learner = thread == 4

        for _ in range(self.EPISODES):
            state = self.reset(env)
            score = 0

            while True:
                action, policy = self.act(state)
                next_state, reward, done, _ = self.step(action, env)

                if not is_learner:
                    self.remember(state, action, policy, reward / 20.0, done, thread)

                state = next_state
                score += reward

                if done:
                    break

                if is_learner and len(self.deq) > 4:
                    self.replay()

            if is_learner:
                continue

            # Update episode count
            with self.lock:
                average = self.PlotModel(score, self.episode)

                # Track the best running average (saving is disabled).
                SAVING = ""
                if average >= self.max_average:
                    self.max_average = average
                    #self.save()
                    SAVING = "SAVING"

                print("episode: {}/{}, thread: {}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, thread, score, average, SAVING))
                if self.episode < self.EPISODES:
                    self.episode += 1
                 
    def test(self, Actor_name, Critic_name=None):
        """Play 100 rendered evaluation episodes with a saved model.

        The model is loaded through ``load`` and, at every step, the action
        with the largest logit is taken. The score of each episode is
        printed, and ``self.env`` is closed at the end.

        Args:
            Actor_name: path of the saved model.
            Critic_name: not used, because this agent keeps policy and value
                in one model; accepted so existing two-argument calls still
                work.
        """
        self.load(Actor_name)

        n_episodes = 100
        for e in range(n_episodes):
            state = self.reset(self.env)
            done = False

            score = 0
            while not done:
                self.env.render()
                # The model returns (logits, value); act greedily on the logits.
                prediction = self.model(state, training=False)
                action = int(np.argmax(prediction[0]))
                state, reward, done, _ = self.step(action, self.env)
                score += reward

            print("episode: {}/{}, score: {}".format(e, n_episodes, score))

        self.env.close()


# Script entry point: build the agent for Pong and start multi-threaded training.
if __name__ == "__main__":
    env_name = 'Pong-v0'
    agent = IMPALA_Agent(env_name)
    
    # NOTE(review): no run() method is defined on IMPALA_Agent in this file.
    #agent.run() # use as IMPALA
    # Five workers: train_threading treats indices 0-3 as actors that fill the
    # rollout buffers and index 4 as the worker that calls replay().
    agent.train(n_threads=5) # use as IMPALA
    #agent.test('Models/Pong-v0_A3C_2.5e-05_Actor.h5', '')

2022-01-15 20:20:32.169436: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2022-01-15 20:20:33.494891: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-01-15 20:20:33.495531: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-01-15 20:20:33.521651: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-01-15 20:20:33.521675: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: kimbring2-ROG-Strix-GA35DX-G35DX
2022-01-15 20:20:33.521679: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: kimbring2-ROG-Strix-GA35DX-G35DX
2022-01-15 20:20:33.521741: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 

episode: 0/2000000, thread: 0, score: -20.0, average: -20.00 SAVING
episode: 1/2000000, thread: 1, score: -20.0, average: -20.00 SAVING
episode: 2/2000000, thread: 2, score: -20.0, average: -20.00 SAVING
episode: 3/2000000, thread: 3, score: -20.0, average: -20.00 SAVING
episode: 4/2000000, thread: 1, score: -20.0, average: -20.00 SAVING
episode: 5/2000000, thread: 0, score: -20.0, average: -20.00 SAVING
episode: 6/2000000, thread: 2, score: -21.0, average: -20.14 
episode: 7/2000000, thread: 3, score: -20.0, average: -20.12 
episode: 8/2000000, thread: 3, score: -21.0, average: -20.22 
episode: 9/2000000, thread: 0, score: -21.0, average: -20.30 
episode: 10/2000000, thread: 2, score: -21.0, average: -20.36 
episode: 11/2000000, thread: 1, score: -18.0, average: -20.17 
episode: 12/2000000, thread: 1, score: -21.0, average: -20.23 
episode: 13/2000000, thread: 3, score: -21.0, average: -20.29 
episode: 14/2000000, thread: 2, score: -21.0, average: -20.33 
episode: 15/2000000, thread: 

episode: 130/2000000, thread: 2, score: -21.0, average: -20.48 
episode: 131/2000000, thread: 0, score: -20.0, average: -20.48 
episode: 132/2000000, thread: 1, score: -21.0, average: -20.52 
episode: 133/2000000, thread: 3, score: -21.0, average: -20.54 
episode: 134/2000000, thread: 2, score: -21.0, average: -20.54 
episode: 135/2000000, thread: 0, score: -21.0, average: -20.58 
episode: 136/2000000, thread: 1, score: -21.0, average: -20.60 
episode: 137/2000000, thread: 3, score: -19.0, average: -20.64 
episode: 138/2000000, thread: 2, score: -21.0, average: -20.66 
episode: 139/2000000, thread: 0, score: -21.0, average: -20.68 
episode: 140/2000000, thread: 1, score: -21.0, average: -20.68 
episode: 141/2000000, thread: 3, score: -21.0, average: -20.68 
episode: 142/2000000, thread: 2, score: -20.0, average: -20.66 
episode: 143/2000000, thread: 0, score: -21.0, average: -20.66 
episode: 144/2000000, thread: 1, score: -21.0, average: -20.68 
episode: 145/2000000, thread: 3, score: 

episode: 259/2000000, thread: 2, score: -21.0, average: -20.74 
episode: 260/2000000, thread: 3, score: -21.0, average: -20.74 
episode: 261/2000000, thread: 0, score: -21.0, average: -20.74 
episode: 262/2000000, thread: 1, score: -19.0, average: -20.70 
episode: 263/2000000, thread: 2, score: -21.0, average: -20.70 
episode: 264/2000000, thread: 3, score: -21.0, average: -20.70 
episode: 265/2000000, thread: 0, score: -21.0, average: -20.70 
episode: 266/2000000, thread: 1, score: -21.0, average: -20.70 
episode: 267/2000000, thread: 2, score: -21.0, average: -20.70 
episode: 268/2000000, thread: 3, score: -21.0, average: -20.70 
episode: 269/2000000, thread: 1, score: -20.0, average: -20.68 
episode: 270/2000000, thread: 0, score: -20.0, average: -20.66 
episode: 271/2000000, thread: 2, score: -21.0, average: -20.66 
episode: 272/2000000, thread: 3, score: -20.0, average: -20.64 
episode: 273/2000000, thread: 0, score: -20.0, average: -20.62 
episode: 274/2000000, thread: 1, score: 

episode: 388/2000000, thread: 3, score: -21.0, average: -20.46 
episode: 389/2000000, thread: 2, score: -20.0, average: -20.46 
episode: 390/2000000, thread: 0, score: -21.0, average: -20.48 
episode: 391/2000000, thread: 1, score: -18.0, average: -20.44 
episode: 392/2000000, thread: 3, score: -21.0, average: -20.48 
episode: 393/2000000, thread: 2, score: -21.0, average: -20.50 
episode: 394/2000000, thread: 0, score: -20.0, average: -20.50 
episode: 395/2000000, thread: 1, score: -20.0, average: -20.48 
episode: 396/2000000, thread: 3, score: -18.0, average: -20.42 
episode: 397/2000000, thread: 2, score: -19.0, average: -20.38 
episode: 398/2000000, thread: 0, score: -21.0, average: -20.40 
episode: 399/2000000, thread: 1, score: -21.0, average: -20.42 
episode: 400/2000000, thread: 3, score: -20.0, average: -20.40 
episode: 401/2000000, thread: 0, score: -21.0, average: -20.40 
episode: 402/2000000, thread: 2, score: -20.0, average: -20.38 
episode: 403/2000000, thread: 1, score: 

episode: 517/2000000, thread: 2, score: -20.0, average: -20.26 
episode: 518/2000000, thread: 0, score: -20.0, average: -20.24 
episode: 519/2000000, thread: 1, score: -21.0, average: -20.24 
episode: 520/2000000, thread: 3, score: -20.0, average: -20.22 
episode: 521/2000000, thread: 2, score: -20.0, average: -20.22 
episode: 522/2000000, thread: 1, score: -21.0, average: -20.24 
episode: 523/2000000, thread: 0, score: -21.0, average: -20.26 
episode: 524/2000000, thread: 3, score: -19.0, average: -20.22 
episode: 525/2000000, thread: 1, score: -20.0, average: -20.20 
episode: 526/2000000, thread: 2, score: -19.0, average: -20.16 
episode: 527/2000000, thread: 0, score: -19.0, average: -20.14 
episode: 528/2000000, thread: 3, score: -20.0, average: -20.12 
episode: 529/2000000, thread: 1, score: -21.0, average: -20.14 
episode: 530/2000000, thread: 2, score: -21.0, average: -20.14 
episode: 531/2000000, thread: 0, score: -20.0, average: -20.16 
episode: 532/2000000, thread: 3, score: 

episode: 646/2000000, thread: 1, score: -19.0, average: -20.18 
episode: 647/2000000, thread: 0, score: -21.0, average: -20.20 
episode: 648/2000000, thread: 3, score: -21.0, average: -20.20 
episode: 649/2000000, thread: 1, score: -21.0, average: -20.22 
episode: 650/2000000, thread: 2, score: -19.0, average: -20.20 
episode: 651/2000000, thread: 0, score: -19.0, average: -20.20 
episode: 652/2000000, thread: 3, score: -21.0, average: -20.24 
episode: 653/2000000, thread: 2, score: -21.0, average: -20.26 
episode: 654/2000000, thread: 1, score: -21.0, average: -20.26 
episode: 655/2000000, thread: 0, score: -21.0, average: -20.28 
episode: 656/2000000, thread: 3, score: -21.0, average: -20.32 
episode: 657/2000000, thread: 2, score: -20.0, average: -20.32 
episode: 658/2000000, thread: 1, score: -21.0, average: -20.32 
episode: 659/2000000, thread: 0, score: -21.0, average: -20.32 
episode: 660/2000000, thread: 3, score: -21.0, average: -20.32 
episode: 661/2000000, thread: 2, score: 

episode: 774/2000000, thread: 2, score: -21.0, average: -20.04 
episode: 775/2000000, thread: 1, score: -20.0, average: -20.04 
episode: 776/2000000, thread: 0, score: -20.0, average: -20.06 
episode: 777/2000000, thread: 3, score: -19.0, average: -20.06 
episode: 778/2000000, thread: 2, score: -19.0, average: -20.04 
episode: 779/2000000, thread: 1, score: -20.0, average: -20.08 
episode: 780/2000000, thread: 0, score: -21.0, average: -20.08 
episode: 781/2000000, thread: 3, score: -21.0, average: -20.12 
episode: 782/2000000, thread: 2, score: -19.0, average: -20.10 
episode: 783/2000000, thread: 1, score: -18.0, average: -20.06 
episode: 784/2000000, thread: 0, score: -20.0, average: -20.04 
episode: 785/2000000, thread: 3, score: -19.0, average: -20.04 
episode: 786/2000000, thread: 2, score: -20.0, average: -20.04 
episode: 787/2000000, thread: 0, score: -21.0, average: -20.04 
episode: 788/2000000, thread: 1, score: -20.0, average: -20.06 
episode: 789/2000000, thread: 3, score: 

episode: 902/2000000, thread: 1, score: -21.0, average: -20.24 
episode: 903/2000000, thread: 2, score: -21.0, average: -20.24 
episode: 904/2000000, thread: 0, score: -20.0, average: -20.24 
episode: 905/2000000, thread: 3, score: -21.0, average: -20.26 
episode: 906/2000000, thread: 2, score: -20.0, average: -20.24 
episode: 907/2000000, thread: 0, score: -21.0, average: -20.24 
episode: 908/2000000, thread: 1, score: -19.0, average: -20.20 
episode: 909/2000000, thread: 3, score: -21.0, average: -20.20 
episode: 910/2000000, thread: 2, score: -19.0, average: -20.20 
episode: 911/2000000, thread: 0, score: -21.0, average: -20.24 
episode: 912/2000000, thread: 1, score: -20.0, average: -20.22 
episode: 913/2000000, thread: 3, score: -21.0, average: -20.22 
episode: 914/2000000, thread: 0, score: -21.0, average: -20.26 
episode: 915/2000000, thread: 1, score: -21.0, average: -20.28 
episode: 916/2000000, thread: 2, score: -19.0, average: -20.26 
episode: 917/2000000, thread: 3, score: 

episode: 1029/2000000, thread: 1, score: -20.0, average: -19.76 
episode: 1030/2000000, thread: 2, score: -20.0, average: -19.74 
episode: 1031/2000000, thread: 0, score: -20.0, average: -19.72 SAVING
episode: 1032/2000000, thread: 3, score: -19.0, average: -19.72 SAVING
episode: 1033/2000000, thread: 1, score: -21.0, average: -19.76 
episode: 1034/2000000, thread: 2, score: -20.0, average: -19.76 
episode: 1035/2000000, thread: 0, score: -21.0, average: -19.78 
episode: 1036/2000000, thread: 3, score: -19.0, average: -19.76 
episode: 1037/2000000, thread: 2, score: -20.0, average: -19.78 
episode: 1038/2000000, thread: 0, score: -20.0, average: -19.76 
episode: 1039/2000000, thread: 1, score: -17.0, average: -19.74 
episode: 1040/2000000, thread: 3, score: -19.0, average: -19.72 SAVING
episode: 1041/2000000, thread: 2, score: -21.0, average: -19.72 SAVING
episode: 1042/2000000, thread: 0, score: -20.0, average: -19.72 SAVING
episode: 1043/2000000, thread: 1, score: -20.0, average: -19

episode: 1155/2000000, thread: 2, score: -20.0, average: -20.00 
episode: 1156/2000000, thread: 3, score: -21.0, average: -20.00 
episode: 1157/2000000, thread: 0, score: -20.0, average: -20.02 
episode: 1158/2000000, thread: 1, score: -19.0, average: -20.02 
episode: 1159/2000000, thread: 2, score: -20.0, average: -20.02 
episode: 1160/2000000, thread: 3, score: -21.0, average: -20.06 
episode: 1161/2000000, thread: 0, score: -20.0, average: -20.08 
episode: 1162/2000000, thread: 1, score: -20.0, average: -20.08 
episode: 1163/2000000, thread: 3, score: -21.0, average: -20.08 
episode: 1164/2000000, thread: 2, score: -20.0, average: -20.06 
episode: 1165/2000000, thread: 0, score: -21.0, average: -20.10 
episode: 1166/2000000, thread: 1, score: -21.0, average: -20.10 
episode: 1167/2000000, thread: 3, score: -19.0, average: -20.06 
episode: 1168/2000000, thread: 2, score: -19.0, average: -20.06 
episode: 1169/2000000, thread: 0, score: -20.0, average: -20.08 
episode: 1170/2000000, th

episode: 1282/2000000, thread: 3, score: -20.0, average: -20.22 
episode: 1283/2000000, thread: 2, score: -21.0, average: -20.22 
episode: 1284/2000000, thread: 0, score: -19.0, average: -20.18 
episode: 1285/2000000, thread: 1, score: -19.0, average: -20.16 
episode: 1286/2000000, thread: 3, score: -20.0, average: -20.18 
episode: 1287/2000000, thread: 2, score: -20.0, average: -20.16 
episode: 1288/2000000, thread: 0, score: -21.0, average: -20.18 
episode: 1289/2000000, thread: 3, score: -19.0, average: -20.14 
episode: 1290/2000000, thread: 1, score: -19.0, average: -20.12 
episode: 1291/2000000, thread: 2, score: -20.0, average: -20.12 
episode: 1292/2000000, thread: 0, score: -20.0, average: -20.10 
episode: 1293/2000000, thread: 3, score: -20.0, average: -20.12 
episode: 1294/2000000, thread: 1, score: -21.0, average: -20.14 
episode: 1295/2000000, thread: 2, score: -21.0, average: -20.14 
episode: 1296/2000000, thread: 0, score: -20.0, average: -20.12 
episode: 1297/2000000, th

episode: 1409/2000000, thread: 0, score: -21.0, average: -19.88 
episode: 1410/2000000, thread: 3, score: -20.0, average: -19.88 
episode: 1411/2000000, thread: 1, score: -19.0, average: -19.84 
episode: 1412/2000000, thread: 0, score: -21.0, average: -19.86 
episode: 1413/2000000, thread: 2, score: -19.0, average: -19.86 
episode: 1414/2000000, thread: 3, score: -21.0, average: -19.96 
episode: 1415/2000000, thread: 0, score: -19.0, average: -19.98 
episode: 1416/2000000, thread: 1, score: -19.0, average: -19.94 
episode: 1417/2000000, thread: 2, score: -19.0, average: -19.94 
episode: 1418/2000000, thread: 3, score: -20.0, average: -19.92 
episode: 1419/2000000, thread: 0, score: -20.0, average: -19.98 
episode: 1420/2000000, thread: 1, score: -19.0, average: -19.98 
episode: 1421/2000000, thread: 2, score: -20.0, average: -20.02 
episode: 1422/2000000, thread: 0, score: -20.0, average: -20.04 
episode: 1423/2000000, thread: 3, score: -20.0, average: -20.04 
episode: 1424/2000000, th

episode: 1535/2000000, thread: 2, score: -20.0, average: -19.62 SAVING
episode: 1536/2000000, thread: 1, score: -21.0, average: -19.62 SAVING
episode: 1537/2000000, thread: 3, score: -21.0, average: -19.66 
episode: 1538/2000000, thread: 0, score: -21.0, average: -19.70 
episode: 1539/2000000, thread: 2, score: -21.0, average: -19.70 
episode: 1540/2000000, thread: 3, score: -21.0, average: -19.72 
episode: 1541/2000000, thread: 1, score: -17.0, average: -19.66 
episode: 1542/2000000, thread: 0, score: -20.0, average: -19.70 
episode: 1543/2000000, thread: 2, score: -20.0, average: -19.70 
episode: 1544/2000000, thread: 3, score: -20.0, average: -19.68 
episode: 1545/2000000, thread: 1, score: -20.0, average: -19.72 
episode: 1546/2000000, thread: 0, score: -19.0, average: -19.68 
episode: 1547/2000000, thread: 2, score: -20.0, average: -19.68 
episode: 1548/2000000, thread: 3, score: -20.0, average: -19.72 
episode: 1549/2000000, thread: 1, score: -20.0, average: -19.72 
episode: 1550

episode: 1661/2000000, thread: 1, score: -19.0, average: -19.74 
episode: 1662/2000000, thread: 0, score: -19.0, average: -19.70 
episode: 1663/2000000, thread: 3, score: -19.0, average: -19.66 
episode: 1664/2000000, thread: 2, score: -21.0, average: -19.68 
episode: 1665/2000000, thread: 1, score: -18.0, average: -19.64 
episode: 1666/2000000, thread: 0, score: -21.0, average: -19.70 
episode: 1667/2000000, thread: 2, score: -21.0, average: -19.74 
episode: 1668/2000000, thread: 3, score: -20.0, average: -19.76 
episode: 1669/2000000, thread: 0, score: -21.0, average: -19.78 
episode: 1670/2000000, thread: 1, score: -20.0, average: -19.78 
episode: 1671/2000000, thread: 2, score: -19.0, average: -19.74 
episode: 1672/2000000, thread: 3, score: -20.0, average: -19.78 
episode: 1673/2000000, thread: 0, score: -21.0, average: -19.84 
episode: 1674/2000000, thread: 2, score: -19.0, average: -19.82 
episode: 1675/2000000, thread: 1, score: -19.0, average: -19.80 
episode: 1676/2000000, th

episode: 1787/2000000, thread: 3, score: -19.0, average: -19.86 
episode: 1788/2000000, thread: 2, score: -21.0, average: -19.90 
episode: 1789/2000000, thread: 1, score: -21.0, average: -19.94 
episode: 1790/2000000, thread: 0, score: -21.0, average: -19.98 
episode: 1791/2000000, thread: 2, score: -21.0, average: -20.00 
episode: 1792/2000000, thread: 3, score: -19.0, average: -20.00 
episode: 1793/2000000, thread: 0, score: -21.0, average: -20.02 
episode: 1794/2000000, thread: 1, score: -21.0, average: -20.06 
episode: 1795/2000000, thread: 2, score: -20.0, average: -20.06 
episode: 1796/2000000, thread: 3, score: -20.0, average: -20.04 
episode: 1797/2000000, thread: 0, score: -19.0, average: -20.02 
episode: 1798/2000000, thread: 1, score: -20.0, average: -20.00 
episode: 1799/2000000, thread: 3, score: -21.0, average: -20.04 
episode: 1800/2000000, thread: 2, score: -18.0, average: -20.00 
episode: 1801/2000000, thread: 1, score: -21.0, average: -20.00 
episode: 1802/2000000, th

episode: 1914/2000000, thread: 0, score: -19.0, average: -19.58 
episode: 1915/2000000, thread: 1, score: -20.0, average: -19.60 
episode: 1916/2000000, thread: 3, score: -20.0, average: -19.58 
episode: 1917/2000000, thread: 2, score: -19.0, average: -19.54 SAVING
episode: 1918/2000000, thread: 0, score: -21.0, average: -19.56 
episode: 1919/2000000, thread: 1, score: -21.0, average: -19.56 
episode: 1920/2000000, thread: 3, score: -19.0, average: -19.54 SAVING
episode: 1921/2000000, thread: 2, score: -20.0, average: -19.52 SAVING
episode: 1922/2000000, thread: 0, score: -19.0, average: -19.48 SAVING
episode: 1923/2000000, thread: 1, score: -19.0, average: -19.46 SAVING
episode: 1924/2000000, thread: 3, score: -20.0, average: -19.44 SAVING
episode: 1925/2000000, thread: 2, score: -19.0, average: -19.44 SAVING
episode: 1926/2000000, thread: 0, score: -20.0, average: -19.46 
episode: 1927/2000000, thread: 1, score: -18.0, average: -19.46 
episode: 1928/2000000, thread: 3, score: -19.0, 

episode: 2039/2000000, thread: 1, score: -19.0, average: -19.42 
episode: 2040/2000000, thread: 3, score: -20.0, average: -19.46 
episode: 2041/2000000, thread: 2, score: -21.0, average: -19.46 
episode: 2042/2000000, thread: 1, score: -20.0, average: -19.46 
episode: 2043/2000000, thread: 0, score: -19.0, average: -19.44 
episode: 2044/2000000, thread: 3, score: -19.0, average: -19.42 
episode: 2045/2000000, thread: 2, score: -19.0, average: -19.38 
episode: 2046/2000000, thread: 1, score: -21.0, average: -19.40 
episode: 2047/2000000, thread: 0, score: -20.0, average: -19.40 
episode: 2048/2000000, thread: 3, score: -20.0, average: -19.40 
episode: 2049/2000000, thread: 1, score: -21.0, average: -19.44 
episode: 2050/2000000, thread: 2, score: -17.0, average: -19.38 
episode: 2051/2000000, thread: 0, score: -20.0, average: -19.38 
episode: 2052/2000000, thread: 3, score: -19.0, average: -19.36 
episode: 2053/2000000, thread: 1, score: -21.0, average: -19.36 
episode: 2054/2000000, th