In [None]:
import os
from google.colab import drive
# Mount Google Drive under /content/gdrive (uses the google.colab `drive` helper imported above).
drive.mount('/content/gdrive')

In [None]:
# Shell-escaped form of the project path: the backslash protects the space in
# `My Drive` when the value is interpolated into a shell command. A raw string
# keeps the backslash without relying on the invalid `\ ` escape sequence.
DRIVE_PATH = r'/content/gdrive/My\ Drive/cs285_project'
# Same path without shell escaping, for use with Python filesystem APIs.
DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\', '')
# Create the project directory with os.mkdir so no shell quoting is involved.
if not os.path.exists(DRIVE_PYTHON_PATH):
  os.mkdir(DRIVE_PYTHON_PATH)

## the space in `My Drive` causes some issues,
## make a symlink to avoid this
SYM_PATH = '/content/cs285_project'
# lexists() is also true for a dangling link, which os.symlink would refuse
# to overwrite.
if not os.path.lexists(SYM_PATH):
  os.symlink(DRIVE_PYTHON_PATH, SYM_PATH)

In [None]:
import time
# Anchor the run directory on the absolute symlink path so this cell resolves
# to the same place whatever the current working directory is. With a relative
# path, re-running the cell after the chdir below would nest a new run
# directory inside the previous one.
logdir = os.path.join(SYM_PATH, 'gomoku_model' + '_' + time.strftime("%d-%m-%Y_%H-%M-%S"))
os.makedirs(logdir, exist_ok=True)

print("LOGGING TO: ", logdir)
# The agents write their CSV logs and weight files with relative names, so
# make the run directory the working directory.
os.chdir(logdir)

In [None]:
# Models and Helper Functions
import random
import numpy as np
import copy
import keras
import sys
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.optimizers import Adam
from keras import optimizers, metrics
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import CSVLogger
import tensorflow as tf
import pandas as pd

# Episode budget; presumably consumed by the training loop, which is not
# defined in this part of the notebook -- TODO confirm.
EPISODES = 4000
# EPISODES = 2000

# Values stored in a board cell.
EMPTY_SPACE = 0
WHITE_PLAYER = -1
BLACK_PLAYER = 1

# Board height (Y) and width (X). The intended indexing is
# MYMAP[Y_OF_MYMAP][X_OF_MYMAP].
# NOTE(review): MYMAP below is allocated as (X_OF_MYMAP, Y_OF_MYMAP), which only
# matches that convention while the board is square -- confirm before changing
# either dimension.
Y_OF_MYMAP = 7
X_OF_MYMAP = 7
# Number of consecutive stones of one player needed to win.
WIN_CONDITION = 4

# Empty board template; the agents copy and reshape it to (1, gSTATE_SIZE).
MYMAP = np.zeros((X_OF_MYMAP,Y_OF_MYMAP), dtype=int)

Running_MAP = 0 # runtime map, to be initialized from MYMAP.

# Number of actions (one per board cell) and size of the flattened state.
gTOTAL_ACTIONs = Y_OF_MYMAP * X_OF_MYMAP
gSTATE_SIZE = X_OF_MYMAP * Y_OF_MYMAP

# NN parameters: discount rate and optimizer learning rate used by the agents.
gGAMMA = 0.75
gLEARNNING_RATE = 0.00001

# Reward constants. Only LOSER_REWARD is referenced by the code visible here
# (the agents' replay uses LOSER_REWARD * 2 for occupied cells); env_step
# returns a hard-coded 100 for a win rather than WINNER_REWARD.
WINNER_REWARD = 1
LOSER_REWARD = -1
DRAW_REWARD = 0

#############################

def env_render(pMYMAP):
    """Print the board as text, one board row per output line.

    White stones are drawn as ``O``, black stones as ``X`` and empty cells
    as ``_``.

    Args:
        pMYMAP: flattened board; only ``pMYMAP[0]`` is read, as a row-major
            sequence of Y_OF_MYMAP rows with X_OF_MYMAP cells each.

    Raises:
        ValueError: if a cell holds anything other than EMPTY_SPACE,
            WHITE_PLAYER or BLACK_PLAYER. Raising is used instead of
            ``exit()``, which in a notebook targets the kernel rather than
            reporting the error.
    """
    glyphs = {WHITE_PLAYER: "O", BLACK_PLAYER: "X", EMPTY_SPACE: "_"}
    for y in range(Y_OF_MYMAP):
        row_text = []
        for x in range(X_OF_MYMAP):
            # A row is X_OF_MYMAP cells wide, so that is the row stride (the
            # same stride check_state_has_winner uses for its column scan).
            cell = pMYMAP[0][x + y * X_OF_MYMAP]
            if cell not in glyphs:
                raise ValueError("PANIC, unknow element of MYMAP")
            row_text.append(glyphs[cell])
        print("".join(row_text))

def check_state_has_winner(nn_state, who_is_playing):
    """Tell whether `who_is_playing` has WIN_CONDITION stones in a line.

    The board is read from ``nn_state[0]`` as a row-major sequence: cell
    (row, col) lives at index ``row * X_OF_MYMAP + col``. Lines are checked
    horizontally, vertically and along both diagonals.

    Args:
        nn_state: flattened board, indexed as ``nn_state[0][cell]``.
        who_is_playing: player marker to look for (WHITE_PLAYER or
            BLACK_PLAYER).

    Returns:
        True if a complete line of that player's stones exists, else False.
    """
    board = nn_state[0]
    reach = WIN_CONDITION - 1
    # (row step, col step): right, down, down-right ("\"), down-left ("/").
    directions = ((0, 1), (1, 0), (1, 1), (1, -1))

    for row in range(Y_OF_MYMAP):
        for col in range(X_OF_MYMAP):
            for row_step, col_step in directions:
                # Skip start cells from which the line would leave the board;
                # this also prevents a line from wrapping into the next row.
                last_row = row + row_step * reach
                last_col = col + col_step * reach
                if not (0 <= last_row < Y_OF_MYMAP and 0 <= last_col < X_OF_MYMAP):
                    continue
                if all(
                    board[(row + row_step * k) * X_OF_MYMAP + (col + col_step * k)] == who_is_playing
                    for k in range(WIN_CONDITION)
                ):
                    return True

    # No winner
    return False

def env_step(nn_state, internal_action,who_is_playing):
    """Apply one move to a copy of the board and report the outcome.

    Args:
        nn_state: flattened board, indexed as ``nn_state[0][cell]``; it is
            not modified.
        internal_action: flat index of the cell to play.
        who_is_playing: marker of the player making the move.

    Returns:
        Tuple ``(next_state, reward, done, 0)``. ``reward`` is 100 when the
        move wins and 0 otherwise; ``done`` is True on a win or when no empty
        cell is left. Playing an occupied cell only prints a PANIC message and
        leaves the copied board unchanged.
    """
    board_after = copy.deepcopy(nn_state)

    if board_after[0][internal_action] == EMPTY_SPACE:
        board_after[0][internal_action] = who_is_playing
    else:
        print ("PANIC, env_step(),  private_next_state[internal_action] != EMPTY_SPACE")

    # A winning move ends the game immediately.
    if check_state_has_winner(board_after, who_is_playing) == True:
        return board_after, 100, True, 0

    # Otherwise the game is over only when every cell is taken (a draw).
    board_is_full = all(
        board_after[0][cell] != EMPTY_SPACE
        for cell in range(Y_OF_MYMAP * X_OF_MYMAP)
    )
    return board_after, 0, board_is_full, 0

####################################################
# 2019/09/27
# In order to use CNN, convert 1D state to 2D array.
####################################################
def convert_1D_state_to_2D_array(_state_1D):
    """Reshape a flattened board into the 4-D array the CNN takes as input.

    Args:
        _state_1D: flattened board; only ``_state_1D[0]`` is read, as a
            row-major sequence of Y_OF_MYMAP rows with X_OF_MYMAP cells each.

    Returns:
        A new integer array of shape ``(1, Y_OF_MYMAP, X_OF_MYMAP, 1)``
        (batch, rows, columns, channel), matching the ``input_shape`` the
        agents' Conv2D models are built with.
    """
    cell_count = Y_OF_MYMAP * X_OF_MYMAP
    flat_board = np.asarray(_state_1D)[0][:cell_count]
    # astype() always copies, so the result never aliases the caller's state.
    return flat_board.astype(int).reshape(1, Y_OF_MYMAP, X_OF_MYMAP, 1)

def play_game(agent_one, agent_two):
    """Play one evaluation game between two agents and return the result.

    `agent_one` moves first as player 1 and `agent_two` answers as player -1.
    Both are queried through ``act(..., test=True)``. When the game ends the
    final board is printed with env_render, followed by the result.

    Args:
        agent_one: agent exposing ``act(state, player, time, test)``.
        agent_two: agent with the same interface.

    Returns:
        0 for a draw (env_step reported done with zero reward), otherwise the
        marker of the winning player (1 or -1).
    """
    agents = {1: agent_one, -1: agent_two}
    player = 1
    move_time = 0  # forwarded as act()'s `_time` argument; never advanced
    # Start from an empty board sized from the board constants.
    NN_state = np.zeros((1, Y_OF_MYMAP * X_OF_MYMAP), dtype=int)

    while True:
        action, _ = agents[player].act(NN_state, player, move_time, test=True)
        NN_state, reward, done, _ = env_step(NN_state, action, player)

        if done:
            # env_step only gives a non-zero reward for a winning move.
            result = 0 if reward == 0 else player
            env_render(NN_state)
            print("done for the testing game", result)
            return result
        player = -player

class DQNAgent:
    """Deep Q-learning agent that uses one network for acting and for targets.

    States are flattened boards indexed as ``state[0][cell]``; actions are
    flat cell indices. Experience is collected with `remember` and consumed
    (emptied) by `replay`.
    """

    def __init__(self, state_size, action_size):
        """Store the hyper-parameters and build the Q-network.

        Args:
            state_size: number of cells in the flattened state.
            action_size: number of network outputs (one Q-value per action).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=6000)
        self.gamma = gGAMMA    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.3
        self.epsilon_decay = 0.99
        self.learning_rate = gLEARNNING_RATE
        self.model = self._build_model()
        self.name = 1

    def _build_model(self):
        """Build and compile the convolutional Q-network.

        The network takes a ``(Y_OF_MYMAP, X_OF_MYMAP, 1)`` board and outputs
        ``action_size`` Q-values; it is trained with MSE loss and RMSprop.
        """
        model = Sequential()
        model.add(Conv2D(1024, kernel_size=(3,3), activation='linear', input_shape=(Y_OF_MYMAP, X_OF_MYMAP, 1)))
        model.add(Conv2D(1024, (2, 2), activation='linear'))
        model.add(Conv2D(1024, (2, 2), activation='linear'))
        model.add(Flatten())
        model.add(Dense(512, activation='linear'))
        model.add(Dense(512, activation='relu'))
        model.add(Dense(512, activation='linear'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',optimizer=keras.optimizers.RMSprop(lr=gLEARNNING_RATE,rho=0.9, epsilon=None, decay=0.0), metrics=[metrics.MeanSquaredError()])
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def predict(self, nn_state):
        """Return the network output for an already CNN-shaped input."""
        return self.model.predict(nn_state, verbose=0)

    def _winning_move(self, nn_state, legal_moves, player):
        """Return the first legal move that wins at once for `player`, or None."""
        for move in legal_moves:
            _, reward, _, _ = env_step(nn_state, move, player)
            if reward > 0:
                return move
        return None

    def act(self, nn_state, Who_is_playing, _time, test=False):
        """Choose a move for `Who_is_playing` on the given board.

        With probability ``epsilon`` a random empty cell is chosen; otherwise
        the empty cell with the highest Q-value. Unless `test` is true, the
        choice is overridden by a move that wins immediately and, failing
        that, by the move that would let the opponent win immediately.

        Args:
            nn_state: flattened board, indexed as ``nn_state[0][cell]``.
            Who_is_playing: marker of the player to move.
            _time: unused; kept for interface compatibility.
            test: when true, skip the one-step win/lose override.

        Returns:
            Tuple ``(action, q_values)`` with the chosen flat cell index and
            the network output for this board.

        Raises:
            ValueError: if the board has no empty cell.
        """
        state_2d = convert_1D_state_to_2D_array(nn_state)
        legal_moves = [cell for cell in range(gSTATE_SIZE) if nn_state[0][cell] == EMPTY_SPACE]
        if not legal_moves:
            raise ValueError("act() called on a full board: no legal move left")

        # One forward pass serves both the greedy choice and the return value.
        q_values = self.model.predict(state_2d, verbose=0)

        if np.random.rand() <= self.epsilon:
            internal_action = legal_moves[random.randrange(len(legal_moves))]
        else:
            # max() keeps the first of equal candidates, like np.argmax.
            internal_action = max(legal_moves, key=lambda move: q_values[0][move])

        if not test:
            ####################################################################################
            # One-step win check first, then one-step lose check; the printed
            # messages show whether the network agreed with the forced move.
            ####################################################################################
            forced_checks = (
                (Who_is_playing,
                 "Who_is_playing = {}, AI action hit OneStepWinCheck action, action={}",
                 "Who_is_playing = {}, OneStepWinCheck overrides AI predicted data"),
                (-1 * Who_is_playing,
                 "Who_is_playing = {}, AI action hit OneStepLoseCheck action, action = {}",
                 "Who_is_playing = {}, OneStepLoseCheck overrides AI predicted data"),
            )
            for player, hit_message, override_message in forced_checks:
                forced_move = self._winning_move(nn_state, legal_moves, player)
                if forced_move is not None:
                    if internal_action == forced_move:
                        print(hit_message.format(Who_is_playing, forced_move))
                    else:
                        print(override_message.format(Who_is_playing))
                    return forced_move, q_values

        return internal_action, q_values

    def _bootstrap_values(self, transitions):
        """Return ``max_a Q(next_state, a)`` per transition, 0.0 for terminal ones."""
        values = np.zeros(len(transitions), dtype=float)
        live_rows = [row for row, step in enumerate(transitions) if not step[4]]
        if live_rows:
            next_states = np.vstack(
                [convert_1D_state_to_2D_array(transitions[row][3]) for row in live_rows])
            values[live_rows] = np.amax(self.model.predict(next_states, verbose=0), axis=1)
        return values

    def replay(self, batch_size):
        """Train on every stored transition, then empty the memory.

        The regression target for the taken action is
        ``reward + gamma * max_a Q(next_state, a)`` (just ``reward`` for
        terminal transitions); cells occupied in ``state`` are pushed to
        ``LOSER_REWARD * 2`` so occupied cells are never preferred. Epsilon is
        decayed once per training call. Nothing happens when the memory is
        empty.

        Args:
            batch_size: unused; the whole memory is fitted with a Keras batch
                size of 128. Kept for interface compatibility.
        """
        if not self.memory:
            return

        # Newest transition first, the order repeated pop() calls would give.
        transitions = list(reversed(self.memory))
        self.memory.clear()

        states = np.vstack([convert_1D_state_to_2D_array(step[0]) for step in transitions])
        targets = self.model.predict(states, verbose=0)
        # Bootstrap from the successor state, not from the state acted in.
        bootstrap = self._bootstrap_values(transitions)

        for row, (state, action, replay_reward, _next_state, _done) in enumerate(transitions):
            targets[row][action] = replay_reward + self.gamma * bootstrap[row]
            occupied = np.asarray(state)[0] != EMPTY_SPACE
            targets[row][occupied] = LOSER_REWARD * 2

        csv_logger = CSVLogger('log_dqn.csv', append=True, separator=';')
        self.model.fit(states, targets, batch_size=128, shuffle=False, epochs=1, verbose=2, callbacks=[csv_logger])
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load network weights from the file `name`."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to ``"DQN_ONE_NET" + name``."""
        self.model.save_weights("DQN_ONE_NET" + name)

    def update_target_network(self):
        """No-op: this agent has no target network (kept for a uniform agent interface)."""
        return

class Two_Net_DQN_Agent:
    """Deep Q-learning agent with a separate target network.

    The online network (``self.model``) chooses moves and is trained; the
    target network (``self.target_model``) supplies the bootstrap values and
    is refreshed by `update_target_network`. States are flattened boards
    indexed as ``state[0][cell]``; actions are flat cell indices.
    """

    def __init__(self, state_size, action_size):
        """Store the hyper-parameters and build the online and target networks.

        Args:
            state_size: number of cells in the flattened state.
            action_size: number of network outputs (one Q-value per action).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=6000)
        self.gamma = gGAMMA    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.3
        self.epsilon_decay = 0.99
        self.learning_rate = gLEARNNING_RATE
        self.model = self._build_model()
        self.target_model = self._build_model()
        # Not used inside this class; presumably read by the training loop to
        # schedule update_target_network() -- TODO confirm.
        self.update_freq = 3000
        self.name = 2

    def _build_model(self):
        """Build and compile the convolutional Q-network.

        The network takes a ``(Y_OF_MYMAP, X_OF_MYMAP, 1)`` board and outputs
        ``action_size`` Q-values; it is trained with MSE loss and RMSprop.
        """
        model = Sequential()
        model.add(Conv2D(1024, kernel_size=(3,3), activation='linear', input_shape=(Y_OF_MYMAP, X_OF_MYMAP, 1)))
        model.add(Conv2D(1024, (2, 2), activation='linear'))
        model.add(Conv2D(1024, (2, 2), activation='linear'))
        model.add(Flatten())
        model.add(Dense(512, activation='linear'))
        model.add(Dense(512, activation='relu'))
        model.add(Dense(512, activation='linear'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',optimizer=keras.optimizers.RMSprop(lr=gLEARNNING_RATE,rho=0.9, epsilon=None, decay=0.0), metrics=[metrics.MeanSquaredError()])
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def predict(self, nn_state):
        """Return the online network output for an already CNN-shaped input."""
        return self.model.predict(nn_state, verbose=0)

    def _winning_move(self, nn_state, legal_moves, player):
        """Return the first legal move that wins at once for `player`, or None."""
        for move in legal_moves:
            _, reward, _, _ = env_step(nn_state, move, player)
            if reward > 0:
                return move
        return None

    def act(self, nn_state, Who_is_playing, _time, test=False):
        """Choose a move for `Who_is_playing` on the given board.

        With probability ``epsilon`` a random empty cell is chosen; otherwise
        the empty cell with the highest Q-value. Unless `test` is true, the
        choice is overridden by a move that wins immediately and, failing
        that, by the move that would let the opponent win immediately.

        Args:
            nn_state: flattened board, indexed as ``nn_state[0][cell]``.
            Who_is_playing: marker of the player to move.
            _time: unused; kept for interface compatibility.
            test: when true, skip the one-step win/lose override.

        Returns:
            Tuple ``(action, q_values)`` with the chosen flat cell index and
            the online network output for this board.

        Raises:
            ValueError: if the board has no empty cell.
        """
        state_2d = convert_1D_state_to_2D_array(nn_state)
        legal_moves = [cell for cell in range(gSTATE_SIZE) if nn_state[0][cell] == EMPTY_SPACE]
        if not legal_moves:
            raise ValueError("act() called on a full board: no legal move left")

        # One forward pass serves both the greedy choice and the return value.
        q_values = self.model.predict(state_2d, verbose=0)

        if np.random.rand() <= self.epsilon:
            internal_action = legal_moves[random.randrange(len(legal_moves))]
        else:
            # max() keeps the first of equal candidates, like np.argmax.
            internal_action = max(legal_moves, key=lambda move: q_values[0][move])

        if not test:
            ####################################################################################
            # One-step win check first, then one-step lose check; the printed
            # messages show whether the network agreed with the forced move.
            ####################################################################################
            forced_checks = (
                (Who_is_playing,
                 "Who_is_playing = {}, AI action hit OneStepWinCheck action, action={}",
                 "Who_is_playing = {}, OneStepWinCheck overrides AI predicted data"),
                (-1 * Who_is_playing,
                 "Who_is_playing = {}, AI action hit OneStepLoseCheck action, action = {}",
                 "Who_is_playing = {}, OneStepLoseCheck overrides AI predicted data"),
            )
            for player, hit_message, override_message in forced_checks:
                forced_move = self._winning_move(nn_state, legal_moves, player)
                if forced_move is not None:
                    if internal_action == forced_move:
                        print(hit_message.format(Who_is_playing, forced_move))
                    else:
                        print(override_message.format(Who_is_playing))
                    return forced_move, q_values

        return internal_action, q_values

    def _bootstrap_values(self, transitions):
        """Return ``max_a Q_target(next_state, a)`` per transition, 0.0 for terminal ones."""
        values = np.zeros(len(transitions), dtype=float)
        live_rows = [row for row, step in enumerate(transitions) if not step[4]]
        if live_rows:
            next_states = np.vstack(
                [convert_1D_state_to_2D_array(transitions[row][3]) for row in live_rows])
            values[live_rows] = np.amax(self.target_model.predict(next_states, verbose=0), axis=1)
        return values

    def replay(self, batch_size):
        """Train the online network on every stored transition, then empty the memory.

        The regression target for the taken action is
        ``reward + gamma * max_a Q_target(next_state, a)`` (just ``reward``
        for terminal transitions). The network is compiled with MSE loss, so
        the label must be the TD target itself, not the squared TD error.
        Cells occupied in ``state`` are pushed to ``LOSER_REWARD * 2``.
        Epsilon is decayed once per training call. Nothing happens when the
        memory is empty.

        Args:
            batch_size: unused; the whole memory is fitted with a Keras batch
                size of 128. Kept for interface compatibility.
        """
        if not self.memory:
            return

        # Newest transition first, the order repeated pop() calls would give.
        transitions = list(reversed(self.memory))
        self.memory.clear()

        states = np.vstack([convert_1D_state_to_2D_array(step[0]) for step in transitions])
        targets = self.model.predict(states, verbose=0)
        bootstrap = self._bootstrap_values(transitions)

        for row, (state, action, replay_reward, _next_state, _done) in enumerate(transitions):
            targets[row][action] = replay_reward + self.gamma * bootstrap[row]
            occupied = np.asarray(state)[0] != EMPTY_SPACE
            targets[row][occupied] = LOSER_REWARD * 2

        csv_logger = CSVLogger('log_new_dqn.csv', append=True, separator=';')
        self.model.fit(states, targets, batch_size=128, shuffle=False, epochs=1, verbose=2, callbacks=[csv_logger])
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def load(self, name):
        """Load online-network weights from the file `name`."""
        self.model.load_weights(name)

    def save(self, name):
        """Save online-network weights to ``"DQN_saved" + name``."""
        self.model.save_weights("DQN_saved" + name)

class DDQNAgent:
    """Double deep Q-learning agent.

    The online network (``self.model``) chooses moves and picks the best
    next action; the target network (``self.target_model``) evaluates that
    action when building training targets. States are flattened boards
    indexed as ``state[0][cell]``; actions are flat cell indices.
    """

    def __init__(self, state_size, action_size):
        """Store the hyper-parameters and build the online and target networks.

        Args:
            state_size: number of cells in the flattened state.
            action_size: number of network outputs (one Q-value per action).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=6000)
        self.gamma = gGAMMA    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.3
        self.epsilon_decay = 0.99
        self.learning_rate = gLEARNNING_RATE
        self.model = self._build_model()
        self.target_model = self._build_model()
        # Not used inside this class; presumably read by the training loop to
        # schedule update_target_network() -- TODO confirm.
        self.update_freq = 3000
        self.name = 3

    def _build_model(self):
        """Build and compile the convolutional Q-network.

        The network takes a ``(Y_OF_MYMAP, X_OF_MYMAP, 1)`` board and outputs
        ``action_size`` Q-values; it is trained with MSE loss and RMSprop.
        """
        model = Sequential()
        model.add(Conv2D(1024, kernel_size=(3,3), activation='linear', input_shape=(Y_OF_MYMAP, X_OF_MYMAP, 1)))
        model.add(Conv2D(1024, (2, 2), activation='linear'))
        model.add(Conv2D(1024, (2, 2), activation='linear'))
        model.add(Flatten())
        model.add(Dense(512, activation='linear'))
        model.add(Dense(512, activation='relu'))
        model.add(Dense(512, activation='linear'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',optimizer=keras.optimizers.RMSprop(lr=gLEARNNING_RATE,rho=0.9, epsilon=None, decay=0.0), metrics=[metrics.MeanSquaredError()])
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def predict(self, nn_state):
        """Return the online network output for an already CNN-shaped input."""
        return self.model.predict(nn_state, verbose=0)

    def _winning_move(self, nn_state, legal_moves, player):
        """Return the first legal move that wins at once for `player`, or None."""
        for move in legal_moves:
            _, reward, _, _ = env_step(nn_state, move, player)
            if reward > 0:
                return move
        return None

    def act(self, nn_state, Who_is_playing, _time, test=False):
        """Choose a move for `Who_is_playing` on the given board.

        With probability ``epsilon`` a random empty cell is chosen; otherwise
        the empty cell with the highest Q-value. Unless `test` is true, the
        choice is overridden by a move that wins immediately and, failing
        that, by the move that would let the opponent win immediately.

        Args:
            nn_state: flattened board, indexed as ``nn_state[0][cell]``.
            Who_is_playing: marker of the player to move.
            _time: unused; kept for interface compatibility.
            test: when true, skip the one-step win/lose override.

        Returns:
            Tuple ``(action, q_values)`` with the chosen flat cell index and
            the online network output for this board.

        Raises:
            ValueError: if the board has no empty cell.
        """
        state_2d = convert_1D_state_to_2D_array(nn_state)
        legal_moves = [cell for cell in range(gSTATE_SIZE) if nn_state[0][cell] == EMPTY_SPACE]
        if not legal_moves:
            raise ValueError("act() called on a full board: no legal move left")

        # One forward pass serves both the greedy choice and the return value.
        q_values = self.model.predict(state_2d, verbose=0)

        if np.random.rand() <= self.epsilon:
            internal_action = legal_moves[random.randrange(len(legal_moves))]
        else:
            # max() keeps the first of equal candidates, like np.argmax.
            internal_action = max(legal_moves, key=lambda move: q_values[0][move])

        if not test:
            ####################################################################################
            # One-step win check first, then one-step lose check; the printed
            # messages show whether the network agreed with the forced move.
            ####################################################################################
            forced_checks = (
                (Who_is_playing,
                 "Who_is_playing = {}, AI action hit OneStepWinCheck action, action={}",
                 "Who_is_playing = {}, OneStepWinCheck overrides AI predicted data"),
                (-1 * Who_is_playing,
                 "Who_is_playing = {}, AI action hit OneStepLoseCheck action, action = {}",
                 "Who_is_playing = {}, OneStepLoseCheck overrides AI predicted data"),
            )
            for player, hit_message, override_message in forced_checks:
                forced_move = self._winning_move(nn_state, legal_moves, player)
                if forced_move is not None:
                    if internal_action == forced_move:
                        print(hit_message.format(Who_is_playing, forced_move))
                    else:
                        print(override_message.format(Who_is_playing))
                    return forced_move, q_values

        return internal_action, q_values

    def _bootstrap_values(self, transitions):
        """Return the double-Q bootstrap value per transition, 0.0 for terminal ones.

        The online network selects the best next action and the target
        network supplies that action's value.
        """
        values = np.zeros(len(transitions), dtype=float)
        live_rows = [row for row, step in enumerate(transitions) if not step[4]]
        if live_rows:
            next_states = np.vstack(
                [convert_1D_state_to_2D_array(transitions[row][3]) for row in live_rows])
            best_actions = np.argmax(self.model.predict(next_states, verbose=0), axis=1)
            target_q = self.target_model.predict(next_states, verbose=0)
            values[live_rows] = target_q[np.arange(len(live_rows)), best_actions]
        return values

    def replay(self, batch_size):
        """Train the online network on every stored transition, then empty the memory.

        The regression target for the taken action is
        ``reward + gamma * Q_target(next_state, argmax_a Q_online(next_state, a))``
        (just ``reward`` for terminal transitions). The network is compiled
        with MSE loss, so the label must be the TD target itself, not the
        squared TD error. Cells occupied in ``state`` are pushed to
        ``LOSER_REWARD * 2``. Epsilon is decayed once per training call.
        Nothing happens when the memory is empty.

        Args:
            batch_size: unused; the whole memory is fitted with a Keras batch
                size of 128. Kept for interface compatibility.
        """
        if not self.memory:
            return

        print("REPLAYING")
        # Newest transition first, the order repeated pop() calls would give.
        transitions = list(reversed(self.memory))
        self.memory.clear()

        states = np.vstack([convert_1D_state_to_2D_array(step[0]) for step in transitions])
        targets = self.model.predict(states, verbose=0)
        bootstrap = self._bootstrap_values(transitions)

        for row, (state, action, replay_reward, _next_state, _done) in enumerate(transitions):
            targets[row][action] = replay_reward + self.gamma * bootstrap[row]
            occupied = np.asarray(state)[0] != EMPTY_SPACE
            targets[row][occupied] = LOSER_REWARD * 2

        csv_logger = CSVLogger('log_ddqn.csv', append=True, separator=';')
        self.model.fit(states, targets, batch_size=128, shuffle=False, epochs=1, verbose=2, callbacks=[csv_logger])
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def load(self, name):
        """Load online-network weights from the file `name`."""
        self.model.load_weights(name)

    def save(self, name):
        """Save online-network weights to ``"DDQN_saved" + name``."""
        self.model.save_weights("DDQN_saved" + name)


class CQLAgent:
    """Q-learning agent whose per-sample loss adds a log-sum-exp (CQL-style) penalty.

    Holds an online network (``self.model``), a target network
    (``self.target_model``) built with the same architecture, and a bounded
    replay memory of ``(state, action, reward, next_state, done)`` tuples.
    """

    def __init__(self, state_size, action_size):
        """Create the agent and build both networks.

        Args:
            state_size: stored as ``self.state_size``; not read elsewhere in this class.
            action_size: width of the final Dense layer (one output per action).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=6000)  # bounded: oldest entries fall off beyond 6000
        self.gamma = gGAMMA    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.3
        self.epsilon_decay = 0.99
        self.learning_rate = gLEARNNING_RATE
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_freq = 3000  # not read anywhere inside this class
        self.name = 4  # numeric id; tournament_training records it for the selected agent

    def _build_model(self):
        """Build and compile the convolutional Q-network.

        Returns:
            A compiled Keras ``Sequential`` model taking input of shape
            ``(Y_OF_MYMAP, X_OF_MYMAP, 1)`` and producing ``self.action_size``
            linear outputs, trained with MSE loss and RMSprop.
        """
        model = Sequential()
        model.add(Conv2D(1024, kernel_size=(3,3), activation='linear', input_shape=(Y_OF_MYMAP, X_OF_MYMAP, 1)))
        model.add(Conv2D(1024, (2, 2), activation='linear'))
        model.add(Conv2D(1024, (2, 2), activation='linear'))
        model.add(Flatten())
        model.add(Dense(512, activation='linear'))
        model.add(Dense(512, activation='relu'))
        model.add(Dense(512, activation='linear'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',optimizer=keras.optimizers.RMSprop(lr=gLEARNNING_RATE,rho=0.9, epsilon=None, decay=0.0), metrics=[metrics.MeanSquaredError()])
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def predict(self, nn_state):
        """Return the online network's prediction for ``nn_state`` (passed to Keras unchanged)."""
        return self.model.predict(nn_state, verbose=0)

    def act(self, nn_state, Who_is_playing, _time, test=False):
        """Choose a move for ``Who_is_playing`` on the flat board ``nn_state``.

        With probability ``self.epsilon`` a random empty cell is picked;
        otherwise the network's arg-max is used, falling back to the best-valued
        empty cell when the arg-max cell is occupied. Unless ``test`` is true,
        two scripted overrides are then applied via ``env_step``: first any move
        that yields a positive reward for the current player, then any move that
        would yield a positive reward for the opponent.

        Args:
            nn_state: board indexed as ``nn_state[0][cell]``.
            Who_is_playing: player constant of the side to move.
            _time: accepted for interface compatibility; not used.
            test: when true, skip the one-step win/lose overrides.

        Returns:
            ``(action, q_values)`` where ``q_values`` is the online network's
            prediction for the current state.
        """
        internal_action = -1
        # Scratch row whose first `available_location_count` entries hold the
        # indices of the empty cells.
        available_location = copy.deepcopy(MYMAP)
        available_location = np.reshape(available_location, [1, gSTATE_SIZE])
        available_location_count = 0

        state_2d=convert_1D_state_to_2D_array(nn_state)

        for x in range (0, Y_OF_MYMAP * X_OF_MYMAP ,1):
            if (nn_state[0][x] == EMPTY_SPACE ):
                available_location[0][available_location_count] = x
                available_location_count=available_location_count+1

        # Retry until a legal action (> -1) has been selected.
        while (True):
            if np.random.rand() <= self.epsilon:
                # Exploration: uniform choice among the empty cells.
                index = random.randrange(0, available_location_count,1)
                internal_action = available_location[0][index]
                if nn_state[0][internal_action] != EMPTY_SPACE:
                    internal_action = -1
                    print("random choosing new action")
            else:
                # Exploitation: greedy action from the online network.
                act_values = self.model.predict(state_2d, verbose=0)
                internal_action = np.argmax(act_values[0])
                if nn_state[0][internal_action] != EMPTY_SPACE:
                    # Arg-max points at an occupied cell: rescan the empty
                    # cells and keep the one with the highest predicted value.
                    max_q = -99999
                    for p in range (0,available_location_count,1 ):
                        if max_q < act_values[0][available_location[0][p]]:
                            max_q = act_values[0][available_location[0][p]]
                            internal_action = available_location[0][p]
                        if nn_state[0][internal_action] != EMPTY_SPACE:
                            internal_action = -1

            if test:
                if internal_action > -1:
                    break
            else:
                ####################################################################################
                # One-step-win check: take any move that gives the current player a positive reward.
                ####################################################################################
                for _index in range (0, available_location_count,1):
                    _osc_Action = available_location[0][_index]
                    _osc_next_state, _osc_reward, _osc_done, _ = env_step(nn_state, _osc_Action, Who_is_playing)
                    if _osc_reward > 0:
                        if internal_action==_osc_Action:
                            print("Who_is_playing = {}, AI action hit OneStepWinCheck action, action={}".format(Who_is_playing,_osc_Action))
                        else:
                            print("Who_is_playing = {}, OneStepWinCheck overrides AI predicted data".format(Who_is_playing))
                        internal_action = _osc_Action
                        return internal_action, self.model.predict(state_2d, verbose=0)

                ####################################################################################
                # One-step-lose check: occupy any cell that would give the opponent a positive reward.
                ####################################################################################
                for _index in range (0, available_location_count,1):
                    _osc_Action = available_location[0][_index]
                    _osc_next_state, _osc_reward, _osc_done, _ = env_step(nn_state, _osc_Action, -1 * Who_is_playing)
                    if _osc_reward > 0:
                        if internal_action==_osc_Action:
                            print("Who_is_playing = {}, AI action hit OneStepLoseCheck action, action = {}".format(Who_is_playing,_osc_Action))
                        else:
                            print("Who_is_playing = {}, OneStepLoseCheck overrides AI predicted data".format(Who_is_playing))
                        internal_action = _osc_Action
                        return internal_action, self.model.predict(state_2d, verbose=0)

                if internal_action > -1:
                    break

        return internal_action, self.model.predict(state_2d, verbose=0)

    def replay(self, batch_size):
        """Drain the whole replay memory and fit the online network on it once.

        Every stored transition is popped (newest first), turned into one
        training row, and the network is fitted for a single epoch. Epsilon is
        then decayed while it is above ``self.epsilon_min``.

        Args:
            batch_size: accepted for interface compatibility; not used (the
                entire memory is consumed and Keras is given ``batch_size=128``).
        """
        print("replayingg")
        minibatch = []
        _memory_length = len(self.memory)
        _CNN_all_state=np.zeros(( _memory_length, Y_OF_MYMAP, X_OF_MYMAP,1), dtype=float)
        _target_f_all=np.zeros(( _memory_length,gSTATE_SIZE), dtype=float)
        _index = 0

        # pop() removes entries, so the memory is empty once this loop finishes.
        for _ in range(_memory_length):
            minibatch.append(self.memory.pop())

        for state, action, replay_reward, next_state, done in minibatch:
            CNN_signle_state=convert_1D_state_to_2D_array(state)

            # Collect the indices of the occupied cells of `state`.
            unavailable_location = copy.deepcopy(MYMAP)
            unavailable_location = np.reshape(unavailable_location, [1, gSTATE_SIZE])
            unavailable_location_count = 0
            for x in range (0, Y_OF_MYMAP * X_OF_MYMAP ,1):
                if (state[0][x] != EMPTY_SPACE ):
                    unavailable_location[0][unavailable_location_count] = x
                    unavailable_location_count=unavailable_location_count+1

            # Shift and scale the stored reward before it enters the target.
            re_n=(replay_reward+1)*100
            CNN_signle_state_next=convert_1D_state_to_2D_array(next_state)
            qa_t_values = self.model.predict(CNN_signle_state, verbose=0)

            q_t_values=tf.convert_to_tensor([qa_t_values[0][action]])
            # Double-Q style target: the online network selects the next action,
            # the target network supplies its value.
            qa_tp1_values = self.target_model(CNN_signle_state_next)
            # The closing parenthesis was missing here, making the class a SyntaxError.
            next_actions=np.argmax(self.model.predict(CNN_signle_state_next, verbose=0))
            q_tp1=tf.convert_to_tensor([qa_tp1_values[0][next_actions]])
            target = tf.stop_gradient(re_n + self.gamma * q_tp1)
            loss =tf.math.reduce_mean((q_t_values-target)**2)
            # Penalty term: log-sum-exp over all action values minus the taken action's value.
            q_t_logsumexp =  tf.math.reduce_logsumexp(qa_t_values,1)
            cql_loss=tf.math.reduce_mean(q_t_logsumexp - q_t_values)
            loss=loss+0.05*cql_loss
            target_f = self.model.predict(CNN_signle_state, verbose=0)
            # NOTE(review): the scalar loss, not the TD target, is stored as the
            # regression label for the taken action -- confirm this is intended.
            target_f[0][action] = loss

            # Occupied cells get a fixed strongly negative label.
            for x in range (0, unavailable_location_count, 1):
                target_f[0][unavailable_location[0][x]] = LOSER_REWARD * 2

            if _index == 0:
                _CNN_all_state = copy.deepcopy(CNN_signle_state)
            else:
                _CNN_all_state = np.vstack((_CNN_all_state,CNN_signle_state))

            _target_f_all[_index]=copy.deepcopy(target_f)

            _index = _index + 1
        csv_logger = CSVLogger('log_cql.csv', append=True, separator=';')
        self.model.fit(_CNN_all_state, _target_f_all, batch_size=128, shuffle=False, epochs=1, verbose=2, callbacks=[csv_logger])
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def load(self, name):
        """Load weights into the online network from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save the online network's weights to ``"CQL_saved" + name``."""
        self.model.save_weights("CQL_saved"+name)


def get_best_agent(agents):
  """Run a tournament and return the top-scoring agent.

  When several agents share the highest tournament score, one of them is
  picked uniformly at random.
  """
  scores = run_tournament(agents)
  top_score = scores.max()
  leader_indices = np.where(scores == top_score)[0]
  winner_index = np.random.choice(leader_indices)
  return agents[winner_index]

def soft_get_agent(agents):
  """Run a tournament and sample one agent with probability proportional to its score.

  The tournament scores are normalised by their sum (a linear normalisation,
  despite the ``softmax_weights`` name) and used as sampling probabilities.
  If every score is zero the choice falls back to a uniform draw, because
  dividing by a zero sum would produce NaN probabilities.

  Returns:
    The sampled agent object from ``agents``.
  """
  agent_weights = np.asarray(run_tournament(agents), dtype=float)
  total = agent_weights.sum()
  if total > 0:
    softmax_weights = (agent_weights / total).tolist()
  else:
    softmax_weights = [1.0 / len(agents)] * len(agents)
  # `np.choice` does not exist; the sampler lives in `np.random`. An index is
  # drawn instead of the agent itself so NumPy never has to coerce the agent
  # objects into an array.
  chosen_index = np.random.choice(len(agents), p=softmax_weights)
  agent = agents[chosen_index]
  print(softmax_weights, agent)
  return agent

In [None]:
from tqdm.notebook import tqdm

def run_tournament(agents, n_iter=None):
  """Play a double round-robin between ``agents`` and return each agent's score.

  Every ordered pair ``(i, j)`` with ``i != j`` is played twice; the agent
  listed first in a pair is passed to ``play_game`` as the first player. A win
  is worth 1 point. Any other result is treated as a tie and split 0.4 / 0.6
  in favour of the second player.

  Args:
    agents: sequence of agent objects accepted by ``play_game``.
    n_iter: accepted for interface compatibility; currently not used.

  Returns:
    A float NumPy array of length ``len(agents)`` holding the accumulated scores.
  """
  agent_count = len(agents)
  # Float dtype is required: with an integer array the fractional tie credits
  # below would be truncated to 0 when written back.
  agent_weights = np.zeros(agent_count, dtype=float)

  # All ordered pairings, in the same order as the original hand-written table
  # for four agents, repeated for two passes.
  one_pass = [(first, second)
              for first in range(agent_count)
              for second in range(agent_count)
              if first != second]
  schedule = one_pass * 2

  for idx1, idx2 in tqdm(schedule):
      agent1 = agents[idx1]
      agent2 = agents[idx2]

      # play the game
      winner_id = play_game(agent1, agent2)
      if winner_id == 1:
        agent_weights[idx1] += 1
      elif winner_id == -1:
        agent_weights[idx2] += 1
      else:
        # Asymmetric reward for a tie because player 1 has an advantage
        agent_weights[idx1] += .4
        agent_weights[idx2] += .6
  print(agent_weights)
  return agent_weights

def train_vs(training_agent, best_agent, log, eps=40):
    """Train ``training_agent`` (playing Black) against ``best_agent`` (playing White).

    Plays ``eps + 1`` games; the side that moves first is drawn at random for
    each game. Finished games are written into the agents' replay memories and
    only Black is trained (``Agent_Black.replay``). When the last game
    (``e == eps``) ends and at least one decisive game has been seen, Black is
    evaluated over 20 games moving first and 20 games moving second, and the
    win rates are appended to ``log``.

    Args:
        training_agent: agent that plays Black and is trained.
        best_agent: agent that plays White.
        log: mapping such that ``log[str(training_agent)]["First"]`` and
            ``["Second"]`` are lists; evaluation win rates are appended to them.
        eps: index of the last game; ``eps + 1`` games are played.

    Side effects:
        Sets ``epsilon`` of both agents to 0.0, appends to both agents' replay
        memories, and prints progress and evaluation results.
    """
    state_size = X_OF_MYMAP * Y_OF_MYMAP
    Agent_Black = training_agent
    Agent_White = best_agent

    # Both sides act greedily during the training games.
    Agent_Black.epsilon = 0.0
    Agent_White.epsilon = 0.0

    Win_Black_Count = 0
    Win_White_Count = 0
    Win_Draw_Count = 0

    BLACK_batch_size = 96
    WHITE_batch_size = 96

    # time_step counts replay triggers; prev_step remembers the last value at
    # which the target network was synchronised.
    time_step = 0
    prev_step = 0
    for e in range(eps + 1):
        # Environment reset
        Running_MAP = copy.deepcopy(MYMAP)
        Who_is_playing = np.random.choice([WHITE_PLAYER, BLACK_PLAYER], 1)[0]

        NN_state = np.reshape(Running_MAP, [1, state_size])  # reshape Running_MAP to NN input form

        # Per-step history: index `step` holds the board seen and the action
        # taken by that colour at that step. Rows for steps the colour did not
        # play stay all-zero. The White history is recorded but not read here.
        BLACK_History_State = np.zeros((state_size + 1, 1, state_size), dtype=int)
        WHITE_History_State = np.zeros((state_size + 1, 1, state_size), dtype=int)

        BLACK_History_Action = np.zeros((state_size + 1), dtype=int)
        WHITE_History_Action = np.zeros((state_size + 1), dtype=int)

        # `step` was called `time` originally, which hid the `time` module name
        # inside this function.
        for step in range(1, state_size + 1, 1):
            if (Who_is_playing == WHITE_PLAYER):
                Action, _ = Agent_White.act(NN_state, Who_is_playing, step)

            elif (Who_is_playing == BLACK_PLAYER):
                Action, _ = Agent_Black.act(NN_state, Who_is_playing, step)

            next_state, reward, done, _ = env_step(NN_state, Action, Who_is_playing)

            if Who_is_playing == BLACK_PLAYER:
                BLACK_History_State[step] = copy.deepcopy(NN_state)
                BLACK_History_Action[step] = Action

            if Who_is_playing == WHITE_PLAYER:
                WHITE_History_State[step] = copy.deepcopy(NN_state)
                WHITE_History_Action[step] = Action

            # NOTE(review): a win is detected here as `reward == 100`, while the
            # win counters below use `reward > 0` -- confirm that env_step
            # really returns 100 for a winning move.
            if done and reward == 100:  # Someone WIN
                if Who_is_playing == BLACK_PLAYER:      # BLACK_PLAYER WIN
                    Agent_Black.remember(NN_state, Action, WINNER_REWARD, next_state, True)
                    # Walk back over Black's earlier moves (every second step),
                    # discounting the reward once more per move.
                    _count = 1
                    for _end_index in range(step, 1, -2):
                        Agent_Black.remember(BLACK_History_State[_end_index-2], BLACK_History_Action[_end_index-2], WINNER_REWARD * ( gGAMMA ** _count) , BLACK_History_State[_end_index], True)
                        _count = _count + 1

                elif Who_is_playing == WHITE_PLAYER:    # WHITE_PLAYER WIN
                    Agent_White.remember(NN_state, Action, WINNER_REWARD, next_state, True)
                    # Black's last move was one step before White's winning move.
                    _count = 1
                    for _end_index in range(step-1, 1, -2):
                        Agent_Black.remember(BLACK_History_State[_end_index-2], BLACK_History_Action[_end_index-2], LOSER_REWARD  * ( gGAMMA ** _count), BLACK_History_State[_end_index], True)
                        _count = _count + 1

                else:
                    print ("PANIC - unknown who is Winner")
                    sys.exit()

            elif done and reward ==0:  ## DRAW
                if Who_is_playing == BLACK_PLAYER:
                    Agent_Black.remember(NN_state, Action, DRAW_REWARD, next_state, True)

                # Start from the last step Black actually played. The original
                # always started at the current step, so when White made the
                # final move it stored the all-zero placeholder rows instead of
                # Black's moves (the White-win branch above already uses step-1).
                if Who_is_playing == BLACK_PLAYER:
                    last_black_step = step
                else:
                    last_black_step = step - 1
                for _end_index in range(last_black_step, 1, -2):
                    Agent_Black.remember(BLACK_History_State[_end_index-2], BLACK_History_Action[_end_index-2], DRAW_REWARD, BLACK_History_State[_end_index], True)

            NN_state = copy.deepcopy(next_state)  # update NN_state to new state

            # Only Black is trained; White's memory size merely advances the counter.
            if len(Agent_Black.memory) > BLACK_batch_size and done == True:
                Agent_Black.replay(BLACK_batch_size)
                time_step+=1
            if len(Agent_White.memory) > WHITE_batch_size and done == True:
                time_step+=1
            if time_step%10==0 and time_step!=prev_step:
                prev_step=time_step
                Agent_Black.update_target_network()

            if done == True:   # This round is finished.

                if reward>0:
                    if Who_is_playing == BLACK_PLAYER:
                        Win_Black_Count = Win_Black_Count+1
                    elif Who_is_playing == WHITE_PLAYER:
                        Win_White_Count = Win_White_Count+1
                    else:
                        print ("PANIC, unknow know who is winner")

                elif reward == 0:
                    Win_Draw_Count = Win_Draw_Count + 1

                if (Win_Black_Count+Win_White_Count)>0:
                    # Report progress against this call's own episode count
                    # (`eps`) rather than the unrelated global EPISODES.
                    print("Action={} BlackWinRate={} Black Wins={} White Wins={} Draw={} episode: {}/{}, used_step: {}, epsilon: {:.2}"
                            .format(Action, Win_Black_Count/(Win_Black_Count+Win_White_Count), Win_Black_Count,Win_White_Count, Win_Draw_Count, e, eps, step, Agent_Black.epsilon))
                    if e != 0 and e % eps==0:
                        test = 20
                        win = 0
                        draw =0

                        # Evaluation: Black greedy, White fully random.
                        # NOTE(review): the messages say "against the best" but
                        # White plays with epsilon 1.0 here -- confirm intent.
                        temp_b = Agent_Black.epsilon
                        Agent_Black.epsilon = 0.0
                        temp_a = Agent_White.epsilon
                        Agent_White.epsilon = 1.0

                        for i in range(test):
                            result = play_game(Agent_Black, Agent_White)
                            if result == 1:
                                win +=1
                            elif result == 0:
                                draw +=1

                        # Printed once (the original repeated this line six times).
                        print('after ', e, ' iterations', "the win rate against the best (going first)is: ", win/test, "draw rate is: ", draw/test)
                        log[str(Agent_Black)]["First"].append(win/test)
                        win_sec = 0
                        draw_sec = 0
                        for i in range(test):
                            result = play_game(Agent_White, Agent_Black)
                            if result == -1:
                                win_sec +=1
                            elif result == 0:
                                draw_sec +=1

                        # Printed once (the original repeated this line eight times).
                        print('after ', e, ' iterations', "the win rate against the best (going second)is: ", win_sec/test, "draw rate is: ", draw_sec/test)
                        log[str(Agent_Black)]["Second"].append(win_sec/test)

                        Agent_Black.epsilon = temp_b  # change it back to training stage
                        Agent_White.epsilon = temp_a
                break

            Who_is_playing = -1 * Who_is_playing

def tournament_training(n=25):
  """Run ``n`` rounds of tournament training with greedy opponent selection.

  Builds the four agents, loads their pretrained weights, and in each round
  picks the tournament winner via ``get_best_agent``, trains every agent
  against it with ``train_vs``, writes the win/loss log CSV and saves each
  agent's weights.
  """
  agent_classes = (DQNAgent, DDQNAgent, CQLAgent, Two_Net_DQN_Agent)
  agents = [build(gSTATE_SIZE, gTOTAL_ACTIONs) for build in agent_classes]
  weight_files = (
      "/content/cs285_project/models/dqn_one_net_agent.h5",
      "/content/cs285_project/models/ddqn_agent.h5",
      "/content/cs285_project/models/cql.h5",
      "/content/cs285_project/models/dqn_agent.h5",
  )

  # One {"First": [...], "Second": [...]} record per agent, keyed by str(agent).
  wl_rates = {}
  for agent in agents:
    wl_rates[str(agent)] = {"First": [], "Second": []}

  log_csv = f"/content/cs285_project/logs/{time.strftime('%d-%m-%Y_%H-%M-%S')}_WL_new_rate_logs_tournament.csv"
  print("PATH", log_csv)

  selected_names = []
  for position, agent in enumerate(agents):
    agent.load(weight_files[position])

  for _round in tqdm(range(n)):
    pd.DataFrame(wl_rates).to_csv(log_csv)
    champion = get_best_agent(agents)
    selected_names.append(champion.name)
    np.savetxt(fname="best_agents_in_time.csv", delimiter=",", X=selected_names)
    for learner in agents:
      train_vs(learner, champion, wl_rates)
    # Persist the log and every agent's weights at the end of each round.
    pd.DataFrame(wl_rates).to_csv(log_csv)
    for trained in agents:
      trained.save("_saved_agent.h5")
tournament_training()

def tournament_training(n=25):
  """Run ``n`` rounds of tournament training with score-weighted opponent selection.

  Same procedure as the greedy variant defined earlier in this cell (which
  this definition replaces once executed), except that each round's opponent
  is sampled by ``soft_get_agent`` instead of taking the tournament winner.

  Args:
    n: number of training rounds.

  NOTE(review): the original body ended with an indented call to
  ``tournament_training()``, so every invocation recursed without bound after
  its ``n`` rounds, rebuilding the agents and reloading the pretrained weights
  each time. That self-call has been removed. If a module-level invocation was
  intended, call ``tournament_training()`` explicitly after this definition.
  """
  #loading in pretrained models
  agents = [
      DQNAgent(gSTATE_SIZE, gTOTAL_ACTIONs),
      DDQNAgent(gSTATE_SIZE, gTOTAL_ACTIONs),
      CQLAgent(gSTATE_SIZE, gTOTAL_ACTIONs),
      Two_Net_DQN_Agent(gSTATE_SIZE, gTOTAL_ACTIONs),
  ]
  pathes = [
      "/content/cs285_project/models/dqn_one_net_agent.h5",
      "/content/cs285_project/models/ddqn_agent.h5",
      "/content/cs285_project/models/cql.h5",
      "/content/cs285_project/models/dqn_agent.h5"
  ]
  # Win-rate history per agent, keyed by str(agent); filled in by train_vs.
  agent_wl_rates = {str(a):{"First":[], "Second":[]} for a in agents}
  path = f"/content/cs285_project/logs/{time.strftime('%d-%m-%Y_%H-%M-%S')}_WL_new_rate_logs_tournament.csv"
  print("PATH",path)
  best_agents=[]
  for a, p in zip(agents, pathes):
    a.load(p)
  for i in tqdm(range(n)):
    pd.DataFrame(agent_wl_rates).to_csv(path)
    best_agent = soft_get_agent(agents)
    # Record which agent was selected this round (by its numeric `name`).
    best_agents.append(best_agent.name)
    np.savetxt(fname="best_agents_in_time.csv", delimiter=",", X=best_agents)
    for a in agents:
      train_vs(a, best_agent, agent_wl_rates)
    pd.DataFrame(agent_wl_rates).to_csv(path)
    for a in agents:
      a.save("_saved_agent.h5")