In [246]:
import os, sys
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

import importlib
import ChessEngine
import Minimax.SmartMoveFinder as SmartMoveFinder
import Minimax.Evaluate as Evaluate
import ChessEnv
import replaybuffer
import network
import copy 
import random
import torch
import torch.nn as nn
import tqdm
from tqdm import trange

importlib.reload(ChessEngine)
importlib.reload(ChessEnv)
importlib.reload(replaybuffer)
importlib.reload(network)

<module 'network' from 'd:\\User\\ProjectGithub\\hiepnguyenn-99\\Chess\\RL\\network.py'>

In [247]:
env = ChessEnv.Env()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

q_net = network.DQN(env.action_size).to(device)

# load mô hình lưu
model_path = 'DQN.pth'
if os.path.exists(model_path):
    checkpoint = torch.load(model_path, map_location=device)
    q_net.load_state_dict(checkpoint)
    q_net.train()
    print("Đã load model")
else:
    print("Không tìm thấy model")

target_net = copy.deepcopy(q_net).to(device)
target_net.eval()
optimizer = torch.optim.Adam(q_net.parameters(), lr=1e-4)
criterion  = nn.MSELoss(reduction='mean')

cuda
Đã load model


In [None]:
env = ChessEnv.Env()
capacity = 32000
rb = replaybuffer.ReplayBuffer(capacity)
batch_size = 64
epsilon = 1.0
epsilon_final = 0.01
epsilon_decay = 0.995
gamma = 0.99
step = 0
target_update_freq = 1250
env.gs.whiteToMove = True # train trắng đi trước (minimax)

for episode in trange(20000):
    done = False
    state = env.reset()
    while not done:
        # nước đi của minimax
        white_capture = None # kiểm tra minimax có ăn quân của rl không
        if env.gs.whiteToMove:
            if Evaluate.check_mid_game(env.gs):
                SmartMoveFinder.DEPTH = 4
            else:
                SmartMoveFinder.DEPTH = 3
            MinimaxMove = SmartMoveFinder.findBestMinimaxMove(env.gs, env.gs.getValidMoves())
            if MinimaxMove is None:
                MinimaxMove = SmartMoveFinder.findRandomMove(env.gs.getValidMoves())
            if MinimaxMove.pieceCaptured != '--':
                white_capture = MinimaxMove.pieceCaptured[1]

            env.gs.makeMove(MinimaxMove)

        state_tensor = env.state_to_tensor()
        lenlegalmove = len(env.gs.getValidMoves())
        # khám phá
        if lenlegalmove != 0:    
            if random.random() < epsilon:
                move = random.choice(env.gs.getValidMoves())
                action = env.moveid_to_index[move.moveID]

            # khai thác
            else:
                q_value = q_net(state_tensor.unsqueeze(0).to(device)) 
                action = q_value.argmax()
            
        next_state_tensor, reward, done, before_legal_mask, after_legal_mask = env.step(action, white_capture, lenlegalmove) # đen đã đi, kiểm tra đen bị ăn quân ko
        rb.push(state_tensor, action, reward, next_state_tensor, done, before_legal_mask, after_legal_mask)

        if rb.__len__() >= batch_size:
            batch = rb.sample(batch_size)
            # chuyển về tensor
            states, actions, reward, next_states, dones, before_legal_masks, after_legal_mask= zip(*batch)
            states = torch.stack([s.to(device) for s in states])
            actions = torch.tensor(actions, device=device, dtype=torch.int64).unsqueeze(1) # (B, 1)
            reward = torch.tensor(reward, device=device, dtype=torch.float32).unsqueeze(1)
            next_states = torch.stack([ns.to(device) for ns in next_states])
            before_legal_masks = torch.stack([b.to(device) for b in before_legal_masks])
            after_legal_mask = torch.stack([a.to(device) for a in after_legal_mask])
            dones = torch.tensor(dones, device=device, dtype=torch.float32).unsqueeze(1)

            with torch.no_grad():
                next_q_values = target_net(next_states) # (B, action_size)
                next_q_values[~after_legal_mask] = -torch.inf
                next_q_max = next_q_values.max(1)[0].unsqueeze(1) # (B, 1)

                # nếu không còn nước đi hợp lệ của next_states thì đặt bằng 0
                legal_exists = after_legal_mask.any(dim=1, keepdim=True)
                next_q_max = torch.where(legal_exists, next_q_max, torch.zeros_like(next_q_max))

                q_target = reward + gamma * (1 - dones) * next_q_max # (B, 1)

            q_values = q_net(states.to(device)) # (B, action_size)
            q_values[~before_legal_masks] = -torch.inf
            q_value = q_values.gather(dim=1, index=actions) # tính q value với hành động đã chọn (B, 1)
            if step % 125 == 0:
                with open('train_log.txt', 'a') as f:
                    print(f'q_target max {q_target[0][0]}', file=f)
                    print(f'q_value max {q_value[0][0]}', file=f)
                print(f'q_target max {q_target[0][0]}')
                print(f'q_value max {q_value[0][0]}')

            loss = criterion(q_value, q_target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            step += 1

            if step % target_update_freq == 0:
                target_net.load_state_dict(q_net.state_dict())
                torch.save(target_net.state_dict(), 'DQN.pth')
                print(f'step {step}, model saved')

        epsilon = max(epsilon_final, epsilon*epsilon_decay)


  0%|          | 0/20000 [00:00<?, ?it/s]

total sum reward 0.0
total sum reward 0.0


  0%|          | 7/20000 [01:24<22:01:18,  3.97s/it] 

q_target max 0.11671483516693115
q_value max -0.009818974882364273


  0%|          | 46/20000 [03:15<1:47:28,  3.09it/s] 

q_target max -10.0
q_value max -inf


  1%|          | 126/20000 [04:04<4:07:11,  1.34it/s]

q_target max 0.022378679364919662
q_value max 0.02067379094660282


  1%|          | 237/20000 [04:33<59:03,  5.58it/s]  

q_target max 0.18753160536289215
q_value max 0.18508148193359375


  2%|▏         | 347/20000 [05:02<1:02:31,  5.24it/s]

q_target max 0.15514308214187622
q_value max 0.15550145506858826


  2%|▏         | 466/20000 [05:27<1:05:23,  4.98it/s]

q_target max 0.10673297941684723
q_value max 0.10646098107099533


  3%|▎         | 586/20000 [05:52<57:04,  5.67it/s]  

q_target max -10.0
q_value max -inf


  4%|▎         | 712/20000 [06:16<58:45,  5.47it/s]  

q_target max -10.0
q_value max -inf


  4%|▍         | 835/20000 [06:39<59:42,  5.35it/s]  

q_target max -10.0
q_value max -inf


  5%|▍         | 960/20000 [07:03<58:49,  5.39it/s]  

q_target max -10.0
q_value max -inf


  5%|▌         | 1081/20000 [07:27<1:05:18,  4.83it/s]

step 1250, model saved


  5%|▌         | 1082/20000 [07:27<1:08:15,  4.62it/s]

q_target max -10.0
q_value max -inf


  6%|▌         | 1151/20000 [08:07<3:20:54,  1.56it/s]

q_target max -10.0
q_value max -inf


  6%|▌         | 1214/20000 [08:47<3:28:45,  1.50it/s]

q_target max -10.0
q_value max -inf


  6%|▋         | 1276/20000 [09:26<3:24:53,  1.52it/s]

q_target max -10.0
q_value max -inf


  7%|▋         | 1339/20000 [10:06<3:13:15,  1.61it/s]

q_target max 0.13040652871131897
q_value max 0.1306895613670349


  7%|▋         | 1401/20000 [10:47<3:14:10,  1.60it/s]

q_target max -10.0
q_value max -inf


  7%|▋         | 1463/20000 [11:28<3:11:01,  1.62it/s]

q_target max 0.30647897720336914
q_value max 0.30708158016204834


  8%|▊         | 1525/20000 [12:10<6:23:18,  1.24s/it]

q_target max -10.0
q_value max -inf


  8%|▊         | 1587/20000 [12:50<3:17:29,  1.55it/s]

q_target max -10.0
q_value max -inf


  8%|▊         | 1649/20000 [13:30<3:14:16,  1.57it/s]

q_target max -10.0
q_value max -inf


  9%|▊         | 1710/20000 [14:10<3:20:00,  1.52it/s]

step 2500, model saved


  9%|▊         | 1711/20000 [14:11<3:32:54,  1.43it/s]

q_target max 0.26240265369415283
q_value max 0.23954641819000244


  9%|▉         | 1781/20000 [14:57<2:45:45,  1.83it/s] 

q_target max -10.0
q_value max -inf


  9%|▉         | 1856/20000 [15:37<3:15:04,  1.55it/s]

q_target max 0.13979503512382507
q_value max 0.1378827542066574


 10%|▉         | 1921/20000 [16:19<3:29:43,  1.44it/s]

q_target max -10.0
q_value max -inf


 10%|▉         | 1983/20000 [17:05<4:08:21,  1.21it/s]

q_target max -10.0
q_value max -inf


 10%|█         | 2046/20000 [17:52<3:51:34,  1.29it/s]

q_target max -10.0
q_value max -inf


 11%|█         | 2108/20000 [18:39<3:46:16,  1.32it/s]

q_target max 0.23715095221996307
q_value max 0.23723573982715607


 11%|█         | 2171/20000 [19:25<3:29:24,  1.42it/s]

q_target max -10.0
q_value max -inf


 11%|█         | 2233/20000 [20:12<3:50:48,  1.28it/s]

q_target max 0.25530707836151123
q_value max 0.25622299313545227


 11%|█▏        | 2296/20000 [20:58<3:29:37,  1.41it/s]

q_target max -10.0
q_value max -inf


 12%|█▏        | 2357/20000 [21:44<3:56:15,  1.24it/s]

step 3750, model saved


 12%|█▏        | 2358/20000 [21:44<4:03:17,  1.21it/s]

q_target max -10.0
q_value max -inf


 12%|█▏        | 2426/20000 [22:34<3:50:21,  1.27it/s] 

q_target max -10.0
q_value max -inf


 12%|█▏        | 2484/20000 [23:22<3:09:06,  1.54it/s]

q_target max 0.2725101411342621
q_value max 0.25331562757492065


 13%|█▎        | 2545/20000 [24:14<3:19:31,  1.46it/s]

q_target max -10.0
q_value max -inf


 13%|█▎        | 2603/20000 [25:04<3:37:42,  1.33it/s]

q_target max -10.0
q_value max -inf


 13%|█▎        | 2660/20000 [25:56<3:54:30,  1.23it/s]

q_target max -10.0
q_value max -inf


 14%|█▎        | 2721/20000 [26:44<4:02:15,  1.19it/s]

q_target max -10.0
q_value max -inf


 14%|█▍        | 2778/20000 [27:34<6:41:57,  1.40s/it]

q_target max -10.0
q_value max -inf


 14%|█▍        | 2838/20000 [28:27<6:46:36,  1.42s/it] 

q_target max -10.0
q_value max -inf


 14%|█▍        | 2898/20000 [29:15<5:39:28,  1.19s/it]

q_target max -10.0
q_value max -inf


 15%|█▍        | 2956/20000 [30:08<3:56:18,  1.20it/s]

step 5000, model saved
q_target max -10.0
q_value max -inf


 15%|█▌        | 3000/20000 [31:05<5:01:11,  1.06s/it] 

q_target max -10.0
q_value max -inf


 15%|█▌        | 3049/20000 [31:57<5:12:46,  1.11s/it]

q_target max -10.0
q_value max -inf


 15%|█▌        | 3099/20000 [32:47<4:14:58,  1.10it/s]

q_target max 0.20966902375221252
q_value max 0.20792920887470245


 16%|█▌        | 3151/20000 [33:33<3:57:42,  1.18it/s]

q_target max 0.261956125497818
q_value max 0.27981922030448914


 16%|█▌        | 3201/20000 [34:21<4:52:11,  1.04s/it]

q_target max 0.261956125497818
q_value max 0.27830755710601807


 16%|█▌        | 3244/20000 [35:13<6:04:50,  1.31s/it]

q_target max -10.0
q_value max -inf


 16%|█▋        | 3289/20000 [36:08<6:12:49,  1.34s/it]

q_target max -10.0
q_value max -inf


 17%|█▋        | 3333/20000 [36:59<3:30:15,  1.32it/s]

q_target max 0.261956125497818
q_value max 0.2784249186515808


 17%|█▋        | 3372/20000 [37:58<7:24:10,  1.60s/it]

q_target max -10.0
q_value max -inf


 17%|█▋        | 3412/20000 [38:55<6:29:55,  1.41s/it]

step 6250, model saved
q_target max 0.30285555124282837
q_value max 0.29161006212234497


 17%|█▋        | 3465/20000 [39:54<2:44:00,  1.68it/s] 

q_target max 0.2892285883426666
q_value max 0.27452781796455383


 18%|█▊        | 3523/20000 [40:39<2:43:27,  1.68it/s]

q_target max -10.0
q_value max -inf


 18%|█▊        | 3585/20000 [41:18<2:40:37,  1.70it/s]

q_target max 0.2592441141605377
q_value max 0.2731417417526245


 18%|█▊        | 3647/20000 [41:58<2:52:24,  1.58it/s]

q_target max -10.0
q_value max -inf


 19%|█▊        | 3707/20000 [42:40<2:44:52,  1.65it/s]

q_target max -10.0
q_value max -inf


 19%|█▉        | 3767/20000 [43:27<4:55:40,  1.09s/it] 

q_target max -10.0
q_value max -inf


 19%|█▉        | 3828/20000 [44:09<2:43:10,  1.65it/s]

q_target max -10.0
q_value max -inf


 19%|█▉        | 3889/20000 [44:50<2:57:41,  1.51it/s]

q_target max 0.3610410690307617
q_value max 0.3574449121952057


 20%|█▉        | 3950/20000 [45:32<2:34:36,  1.73it/s]

q_target max -10.0
q_value max -inf


 20%|█▉        | 3999/20000 [46:21<6:41:02,  1.50s/it]

step 7500, model saved
q_target max 0.30727481842041016
q_value max 0.35861659049987793


 20%|██        | 4054/20000 [47:11<2:56:08,  1.51it/s]

q_target max -10.0
q_value max -inf


 21%|██        | 4113/20000 [48:00<3:42:48,  1.19it/s]

q_target max -0.1396811604499817
q_value max -0.1504763811826706


 21%|██        | 4171/20000 [48:50<2:56:58,  1.49it/s]

q_target max 0.26960524916648865
q_value max 0.279090017080307


 21%|██        | 4230/20000 [49:37<3:06:38,  1.41it/s]

q_target max -10.0
q_value max -inf


 21%|██▏       | 4290/20000 [50:23<2:55:06,  1.50it/s]

q_target max 0.30727481842041016
q_value max 0.3061627745628357


 22%|██▏       | 4351/20000 [51:10<2:55:11,  1.49it/s]

q_target max -10.0
q_value max -inf


 22%|██▏       | 4413/20000 [51:53<2:57:51,  1.46it/s]

q_target max -10.0
q_value max -inf


 22%|██▏       | 4476/20000 [52:37<2:53:25,  1.49it/s]

q_target max -10.0
q_value max -inf


 23%|██▎       | 4535/20000 [53:25<3:01:37,  1.42it/s]

q_target max -10.0
q_value max -inf


 23%|██▎       | 4596/20000 [54:11<3:03:59,  1.40it/s]

step 8750, model saved


 23%|██▎       | 4597/20000 [54:12<3:00:39,  1.42it/s]

q_target max -10.0
q_value max -inf


 23%|██▎       | 4659/20000 [54:56<3:06:54,  1.37it/s]

q_target max 0.2917368412017822
q_value max 0.2914356589317322


 24%|██▎       | 4721/20000 [55:40<2:59:26,  1.42it/s]

q_target max 0.2917368412017822
q_value max 0.29305776953697205


 24%|██▍       | 4783/20000 [56:24<3:34:34,  1.18it/s]

q_target max -10.0
q_value max -inf


 24%|██▍       | 4845/20000 [57:08<2:53:08,  1.46it/s]

q_target max -10.0
q_value max -inf


 25%|██▍       | 4907/20000 [57:52<3:01:39,  1.38it/s]

q_target max -10.0
q_value max -inf


 25%|██▍       | 4968/20000 [58:37<3:18:13,  1.26it/s]

q_target max 0.3159312903881073
q_value max 0.31802093982696533


 25%|██▌       | 5031/20000 [59:21<3:01:47,  1.37it/s]

q_target max 0.20764896273612976
q_value max 0.2055422067642212


 25%|██▌       | 5093/20000 [1:00:04<2:53:03,  1.44it/s]

q_target max -10.0
q_value max -inf


 26%|██▌       | 5155/20000 [1:00:48<2:58:21,  1.39it/s]

q_target max -10.0
q_value max -inf


 26%|██▌       | 5215/20000 [1:01:34<2:39:46,  1.54it/s]

step 10000, model saved


 26%|██▌       | 5216/20000 [1:01:35<3:03:48,  1.34it/s]

q_target max -10.0
q_value max -inf


 26%|██▋       | 5278/20000 [1:02:19<2:39:30,  1.54it/s]

q_target max 0.32577523589134216
q_value max 0.32884708046913147


 27%|██▋       | 5340/20000 [1:03:04<2:53:55,  1.40it/s]

q_target max 0.311849981546402
q_value max 0.3102131485939026


 27%|██▋       | 5403/20000 [1:03:49<2:48:39,  1.44it/s]

q_target max -10.0
q_value max -inf


 27%|██▋       | 5475/20000 [1:04:31<2:58:41,  1.35it/s]

q_target max 0.32296323776245117
q_value max 0.32784825563430786


 28%|██▊       | 5540/20000 [1:05:17<3:04:27,  1.31it/s]

q_target max 0.32577523589134216
q_value max 0.32313284277915955


 28%|██▊       | 5601/20000 [1:06:02<3:11:01,  1.26it/s]

q_target max 0.3090062439441681
q_value max 0.3095269203186035


 28%|██▊       | 5663/20000 [1:06:46<2:49:49,  1.41it/s]

q_target max -10.0
q_value max -inf


 29%|██▊       | 5725/20000 [1:07:30<2:27:25,  1.61it/s]

q_target max 0.3150131404399872
q_value max 0.31407368183135986


 29%|██▉       | 5787/20000 [1:08:17<2:50:45,  1.39it/s]

q_target max -10.0
q_value max -inf


 29%|██▉       | 5862/20000 [1:08:59<3:00:34,  1.30it/s]

step 11250, model saved


 29%|██▉       | 5863/20000 [1:08:59<3:10:15,  1.24it/s]

q_target max 0.3249971866607666
q_value max 0.32467663288116455


 30%|██▉       | 5929/20000 [1:09:43<2:37:38,  1.49it/s]

q_target max -10.0
q_value max -inf


 30%|███       | 6012/20000 [1:10:23<1:40:24,  2.32it/s]

q_target max 0.348562091588974
q_value max 0.34894871711730957


 30%|███       | 6078/20000 [1:11:07<2:51:41,  1.35it/s]

q_target max -10.0
q_value max -inf


 31%|███       | 6139/20000 [1:11:53<2:49:35,  1.36it/s]

q_target max -10.0
q_value max -inf


 31%|███       | 6204/20000 [1:12:36<2:18:46,  1.66it/s]

q_target max -10.0
q_value max -inf


 31%|███▏      | 6267/20000 [1:13:21<2:43:48,  1.40it/s]

q_target max 0.348562091588974
q_value max 0.3476243019104004


 32%|███▏      | 6329/20000 [1:14:04<2:51:02,  1.33it/s]

q_target max -10.0
q_value max -inf


 32%|███▏      | 6391/20000 [1:14:49<2:42:18,  1.40it/s]

q_target max 0.3249971866607666
q_value max 0.32479190826416016


 32%|███▏      | 6456/20000 [1:15:32<2:38:34,  1.42it/s]

q_target max -10.0
q_value max -inf


 33%|███▎      | 6521/20000 [1:16:18<2:58:11,  1.26it/s]

step 12500, model saved
q_target max -0.1331614851951599
q_value max -0.12251915782690048


 33%|███▎      | 6583/20000 [1:17:01<2:29:13,  1.50it/s]

q_target max -10.0
q_value max -inf


 33%|███▎      | 6645/20000 [1:17:46<2:24:38,  1.54it/s]

q_target max -10.0
q_value max -inf


 34%|███▎      | 6707/20000 [1:18:30<2:25:31,  1.52it/s]

q_target max 0.32716333866119385
q_value max 0.331076443195343


 34%|███▍      | 6770/20000 [1:19:15<2:30:32,  1.46it/s]

q_target max -10.0
q_value max -inf


 34%|███▍      | 6830/20000 [1:20:02<2:28:43,  1.48it/s]

q_target max 0.3595150113105774
q_value max 0.3539024889469147


 34%|███▍      | 6892/20000 [1:20:47<2:30:41,  1.45it/s]

q_target max -10.0
q_value max -inf


 35%|███▍      | 6953/20000 [1:21:31<2:57:57,  1.22it/s]

q_target max 0.3536679744720459
q_value max 0.35291585326194763


 35%|███▌      | 7015/20000 [1:22:14<2:23:10,  1.51it/s]

q_target max -10.0
q_value max -inf


 35%|███▌      | 7077/20000 [1:22:59<2:34:26,  1.39it/s]

q_target max -10.0
q_value max -inf


 36%|███▌      | 7138/20000 [1:23:44<2:42:12,  1.32it/s]

step 13750, model saved


 36%|███▌      | 7139/20000 [1:23:45<2:48:18,  1.27it/s]

q_target max -10.0
q_value max -inf


 36%|███▌      | 7205/20000 [1:24:30<2:10:38,  1.63it/s]

q_target max -10.0
q_value max -inf


 36%|███▋      | 7265/20000 [1:25:14<2:37:42,  1.35it/s]

q_target max -10.0
q_value max -inf


 37%|███▋      | 7331/20000 [1:25:58<2:47:47,  1.26it/s]

q_target max 0.327158659696579
q_value max 0.3297385573387146


 37%|███▋      | 7392/20000 [1:26:43<3:02:58,  1.15it/s]

q_target max -10.0
q_value max -inf


 37%|███▋      | 7454/20000 [1:27:27<2:21:00,  1.48it/s]

q_target max 0.36178430914878845
q_value max 0.3545554280281067


 38%|███▊      | 7516/20000 [1:28:12<2:19:22,  1.49it/s]

q_target max -10.0
q_value max -inf


 38%|███▊      | 7575/20000 [1:28:58<2:35:58,  1.33it/s]

q_target max -10.0
q_value max -inf


 38%|███▊      | 7637/20000 [1:29:44<2:21:31,  1.46it/s]

q_target max -10.0
q_value max -inf


 38%|███▊      | 7700/20000 [1:30:28<2:12:20,  1.55it/s]

q_target max 0.32650226354599
q_value max 0.3274083137512207


 39%|███▉      | 7764/20000 [1:31:11<2:39:17,  1.28it/s]

step 15000, model saved
q_target max 0.31113293766975403
q_value max 0.3064194917678833


 39%|███▉      | 7826/20000 [1:31:58<2:41:23,  1.26it/s]

q_target max 0.29774215817451477
q_value max 0.29843467473983765


 39%|███▉      | 7896/20000 [1:32:41<2:08:54,  1.56it/s]

q_target max 0.3551039397716522
q_value max 0.36087116599082947


 40%|███▉      | 7972/20000 [1:33:23<1:31:45,  2.18it/s]

q_target max 0.3613324463367462
q_value max 0.35075801610946655


 40%|████      | 8051/20000 [1:34:05<1:50:58,  1.79it/s]

q_target max 0.27859583497047424
q_value max 0.277724027633667


 41%|████      | 8122/20000 [1:34:49<2:49:02,  1.17it/s]

q_target max -10.0
q_value max -inf


 41%|████      | 8184/20000 [1:35:34<2:25:12,  1.36it/s]

q_target max 0.29774215817451477
q_value max 0.29816123843193054


 41%|████      | 8246/20000 [1:36:18<2:11:51,  1.49it/s]

q_target max -10.0
q_value max -inf


 42%|████▏     | 8308/20000 [1:37:02<1:57:33,  1.66it/s]

q_target max 0.3551039397716522
q_value max 0.36116689443588257


 42%|████▏     | 8378/20000 [1:37:44<2:20:56,  1.37it/s]

q_target max -10.0
q_value max -inf


 42%|████▏     | 8451/20000 [1:38:27<2:23:24,  1.34it/s]

step 16250, model saved
q_target max 0.35530441999435425
q_value max 0.35084959864616394


 43%|████▎     | 8541/20000 [1:39:05<37:56,  5.03it/s]  

q_target max 0.3583696186542511
q_value max 0.35575392842292786


 43%|████▎     | 8666/20000 [1:39:29<39:31,  4.78it/s]  

q_target max 0.3583696186542511
q_value max 0.3630915880203247


 44%|████▍     | 8791/20000 [1:39:52<36:41,  5.09it/s]

q_target max 0.3377017676830292
q_value max 0.3396942615509033


 45%|████▍     | 8913/20000 [1:40:16<33:45,  5.47it/s]  

q_target max -10.0
q_value max -inf


 45%|████▌     | 9034/20000 [1:40:41<37:04,  4.93it/s]  

q_target max 0.2974192500114441
q_value max 0.29989326000213623


 46%|████▌     | 9133/20000 [1:41:16<34:40,  5.22it/s]  

q_target max -10.0
q_value max -inf


 46%|████▌     | 9238/20000 [1:41:47<34:45,  5.16it/s]  

q_target max -10.0
q_value max -inf


 47%|████▋     | 9357/20000 [1:42:12<32:02,  5.54it/s]  

q_target max 0.29341351985931396
q_value max 0.29238003492355347


 47%|████▋     | 9481/20000 [1:42:35<30:07,  5.82it/s]

q_target max -10.0
q_value max -inf


 48%|████▊     | 9601/20000 [1:42:59<1:04:54,  2.67it/s]

step 17500, model saved


 48%|████▊     | 9603/20000 [1:43:00<1:16:39,  2.26it/s]

q_target max 0.34598246216773987
q_value max 0.3404519259929657


 48%|████▊     | 9664/20000 [1:43:45<2:01:38,  1.42it/s]

q_target max -10.0
q_value max -inf


 49%|████▊     | 9717/20000 [1:44:35<1:59:14,  1.44it/s] 

q_target max -10.0
q_value max -inf


 49%|████▉     | 9779/20000 [1:45:20<1:49:51,  1.55it/s]

q_target max 0.37939998507499695
q_value max 0.3796464502811432


 49%|████▉     | 9840/20000 [1:46:07<2:22:24,  1.19it/s]

q_target max 0.34297820925712585
q_value max 0.34104496240615845


 50%|████▉     | 9902/20000 [1:46:53<2:03:21,  1.36it/s]

q_target max 0.37939998507499695
q_value max 0.37865617871284485


 50%|████▉     | 9964/20000 [1:47:39<2:03:41,  1.35it/s]

q_target max 0.37939998507499695
q_value max 0.37768489122390747


 50%|█████     | 10026/20000 [1:48:25<1:55:06,  1.44it/s]

q_target max 0.3689422011375427
q_value max 0.37773245573043823


 50%|█████     | 10088/20000 [1:49:09<2:04:06,  1.33it/s]

q_target max 0.3689422011375427
q_value max 0.3759452700614929


 51%|█████     | 10150/20000 [1:49:55<2:11:38,  1.25it/s]

q_target max 0.37939998507499695
q_value max 0.37812837958335876


 51%|█████     | 10212/20000 [1:50:41<2:07:30,  1.28it/s]

step 18750, model saved
q_target max 0.3680908977985382
q_value max 0.35974740982055664


 51%|█████▏    | 10293/20000 [1:51:23<26:13,  6.17it/s]  

q_target max -10.0
q_value max -inf


 52%|█████▏    | 10353/20000 [1:52:08<2:01:36,  1.32it/s]

q_target max -10.0
q_value max -inf


 52%|█████▏    | 10415/20000 [1:52:54<1:59:15,  1.34it/s]

q_target max 0.3680908977985382
q_value max 0.3638133406639099


 52%|█████▏    | 10476/20000 [1:53:39<1:52:36,  1.41it/s]

q_target max 0.3720240592956543
q_value max 0.3746042251586914


 53%|█████▎    | 10539/20000 [1:54:24<2:01:44,  1.30it/s]

q_target max 0.3820160925388336
q_value max 0.38368669152259827


 53%|█████▎    | 10596/20000 [1:55:13<1:58:46,  1.32it/s]

q_target max -10.0
q_value max -inf


 53%|█████▎    | 10655/20000 [1:56:03<1:44:55,  1.48it/s]

q_target max 0.34663495421409607
q_value max 0.3477328419685364


 54%|█████▎    | 10717/20000 [1:56:48<1:49:58,  1.41it/s]

q_target max -10.0
q_value max -inf


 54%|█████▍    | 10776/20000 [1:57:38<2:30:15,  1.02it/s]

q_target max 0.34663495421409607
q_value max 0.34724366664886475


 54%|█████▍    | 10861/20000 [1:58:14<30:48,  4.94it/s]  

step 20000, model saved
q_target max -10.0
q_value max -inf


 55%|█████▍    | 10952/20000 [1:58:48<1:00:09,  2.51it/s]

q_target max 0.3858736455440521
q_value max 0.388548880815506


 55%|█████▌    | 11076/20000 [1:59:12<27:56,  5.32it/s]  

q_target max -10.0
q_value max -inf


 56%|█████▌    | 11174/20000 [1:59:48<1:53:11,  1.30it/s]

q_target max -10.0
q_value max -inf


 56%|█████▋    | 11256/20000 [2:00:30<27:03,  5.38it/s]  

q_target max -10.0
q_value max -inf


 57%|█████▋    | 11342/20000 [2:01:07<1:03:27,  2.27it/s]

q_target max -10.0
q_value max -inf


 57%|█████▋    | 11414/20000 [2:01:50<1:46:38,  1.34it/s]

q_target max -10.0
q_value max -inf


 57%|█████▋    | 11494/20000 [2:02:31<1:39:14,  1.43it/s]

q_target max -10.0
q_value max -inf


 58%|█████▊    | 11557/20000 [2:03:16<1:48:08,  1.30it/s]

q_target max -10.0
q_value max -inf


 58%|█████▊    | 11628/20000 [2:04:02<1:42:17,  1.36it/s]

q_target max -10.0
q_value max -inf


 59%|█████▊    | 11707/20000 [2:04:44<1:14:04,  1.87it/s]

step 21250, model saved
q_target max -10.0
q_value max -inf


 59%|█████▉    | 11779/20000 [2:05:28<1:43:13,  1.33it/s]

q_target max -10.0
q_value max -inf


 59%|█████▉    | 11834/20000 [2:06:27<2:09:38,  1.05it/s]

q_target max -10.0
q_value max -inf


 59%|█████▉    | 11895/20000 [2:07:17<1:51:01,  1.22it/s]

q_target max 0.37743568420410156
q_value max 0.3759405016899109


 60%|█████▉    | 11955/20000 [2:08:10<2:41:33,  1.20s/it]

q_target max -10.0
q_value max -inf


 60%|██████    | 12018/20000 [2:08:58<1:25:25,  1.56it/s]

q_target max 0.3774493932723999
q_value max 0.37635594606399536


 60%|██████    | 12091/20000 [2:09:42<1:16:30,  1.72it/s]

q_target max -10.0
q_value max -inf


 61%|██████    | 12160/20000 [2:10:28<1:26:24,  1.51it/s]

q_target max -10.0
q_value max -inf


 61%|██████    | 12214/20000 [2:11:22<2:46:13,  1.28s/it]

q_target max 0.33325061202049255
q_value max 0.3326117992401123


 61%|██████▏   | 12267/20000 [2:12:25<3:08:46,  1.46s/it]

q_target max -10.0
q_value max -inf


 62%|██████▏   | 12326/20000 [2:13:25<1:33:38,  1.37it/s]

step 22500, model saved
q_target max -10.0
q_value max -inf


 62%|██████▏   | 12386/20000 [2:14:21<2:01:58,  1.04it/s]

q_target max 0.3887426257133484
q_value max 0.3893219828605652


 62%|██████▏   | 12461/20000 [2:15:06<51:54,  2.42it/s]  

q_target max -10.0
q_value max -inf


 63%|██████▎   | 12538/20000 [2:15:51<57:30,  2.16it/s]  

q_target max -10.0
q_value max -inf


 63%|██████▎   | 12610/20000 [2:16:37<1:00:44,  2.03it/s]

q_target max -10.0
q_value max -inf


 63%|██████▎   | 12671/20000 [2:17:30<1:43:20,  1.18it/s]

q_target max -10.0
q_value max -inf


 64%|██████▎   | 12745/20000 [2:18:14<52:17,  2.31it/s]  

q_target max 0.39493346214294434
q_value max 0.3944188952445984


 64%|██████▍   | 12809/20000 [2:19:07<1:52:54,  1.06it/s]

q_target max 0.3942621648311615
q_value max 0.3954228460788727


 64%|██████▍   | 12871/20000 [2:19:59<1:38:48,  1.20it/s]

q_target max -10.0
q_value max -inf


 65%|██████▍   | 12933/20000 [2:20:50<1:34:48,  1.24it/s]

q_target max 0.39109280705451965
q_value max 0.3925008177757263


 65%|██████▍   | 12992/20000 [2:21:42<1:44:44,  1.12it/s]

step 23750, model saved
q_target max -10.0
q_value max -inf


 65%|██████▌   | 13051/20000 [2:22:32<1:22:21,  1.41it/s]

q_target max -10.0
q_value max -inf


 66%|██████▌   | 13106/20000 [2:23:24<2:07:49,  1.11s/it]

q_target max 0.3943887948989868
q_value max 0.4011768698692322


 66%|██████▌   | 13179/20000 [2:24:11<1:08:27,  1.66it/s]

q_target max -10.0
q_value max -inf


 66%|██████▌   | 13245/20000 [2:24:56<1:15:41,  1.49it/s]

q_target max 0.3889448642730713
q_value max 0.38878193497657776


 67%|██████▋   | 13304/20000 [2:25:46<2:30:30,  1.35s/it]

q_target max 0.39888933300971985
q_value max 0.4002114534378052


 67%|██████▋   | 13360/20000 [2:26:44<2:03:09,  1.11s/it]

q_target max -10.0
q_value max -inf


 67%|██████▋   | 13417/20000 [2:27:43<1:38:08,  1.12it/s]

q_target max -10.0
q_value max -inf


 68%|██████▊   | 13541/20000 [2:28:07<20:58,  5.13it/s]  

q_target max 0.39602136611938477
q_value max 0.39596977829933167


 68%|██████▊   | 13632/20000 [2:28:41<1:12:25,  1.47it/s]

q_target max 0.39888933300971985
q_value max 0.39764603972435


 68%|██████▊   | 13633/20000 [2:28:51<6:20:54,  3.59s/it]

done = self.gs.checkMate or self.gs.staleMate True, reward 0


 68%|██████▊   | 13656/20000 [2:29:23<6:26:12,  3.65s/it]

done = self.gs.checkMate or self.gs.staleMate True, reward 0


 68%|██████▊   | 13684/20000 [2:29:50<1:02:35,  1.68it/s]

step 25000, model saved
q_target max 0.3981894254684448
q_value max 0.3992394208908081


 69%|██████▊   | 13738/20000 [2:30:55<2:19:01,  1.33s/it]

q_target max 0.39468929171562195
q_value max 0.393189936876297


 69%|██████▉   | 13795/20000 [2:31:52<1:57:02,  1.13s/it]

q_target max -10.0
q_value max -inf


 69%|██████▉   | 13845/20000 [2:32:57<1:55:42,  1.13s/it]

q_target max -10.0
q_value max -inf


 70%|██████▉   | 13903/20000 [2:33:58<1:31:52,  1.11it/s]

q_target max 0.3981894254684448
q_value max 0.39667579531669617


 70%|██████▉   | 13971/20000 [2:34:50<2:59:18,  1.78s/it]

q_target max -10.0
q_value max -inf


 70%|███████   | 14014/20000 [2:36:00<2:24:11,  1.45s/it]

q_target max 0.3981894254684448
q_value max 0.3996274769306183


 70%|███████   | 14077/20000 [2:36:52<1:34:40,  1.04it/s]

q_target max 0.397710919380188
q_value max 0.39548301696777344


 71%|███████   | 14138/20000 [2:37:42<1:18:43,  1.24it/s]

q_target max -10.0
q_value max -inf


 71%|███████   | 14140/20000 [2:37:47<2:15:10,  1.38s/it]