In [268]:
import os, sys
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

import importlib
import ChessEngine
import Minimax.SmartMoveFinder as SmartMoveFinder
import Minimax.Evaluate as Evaluate
import ChessEnv
import replaybuffer
import network
import copy 
import random
import torch
import torch.nn as nn
import tqdm
from tqdm import trange

importlib.reload(ChessEngine)
importlib.reload(ChessEnv)
importlib.reload(replaybuffer)
importlib.reload(network)

<module 'network' from 'd:\\User\\ProjectGithub\\hiepnguyenn-99\\Chess\\RL\\network.py'>

In [269]:
env = ChessEnv.Env()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

q_net = network.DQN(env.action_size).to(device)

# load mô hình lưu
model_path = 'DQN.pth'
if os.path.exists(model_path):
    checkpoint = torch.load(model_path, map_location=device)
    q_net.load_state_dict(checkpoint)
    q_net.train()
    print("Đã load model")
else:
    print("Không tìm thấy model")

target_net = copy.deepcopy(q_net).to(device)
target_net.eval()
optimizer = torch.optim.Adam(q_net.parameters(), lr=1e-4)
criterion  = nn.MSELoss(reduction='mean')

cuda
Đã load model


In [None]:
env = ChessEnv.Env()
capacity = 32000
rb = replaybuffer.ReplayBuffer(capacity)
batch_size = 64
epsilon = 1.0
epsilon_final = 0.01
epsilon_decay = 0.995
gamma = 0.99
step = 0
target_update_freq = 2000
env.gs.whiteToMove = True # train trắng đi trước (minimax)

for episode in trange(100000):
    done = False
    state = env.reset()
    while not done:
        # nước đi của minimax
        white_capture = None # kiểm tra minimax có ăn quân của rl không
        if env.gs.whiteToMove:
            if Evaluate.check_mid_game(env.gs):
                SmartMoveFinder.DEPTH = 4
            else:
                SmartMoveFinder.DEPTH = 3
            MinimaxMove = SmartMoveFinder.findBestMinimaxMove(env.gs, env.gs.getValidMoves())
            if MinimaxMove is None:
                MinimaxMove = SmartMoveFinder.findRandomMove(env.gs.getValidMoves())
            if MinimaxMove.pieceCaptured != '--':
                white_capture = MinimaxMove.pieceCaptured[1]

            env.gs.makeMove(MinimaxMove)

        state_tensor = env.state_to_tensor()
        lenlegalmove = len(env.gs.getValidMoves())
        # khám phá
        if lenlegalmove != 0:    
            if random.random() < epsilon:
                move = random.choice(env.gs.getValidMoves())
                action = env.moveid_to_index[move.moveID]

            # khai thác
            else:
                q_value = q_net(state_tensor.unsqueeze(0).to(device)) 
                action = q_value.argmax()
            
        next_state_tensor, reward, done, before_legal_mask, after_legal_mask = env.step(action, white_capture, lenlegalmove) # đen đã đi, kiểm tra đen bị ăn quân ko
        rb.push(state_tensor, action, reward, next_state_tensor, done, before_legal_mask, after_legal_mask)

        if rb.__len__() >= batch_size:
            batch = rb.sample(batch_size)
            # chuyển về tensor
            states, actions, reward, next_states, dones, before_legal_masks, after_legal_mask= zip(*batch)
            states = torch.stack([s.to(device) for s in states])
            actions = torch.tensor(actions, device=device, dtype=torch.int64).unsqueeze(1) # (B, 1)
            reward = torch.tensor(reward, device=device, dtype=torch.float32).unsqueeze(1)
            next_states = torch.stack([ns.to(device) for ns in next_states])
            before_legal_masks = torch.stack([b.to(device) for b in before_legal_masks])
            after_legal_mask = torch.stack([a.to(device) for a in after_legal_mask])
            dones = torch.tensor(dones, device=device, dtype=torch.float32).unsqueeze(1)

            with torch.no_grad():
                next_q_values = target_net(next_states) # (B, action_size)
                next_q_values[~after_legal_mask] = -torch.inf
                next_q_max = next_q_values.max(1)[0].unsqueeze(1) # (B, 1)

                # nếu không còn nước đi hợp lệ của next_states thì đặt bằng 0
                legal_exists = after_legal_mask.any(dim=1, keepdim=True)
                next_q_max = torch.where(legal_exists, next_q_max, torch.zeros_like(next_q_max))

                q_target = reward + gamma * (1 - dones) * next_q_max # (B, 1)

            q_values = q_net(states.to(device)) # (B, action_size)
            q_values[~before_legal_masks] = -torch.inf
            q_value = q_values.gather(dim=1, index=actions) # tính q value với hành động đã chọn (B, 1)
            if step % 125 == 0:
                with open('train_log.txt', 'a') as f:
                    print(f'q_target max {q_target[0][0]}', file=f)
                    print(f'q_value max {q_value[0][0]}', file=f)
                print(f'q_target max {q_target[0][0]}')
                print(f'q_value max {q_value[0][0]}')

            loss = criterion(q_value, q_target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            step += 1

            if step % target_update_freq == 0:
                target_net.load_state_dict(q_net.state_dict())
                torch.save(target_net.state_dict(), 'DQN.pth')
                print(f'step {step}, model saved')

        epsilon = max(epsilon_final, epsilon*epsilon_decay)


  0%|          | 7/100000 [01:34<233:41:59,  8.41s/it] 

q_target max 2.915268898010254
q_value max 2.945927381515503


  0%|          | 9/100000 [01:37<130:18:31,  4.69s/it]

total sum reward -0.10256410256410256


  0%|          | 10/100000 [01:53<223:59:05,  8.06s/it]

hết nước đi hợp lệ, self.gs.checkMate, -1


  0%|          | 22/100000 [02:49<163:54:55,  5.90s/it]

total sum reward 0.0


  0%|          | 43/100000 [03:28<41:33:14,  1.50s/it] 

q_target max 3.001744270324707
q_value max 2.965724229812622


  0%|          | 112/100000 [04:17<18:10:59,  1.53it/s]

q_target max -10.0
q_value max -inf


  0%|          | 188/100000 [05:00<16:15:34,  1.71it/s]

q_target max -10.0
q_value max -inf


  0%|          | 266/100000 [05:38<14:52:29,  1.86it/s]

q_target max 4.164337158203125
q_value max 3.878016948699951


  0%|          | 350/100000 [06:12<9:06:00,  3.04it/s] 

q_target max -10.0
q_value max -inf


  0%|          | 430/100000 [06:46<10:22:13,  2.67it/s]

q_target max -10.0
q_value max -inf


  1%|          | 514/100000 [07:20<14:37:37,  1.89it/s]

q_target max 3.16559100151062
q_value max 3.1469812393188477


  1%|          | 599/100000 [07:52<10:15:10,  2.69it/s]

q_target max -10.0
q_value max -inf


  1%|          | 684/100000 [08:25<15:33:28,  1.77it/s]

q_target max 3.6224052906036377
q_value max 3.6107985973358154


  1%|          | 769/100000 [08:59<13:02:32,  2.11it/s]

q_target max 3.501997232437134
q_value max 3.5044918060302734


  1%|          | 856/100000 [09:32<12:24:26,  2.22it/s]

q_target max 4.360225677490234
q_value max 4.368068695068359


  1%|          | 937/100000 [10:05<10:52:28,  2.53it/s]

q_target max -10.0
q_value max -inf


  1%|          | 1018/100000 [10:40<14:43:58,  1.87it/s]

q_target max -10.0
q_value max -inf


  1%|          | 1099/100000 [11:14<8:56:16,  3.07it/s] 

q_target max -10.0
q_value max -inf


  1%|          | 1182/100000 [11:47<11:20:15,  2.42it/s]

q_target max -10.0
q_value max -inf


  1%|▏         | 1261/100000 [12:20<12:41:33,  2.16it/s]

step 2000, model saved


  1%|▏         | 1262/100000 [12:21<15:29:17,  1.77it/s]

q_target max -10.0
q_value max -inf


  1%|▏         | 1344/100000 [12:55<9:35:49,  2.86it/s] 

q_target max -10.0
q_value max -inf


  1%|▏         | 1429/100000 [13:28<7:08:11,  3.84it/s] 

q_target max -10.0
q_value max -inf


  2%|▏         | 1518/100000 [14:00<11:39:18,  2.35it/s]

q_target max -10.0
q_value max -inf


  2%|▏         | 1600/100000 [14:33<8:20:05,  3.28it/s] 

q_target max -10.0
q_value max -inf


  2%|▏         | 1685/100000 [15:07<10:52:26,  2.51it/s]

q_target max 4.85248327255249
q_value max 4.854246616363525


  2%|▏         | 1770/100000 [15:39<14:36:05,  1.87it/s]

q_target max 4.85248327255249
q_value max 4.858062744140625


  2%|▏         | 1854/100000 [16:13<6:15:27,  4.36it/s] 

q_target max 4.85248327255249
q_value max 4.85323429107666


  2%|▏         | 1936/100000 [16:48<15:21:14,  1.77it/s]

q_target max -10.0
q_value max -inf


  2%|▏         | 2019/100000 [17:19<12:07:28,  2.24it/s]

q_target max -10.0
q_value max -inf


  2%|▏         | 2106/100000 [17:52<12:04:42,  2.25it/s]

q_target max 4.85248327255249
q_value max 4.831445693969727


  2%|▏         | 2200/100000 [18:24<4:57:51,  5.47it/s] 

q_target max -10.0
q_value max -inf


  2%|▏         | 2325/100000 [18:48<4:53:22,  5.55it/s]

q_target max -10.0
q_value max -inf


  2%|▏         | 2434/100000 [19:15<10:47:48,  2.51it/s]

q_target max -10.0
q_value max -inf


  3%|▎         | 2541/100000 [19:43<5:19:47,  5.08it/s] 

q_target max -10.0
q_value max -inf


  3%|▎         | 2641/100000 [20:13<5:11:04,  5.22it/s] 

q_target max -10.0
q_value max -inf


  3%|▎         | 2757/100000 [20:38<6:05:33,  4.43it/s] 

step 4000, model saved


  3%|▎         | 2758/100000 [20:39<6:00:38,  4.49it/s]

q_target max 5.270521640777588
q_value max 4.843506336212158


  3%|▎         | 2830/100000 [21:26<11:50:25,  2.28it/s]

q_target max -10.0
q_value max -inf


  3%|▎         | 2906/100000 [22:10<26:26:49,  1.02it/s]

q_target max -10.0
q_value max -inf


  3%|▎         | 2993/100000 [22:59<14:48:44,  1.82it/s]

q_target max -10.0
q_value max -inf


  3%|▎         | 3073/100000 [23:52<12:06:45,  2.22it/s]

q_target max -10.0
q_value max -inf


  3%|▎         | 3154/100000 [24:44<13:11:15,  2.04it/s]

q_target max -10.0
q_value max -inf


  3%|▎         | 3229/100000 [25:40<22:22:05,  1.20it/s]

q_target max -10.0
q_value max -inf


  3%|▎         | 3311/100000 [26:33<9:02:53,  2.97it/s] 

q_target max -10.0
q_value max -inf


  3%|▎         | 3394/100000 [27:26<10:51:20,  2.47it/s]

q_target max 5.347647666931152
q_value max 5.345228672027588


  3%|▎         | 3478/100000 [28:17<18:15:04,  1.47it/s]

q_target max 5.270521640777588
q_value max 5.244631767272949


  4%|▎         | 3561/100000 [29:10<22:54:41,  1.17it/s]

q_target max 5.270521640777588
q_value max 5.252681732177734


  4%|▎         | 3647/100000 [29:58<15:57:36,  1.68it/s]

q_target max 5.347647666931152
q_value max 5.32548713684082


  4%|▎         | 3733/100000 [30:49<10:24:26,  2.57it/s]

q_target max 5.270521640777588
q_value max 5.302727222442627


  4%|▍         | 3818/100000 [31:40<17:35:06,  1.52it/s]

q_target max 5.774869918823242
q_value max 5.680252552032471


  4%|▍         | 3901/100000 [32:32<19:26:56,  1.37it/s]

q_target max 5.347647666931152
q_value max 5.33099889755249


  4%|▍         | 3981/100000 [33:24<21:57:20,  1.21it/s]

q_target max 5.347647666931152
q_value max 5.3616437911987305


  4%|▍         | 4062/100000 [34:18<15:43:43,  1.69it/s]

step 6000, model saved
q_target max 4.48366117477417
q_value max 4.214996337890625


  4%|▍         | 4140/100000 [35:12<21:14:31,  1.25it/s]

q_target max 5.863487720489502
q_value max 5.8798346519470215


  4%|▍         | 4219/100000 [36:06<12:48:07,  2.08it/s]

q_target max -10.0
q_value max -inf


  4%|▍         | 4298/100000 [37:01<19:39:04,  1.35it/s]

q_target max -10.0
q_value max -inf


  4%|▍         | 4381/100000 [37:53<13:54:29,  1.91it/s]

q_target max -10.0
q_value max -inf


  4%|▍         | 4462/100000 [38:45<18:22:54,  1.44it/s]

q_target max -10.0
q_value max -inf


  5%|▍         | 4531/100000 [39:41<34:52:44,  1.32s/it]

q_target max 5.931649684906006
q_value max 5.918422698974609


  5%|▍         | 4602/100000 [40:39<26:05:59,  1.02it/s]

q_target max -10.0
q_value max -inf


  5%|▍         | 4677/100000 [41:33<21:41:32,  1.22it/s]

q_target max -10.0
q_value max -inf


  5%|▍         | 4747/100000 [42:26<14:07:12,  1.87it/s]

q_target max -10.0
q_value max -inf


  5%|▍         | 4812/100000 [43:18<20:13:31,  1.31it/s]

q_target max 5.863487720489502
q_value max 5.868526458740234


  5%|▍         | 4876/100000 [44:15<22:16:13,  1.19it/s]

q_target max -10.0
q_value max -inf


  5%|▍         | 4939/100000 [45:11<21:26:51,  1.23it/s]

q_target max -10.0
q_value max -inf


  5%|▌         | 5002/100000 [46:07<18:56:31,  1.39it/s]

q_target max -10.0
q_value max -inf


  5%|▌         | 5065/100000 [47:02<21:38:24,  1.22it/s]

q_target max 5.931649684906006
q_value max 5.941348552703857


  5%|▌         | 5127/100000 [47:57<21:34:05,  1.22it/s]

q_target max -10.0
q_value max -inf


  5%|▌         | 5188/100000 [48:49<23:03:15,  1.14it/s]

step 8000, model saved


  5%|▌         | 5189/100000 [48:50<22:41:21,  1.16it/s]

q_target max -10.0
q_value max -inf


  5%|▌         | 5250/100000 [49:32<16:25:01,  1.60it/s]

q_target max 6.468267917633057
q_value max 6.451265811920166


  5%|▌         | 5312/100000 [50:14<20:14:39,  1.30it/s]

q_target max -10.0
q_value max -inf


  5%|▌         | 5367/100000 [51:09<20:54:44,  1.26it/s]

q_target max -10.0
q_value max -inf


  5%|▌         | 5421/100000 [52:01<32:09:50,  1.22s/it]

q_target max -10.0
q_value max -inf


  5%|▌         | 5476/100000 [53:00<17:13:07,  1.52it/s]

q_target max 6.425061225891113
q_value max 6.426077842712402


  6%|▌         | 5538/100000 [53:41<18:27:34,  1.42it/s]

q_target max -10.0
q_value max -inf


  6%|▌         | 5597/100000 [54:24<15:52:31,  1.65it/s]

q_target max 6.468267917633057
q_value max 6.442344665527344


  6%|▌         | 5654/100000 [55:10<17:02:21,  1.54it/s]

q_target max 5.817216873168945
q_value max 5.815366744995117


  6%|▌         | 5716/100000 [55:53<16:43:04,  1.57it/s]

q_target max -10.0
q_value max -inf


  6%|▌         | 5777/100000 [56:36<17:30:25,  1.49it/s]

q_target max -10.0
q_value max -inf


  6%|▌         | 5835/100000 [57:20<26:10:32,  1.00s/it]

q_target max -10.0
q_value max -inf


  6%|▌         | 5898/100000 [58:02<16:28:48,  1.59it/s]

q_target max -10.0
q_value max -inf


  6%|▌         | 5953/100000 [58:51<23:49:28,  1.10it/s]

q_target max 5.817216873168945
q_value max 5.843611717224121


  6%|▌         | 6015/100000 [59:35<17:04:10,  1.53it/s]

q_target max -10.0
q_value max -inf


  6%|▌         | 6077/100000 [1:00:18<18:01:10,  1.45it/s]

q_target max -10.0
q_value max -inf


  6%|▌         | 6137/100000 [1:00:59<17:07:19,  1.52it/s]

step 10000, model saved


  6%|▌         | 6138/100000 [1:00:59<17:32:30,  1.49it/s]

q_target max 5.659676551818848
q_value max 5.00449800491333


  6%|▌         | 6194/100000 [1:01:47<25:30:25,  1.02it/s]

q_target max 6.9810590744018555
q_value max 6.945750713348389


  6%|▋         | 6252/100000 [1:02:34<15:55:14,  1.64it/s]

q_target max 6.710628986358643
q_value max 6.703446388244629


  6%|▋         | 6314/100000 [1:03:21<17:19:13,  1.50it/s]

q_target max -10.0
q_value max -inf


  6%|▋         | 6374/100000 [1:04:06<17:29:44,  1.49it/s]

q_target max 5.906844615936279
q_value max 5.786782741546631


  6%|▋         | 6432/100000 [1:05:02<23:00:58,  1.13it/s]

q_target max -10.0
q_value max -inf


  6%|▋         | 6488/100000 [1:05:55<29:36:52,  1.14s/it]

q_target max -10.0
q_value max -inf


  7%|▋         | 6545/100000 [1:06:50<28:14:02,  1.09s/it]

q_target max -10.0
q_value max -inf


  7%|▋         | 6603/100000 [1:07:42<24:55:41,  1.04it/s]

q_target max 6.710628986358643
q_value max 6.718277931213379


  7%|▋         | 6658/100000 [1:08:40<43:01:54,  1.66s/it]

q_target max -10.0
q_value max -inf


  7%|▋         | 6711/100000 [1:09:39<27:04:24,  1.04s/it]

q_target max 5.553595542907715
q_value max 5.618645191192627


  7%|▋         | 6765/100000 [1:10:38<23:05:33,  1.12it/s]

q_target max 6.9810590744018555
q_value max 7.014998435974121


  7%|▋         | 6820/100000 [1:11:34<28:26:40,  1.10s/it]

q_target max 6.9810590744018555
q_value max 6.962626934051514


  7%|▋         | 6875/100000 [1:12:37<28:28:30,  1.10s/it]

q_target max 5.986892223358154
q_value max 5.75163459777832


  7%|▋         | 6927/100000 [1:13:36<19:22:53,  1.33it/s]

q_target max -10.0
q_value max -inf


  7%|▋         | 6979/100000 [1:14:36<25:52:52,  1.00s/it]

q_target max -10.0
q_value max -inf


  7%|▋         | 7034/100000 [1:15:28<17:34:43,  1.47it/s]

step 12000, model saved
q_target max 6.18437385559082
q_value max 6.0870866775512695


  7%|▋         | 7094/100000 [1:16:18<20:04:29,  1.29it/s]

q_target max -10.0
q_value max -inf


  7%|▋         | 7150/100000 [1:17:16<26:37:46,  1.03s/it]

q_target max -10.0
q_value max -inf


  7%|▋         | 7206/100000 [1:18:16<21:14:44,  1.21it/s]

q_target max -10.0
q_value max -inf


  7%|▋         | 7264/100000 [1:19:11<18:16:55,  1.41it/s]

q_target max 6.211728096008301
q_value max 6.186370849609375


  7%|▋         | 7316/100000 [1:20:12<31:52:49,  1.24s/it]

q_target max -10.0
q_value max -inf


  7%|▋         | 7366/100000 [1:21:13<22:51:51,  1.13it/s]

q_target max -10.0
q_value max -inf


  7%|▋         | 7420/100000 [1:22:14<24:02:19,  1.07it/s]

q_target max 6.366620063781738
q_value max 6.321382522583008


  7%|▋         | 7475/100000 [1:23:12<23:01:20,  1.12it/s]

q_target max 7.391057968139648
q_value max 7.4049296379089355


  8%|▊         | 7524/100000 [1:24:17<28:16:47,  1.10s/it]

q_target max 6.18437385559082
q_value max 6.399869441986084


  8%|▊         | 7575/100000 [1:25:19<26:14:05,  1.02s/it]

q_target max 7.040688514709473
q_value max 7.0707106590271


  8%|▊         | 7624/100000 [1:26:27<38:53:11,  1.52s/it]

q_target max -10.0
q_value max -inf


  8%|▊         | 7674/100000 [1:27:41<23:55:56,  1.07it/s]

q_target max -10.0
q_value max -inf


  8%|▊         | 7725/100000 [1:28:50<54:11:41,  2.11s/it]

q_target max -10.0
q_value max -inf


  8%|▊         | 7777/100000 [1:29:52<24:45:37,  1.03it/s]

q_target max -10.0
q_value max -inf


  8%|▊         | 7832/100000 [1:30:48<19:50:17,  1.29it/s]

q_target max 7.391057968139648
q_value max 7.368218898773193


  8%|▊         | 7883/100000 [1:31:49<30:25:20,  1.19s/it]

step 14000, model saved
q_target max -10.0
q_value max -inf


  8%|▊         | 7930/100000 [1:32:56<19:52:54,  1.29it/s]

q_target max -10.0
q_value max -inf


  8%|▊         | 7986/100000 [1:33:44<15:48:57,  1.62it/s]

q_target max -10.0
q_value max -inf


  8%|▊         | 8048/100000 [1:34:26<17:38:09,  1.45it/s]

q_target max 7.650979995727539
q_value max 7.614937782287598


  8%|▊         | 8109/100000 [1:35:11<19:55:26,  1.28it/s]

q_target max -10.0
q_value max -inf


  8%|▊         | 8167/100000 [1:35:59<14:47:29,  1.72it/s]

q_target max 7.130232810974121
q_value max 7.107038974761963


  8%|▊         | 8215/100000 [1:36:58<20:04:21,  1.27it/s]

q_target max -10.0
q_value max -inf


  8%|▊         | 8271/100000 [1:37:51<20:07:01,  1.27it/s]

q_target max 6.826359748840332
q_value max 6.580435752868652


  8%|▊         | 8327/100000 [1:38:48<16:42:11,  1.52it/s]

q_target max -10.0
q_value max -inf


  8%|▊         | 8378/100000 [1:39:44<20:16:59,  1.25it/s]

q_target max 7.650979995727539
q_value max 7.694397449493408


  8%|▊         | 8427/100000 [1:40:42<25:22:13,  1.00it/s]

q_target max 7.650979995727539
q_value max 7.6282734870910645


  8%|▊         | 8480/100000 [1:41:29<35:35:38,  1.40s/it]

q_target max -10.0
q_value max -inf


  9%|▊         | 8529/100000 [1:42:24<22:51:42,  1.11it/s]

q_target max 7.130232810974121
q_value max 7.105814456939697


  9%|▊         | 8570/100000 [1:43:25<26:41:16,  1.05s/it]

q_target max -10.0
q_value max -inf


  9%|▊         | 8614/100000 [1:44:29<20:18:06,  1.25it/s]

q_target max -10.0
q_value max -inf


  9%|▊         | 8664/100000 [1:45:22<30:14:52,  1.19s/it]

q_target max 7.650979995727539
q_value max 7.671422481536865


  9%|▊         | 8711/100000 [1:46:21<17:15:31,  1.47it/s]

step 16000, model saved


  9%|▊         | 8712/100000 [1:46:22<17:30:07,  1.45it/s]

q_target max -10.0
q_value max -inf


  9%|▉         | 8752/100000 [1:47:29<29:15:31,  1.15s/it]

q_target max -10.0
q_value max -inf


  9%|▉         | 8793/100000 [1:48:41<31:37:04,  1.25s/it]

q_target max -10.0
q_value max -inf


  9%|▉         | 8836/100000 [1:49:37<43:03:44,  1.70s/it]

q_target max 6.889532566070557
q_value max 6.576794147491455


  9%|▉         | 8876/100000 [1:50:42<34:01:12,  1.34s/it]

q_target max 7.61531400680542
q_value max 7.6086578369140625


  9%|▉         | 8929/100000 [1:51:33<12:47:54,  1.98it/s]

q_target max -10.0
q_value max -inf


  9%|▉         | 8975/100000 [1:52:27<24:10:42,  1.05it/s]

q_target max 7.61531400680542
q_value max 7.618844509124756


  9%|▉         | 9019/100000 [1:53:25<34:28:15,  1.36s/it]

q_target max 6.977466106414795
q_value max 7.005794525146484


  9%|▉         | 9061/100000 [1:54:26<41:42:42,  1.65s/it]

q_target max 5.314539909362793
q_value max 5.289798259735107


  9%|▉         | 9101/100000 [1:55:30<54:20:47,  2.15s/it]

q_target max -10.0
q_value max -inf


  9%|▉         | 9142/100000 [1:56:30<40:53:10,  1.62s/it]

q_target max 6.977466106414795
q_value max 6.954480171203613


  9%|▉         | 9182/100000 [1:57:34<44:04:01,  1.75s/it]

q_target max -10.0
q_value max -inf


  9%|▉         | 9223/100000 [1:58:45<67:53:41,  2.69s/it]

q_target max -10.0
q_value max -inf


  9%|▉         | 9264/100000 [1:59:45<43:01:40,  1.71s/it]

q_target max -10.0
q_value max -inf


  9%|▉         | 9302/100000 [2:00:55<42:04:15,  1.67s/it]

q_target max -10.0
q_value max -inf


  9%|▉         | 9342/100000 [2:01:58<49:26:23,  1.96s/it]

q_target max -10.0
q_value max -inf


  9%|▉         | 9381/100000 [2:02:57<28:54:23,  1.15s/it]

step 18000, model saved


  9%|▉         | 9382/100000 [2:03:00<39:19:25,  1.56s/it]

q_target max -10.0
q_value max -inf


  9%|▉         | 9441/100000 [2:03:45<17:10:14,  1.47it/s]

q_target max 7.7955780029296875
q_value max 7.773809432983398


  9%|▉         | 9499/100000 [2:04:35<25:55:49,  1.03s/it]

q_target max 7.7955780029296875
q_value max 7.801901340484619


 10%|▉         | 9556/100000 [2:05:27<23:31:29,  1.07it/s]

q_target max -10.0
q_value max -inf


 10%|▉         | 9613/100000 [2:06:26<19:10:22,  1.31it/s]

q_target max -10.0
q_value max -inf


 10%|▉         | 9673/100000 [2:07:13<17:35:45,  1.43it/s]

q_target max -10.0
q_value max -inf


 10%|▉         | 9735/100000 [2:07:56<17:35:43,  1.43it/s]

q_target max 7.338658332824707
q_value max 7.34231424331665


 10%|▉         | 9797/100000 [2:08:42<25:41:35,  1.03s/it]

q_target max -10.0
q_value max -inf


 10%|▉         | 9858/100000 [2:09:26<17:07:29,  1.46it/s]

q_target max 7.6943583488464355
q_value max 7.684103965759277


 10%|▉         | 9918/100000 [2:10:11<16:42:50,  1.50it/s]

q_target max 7.7955780029296875
q_value max 7.808414459228516


 10%|▉         | 9975/100000 [2:11:02<32:10:59,  1.29s/it]

q_target max 7.7955780029296875
q_value max 7.788575172424316


 10%|█         | 10037/100000 [2:11:49<16:34:53,  1.51it/s]

q_target max 7.026859283447266
q_value max 7.043907642364502


 10%|█         | 10099/100000 [2:12:34<17:29:28,  1.43it/s]

q_target max -10.0
q_value max -inf


 10%|█         | 10159/100000 [2:13:21<20:50:23,  1.20it/s]

q_target max -10.0
q_value max -inf


 10%|█         | 10220/100000 [2:14:06<16:40:34,  1.50it/s]

q_target max -10.0
q_value max -inf


 10%|█         | 10282/100000 [2:14:48<15:32:03,  1.60it/s]

q_target max -10.0
q_value max -inf


 10%|█         | 10344/100000 [2:15:29<17:04:04,  1.46it/s]

step 20000, model saved
q_target max -10.0
q_value max -inf


 10%|█         | 10406/100000 [2:16:12<16:47:37,  1.48it/s]

q_target max 7.4662957191467285
q_value max 7.4691338539123535


 10%|█         | 10468/100000 [2:16:57<15:11:05,  1.64it/s]

q_target max 8.63110065460205
q_value max 9.020142555236816


 11%|█         | 10530/100000 [2:17:41<15:12:14,  1.63it/s]

q_target max 8.157854080200195
q_value max 8.013752937316895


 11%|█         | 10592/100000 [2:18:25<16:19:09,  1.52it/s]

q_target max -10.0
q_value max -inf


 11%|█         | 10654/100000 [2:19:08<16:51:02,  1.47it/s]

q_target max 8.013076782226562
q_value max 8.032901763916016


 11%|█         | 10715/100000 [2:19:51<17:43:59,  1.40it/s]

q_target max -10.0
q_value max -inf


 11%|█         | 10778/100000 [2:20:34<17:20:58,  1.43it/s]

q_target max 8.013076782226562
q_value max 8.018590927124023


 11%|█         | 10838/100000 [2:21:18<20:04:38,  1.23it/s]

q_target max -10.0
q_value max -inf


 11%|█         | 10899/100000 [2:22:00<16:35:40,  1.49it/s]

q_target max -10.0
q_value max -inf


 11%|█         | 10960/100000 [2:22:46<16:27:45,  1.50it/s]

q_target max 8.013076782226562
q_value max 8.00759220123291


 11%|█         | 11021/100000 [2:23:32<38:03:39,  1.54s/it]

q_target max -10.0
q_value max -inf


 11%|█         | 11084/100000 [2:24:14<16:25:03,  1.50it/s]

q_target max 8.013076782226562
q_value max 8.024239540100098


 11%|█         | 11145/100000 [2:25:03<41:58:01,  1.70s/it]

q_target max -10.0
q_value max -inf


 11%|█         | 11208/100000 [2:25:46<15:54:41,  1.55it/s]

q_target max -10.0
q_value max -inf


 11%|█▏        | 11270/100000 [2:26:30<17:02:14,  1.45it/s]

q_target max -10.0
q_value max -inf


 11%|█▏        | 11330/100000 [2:27:15<16:54:05,  1.46it/s]

step 22000, model saved


 11%|█▏        | 11331/100000 [2:27:16<21:30:44,  1.14it/s]

q_target max 8.917349815368652
q_value max 8.150965690612793


 11%|█▏        | 11393/100000 [2:28:01<21:20:51,  1.15it/s]

q_target max -10.0
q_value max -inf


 11%|█▏        | 11454/100000 [2:28:46<16:04:36,  1.53it/s]

q_target max -10.0
q_value max -inf


 12%|█▏        | 11515/100000 [2:29:31<17:42:06,  1.39it/s]

q_target max 7.59906005859375
q_value max 7.614027976989746


 12%|█▏        | 11578/100000 [2:30:15<17:23:30,  1.41it/s]

q_target max 7.2376298904418945
q_value max 7.399476528167725


 12%|█▏        | 11640/100000 [2:30:59<18:50:48,  1.30it/s]

q_target max 8.27421760559082
q_value max 8.271750450134277


 12%|█▏        | 11702/100000 [2:31:44<19:49:09,  1.24it/s]

q_target max -10.0
q_value max -inf


 12%|█▏        | 11765/100000 [2:32:44<20:32:02,  1.19it/s]

q_target max -10.0
q_value max -inf


 12%|█▏        | 11826/100000 [2:33:28<22:25:02,  1.09it/s]

q_target max 7.915463924407959
q_value max 7.702535629272461


 12%|█▏        | 11887/100000 [2:34:14<18:34:17,  1.32it/s]

q_target max 8.917349815368652
q_value max 8.899357795715332


 12%|█▏        | 11949/100000 [2:34:56<16:11:36,  1.51it/s]

q_target max 8.27421760559082
q_value max 8.251766204833984


 12%|█▏        | 12009/100000 [2:35:43<15:06:33,  1.62it/s]

q_target max -10.0
q_value max -inf


 12%|█▏        | 12071/100000 [2:36:26<15:56:06,  1.53it/s]

q_target max -10.0
q_value max -inf


 12%|█▏        | 12134/100000 [2:37:12<17:59:40,  1.36it/s]

q_target max -10.0
q_value max -inf


 12%|█▏        | 12196/100000 [2:37:58<17:28:27,  1.40it/s]

q_target max 8.243632316589355
q_value max 8.27068042755127


 12%|█▏        | 12256/100000 [2:38:43<17:46:39,  1.37it/s]

q_target max -10.0
q_value max -inf


 12%|█▏        | 12318/100000 [2:39:25<14:31:44,  1.68it/s]

step 24000, model saved
q_target max -10.0
q_value max -inf


 12%|█▏        | 12380/100000 [2:40:07<17:22:53,  1.40it/s]

q_target max -10.0
q_value max -inf


 12%|█▏        | 12441/100000 [2:40:52<15:42:04,  1.55it/s]

q_target max 7.692677021026611
q_value max 7.630861282348633


 13%|█▎        | 12504/100000 [2:41:35<15:18:17,  1.59it/s]

q_target max -10.0
q_value max -inf


 13%|█▎        | 12566/100000 [2:42:18<15:21:45,  1.58it/s]

q_target max 8.488420486450195
q_value max 8.470973014831543


 13%|█▎        | 12627/100000 [2:43:01<17:27:32,  1.39it/s]

q_target max -10.0
q_value max -inf


 13%|█▎        | 12688/100000 [2:43:45<16:49:24,  1.44it/s]

q_target max 8.108646392822266
q_value max 7.862131118774414


 13%|█▎        | 12750/100000 [2:44:28<22:01:58,  1.10it/s]

q_target max -10.0
q_value max -inf


 13%|█▎        | 12813/100000 [2:45:11<14:26:50,  1.68it/s]

q_target max 8.004753112792969
q_value max 7.984470844268799


 13%|█▎        | 12875/100000 [2:45:51<15:22:15,  1.57it/s]

q_target max -10.0
q_value max -inf


 13%|█▎        | 12936/100000 [2:46:34<14:43:09,  1.64it/s]

q_target max 7.627223968505859
q_value max 7.876916885375977


 13%|█▎        | 12999/100000 [2:47:17<14:41:29,  1.64it/s]

q_target max -10.0
q_value max -inf


 13%|█▎        | 13060/100000 [2:47:58<14:50:50,  1.63it/s]

q_target max -10.0
q_value max -inf


 13%|█▎        | 13123/100000 [2:48:40<15:55:25,  1.52it/s]

q_target max -10.0
q_value max -inf


 13%|█▎        | 13152/100000 [2:49:00<15:54:36,  1.52it/s]