In [1]:
import chess
import RL_utils

In [2]:
# Checkpoint of our existing CNN model; this path is handed to
# RL_utils.load_model further down in the notebook.
model_path = "models/TORCH_250EPOCH 1.pth"

## Load Stockfish

In [3]:
from RL_utils import PositionEvaluator

In [4]:
# Location of the Stockfish engine binary. The file name indicates a Windows
# x86-64 AVX2 build, so this path needs changing on other platforms.
stockfish_path = "models/stockfish/stockfish-windows-x86-64-avx2.exe"
# Wrap the engine in the project's PositionEvaluator, requesting a 1400 ELO setting.
evaluator = PositionEvaluator(stockfish_path, elo_rating=1400)

Stockfish initialized with ELO: 1400


In [6]:
# Build a small test position: start from the initial position and play 1. e4 h6.
board = chess.Board()
board.push_uci("e2e4")
# Last expression of the cell, so the Move it returns is echoed as the cell output.
board.push_uci("h7h6")

Move.from_uci('h7h6')

In [6]:
# Ask the Stockfish-backed evaluator for a score of the test position.
# NOTE(review): the recorded 0.83 next to an 83-centipawn best move suggests the
# score is expressed in pawns — confirm against PositionEvaluator.
eval_score = evaluator.evaluate_position(board)
print(f"Position evaluation: {eval_score}")

Position evaluation: 0.83


In [7]:
# Request the top 3 candidate moves for the test position. The recorded output
# is a list of dicts with 'Move', 'Centipawn' and 'Mate' keys.
best_moves = evaluator.get_best_moves(board, 3)
print(f"Best moves: {best_moves}")

Best moves: [{'Move': 'd2d4', 'Centipawn': 83, 'Mate': None}, {'Move': 'g1f3', 'Centipawn': 71, 'Mate': None}, {'Move': 'b1c3', 'Centipawn': 68, 'Mate': None}]


## Load our trained CNN model

In [5]:
# Load the CNN checkpoint; load_model hands back both the network and the
# device it should run on.
model, device = RL_utils.load_model(model_path)
# Make sure the network's parameters live on that device before using it.
model = model.to(device)

In [6]:
import torch

# Smoke test: push one random tensor through the loaded network and verify the
# output shape. The (1, 19, 8, 8) input and the 4288-wide output are the shapes
# this notebook works with for the loaded model.
dummy_input = torch.randn(1, 19, 8, 8).to(device)

# Run the check in eval mode so that layers such as BatchNorm or Dropout (if the
# model has any) neither randomise the result nor update running statistics
# from pure noise. The previous mode is restored afterwards, so the model is
# left exactly as it was found.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        output = model(dummy_input)
finally:
    model.train(was_training)

print(f"Model output shape: {output.shape}")  # Should be [1, 4288]
# Enforce the expectation instead of leaving it to a visual check of the output.
assert output.shape == (1, 4288), f"Unexpected model output shape: {tuple(output.shape)}"
print("Model loaded successfully!")

Model output shape: torch.Size([1, 4288])
Model loaded successfully!


## Load the training data

In [7]:
# Sample a handful of middlegame positions from the Lichess PGN dump; these are
# the starting positions used later in the notebook.
pgn_file = "games/lichess_db_2016-04.pgn"
data = list(
    RL_utils.extract_middlegame_positions(pgn_file, evaluator, num_positions=10)
)
print(f"Total positions loaded: {len(data)}")

Extracted 10 middle game positions
Total positions loaded: 10


In [8]:
# Sanity-check batch creation (skipped when no positions were loaded).
if data:
    board_tensors, legal_masks, boards = RL_utils.create_training_batch(data, batch_size=4)

    # Report the shape of each batch component, one per line.
    print(
        "Batch shapes:",
        f"  Board tensors: {board_tensors.shape}",
        f"  Legal masks: {legal_masks.shape}",
        f"  Number of boards: {len(boards)}",
        sep="\n",
    )

    # Show the first position of the batch in FEN notation.
    print(f"\nSample position FEN: {boards[0].fen()}")

Batch shapes:
  Board tensors: torch.Size([4, 19, 8, 8])
  Legal masks: torch.Size([4, 4288])
  Number of boards: 4

Sample position FEN: 2br1rk1/p1q1bppp/1p3n2/3P4/4P3/1Q6/PP1NB1PP/R1B2RK1 w - - 1 17


In [9]:
# Evaluate the first position of the sample batch with the Stockfish-backed evaluator.
eval_score = evaluator.evaluate_position(boards[0])
print(f"Position evaluation: {eval_score}")

# Jupyter only renders a bare expression when it is the LAST statement of the
# cell, so the board is placed here to actually be displayed alongside its score.
boards[0]

Position evaluation: -0.27


## Training Loop

In [10]:
import random
import tqdm

In [None]:
# --- Training configuration -------------------------------------------------
num_iterations = 1000       # number of self-play / update rounds
games_per_iteration = 10    # self-play games collected before each model update
checkpoint_every = 100      # write a checkpoint every this many iterations

optimiser = torch.optim.Adam(model.parameters(), lr=0.001)

# Fail early with a clear message instead of an IndexError from random.choice.
if not data:
    raise ValueError("No starting positions loaded; cannot run self-play training.")

for iteration in tqdm.tqdm(range(num_iterations)):
    # Exploration schedule: the sampling temperature decays linearly from 1.0
    # towards a floor of 0.1. It depends only on the iteration, so it is
    # computed once here rather than once per game (this also keeps it defined
    # for the log line below even if games_per_iteration is 0).
    temperature = max(0.1, 1.0 - (iteration / num_iterations))

    # Collect experience from multiple games
    all_game_history = []
    all_game_results = []

    # Play several games per iteration
    for game_num in range(games_per_iteration):
        # Starting positions are sampled with replacement, so the same
        # position can be chosen multiple times.
        starting_pos = random.choice(data)
        game_history, game_result = RL_utils.play_self_play_game(
            model, device, starting_pos, max_moves=100, temperature=temperature
        )

        # Keep each game's history together with its final result; the result
        # is what gets passed to update_model as the reward for that game.
        all_game_history.append(game_history)
        all_game_results.append(game_result)

    # Update model based on collected experience
    avg_loss = RL_utils.update_model(model, optimiser, all_game_history, all_game_results, device)
    # NOTE(review): recorded runs show "Loss: 0.0000" whenever all games are
    # draws — presumably a zero result gives no learning signal. Confirm in
    # RL_utils.update_model whether that is intended.

    # Tally results (1 = win, -1 = loss, 0 = draw, per the W/L/D log line).
    wins = all_game_results.count(1)
    losses = all_game_results.count(-1)
    draws = all_game_results.count(0)

    # tqdm.write prints above the progress bar instead of breaking it up.
    tqdm.tqdm.write(
        f"Iteration {iteration:4d} | Loss: {avg_loss:.4f} | "
        f"W/L/D: {wins}/{losses}/{draws} | Temp: {temperature:.2f}"
    )

    # Checkpoint on the regular schedule AND after the very last iteration, so
    # the final trained weights are not lost when num_iterations - 1 is not a
    # multiple of checkpoint_every.
    is_last_iteration = iteration == num_iterations - 1
    if iteration % checkpoint_every == 0 or is_last_iteration:
        tqdm.tqdm.write(f"Iteration {iteration} completed")
        torch.save(
            {
                'model_state_dict': model.state_dict(),
                'optimiser_state_dict': optimiser.state_dict(),
                'iteration': iteration,
            },
            f'models/rl_model_iter_{iteration}.pth',
        )

  0%|          | 1/1000 [00:12<3:24:39, 12.29s/it]

Iteration    0 | Loss: -0.0007 | W/L/D: 0/1/9 | Temp: 1.00
Iteration 0 completed


  1%|          | 11/1000 [01:57<2:53:18, 10.51s/it]

Iteration   10 | Loss: 0.0000 | W/L/D: 0/0/10 | Temp: 0.99


  2%|▏         | 21/1000 [03:35<2:27:33,  9.04s/it]

Iteration   20 | Loss: 0.0000 | W/L/D: 0/0/10 | Temp: 0.98


  3%|▎         | 30/1000 [04:55<2:17:55,  8.53s/it]