In [1]:
import chess
import RL_utils

In [2]:
# Our existing CNN model
model_path = "models/TORCH_250EPOCH_DoubleHead.pth"

## Load stockfish

In [3]:
from RL_utils import PositionEvaluator

In [4]:
stockfish_path = "models/stockfish/stockfish-windows-x86-64-avx2.exe"
evaluator = PositionEvaluator(stockfish_path, elo_rating=1400)

Stockfish initialized with ELO: 1400


In [5]:
board = chess.Board()
board.push_uci("e2e4")
board.push_uci("h7h6")

Move.from_uci('h7h6')

In [6]:
eval_score = evaluator.evaluate_position(board)
print(f"Position evaluation: {eval_score}")

Position evaluation: 0.83


In [7]:
best_moves = evaluator.get_best_moves(board, 3)
print(f"Best moves: {best_moves}")

Best moves: [{'Move': 'd2d4', 'Centipawn': 83, 'Mate': None}, {'Move': 'g1f3', 'Centipawn': 71, 'Mate': None}, {'Move': 'b1c3', 'Centipawn': 68, 'Mate': None}]


## Load our CNN trained model

In [3]:
model, device = RL_utils.load_model(model_path)
model = model.to(device)

In [4]:
import torch
dummy_input = torch.randn(1, 19, 8, 8).to(device)
with torch.no_grad():
    output = model(dummy_input)
    print(f"Model output shape: {output.shape}")  # Should be [1, 4288]
    print("Model loaded successfully!")

AttributeError: 'tuple' object has no attribute 'shape'

## Load the training data

In [6]:
pgn_file = "games/lichess_db_2016-04.pgn"
positions = []
positions += RL_utils.extract_middlegame_positions(pgn_file, evaluator=None, num_positions=1000)
print(f"Total positions loaded: {len(positions)}")

Extracted 1000 middle game positions
Total positions loaded: 1000


In [7]:
# Test creating a batch
# if positions:
#     board_tensors, legal_masks, boards = RL_utils.create_training_batch(positions, batch_size=4)
#     print(f"Batch shapes:")
#     print(f"  Board tensors: {board_tensors.shape}")
#     print(f"  Legal masks: {legal_masks.shape}")
#     print(f"  Number of boards: {len(boards)}")
    
#     # Show a sample position
#     print(f"\nSample position FEN: {boards[0].fen()}")

In [8]:
# boards[0]
# eval_score = evaluator.evaluate_position(boards[0])
# print(f"Position evaluation: {eval_score}")

## Training Loop

In [9]:
import random
import tqdm
import torch
import numpy as np

In [None]:
# Training Hyperparameters
LEARNING_RATE = 0.02
BATCH_SIZE = 64
EPOCHS = 5

# Self-play
NUM_SELF_PLAY_GAMES = 50
MAX_GAME_MOVES = 200
TEMPERATURE = 1.0

# Model Saving
MODEL_SAVE_PATH = "models/dual_head_model.pth"

In [11]:
def generate_self_play_data(model, device, start_positions, num_games=NUM_SELF_PLAY_GAMES):
    """Generate self-play data with error handling"""
    data = []
    successful_games = 0
    
    for i in range(num_games):
        try:
            board = random.choice(start_positions)
            game_history, result = RL_utils.play_self_play_game(model, device, board, MAX_GAME_MOVES, TEMPERATURE)
            
            if len(game_history) > 0:  # Only add if we have valid data
                data.append((game_history, result))
                successful_games += 1
                
        except Exception as e:
            print(f"Error in game {i}: {e}")
            continue
    
    print(f"Generated {successful_games} successful games out of {num_games} attempts")
    return data

In [12]:
def train_on_self_play(model, optimizer, game_histories, device):
    """Train model on self-play data with gradient clipping"""
    model.train()
    
    # Flatten all examples from all games
    all_examples = []
    for history, result in game_histories:
        for board_tensor, legal_mask, move_idx, turn in history:
            # Convert result to value from current player's perspective
            value = result if turn else -result
            all_examples.append((board_tensor, legal_mask, move_idx, value))
    
    if len(all_examples) == 0:
        print("No training examples available!")
        return
    
    print(f"Training on {len(all_examples)} examples")
    
    # Training loop
    for epoch in range(EPOCHS):
        random.shuffle(all_examples)
        epoch_losses = []
        
        for i in range(0, len(all_examples), BATCH_SIZE):
            batch = all_examples[i:i+BATCH_SIZE]
            if len(batch) == 0:
                continue
            
            try:
                # Prepare batch
                boards = torch.cat([ex[0] for ex in batch]).to(device)
                masks = torch.stack([ex[1] for ex in batch]).to(device)
                move_targets = torch.tensor([ex[2] for ex in batch], dtype=torch.long).to(device)
                value_targets = torch.tensor([ex[3] for ex in batch], dtype=torch.float).to(device)
                
                # Forward pass
                optimizer.zero_grad()
                policy_logits, value_preds = model(boards)
                
                # Compute loss
                loss, p_loss, v_loss = RL_utils.compute_loss(policy_logits, value_preds, move_targets, value_targets, masks)
                
                # Check for NaN
                if torch.isnan(loss):
                    print("Skipping batch due to NaN loss")
                    continue
                
                # Backward pass
                loss.backward()
                
                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                
                optimizer.step()
                epoch_losses.append(loss.item())
                
            except Exception as e:
                print(f"Error in batch {i//BATCH_SIZE}: {e}")
                continue
        
        if epoch_losses:
            avg_loss = np.mean(epoch_losses)
            print(f"Epoch {epoch+1}: avg loss={avg_loss:.4f}")
        else:
            print(f"Epoch {epoch+1}: No valid batches")


In [13]:
model, device = RL_utils.load_model(model_path)
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.9)

# Main training loop
for iteration in range(5):  # 5 outer iterations of self-play + training
    print(f"\n=== Iteration {iteration+1} ===")
    
    # Generate self-play games
    self_play_data = generate_self_play_data(model, device, positions)
    
    # Train model on self-play games
    train_on_self_play(model, optimizer, self_play_data, device)

    # Save model
    torch.save({
        'model_state_dict': model.state_dict(),
    }, MODEL_SAVE_PATH)


=== Iteration 1 ===
Generated 50 successful games out of 50 attempts
Training on 8911 examples
Epoch 1: avg loss=2.5673
Epoch 2: avg loss=2.4884
Epoch 3: avg loss=2.3660
Epoch 4: avg loss=2.2527
Epoch 5: avg loss=2.2032

=== Iteration 2 ===
Generated 48 successful games out of 50 attempts
Training on 8545 examples
Epoch 1: avg loss=2.5461
Epoch 2: avg loss=2.4784
Epoch 3: avg loss=2.3943
Epoch 4: avg loss=2.3215
Epoch 5: avg loss=2.2797

=== Iteration 3 ===
Generated 50 successful games out of 50 attempts
Training on 9072 examples
Epoch 1: avg loss=2.5858
Epoch 2: avg loss=2.5293
Epoch 3: avg loss=2.4368
Epoch 4: avg loss=2.3901
Epoch 5: avg loss=2.3349

=== Iteration 4 ===
Generated 48 successful games out of 50 attempts
Training on 8791 examples
Epoch 1: avg loss=2.6681
Epoch 2: avg loss=2.6060
Epoch 3: avg loss=2.5448
Epoch 4: avg loss=2.4908
Epoch 5: avg loss=2.4262

=== Iteration 5 ===
Generated 45 successful games out of 50 attempts
Training on 8327 examples
Epoch 1: avg loss=2.