<a href="https://colab.research.google.com/github/julienif/julifsChessBot/blob/main/julifsChessBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip3 install torchrl
!pip3 install gym[mujoco]
!pip3 install gymnasium
!pip3 install tqdm
!pip3 install chess
!pip install torch torchvision torchaudio
!pip3 install tensorflow

Collecting torchrl
  Downloading torchrl-0.4.0-cp310-cp310-manylinux1_x86_64.whl (5.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.9/5.9 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting tensordict>=0.4.0 (from torchrl)
  Downloading tensordict-0.4.0-cp310-cp310-manylinux1_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2.3.0->torchrl)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2.3.0->torchrl)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2.3.0->torchrl)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=2.3.0->torchrl)


In [2]:
# imports

In [46]:
import warnings
warnings.filterwarnings("ignore")

import chess
import gymnasium as gym
import numpy as np
import tensorflow as tf
import torch
import torch.distributions as dist
from gym import spaces
from tensordict.nn import TensorDictModule
from tensordict.nn.distributions import NormalParamExtractor
from tensordict.tensordict import TensorDict
from torch import multiprocessing
from torch import nn
from torchrl.collectors import SyncDataCollector
from torchrl.data.replay_buffers import ReplayBuffer
from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement
from torchrl.data.replay_buffers.storages import LazyTensorStorage
from torchrl.envs import (Compose, DoubleToFloat, ObservationNorm, StepCounter, TransformedEnv)
from torchrl.envs.libs.gym import GymWrapper
from torchrl.envs.utils import check_env_specs, ExplorationType, set_exploration_type
from torchrl.modules import OneHotCategorical, ProbabilisticActor, ValueOperator
from torchrl.objectives import ClipPPOLoss
from torchrl.objectives.value import GAE
from tqdm import tqdm


In [4]:
# chess RL environment

  and should_run_async(code)


In [42]:
class ChessEnv(gym.Env):
  """Chess board exposed through a Gym-style environment interface.

  Observations are ``(8, 8, 13)`` float64 one-hot arrays indexed as
  ``[rank, file, channel]``: channels 0-5 are the black pieces (p, n, b, r, q, k),
  channels 6-11 the white pieces (P, N, B, R, Q, K) and channel 12 marks an
  empty square.

  Actions are integer indices into the list of currently legal moves. The
  selected move is pushed for whichever side is to move; this class does not
  play an opponent move on its own.

  Args:
    color_trained: ``'white'`` keeps rewards from White's point of view; any
      other value negates them.
    device: stored on the instance; not otherwise used by this class.

  NOTE(review): ``reset`` returns only the observation and ``step`` returns a
  4-tuple (old Gym API) although the base class is ``gymnasium.Env``; confirm
  which API the wrapper that consumes this environment expects.
  """

  # Observation channel for each piece symbol (lower case = black, upper case = white).
  _PIECE_CHANNELS = {
    'p': 0, 'n': 1, 'b': 2, 'r': 3, 'q': 4, 'k': 5,
    'P': 6, 'N': 7, 'B': 8, 'R': 9, 'Q': 10, 'K': 11,
  }
  # Observation channel that flags an empty square.
  _EMPTY_CHANNEL = 12
  # Material value per piece, keyed by upper-cased symbol (same value for both colours).
  _PIECE_VALUES = {'P': 1, 'N': 3, 'B': 3, 'R': 5, 'Q': 9, 'K': 0}
  # Largest possible material difference, used to normalise the advantage.
  _MAX_MATERIAL_DIFF = 39.0
  # Number of pushed moves after which the progress bonus saturates.
  _PROGRESS_HORIZON = 150.0

  def __init__(self, color_trained='white', device='cpu'):
    super(ChessEnv, self).__init__()
    self.device = device
    self.color_trained = color_trained
    self.board = chess.Board()
    self.action_space = spaces.Discrete(4096) # assume max 4096 legal moves
    self.observation_space = spaces.Box(low=0, high=1, shape=(8, 8, 13))
    self.reward_range = (-1, 1)

  def reset(self, seed=None, options=None):
    """Reset the board to the starting position and return its observation."""
    super().reset(seed=seed)
    self.board.reset()
    return self._get_observation()

  def step(self, action):
    """Apply the legal move selected by ``action`` and report the outcome.

    Args:
      action: index into ``list(self.board.legal_moves)``. An index outside
        that list leaves the board unchanged.

    Returns:
      ``(observation, reward, done, info)`` where ``done`` is
      ``self.board.is_game_over()`` and ``info`` is an empty dict.
    """
    legal_moves = list(self.board.legal_moves)
    if 0 <= action < len(legal_moves):
      # The entries are already Move objects; no UCI round-trip is needed.
      self.board.push(legal_moves[action])

    obs = self._get_observation()
    reward = self._get_reward()
    done = self.board.is_game_over()
    info = {}

    return obs, reward, done, info

  def render(self):
    """Print the board as text."""
    print(self.board)
    return None

  def close(self):
    """Nothing to release."""
    pass

  def _get_observation(self):
    """Encode the current board as an ``(8, 8, 13)`` one-hot float64 array."""
    board_state = np.zeros((8, 8, 13), dtype=np.float64)

    for square in chess.SQUARES:
      piece = self.board.piece_at(square)
      if piece is None:
        channel = self._EMPTY_CHANNEL
      else:
        channel = self._PIECE_CHANNELS[piece.symbol()]
      board_state[chess.square_rank(square), chess.square_file(square), channel] = 1

    return board_state

  def _get_reward(self):
    """Score the current position, from the trained colour's point of view.

    * Checkmate: +1 if White delivered it, -1 if Black did.
    * Any other finished game (every case for which ``is_game_over()`` is
      true, matching the ``done`` flag computed in ``step``): 0.
    * Otherwise: ``0.7 * normalised material advantage + 0.3 * progress``,
      where progress is the number of pushed moves over 150, capped at 1.

    The result is negated when ``color_trained`` is not ``'white'``.
    """
    # might change this as we would have a constant negative reward if the bot is lead in the game
    if self.board.is_checkmate():
      # The side to move is the side that has been mated.
      reward = -1.0 if self.board.turn == chess.WHITE else 1.0
    elif self.board.is_game_over():
      # Every non-checkmate ending is scored as a draw, so an episode that
      # `step` reports as done never pays out the shaping reward below.
      reward = 0.0
    else:
      # White material counts positively, black material negatively.
      material_adv = sum(
        self._PIECE_VALUES[piece.symbol().upper()] * (1 if piece.color == chess.WHITE else -1)
        for piece in self.board.piece_map().values()
      )
      norm_material_adv = material_adv / self._MAX_MATERIAL_DIFF

      total_moves = len(self.board.move_stack)
      progress = min(total_moves / self._PROGRESS_HORIZON, 1.0)

      # NOTE(review): the progress bonus is negated together with the material
      # term when training Black; confirm that this is intended.
      reward = 0.7 * norm_material_adv + 0.3 * progress

    return reward if self.color_trained == 'white' else -reward

In [38]:
# implement training using PPO

In [54]:
# Hyper-parameters and device selection for the PPO run.

# Train on the first CUDA device when one is visible, otherwise on the CPU.
# (A multiprocessing "fork" start-method guard used to be considered here.)
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

# Network / optimiser settings.
num_cells = 256  # width (output dimension) of every hidden layer
lr = 3e-4
max_grad_norm = 1.0

# Data collection: frames gathered per batch and overall frame budget.
frames_per_batch = 1000
total_frames = 1_000_000

# PPO settings.
sub_batch_size = 64  # size of the sub-samples drawn from each collected batch in the inner loop
num_epochs = 10  # optimisation passes over each collected batch
clip_epsilon = 0.2  # clip value for the PPO loss
gamma = 0.99
lmbda = 0.95
entropy_eps = 1e-4

# Build the environment: wrap the chess env for TorchRL and move it to `device`.
chess_env = GymWrapper(
    ChessEnv(color_trained='white', device=device)
).to(device)

# Transform stack applied on top of the wrapped env, in order:
# observation normalisation, float64 -> float32 casting, step counting.
norm_chess_env = TransformedEnv(
    chess_env,
    Compose(ObservationNorm(in_keys=["observation"]), DoubleToFloat(), StepCounter()),
)

# Initialise the normalisation statistics of the first transform (ObservationNorm)
# from 1000 environment steps.
norm_chess_env.transform[0].init_stats(num_iter=1000, reduce_dim=0, cat_dim=0)

# Verify that the transformed env's outputs agree with its declared specs.
check_env_specs(norm_chess_env)

# networks

# Both networks look at the whole board at once: the trailing observation
# dimensions are flattened into one feature vector before the first layer, so
# each state yields a single logits vector / a single value (not one per square).
obs_shape = norm_chess_env.observation_spec["observation"].shape
num_obs_features = obs_shape.numel()
# The wrapped env exposes a one-hot action spec; its last dim is the number of actions.
num_actions = norm_chess_env.action_spec.shape[-1]

# Policy head: raw logits over the discrete action set.
# `device` must be passed by keyword: the third positional argument of
# nn.Linear is `bias`, not `device`.
policy_net = nn.Sequential(
    nn.Flatten(start_dim=-len(obs_shape)),
    nn.Linear(num_obs_features, num_cells, device=device),
    nn.ReLU(),
    nn.Linear(num_cells, num_cells, device=device),
    nn.ReLU(),
    nn.Linear(num_cells, num_cells, device=device),
    nn.ReLU(),
    nn.Linear(num_cells, num_actions, device=device),
)

policy_module = TensorDictModule(
    policy_net, in_keys=["observation"], out_keys=["logits"]
)

policy_module = ProbabilisticActor(
    module=policy_module,
    spec=norm_chess_env.action_spec,
    in_keys=["logits"],  # logits parameterising the categorical distribution
    distribution_class=OneHotCategorical,  # samples one-hot actions, matching the env's action encoding
    return_log_prob=True,  # return log probability for importance sampling
)

# Value head: one scalar state-value per observation.
value_net = nn.Sequential(
    nn.Flatten(start_dim=-len(obs_shape)),
    nn.Linear(num_obs_features, num_cells, device=device),
    nn.ReLU(),
    nn.Linear(num_cells, num_cells, device=device),
    nn.ReLU(),
    nn.Linear(num_cells, num_cells, device=device),
    nn.ReLU(),
    nn.Linear(num_cells, 1, device=device),
)

value_module = ValueOperator(
    module=value_net,
    in_keys=["observation"],
)

# Optional sanity checks:
#print("Running policy:", policy_module(norm_chess_env.reset()))
#print("Running value:", value_module(norm_chess_env.reset()))

# Data collection: run `policy_module` in the normalised env and yield batches
# of `frames_per_batch` frames until `total_frames` have been gathered.
collector = SyncDataCollector(
    norm_chess_env, policy_module,
    frames_per_batch=frames_per_batch, total_frames=total_frames,
    split_trajs=False, device=device,
)

# Buffer sized for exactly one collected batch, sampled without replacement.
replay_buffer = ReplayBuffer(
    storage=LazyTensorStorage(max_size=frames_per_batch), sampler=SamplerWithoutReplacement()
)

# Loss, optimiser and learning-rate schedule.

# Generalised advantage estimation driven by the value network.
advantage_module = GAE(
    gamma=gamma,
    lmbda=lmbda,
    value_network=value_module,
    average_gae=True,
)

# Clipped PPO objective; the entropy bonus is switched on only when its
# coefficient is non-zero.
loss_module = ClipPPOLoss(
    actor_network=policy_module,
    critic_network=value_module,
    clip_epsilon=clip_epsilon,
    entropy_bonus=bool(entropy_eps),
    entropy_coef=entropy_eps,
    critic_coef=1.0,
    loss_critic_type="smooth_l1",
)

# Adam over the loss module's parameters, with a cosine learning-rate decay to
# zero spread over one scheduler step per collected batch.
optim = torch.optim.Adam(loss_module.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optim, T_max=total_frames // frames_per_batch, eta_min=0.0
)

# Optional sanity check: a short rollout.
#rollout = norm_chess_env.rollout(3)
#print("rollout of three steps:", rollout)
#print("Shape of the rollout TensorDict:", rollout.batch_size)









cuda:0


2024-07-11 14:32:56,793 [torchrl][INFO] check_env_specs succeeded!


rollout of three steps: TensorDict(
    fields={
        action: Tensor(shape=torch.Size([3, 4096]), device=cuda:0, dtype=torch.int64, is_shared=True),
        done: Tensor(shape=torch.Size([3, 1]), device=cuda:0, dtype=torch.bool, is_shared=True),
        next: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([3, 1]), device=cuda:0, dtype=torch.bool, is_shared=True),
                observation: Tensor(shape=torch.Size([3, 8, 8, 13]), device=cuda:0, dtype=torch.float32, is_shared=True),
                reward: Tensor(shape=torch.Size([3, 1]), device=cuda:0, dtype=torch.float32, is_shared=True),
                step_count: Tensor(shape=torch.Size([3, 1]), device=cuda:0, dtype=torch.int64, is_shared=True),
                terminated: Tensor(shape=torch.Size([3, 1]), device=cuda:0, dtype=torch.bool, is_shared=True),
                truncated: Tensor(shape=torch.Size([3, 1]), device=cuda:0, dtype=torch.bool, is_shared=True)},
            batch_size=torch.Size

In [None]:
# training loop