# Train a chess AI with RL

In [1]:
from google.colab import drive
# Mount Google Drive inside the Colab VM; force_remount=True asks for a
# remount even when the mount point is already in use.
drive.mount('/content/drive', force_remount=True)

# Make the Chess project folder on Drive the working directory
# (presumably so the project-local `training` / `models` packages imported
# further down resolve from here — confirm the folder layout).
%cd /content/drive/My\ Drive/Chess

Mounted at /content/drive
/content/drive/My Drive/Chess


Use the Stable-Baselines3 environment checker (`check_env`) to verify that the custom chess environment follows the Gym API.

In [2]:
from stable_baselines3.common.env_checker import check_env
from training.chessEnv import ChessEnv

In [13]:
# Same five values as the "rewards" entry of the DQN parameters further down
# (presumably ordered move, illegal, win, loss, draw — confirm against ChessEnv).
reward_scheme = [-1e-5, -1e-3, 1, -1, 0]

# Build the environment, then let the checker inspect it; warn=True keeps
# the checker's warnings enabled.
env = ChessEnv(reward_scheme, simple=True)
check_env(env, warn=True)

In [14]:
from stable_baselines3 import PPO, A2C
from stable_baselines3.common.env_util import make_vec_env

# Wrap the chess environment in a single-env vectorized environment.
# The factory builds a fresh ChessEnv instead of returning the current value
# of `env`: because this cell rebinds `env` to the wrapper, a factory of the
# form `lambda: env` would hand the wrapper back to make_vec_env on any
# re-run of the cell (wrapping a VecEnv inside another one). Constructing the
# raw environment here keeps the cell idempotent.
env = make_vec_env(
    lambda: ChessEnv([-1e-5, -1e-3, 1, -1, 0], simple=True),
    n_envs=1,
)

In [15]:
# Build an A2C agent with the MLP policy on the wrapped environment,
# then train it for 100000 timesteps (learn() hands the model back).
model = A2C('MlpPolicy', env, verbose=0)
model = model.learn(total_timesteps=100000)

In [17]:
# Test the trained agent: roll the policy out for at most n_steps moves
# and print what it does at every step.
obs = env.reset()
n_steps = 20
for step in range(n_steps):
  # Ask the policy for its deterministic (non-sampled) action
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  # env is the vectorized wrapper, so reward and done are length-1 arrays
  obs, reward, done, info = env.step(action)
  print('reward=', reward, 'done=', done)
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered.
    # done only says that the episode ended; it does not say the agent won
    # (the reward scheme also has loss and draw entries), so the message is
    # kept neutral and the final reward is printed alongside it.
    print("Episode finished!", "reward=", reward)
    break

Step 1
Action:  [759]
reward= [-0.001] done= [False]
Step 2
Action:  [759]
reward= [-0.001] done= [False]
Step 3
Action:  [759]
reward= [-0.001] done= [False]
Step 4
Action:  [759]
reward= [-0.001] done= [False]
Step 5
Action:  [759]
reward= [-0.001] done= [False]
Step 6
Action:  [759]
reward= [-0.001] done= [False]
Step 7
Action:  [759]
reward= [-0.001] done= [False]
Step 8
Action:  [759]
reward= [-0.001] done= [False]
Step 9
Action:  [759]
reward= [-0.001] done= [False]
Step 10
Action:  [759]
reward= [-0.001] done= [False]
Step 11
Action:  [759]
reward= [-0.001] done= [False]
Step 12
Action:  [759]
reward= [-0.001] done= [False]
Step 13
Action:  [759]
reward= [-0.001] done= [False]
Step 14
Action:  [759]
reward= [-0.001] done= [False]
Step 15
Action:  [759]
reward= [-0.001] done= [False]
Step 16
Action:  [759]
reward= [-0.001] done= [False]
Step 17
Action:  [759]
reward= [-0.001] done= [False]
Step 18
Action:  [759]
reward= [-0.001] done= [False]
Step 19
Action:  [759]
reward= [-0.00

## Custom training code

In [2]:
import models
import training

In [3]:
# Hyperparameters handed to the custom trainer
parameters = {
    # --- run length / optimisation ---
    "num_frames": 3_000_000,
    "batch_size": 32,
    "gamma": 0.95,

    # --- buffer (presumably the replay-buffer capacity; confirm in training.DQN) ---
    "buffersize": 50_000,

    # --- epsilon schedule: linear decay ---
    "epsilon_start": 0.7,
    "epsilon_final": 0.01,
    "epsilon_decay": 500_000,

    # --- rewards, in order: move, illegal, win, loss, draw ---
    "rewards": [-1e-5, -1e-3, 1, -1, 0],
}  # Explore ONLY legal moves

In [4]:
# Choose training method
# The trainer is constructed from the complete `parameters` dict defined in
# the previous cell; swap the class here to try a different method.
trainer = training.DQN(parameters)

In [None]:
# Start the training run of the trainer created above.
# NOTE(review): every progress line captured below reports "nan loss" —
# confirm whether gradient updates have actually started at that point or
# whether the loss computation/logging is broken.
trainer.train()

1000 frames: 2-0-1, 29.3% illegal, 1.01% legal NN moves, 70.4% explored, nan loss
2000 frames: 3-0-2, 30.1% illegal, 1.95% legal NN moves, 69.3% explored, nan loss
3000 frames: 3-0-3, 30.1% illegal, 2.59% legal NN moves, 69.1% explored, nan loss
4000 frames: 3-0-2, 28.2% illegal, 2.76% legal NN moves, 71.0% explored, nan loss
5000 frames: 3-0-1, 28.7% illegal, 2.38% legal NN moves, 70.6% explored, nan loss
6000 frames: 2-0-3, 30.1% illegal, 1.31% legal NN moves, 69.5% explored, nan loss
7000 frames: 3-0-2, 30.1% illegal, 0.99% legal NN moves, 69.6% explored, nan loss
8000 frames: 3-0-2, 30.1% illegal, 2.9% legal NN moves, 69.0% explored, nan loss
9000 frames: 3-0-2, 29.4% illegal, 1.34% legal NN moves, 70.2% explored, nan loss
10000 frames: 1-0-4, 29.5% illegal, 1.99% legal NN moves, 69.9% explored, nan loss
11000 frames: 3-0-2, 29.6% illegal, 2.95% legal NN moves, 69.5% explored, nan loss
12000 frames: 4-0-2, 30.1% illegal, 1.31% legal NN moves, 69.5% explored, nan loss
13000 frames: 

In [None]:
# Flush pending writes to Google Drive and unmount it once the work above is
# finished (presumably so that files written during training are persisted
# before the Colab session ends — confirm what the trainer saves).
drive.flush_and_unmount()