In [8]:
!pip install gym-chess

Collecting gym-chess
  Downloading gym_chess-0.1.1-py3-none-any.whl (27 kB)
Collecting python-chess<0.32.0,>=0.31.1
  Downloading python_chess-0.31.4-py3-none-any.whl (134 kB)
[K     |████████████████████████████████| 134 kB 6.4 MB/s 
Installing collected packages: python-chess, gym-chess
  Attempting uninstall: python-chess
    Found existing installation: python-chess 0.23.11
    Uninstalling python-chess-0.23.11:
      Successfully uninstalled python-chess-0.23.11
Successfully installed gym-chess-0.1.1 python-chess-0.31.4


In [74]:
import gym
import gym_chess
import random

from collections import deque
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras import backend

In [192]:
class DQNAgent:
  def __init__(self, state_size, action_size, learning_rate = 0.001):
    self.state_size = state_size
    self.action_size = action_size
    self.memory = deque(maxlen=20)
    self.gamma = 0.95 # discount rate
    self.epsilon = 0.7 #exploration rate
    self.epsilon_min = 0.1
    self.epsilon_decay = 0.99
    self.learning_rate = learning_rate
    self.model = self._build_model()
    self.target_model = self._build_model()
    self.update_target_model()

  def _huber_loss(self, y_true, y_pred, clip_delta=1.0):
    error = y_true - y_pred
    cond  = backend.abs(error) <= clip_delta

    squared_loss = 0.5 * backend.square(error)
    quadratic_loss = 0.5 * backend.square(clip_delta) + clip_delta * (backend.abs(error) - clip_delta)

    return backend.mean(tf.where(cond, squared_loss, quadratic_loss))

  # Neural Net for Deep-Q learning Model
  def _build_model(self):
    model = models.Sequential()
    model.add(layers.Dense(units=24, input_dim=self.state_size, activation='relu'))
    model.add(layers.Dense(units=24, activation='relu'))
    model.add(layers.Dense(self.action_size, activation='linear'))
    model.compile(loss=self._huber_loss, optimizer=optimizers.Adam(learning_rate=self.learning_rate), metrics=['accuracy'])

    return model

  # copy weights from model to target model
  def update_target_model(self):
    self.target_model.set_weights(self.model.get_weights())

  def memorize(self, state, action, reward, next_state, done):
    self.memory.append((state, action, reward, next_state, done))

  def act(self, state):
    #state = state.astype(float)

    if np.random.rand() <= self.epsilon:
      action = random.choice(env.legal_moves)
      print('random: {}'.format(action))
      return action

    action_values = self.model.predict(state)
    best_action = np.argmax(action_values)
    action = action_values[best_action]
    print('not-random: {} - {}'.format(action_values, action))
    # return actions
    return action

  def replay(self, batch_size):
    mini_batch = random.sample(self.memory, batch_size)
    for state, action, reward, next_state, done in mini_batch:
      target = self.model.predict(state)
      if done:
        target[0][action] = reward
      else:
        a = self.model.predict(next_state)[0]
        t = self.target_model.predict(next_state)[0]
        target[0][action] = reward + self.gamma * np.amax(t)
      self.model.fit(state, target, epochs=1, verbose=0)
    
    if self.epsilon > self.epsilon_min:
      self.epsilon *= self.epsilon_decay

  def load(self, name):
    self.model.load_weights(name)

  def save(self, name):
    self.model.save_weights(name)

In [193]:
def make_matrix(board): #type(board) == chess.Board()
  pgn = board.epd()
  foo = []  #Final board
  pieces = pgn.split(" ", 1)[0]
  rows = pieces.split("/")

  mapped = {
  'P': 1,     # White Pawn
  'p': -1,    # Black Pawn
  'N': 2,     # White Knight
  'n': -2,    # Black Knight
  'B': 3,     # White Bishop
  'b': -3,    # Black Bishop
  'R': 4,     # White Rook
  'r': -4,    # Black Rook
  'Q': 5,     # White Queen
  'q': -5,    # Black Queen
  'K': 6,     # White King
  'k': -6     # Black King
  }
    
  for row in rows:
    foo2 = []  #This is the row I make
    for thing in row:
      if thing.isdigit():
        for i in range(0, int(thing)):
          foo2.append(0)
      else:
        foo2.append(mapped[thing])
    foo.append(foo2)
  return np.array(foo)

In [194]:
episode_rewards = []
epsilon_time = []
moving_average = []
action_time = []
EPISODES = 1

env = gym.make('Chess-v0')
state_size = 1
action_size = 1
batch_size = 64

agent = DQNAgent(state_size, action_size)

for episode in range(EPISODES):
  friendly_episode_display = episode+1
  epsilon_time.append(agent.epsilon)
  print('Episode {}/{}, Epsilon: {}'.format(friendly_episode_display, EPISODES, agent.epsilon))
  
  state = env.reset()
  #print(state)
  board_txt = state.fen().split()[0]
  board_encoded = ''.join(str(ord(c)) for c in board_txt)
  observation = [board_encoded]
  
  m = make_matrix(state)

  history = []
  episode_reward = 0
  done = False

  while not done:
    if state.is_checkmate() or state.is_stalemate() or state.is_insufficient_material() or state.is_game_over() or state.can_claim_threefold_repetition or state.can_claim_fifty_moves or state.can_claim_draw() or state.is_fivefold_repetition or state.is_seventyfive_moves():
      done = True
    
    a = np.reshape(observation, [1, len(observation)]).astype(float)
    #a = np.reshape(m, [1, m.shape[0]*m.shape[1]])
    #print(a[0])
    #action = agent.act(a[0])
    action = agent.act(a)

    if episode == EPISODES-1:
      action_time.append(action)

    next_state, reward, done, _ = env.step(action)
    reward = (reward*1000)-1

    old_observation = observation.copy()
    history.append(state.san(action))
    agent.memorize(np.reshape(old_observation, [1, len(old_observation)]), action, reward, np.reshape(observation, [1, len(observation)]), done)
    state = next_state

    episode_reward+=episode_reward

    if done:
      print(env.render(mode='unicode'))
      agent.update_target_model()
      print('Episode {}/{}, Score: {}, Epsilon: {}'.format(friendly_episode_display, EPISODES, reward, agent.epsilon))
      print('Episode: {}, Reward: {}'.format(episode_reward))
      break
    
    if len(agent.memory) > batch_size:
      agent.replay(batch_size)
  episode_rewards.append(episode_reward)

Episode 1/1, Epsilon: 0.7
random: a2a3
random: d7d5
random: d2d3
not-random: [[nan]] - [nan]


AttributeError: ignored

In [90]:
obs = ['start']
np.reshape(obs, [1, len(obs)])

array([['start']], dtype='<U5')