In [1]:
# Creating RL MLP for Tic Tac Toe

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import random
from IPython.display import display, clear_output
from time import sleep

from utils import show_field
import numpy as np
import time
import sys
from functools import reduce

In [31]:
# Pick the compute device: the first CUDA GPU when PyTorch can see one,
# otherwise the CPU.  To force CPU, assign torch.device('cpu') directly.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [5]:
class Board:
    """Square board for tic-tac-toe-style games.

    Cells hold 0 (empty), 1 (cross) or -1 (nought).  ``empty_cells`` counts
    the free cells and is forced to 0 as soon as a move wins, so callers can
    simply loop while ``empty_cells > 0``.
    """

    # Unit steps for the four line orientations examined after a move:
    # vertical, horizontal, main diagonal, anti-diagonal.
    _DIRECTIONS = ((1, 0), (0, 1), (1, 1), (1, -1))

    def __init__(self, board_dim=3, win_condition=None):
        """Create an empty ``board_dim`` x ``board_dim`` board.

        Args:
            board_dim: side length of the board.
            win_condition: number of equal marks in a line needed to win;
                defaults to ``board_dim`` when None.
        """
        self.board_dim = board_dim
        if win_condition is None:
            self.win_condition = board_dim
        else:
            self.win_condition = win_condition
        # 0: empty
        # 1: cross
        # -1: nought
        self.board = np.zeros((board_dim, board_dim))
        self.empty_cells = board_dim * board_dim

    def deepcopy(self):
        """Return an independent copy of this board (the cell array is copied)."""
        b = Board(self.board_dim, self.win_condition)
        b.empty_cells = self.empty_cells
        b.board = np.copy(self.board)
        return b

    def _run_length(self, i, j, di, dj):
        """Count consecutive cells equal to cell (i, j), stepping by (di, dj).

        The starting cell itself is not counted.
        """
        mark = self.board[i][j]
        count = 0
        k, l = i + di, j + dj
        while (0 <= k < self.board_dim and 0 <= l < self.board_dim
               and self.board[k][l] == mark):
            count += 1
            k += di
            l += dj
        return count

    def check_win(self, i, j):
        """Check whether the mark at (i, j) is part of a winning line.

        Returns:
            The value stored at (i, j) when some line through that cell has
            at least ``win_condition`` equal marks, otherwise 0.
        """
        for di, dj in self._DIRECTIONS:
            # Neighbours on both sides of (i, j) along this orientation.
            same_cell = (self._run_length(i, j, di, dj)
                         + self._run_length(i, j, -di, -dj))
            # ">=" rather than "==": a move that joins two shorter runs can
            # produce a line longer than win_condition, which is still a win.
            if same_cell >= (self.win_condition - 1):
                return self.board[i][j]

        return 0

    def show(self):
        """Render the board with the ``show_field`` helper."""
        show_field(self.board)

    def make_move(self, move, player) -> int:
        """Place ``player``'s mark at ``move`` and report the outcome.

        Args:
            move: ``(row, col)`` indices of the target cell.
            player: 1 (cross) or -1 (nought).

        Returns:
            0 when play continues, the winner's mark when this move wins, or
            ``-player`` when the target cell is already occupied (the board
            is left unchanged in that case).

        Raises:
            ValueError: if ``player`` is neither 1 nor -1.
        """
        i = move[0]
        j = move[1]
        if player not in [1, -1]:
            raise ValueError(f"Illegal player {player}")

        if self.board[i][j] != 0:
            return player * -1 # illegal move

        self.board[i][j] = player
        self.empty_cells -= 1

        win = self.check_win(i, j)
        if win != 0:
            # A decided game has no moves left to play.
            self.empty_cells = 0

        return win

In [6]:
class RandomAgent:
    """Baseline player that chooses at random among the empty cells."""

    def __init__(self, board_dim):
        self.board_dim = board_dim

    @staticmethod
    def get_move(board):
        """Return a randomly chosen ``(row, col)`` whose cell equals 0.

        ``board`` must expose ``board_dim`` and a 2-D ``board`` grid.
        """
        side = board.board_dim
        free = [
            (row, col)
            for row in range(side)
            for col in range(side)
            if board.board[row][col] == 0
        ]
        pick = random.randint(0, len(free) - 1)
        return free[pick]

In [27]:
class IntelligentAgent:
    """Neural-network player: picks the cell with the largest network output."""

    def __init__(self, nn_filename=None, board_dim=3, hidden_size=64):
        """Build a fresh network or load a saved one.

        Args:
            nn_filename: path of a model saved with ``AgentNN.save``; when
                None a new ``AgentNN(board_dim, hidden_size)`` is created.
            board_dim: side length of the board the agent plays on.
            hidden_size: hidden width, used only when creating a new network.
        """
        self.board_dim = board_dim
        if nn_filename is None:
            self.brain = AgentNN(board_dim, hidden_size)
        else:
            # SECURITY: this unpickles a whole module object, which can run
            # arbitrary code -- only load files you created yourself.
            # map_location lets a checkpoint saved on a GPU load on a
            # CPU-only machine (and vice versa).
            # NOTE(review): recent PyTorch releases default torch.load to
            # weights_only=True, which rejects fully pickled modules --
            # confirm against the installed version.
            self.brain = torch.load(nn_filename, map_location=device)

        self.optimizer = optim.AdamW(self.brain.parameters(), amsgrad=True)
        self.brain.to(device)
        self.brain.eval()

        # Flat output index -> (row, col), in row-major order.
        self.remap_move = {
            row * board_dim + col: (row, col)
            for row in range(board_dim)
            for col in range(board_dim)
        }

    def get_move(self, board):
        """Return the ``(row, col)`` with the highest network output.

        Occupied cells are not filtered out here, so the returned move may
        target a non-empty cell.
        """
        with torch.no_grad():
            thought = self.brain(torch.FloatTensor(board.board.flatten()).to(device))
            #print(thought, file=sys.stderr)
            return self.remap_move[thought.to('cpu').argmax().item()]

    def get_move_batch(self, boards):
        """Run the network on a batch and return the raw outputs.

        Args:
            boards: sequence whose items carry a board object at index 0
                (only ``item[0].board`` is read).

        Returns:
            Tensor with one row of network outputs per item.  Gradients are
            tracked, since ``train`` back-propagates through this call.
        """
        thoughts = self.brain(torch.stack([torch.FloatTensor(b[0].board.flatten()) for b in boards]).to(device))
        return thoughts

    def save(self, file_name):
        """Persist the network by delegating to ``self.brain.save``."""
        self.brain.save(file_name)

    def train(self, moves, results):
        """Take one optimizer step on a batch of recorded positions.

        For every position the network's current argmax cell receives the
        target value ``results[k]`` and all other cells receive 0; the loss
        is ``F.cross_entropy`` between the outputs and these targets.

        Args:
            moves: sequence of items whose index 0 is the board snapshot.
                The recorded move itself is not read; the argmax of the
                current network output is used instead.
            results: one numeric outcome per item in ``moves``.
        """
        self.brain.train()
        actual_moves = self.get_move_batch(moves)
        n_rows = actual_moves.shape[0]
        chosen = actual_moves.argmax(dim=1)

        # Size the targets from the network output instead of a hard-coded 9,
        # so boards other than 3x3 work too.
        z = torch.zeros_like(actual_moves)
        rewards = torch.tensor(
            [float(results[k]) for k in range(n_rows)],
            dtype=actual_moves.dtype,
            device=actual_moves.device,
        )
        z[torch.arange(n_rows, device=actual_moves.device), chosen] = rewards

        loss = F.cross_entropy(actual_moves, z)
        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()
        self.brain.eval()

In [8]:
class AgentNN(nn.Module):
    """Fully connected network: board cells in, one output value per cell out.

    Two ReLU-activated linear layers of width ``hidden_size`` are followed by
    a linear output layer with ``board_dim ** 2`` units.
    """

    def __init__(self, board_dim=3, hidden_size=64):
        super().__init__()
        self.board_dim = board_dim
        self.hidden_size = hidden_size

        n_cells = board_dim ** 2
        self.input = nn.Linear(n_cells, hidden_size)
        self.hidden = nn.Linear(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, n_cells)

    def forward(self, x):
        """Return the raw (un-normalised) output for input ``x``."""
        for layer in (self.input, self.hidden):
            x = F.relu(layer(x))
        return self.out(x)

    def save(self, file_name):
        """Serialise the whole module object (not just its weights) to ``file_name``."""
        torch.save(self, file_name)


In [9]:
def play_single_game(agent_x, agent_o, show=False, win_condition=None):
    """Play one game between two agents; ``agent_x`` moves first.

    Args:
        agent_x: agent playing mark 1.
        agent_o: agent playing mark -1.
        show: when True, render the board before every move and at the end,
            pausing one second each time.
        win_condition: forwarded to ``Board``.

    Returns:
        ``(result, moves)``.  ``result`` is the first non-zero value returned
        by ``Board.make_move`` or 0 when the board fills up.  ``moves`` is a
        pair of lists (x's, o's) of ``(board snapshot before the move, move)``
        with the most recent move first.
    """
    assert agent_x.board_dim == agent_o.board_dim, 'Agents have different dims'
    b = Board(agent_x.board_dim, win_condition)
    agents = (agent_x, agent_o)
    marks = (1, -1)
    moves = ([], [])
    turn = 0

    while b.empty_cells > 0:
        if show:
            b.show()
            time.sleep(1)

        move = agents[turn].get_move(b)
        # Snapshot the position as it was *before* the move; newest goes first.
        moves[turn].insert(0, (b.deepcopy(), move))
        result = b.make_move(move, marks[turn])

        if result == 0:
            # Game continues: hand the turn to the other agent.
            turn = (turn + 1) % 2
            continue

        if show:
            b.show()
            remap = {1: 'x', -1: 'o'}
            print(f"Player '{remap[int(result)]}' wins!")
            time.sleep(1)
        return result, moves

    if show:
        b.show()
        print(f'Draw!')
        time.sleep(1)
    return 0, moves

In [15]:
# Demo: a saved network (x) against the random player (o) on a 3x3 board with
# three in a row to win, rendering the board as the game goes.
agent_x = IntelligentAgent('data/test_nn_0')
agent_o = RandomAgent(3)
play_single_game(agent_x, agent_o, show=True, win_condition=3)

    0   1   2   
    -   -   -   
0 |   |   |   |
    -   -   -   
1 |   |   |   |
    -   -   -   
2 | x | o |   |
    -   -   -   
Player 'o' wins!


(-1,
 ([(<__main__.Board at 0x7f08d4c734d0>, (2, 0)),
   (<__main__.Board at 0x7f097f180f10>, (2, 0))],
  [(<__main__.Board at 0x7f08d516cc90>, (2, 1))]))

In [151]:
# Scratch check of the target-tensor construction: start from a (rows, 9)
# zero tensor and write `res` into each row at the column where
# `actual_moves` is largest.
# NOTE(review): `actual_moves` is not assigned at top level in any visible
# cell (it only appears as a local in IntelligentAgent.train), and `res` is
# whatever another cell last left behind -- this presumably relied on leftover
# kernel state; confirm before re-running.
z = torch.zeros(actual_moves.shape[0], 9)
for i, j in zip(range(actual_moves.shape[0]), actual_moves.argmax(dim=1)):
    z[i,j] = res
z


tensor([[ 0.,  0.,  0.,  0.,  0., -1.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0., -1.,  0.,  0.,  0.,  0.],
        [ 0., -1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0., -1.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., -1.]])

In [24]:
# Scratch check: concatenate a list of lists into a single flat list.
tmp = [[1, 2, 3], [4, 5], [6]]
[item for sub in tmp for item in sub]

[1, 2, 3, 4, 5, 6]

In [38]:
# Training configuration.
START = 4            # first epoch index; epoch N resumes from data/test_nn_{N - 1}
NUM_EPOCHS = 10      # number of epochs to run from START
NUM_GAMES = 100000   # games played per epoch
BATCH_SIZE = 10000   # collected agent moves that trigger one training step
PRINT_TIME = False   # print per-batch timings when True

start_time = time.perf_counter()
for epoch in range(START, START + NUM_EPOCHS):
    # Epoch 0 starts from a fresh network; later epochs reload the previous
    # checkpoint (in which case hidden_size is not used by IntelligentAgent).
    file_name = f'data/test_nn_{epoch - 1}' if epoch > 0 else None
    agent = IntelligentAgent(nn_filename=file_name, hidden_size=2048)
    random_agent = RandomAgent(3)

    wins, draws = 0, 0
    total_moves, results = [], []
    start_batch_time = time.perf_counter()
    for i in range(NUM_GAMES):
        res, moves = play_single_game(agent, random_agent)
        if res == 1:
            wins += 1
        elif res == 0:
            draws += 1
            # custom loss for draw
            res = 0.9

        # Keep only the learning agent's (x's) positions, each labelled with
        # the outcome of the game it came from.
        total_moves.extend(moves[0])
        results.extend([res] * len(moves[0]))

        if len(total_moves) < BATCH_SIZE:
            continue

        if PRINT_TIME:
            print(f'Total time: {time.perf_counter() - start_time}, Time for creating current batch: {time.perf_counter() - start_batch_time}')
            start_train_time = time.perf_counter()

        agent.train(total_moves, results)

        if PRINT_TIME:
            print(f'Total time: {time.perf_counter() - start_time}, Time for training current batch: {time.perf_counter() - start_train_time}')

        # Start collecting the next batch from scratch.
        total_moves = []
        results = []

        if PRINT_TIME:
            start_batch_time = time.perf_counter()

    print(f'Total time: {time.perf_counter() - start_time}, Win rate {wins / NUM_GAMES} Draw rate {draws / NUM_GAMES} Win or draw rate {(draws + wins) / NUM_GAMES}')
    agent.save(f'data/test_nn_{epoch}')

Total time: 151.6947169969999, Win rate 0.96598 Draw rate 0.00643 Win or draw rate 0.97241


KeyboardInterrupt: 

In [191]:
# Measure how often a saved network (x) beats the random player (o).
# NOTE(review): this path has no 'data/' prefix, unlike the checkpoints
# written by the training loop -- confirm the intended file.
agent_x = IntelligentAgent(nn_filename='test_nn_9')
agent_o = RandomAgent(3)
games = 1000
wins = 0
for i in range(games):
    res, _ = play_single_game(agent_x, agent_o)
    wins += int(res == 1)
print(f'Win rate {wins / games}')

Win rate 0.0


In [196]:
# Watch one rendered game: saved network as x versus the random player as o.
# Either side can be swapped for an IntelligentAgent / RandomAgent to try
# other pairings.
agent_o = RandomAgent(3)
agent_x = IntelligentAgent(nn_filename='test_nn_73')
play_single_game(agent_x, agent_o, True)

    0   1   2   
    -   -   -   
0 |   |   | x |
    -   -   -   
1 | o | x | o |
    -   -   -   
2 | x |   |   |
    -   -   -   
Player 'x' wins!


(1.0,
 ([(<__main__.Board at 0x177ce43d0>, (1, 1)),
   (<__main__.Board at 0x293947a90>, (0, 2)),
   (<__main__.Board at 0x293946e00>, (2, 0))],
  [(<__main__.Board at 0x177ce6290>, (1, 0)),
   (<__main__.Board at 0x177ce40d0>, (1, 2))]))

In [None]:
# Our NN should return a board-sized output holding, for each cell, the probability that it is the best move
# We need to collect every move our agent makes during a game
# How do we backpropagate using this information?