<a href="https://colab.research.google.com/github/harshii-02/DL-assignment/blob/main/tictactoe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pickle

BOARD_ROWS = 3
BOARD_COLS = 3


# =========================
# STATE (ENVIRONMENT)
# =========================
class State:
    """Tic-tac-toe environment that runs a game between two players.

    The board is a float NumPy array: 1 marks a cell taken by p1, -1 a cell
    taken by p2, and 0 an empty cell.  ``playerSymbol`` holds the symbol of
    the side that moves next.
    """

    def __init__(self, p1, p2):
        """Create an empty board with p1 (symbol 1) to move first.

        Args:
            p1: player object used for symbol 1.
            p2: player object used for symbol -1.
        """
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.p1 = p1
        self.p2 = p2
        self.isEnd = False
        self.boardHash = None
        self.playerSymbol = 1  # p1 starts

    def getHash(self):
        """Return the string key of the current board and cache it in ``boardHash``."""
        # reshape(-1) flattens whatever the board size is; for the 3x3 board
        # the resulting text is identical to str(board.reshape(9)).
        self.boardHash = str(self.board.reshape(-1))
        return self.boardHash

    def availablePositions(self):
        """Return every empty cell as a (row, col) tuple, in row-major order."""
        rows, cols = self.board.shape
        return [(i, j) for i in range(rows) for j in range(cols) if self.board[i, j] == 0]

    def updateState(self, position):
        """Place the current symbol at ``position`` and hand the turn to the other side."""
        self.board[position] = self.playerSymbol
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1

    def winner(self):
        """Evaluate the board and update ``isEnd`` accordingly.

        Returns:
            1 if symbol 1 owns a complete line, -1 if symbol -1 does, 0 when
            the board is full with no complete line, or None while the game
            is still open.  Wins and draws are always plain ``int`` values.
        """
        rows, cols = self.board.shape

        # Cells are 1, -1 or 0, so a line owned by one side sums to +/- its length.
        lines = [(sum(self.board[i, :]), cols) for i in range(rows)]
        lines += [(sum(self.board[:, j]), rows) for j in range(cols)]
        if rows == cols:  # diagonals only exist as full lines on a square board
            lines.append((sum(self.board[i, i] for i in range(rows)), rows))
            lines.append((sum(self.board[i, rows - 1 - i] for i in range(rows)), rows))

        for total, length in lines:
            if abs(total) == length:
                self.isEnd = True
                return 1 if total > 0 else -1

        if len(self.availablePositions()) == 0:
            self.isEnd = True
            return 0

        self.isEnd = False
        return None

    def giveReward(self):
        """Feed the end-of-game reward to both players.

        The winner receives 1 and the loser 0.  Any other result (a draw, or
        None if called on an unfinished game) gives 0.3 to both players.
        """
        result = self.winner()
        if result == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif result == -1:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            self.p1.feedReward(0.3)
            self.p2.feedReward(0.3)

    def reset(self):
        """Clear the board and restore the start-of-game flags (p1 to move)."""
        self.board = np.zeros(self.board.shape)
        self.boardHash = None
        self.isEnd = False
        self.playerSymbol = 1

    # FAST TRAINING
    def play(self, rounds=8000):
        """Run ``rounds`` self-play games between p1 and p2.

        Both players must provide ``chooseAction(positions, board, symbol)``,
        ``addState``, ``feedReward``, ``reset`` and an ``exp_rate`` attribute.
        After every round each player's ``exp_rate`` is multiplied by 0.995,
        with a floor of 0.01.
        """
        for _ in range(rounds):
            while not self.isEnd:
                # The helper resets the environment when a game ends, which
                # clears isEnd again, so the explicit break is what ends the round.
                if self._trainingTurn(self.p1) or self._trainingTurn(self.p2):
                    break

            # epsilon decay
            self.p1.exp_rate = max(0.01, self.p1.exp_rate * 0.995)
            self.p2.exp_rate = max(0.01, self.p2.exp_rate * 0.995)

    def _trainingTurn(self, player):
        """Play one training move for ``player``; return True if it ended the game."""
        action = player.chooseAction(
            self.availablePositions(), self.board, self.playerSymbol)
        self.updateState(action)
        player.addState(self.getHash())

        if self.winner() is None:
            return False

        # Game over: distribute rewards, then clear both players and the board.
        self.giveReward()
        self.p1.reset()
        self.p2.reset()
        self.reset()
        return True

    # HUMAN PLAY
    def play2(self):
        """Play one interactive game: p1 moves first, then p2, until the game ends.

        p1 is called as ``chooseAction(positions, board, symbol)`` and p2 as
        ``chooseAction(positions)``.  The board is printed after every move and
        the environment is reset once a result has been announced.
        """
        while not self.isEnd:
            positions = self.availablePositions()
            p1_action = self.p1.chooseAction(positions, self.board, self.playerSymbol)
            self.updateState(p1_action)
            self.showBoard()

            win = self.winner()
            if win is not None:
                print("Computer wins!" if win == 1 else "Tie!")
                self.reset()
                break

            positions = self.availablePositions()
            p2_action = self.p2.chooseAction(positions)
            self.updateState(p2_action)
            self.showBoard()

            win = self.winner()
            if win is not None:
                print("Human wins!" if win == -1 else "Tie!")
                self.reset()
                break

    def showBoard(self):
        """Print the board, drawing 1 as 'X', -1 as 'O' and empty cells as blanks."""
        rows, cols = self.board.shape
        # Each cell prints as "| c " (4 characters) plus one closing '|'.
        separator = '-' * (4 * cols + 1)
        for i in range(rows):
            print(separator)
            for j in range(cols):
                print('|', 'X' if self.board[i, j] == 1 else 'O' if self.board[i, j] == -1 else ' ', end=' ')
            print('|')
        print(separator)


# =========================
# AI PLAYER
# =========================
class Player:
    """Learning agent that keeps a table of values keyed by board hash.

    Moves are chosen epsilon-greedily: with probability ``exp_rate`` a random
    available position is played, otherwise the position whose resulting
    board has the highest stored value (unseen boards count as 0).
    """

    def __init__(self, name, exp_rate=0.3):
        """Create an agent with an empty value table.

        Args:
            name: identifier; also used as the suffix of the saved policy file.
            exp_rate: probability of playing a random move instead of the
                greedy one.
        """
        self.name = name
        self.states = []  # board hashes recorded during the current game
        self.lr = 0.1  # step size used by feedReward
        self.exp_rate = exp_rate
        self.gamma = 0.8  # factor applied to the reward in feedReward
        self.states_value = {}  # board hash -> learned value

    def getHash(self, board):
        """Return the string key for ``board`` (the text of its flattened array)."""
        # Same text as str(board.reshape(9)) for a 3x3 board.
        return str(board.reshape(-1))

    def chooseAction(self, positions, current_board, symbol):
        """Pick a move from ``positions``.

        Args:
            positions: non-empty sequence of (row, col) tuples that may be played.
            current_board: NumPy board before the move; it is not modified.
            symbol: value written into the chosen cell when scoring candidates.

        Returns:
            One element of ``positions``.  Among equally valued greedy
            candidates the last one in ``positions`` is returned.

        Raises:
            ValueError: if ``positions`` is empty.
        """
        if len(positions) == 0:
            raise ValueError("chooseAction needs at least one available position")

        if np.random.rand() < self.exp_rate:
            return positions[np.random.choice(len(positions))]

        # -inf guarantees the first candidate is accepted whatever its value.
        value_max = float("-inf")
        action = None
        for p in positions:
            next_board = current_board.copy()
            next_board[p] = symbol
            value = self.states_value.get(self.getHash(next_board), 0)
            if value >= value_max:
                value_max = value
                action = p
        return action

    def addState(self, state):
        """Record a board hash visited during the current game."""
        self.states.append(state)

    def feedReward(self, reward):
        """Propagate ``reward`` backwards through the recorded states.

        Going from the last recorded state to the first, each state's value is
        moved towards ``gamma * reward`` by a fraction ``lr``; the updated
        value then serves as the reward for the preceding state.
        """
        for st in reversed(self.states):
            self.states_value[st] = self.states_value.get(st, 0)
            self.states_value[st] += self.lr * (self.gamma * reward - self.states_value[st])
            reward = self.states_value[st]

    def reset(self):
        """Forget the states recorded for the current game."""
        self.states = []

    def savePolicy(self):
        """Pickle the value table to a file named ``'policy_' + name``."""
        with open('policy_' + self.name, 'wb') as f:
            pickle.dump(self.states_value, f)

    def loadPolicy(self, file):
        """Replace the value table with the one pickled in ``file``."""
        # SECURITY: pickle.load can execute arbitrary code; only load policy
        # files that were produced by savePolicy from a trusted source.
        with open(file, 'rb') as f:
            self.states_value = pickle.load(f)


# =========================
# HUMAN PLAYER
# =========================
class HumanPlayer:
    """Player whose moves are typed in at the console."""

    def __init__(self, name):
        """Store the player's display name."""
        self.name = name

    def chooseAction(self, positions, current_board=None, symbol=None):
        """Prompt for a row and column until an available cell is entered.

        Args:
            positions: collection of (row, col) tuples that may be played.
            current_board: unused; accepted so the call signature parallels
                Player.chooseAction.
            symbol: unused; accepted for the same reason.

        Returns:
            The chosen (row, col) tuple, which is a member of ``positions``.
        """
        while True:
            try:
                row = int(input("Row (0-2): "))
                col = int(input("Col (0-2): "))
            except ValueError:
                # Non-numeric input used to abort the game; ask again instead.
                print("Please enter whole numbers only.")
                continue
            if (row, col) in positions:
                return (row, col)
            print("That cell is not available, try again.")


# =========================
# MAIN
# =========================
if __name__ == "__main__":

    # Stage 1: let two learning agents play each other, then persist the
    # value table of the agent that moved first.
    print("Training fast RL agent...")
    trainer_first = Player("p1")
    trainer_second = Player("p2")
    training_env = State(trainer_first, trainer_second)
    training_env.play(8000)

    trainer_first.savePolicy()
    print("Training completed.\n")

    # Stage 2: a purely greedy agent (no exploration) loads the saved table
    # and plays against a person at the console.
    p1 = Player("computer", exp_rate=0)
    p1.loadPolicy("policy_p1")
    p2 = HumanPlayer("human")

    st = State(p1, p2)

    # Keep starting new games for as long as the answer is 'y' (any case).
    keep_playing = True
    while keep_playing:
        st.play2()
        c = input("Play again? (y/n): ")
        keep_playing = c.lower() == 'y'


Training fast RL agent...
Training completed.

-------------
|   |   | X |
-------------
|   |   |   |
-------------
|   |   |   |
-------------
Row (0-2): 0
Col (0-2): 1
-------------
|   | O | X |
-------------
|   |   |   |
-------------
|   |   |   |
-------------
-------------
|   | O | X |
-------------
|   |   |   |
-------------
|   |   | X |
-------------
Row (0-2): 1
Col (0-2): 2
-------------
|   | O | X |
-------------
|   |   | O |
-------------
|   |   | X |
-------------
-------------
|   | O | X |
-------------
|   |   | O |
-------------
|   | X | X |
-------------
Row (0-2): 2
Col (0-2): 0
-------------
|   | O | X |
-------------
|   |   | O |
-------------
| O | X | X |
-------------
-------------
|   | O | X |
-------------
|   | X | O |
-------------
| O | X | X |
-------------
Row (0-2): 0
Col (0-2): 0
-------------
| O | O | X |
-------------
|   | X | O |
-------------
| O | X | X |
-------------
-------------
| O | O | X |
-------------
| X | X | O |
---------