# Imports

In [230]:
import random

* Make a move for X -- pick best move, train the model based on picked state and associated reward, update state
* determine best move -- based on Q grid
* Train Model -- x value is the state as an array, y value (target) is the q-value of best next move + reward at current state 
* Calc_value_of_state -- use model to predict value of state
* calc target -- is the q-value of best next move + reward at current state 
* run experiment -- runs through a tictactoe game 

# Piece Class

In [231]:
from enum import Enum

class Piece(Enum):
    EMPTY = 'EMPTY'
    BLACK = 'BLACK'
    WHITE = 'WHITE'
    RED = ' RED '

# Board Class

In [232]:
import copy

class Board:
    def __init__(self):
        self.pieces = [[[Piece.EMPTY for k in range(3)] for j in range(3)] for i in range(3)]
        for x in range(9):
          a = random.randint(0,2)
          b = random.randint(0,2)
          c = random.randint(0,2)
          while self.pieces[a][b][c] != Piece.EMPTY:
            a = random.randint(0,2)
            b = random.randint(0,2)
            c = random.randint(0,2)
          self.pieces[a][b][c] = Piece.BLACK
        self.winningRuns = self.getWinningRuns()

    def validMove(self,x,y,z,dir):
        if not x in range(3) or not y in range(3) or not z in range(3):
              return False
        if dir == 'UP':
              return (z == 2) and (self.pieces[x][y][z] == Piece.EMPTY or self.pieces[x][y][z-1] == Piece.EMPTY or self.pieces[x][y][z-2] == Piece.EMPTY)
        if dir == 'DOWN':
              return (z == 0) and (self.pieces[x][y][z] == Piece.EMPTY or self.pieces[x][y][z+1] == Piece.EMPTY or self.pieces[x][y][z+2] == Piece.EMPTY)
        if dir == 'LEFT':
              return (x == 2) and (self.pieces[x][y][z] == Piece.EMPTY or self.pieces[x-1][y][z] == Piece.EMPTY or self.pieces[x-2][y][z] == Piece.EMPTY)
        if dir == 'RIGHT':
              return (x == 0) and (self.pieces[x][y][z] == Piece.EMPTY or self.pieces[x+1][y][z] == Piece.EMPTY or self.pieces[x+2][y][z] == Piece.EMPTY)
        if dir == 'FRONT':
              return (y == 2) and (self.pieces[x][y][z] == Piece.EMPTY or self.pieces[x][y-1][z] == Piece.EMPTY or self.pieces[x][y-2][z] == Piece.EMPTY)
        if dir == 'BACK':
              return (y == 0) and (self.pieces[x][y][z] == Piece.EMPTY or self.pieces[x][y+1][z] == Piece.EMPTY or self.pieces[x][y+2][z] == Piece.EMPTY)
        else:
              return False
            
    def move(self,x,y,z,dir,player: Piece):
        if not self.validMove(x,y,z,dir):
             raise ValueError
        else:
            if (self.pieces[x][y][z] == Piece.EMPTY):
                self.pieces[x][y][z] = player
            else:
                if dir == 'UP':
                    if (self.pieces[x][y][z-1] == Piece.EMPTY):
                        self.pieces[x][y][z-1] = self.pieces[x][y][z]
                        self.pieces[x][y][z] = player
                    else:
                        self.pieces[x][y][z-2] = self.pieces[x][y][z-1]
                        self.pieces[x][y][z-1] = self.pieces[x][y][z]
                        self.pieces[x][y][z] = player
                elif dir == 'DOWN':
                      if (self.pieces[x][y][z+1] == Piece.EMPTY):
                          self.pieces[x][y][z+1] = self.pieces[x][y][z]
                          self.pieces[x][y][z] = player
                      else:
                          self.pieces[x][y][z+2] = self.pieces[x][y][z+1]
                          self.pieces[x][y][z+1] = self.pieces[x][y][z]
                          self.pieces[x][y][z] = player
                elif dir == 'LEFT':
                      if (self.pieces[x-1][y][z] == Piece.EMPTY):
                          self.pieces[x-1][y][z] = self.pieces[x][y][z]
                          self.pieces[x][y][z] = player
                      else:
                          self.pieces[x-2][y][z] = self.pieces[x-1][y][z]
                          self.pieces[x-1][y][z] = self.pieces[x][y][z]
                          self.pieces[x][y][z] = player
                elif dir == 'RIGHT':
                      if (self.pieces[x+1][y][z] == Piece.EMPTY):
                          self.pieces[x+1][y][z] = self.pieces[x][y][z]
                          self.pieces[x][y][z] = player
                      else:
                          self.pieces[x+2][y][z] = self.pieces[x+1][y][z]
                          self.pieces[x+1][y][z] = self.pieces[x][y][z]
                          self.pieces[x][y][z] = player
                elif dir == 'FRONT':
                      if (self.pieces[x][y-1][z] == Piece.EMPTY):
                          self.pieces[x][y-1][z] = self.pieces[x][y][z]
                          self.pieces[x][y][z] = player
                      else:
                          self.pieces[x][y-2][z] = self.pieces[x][y-1][z]
                          self.pieces[x][y-1][z] = self.pieces[x][y][z]
                          self.pieces[x][y][z] = player
                elif dir == 'BACK':
                      if (self.pieces[x][y+1][z] == Piece.EMPTY):
                          self.pieces[x][y+1][z] = self.pieces[x][y][z]
                          self.pieces[x][y][z] = player
                      else:
                          self.pieces[x][y+2][z] = self.pieces[x][y+1][z]
                          self.pieces[x][y+1][z] = self.pieces[x][y][z]
                          self.pieces[x][y][z] = player

    def getGameState(self):
        gameState = "+----------------------\n"
        gameState += "| \ " + self.pieces[0][2][0].value + "  " + self.pieces[1][2][0].value + "  " + self.pieces[2][2][0].value + " \\\n"
        gameState += "|   \                     \\\n"
        gameState += "|     \ " + self.pieces[0][1][0].value + "  " + self.pieces[1][1][0].value + "  " + self.pieces[2][1][0].value + " \\\n"
        gameState += "|       \                     \\\n"
        gameState += "|         \ " + self.pieces[0][0][0].value + "  " + self.pieces[1][0][0].value + "  " + self.pieces[2][0][0].value + " \\\n"
        gameState += "|          ---------------------|\n"
        gameState += "|   " + self.pieces[0][2][1].value + " |" + self.pieces[1][2][1].value + "  " + self.pieces[2][2][1].value + "         |\n"
        gameState += "|         |                     |\n"
        gameState += "|       " + self.pieces[0][1][1].value + "  " + self.pieces[1][1][1].value + "  " + self.pieces[2][1][1].value + "     |\n"
        gameState += "|         |                     |\n"
        gameState += "|         | " + self.pieces[0][0][1].value + "  " + self.pieces[1][0][1].value + "  " + self.pieces[2][0][1].value + " |\n"
        gameState += "|         |                     |\n"
        gameState += " \ " + self.pieces[0][2][2].value + "  " + self.pieces[1][2][2].value + "  " + self.pieces[2][2][2].value + "          |\n"
        gameState += "   \      |                     |\n"
        gameState += "     \ " + self.pieces[0][1][2].value + "  " + self.pieces[1][1][2].value + "  " + self.pieces[2][1][2].value + "      |\n"
        gameState += "       \  |                     |\n"
        gameState += "         \| " + self.pieces[0][0][2].value + "  " + self.pieces[1][0][2].value + "  " + self.pieces[2][0][2].value + " |\n"
        gameState += "           ---------------------+\n\n"
        return gameState
    

    def getWinningRuns(self):
        runs = []

        runs.append([(0,0,0),(0,0,1),(0,0,2)])
        runs.append([(0,0,0),(0,1,0),(0,2,0)])
        runs.append([(0,0,0),(1,0,0),(2,0,0)])

        runs.append([(2,2,0),(1,2,0),(0,2,0)])
        runs.append([(2,2,0),(2,1,0),(2,0,0)])
        runs.append([(2,2,0),(2,2,1),(2,2,2)])

        runs.append([(0,2,2),(0,1,2),(0,0,2)])
        runs.append([(0,2,2),(1,2,2),(2,2,2)])
        runs.append([(0,2,2),(0,2,1),(0,2,0)])

        runs.append([(2,0,2),(2,0,1),(2,0,0)])
        runs.append([(2,0,2),(1,0,2),(0,0,2)])
        runs.append([(2,0,2),(2,1,2),(2,2,2)])
        # Front
        runs.append([(0,0,0),(1,0,1),(2,0,2)])
        runs.append([(0,0,2),(1,0,1),(2,0,0)])
        runs.append([(1,0,0),(1,0,1),(1,0,2)])
        runs.append([(0,0,1),(1,0,1),(2,0,1)])
        # Top
        runs.append([(0,0,0),(1,1,0),(2,2,0)])
        runs.append([(0,2,0),(1,1,0),(2,0,0)])
        runs.append([(0,1,0),(1,1,0),(2,1,0)])
        runs.append([(1,2,0),(1,1,0),(1,0,0)])
        # Left
        runs.append([(0,0,0),(0,1,1),(0,2,2)])
        runs.append([(0,0,2),(0,1,1),(0,2,0)])
        runs.append([(0,0,1),(0,1,1),(0,2,1)])
        runs.append([(0,1,0),(0,1,1),(0,1,2)])
        # Back
        runs.append([(0,2,2),(1,2,1),(2,2,0)])
        runs.append([(0,2,0),(1,2,1),(2,2,2)])
        runs.append([(1,2,0),(1,2,1),(1,2,2)])
        runs.append([(0,2,1),(1,2,1),(2,2,1)])
        # Right
        runs.append([(2,0,2),(2,1,1),(2,2,0)])
        runs.append([(2,0,0),(2,1,1),(2,2,2)])
        runs.append([(2,0,1),(2,1,1),(2,2,1)])
        runs.append([(2,1,0),(2,1,1),(2,1,2)])
        # Bottom
        runs.append([(2,0,2),(1,1,2),(0,2,2)])
        runs.append([(0,0,2),(1,1,2),(2,2,2)])
        runs.append([(0,1,2),(1,1,2),(2,1,2)])
        runs.append([(1,0,2),(1,1,2),(1,2,2)])
        # Corners
        runs.append([(0,0,0),(1,1,1),(2,2,2)])
        runs.append([(2,0,0),(1,1,1),(0,2,2)])
        runs.append([(2,2,0),(1,1,1),(0,0,2)])
        runs.append([(0,2,0),(1,1,1),(2,0,2)])
        # Edges
        runs.append([(1,0,0),(1,1,1),(1,2,2)])
        runs.append([(2,1,0),(1,1,1),(0,1,2)])
        runs.append([(1,2,0),(1,1,1),(1,0,2)])
        runs.append([(0,1,0),(1,1,1),(2,1,2)])
        runs.append([(0,0,1),(1,1,1),(2,2,1)])
        runs.append([(2,0,1),(1,1,1),(0,2,1)])
        # Middles
        runs.append([(1,1,0),(1,1,1),(1,1,2)])
        runs.append([(1,0,1),(1,1,1),(1,2,1)])
        runs.append([(0,1,1),(1,1,1),(2,1,1)])

        return runs

    def getPossibleMoves(self):
        directions = ['UP','DOWN','LEFT','RIGHT','FRONT','BACK']
        moves = []
        for x in range(2):
          for y in range(2):
            for z in range(2):
              for dir in directions:
                if self.validMove(x,y,z,dir):
                  moves.append((x,y,z,dir))
        return moves

    def getWinInOne(self,player: Piece):
        for (x,y,z,dir) in self.getPossibleMoves():
          c = copy.deepcopy(self)
          c.move(x,y,z,dir,player)
          if c.hasWon(player):
            return (x,y,z,dir)
        return None

    def otherPlayer(self,player: Piece):
        return Piece.RED if player == Piece.WHITE else Piece.WHITE

    def getDefendingMove(self,player: Piece):
        potential_moves = []
        for (x,y,z,dir) in self.getPossibleMoves():
          c = copy.deepcopy(self)
          c.move(x,y,z,dir,player)
          if c.getWinInOne(self.otherPlayer(player)) == None:
            potential_moves.append((x,y,z,dir))
        if potential_moves:
          return random.choice(potential_moves)
        return None

    def getWinInTwo(self,player: Piece):
        potential_moves = []
        for (x,y,z,dir) in self.getPossibleMoves():
          c = copy.deepcopy(self)
          c.move(x,y,z,dir,player)
          if c.getWinInOne(self.otherPlayer(player)) == None:
            winner = True
            for (x2,y2,z2,dir2) in c.getPossibleMoves():
              c2 = copy.deepcopy(c)
              c2.move(x2,y2,z2,dir2,self.otherPlayer(player))
              if c2.getWinInOne(player) == None:
                winner = False
            if winner:
              potential_moves.append((x,y,z,dir))
        if potential_moves:
          return random.choice(potential_moves)
        return None

    def getRandomMove(self,player: Piece):
        directions = ['UP','DOWN','LEFT','RIGHT','FRONT','BACK']
        x = random.randint(0,2)
        y = random.randint(0,2)
        z = random.randint(0,2)
        dir = random.choice(directions)
        while not self.validMove(x,y,z,dir):
            x = random.randint(0,2)
            y = random.randint(0,2)
            z = random.randint(0,2)
            dir = random.choice(directions)
        return (x,y,z,dir)

    def hasWon(self,player: Piece):
        for run in self.winningRuns:
            if all(self.pieces[x][y][z] == player for (x,y,z) in run):
                return True
        return False
    
    def gameOver(self):
        return self.hasWon(Piece.RED) or self.hasWon(Piece.WHITE)

# Random Agent

In [233]:
class RandomAgent:
    def __init__(self,player):
        self.player = player

    def getMove(self, board: Board):
        return board.getRandomMove(self.player)

# Easy Agent

In [234]:
class EasyAgent:
    def __init__(self,player):
        self.player = player

    def getMove(self, board: Board):
        winningMove = board.getWinInOne(self.player)
        if winningMove:
          return winningMove
        else:
          return board.getRandomMove(self.player)

# Medium Agent

In [235]:
class MediumAgent:
    def __init__(self,player):
        self.player = player

    def getMove(self, board: Board):
        winningMove = board.getWinInOne(self.player)
        if winningMove:
          return winningMove
        else:
          defendingMove = board.getDefendingMove(self.player)
          if defendingMove:
            return defendingMove
          else:
            return board.getRandomMove(self.player)

# Hard Agent

In [236]:
class HardAgent:
    def __init__(self,player):
        self.player = player

    def getMove(self, board: Board):
        winningMove = board.getWinInOne(self.player)
        if winningMove:
          return winningMove
        else:
          winInTwo = board.getWinInTwo(self.player)
          if winInTwo:
            return winInTwo
          else:
            defendingMove = board.getDefendingMove(self.player)
            if defendingMove:
              return defendingMove
            else:
              return board.getRandomMove(self.player)

# Minmax Agent

In [282]:
import sys

class MinMaxAgent():
  global P1WIN
  P1WIN = 1000000
  global P2WIN
  P2WIN = -1000000
    

  def __init__(self,player):
        self.player1 = player
        self.player2 = Board().otherPlayer(player)

  # P1 corner piece -> 1 pt
  # P2 corner piece -> -1 pt
  # P1 runs of 2 -> 10 pts
  # P2 runs of 2 -> -10 pts
  # P1 win in 1's / win in 2's -> 1000000 pts
  # P2 win in 1's / win in 2's -> -1000000 pts
  def evalPos(self,board: Board, p1, p2):
    value = 0
    for x in [0,2]:
      for y in [0,2]:
        for z in [0,2]:
          if board.pieces[x][y][z] == p1:
            value += 1
          if board.pieces[x][y][z] == p2:
            value -= 1
    for run in board.getWinningRuns():
      if (run[0] == run[1] == p1) or (run[1] == run[2] == self.player1) or (run[0] == run[2] == self.player1):
        value += 10
      if (run[0] == run[1] == self.player2) or (run[1] == run[2] == self.player2) or (run[0] == run[2] == self.player2):
        value -= 10
    if board.getWinInOne(self.player1) != None:
      return P1WIN
    if board.getWinInOne(self.player2) != None:
      return P2WIN
    return value

  def minMax(self, board, player, depth, alpha, beta):
      maxDepth = 4
      
      # evaluate the current board using eval(...) and then return its score if this is a leaf node, i.e.,
      # you have reached the maximum depth, or it is a win for one of the players
      cur_score = self.evalPos(board, player, board.otherPlayer(player))
      if depth == maxDepth:
        return (cur_score,None)
      if cur_score in [P2WIN, P1WIN]:
        return (cur_score,None)
      
      # Otherwise, as in lecture, consider whether this is a maximizing player (O) or minimizing
      # player (X) and perform the min-max algorithm with alpha-beta pruning, plus whatever
      # additional strategies you can come up with
      
      # Note that when you make a recursive call, you don't need the move, so can just do:
      #         (score,_) = minMax(.....)
      if player == self.player1:
        value = P2WIN
        move = board.getRandomMove(player)
        for (x,y,z,dir) in board.getPossibleMoves():
          board_copy = copy.deepcopy(board.pieces)
          board.move(x,y,z,dir,player)
          new_score = self.minMax(board,self.player2,depth+1,alpha,beta)[0]
          board.pieces = board_copy
          if new_score > value:
            value = new_score
            move = (x,y,z,dir)
          alpha = max(alpha, value)
          if alpha >= beta:
            break
        return (value,move)
      else:
        value = P1WIN
        move = board.getRandomMove(player)
        for (x,y,z,dir) in board.getPossibleMoves():
          board_copy = copy.deepcopy(board.pieces)
          board.move(x,y,z,dir,player)
          new_score = self.minMax(board,self.player1,depth+1,alpha,beta)[0]
          board.pieces = board_copy
          if new_score < value:
            value = new_score
            move = (x,y,z,dir)
            beta = min(beta, value)
            if alpha >= beta:
              break
        return (value,move)
      
  # You will use this function in your interactive version below

  def getMove(self, board): 
      (_,move) = self.minMax(board,self.player1,0,-sys.maxsize,sys.maxsize)    # only place we need the move
      if move == None:
        print(board.getGameState())
        print(board.getWinInOne(self.player1))
        return board.getWinInOne(self.player1) or board.getRandomMove(self.player1)
      return move

In [270]:
class GamePlayer:
    def __init__(self,player1,player2):
      self.player1 = player1
      self.player2 = player2
      self.board = Board()

    def playGame(self):
       self.board = Board()
       while 1 == 1:
          # Player 1 moves
          (x1,y1,z1,dir1) = self.player1.getMove(self.board)
          self.board.move(x1,y1,z1,dir1,Piece.RED)
          if self.board.hasWon(Piece.RED):
             return 'RED'
          # Player 2 moves
          (x2,y2,z2,dir2) = self.player2.getMove(self.board)
          self.board.move(x2,y2,z2,dir2,Piece.WHITE)
          if self.board.hasWon(Piece.WHITE):
             return 'WHITE'

# Main Class

In [283]:
from tqdm import tqdm

if __name__ == '__main__':
    game = GamePlayer(MinMaxAgent(Piece.RED),MediumAgent(Piece.WHITE))
    red_wins = 0
    games_to_play = 10
    for i in tqdm(range(games_to_play)):
      winner = game.playGame()
      if winner == 'RED':
        red_wins += 1
    print(f'Red player wins {red_wins * 100 / games_to_play}% of the time!')

 20%|██        | 2/10 [00:49<03:20, 25.08s/it]

+----------------------
| \ EMPTY  BLACK  BLACK \
|   \                     \
|     \ EMPTY  EMPTY  BLACK \
|       \                     \
|         \  RED   WHITE  EMPTY \
|          ---------------------|
|   BLACK |EMPTY  BLACK         |
|         |                     |
|       BLACK  EMPTY  EMPTY     |
|         |                     |
|         |  RED   WHITE  EMPTY |
|         |                     |
 \ EMPTY  BLACK  EMPTY          |
   \      |                     |
     \ EMPTY  BLACK  EMPTY      |
       \  |                     |
         \| WHITE   RED   BLACK |
           ---------------------+


None


 40%|████      | 4/10 [01:34<02:22, 23.71s/it]

+----------------------
| \ EMPTY  BLACK  EMPTY \
|   \                     \
|     \  RED   BLACK  BLACK \
|       \                     \
|         \ WHITE   RED   WHITE \
|          ---------------------|
|   BLACK |EMPTY  EMPTY         |
|         |                     |
|       EMPTY  EMPTY  EMPTY     |
|         |                     |
|         | BLACK  BLACK  EMPTY |
|         |                     |
 \ BLACK  BLACK  BLACK          |
   \      |                     |
     \ EMPTY  EMPTY  EMPTY      |
       \  |                     |
         \| EMPTY  EMPTY  EMPTY |
           ---------------------+


None


 50%|█████     | 5/10 [01:58<02:00, 24.05s/it]

+----------------------
| \ EMPTY  BLACK  EMPTY \
|   \                     \
|     \ WHITE   RED   WHITE \
|       \                     \
|         \  RED   BLACK  EMPTY \
|          ---------------------|
|   BLACK |EMPTY  EMPTY         |
|         |                     |
|        RED   EMPTY  BLACK     |
|         |                     |
|         | WHITE  BLACK  BLACK |
|         |                     |
 \ EMPTY  EMPTY  EMPTY          |
   \      |                     |
     \ BLACK  EMPTY  BLACK      |
       \  |                     |
         \| EMPTY  EMPTY  BLACK |
           ---------------------+


None


 60%|██████    | 6/10 [02:27<01:42, 25.69s/it]

+----------------------
| \ EMPTY  BLACK  EMPTY \
|   \                     \
|     \ WHITE   RED   EMPTY \
|       \                     \
|         \  RED   WHITE  EMPTY \
|          ---------------------|
|   EMPTY |BLACK  EMPTY         |
|         |                     |
|        RED   BLACK  EMPTY     |
|         |                     |
|         | WHITE   RED   BLACK |
|         |                     |
 \ EMPTY  EMPTY  BLACK          |
   \      |                     |
     \ BLACK  BLACK  BLACK      |
       \  |                     |
         \| WHITE  BLACK  EMPTY |
           ---------------------+


None


 70%|███████   | 7/10 [02:40<01:04, 21.61s/it]

+----------------------
| \ BLACK  EMPTY  EMPTY \
|   \                     \
|     \ BLACK  BLACK  BLACK \
|       \                     \
|         \ WHITE   RED   EMPTY \
|          ---------------------|
|   BLACK |EMPTY  EMPTY         |
|         |                     |
|       EMPTY  BLACK  BLACK     |
|         |                     |
|         |  RED   EMPTY  BLACK |
|         |                     |
 \ EMPTY  BLACK  EMPTY          |
   \      |                     |
     \ EMPTY  EMPTY  EMPTY      |
       \  |                     |
         \| WHITE  EMPTY  EMPTY |
           ---------------------+


None


 90%|█████████ | 9/10 [03:33<00:24, 24.27s/it]

+----------------------
| \ BLACK  EMPTY  EMPTY \
|   \                     \
|     \ WHITE  WHITE  EMPTY \
|       \                     \
|         \  RED    RED   WHITE \
|          ---------------------|
|   EMPTY |BLACK  EMPTY         |
|         |                     |
|       EMPTY   RED   EMPTY     |
|         |                     |
|         | BLACK  BLACK  BLACK |
|         |                     |
 \ EMPTY  BLACK  EMPTY          |
   \      |                     |
     \ EMPTY  BLACK  EMPTY      |
       \  |                     |
         \| BLACK  EMPTY  BLACK |
           ---------------------+


None


100%|██████████| 10/10 [03:57<00:00, 23.76s/it]

+----------------------
| \ EMPTY  EMPTY  EMPTY \
|   \                     \
|     \ EMPTY   RED   BLACK \
|       \                     \
|         \  RED   WHITE  WHITE \
|          ---------------------|
|   EMPTY |BLACK  BLACK         |
|         |                     |
|       BLACK  BLACK  EMPTY     |
|         |                     |
|         | WHITE   RED   EMPTY |
|         |                     |
 \ EMPTY  EMPTY  BLACK          |
   \      |                     |
     \ EMPTY  BLACK  EMPTY      |
       \  |                     |
         \| BLACK  BLACK  EMPTY |
           ---------------------+


None
Red player wins 20.0% of the time!





# Deep Learning Agent 

## Method we might need 
* Make a move for X -- pick best move, train the model based on picked state and associated reward, update state
* determine best move -- based on Q grid
* Train Model -- x value is the state as an array, y value (target) is the q-value of best next move + reward at current state 
* Calc_value_of_state -- use model to predict value of state
* calc target -- is the q-value of best next move + reward at current state 
* run experiment -- runs through a tictactoe game 

### Notes
The value function determines how good it is to be in state s,
Agent can also learn the value of a state-action pair, which is a q value. The q funciton 
measueres tha value of choosing a particualar action when in a particular state.


Deep Q-Learning replaces the regular Q-table with a neural network. Rather than mapping a state-action pair to a q-value, a neural network maps input states to (action, Q-value) pairs.

The Bellman equation is a recursive equation that relates the value of a state to the values of its successor states. It decomposes the value function into two parts: an immediate reward and the expected discounted value of the next state. The Bellman equation is used to update the value function iteratively, until it converges to the true value function.

https://towardsdatascience.com/deep-q-learning-tutorial-mindqn-2a4c855abffc

## Good code references:
   * https://github.com/giladariel/TicTacToe_RL/blob/master/DeepTicTacToe_org.py
   * https://github.com/mswang12/minDQN/blob/main/minDQN.py

In [None]:
def initialize_model():
    """ Initializes keras sequential model that will read in state as an array and return list of q values associated
    with each possible action. 
    """
    pass

In [None]:
import copy

directions = ['UP','DOWN','LEFT','RIGHT','FRONT','BACK']

class DeepLearningAgent:
    # epsilon initialized to 1 since we start random
    def __init__(self,player=Piece.WHITE,epsilon=1, lam=1.0):
        self.player = player
        self.epsilon = epsilon
        self.lam = lam
        self.model = initialize_model()
        
    """ Gets the immediate reward of taking an action. Since this is tiktaktoe, if the move
    resulted in a win, reward is 1, if a loss, reward is -1, if a tie, reward is 0.5, otherwise
    the game has not ended and the reward is 0.
    """
    def get_reward(self,board,move):
      c = copy.deepcopy(board)
      c.move(move)
      # Return 1 if the move wins
      if c.hasWon(self.player):
        return 1
      # Returns 0 if the move loses (opponent can win in one)
      elif c.getWinInOne(Piece.RED if self.player == Piece.WHITE else Piece.WHITE):
        return -1
      # Return 0 otherwise
      return 0
    
    def calculate_value():
    """ The value of a specific state is predicted by the model which has been trained
    on previously visisted states.
    """
    
    def calculate_target():
    """ The target (value estimate of state s) is calculated based on the bellman equation.
    The equation combines the immediate reward from the current state and the discounted
    value of the best next state. y is discount factor which determines the balance of caring about short 
    vs long term rewards. Higher value, more weight towards long term rewards. The bellman equation is recursive.
    """
        # target (value of state s) = (reward of state s) + y * (best q value of all possible actions from state s1)
        # (best q value of all possible actions from state s1) is calculated by passing s1 into the model

    
    def train_model():
    """ The model is learning the policy that the agent will use to move around the environment
    and choose the best action. Each time the agent decides on an action, the reward (aka the 
    target) for the state (s1) is calculated. Then using s1 as x and the reward as y, the model
    undergoes one iteration of stochastic gradient descent. The output of the model is basically 
    the models prediction for the q value of the best action. 
    """
        
        # Use bellman equation (calculate_target()) to get y value
        
        # train model using model.fit
        
    def choose_best_move():
        
        
    def play_move():
        
        # initialize empty replay_memory, as no moves have been made yet
        
        # if random number is less than epsilon, do random action
        
        # else use model to predict the best move (model.predict). This will return q value which u need to find
        # associated move for 
        
        
        # add move to replay_memory. Replay_memory is a list of tuples(state,action,reward,new state)
        
        # call train model to update model
        
        # update epsilon 
        


IndentationError: expected an indented block after function definition on line 9 (3076497746.py, line 10)

# Game Player

In [None]:
# from board import Board
# from piece import Piece
# from random_agent import RandomAgent

class GamePlayer:
    def __init__(self, player1= DeepLearningAgent(Piece.WHITE), player2= RandomAgent(Piece.RED)):
        self.player1 = player1
        self.player2 = player2
        self.board = Board()

#     def playGame(self):
#         while 1 == 1:
#             # Player 1 moves
#             (x1,y1,z1,dir1) = self.player1.getMove(self.board)
#             self.board.move(x1,y1,z1,dir1,Piece.WHITE)
#             print(self.board.getGameState())
#             if self.board.hasWon(Piece.WHITE):
#                 print('White wins!')
#                 return
#             # Player 2 moves
#             (x2,y2,z2,dir2) = self.player2.getMove(self.board)
#             self.board.move(x2,y2,z2,dir2,Piece.RED)
#             print(self.board.getGameState())
#             if self.board.hasWon(Piece.RED):
#                 print('Red wins!')
#                 return

        def 