## <font color='darkblue'><b>Tic Tac Toe Implementation - Part 1</b></font>
([course link](https://www.udemy.com/course/ai-and-combinatorial-optimization-with-meta-heuristics/learn/lecture/31160004#overview))
Prepare the classes <font color='blue'><b>Player</b></font> and <font color='blue'><b>HumanPlayer</b></font>.

In [45]:
import logging
import random

# Marker stored in a board cell that has not been played yet.
BLANK = ' '
# Ticker (board symbol) that TicTacToe.play assigns to player1.
AI_PLAYER = 'X'
# Ticker that TicTacToe.play assigns to player2.
HUMAN_PLAYER = 'O'
# NOTE(review): not referenced anywhere in the visible code; the training
# cell loops over a hard-coded 10000 games instead - confirm intent.
TRAINING_EPOCHS = 40000
# Exploration probability assigned to both AI players in the training cell.
TRAINING_EPSILON = .4
# Values handed to Player.reward() by TicTacToe.play when a game ends.
REWARD_WIN = 10
REWARD_LOSE = -100
REWARD_TIE = 0

class Player:
  def __init__(self, name: str | None = None):
    self._name = name or self.__class__.__name__
    
  @property
  def name(self):
    return self._name
  
  @classmethod
  def show_board(cls, board):
    temp_board = board[:]
    for i in range(1, 10):
      if temp_board[i-1] == BLANK:
        temp_board[i-1] = str(i)
        
    print('|'.join(temp_board[0:3]))
    print('|'.join(temp_board[3:6]))
    print('|'.join(temp_board[6:9]))
    
    
class HumanPlayer(Player):
  """Interactive player whose moves are typed in on standard input."""

  def reward(self, value, board):
    """Human side won't do learning, so the reward is ignored."""

  def make_move(self, board, ticker):
    """Prompts until the user picks a free cell and returns its index.

    Args:
      board: Current board as a sequence of 9 cells.
      ticker: Symbol the human plays with; only shown in the prompt.

    Returns:
      The 0-based index (0-8) of a cell that currently holds BLANK.
    """
    while True:
      self.show_board(board)
      answer = input(f'Your next move with ticker={ticker} (cell index 1-9): ')
      try:
        cell = int(answer) - 1
      except ValueError:
        # Non-numeric input: map it to an out-of-range index so that it is
        # rejected by the same check as any other illegal move.
        cell = -1

      # The cell must exist AND be free; the game loop writes the returned
      # index into the board without checking, so an occupied cell accepted
      # here would silently overwrite the opponent's mark.
      if 0 <= cell < 9 and board[cell] == BLANK:
        return cell

      print('Invalid move! Try again...\n')

## <font color='darkblue'><b>Tic Tac Toe Implementation - Part 2, 3, 4</b></font>
([course link1](https://www.udemy.com/course/ai-and-combinatorial-optimization-with-meta-heuristics/learn/lecture/31160010#overview), [course link2](https://www.udemy.com/course/ai-and-combinatorial-optimization-with-meta-heuristics/learn/lecture/31160014#overview), [course link3](https://www.udemy.com/course/ai-and-combinatorial-optimization-with-meta-heuristics/learn/lecture/31160016#overview))

In [46]:
class AIPlayer(Player):
  """Player that picks moves from a tabular Q function and learns from rewards.

  Update rule applied in `reward`:
    Q(s, a) = Q(s, a) + α [R(s, a) + γ max Q(s', a') - Q(s, a)]
  """

  def __init__(
    self, name=None, epsilon=0.4, alpha=0.3, gamma=0.9, default_q=1):
    """Initializes hyperparameters and an empty Q table.

    Args:
      name: Display name; falls back to the class name when falsy.
      epsilon: Probability of picking a random (exploration) move.
      alpha: Learning rate.
      gamma: Discount parameter for future reward.
      default_q: Q value assumed for a (state, action) pair not seen yet.
    """
    super().__init__(name)

    # Epsilon parameter as the probability of exploration.
    self.epsilon = epsilon

    # Learning rate
    self.alpha = alpha

    # Discount parameter for future reward.
    self.gamma = gamma

    # If the given move at the given state is not defined yet,
    # we have a default Q value.
    self.default_q = default_q

    # Q(s, a) function is a dict in this implementation, keyed by
    # (state, action) where state is the board as a tuple.
    self.q = {}

    # Previous move during the game (None until the first move is made).
    self.move = None

    # Board as it was right before the previous move.
    self.board = (BLANK, ) * 9

  def available_moves(self, board):
    """Gets the indices of the empty cells of the given board."""
    return [i for i in range(9) if board[i] == BLANK]

  def get_q(self, state, action):
    """Gets the Q value of a (state, action) pair.

    An unseen pair is stored with `default_q` as a side effect, so the table
    grows on lookup.
    """
    return self.q.setdefault((state, action), self.default_q)

  def make_move(self, board, ticker):
    """Chooses a cell, remembers the (board, move) pair and returns the move.

    With epsilon probability the move is selected randomly (exploration),
    otherwise the action with the highest Q value is picked (exploitation).
    `ticker` is accepted for interface compatibility and not used here.

    Returns:
      The 0-based index of the chosen empty cell.
    """
    self.board = tuple(board)
    actions = self.available_moves(board)

    # Action with epsilon probability
    if random.random() < self.epsilon:
      # This is an exploration move
      self.move = random.choice(actions)
      return self.move

    # Take the action with highest Q value
    q_values = [self.get_q(self.board, a) for a in actions]
    max_q_value = max(q_values)
    best_actions = [a for a, q in zip(actions, q_values) if q == max_q_value]

    # If multiple best actions, choose one at random; a single best action
    # is returned directly so no random number is drawn for it.
    if len(best_actions) > 1:
      self.move = random.choice(best_actions)
    else:
      self.move = best_actions[0]
    return self.move

  def reward(self, reward, board):
    """Updates the Q value of the last (board, move) pair.

    Args:
      reward: Numeric reward R(s, a) for the last move.
      board: Board after the move, i.e. the successor state s'.
    """
    # Compare against None explicitly: cell index 0 is a valid move but is
    # falsy, and a truthiness test would skip the update for it.
    if self.move is None:
      return

    next_state = tuple(board)
    prev_q = self.get_q(self.board, self.move)

    # max Q(s', a') ranges over the actions that are legal in s'. A full
    # board has no legal action left, so the future term is 0 there.
    max_next_q = max(
      (self.get_q(next_state, a) for a in self.available_moves(next_state)),
      default=0,
    )

    self.q[(self.board, self.move)] = (
      prev_q + self.alpha * (
        reward + self.gamma * max_next_q - prev_q
      )
    )

## <font color='darkblue'>Tic Tac Toe Implementation - Part 5, 6</font>
([course link 1](https://www.udemy.com/course/ai-and-combinatorial-optimization-with-meta-heuristics/learn/lecture/31160018#overview), [course link 2](https://www.udemy.com/course/ai-and-combinatorial-optimization-with-meta-heuristics/learn/lecture/31160020#overview))

In [47]:
class TicTacToe:
  """A single game of tic-tac-toe between two player objects.

  Both players must provide `make_move(board, ticker)`,
  `reward(value, board)`, `show_board(board)` and a `name` attribute.
  player1 always plays AI_PLAYER and player2 always plays HUMAN_PLAYER.
  """

  # Index triples of the three rows, three columns and two diagonals.
  _WIN_LINES = (
    (0, 1, 2), (3, 4, 5), (6, 7, 8),
    (0, 3, 6), (1, 4, 7), (2, 5, 8),
    (0, 4, 8), (2, 4, 6),
  )

  def __init__(self, player1, player2):
    """Stores the players, picks the starting side at random, clears the board."""
    self.player1 = player1
    self.player2 = player2
    self.first_player_turn = random.choice([True, False])
    self.board = [BLANK] * 9

  def is_game_over(self, player_tickers):
    """Reports whether the game has ended and who won.

    Args:
      player_tickers: Iterable of tickers to check for a completed line,
        in order of priority.

    Returns:
      (True, ticker) when `ticker` owns a full row, column or diagonal,
      (True, None) when nobody won and no blank cell is left (draw),
      (False, None) otherwise.
    """
    # Check EVERY ticker for a win before deciding draw / not finished;
    # deciding inside the loop would leave all but the first ticker unchecked.
    for ticker in player_tickers:
      for line in self._WIN_LINES:
        if all(self.board[cell] == ticker for cell in line):
          return True, ticker

    # Nobody won: the game is over only when the board is full.
    return self.board.count(BLANK) == 0, None

  def play(self):
    """Runs the game loop until a win or a draw, then rewards both players."""
    while True:
      if self.first_player_turn:
        player, other_player = self.player1, self.player2
        player_tickers = (AI_PLAYER, HUMAN_PLAYER)
      else:
        player, other_player = self.player2, self.player1
        player_tickers = (HUMAN_PLAYER, AI_PLAYER)

      # Ask the player on turn for a move and place its ticker there.
      move = player.make_move(self.board, player_tickers[0])
      self.board[move] = player_tickers[0]

      # Check the state of the game (win, lose or draw)
      game_over, winner = self.is_game_over(player_tickers)

      # Game is over: show the final board, announce and hand out rewards.
      # Players receive copies of the board so they cannot alter the game's.
      if game_over:
        player.show_board(self.board[:])
        if winner == player_tickers[0]:
          print(f'=== {player.name} === ({winner}) won!\n\n')
          player.reward(REWARD_WIN, self.board[:])
          other_player.reward(REWARD_LOSE, self.board[:])
        elif winner == player_tickers[1]:
          print(f'=== {other_player.name} === ({winner}) won!\n\n')
          other_player.reward(REWARD_WIN, self.board[:])
          player.reward(REWARD_LOSE, self.board[:])
        else:
          print('Tie!\n\n')
          other_player.reward(REWARD_TIE, self.board[:])
          player.reward(REWARD_TIE, self.board[:])

        break

      # Switch to next player to make move
      self.first_player_turn = not self.first_player_turn

## <font color='darkblue'>Tic Tac Toe Implementation - Part 7</font>
([course link](https://www.udemy.com/course/ai-and-combinatorial-optimization-with-meta-heuristics/learn/lecture/31160024#content))

### <font color='darkgreen'>Before training</font>

In [58]:
# Create two AIPlayer instances; only the name is passed, so epsilon, alpha,
# gamma and default_q keep the constructor defaults.
ai_player_1 = AIPlayer('ai_player1')
ai_player_2 = AIPlayer('ai_player2')

In [59]:
# epsilon=0 means no exploration - it will use the Q(s,a) function to make the moves
ai_player_1.epsilon = 0
# No name is given, so the player's name defaults to its class name.
human_player = HumanPlayer()

In [60]:
# Play one interactive game: ai_player_1 is passed as player1 and
# human_player as player2.
game = TicTacToe(ai_player_1, human_player)
game.play()

1|2|3
4|X|6
7|8|9
Your next move with ticker=O (cell index 1-9): 9
1|X|3
4|X|6
7|8|O
Your next move with ticker=O (cell index 1-9): 8
1|X|X
4|X|6
7|O|O
Your next move with ticker=O (cell index 1-9): 7
1|X|X
4|X|6
O|O|O
=== HumanPlayer === (O) won!




### <font color='darkgreen'>After training</font>

In [62]:
%%time
print('Training the AI player(s)...')
# Both AI players use the same exploration probability while playing each other.
ai_player_1.epsilon = ai_player_2.epsilon = TRAINING_EPSILON
# NOTE(review): the number of games is hard-coded to 10000 although
# TRAINING_EPOCHS (40000) is defined earlier in the file - confirm which
# value is intended.
for _ in range(10000):
  # Each iteration builds a new TicTacToe (fresh board) around the same two
  # player objects, so their Q tables persist from game to game.
  game = TicTacToe(ai_player_1, ai_player_2)
  game.play()

print('\nTraining is Done')

Training the AI player(s)...
X|O|3
X|5|O
X|8|O
=== ai_player1 === (X) won!


O|X|X
4|X|6
O|X|O
=== ai_player1 === (X) won!


O|O|O
X|O|X
X|X|O
=== ai_player2 === (O) won!


O|X|X
X|O|O
X|O|X
Tie!


O|2|3
O|X|6
O|8|X
=== ai_player2 === (O) won!


O|X|3
4|O|6
X|8|O
=== ai_player2 === (O) won!


1|X|O
4|O|6
O|8|X
=== ai_player2 === (O) won!


X|X|X
4|O|6
7|O|9
=== ai_player1 === (X) won!


X|O|O
4|X|O
7|8|X
=== ai_player1 === (X) won!


1|X|3
O|O|O
X|X|9
=== ai_player2 === (O) won!


1|2|O
X|X|O
O|X|O
=== ai_player2 === (O) won!


O|2|X
O|X|6
X|X|O
=== ai_player1 === (X) won!


X|2|O
X|O|6
O|8|9
=== ai_player2 === (O) won!


O|O|X
4|O|X
X|X|O
=== ai_player2 === (O) won!


O|2|X
O|X|O
X|X|O
=== ai_player1 === (X) won!


X|O|O
X|X|X
7|O|O
=== ai_player1 === (X) won!


O|X|3
4|X|6
O|X|9
=== ai_player1 === (X) won!


X|O|O
O|O|X
X|X|O
Tie!


1|2|O
X|X|O
O|X|O
=== ai_player2 === (O) won!


O|O|3
O|X|X
O|X|9
=== ai_player2 === (O) won!


O|O|O
O|X|X
X|8|X
=== ai_player2 === (O) won!


O|O|X
X|O

X|X|X
O|X|O
=== ai_player1 === (X) won!


O|O|X
4|X|6
X|8|9
=== ai_player1 === (X) won!


X|O|X
O|O|X
X|X|O
Tie!


X|O|X
X|O|O
O|X|O
Tie!


O|O|3
X|X|X
7|8|9
=== ai_player1 === (X) won!


O|X|O
X|X|O
O|O|X
Tie!


X|2|X
4|5|X
O|O|O
=== ai_player2 === (O) won!


1|O|O
4|5|6
X|X|X
=== ai_player1 === (X) won!


X|2|O
4|5|O
X|8|O
=== ai_player2 === (O) won!


O|2|O
4|X|O
X|X|X
=== ai_player1 === (X) won!


O|X|O
O|X|O
X|O|X
Tie!


X|X|3
X|O|6
X|O|O
=== ai_player1 === (X) won!


O|X|X
X|X|O
O|O|X
Tie!


X|2|O
4|O|6
O|X|9
=== ai_player2 === (O) won!


1|X|X
O|O|O
7|O|X
=== ai_player2 === (O) won!


X|O|X
4|O|6
7|O|9
=== ai_player2 === (O) won!


O|X|X
O|O|6
X|8|O
=== ai_player2 === (O) won!


X|X|O
4|O|6
O|8|X
=== ai_player2 === (O) won!


O|2|3
O|X|X
O|8|X
=== ai_player2 === (O) won!


X|X|X
X|O|6
7|O|O
=== ai_player1 === (X) won!


X|O|O
O|O|X
O|X|X
=== ai_player2 === (O) won!


X|X|O
4|O|6
O|8|X
=== ai_player2 === (O) won!


X|X|X
X|O|6
7|O|O
=== ai_player1 === (X) won!


X|O|3
X|O|6
X|8|O

Let the human player play against the trained AI player:

In [63]:
# epsilon=0 means no exploration - it will use the Q(s,a) function to make the moves
# (this replaces the TRAINING_EPSILON value assigned in the training cell).
ai_player_1.epsilon = 0
# No name is given, so the player's name defaults to its class name.
human_player = HumanPlayer()

In [64]:
# Play one interactive game: ai_player_1 is passed as player1 and
# human_player as player2.
game = TicTacToe(ai_player_1, human_player)
game.play()

1|2|3
4|5|6
7|X|9
Your next move with ticker=O (cell index 1-9): 5
X|2|3
4|O|6
7|X|9
Your next move with ticker=O (cell index 1-9): 7
X|2|X
4|O|6
O|X|9
Your next move with ticker=O (cell index 1-9): 6
X|X|X
4|O|O
O|X|9
=== ai_player1 === (X) won!


