In [1]:
'''
Task to carry out: Let’s consider the example of Tic Tac Toe. Before, we saw how to solve it by
hand. Now that we have a formal structure and code in place, learn to play Tic Tac Toe as the
first player using the Q-learning algorithm. Start by defining the states and rewards. Then,
construct a Q matrix. Think about how you would update this matrix as you iterate over possible
states and actions. Now, run your training and testing loops and print your learned Q matrix.
Finally, show the testing part – that is, how, given a start state, you could get to a solution
or at least a reasonable end state.
'''

# import necessary library
import numpy as np

class TicTacToeQLearning:
    """Tabular Q-learning agent on a 3x3 board stored as a flat array of 9 cells.

    The Q-table is a dict keyed by ``(state, action)``, where ``state`` is the
    string produced by :meth:`get_state` and ``action`` is a cell index (0-8).
    Missing entries are treated as 0.

    Note: :meth:`train` only ever places the agent's own mark (``1``) on the
    board; no opponent moves are simulated. An episode ends as soon as three
    marked cells line up (reward 1) or no free cell remains (reward 0).
    """

    # Index triples of the three rows, three columns and two diagonals.
    WINNING_COMBINATIONS = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    def __init__(self, alpha=0.1, epsilon=0.3, seed=None, gamma=1.0):
        """Create an agent with an empty Q-table.

        Args:
            alpha: Learning rate applied to each temporal-difference update.
            epsilon: Probability of picking a uniformly random action.
            seed: Seed for the agent's private random generator; ``None``
                seeds it from OS entropy.
            gamma: Discount applied to the best next-state value. The default
                of 1.0 gives the undiscounted update.
        """
        self.q = {}
        self.alpha = alpha
        self.epsilon = epsilon
        self.gamma = gamma
        # A private legacy RandomState yields the same stream that
        # ``np.random.seed(seed)`` would, but leaves NumPy's global generator
        # (and any seed the caller set on it) untouched.
        self._rng = np.random.RandomState(seed)

    def get_state(self, board):
        """Return the Q-table key for ``board``: the ``str`` of its flat 9-cell view."""
        return str(board.reshape(9))

    def choose_action(self, state, actions):
        """Pick an action epsilon-greedily.

        Args:
            state: State key as returned by :meth:`get_state`.
            actions: Non-empty list of currently available cell indices.

        Returns:
            A random element of ``actions`` with probability ``epsilon``;
            otherwise the action with the highest Q-value (first one on ties).

        Raises:
            ValueError: If ``actions`` is empty.
        """
        if len(actions) == 0:
            raise ValueError("choose_action requires at least one available action")
        if self._rng.uniform(0, 1) <= self.epsilon:
            # ``choice`` returns a NumPy scalar; cast it so every Q-table key
            # uses plain ``int`` actions, matching the greedy branch.
            return int(self._rng.choice(actions))
        q_values = [self.q.get((state, action), 0) for action in actions]
        return actions[int(np.argmax(q_values))]

    def update_q_value(self, state, action, reward, next_state, next_actions):
        """Apply one Q-learning update to ``Q(state, action)``.

        The target is ``reward + gamma * max_a Q(next_state, a)`` taken over
        ``next_actions``; an empty ``next_actions`` contributes 0, which is how
        terminal transitions are expressed.
        """
        max_q_next = max(
            (self.q.get((next_state, a), 0) for a in next_actions), default=0
        )
        current_q = self.q.get((state, action), 0)
        td_target = reward + self.gamma * max_q_next
        self.q[(state, action)] = current_q + self.alpha * (td_target - current_q)

    def train(self, episodes):
        """Run ``episodes`` training episodes, updating the Q-table in place.

        Each episode starts from an empty board. At every step the agent marks
        one free cell with ``1`` and receives reward 1 if that completes a
        line, otherwise 0.
        """
        for _ in range(episodes):
            board = np.zeros(9)
            actions = list(range(9))
            done = False
            while not done:
                state = self.get_state(board)
                action = self.choose_action(state, actions)
                board[action] = 1
                actions.remove(action)
                won = self.check_winner(board)
                reward = 1 if won else 0
                done = won or len(actions) == 0
                next_state = self.get_state(board)
                # No move follows a terminal board, so do not bootstrap from
                # whatever cells happen to remain free.
                next_actions = [] if done else actions
                self.update_q_value(state, action, reward, next_state, next_actions)

    def check_winner(self, board):
        """Return True if any row, column or diagonal holds three equal non-zero cells."""
        return any(
            board[a] == board[b] == board[c] != 0
            for a, b, c in self.WINNING_COMBINATIONS
        )

In [2]:
if __name__ == "__main__":
    # Build a seeded agent so the learned table is repeatable between runs.
    agent = TicTacToeQLearning(seed=1234)
    agent.train(episodes=1000)

    # Report how many (state, action) pairs were visited, then the full table.
    print("Training complete. Q size:", len(agent.q))
    print("Q:\n", agent.q)

Training complete. Q size: 314
Q:
 {('[0. 0. 0. 0. 0. 0. 0. 0. 0.]', 6): 0.42779492172774736, ('[0. 0. 0. 0. 0. 0. 1. 0. 0.]', 0): 0.6257771046327505, ('[1. 0. 0. 0. 0. 0. 1. 0. 0.]', 1): 0.8524015497823421, ('[1. 1. 0. 0. 0. 0. 1. 0. 0.]', 2): 0.9774716004550608, ('[0. 0. 0. 0. 0. 0. 0. 0. 0.]', 0): 0.9999999999999987, ('[1. 0. 0. 0. 0. 0. 0. 0. 0.]', 7): 0.647419874771055, ('[1. 0. 0. 0. 0. 0. 0. 1. 0.]', 1): 0.904634579932763, ('[1. 1. 0. 0. 0. 0. 0. 1. 0.]', 2): 0.9903022627021246, ('[1. 0. 0. 0. 0. 0. 0. 0. 0.]', 1): 0.9999999999999991, ('[1. 1. 0. 0. 0. 0. 0. 0. 0.]', 2): 0.9999999999999996, ('[1. 1. 0. 0. 0. 0. 0. 0. 0.]', 4): 0.8456199621747019, ('[1. 1. 0. 0. 1. 0. 0. 0. 0.]', 6): 0.20505073942342003, ('[1. 1. 0. 0. 1. 0. 1. 0. 0.]', 2): 0.814697981114816, ('[1. 0. 0. 0. 0. 0. 0. 0. 0.]', 4): 0.5775614956694106, ('[1. 0. 0. 0. 1. 0. 0. 0. 0.]', 6): 0.008900200000000002, ('[1. 0. 0. 0. 1. 0. 1. 0. 0.]', 1): 0.2781702357625352, ('[0. 0. 0. 0. 0. 0. 0. 0. 0.]', 5): 0.241892662852