In [None]:
import numpy as np
import random

class TicTacToe:
    """Minimal 3x3 tic-tac-toe environment.

    The board is a 3x3 integer array in which 0 marks an empty cell and any
    other value is the mark of the player who moved there (the callers in
    this file use 1 and -1).
    """

    def __init__(self):
        self.board = np.zeros((3, 3), dtype=int)

    def reset(self):
        """Clear the board and return the resulting all-zero state."""
        self.board = np.zeros((3, 3), dtype=int)
        return self.get_state()

    def get_state(self):
        """Return the board as a hashable 9-tuple in row-major order.

        The entries are plain Python ints so the tuple can be used directly
        as (part of) a dictionary key.
        """
        return tuple(self.board.flatten().tolist())

    def is_winner(self, player):
        """Return True if *player* fills a whole row, column or diagonal."""
        owned = self.board == player
        if owned.all(axis=1).any() or owned.all(axis=0).any():
            return True
        main_diag = np.diag(owned)
        anti_diag = np.diag(np.fliplr(owned))
        return bool(main_diag.all() or anti_diag.all())

    def is_draw(self):
        """Return True if no empty cell is left.

        This only checks that the board is full; ``step`` tests for a winner
        first, so a winning last move is still reported as a win.
        """
        return bool(np.all(self.board != 0))

    def available_actions(self):
        """Return the ``(row, col)`` coordinates of all empty cells."""
        return [(i, j) for i in range(3) for j in range(3) if self.board[i, j] == 0]

    def step(self, action, player):
        """Place *player*'s mark at *action* and report the outcome.

        Args:
            action: ``(row, col)`` of the cell to mark.
            player: Non-zero mark to write into the cell.

        Returns:
            A ``(state, reward, done)`` tuple: reward 1 and ``done=True`` if
            the move wins, reward 0 and ``done=True`` if it fills the board
            without a win, otherwise reward -0.1 and ``done=False``.

        Raises:
            ValueError: If the target cell is already occupied.
            IndexError: If *action* lies outside the board.
        """
        row, col = action
        # Refuse illegal moves instead of silently overwriting a mark, which
        # would corrupt the game and every state derived from it.
        if self.board[row, col] != 0:
            raise ValueError(f"cell ({row}, {col}) is already occupied")
        self.board[row, col] = player

        if self.is_winner(player):
            return self.get_state(), 1, True  # Win reward
        if self.is_draw():
            return self.get_state(), 0, True  # Draw reward
        return self.get_state(), -0.1, False  # Slight penalty to continue the game

# Q-learning Agent
class QLearningAgent:
    """Tabular Q-learning agent that acts epsilon-greedily.

    Q-values are kept in a dict keyed by ``(state, action)``; any pair that
    has never been updated is read as 0.0.
    """

    def __init__(self, epsilon=0.1, alpha=0.5, gamma=0.9):
        self.epsilon = epsilon  # probability of picking a random action
        self.alpha = alpha      # step size of each Q-value update
        self.gamma = gamma      # weight of the bootstrapped next-state value
        self.q_table = {}

    def get_q_value(self, state, action):
        """Return the stored Q-value for ``(state, action)``, or 0.0."""
        key = (state, action)
        return self.q_table.get(key, 0.0)

    def choose_action(self, state, available_actions):
        """Pick an action from *available_actions* for *state*.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the largest Q-value, the earliest one in
        *available_actions* winning ties.
        """
        explore = random.uniform(0, 1) < self.epsilon
        if explore:
            return random.choice(available_actions)

        def value_of(candidate):
            return self.get_q_value(state, candidate)

        # max() keeps the first maximal item, i.e. the earliest listed action.
        return max(available_actions, key=value_of)

    def update_q_table(self, state, action, reward, next_state, done, available_actions):
        """Move Q(state, action) towards the one-step Q-learning target.

        The target is *reward* alone when *done* is true, otherwise *reward*
        plus ``gamma`` times the largest Q-value of *next_state* over
        *available_actions* (0 if that collection is empty).
        """
        next_values = (self.get_q_value(next_state, a) for a in available_actions)
        bootstrap = max(next_values, default=0)
        current = self.get_q_value(state, action)

        # Terminal transitions have no future return to bootstrap from.
        target = reward if done else reward + self.gamma * bootstrap
        self.q_table[(state, action)] = current + self.alpha * (target - current)

# Training the Agent
def train(agent, episodes=1000):
    """Train *agent* as player 1 against a uniformly random opponent.

    Each episode starts from an empty board. The agent moves, the opponent
    answers with a random legal move (unless the agent's move ended the
    game), and the agent's Q-table is updated once per agent move using the
    state the agent will face next.

    Args:
        agent: Object providing ``choose_action(state, available_actions)``
            and ``update_q_table(state, action, reward, next_state, done,
            available_actions)``.
        episodes: Number of games to play.
    """
    env = TicTacToe()
    player = 1  # Agent plays as '1'

    for _ in range(episodes):
        state = env.reset()
        done = False

        while not done:
            action = agent.choose_action(state, env.available_actions())
            next_state, reward, done = env.step(action, player)

            if not done:
                # Opponent plays randomly
                opponent_action = random.choice(env.available_actions())
                next_state, opponent_reward, done = env.step(opponent_action, -player)
                if done:
                    # The opponent's reply ended the game: an opponent win
                    # must be credited to the agent's move as a loss,
                    # otherwise the agent never learns to block.
                    reward = -1 if opponent_reward == 1 else 0

            # Bootstrap from the moves that are legal in next_state, not from
            # the list that was valid before the agent and opponent moved.
            agent.update_q_table(
                state, action, reward, next_state, done, env.available_actions()
            )
            state = next_state

# Testing the Agent
def _read_human_move(legal_moves):
    """Prompt until the user enters a ``row,col`` pair contained in *legal_moves*."""
    while True:
        raw = input("Enter row, column (0-2 for both): ")
        try:
            row, col = (int(part) for part in raw.split(','))
        except ValueError:
            # Raised for non-numeric text and for the wrong number of fields.
            print("Invalid input. Enter two integers separated by a comma, e.g. 1,2")
            continue
        if (row, col) in legal_moves:
            return (row, col)
        print("That cell is not available. Pick an empty cell with row and column in 0-2.")


def play(agent):
    """Play one interactive game: *agent* is player 1, the human is player -1.

    The board is printed before every turn and once more at the end. The
    agent acts greedily for the duration of the game (its ``epsilon`` is set
    to 0 and restored afterwards) so that it does not make random
    exploratory moves. Human input is re-prompted until it names an empty
    cell.
    """
    env = TicTacToe()
    state = env.reset()
    done = False
    player = 1  # Agent plays as '1'

    saved_epsilon = agent.epsilon
    agent.epsilon = 0  # evaluation: always exploit the learned Q-values
    try:
        while not done:
            print("Current Board:")
            print(env.board)

            if player == 1:
                action = agent.choose_action(state, env.available_actions())
                state, reward, done = env.step(action, player)
                if done:
                    # A finished game reports reward 1 for a win, 0 for a draw.
                    print("Agent Wins!" if reward == 1 else "It's a Draw!")
            else:
                # Human player input
                print("Your Move:")
                action = _read_human_move(env.available_actions())
                state, reward, done = env.step(action, player)
                if done:
                    print("You Win!" if reward == 1 else "It's a Draw!")

            player = -player  # Switch player
    finally:
        agent.epsilon = saved_epsilon

    print("Final Board:")
    print(env.board)

# Initialize and train the agent
# Build an agent with the default hyper-parameters, let it play 10,000
# training games, then start one interactive game against it.
agent = QLearningAgent()
train(agent, 10_000)
play(agent)


Current Board:
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Current Board:
[[0 0 1]
 [0 0 0]
 [0 0 0]]
Your Move:
Current Board:
[[-1  0  1]
 [ 0  0  0]
 [ 0  0  0]]
Current Board:
[[-1  0  1]
 [ 0  0  0]
 [ 0  1  0]]
Your Move:
Current Board:
[[-1  0  1]
 [ 0  0  0]
 [ 0  1 -1]]
Current Board:
[[-1  1  1]
 [ 0  0  0]
 [ 0  1 -1]]
Your Move:
You Win!
Final Board:
[[-1  1  1]
 [ 0 -1  0]
 [ 0  1 -1]]
