In [1]:
import numpy as np
import pandas as pd

In [2]:
# Helper functions

def generate_boards(board, player):
    """Recursively play out every continuation of ``board``.

    Whenever a position is finished (``is_game_over``), its flattened cells
    are appended to the module-level ``boards`` list and its winner (as
    reported by ``get_winner``) to the module-level ``outcomes`` list.

    Parameters
    ----------
    board : array indexed as ``board[row, col]`` over a 3x3 grid
        Current position; cells equal to 0 are treated as empty.
    player : int
        Mark written into the next empty cell. The recursive call passes
        ``3 - player``, so marks 1 and 2 alternate.

    Returns
    -------
    None
    """
    if is_game_over(board):
        boards.append(board.flatten())
        outcomes.append(get_winner(board))
        return

    # Visit cells in row-major order: (0, 0), (0, 1), ..., (2, 2).
    for row, col in np.ndindex(3, 3):
        if board[row, col] != 0:
            continue  # occupied cell, not a legal move
        child = np.copy(board)
        child[row, col] = player
        generate_boards(child, 3 - player)

def is_game_over(board):
    """Tell whether ``board`` is finished.

    A position is finished when ``get_winner`` reports a non-zero winner or
    when all nine cells are non-zero.
    """
    has_winner = get_winner(board) != 0
    if has_winner:
        return has_winner
    return np.count_nonzero(board) == 9

def get_winner(board):
    """Return the mark that fills a complete line on ``board``, or 0.

    Lines are examined in a fixed order: the three rows top to bottom, the
    three columns left to right, the main diagonal, then the anti-diagonal.
    The mark of the first line whose three cells are equal and non-zero is
    returned; if no such line exists the result is 0.

    Parameters
    ----------
    board : array indexed as ``board[row, col]`` over a 3x3 grid
    """
    # Build the eight candidate lines in the order they must be checked.
    lines = [[(r, 0), (r, 1), (r, 2)] for r in range(3)]
    lines += [[(0, c), (1, c), (2, c)] for c in range(3)]
    lines.append([(0, 0), (1, 1), (2, 2)])
    lines.append([(0, 2), (1, 1), (2, 0)])

    for first, second, third in lines:
        mark = board[first]
        if mark != 0 and mark == board[second] and board[second] == board[third]:
            return mark

    return 0


In [3]:
# Containers that generate_boards() appends to: flattened finished boards
# and, at the same position, the winner of each one
boards, outcomes = [], []

# Q-learning parameters
alpha = 0.5    # Learning rate
gamma = 0.9    # Discount factor
epsilon = 0.1  # Exploration rate

# Q-table: one row per state (3**9 = 19683 possible states), one column per action (9 cells)
q_table = np.zeros((3 ** 9, 9))


In [4]:
# Q-learning update
def update_q_table(state, action, reward, next_state):
    """Apply one Q-learning step to the module-level ``q_table`` in place.

    The entry ``q_table[state, action]`` is moved towards
    ``reward + gamma * max(q_table[next_state])`` by a fraction ``alpha``.
    ``q_table``, ``alpha`` and ``gamma`` are read from module scope.

    Parameters
    ----------
    state, action : int
        Row and column of the entry to update.
    reward : number
        Immediate reward observed for taking ``action`` in ``state``.
    next_state : int
        Row whose largest value is used as the bootstrap estimate.
    """
    old_estimate = q_table[state, action]
    best_future = np.max(q_table[next_state])
    td_target = reward + gamma * best_future
    q_table[state, action] = old_estimate + alpha * (td_target - old_estimate)


In [5]:
# Enumerate the game tree from the empty board; every finished game appends
# its final position to `boards` and its winner to `outcomes`.
generate_boards(np.zeros((3, 3)), 1)

# Train the Q-learning agent (player 1) against a uniformly random opponent
# (player 2).
#
# `boards` only contains finished games, so it cannot be used to look up the
# position the agent is currently in. Each episode therefore simulates its own
# board, and the board is encoded as a base-3 number (one digit per cell:
# 0 = empty, 1 = agent, 2 = opponent) to obtain the q_table row. The encoding
# ranges over 0 .. 3**9 - 1, matching the number of rows in `q_table`, and the
# empty board encodes to 0.
cell_weights = 3 ** np.arange(9)  # place value of each flattened cell

for _ in range(10000):  # Number of RL agent episodes
    board = np.zeros(9, dtype=int)  # Flattened 3x3 board, all cells empty
    state = 0  # Starting state: encoding of the empty board

    while True:
        free_cells = np.flatnonzero(board == 0)

        # Epsilon-greedy exploration/exploitation, restricted to empty cells
        # so the agent can never select an occupied square.
        if np.random.uniform(0, 1) < epsilon:
            action = np.random.choice(free_cells)
        else:
            action = free_cells[np.argmax(q_table[state, free_cells])]

        board[action] = 1

        # The opponent replies only if the agent's move left the game open.
        # After the reply the agent is to move again, so `next_state` is
        # always either a finished game or an agent decision point.
        if not is_game_over(board.reshape(3, 3)):
            board[np.random.choice(np.flatnonzero(board == 0))] = 2

        winner = get_winner(board.reshape(3, 3))
        next_state = int(board @ cell_weights)

        # Reward from the agent's point of view: +1 win, -1 loss, 0 otherwise
        # (draw or game still running).
        reward = {1: 1, 2: -1}.get(int(winner), 0)

        # Rows of finished positions are never updated and stay at zero, so
        # the bootstrap term vanishes on the last step of an episode.
        update_q_table(state, action, reward, next_state)

        if is_game_over(board.reshape(3, 3)):
            break

        state = next_state


In [6]:
# Assemble the dataset: nine board-cell columns followed by the outcome column
dataset = pd.DataFrame(
    np.hstack((np.array(boards), np.array(outcomes).reshape(-1, 1))),
    columns=[str(cell) for cell in range(1, 10)] + ['Outcome'],
)

# Write the dataset to a CSV file, leaving out the row index
dataset.to_csv('tic_tac_toe_dataset.csv', index=False)
