<a href="https://colab.research.google.com/github/itsZENR/TicTacToe-RL/blob/main/TicTacToe_RL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Библиотеки
import numpy as np
import random
import tensorflow as tf

In [2]:
class TicTacToeEnvironment:
    """Two-player tic-tac-toe board with a step/reset style interface.

    Cells hold 0 (empty), 1 (first player) or -1 (second player). Player 1
    always moves first and the side to move flips after every ``step``.
    Rewards are always expressed from player 1's point of view.
    """

    def __init__(self, verbose=True):
        """Create an empty board with player 1 to move.

        Args:
            verbose: When true (the default), ``step`` prints the board after
                every move. Pass ``False`` to silence the trace.
        """
        self.verbose = verbose
        self.state = np.zeros((3, 3), dtype=int)
        self.current_player = 1
        self.done = False

    def get_state(self):
        """Return a copy of the 3x3 board so callers cannot mutate it."""
        return self.state.copy()

    def get_valid_actions(self):
        """Return the flat indices (0-8) of all empty cells."""
        return np.where(self.state.flatten() == 0)[0]

    def step(self, action):
        """Place the current player's mark and advance the game.

        Args:
            action: Flat cell index in ``range(9)`` (row-major).

        Returns:
            ``(state, reward, done)`` where ``state`` is a copy of the board
            after the move, ``reward`` is 1 if player 1 has won, -1 if
            player -1 has won and 0 otherwise (including a draw), and
            ``done`` tells whether the game is over.

        Raises:
            ValueError: If the game is already finished, the cell is
                occupied, or ``action`` is outside the board.
        """
        if self.done:
            raise ValueError("Game is already finished")

        i, j = np.unravel_index(action, (3, 3))
        if self.state[i, j] != 0:
            raise ValueError("Invalid action")

        self.state[i, j] = self.current_player
        if self.verbose:
            print(self.state)

        # get_winner() already reports a full board as 0, so "not None"
        # covers both a win and a draw.
        winner = self.get_winner()
        self.done = winner is not None

        # The winner value doubles as the reward: +1 / -1 for a win by the
        # respective player and 0 for a draw (a draw must not be scored as
        # a loss).
        reward = 0 if winner is None else winner
        self.current_player = -self.current_player

        return self.get_state(), reward, self.done

    def reset(self):
        """Clear the board and give the first move back to player 1."""
        self.state = np.zeros((3, 3), dtype=int)
        self.current_player = 1
        self.done = False

    def get_winner(self):
        """Report the outcome of the current position without side effects.

        Returns:
            1 or -1 if that player owns a full row, column or diagonal,
            0 if the board is full with no winner (draw), and ``None`` while
            the game is still in progress.
        """
        # Rows first, then columns, then the two diagonals.
        lines = [
            *self.state,
            *self.state.T,
            np.diag(self.state),
            np.diag(np.fliplr(self.state)),
        ]
        for line in lines:
            for player in (1, -1):
                if np.all(line == player):
                    return player

        # No winning line: a full board is a draw.
        if np.all(self.state != 0):
            return 0

        return None

In [3]:
# Q-function model: maps the flattened 3x3 board (9 inputs) to one Q-value
# per cell (9 outputs).
model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(9,)),
    # No activation: Q-values are unbounded real numbers.
    tf.keras.layers.Dense(9)
])

# The network is fitted to real-valued TD targets that can be negative, so a
# regression loss is required; binary cross-entropy expects probabilities in
# [0, 1] and is not meaningful for these targets.
model.compile(optimizer='Adam', loss=tf.keras.losses.MeanSquaredError())

In [4]:
# Action selection
def choose_action(state, epsilon):
    """Pick a move for ``state`` with an epsilon-greedy policy.

    Args:
        state: Board as a NumPy array with 9 cells; 0 marks an empty cell.
        epsilon: Probability of picking a uniformly random empty cell
            instead of the greedy one.

    Returns:
        Flat index (0-8) of an empty cell.

    Raises:
        ValueError: From ``np.random.choice`` if the exploration branch is
            taken and the board has no empty cell.

    The greedy branch queries the module-level ``model``.
    """
    # Derive legality from the state that was passed in, so the choice is
    # always consistent with the board the Q-values are computed for.
    valid_mask = state.flatten() == 0

    if np.random.random() < epsilon:
        # Explore: random empty cell.
        action = np.random.choice(np.where(valid_mask)[0])
    else:
        # Exploit: best Q-value among empty cells only. Occupied cells are
        # set to -inf; multiplying by the mask would score them as 0, which
        # wins the argmax whenever every legal Q-value is negative.
        q_values = model.predict(state.reshape(1, -1), verbose=0)
        action = np.argmax(np.where(valid_mask, q_values[0], -np.inf))
        print('q_values', q_values)
    return action

# Q-function update
def update_q_function(state, action, reward, next_state, alpha, gamma, done=False):
    """Run one Q-learning update on the module-level ``model``.

    Args:
        state: Board *before* the move, as a NumPy array with 9 cells.
        action: Flat index (0-8) of the cell that was played.
        reward: Reward received for the move.
        next_state: Board after the move; 0 marks an empty cell.
        alpha: Step size used to move Q(state, action) toward the TD target
            before the network is fitted.
        gamma: Discount factor applied to the best next-state Q-value.
        done: Whether ``next_state`` is terminal. Terminal transitions do
            not bootstrap from the next state.
    """
    state_input = state.reshape(1, -1)
    q_values = model.predict(state_input, verbose=0)

    next_valid = next_state.flatten() == 0
    if done or not next_valid.any():
        # No future return after a terminal position (a full board is
        # treated as terminal even when the caller does not pass ``done``).
        td_target = reward
    else:
        next_q_values = model.predict(next_state.reshape(1, -1), verbose=0)[0]
        # Only empty cells are playable, so only they may contribute to the
        # bootstrap value.
        td_target = reward + gamma * np.amax(next_q_values[next_valid])

    td_error = td_target - q_values[0][action]
    q_values[0][action] += alpha * td_error
    # Only the entry for ``action`` differs from the prediction, so the fit
    # leaves the other outputs (approximately) where they were.
    model.fit(state_input, q_values, verbose=0)

# Main training loop
num_episodes = 100
epsilon = 1.0
# NOTE(review): 0.9995 ** 100 is about 0.95, so with 100 episodes the agent
# explores almost all the time; confirm these values are intended.
epsilon_decay = 0.9995
alpha = 0.01
gamma = 0.99
env = TicTacToeEnvironment()

# NOTE(review): the same model picks moves for both players while the reward
# is positive only when player 1 wins, so player -1 is also steered toward
# player 1's success. Confirm whether proper self-play is wanted.
for i in range(num_episodes):
    print('-'*100)
    env.reset()
    # Work on snapshots: env.state is mutated in place by env.step(), so it
    # cannot serve as the "before" state of a transition.
    state = env.get_state()
    done = False
    while not done:
        action = choose_action(state, epsilon)
        print("action", action)
        next_state, reward, done = env.step(action)
        update_q_function(state, action, reward, next_state, alpha, gamma, done)
        state = next_state

    epsilon *= epsilon_decay

----------------------------------------------------------------------------------------------------
action 7
[[0 0 0]
 [0 0 0]
 [0 1 0]]
action 5
[[ 0  0  0]
 [ 0  0 -1]
 [ 0  1  0]]
action 6
[[ 0  0  0]
 [ 0  0 -1]
 [ 1  1  0]]
action 1
[[ 0 -1  0]
 [ 0  0 -1]
 [ 1  1  0]]
action 0
[[ 1 -1  0]
 [ 0  0 -1]
 [ 1  1  0]]
action 2
[[ 1 -1 -1]
 [ 0  0 -1]
 [ 1  1  0]]
action 8
[[ 1 -1 -1]
 [ 0  0 -1]
 [ 1  1  1]]
----------------------------------------------------------------------------------------------------
action 1
[[0 1 0]
 [0 0 0]
 [0 0 0]]
action 6
[[ 0  1  0]
 [ 0  0  0]
 [-1  0  0]]
action 5
[[ 0  1  0]
 [ 0  0  1]
 [-1  0  0]]
action 4
[[ 0  1  0]
 [ 0 -1  1]
 [-1  0  0]]
action 3
[[ 0  1  0]
 [ 1 -1  1]
 [-1  0  0]]
action 8
[[ 0  1  0]
 [ 1 -1  1]
 [-1  0 -1]]
action 0
[[ 1  1  0]
 [ 1 -1  1]
 [-1  0 -1]]
action 2
[[ 1  1 -1]
 [ 1 -1  1]
 [-1  0 -1]]
----------------------------------------------------------------------------------------------------
action 4
[[0 0 0]
 [0 1 0

KeyboardInterrupt: ignored