In [None]:
import numpy as np
from tic_env import TictactoeEnv, OptimalPlayer

# Tic Tac Toe environment

Our first game is the famous Tic Tac Toe. You can read about the game and its rules here: https://en.wikipedia.org/wiki/Tic-tac-toe

We implemented the game as an environment in the style of games in the [Python GYM library](https://gym.openai.com/). The commented source code is available in the file "tic_env.py". Here, we give a brief introduction to the environment and how it can be used.

### Initialization and attributes

You can initialize the environment / game as follows:

In [None]:
# Create the game environment; this single `env` instance is reused by the cells below.
env = TictactoeEnv()

The environment then has the following attributes, shown here with their initial values:

In [None]:
# Dump every instance attribute of the environment together with its current value.
env.__dict__

The game is played by two players: player 'X' and player 'O'. The attribute 'current_player' shows whose turn it is. We assume that player 'X' always plays first.

The attribute 'grid' is a 3x3 numpy array that represents the board in the real game and the state $s_t$ in reinforcement learning language. Each element can take a value in {0, 1, -1}:
     0 : place unmarked
     1 : place marked with X 
    -1 : place marked with O 
        
The attribute 'end' shows if the game is over or not, and the attribute 'winner' shows the winner of the game: either "X", "O", or None.  

You can use function 'render' to visualize the current position of the board:

In [None]:
# Visualize the current board (no move has been played on `env` yet at this point).
env.render()

### Taking actions

The game environment receives actions from the two players in turn and updates the grid. At each time step, one player takes the action $a_t$, where $a_t$ can be either an integer between 0 and 8 or a tuple, corresponding to one of the 9 possible positions on the board.

Function 'step' is used to receive the action of the player and update the grid:

In [None]:
# First move of the game, so it is played by 'X' (X always starts). Action given in integer form (0-8).
env.step(2)

In [None]:
# Show the board after X's first move.
env.render()

In [None]:
# Inspect the attributes again to see what the move changed (e.g. 'grid' and 'current_player').
env.__dict__

In [None]:
# Second move of the game, so it is played by 'O'. This time the action is given in tuple form.
env.step((1,1))

In [None]:
# Show the board after O's reply.
env.render()

In [None]:
# Attributes after two moves have been played (one by each player).
env.__dict__

But not all actions are available at each time: One cannot choose a place which has been taken before. There is an error if an unavailable action is taken:

In [None]:
# Left commented out on purpose so that "Run All" does not stop at this cell:
# per the text above, this position is already taken (presumably (0, 2) is the
# tuple form of the integer action 2 played earlier), so the call is expected to raise.
# env.step((0,2))

### Reward

Reward is always 0 until the end of the game. When the game is over, the reward is 1 if you win the game, -1 if you lose, and 0 otherwise (a draw). Function 'observe' can be used after each step to receive the new state $s_t$, whether the game is over, and the winner; function 'reward' returns the reward value $r_t$:

In [None]:
# Returns three values: the grid, whether the game has ended, and the winner.
env.observe()

In [None]:
# Reward from X's point of view. Only two moves have been played, so by the rule stated above it should be 0.
env.reward(player='X')

In [None]:
# Reward from O's point of view. Only two moves have been played, so by the rule stated above it should be 0.
env.reward(player='O')

An example of finishing the game:

In [None]:
# Continue the game started above (X played 2, O played (1,1)); players alternate, so:
env.step(0)  # X
env.step(3)  # O
env.step(1)  # X now holds 0, 1 and 2 -- presumably a complete row; confirm with render() below

In [None]:
# Board after the three additional moves.
env.render()

In [None]:
# Same three values as before (grid, end flag, winner), now for the finished game.
env.observe()

In [None]:
# Reward for X at the end of the game (1 for a win, -1 for a loss, 0 otherwise).
env.reward(player='X')

In [None]:
# Reward for O at the end of the game (1 for a win, -1 for a loss, 0 otherwise).
env.reward(player='O')

# Optimal policy for Tic Tac Toe environment

Fortunately, we know the exact optimal policy for Tic Tac Toe. We have implemented an $\epsilon$-greedy version of the optimal policy, which you can use for the project.

In [None]:
# Start a new game; the trailing ';' keeps the notebook from echoing the call's return value.
env.reset();

In [None]:
# epsilon = 0 removes the random part of the epsilon-greedy policy (presumably: always the
# optimal move -- confirm in tic_env.py). This player is assigned the mark 'X'.
opt_player = OptimalPlayer(epsilon = 0., player = 'X')

In [None]:
# Ask the player for its move on the current board. The move is only displayed here:
# nothing in this cell passes it to env.step.
opt_player.act(env.grid)

In [None]:
# The mark this player was created with (see the constructor call above).
opt_player.player

### An example of optimal player playing against random player

In [None]:
# Demo: five games between a player with epsilon=0 (optimal) and one with
# epsilon=1 (random). The two marks are reshuffled before every game, and the
# final board of each game is rendered together with the result.
Turns = np.array(['X','O'])
for i in range(5):
    env.reset()
    grid, _, __ = env.observe()
    Turns = Turns[np.random.permutation(2)]
    player_opt = OptimalPlayer(epsilon=0., player=Turns[0])
    player_rnd = OptimalPlayer(epsilon=1., player=Turns[1])
    for j in range(9):
        # Whoever owns the mark whose turn it is picks the next move.
        mover = player_opt if env.current_player == player_opt.player else player_rnd
        move = mover.act(grid)

        grid, end, winner = env.step(move, print_grid=False)
        if not end:
            continue

        # Game over: report the outcome, then leave the environment reset.
        print('-------------------------------------------')
        print('Game end, winner is player ' + str(winner))
        print('Optimal player = ' +  Turns[0])
        print('Random player = ' +  Turns[1])
        env.render()
        env.reset()
        break


### An example of optimal player playing against optimal player

In [None]:
# Demo: five games in which two players with epsilon=0 (optimal) face each
# other. The two marks are reshuffled before every game, and the final board
# of each game is rendered together with the result.
Turns = np.array(['X','O'])
for i in range(5):
    env.reset()
    grid, _, __ = env.observe()
    Turns = Turns[np.random.permutation(2)]
    player_opt_1 = OptimalPlayer(epsilon=0., player=Turns[0])
    player_opt_2 = OptimalPlayer(epsilon=0., player=Turns[1])
    for j in range(9):
        # The player owning the mark whose turn it is chooses the move.
        first_to_act = env.current_player == player_opt_1.player
        move = (player_opt_1 if first_to_act else player_opt_2).act(grid)

        grid, end, winner = env.step(move, print_grid=False)

        if end:
            summary = (
                '-------------------------------------------',
                'Game end, winner is player ' + str(winner),
                'Optimal player 1 = ' +  Turns[0],
                'Optimal player 2 = ' +  Turns[1],
            )
            for line in summary:
                print(line)
            env.render()
            env.reset()
            break


# Question 1
In this section, you will study whether Q-learning can learn to play Tic Tac Toe by playing against
Opt($\epsilon_{\text{opt}}$) for some $\epsilon_{\text{opt}} \in [0, 1]$. To do so, implement the Q-learning algorithm. To check the algorithm,
run a Q-learning agent, with a fixed and arbitrary $\epsilon \in [0, 1)$, against Opt(0.5) for 20’000 games – switch
the 1st player after every game.

**Question 1.** Plot the average reward for every 250 games during training – i.e. after the 250th game, plot
the average reward of the first 250 games; after the 500th game, plot the average reward of games 251 to
500; etc. Does the agent learn to play Tic Tac Toe?

*Expected answer:* A figure of average reward over time (caption length < 50 words). Specify your choice
of $\epsilon$.

In [None]:
import random

# --- Tabular Q-learning against Opt(0.5) -----------------------------------
# Hyper-parameters of the learning agent.
max_games = 20000   # number of training games
epsilon = 0.2       # exploration rate of our epsilon-greedy agent
gamma = 0.99        # discount factor
alpha = 0.05        # learning rate

# One Q-table per mark the agent can hold:
#   Q_values[mark][str(grid)][move] -> estimated value of `move` in that position.
Q_values = {'X': {}, 'O': {}}
# Final reward obtained by our agent in each training game.
rewards = [None] * max_games


def _q_row(q_table, state_key, moves):
    """Return the ``{move: value}`` dict stored in ``q_table`` for ``state_key``.

    The row is created on first use, and every move in ``moves`` that is not
    yet present is initialised to 0, so the result can be indexed by any of
    ``moves``.
    """
    row = q_table.setdefault(state_key, {})
    for mv in moves:
        row.setdefault(mv, 0)
    return row


for game in range(max_games):
    if game % 1000 == 0:
        print('Game ', game, ' begins.')
    env.reset()
    grid, end, winner = env.observe()

    # 'X' always moves first, so alternating the marks switches the first
    # player after every game, as the exercise asks.
    if game % 2 == 0:
        opponent_mark, our_player = 'X', 'O'
    else:
        opponent_mark, our_player = 'O', 'X'
    opponent = OptimalPlayer(epsilon=0.5, player=opponent_mark)
    player_Q_values = Q_values[our_player]

    # Our most recent (state, action) pair; its Q-value is updated once the
    # outcome of that action (after the opponent's reply, or the end of the
    # game) is known.
    prev_grid, prev_move = None, None
    for turn in range(9):
        opponent_turn = env.current_player == opponent.player
        if opponent_turn:
            # The opponent created for THIS game must play (right epsilon, right mark).
            chosen_move = opponent.act(grid)
        else:
            # Take the state key BEFORE stepping: `grid` may alias the board
            # that `env.step` modifies.
            curr_key = str(grid)
            possible_moves = opponent.empty(grid)
            curr_row = _q_row(player_Q_values, curr_key, possible_moves)
            # Epsilon-greedy action selection.
            if random.random() >= epsilon:
                chosen_move = max(possible_moves, key=curr_row.get)
            else:
                chosen_move = random.choice(possible_moves)

        grid, end, winner = env.step(chosen_move, print_grid=False)

        if not opponent_turn:
            prev_grid, prev_move = curr_key, chosen_move

        # Q-learning update of our last (state, action):
        #  * game over (whoever moved last): the target is the final reward;
        #  * opponent replied and the game goes on: bootstrap from the new state.
        # Right after our own non-terminal move nothing is updated yet, because
        # the next state we will act from is only known after the opponent's reply.
        # `is not None` (rather than truthiness) keeps an integer move 0 valid.
        if prev_move is not None and (end or opponent_turn):
            r = env.reward(our_player)
            if end:
                target = r
            else:
                next_moves = opponent.empty(grid)
                next_row = _q_row(player_Q_values, str(grid), next_moves)
                target = r + gamma * max((next_row[mv] for mv in next_moves), default=0)
            prev_row = player_Q_values[prev_grid]
            prev_row[prev_move] += alpha * (target - prev_row[prev_move])

        if end:
            break

    # Read the reward from the environment instead of reusing a possibly stale `r`.
    rewards[game] = env.reward(our_player)




In [None]:
# Average reward over consecutive, non-overlapping windows of `window` games.
window = 250
avgs = []
for start in range(0, len(rewards), window):
    # Slicing already clamps at the end of the list, so the last window keeps
    # its final game and can never be empty.
    chunk = rewards[start:start + window]
    avgs.append(sum(chunk) / len(chunk))

In [None]:
import matplotlib.pyplot as plt

# Each entry of `avgs` summarises one window of 250 training games; put the
# number of games played (not the window index) on the x axis.
games_per_point = 250
games_played = [games_per_point * (k + 1) for k in range(len(avgs))]

fig, ax = plt.subplots(figsize=(20, 20))
ax.plot(games_played, avgs)
ax.set_xlabel('Number of games played')
ax.set_ylabel('Average reward per window of 250 games')
ax.set_title('Average training reward of the Q-learning agent vs. Opt(0.5)')
plt.show()


In [None]:
# Board as the training loop left it (the loop above does not reset `env` after its last game).
env.render()

In [None]:
# Debugging aid: the last move chosen in the training loop, whose turn it currently is
# in `env`, and the mark our agent held in the last game.
chosen_move, env.current_player, our_player

In [None]:
# NOTE(review): `Q` is not defined in any cell visible above, so on a fresh kernel this
# raises NameError. If `Q` is a dict, the expression returns the key with the largest value.
max(Q, key=Q.get)

In [None]:
# NOTE(review): `(12)` is just the int 12, not a 1-tuple (that would be `(12,)`).
# `Q` is not defined in any visible cell -- confirm where it is supposed to come from.
Q[(12)] = 10

In [None]:
import numpy as np

# A NumPy array is not hashable and so cannot be used as a dict key;
# converting it to a tuple of its elements gives a usable key.
n = tuple(np.array([1.234, 21.33, 3.413, 4.4, 15.0000]))
dictionary = {n: "Hello World"}
print(dictionary)


In [None]:
# Membership test with a freshly built tuple holding the same values: dict lookup
# goes by hash and equality, not by object identity.
tuple(np.array([1.234, 21.33, 3.413, 4.4, 15.0000])) in dictionary