In [None]:
import numpy as np
from tic_env import TictactoeEnv, OptimalPlayer

# Global tracing switch: the Q-learning player and the game loop defined below
# print board positions and Q-values while it is True, and the game loop also
# waits for keyboard input before every game.
DEBUG = True

# Single game environment shared by every experiment in this notebook.
env = TictactoeEnv()
# The two side symbols; used below as the keys of the per-side Q-tables.
Turns = np.array(['X','O'])

# Question 1
In this section, you will study whether Q-learning can learn to play Tic Tac Toe by playing against
Opt(eps_opt) for some eps_opt ∈ [0, 1]. To do so, implement the Q-learning algorithm. To check the algorithm,
run a Q-learning agent, with a fixed and arbitrary eps ∈ [0, 1), against Opt(0.5) for 20’000 games – switch
the 1st player after every game.
Question 1. Plot average reward for every 250 games during training – i.e. after the 250th game, plot
the average reward of the first 250 games, after the 500th game, plot the average reward of games 251 to
500, etc. Does the agent learn to play Tic Tac Toe?
Expected answer: A figure of average reward over time (caption length < 50 words). Specify your choice
of eps.

In [None]:
def grid_to_string(bts, is_buffer = True):
    """Render a 3x3 board as three '|X O -|' style lines followed by an empty line.

    bts: the raw bytes of the board when is_buffer is true (decoded with
        numpy's default dtype and reshaped to 3x3), otherwise an object that
        can be indexed as board[row, col].
    Cell values 0, 1 and -1 are drawn as '-', 'X' and 'O'.
    """
    board = np.frombuffer(bts).reshape(3, 3) if is_buffer else bts
    symbols = {0: '-', 1: 'X', -1: 'O'}
    rendered_rows = []
    for row in range(3):
        cells = [symbols[int(board[row, col])] for col in range(3)]
        rendered_rows.append('|' + ' '.join(cells) + '|\n')
    return ''.join(rendered_rows) + '\n'

def print_Q_val(index: int, player = 'O'):
    """Print the index-th (board, action-values) entry of the module-level hq[player].

    The number of stored actions is printed first, then the board rendered by
    grid_to_string together with the action-value mapping.
    """
    board_key, action_values = list(hq[player].items())[index]
    print('Avail moves: ', len(action_values))
    print(grid_to_string(board_key), action_values)

In [None]:
import random

# Debugging aid: the Q2 training loop appends one (per-run) list to this
# module-level list for every n_star it trains.
epsilons = []

class BasePlayer:
    """Board helpers shared by the player classes of this notebook."""

    def get_empty_positions(self, grid):
        """Return the (row, col) coordinates of all cells of `grid` equal to 0, in row-major order."""
        coordinates = (divmod(cell, 3) for cell in range(9))
        return [pos for pos in coordinates if grid[pos] == 0]

    def hash_grid(self, grid: np.ndarray):
        """Return the raw bytes of `grid`, suitable as a dictionary key."""
        key = grid.tobytes()
        return key

class QLearntPlayer(BasePlayer):
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Q-values are stored per side in ``Q_values[side][grid_hash][move]``.

    A decision taken by choose_move_ stays *pending* in ``curr_grid`` /
    ``curr_move`` until update_q_values_ is called with the next state the
    agent observes (the board after the opponent's reply, or the final board).
    That call applies the one-step Q-learning update

        Q(S, A) += alpha * (reward + gamma * max_a Q(S', a) - Q(S, A))

    to the pending decision and clears it, so every decision is updated exactly
    once, against its own successor state, and repeated calls are harmless.
    """

    def __init__(self, game_env: TictactoeEnv, epsilon: float, discount_rate_gamma = 0.99, learning_rate_alpha = 0.5):
        """
        game_env: environment queried for rewards via ``game_env.reward(side)``.
        epsilon: probability of playing a uniformly random move.
        discount_rate_gamma: discount factor (gamma) of the update.
        learning_rate_alpha: step size (alpha) of the update.
        """
        super().__init__()
        self.game_env = game_env
        self.epsilon = epsilon
        self.discount_rate_gamma = discount_rate_gamma
        self.learning_rate_alpha = learning_rate_alpha
        # One Q-table per side: Q_values[side][grid_hash][move] -> value.
        self.Q_values = {Turns[0]: {}, Turns[1]: {}}
        # Most recently *updated* decision (kept for inspection only).
        self.prev_move = None
        self.prev_grid = None
        # Pending decision, i.e. chosen but not yet updated.
        self.curr_move = None
        self.curr_grid = None
        self.player = None # 'X' or 'O'
        self.player_Q_values = None

    def prepare_new_game_(self, player):
        """Forget all per-game state and select the Q-table of side `player` ('X' or 'O').

        Returns self so the call can be chained.
        """
        self.prev_move = None
        self.prev_grid = None
        self.curr_grid = None
        self.curr_move = None
        assert player == 'X' or player == 'O'
        self.player = player
        self.player_Q_values = self.Q_values[player]
        return self

    def get_max_val_action(self, possible_moves, grid_hash):
        """Return the move with the highest stored value, or '' when there are no moves.

        '' is the placeholder action stored by init_q_values_ for states
        without moves. Ties go to the first move in possible_moves.
        The state must already have been initialised with init_q_values_.
        """
        if len(possible_moves) == 0:
            return ''
        return max(possible_moves, key=self.player_Q_values[grid_hash].get)

    def init_q_values_(self, grid_hash, possible_moves):
        """Make sure the current side's table has an entry (initial value 0) for every move of the state.

        A state without moves gets the placeholder action '' so its value can
        be looked up like any other. Existing values are never overwritten.
        """
        state_values = self.player_Q_values.setdefault(grid_hash, {})
        if len(possible_moves) == 0:
            state_values.setdefault('', 0)
        for mv in possible_moves:
            state_values.setdefault(mv, 0)

    def choose_move_(self, grid):
        """Pick a move for `grid` epsilon-greedily and remember it as the pending decision.

        With probability epsilon a uniformly random empty cell is returned,
        otherwise the empty cell with the highest Q-value.
        """
        grid_hash = self.hash_grid(grid)
        # Get moves
        possible_moves = self.get_empty_positions(grid)
        assert len(possible_moves) > 0
        # Init Q_values
        self.init_q_values_(grid_hash, possible_moves)
        # Choose move (eps.greedy)
        random_sample = random.random()
        play_best_move = random_sample >= self.epsilon
        if play_best_move:
            chosen_move = self.get_max_val_action(possible_moves, grid_hash)
        else:
            chosen_move = random.choice(possible_moves)
        # Keep a private snapshot so later in-place changes of the caller's
        # array cannot alter the remembered state.
        self.curr_grid = np.array(grid, copy=True)
        self.curr_move = chosen_move
        if DEBUG:
            print('-----------------------------------')
            print('Current position: ', '\n' + grid_to_string(grid, False))
            print('Current Q-vals', self.Q_values[self.player][grid_hash])
            print('Random sample ', random_sample, ' epsilon ', self.epsilon, ' hence I chose ', \
                '*best*' if play_best_move else '*random*', ' move: ', chosen_move )
            print('-----------------------------------')

        return chosen_move

    def update_q_values_(self, new_grid, game_over):
        """
        Apply the Q-learning update to the pending decision, then clear it.

        new_grid ~ S' in the formula: the board the agent faces next, or the
            final board when the game has ended.
        game_over: when true, S' is treated as terminal (no moves available).

        The reward is read from ``game_env.reward(self.player)`` at call time.
        Nothing happens when no decision is pending (the agent has not moved
        yet in this game, or its last move has already been updated).
        """
        last_grid, last_move = self.curr_grid, self.curr_move
        self.curr_grid, self.curr_move = None, None
        if last_grid is None or last_move is None:
            return
        self.prev_grid, self.prev_move = last_grid, last_move

        new_grid_hash = self.hash_grid(new_grid)
        last_grid_hash = self.hash_grid(last_grid)
        reward = self.game_env.reward(self.player)
        # Get max_a (Q(S', a)); a finished game offers no further moves.
        possible_moves_s_dash = [] if game_over else self.get_empty_positions(new_grid)
        self.init_q_values_(new_grid_hash, possible_moves_s_dash)
        max_val_action = self.get_max_val_action(possible_moves_s_dash, new_grid_hash)
        max_q_value = self.player_Q_values[new_grid_hash][max_val_action]

        if DEBUG:
            print('*** UPDATING Q VALS ****')
            print('Prev_grid: ', '\n' + grid_to_string(last_grid, False))
            print('Prev_move: ', last_move)
            print('new_grid: ', '\n' + grid_to_string(new_grid, False))
            print('max_val_action: ', max_val_action)
            print('Q-vals before: ', self.Q_values[self.player][last_grid_hash])

        # Update according to Q-learning formula
        old_q_val = self.player_Q_values[last_grid_hash][last_move]
        td_target = reward + self.discount_rate_gamma*max_q_value
        self.player_Q_values[last_grid_hash][last_move] = old_q_val + self.learning_rate_alpha*(td_target - old_q_val)
        if DEBUG:
            print('reward: ', reward, 'max-q-val', max_q_value, 'discount', self.discount_rate_gamma,\
                'self.learning_rate_alpha',self.learning_rate_alpha)
            print('Q-vals after: ', self.Q_values[self.player][last_grid_hash])
            print('*****')

In [None]:
def run_n_games(max_games_number, our_player, opponent_epsilon, our_player_new_game_epsilon, update_q_values, progress_print=None):
    """Play `max_games_number` games of `our_player` against OptimalPlayer(opponent_epsilon).

    The two sides are swapped before every game so both players start equally often.

    max_games_number: number of games to play.
    our_player: the Q-learning agent; reset for every game with prepare_new_game_.
    opponent_epsilon: epsilon handed to OptimalPlayer.
    our_player_new_game_epsilon: callable taking `game_number_n` (0-based index
        of the game within this call) and returning the agent's epsilon for that game.
    update_q_values: when truthy the agent's Q-values are updated during play;
        when falsy the games are played without learning updates.
    progress_print: if set, a progress line is printed every `progress_print` games.

    Returns a list with one entry per game: env.reward(<agent's side>) once the game is over.
    """
    _rewards = [None for _ in range(max_games_number)]
    _turns = ['X','O']
    opponent =  OptimalPlayer(epsilon=opponent_epsilon, player=_turns[0])

    for game in range(max_games_number):
        env.reset()
        grid, _, __ = env.observe()
        _turns = _turns[::-1] # Reverse after every game to ensure both sides played equally
        opponent.player = _turns[0]
        our_player = our_player.prepare_new_game_(_turns[1])
        assert opponent.player != our_player.player
        our_player.epsilon = our_player_new_game_epsilon(game_number_n=game)
        # epsilons[-1].append(our_player.epsilon)

        if (progress_print and game % progress_print == 0) or DEBUG:
            print('Game ', game, ' begins.')
            if DEBUG:
                print('We play: ', our_player.player)
                # Interactive step-through: blocks until the user presses enter.
                input('awaiting input: ')

        for turn in range(9):
            opponent_turn = env.current_player == opponent.player
            if opponent_turn:
                chosen_move = opponent.act(grid)
            else:
                chosen_move = our_player.choose_move_(grid)

            grid, end, winner = env.step(chosen_move, print_grid=False)

            # The agent learns from the board it faces after the opponent's reply ...
            if opponent_turn and update_q_values:
                our_player.update_q_values_(grid, game_over=end)
            if end:
                # ... and from the final board, whoever made the last move, so
                # the outcome of the game always reaches the learner.
                if update_q_values:
                    our_player.update_q_values_(grid, game_over=end)
                _rewards[game] = env.reward(our_player.player)
                break
    return _rewards

In [None]:
import matplotlib.pyplot as plt

q1_epsilon = 0.1 # Chosen because they use this in Q2 so this will allow us to nicely compare

max_games = 20000
q_learnt_player = QLearntPlayer(env, epsilon=q1_epsilon)
avgs = []
rewards = []
total_wins = 0
# Tracing has to be off for the full run: while DEBUG is on, run_n_games waits
# for keyboard input before every game and every move is printed.
DEBUG = False

# Train in chunks of 250 games and keep the average reward of each chunk.
for game_epoch in range(max_games//250):
    if game_epoch % 10 == 0:
        print('Game ', game_epoch*250, ' begins.')
    run_rewards = run_n_games(max_games_number=250, our_player=q_learnt_player, opponent_epsilon=0.5, \
            our_player_new_game_epsilon=lambda game_number_n: q1_epsilon, update_q_values=True)
    rewards+=run_rewards
    avgs.append(np.average(run_rewards))
    total_wins += sum(1 if rew ==1 else 0 for rew in run_rewards)

print('Our agent won {} times'.format(total_wins))
plt.figure(figsize=(15,15))
ax = plt.plot(avgs)
# One tick per 250-game chunk; only multiples of 1000 games get a label.
plt.xticks(ticks=range(len(avgs)), labels=[str(x*250 // 1000) if x*250 % 1000 == 0 else '' for x in range(len(avgs))])
plt.ylabel('Average reward per 250 episodes')
plt.xlabel('Episode (thousands)')
plt.show()

In [None]:
# Print every board (rendered as text) for which the 'X' table holds at least
# one non-zero action value, together with its action values.
qv = q_learnt_player.Q_values['X']
test = {
    grid_to_string(board): action_values
    for board, action_values in qv.items()
    if any(value != 0 for value in action_values.values())
}
for (i,j) in test.items():
    print(i,j)

Question 2. Plot average reward for every 250 games during training. Does decreasing epsilon help training
compared to having a fixed epsilon? What is the effect of n∗?
Expected answer: A figure showing average reward over time for different values of n∗ (caption length < 200 words)

In [None]:
# Q2 experiment: train one fresh agent per n_star against Opt(0.5) with a
# linearly decaying exploration rate, recording the reward of every game.
max_games = 20000
n_stars =  np.geomspace(1, 40000, num=5) # Includes 1 and 40000
epoch_size = 250

# Per-n_star result containers (one independent list per key).
rewards = {n_star: [] for n_star in n_stars}
M_opt = {n_star: [] for n_star in n_stars}
M_rand = {n_star: [] for n_star in n_stars}

# Trained agent of every n_star, kept for later inspection.
players = {}

min_epsilon = 0.1
max_epsilon = 0.8
# Builds the epsilon schedule of one epoch. run_n_games numbers games from 0
# inside each call, so the epoch offset is added to obtain the overall game
# number n; the schedule is max(min_epsilon, max_epsilon * (1 - n / n_star)).
def calc_epsilon_factory(n_star, epoch_size, game_epoch):
        def calc_epsilon(game_number_n):
            real_game_number = game_epoch*epoch_size + game_number_n
            return max(min_epsilon, max_epsilon*(1-(real_game_number/n_star)))
        return calc_epsilon

for n_star in n_stars:
    # One (currently never filled) epsilon trace per n_star.
    epsilons.append([])
    q_learnt_player = QLearntPlayer(env, epsilon=max_epsilon)
    players[n_star] = q_learnt_player
    print('Current n_star = {}'.format(n_star))

    for game_epoch in range(max_games//epoch_size):
        calc_epsilon = calc_epsilon_factory(n_star=n_star, epoch_size=epoch_size, game_epoch=game_epoch)
        if game_epoch % 20 == 0:
            print('Game ', game_epoch*epoch_size, ' begins.')

        # Run 250 games with updating Q-vals and observe reward (exec 2)
        run_rewards = run_n_games(max_games_number=epoch_size, our_player=q_learnt_player, opponent_epsilon=0.5, \
            our_player_new_game_epsilon=calc_epsilon, update_q_values=True)
        rewards[n_star] += run_rewards

        # NOTE(review): the two evaluation blocks below are disabled, so M_opt
        # and M_rand keep their empty lists after this cell has run.

        # # Run 500 games for M_opt calculation
        # M_opt_rewards = run_n_games(max_games_number=500, our_player=q_learnt_player, opponent_epsilon=0, \
        #     our_player_new_game_epsilon=lambda game_number_n: 0, update_q_values=False)
        # M_opt[n_star].append(np.average(M_opt_rewards))


        # # Run 500 games for M_rand calculation
        # M_rand_rewards = run_n_games(max_games_number=500, our_player=q_learnt_player, opponent_epsilon=1, \
        #     our_player_new_game_epsilon=lambda game_number_n: 0, update_q_values=False)
        # M_rand[n_star].append(np.average(M_rand_rewards))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Pair every n_star with the epsilon trace recorded for it. The Q2 training
# loop appends one list per n_star, so the traces of the latest run are the
# last len(n_stars) entries of `epsilons`; a fixed start offset is only right
# for one particular execution history of the notebook.
epses = dict(zip(n_stars, epsilons[-len(n_stars):]))

q2_data = pd.DataFrame(epses)
q2_data.index.name = 'epoch'
# Name the column axis on the frame itself rather than on a transposed temporary.
q2_data.columns.name = 'n_star'
sns.set(rc={'figure.figsize':(24,12)})
g = sns.lineplot(data=q2_data)
g.set_ylabel('Epsilon')

In [None]:
# Trained Q2 agents as a list, in the order in which the training loop stored them in `players`.
pl = list(players.values())

In [None]:
def buffer_to_string(bts):
    """Decode a 3x3 board from its raw bytes and render it as text.

    The bytes are read with numpy's default dtype. Each row becomes a line
    such as '|X O -|' (0 -> '-', 1 -> 'X', -1 -> 'O') and an empty line
    terminates the result.
    """
    board = np.frombuffer(bts).reshape(3, 3)
    glyphs = {0: '-', 1: 'X', -1: 'O'}
    lines = ['|' + ' '.join(glyphs[int(cell)] for cell in row) + '|\n' for row in board]
    return ''.join(lines) + '\n'

def print_Q_val(index: int, player = 'O'):
    """Print the index-th (board, action-values) entry of the module-level hq[player].

    The number of stored actions is printed first, then the board decoded by
    buffer_to_string together with the action-value mapping.
    """
    board_key, action_values = list(hq[player].items())[index]
    print('Avail moves: ', len(action_values))
    print(buffer_to_string(board_key), action_values)

def humanize_q_vals(q_vals):
    """Return `q_vals` unchanged.

    Currently a pass-through. The commented-out lines after the return are an
    unused alternative that would re-key each side's table by the string form
    of the decoded board.
    """
    return q_vals
    # _O = {str(np.frombuffer(k)): v for (k,v) in q_vals['O'].items()}
    # _X = {str(np.frombuffer(k)): v for (k,v) in q_vals['X'].items()}
    # return {'O': _O, 'X': _X}

# Q-tables of the first agent in `pl`; read by print_Q_val.
hq = humanize_q_vals(pl[0].Q_values)

In [None]:


# Show the entry at index 1 of the 'O' table (print_Q_val's default side).
print_Q_val(1)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Average training reward of every consecutive 250-game window, per n_star.
avgs = {n_star: [] for n_star in n_stars}
for x in range(0,max_games, 250):
    lower_index = x
    # Slice ends are exclusive, so the last window has to stop at max_games
    # (not max_games-1) to include the final game.
    upper_index = min(x+250, max_games)
    for n_star in n_stars:
        # Not called `slice`: that would shadow the builtin for the rest of the notebook.
        window_rewards = rewards[n_star][lower_index:upper_index]
        avgs[n_star].append(sum(window_rewards)/len(window_rewards))

q2_data = pd.DataFrame(avgs)
q2_data.index.name = 'epochs by 250'
# Name the column axis on the frame itself rather than on a transposed temporary.
q2_data.columns.name = 'n_star'
sns.set(rc={'figure.figsize':(24,12)})
g = sns.lineplot(data=q2_data)
g.set_ylabel('Average rewards per 250 epochs')

In [None]:
# Print the same plot as in exe 1 to ensure results are the same
# (only the n_star = 1.0 curve of the window averages computed above is drawn).

plt.figure(figsize=(15,15))
ax = plt.plot(avgs[1.0])
# One tick per 250-game window; only multiples of 1000 games get a label.
plt.xticks(ticks=range(len(avgs[1.0])), labels=[str(x*250 // 1000) if x*250 % 1000 == 0 else '' for x in range(len(avgs[1.0]))])
plt.ylabel('Average reward per 250 episodes')
plt.xlabel('Episode (thousands)')
plt.show()

In [None]:
# For every n_star, report how many training games ended with reward 1.
for (n_star, rews) in rewards.items():
    print('{} won {} games'.format(n_star, sum(rew == 1 for rew in rews)))

In [None]:
# Sanity check: number of training rewards recorded for n_star = 1.0.
len(rewards[1.0])

#### Q2
*Does decreasing epsilon help with training compared to fix epsilon?*

(jl, April 22, 8:00pm)  **I cannot see a bug but it seems it virtually does not, which is very counterintuitive.** But the best value we seem to observe is when n_star = 1 but that is the same as not having any decrease at all and just hardcoding epsilon to 0.1. This is very strange indeed.


*What is the effect of n\*?*

Based on our small sample of 4 n*, it seems the smaller n* the better: n*=1 gets better perf than n*=34.2 which in turn gets better than n*=1169 which is in turn much better than n*=40,000.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# M_opt maps each n_star to its list of per-epoch values: one column per n_star.
M_opt_df = pd.DataFrame(M_opt)
M_opt_df.index.name = 'epochs by 250'
# Name the column axis on the frame itself rather than on a transposed temporary.
M_opt_df.columns.name = 'n_star'
sns.set(rc={'figure.figsize':(24,12)})
g = sns.lineplot(data=M_opt_df)
g.set_ylabel('M_opt')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# M_rand maps each n_star to its list of per-epoch values: one column per n_star.
M_rand_df = pd.DataFrame(M_rand)
M_rand_df.index.name = 'epochs by 250'
# Name the column axis on the frame itself rather than on a transposed temporary.
M_rand_df.columns.name = 'n_star'
sns.set(rc={'figure.figsize':(24,12)})
g = sns.lineplot(data=M_rand_df)
g.set_ylabel('M_rand')

#### Q3
*Describe the differences and similarities between the curves and the one in the previous question*

Just like in the previous question, we can see lower values of n* significantly outperforming higher values

# Q4
Choose the best value of $n^∗$ that you found in the previous section. Run Q-learning against Opt($\epsilon_{opt}$) for
different values of $\epsilon_{opt}$ for 20’000 games – switch the 1st player after every game. Choose several values
of $\epsilon_{opt}$ from a reasonably wide interval between 0 to 1 – particularly, include $\epsilon_{opt}$ = 0.


Question 4. After every 250 games during training, compute the ‘test’ $M_{opt}$ and $M_{rand}$ for your agents
– for each value of $\epsilon_{opt}$. Plot $M_{opt}$ and $M_{rand}$ over time. What do you observe? How can you explain it?
Expected answer: A figure showing $M_{opt}$ and $M_{rand}$ over time for different values of $\epsilon_{opt}$ (caption length
< 250 words).

In [None]:
# n_star used by the epsilon schedule of the Q4 training cell below.
best_n_star = 1

In [None]:
# Q4 experiment: train one fresh agent per opponent exploration rate eps_opt
# and, after every 250 training games, evaluate it over 500 games against
# Opt(0) (M_opt) and 500 games against Opt(1) (M_rand) without learning updates.
max_games = 20000
epoch_size = 250

eps_opts = np.linspace(0,1,num=5)

# Per-eps_opt result containers (one independent list per key).
rewards_eps = {eps_opt: [] for eps_opt in eps_opts}
M_opt_eps = {eps_opt: [] for eps_opt in eps_opts}
M_rand_eps = {eps_opt: [] for eps_opt in eps_opts}

min_epsilon = 0.1
max_epsilon = 0.8

# Reset names left behind by earlier cells (presumably so that this cell
# cannot silently reuse their values).
n_star = None
calc_epsilon = None


# Same schedule shape as in Q2, max(min_epsilon, max_epsilon * (1 - n / n*)),
# with n* fixed to the module-level best_n_star.
def calc_epsilon_factory_eps(epoch_size, game_epoch):
        def calc_epsilon(game_number_n):
            real_game_number = game_epoch*epoch_size + game_number_n
            return max(min_epsilon, max_epsilon*(1-(real_game_number/best_n_star)))
        return calc_epsilon

for eps_opt in eps_opts:
    q_learnt_player = QLearntPlayer(env, epsilon=min_epsilon)
    print('Current eps_opt = {}'.format(eps_opt))

    for game_epoch in range(max_games//epoch_size):
        calc_epsilon = calc_epsilon_factory_eps(epoch_size=epoch_size, game_epoch=game_epoch)
        if game_epoch % 20 == 0:
            print('Game ', game_epoch*epoch_size, ' begins.')

        # Run 250 games with updating Q-vals and observe reward (exec 4)
        run_rewards = run_n_games(max_games_number=epoch_size, our_player=q_learnt_player, opponent_epsilon=eps_opt, \
            our_player_new_game_epsilon=calc_epsilon, update_q_values=True)
        rewards_eps[eps_opt] += run_rewards

        # Run 500 games for M_opt_eps calculation
        # (greedy agent, epsilon 0, against the opponent with epsilon 0)
        M_opt_eps_rewards = run_n_games(max_games_number=500, our_player=q_learnt_player, opponent_epsilon=0, \
            our_player_new_game_epsilon=lambda game_number_n: 0, update_q_values=False)
        M_opt_eps[eps_opt].append(np.average(M_opt_eps_rewards))


        # Run 500 games for M_rand_eps calculation
        # (greedy agent, epsilon 0, against the opponent with epsilon 1)
        M_rand_eps_rewards = run_n_games(max_games_number=500, our_player=q_learnt_player, opponent_epsilon=1, \
            our_player_new_game_epsilon=lambda game_number_n: 0, update_q_values=False)
        M_rand_eps[eps_opt].append(np.average(M_rand_eps_rewards))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# One column per eps_opt, one row per 250-game training epoch.
M_opt_eps_df = pd.DataFrame(M_opt_eps)
M_opt_eps_df.index.name = 'epochs by 250'
# The columns of this frame are opponent exploration rates, not n_star values;
# the name is set on the frame itself rather than on a transposed temporary.
M_opt_eps_df.columns.name = 'eps_opt'
sns.set(rc={'figure.figsize':(24,12)})
g = sns.lineplot(data=M_opt_eps_df)
g.set_ylabel('M_opt_eps')

(jl, April 22, 8:00pm) **I think most likely there must be a bug** because it seems quite strange that the eps=0 option would learn to play the opponent perfectly right away after only 250 epochs. I can't remember anymore, but I don't think that is what we observed before? Definitely something to check. Also it seems weird to me that the process is not monotone. Would we not expect to only improve? How come we suddenly get worse? I guess there is some variance due to our epsilon (i.e. sometimes we explore and we don't choose the perfect option) so maybe that explains it?

**EDIT (10pm)**: Maybe the reason why it underperforms against the random player is that, since it's trained against a perfect player, it quickly learns how to almost perfectly avoid losses, but it never learns how to achieve wins since it never gets a chance to do that during training – hence it continues to play those positions randomly. **So perhaps this is not a bug after all.** Still, it does not make sense why the performance of 0.25 is bad against rand when it's so good here – 0.25 already gets exposure to winning positions, so it should perform well? After 20k games I'd think it would have had a chance to see winning positions often enough, but perhaps I'm wrong.

# Q5
What are the highest values of $M_{opt}$ and $M_{rand}$ that you could achieve after playing 20’000 games?

The highest value of $M_{opt}$ is achieved by $\epsilon_{opt}=0.0$ and $\epsilon_{opt}=0.25$: it is 0, which is the best that we can hope for against the optimal player Opt(0). The highest value of $M_{rand}$ is near $0.8$, achieved by $\epsilon_{opt}=0.75$.



In [None]:
# Display the per-epoch M_opt table (one column per eps_opt); presumably used to read off the values quoted in Q5.
M_opt_eps_df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# One column per eps_opt, one row per 250-game training epoch.
M_rand_eps_df = pd.DataFrame(M_rand_eps)
M_rand_eps_df.index.name = 'epochs by 250'
# The columns of this frame are opponent exploration rates, not n_star values;
# the name is set on the frame itself rather than on a transposed temporary.
M_rand_eps_df.columns.name = 'eps_opt'
sns.set(rc={'figure.figsize':(24,12)})
g = sns.lineplot(data=M_rand_eps_df)
g.set_ylabel('M_rand_eps')

**Again** it is super weird that the M_rand plot is not a mirror image of the M_opt plot; i.e. we'd expect that if 0.0 gets great performance against M_opt, it should get great performance here. This is not what seems to be happening though.

# Q6 (answer)
*Question 6. (Theory) Assume that Agent 1 learns by playing against $\text{Opt}(0)$ and find the optimal Q-
values $Q_1(s, a)$. In addition, assume that Agent 2 learns by playing against $\text{Opt}(1)$ and find the optimal
Q-values $Q_2(s, a)$. Do $Q_1(s, a)$ and $Q_2(s, a)$ have the same values? Justify your answer. (answer length
< 150 words)*

No, they will not have the same values. This is because if we play against an optimal agent, we will never win and thus never observe a positive reward. Therefore all Q-values will be at best 0. However, playing against a random opponent, we will definitely win sometimes, and as we get closer and closer to convergence, we will play better and better and win more and more. We will therefore definitely observe positive rewards at times. Hence, the Q-values will be different.

# 2.2 Learning by practice
In this section, you are supposed to ask whether Q-learning can learn to play Tic Tac Toe by only
playing against itself. For different values of $\epsilon \in [0, 1)$, run a Q-learning agent against itself for 20’000
games – i.e. both players use the same set of Q-values and update the same set of Q-values.



# Q7 (code below)
*Question 7. After every 250 games during training, compute the ‘test’ $M_{opt}$ and $M_{rand}$ for different
values of $\epsilon \in [0, 1)$. Does the agent learn to play Tic Tac Toe? What is the effect of $\epsilon$?
Expected answer: A figure showing $M_{opt}$ and $M_{rand}$ over time for different values of $\epsilon \in [0, 1)$ (caption
length < 100 words).*

In [None]:
def run_n_games_against_self(max_games_number, our_player, our_player_new_game_epsilon, update_q_values, progress_print=None):
    """Let `our_player` play `max_games_number` games against a twin that shares its Q-values.

    The twin is a second QLearntPlayer built with the same hyper-parameters
    whose ``Q_values`` attribute is the very same dict object, so both sides
    read and update one table. The sides are swapped before every game.

    max_games_number: number of games to play.
    our_player: the Q-learning agent being trained.
    our_player_new_game_epsilon: callable taking `game_number_n` (0-based index
        of the game within this call) and returning the epsilon both sides use
        for that game.
    update_q_values: when truthy, both sides apply Q-learning updates.
    progress_print: if set, a progress line is printed every `progress_print` games.

    Returns {'our_player': [...], 'opponent': [...]} holding, per game, the
    final env.reward of the respective side.
    """
    _rewards = {'our_player': [None for _ in range(max_games_number)],'opponent': [None for _ in range(max_games_number)]}
    _turns = ['X','O']
    opponent = QLearntPlayer(our_player.game_env, our_player.epsilon,our_player.discount_rate_gamma, our_player.learning_rate_alpha)

    # Share (not copy) the table so both sides learn into the same Q-values.
    opponent.Q_values = our_player.Q_values


    for game in range(max_games_number):
        if progress_print and game % progress_print == 0:
            print('Game ', game, ' begins.')
        env.reset()
        grid, _, __ = env.observe()
        _turns = _turns[::-1] # Reverse after every game to ensure both sides played equally
        opponent = opponent.prepare_new_game_(_turns[0])
        our_player = our_player.prepare_new_game_(_turns[1])
        assert opponent.player != our_player.player
        game_epsilon = our_player_new_game_epsilon(game_number_n=game)
        our_player.epsilon = game_epsilon
        opponent.epsilon = game_epsilon

        for turn in range(9):
            opponent_turn = env.current_player == opponent.player
            # `mover` plays now; `waiter` is the side that has to answer next.
            mover, waiter = (opponent, our_player) if opponent_turn else (our_player, opponent)
            chosen_move = mover.choose_move_(grid)

            grid, end, winner = env.step(chosen_move, print_grid=False)

            if end:
                # Both sides learn from the final board.
                if update_q_values:
                    our_player.update_q_values_(grid, game_over=True)
                    opponent.update_q_values_(grid, game_over=True)
                _rewards['our_player'][game] = env.reward(our_player.player)
                _rewards['opponent'][game] = env.reward(opponent.player)
                break
            # The side that moves next learns from the board it now faces.
            if update_q_values:
                waiter.update_q_values_(grid, game_over=False)

    return _rewards

In [None]:
# Q7 experiment: self-play with a fixed exploration rate eps_s. After every
# 250 self-play games the agent is evaluated over 500 games against Opt(0)
# (M_opt) and 500 games against Opt(1) (M_rand) without learning updates.
max_games = 20000
epoch_size = 250

eps_selfs = np.linspace(0,0.99,num=5)

# Per-epsilon result containers (one independent list per key).
M_opt_self = {eps_opt: [] for eps_opt in eps_selfs}
M_rand_self = {eps_opt: [] for eps_opt in eps_selfs}

min_epsilon = 0.1
max_epsilon = 0.8

# Reset names left behind by earlier cells (presumably so that this cell
# cannot silently reuse their values).
n_star = None
calc_epsilon = None

for eps_s in eps_selfs:
    # The constructor epsilon is only a placeholder: the game loops overwrite
    # the agent's epsilon at the start of every game.
    q_learnt_player = QLearntPlayer(env, epsilon=min_epsilon)
    print('Current eps_s = {}'.format(eps_s))

    for game_epoch in range(max_games//epoch_size):
        # Constant schedule: every self-play game uses eps_s.
        calc_epsilon = lambda game_number_n: eps_s
        if game_epoch % 20 == 0:
            print('Game ', game_epoch*epoch_size, ' begins.')

        # Run 250 games with updating Q-vals and observe reward (exec 7)
        run_rewards = run_n_games_against_self(max_games_number=epoch_size, our_player=q_learnt_player, \
            our_player_new_game_epsilon=calc_epsilon, update_q_values=True)

        # Run 500 games for M_opt_self calculation
        M_opt_self_rewards = run_n_games(max_games_number=500, our_player=q_learnt_player, opponent_epsilon=0, \
            our_player_new_game_epsilon=lambda game_number_n: 0, update_q_values=False)
        M_opt_self[eps_s].append(np.average(M_opt_self_rewards))


        # Run 500 games for M_rand_self calculation
        M_rand_self_rewards = run_n_games(max_games_number=500, our_player=q_learnt_player, opponent_epsilon=1, \
            our_player_new_game_epsilon=lambda game_number_n: 0, update_q_values=False)
        M_rand_self[eps_s].append(np.average(M_rand_self_rewards))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# One column per self-play epsilon, one row per 250-game training epoch.
M_opt_self_df = pd.DataFrame(M_opt_self)
M_opt_self_df.index.name = 'epochs by 250'
# The columns of this frame are exploration rates, not n_star values; the
# name is set on the frame itself rather than on a transposed temporary.
M_opt_self_df.columns.name = 'epsilon'
sns.set(rc={'figure.figsize':(24,12)})
g = sns.lineplot(data=M_opt_self_df)
g.set_ylabel('M_opt_self')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Close-up of a single curve: the second self-play epsilon only. Building the
# frame from a one-entry dict keeps the epsilon value as the column label.
M_opt_self_df = pd.DataFrame({eps_selfs[1]: M_opt_self[eps_selfs[1]]})
M_opt_self_df.index.name = 'epochs by 250'
# The column holds an exploration rate, not an n_star value; the name is set
# on the frame itself rather than on a transposed temporary.
M_opt_self_df.columns.name = 'epsilon'
sns.set(rc={'figure.figsize':(24,12)})
g = sns.lineplot(data=M_opt_self_df)
g.set_ylabel('M_opt_self')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# One column per self-play epsilon, one row per 250-game training epoch.
M_rand_self_df = pd.DataFrame(M_rand_self)
M_rand_self_df.index.name = 'epochs by 250'
# The columns of this frame are exploration rates, not n_star values; the
# name is set on the frame itself rather than on a transposed temporary.
M_rand_self_df.columns.name = 'epsilon'
sns.set(rc={'figure.figsize':(24,12)})
g = sns.lineplot(data=M_rand_self_df)
g.set_ylabel('M_rand_self')

# Q7 (answer)
*Question 7. After every 250 games during training, compute the ‘test’ $M_{opt}$ and $M_{rand}$ for different
values of $\epsilon \in [0, 1)$. Does the agent learn to play Tic Tac Toe? What is the effect of $\epsilon$?
Expected answer: A figure showing $M_{opt}$ and $M_{rand}$ over time for different values of $\epsilon \in [0, 1)$ (caption
length < 100 words).*

(jl, April 22 8:00 pm) **It seems that there is some bug here** because very little learning seems to be going on. I would expect the agent here to learn at least as well as when playing a random opponent.

# Q8 (code below)
For rest of this section, use $\epsilon(n)$ in Equation 1 with different values of $n^∗$ – instead of fixing $\epsilon$.
Question 8. After every 250 games during training, compute the ‘test’ $M_{opt}$ and $M_{rand}$ for your agents.
Does decreasing $\epsilon$ help training compared to having a fixed $\epsilon$? What is the effect of $n^∗$?
Expected answer: A figure showing $M_{opt}$ and $M_{rand}$ over time for different values of speeds of $n^∗$ (caption
length < 100 words).

In [None]:
# Q8 experiment: self-play with a linearly decaying exploration rate, one
# fresh agent per n_star. After every 250 self-play games the agent is
# evaluated over 500 games against Opt(0) (M_opt) and 500 games against
# Opt(1) (M_rand) without learning updates.
max_games = 20000
n_stars =  np.geomspace(1, 40000, num=4) # Includes 1 and 40000
epoch_size = 250

# Per-n_star result containers (independent lists per key).
rewards_self_n_stars = {n_star: {'our_player': [], 'opponent':[]} for n_star in n_stars}
M_opt_self_n_stars = {n_star: [] for n_star in n_stars}
M_rand_self_n_stars = {n_star: [] for n_star in n_stars}

min_epsilon = 0.1
max_epsilon = 0.8
# Builds the epsilon schedule of one epoch. The game loops number games from 0
# inside each call, so the epoch offset is added to obtain the overall game
# number n; the schedule is max(min_epsilon, max_epsilon * (1 - n / n_star)).
def calc_epsilon_factory(n_star, epoch_size, game_epoch):
        def calc_epsilon(game_number_n):
            real_game_number = game_epoch*epoch_size + game_number_n
            return max(min_epsilon, max_epsilon*(1-(real_game_number/n_star)))
        return calc_epsilon

for n_star in n_stars:
    q_learnt_player = QLearntPlayer(env, epsilon=max_epsilon)
    print('Current n_star = {}'.format(n_star))

    for game_epoch in range(max_games//epoch_size):
        calc_epsilon = calc_epsilon_factory(n_star=n_star, epoch_size=epoch_size, game_epoch=game_epoch)
        if game_epoch % 20 == 0:
            print('Game ', game_epoch*epoch_size, ' begins.')

        # Run 250 games with updating Q-vals and observe reward (exec 8)
        run_rewards = run_n_games_against_self(max_games_number=epoch_size, our_player=q_learnt_player, \
            our_player_new_game_epsilon=calc_epsilon, update_q_values=True)
        rewards_self_n_stars[n_star]['our_player'] += run_rewards['our_player']
        rewards_self_n_stars[n_star]['opponent'] += run_rewards['opponent']

        # Run 500 games for M_opt_self_n_stars calculation
        M_opt_self_n_stars_rewards = run_n_games(max_games_number=500, our_player=q_learnt_player, opponent_epsilon=0, \
            our_player_new_game_epsilon=lambda game_number_n: 0, update_q_values=False)
        M_opt_self_n_stars[n_star].append(np.average(M_opt_self_n_stars_rewards))


        # Run 500 games for M_rand_self_n_stars calculation
        M_rand_self_n_stars_rewards = run_n_games(max_games_number=500, our_player=q_learnt_player, opponent_epsilon=1, \
            our_player_new_game_epsilon=lambda game_number_n: 0, update_q_values=False)
        M_rand_self_n_stars[n_star].append(np.average(M_rand_self_n_stars_rewards))



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# One column per n_star, one row per 250-game training epoch.
M_rand_self_n_stars_df = pd.DataFrame(M_rand_self_n_stars)
M_rand_self_n_stars_df.index.name = 'epochs by 250'
# Name the column axis on the frame itself rather than on a transposed temporary.
M_rand_self_n_stars_df.columns.name = 'n_star'
sns.set(rc={'figure.figsize':(24,12)})
g = sns.lineplot(data=M_rand_self_n_stars_df)
g.set_ylabel('M_rand_self_n_stars')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# One column per n_star, one row per 250-game training epoch.
M_opt_self_n_stars_df = pd.DataFrame(M_opt_self_n_stars)
M_opt_self_n_stars_df.index.name = 'epochs by 250'
# Name the column axis on the frame itself rather than on a transposed temporary.
M_opt_self_n_stars_df.columns.name = 'n_star'
sns.set(rc={'figure.figsize':(24,12)})
g = sns.lineplot(data=M_opt_self_n_stars_df)
g.set_ylabel('M_opt_self_n_stars')

# Q8 (answer)
For rest of this section, use $\epsilon(n)$ in Equation 1 with different values of $n^∗$ – instead of fixing $\epsilon$.
Question 8. After every 250 games during training, compute the ‘test’ $M_{opt}$ and $M_{rand}$ for your agents.
Does decreasing $\epsilon$ help training compared to having a fixed $\epsilon$? What is the effect of $n^∗$?
Expected answer: A figure showing $M_{opt}$ and $M_{rand}$ over time for different values of speeds of $n^∗$ (caption
length < 100 words).

(jl, April 22 8:00 pm) **Just like in Q7, I think this is buggy** because very little learning seems to be going on. I would expect the agent here to learn at least as well as when playing a random opponent.

# Q9
*Question 9. What are the highest values of $M_{opt}$ and $M_{rand}$ that you could achieve after playing 20’000 games?*

**TODO: Answer once sure the above code is not buggy**

# Q10
*Question 10. For three board arrangements (i.e. states s), visualize Q-values of available actions (e.g.
using heat maps). Does the result make sense? Did the agent learn the game well?
Expected answer: A figure with 3 subplots of 3 different states with Q-values shown at available actions
(caption length < 200 words).*

**TODO: Answer once sure the above code is not buggy**

# 3 Deep Q-Learning
As our 2nd algorithm, we use Deep Q-Learning (DQN) combined with $\epsilon$-greedy policy. You can watch
again Part 1 of Deep Reinforcement Learning Lecture 1 for an introduction to DQN and Part 1 of
Deep Reinforcement Learning Lecture 2 (in particular slide 8) for more details. The idea in DQN is
to approximate Q-values by a neural network instead of a look-up table as in Tabular Q-learning. For
implementation, you can use ideas from the DQN tutorials of Keras and PyTorch.

# 3.2 Learning from experts
Implement the DQN algorithm. To check the algorithm, run a DQN agent with a fixed and arbitrary
$\epsilon \in [0,1)$ against Opt(0.5) for 20’000 games – switch the 1st player after every game.

## Question 11
*Plot average reward and average training loss for every 250 games during training. Does
the loss decrease? Does the agent learn to play Tic Tac Toe?
Expected answer: A figure with two subplots (caption length $<$ 50 words). Specify your choice of $\epsilon$.*

**Answer:** We use $\epsilon = 0.05$ as that is the value that Mnih et al. (2015) use.

In [None]:
import torch
from torch import nn
from torch.nn import functional as F
from torchsummary import summary 

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def state_to_torch(game_state: np.ndarray) -> torch.Tensor:
    """Encode a board as a (3, 3, 2) integer tensor placed on `device`.

    Plane 0 (index 0 on the last axis) is 1 where `game_state == 1` and 0
    elsewhere; plane 1 is 1 where `game_state == -1` and 0 elsewhere.

    Args:
        game_state: NumPy array with 9 cells (viewed as 3x3) whose entries are
            compared against 1 and -1.

    Returns:
        The two 0/1 planes concatenated along the last axis, moved to `device`.
    """
    # Encode the board that was passed in; reading a notebook-level variable
    # here would silently describe some other board (or fail on a fresh kernel).
    plane_of_ones = torch.from_numpy((game_state == 1).astype(int)).view(3,3,1)
    plane_of_minus_ones = torch.from_numpy((game_state == -1).astype(int)).view(3,3,1)
    return torch.cat((plane_of_ones, plane_of_minus_ones), dim=2).to(device)

def game_state_converts_to_pytorch_correctly():
    """Self-check of `state_to_torch` on one fixed board.

    Returns:
        True when both planes of the encoding match the masks computed directly
        from the board.

    Raises:
        AssertionError: if either plane differs from its expected mask.
    """
    game_state = np.array([ [ 1.,  1.,  1.], \
                            [ 1.,  0., -1.], \
                            [-1., -1.,  1.]])
    # Bring the encoding back to the CPU: the expected masks below are CPU
    # tensors, and comparing them with a tensor on another device would fail.
    torch_repre = state_to_torch(game_state).cpu()
    # Expected values come from the local board, not from any notebook global,
    # so the check is self-contained.
    expected_our_positions = torch.from_numpy((game_state == 1).astype(int))
    torch_repre_our_positions = torch_repre[:,:,0]
    assert (expected_our_positions == torch_repre_our_positions).all()
    expected_opponent_positions = torch.from_numpy((game_state == -1).astype(int))
    torch_repre_opponent_positions = torch_repre[:,:,1]
    assert (expected_opponent_positions == torch_repre_opponent_positions).all()
    return True

# Run the self-check when the cell executes so a broken encoding fails early.
assert game_state_converts_to_pytorch_correctly()

In [None]:
# This cell's code is taken from https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html?highlight=huber

from collections import namedtuple, deque
# One stored experience: the four fields are filled positionally by ReplayMemory.push.
Transition = namedtuple(
    'Transition',
    ('state', 'action', 'next_state', 'reward'),
)


class ReplayMemory:
    """Bounded store of `Transition` records with random sampling."""

    def __init__(self, capacity):
        # A deque with `maxlen` discards its oldest entry once it is full.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Build a `Transition` from the positional arguments and store it."""
        transition = Transition(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen at random."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Number of transitions currently held."""
        return len(self.memory)

In [None]:
class DQNet(nn.Module):
    """Fully connected Q-network: flattened board encoding in, one output per action.

    Args:
        n_inputs: number of features per sample after flattening; 18 matches the
            (3, 3, 2) board encoding used in this notebook.
        n_hidden: width of each of the two hidden layers.
        n_actions: number of outputs.
    """
    def __init__(self, n_inputs=18, n_hidden=128, n_actions=9):
        super().__init__()
        # Explicit nn.Linear layers instead of nn.LazyLinear: every weight exists
        # as soon as the network is constructed, so an optimizer can be attached
        # to `parameters()` without first running a dummy forward pass.
        self.net =  nn.Sequential(
            nn.Linear(n_inputs, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_actions))

    def forward(self, inp):
        """Flatten everything after dim 0 and run the result through the layers."""
        # Dim 0 is kept as the batch axis; `reshape` also accepts inputs whose
        # memory layout is not contiguous.
        input_in_1D = inp.reshape(inp.size(0), -1)
        return self.net(input_in_1D)

# Print a torchsummary overview of a fresh DQNet placed on `device`, passing
# (3, 3, 2) as the input size.
summary(DQNet().to(device), (3,3,2))

In [None]:
class DQN_Player(BasePlayer):
    """Player whose action values come from a `DQNet` (work in progress).

    The constructor wires up the network, optimizer and loss; move selection is
    not implemented yet.
    """
    def __init__(self):
        # `5 * 10e-4` evaluates to 5e-3 because 10e-4 == 1e-3; the intended value
        # is presumably 5 x 10^-4 -- confirm against the instructions PDF.
        LEARNING_RATE = 5e-4 # Given in the instructions PDF
        DISCOUNT_RATE_GAMMA = 0.99
        BATCH_SIZE = 64
        BUFFER_SIZE = 10_000
        # Keep the network on the same device as the tensors produced by
        # state_to_torch, which are moved to `device`.
        self.model = DQNet().to(device)
        # Keep the hyperparameters on the instance so later training code can
        # read them; as plain locals they were lost when __init__ returned.
        self.discount_rate_gamma = DISCOUNT_RATE_GAMMA
        self.batch_size = BATCH_SIZE
        self.buffer_size = BUFFER_SIZE
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=LEARNING_RATE)
        self.criterion = nn.SmoothL1Loss() # "When delta is set to 1, this loss is equivalent to SmoothL1Loss." (PyTorch HuberLoss documentation)

    def choose_move(self, grid, epsilon):
        """Pick a move for `grid` using exploration rate `epsilon` (not implemented).

        Raises:
            NotImplementedError: always, until the policy is written.
        """
        # TODO: finish the draft, which started from the list of possible moves
        # and a `random.random()` comparison. Still to decide: the comparison
        # threshold, whether the network's choice is restricted to empty cells,
        # and the format of the returned move.
        raise NotImplementedError("DQN_Player.choose_move is not implemented yet")



In [None]:
# Scratch check: show the 0/1 masks of the cells of `x` equal to 1 and to -1.
# NOTE(review): `x` is not defined in the visible part of the notebook -- confirm
# it still exists after Restart & Run All.
(x == 1).astype(int), (x == -1).astype(int)

In [None]:
# Scratch check: view each mask of `x` (cells == 1, cells == -1) as (3, 3, 1) and
# concatenate them along the last axis, then display the resulting tensor.
# NOTE(review): depends on `x` from outside this cell -- confirm it is defined.
y = torch.cat((torch.from_numpy((x == 1).astype(int)).view(3,3,1), torch.from_numpy((x == -1).astype(int)).view(3,3,1)), dim=2)
y

In [None]:
# Scratch check: show the two planes of `y` (last-axis index 0, then index 1).
y[:,:,0], y[:,:,1]

In [None]:
# Scratch check: plane 0 of `y` should equal the mask of the cells of `x` equal to 1.
# NOTE(review): depends on `x` and `y` from earlier cells -- confirm both exist.
(y[:,:,0] == torch.from_numpy((x==1).astype(int))).all()