#  Answers: Q-Learning Tutorial

---

This notebook contains the answers for prompts in the `Q-Learning (answers)` notebook found [here](https://github.com/js-fitz/Q-Learning/). Try each function out on your own, but come back here if you get frustrated. The answers in this notebook represent the bare minimum—the simplest possible answer for each prompt—so if you want to see the backend code including bonus features, animations and additional print statements, check out the `ql_functions.py` module file used throughout. Also visit the module file if you want to adapt your functions for bigger boards (above 3x3) — this raises unexpected and interesting challenges.

---

### 1.1 `new_board()`

In [1]:
def new_board(size=3):
    """Return an empty board: a flat list of ``size * size`` blank tiles."""
    n_tiles = size ** 2
    return [' ' for _ in range(n_tiles)]

---

### 1.2 `show()`

In [2]:
def show(b, helpers=False):
    """Print the board as a colored grid.

    ``b`` may be a list of tiles or a string. The side length is taken as
    the integer part of the square root of its length. When ``helpers`` is
    true, blank tiles are displayed as their 1-based position.
    """
    tiles = list(b)  # fresh list, so the caller's board is never modified
    size = int(math.sqrt(len(tiles)))

    if helpers:
        # replace each blank with its tile number (counting from 1)
        tiles = [str(pos) if tile == ' ' else tile
                 for pos, tile in enumerate(tiles, start=1)]

    def paint(tile):
        # wrap the tile in an ANSI color escape for visibility
        if 'X' in tile:
            return f"\x1b[31m{tile}\x1b[0m"   # 31=red
        if 'O' in tile:
            return f"\x1b[34m{tile}\x1b[0m"   # 34=blue
        return f"\x1b[37m{tile}\x1b[0m"       # 37=gray

    tiles = [paint(tile) for tile in tiles]

    # one horizontal rule is printed above and below every row
    rule = '—' * (4 * size + 1)
    for row_start in range(0, len(tiles), size):
        print(rule)
        for offset in range(size):
            # the backspace lets the next cell overwrite the shared border
            print(f'| {tiles[row_start + offset]} |', end='\b')
        print('|')
        # carriage return so the next row's top rule overwrites this one
        print(rule, end='\r')
    print()

---

### 1.3 `evaluate()`

First we set up a helper function to define the minimum number of tiles in a row required to win. This step is not strictly necessary, but it sets us up for easier customization of the environment later on.

In [3]:
def check_for_win(b_line, min_contig=3):
    """Return the player holding ``min_contig`` contiguous tiles in ``b_line``.

    Args:
        b_line: one line of the board (a list of tiles or a string).
        min_contig: number of identical tiles in a row required to win.

    Returns:
        ``'X'`` or ``'O'`` if that player fills a window of ``min_contig``
        consecutive tiles, otherwise ``False``. ``'X'`` is checked first.

    Raises:
        ValueError: if ``min_contig`` is smaller than 1 (an empty window
            would otherwise count as a win for ``'X'``).
    """
    if min_contig < 1:
        raise ValueError('min_contig must be at least 1')

    for p in 'XO':
        # slide a window of width min_contig along the line
        for start in range(len(b_line) - min_contig + 1):
            window = b_line[start:start + min_contig]
            # the window width and the required count both follow min_contig
            if window.count(p) == min_contig:
                return p
    return False

Then we use `evaluate` to run `check_for_win` across lines in the board:

In [4]:
def evaluate(b):
    """Report the current state of board ``b``.

    Every column, every row and the two corner-to-corner diagonals are
    passed to check_for_win() in that order.

    Returns:
        ``'X Wins!'`` or ``'O Wins!'`` for the first winning line found,
        ``'Continue'`` if nobody has won and blank tiles remain,
        ``'Tie!'`` if nobody has won and the board is full.
    """
    n_tiles = len(b)
    size = int(math.sqrt(n_tiles))  # row / column length
    row_starts = range(0, n_tiles, size)  # index of the first tile of each row

    def candidate_lines():
        # yielded lazily so each line is only built when it is about to be checked
        for col in range(size):                       # verticals
            yield [b[start + col] for start in row_starts]
        for start in row_starts:                      # horizontals
            yield [b[start + col] for col in range(size)]
        # down-right diagonal: step one column right per row
        yield [b[start + start // size] for start in row_starts]
        # up-right diagonal: from the bottom-left corner to the top-right
        yield [b[col * (size - 1)] for col in range(size, 0, -1)]

    for line in candidate_lines():
        winner = check_for_win(line)
        if winner:
            return winner + ' Wins!'

    # no winner: keep playing while any blank tile is left
    return 'Continue' if b.count(' ') > 0 else 'Tie!'

---

### 2.1 `flip_board()`

In [5]:
def flip_board(b_key):
    """Return ``b_key`` with every X and O exchanged, in upper case."""
    swapped = b_key.translate(str.maketrans('XO', 'OX'))
    return swapped.upper()

---

### 2.2 `get_move()`

In [6]:
def get_move(b, epsilon=.5, player='X', init_q=.3):
    """Choose a tile index for ``player`` with an epsilon-greedy policy.

    Args:
        b: the board, as a list of tiles or a string.
        epsilon: a random move is made when ``epsilon`` exceeds a draw
            from ``random.uniform(0, 1)``.
        player: ``'X'`` or ``'O'``; for ``'O'`` the state key is flipped
            with flip_board() before the Q table is consulted.
        init_q: Q value given to every open tile of a state that is not
            yet in the table.

    Returns:
        The index of the chosen blank tile.

    Side effects:
        Adds the state to the global ``q_table`` if it is missing.
    """
    global q_table  # work with the global Q table

    state = ''.join(b)  # accept b as a string or a list
    if player == 'O':
        state = flip_board(state)

    # indices of the blank tiles, i.e. the legal moves
    open_tiles = [pos for pos, tile in enumerate(b) if tile == ' ']

    if state not in q_table:
        # a new state gets two independent tables, Q1 and Q2
        q_table[state] = {
            version: dict.fromkeys(open_tiles, init_q)
            for version in ('Q1', 'Q2')
        }

    # score each move by the mean of its value across the stored tables
    entry = q_table[state]
    averaged = {
        pos: sum(table[pos] for table in entry.values()) / 2
        for pos in open_tiles
    }

    # e-greedy decision
    explore = epsilon > random.uniform(0, 1)
    if explore:
        return random.choice(open_tiles)     # random move
    return max(averaged, key=averaged.get)   # smart move

---

### 3.1 `simulate_game()`

In [7]:
def simulate_game(epsilon_x=1, epsilon_o=1):
    """Play one game between X and O and record every move.

    Args:
        epsilon_x: epsilon passed to get_move() for X's turns.
        epsilon_o: epsilon passed to get_move() for O's turns.

    Returns:
        ``(steps, outcome)`` where ``steps`` is a list of
        ``{'state': <board string before the move>, 'move': <tile index>}``
        dicts in play order, and ``outcome`` is the first character of the
        terminal result returned by evaluate().
    """
    global size  # check current environment settings

    board = new_board(size)
    history = []
    turn_order = (('X', epsilon_x), ('O', epsilon_o))  # X always moves first
    turn = 0

    while True:
        outcome = evaluate(board)

        # terminal state: anything other than 'Continue'
        if 'C' not in outcome:
            return history, outcome[0]

        # non-terminal state: the player to move picks a tile with their epsilon
        mover, eps = turn_order[turn % 2]
        choice = get_move(board, eps, mover)

        # record the state as it was before the move, then apply the move
        history.append({'state': ''.join(board), 'move': choice})
        board[choice] = mover
        turn += 1

---

### 3.2 `get_new_q()`

In [8]:
def get_new_q(current_q, reward, max_future_q):
    """Blend ``current_q`` with the new target using the global ``lrate``.

    The target is ``reward + discount * max_future_q``; the result keeps a
    ``1 - lrate`` share of the old value and takes an ``lrate`` share of
    the target. ``lrate`` and ``discount`` are read from module scope.
    """
    retained = (1 - lrate) * current_q
    target = reward + discount * max_future_q
    return retained + lrate * target

---

### 3.3 `backpropagate()`

In [9]:
def backpropagate(steps, winner, alpha=.9, wait_seconds=False):
    """Update the global ``q_table`` from the moves of one finished game.

    For each player in turn, that player's own moves are walked from last
    to first, and the Q value of each (state, move) pair is replaced by
    the result of get_new_q().

    Args:
        steps: list of ``{'state': ..., 'move': ...}`` dicts in play order,
            with X's move first.
        winner: ``'X'``, ``'O'``, or ``'T'`` for a tie.
        alpha: base of the reward decay; a move ``n`` player-moves before
            the player's last one gets a reward of magnitude
            ``alpha ** (n + 1)``.
        wait_seconds: not used anywhere in this function.

    Returns:
        None. The work is done by mutating ``q_table``.

    Note:
        ``size`` is read from module scope to work out who moved last in a
        tie. Every state visited is expected to already be a key of
        ``q_table``.
    """
    
    global q_table # make updates to global q table
        
    # backprop for each player
    for player in ['X', 'O']:
        p_steps = steps.copy()
            
        # --- isolate the target player's moves ---
        # if O, drop X's first move so the list starts on an O move
        if player=='O':  p_steps = p_steps[1:]
        # in a tie, who moved last depends on whether the tile count is odd or even
        if winner == 'T':
            # odd tile count: X moved last, so drop it from O's list
            if player =='O' and size**2%2==1: p_steps = p_steps[:-1]
            # even tile count: O moved last, so drop it from X's list
            if player =='X' and size**2%2==0: p_steps = p_steps[:-1]
        # if loser, drop opponent's last move (the winning move)
        elif player!=winner : 
            p_steps = p_steps[:-1] 
        # the list now ends on one of the player's moves: take every second
        # step walking backwards, i.e. the player's moves in reverse order
        p_steps = p_steps[::-2]
        
        # iterate backwards over steps (where player moved);
        # n_steps_left == 0 is the player's final move
        for n_steps_left, step in enumerate(p_steps):
            # extract state and move from step
            state, move = step['state'], step['move']
            # select random q table version to update
            qv = random.choice(['Q1', 'Q2']) 
            # reverse tiles for O (same flip that get_move() applies to O's state key)
            if player=='O': state = flip_board(state) 

            # --- define key variables for get_new_q() ---
            # reward shrinks the further the move is from the end of the game
            reward = alpha**(n_steps_left+1)            
            if winner=='T':
                reward *= -.1 # slight penalty for tie
            elif player!=winner:
                reward *= -1 # full penalty for hard loss
            
            # max_future_q:
            # if mid-game move, use the best value in the state of the player's next move
            # (p_steps is reversed, so the next move sits at the previous index)
            if n_steps_left>0: 
                future = p_steps[n_steps_left-1]['state']
                if player=='O': future = flip_board(future)
                max_future_q = max(q_table[future][qv].values())
                
            # if final move, specify max_future_q manually:
            elif player==winner: max_future_q = 1 
            elif winner=='T': max_future_q = .5
            else: max_future_q = 0
            
            current_q = q_table[state][qv][move]

            # --- update the Q table ---
            new_q = get_new_q(current_q, reward, max_future_q)   
            # overwrite target table with new q
            q_table[state][qv][move] = new_q 
                
    return

---

### 4.1 `train_agent()`

We include a plotting function using the `Animation` module from `matplotlib` in our agent training, so take a look at the `ql_functions.py` file for the full code. The following is a simplified example that achieves a similar effect, without the live plotting.

In [10]:
def train_agent(iters):
    """Train the Q table over three equally sized batches of simulated games.

    The batches are, in order: X learning against a random O, O learning
    against a random X, and both learning against each other. A learning
    player's epsilon shrinks linearly from 1 over its batch; a random
    player's epsilon stays at 1.

    Args:
        iters: total number of games; each batch plays ``int(iters / 3)``
            of them. Values below 3 give empty batches, which are reported
            as 0 simulations.

    Returns:
        None. Each game is simulated with simulate_game() and fed to
        backpropagate(); progress is printed after every batch.
    """
    # define game types
    game_types = {
        'Training X (random O)':
            {'random X': False, 'random O': True},
        'Training O (random X)':
            {'random X': True, 'random O': False},
        'Training both (versus)':
            {'random X': False, 'random O': False},
    }

    # play 1/3 of iters per game type; clamp so the report is never negative
    batch_size = max(int(iters / 3), 0)

    # iterate through game types
    for game_type, params in game_types.items():
        for i in range(batch_size):
            # epsilon for a learning player: shrinks from 1 over one batch
            decayed = 1 - (i / (iters / 3))
            epsilon_x = 1 if params['random X'] else decayed
            epsilon_o = 1 if params['random O'] else decayed

            # simulate and backpropagate
            steps, winner = simulate_game(epsilon_x, epsilon_o)
            backpropagate(steps, winner)

        # report the batch size rather than the loop variable, which is
        # unbound when the batch is empty
        print(f'{game_type}: | Completed {batch_size} simulations.')

    print('Training completed.')

For functions with additional print statements, `matplotlib` animations, and helper functions for compatibility with bigger board sizes, please see the `ql_functions.py` module in this repo.

---

**Testing:**

In [11]:
import time
import math
import random

# Q table shared by get_move() and backpropagate(); starts empty and
# get_move() adds each state the first time it is seen
q_table = {}
# board side length, read by simulate_game() and backpropagate()
size=3
# learning rate and discount factor, read by get_new_q()
lrate = .1
discount = .9

# play 3000 training games (1000 per game type)
train_agent(3000)

Training X (random O): | Completed 1000 simulations.
Training O (random X): | Completed 1000 simulations.
Training both (versus): | Completed 1000 simulations.
Training completed.
