In [56]:
import random

class Env:
    """Tabular Q-learning on a 3x3 grid, with the learner built into the env.

    A state is a ``[row, col]`` list with both coordinates in ``0..2``.
    Actions are the integers 0 (left), 1 (right), 2 (up) and 3 (down).
    Moves that would leave the grid are clamped to the border.  An episode
    ends when the agent enters cell ``[1, 1]`` or ``[2, 2]``.

    Attributes:
        state: current ``[row, col]`` of the agent (per instance).
        q_values: 9x4 table indexed ``[row * 3 + col][action]`` (per instance).
        rewards: reward received for *entering* cell ``[row][col]``.
        epsilon: starting exploration threshold.
        episode: number of completed ``reset()`` calls; lowers the threshold.
        learning_rate: step size used in the Q-value update.
        gamma: discount applied to the best next-state Q-value.
    """

    rewards = [[1, -1, -1], [-1, -10, -1], [-1, -1, 10]]
    epsilon = 1.0
    episode = 0
    learning_rate = 0.7
    gamma = 0.99

    # (row delta, col delta) for each action id: left, right, up, down.
    _moves = ((0, -1), (0, 1), (-1, 0), (1, 0))

    def __init__(self):
        # Both containers are mutated while learning, so they must belong to
        # the instance; a class-level list would be shared by every Env.
        self.state = [2, 0]
        self.q_values = [[0] * 4 for _ in range(9)]

    def get_position(self, state):
        """Return the Q-table row index (0..8) for a ``[row, col]`` state."""
        return state[0] * 3 + state[1]

    def select_action(self):
        """Pick an action id with an epsilon-greedy rule and log the choice.

        The threshold starts at ``epsilon`` and drops by 0.1 per completed
        episode.  A random sample above the threshold exploits the best known
        action for the current state (first one on ties); otherwise a
        uniformly random action is returned.
        """
        sample = random.random()
        epsilon_threshold = self.epsilon - (self.episode * 0.1)

        print('Epsilon Threshold: %f' % epsilon_threshold)
        print('Random Sample: %f' % sample)

        if sample > epsilon_threshold:
            print('EXPLOITATION')
            row = self.q_values[self.get_position(self.state)]
            return row.index(max(row))

        print('EXPLORATION')
        return random.randrange(4)

    def step(self, action):
        """Apply ``action``, update the Q-table and print a trace.

        Args:
            action: 0 (left), 1 (right), 2 (up) or 3 (down).

        Returns:
            ``(state, reward, done)`` where ``state`` is a copy of the state
            the agent was in *before* the move, ``reward`` is the reward of
            the cell entered, and ``done`` tells whether that cell ends the
            episode.

        Raises:
            ValueError: if ``action`` is not one of the four action ids.
        """
        if action not in (0, 1, 2, 3):
            raise ValueError('Invalid action')

        state = list(self.state)
        d_row, d_col = self._moves[action]

        # Build the next state as a fresh list (never alias self.state) and
        # clamp it so the agent stays on the 3x3 grid.
        next_state = [
            min(max(state[0] + d_row, 0), 2),
            min(max(state[1] + d_col, 0), 2),
        ]
        self.state = next_state

        reward = self.rewards[next_state[0]][next_state[1]]

        position = self.get_position(state)
        next_position = self.get_position(next_state)

        old_q = self.q_values[position][action]
        next_q = max(self.q_values[next_position])

        a = self.learning_rate

        # Q(s, a) <- (1 - a) * Q(s, a) + a * (r + gamma * max Q(s', .))
        new_q = ((1 - a) * old_q) + (a * (reward + self.gamma * next_q))
        self.q_values[position][action] = new_q

        print('Old Q-Value: %.2f' % old_q)
        print('Next Q-Value: %.2f' % next_q)
        print('(1 - a)old_q + a(r + g(next_q)))')
        print('(%.2f)(%.2f) + %.2f(%.2f + %.2f(%.2f))' % (1 - a, old_q, a, reward, self.gamma, next_q))
        print('%.2f + %.2f(%.2f)' % (((1-a) * old_q), a, (reward + self.gamma * next_q)))
        print('%.2f' % new_q)

        done = next_state in ([1, 1], [2, 2])

        self.print_state()
        self.print_q_table()

        return state, reward, done

    def reset(self):
        """Put the agent back on the start cell and count one more episode."""
        self.state = [2, 0]
        self.episode += 1

    def print_state(self):
        """Print the grid with an ``L`` drawn over the agent's current cell."""
        rows = [
            '|C| | |',
            '| |B| |',
            '| | |W|'
        ]

        # Each cell is two characters wide ("|x"), so column c sits at 2c + 1.
        row = list(rows[self.state[0]])
        row[self.state[1] * 2 + 1] = 'L'
        rows[self.state[0]] = ''.join(row)

        print('\nSTATE:')
        print(rows[0])
        print(rows[1])
        print(rows[2])

    def print_q_table(self):
        """Print one labelled line of Q-values per grid cell."""
        # Labels follow the row-major cell order used by get_position.
        positions = ['C', 1, 2, 3, 'B', 4, 5, 6, 'W']

        print('\nQ Table:\n   [L, R, U, D]')

        for position, row in zip(positions, self.q_values):
            print('%s: [%.2f, %.2f, %.2f, %.2f]' % (position, row[0], row[1], row[2], row[3]))
    
# Display labels for the action ids 0-3 that Env.step accepts.
action_name = ['Left', 'Right', 'Up', 'Down']
max_episodes = 11
rewards = []  # total reward collected in each episode, in order

env = Env()
env.print_state()
done = False

for i in range(max_episodes):
    print('\n=============\nEPISODE %d\n==============' % i)

    total_reward = 0

    # An episode always takes at least one step, so check `done` afterwards.
    while True:
        print('\n---------------')
        action = env.select_action()
        print('Action Selected: %s' % action_name[action])
        _, reward, done = env.step(action)
        total_reward += reward
        print('\nReward: %d' % reward)
        print('Done: %s' % done)
        if done:
            break

    rewards.append(total_reward)
    env.reset()
    done = False

print('\n===========')
print('FINAL STATS')
print('Rewards: %s' % rewards)


STATE:
|C| | |
| |B| |
|L| |W|

EPISODE 0

---------------
Epsilon Threshold: 1.000000
Random Sample: 0.573923
EXPLORATION
Action Selected: Up
Old Q-Value: 0.00
Next Q-Value: 0.00
(1 - a)old_q + a(r + g(next_q)))
(0.30)(0.00) + 0.70(-1.00 + 0.99(0.00))
0.00 + 0.70(-1.00)
-0.70

STATE:
|C| | |
|L|B| |
| | |W|

Q Table:
   [L, R, U, D]
C: [0.00, 0.00, 0.00, 0.00]
1: [0.00, 0.00, 0.00, 0.00]
2: [0.00, 0.00, 0.00, 0.00]
3: [0.00, 0.00, 0.00, 0.00]
B: [0.00, 0.00, 0.00, 0.00]
4: [0.00, 0.00, 0.00, 0.00]
5: [0.00, 0.00, -0.70, 0.00]
6: [0.00, 0.00, 0.00, 0.00]
W: [0.00, 0.00, 0.00, 0.00]

Reward: -1
Done: False

---------------
Epsilon Threshold: 1.000000
Random Sample: 0.564576
EXPLORATION
Action Selected: Right
Old Q-Value: 0.00
Next Q-Value: 0.00
(1 - a)old_q + a(r + g(next_q)))
(0.30)(0.00) + 0.70(-10.00 + 0.99(0.00))
0.00 + 0.70(-10.00)
-7.00

STATE:
|C| | |
| |L| |
| | |W|

Q Table:
   [L, R, U, D]
C: [0.00, 0.00, 0.00, 0.00]
1: [0.00, 0.00, 0.00, 0.00]
2: [0.00, 0.00, 0.00, 0.00]
3: 