In [30]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gym


# Helper functions

In [34]:
def running_mean(x, N):
    """Return the simple moving average of ``x`` over a sliding window of ``N`` samples.

    Uses the cumulative-sum trick described at
    https://stackoverflow.com/questions/13728392/moving-average-or-running-mean

    Parameters
    ----------
    x : array_like
        Sequence of numbers to smooth.
    N : int
        Window length; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Array of length ``len(x) - N + 1`` whose ``i``-th entry is the mean of
        ``x[i:i + N]``. The result is empty when ``N`` exceeds ``len(x)``.

    Raises
    ------
    ValueError
        If ``N`` is smaller than 1. Without this check a negative window would
        silently produce meaningless values, because the two slices below
        would swap roles and the divisor would be negative.
    """
    if N < 1:
        raise ValueError("window length N must be a positive integer, got %r" % (N,))
    # Prepend a zero so that cumsum[k] is the sum of the first k samples.
    cumsum = np.cumsum(np.insert(x, 0, 0))
    # Sum of each window = difference of two prefix sums that are N apart.
    return (cumsum[N:] - cumsum[:-N]) / float(N)



# Q-learning

In [70]:
# Tabular Q-learning on CliffWalking with an epsilon-greedy behaviour policy.
env = gym.make("CliffWalking-v0")
n_actions = env.action_space.n
state_dim = env.observation_space.n
qtable = np.zeros((state_dim, n_actions))

gamma = 0.90          # discount factor
# NOTE(review): with the test below, 0.9 means a random action is taken about
# 90% of the time; the other cells in this notebook use 0.1 - confirm intended.
epsilon = 0.9
learning_rate = 0.8   # step size of the TD update

total_episodes = 500
rewards = []
seen_states = set()   # distinct states encountered over the whole run

for episode in range(total_episodes):
    state = env.reset()
    seen_states.add(state)
    total_reward = 0
    done = False

    while not done:
        # Epsilon-greedy: explore with a random action, otherwise exploit.
        if np.random.uniform(0, 1) <= epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(qtable[state, :])

        next_state, reward, done, info = env.step(action)

        # Off-policy target: bootstrap from the best action in the next state.
        td_target = reward + gamma * np.max(qtable[next_state, :])
        td_error = td_target - qtable[state, action]
        qtable[state, action] += learning_rate * td_error

        total_reward += reward
        state = next_state
        seen_states.add(state)

    rewards.append(total_reward)

# Smooth the per-episode returns with a 10-episode moving average.
q_rewards = running_mean(rewards, 10)
states_visited = sorted(seen_states)
print(len(states_visited))
print(states_visited)

38
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 47]


# Sarsa

In [73]:
# SARSA (on-policy TD control) on CliffWalking with an epsilon-greedy policy.
env = gym.make("CliffWalking-v0")
n_actions = env.action_space.n
state_dim = env.observation_space.n
qtable = np.zeros((state_dim, n_actions))

gamma = 0.95          # discount factor
epsilon = 0.1         # probability of taking a random action
learning_rate = 0.8   # step size of the TD update

total_episodes = 500
rewards = []
# Start from an empty record so the count printed below reflects this run only
# and the cell does not depend on a list left in the kernel by another cell.
states_visited = []

for episode in range(total_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    if state not in states_visited:
        states_visited.append(state)

    # SARSA needs the first action chosen before entering the step loop.
    if np.random.uniform(0, 1) > epsilon:
        action = np.argmax(qtable[state, :])
    else:
        action = env.action_space.sample()

    while not done:
        next_state, reward, done, info = env.step(action)

        # Pick the action that will actually be executed next (on-policy).
        if np.random.uniform(0, 1) > epsilon:
            next_action = np.argmax(qtable[next_state, :])
        else:
            next_action = env.action_space.sample()

        # Bootstrap from the value of the (next_state, next_action) pair.
        qtable[state, action] += learning_rate * (
            reward + gamma * qtable[next_state, next_action] - qtable[state, action]
        )

        total_reward += reward

        state = next_state
        action = next_action

        if state not in states_visited:
            states_visited.append(state)

    rewards.append(total_reward)

# Smooth the per-episode returns with a 10-episode moving average.
sarsa_rewards = running_mean(rewards, 10)
print(len(states_visited))
states_visited.sort()
print(states_visited)

38
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 47]


# Expected Sarsa

In [None]:
# Expected SARSA on CliffWalking with an epsilon-greedy policy.
env = gym.make("CliffWalking-v0")
n_actions = env.action_space.n
state_dim = env.observation_space.n
qtable = np.zeros((state_dim, n_actions))

gamma = 0.99          # discount factor
epsilon = 0.1         # probability of taking a random action
learning_rate = 0.8   # step size of the TD update

total_episodes = 500
rewards = []

for episode in range(total_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        # Behave epsilon-greedily with respect to the current estimates.
        if np.random.uniform(0, 1) > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = env.action_space.sample()

        next_state, reward, done, info = env.step(action)

        # Expected value of the next state under the same epsilon-greedy policy:
        # the greedy action is taken with probability (1 - epsilon), and with
        # probability epsilon an action is sampled from the action space
        # (assumed uniform over all actions, greedy one included), which
        # contributes the plain mean of the row.
        next_q = qtable[next_state, :]
        expected_q = (1.0 - epsilon) * np.max(next_q) + epsilon * np.mean(next_q)

        # Bootstrap from the expectation instead of a single sampled next action;
        # this is what distinguishes Expected SARSA from plain SARSA.
        qtable[state, action] += learning_rate * (
            reward + gamma * expected_q - qtable[state, action]
        )

        total_reward += reward
        state = next_state

    rewards.append(total_reward)

# Stored under its own name so the SARSA curve in `sarsa_rewards` is not overwritten.
expected_sarsa_rewards = running_mean(rewards, 10)

# Double Q-learning

In [None]:
# Double Q-learning on CliffWalking: two tables, one selects the next action
# and the other evaluates it.
env = gym.make("CliffWalking-v0")
n_actions = env.action_space.n
state_dim = env.observation_space.n
qtable1 = np.zeros((state_dim, n_actions))
qtable2 = np.zeros((state_dim, n_actions))

gamma = 0.95          # discount factor
epsilon = 0.1         # probability of taking a random action
learning_rate = 0.8   # step size of the TD update

total_episodes = 500
rewards = []
states_visited = []

for episode in range(total_episodes):
    state = env.reset()
    done = False
    total_reward = 0
    if state not in states_visited:
        states_visited.append(state)
    while not done:
        if np.random.uniform(0, 1) > epsilon:
            # Act greedily on the mean of both estimates. Only the current
            # state's row is averaged; building the whole averaged table on
            # every step is unnecessary work and yields the same row.
            avg_q_row = (qtable1[state, :] + qtable2[state, :]) / 2
            action = np.argmax(avg_q_row)
        else:
            action = env.action_space.sample()

        next_state, reward, done, info = env.step(action)

        # Coin flip decides which table is updated on this step.
        if np.random.uniform(0, 1) > 0.5:
            # Table 1 selects the next action, table 2 evaluates it.
            a = np.argmax(qtable1[next_state, :])
            qtable1[state, action] += learning_rate * (
                reward + gamma * qtable2[next_state, a] - qtable1[state, action]
            )
        else:
            # Table 2 selects the next action, table 1 evaluates it.
            a = np.argmax(qtable2[next_state, :])
            qtable2[state, action] += learning_rate * (
                reward + gamma * qtable1[next_state, a] - qtable2[state, action]
            )

        total_reward += reward

        state = next_state
        if state not in states_visited:
            states_visited.append(state)

    rewards.append(total_reward)

# Smooth the per-episode returns with a 10-episode moving average.
double_q_rewards = running_mean(rewards, 10)
print(len(states_visited))
states_visited.sort()
print(states_visited)

# Double Sarsa

# Double Expected Sarsa