In [30]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gym


# Helper functions

In [34]:
def running_mean(x, N):
    """
    Moving average of ``x`` over a sliding window of ``N`` consecutive items.

    Implemented with a zero-prefixed cumulative sum: the total of every
    window is the difference of two prefix sums ``N`` positions apart.
    The result has ``len(x) - N + 1`` entries (only full windows are kept).

    https://stackoverflow.com/questions/13728392/moving-average-or-running-mean
    """
    # Prepending 0 makes prefix_sums[i] equal to the sum of the first i items.
    prefix_sums = np.insert(x, 0, 0).cumsum()
    window_totals = prefix_sums[N:] - prefix_sums[:-N]
    return window_totals / float(N)



# Q-learning

In [82]:
# Tabular Q-learning on the CliffWalking environment.
env = gym.make("CliffWalking-v0")
n_actions = env.action_space.n
state_dim = env.observation_space.n
q = np.zeros((state_dim, n_actions))  # one row per state, one column per action

# Hyperparameters.
gamma = 0.95          # discount factor
epsilon = 0.1         # probability of taking a random action
learning_rate = 0.8   # step size of each TD update

total_episodes = 500
rewards = []          # summed reward of every episode
states_visited = []   # not filled in by this cell

for episode in range(total_episodes):
    state, done, total_reward = env.reset(), False, 0

    while not done:
        # Epsilon-greedy behaviour policy: explore with probability epsilon,
        # otherwise act greedily with respect to the current estimates.
        if np.random.uniform(0, 1) <= epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q[state, :])

        next_state, reward, done, info = env.step(action)

        # Off-policy target: bootstrap from the best action value of the
        # successor state, regardless of the action taken next.
        td_target = reward + gamma * np.max(q[next_state, :])
        q[state, action] += learning_rate * (td_target - q[state, action])

        total_reward += reward
        state = next_state

    rewards.append(total_reward)

# Smooth the learning curve with a 10-episode moving average.
q_rewards = running_mean(rewards, 10)

# Sarsa

In [83]:
# Tabular Sarsa (on-policy TD control) on the CliffWalking environment.
env = gym.make("CliffWalking-v0")
n_actions = env.action_space.n
state_dim = env.observation_space.n
q = np.zeros((state_dim, n_actions))  # one row per state, one column per action

gamma = 0.95          # discount factor
epsilon = 0.1         # probability of taking a random action
learning_rate = 0.8   # step size of each TD update

total_episodes = 500
rewards = []          # summed reward of every episode

for episode in range(total_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    # Sarsa needs the first action before entering the loop. It is chosen
    # epsilon-greedily from `q`, the table this cell creates and updates
    # (the previous reference to an undefined `qtable` raised NameError).
    if np.random.uniform(0,1) > epsilon:
        action = np.argmax(q[state, :])
    else:
        action = env.action_space.sample()

    while not done:

        next_state, reward, done, info = env.step(action)

        # Pick the action that will actually be executed next; the update
        # below bootstraps from exactly this action (on-policy).
        if np.random.uniform(0,1) > epsilon:
            next_action = np.argmax(q[next_state, :])
        else:
            next_action = env.action_space.sample()

        q[state, action] += learning_rate * (reward + gamma * q[next_state, next_action] - q[state, action])

        total_reward += reward

        state = next_state
        action = next_action

    rewards.append(total_reward)

# Smooth the learning curve with a 10-episode moving average.
sarsa_rewards = running_mean(rewards, 10)

# Expected Sarsa

In [85]:
# Tabular Expected Sarsa on the CliffWalking environment.
env = gym.make("CliffWalking-v0")
n_actions = env.action_space.n
state_dim = env.observation_space.n
q = np.zeros((state_dim, n_actions))  # one row per state, one column per action

gamma = 0.95          # discount factor
epsilon = 0.1         # probability of taking a random action
learning_rate = 0.8   # step size of each TD update

total_episodes = 500
rewards = []          # summed reward of every episode

for episode in range(total_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    if np.random.uniform(0,1) > epsilon:
        action = np.argmax(q[state, :])
    else:
        action = env.action_space.sample()

    while not done:
        next_state, reward, done, info = env.step(action)

        if np.random.uniform(0,1) > epsilon:
            next_action = np.argmax(q[next_state, :])
        else:
            next_action = env.action_space.sample()

        # Expected action value of next_state under the epsilon-greedy policy
        # used above: with probability (1 - epsilon) the greedy action is
        # taken, with probability epsilon an action is sampled from the
        # action space, so
        #   E[q] = (1 - epsilon) * max_a q + epsilon * mean_a q.
        # A plain mean would be the expectation under a purely random policy.
        greedy_value = np.max(q[next_state, :])
        uniform_value = np.mean(q[next_state, :])
        qexp = (1 - epsilon) * greedy_value + epsilon * uniform_value

        q[state, action] += learning_rate * (reward + gamma * qexp - q[state, action])

        total_reward += reward

        state = next_state
        action = next_action

    rewards.append(total_reward)

# Smooth the learning curve with a 10-episode moving average.
expected_sarsa_rewards = running_mean(rewards, 10)

# Double Q-learning

In [89]:
# Tabular Double Q-learning on the CliffWalking environment.
env = gym.make("CliffWalking-v0")
n_actions = env.action_space.n
state_dim = env.observation_space.n
# Two independent value tables; each update changes only one of them.
q1 = np.zeros((state_dim, n_actions))
q2 = np.zeros((state_dim, n_actions))

# Hyperparameters.
gamma = 0.95          # discount factor
epsilon = 0.1         # probability of taking a random action
learning_rate = 0.8   # step size of each TD update

total_episodes = 500
rewards = []          # summed reward of every episode
states_visited = []   # not filled in by this cell

for episode in range(total_episodes):
    state, done, total_reward = env.reset(), False, 0

    while not done:
        # Epsilon-greedy behaviour policy on the average of both tables.
        if np.random.uniform(0, 1) <= epsilon:
            action = env.action_space.sample()
        else:
            qavg = (q1 + q2)/2
            action = np.argmax(qavg[state, :])

        next_state, reward, done, info = env.step(action)

        # Coin flip: one table is updated ("learner") while the other one
        # ("evaluator") scores the learner's greedy action in next_state.
        if np.random.uniform(0, 1) > 0.5:
            learner, evaluator = q1, q2
        else:
            learner, evaluator = q2, q1

        a = np.argmax(learner[next_state, :])
        td_target = reward + gamma * evaluator[next_state, a]
        # `learner` aliases q1 or q2, so this writes into that table in place.
        learner[state, action] += learning_rate * (td_target - learner[state, action])

        total_reward += reward
        state = next_state

    rewards.append(total_reward)

# Smooth the learning curve with a 10-episode moving average.
double_q_rewards = running_mean(rewards, 10)

# Double Sarsa

In [88]:
# Tabular Double Sarsa on the CliffWalking environment.
env = gym.make("CliffWalking-v0")
n_actions = env.action_space.n
state_dim = env.observation_space.n
# Two independent value tables; each update changes only one of them.
q1 = np.zeros((state_dim, n_actions))
q2 = np.zeros((state_dim, n_actions))

gamma = 0.95          # discount factor
epsilon = 0.1         # probability of taking a random action
learning_rate = 0.8   # step size of each TD update

total_episodes = 500
rewards = []          # summed reward of every episode

for episode in range(total_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    # First action: epsilon-greedy on the average of both tables.
    if np.random.uniform(0,1) > epsilon:
        qavg = (q1 + q2)/2
        action = np.argmax(qavg[state, :])
    else:
        action = env.action_space.sample()

    while not done:
        next_state, reward, done, info = env.step(action)

        # The next action must be chosen for next_state, the state it will
        # be executed in. Indexing the current `state` here made both the
        # behaviour and the bootstrap target use an action that was greedy
        # for the wrong state.
        if np.random.uniform(0,1) > epsilon:
            qavg = (q1 + q2)/2
            next_action = np.argmax(qavg[next_state, :])
        else:
            next_action = env.action_space.sample()

        # Coin flip: update one table using the other table's estimate of
        # the (next_state, next_action) pair.
        if np.random.uniform(0, 1) > 0.5:
            q1[state, action] += learning_rate * (reward + gamma * q2[next_state, next_action] - q1[state, action])
        else:
            q2[state, action] += learning_rate * (reward + gamma * q1[next_state, next_action] - q2[state, action])

        total_reward += reward

        state = next_state
        action = next_action

    rewards.append(total_reward)

# Smooth the learning curve with a 10-episode moving average.
double_sarsa_rewards = running_mean(rewards, 10)

# Double Expected Sarsa

In [90]:
# Tabular Double Expected Sarsa on the CliffWalking environment.
env = gym.make("CliffWalking-v0")
n_actions = env.action_space.n
state_dim = env.observation_space.n
# Two independent value tables; each update changes only one of them.
q1 = np.zeros((state_dim, n_actions))
q2 = np.zeros((state_dim, n_actions))

gamma = 0.95          # discount factor
epsilon = 0.1         # probability of taking a random action
learning_rate = 0.8   # step size of each TD update

total_episodes = 500
rewards = []          # summed reward of every episode

for episode in range(total_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    # First action: epsilon-greedy on the average of both tables.
    if np.random.uniform(0,1) > epsilon:
        qavg = (q1 + q2)/2
        action = np.argmax(qavg[state, :])
    else:
        action = env.action_space.sample()

    while not done:
        next_state, reward, done, info = env.step(action)

        # Greedy action of the behaviour policy in next_state. It is needed
        # both for choosing the next action and for the expectation below.
        # (Indexing the current `state` here selected an action that was
        # greedy for the wrong state.)
        qavg = (q1 + q2)/2
        greedy_next = np.argmax(qavg[next_state, :])

        if np.random.uniform(0,1) > epsilon:
            next_action = greedy_next
        else:
            next_action = env.action_space.sample()

        # Coin flip: update one table with the *other* table's expected value
        # of next_state under the epsilon-greedy behaviour policy:
        #   E[q] = (1 - epsilon) * q[greedy action] + epsilon * mean_a q.
        # A plain mean would be the expectation under a purely random policy.
        if np.random.uniform(0, 1) > 0.5:
            q2exp = (1 - epsilon) * q2[next_state, greedy_next] + epsilon * np.mean(q2[next_state, :])
            q1[state, action] += learning_rate * (reward + gamma * q2exp - q1[state, action])
        else:
            q1exp = (1 - epsilon) * q1[next_state, greedy_next] + epsilon * np.mean(q1[next_state, :])
            q2[state, action] += learning_rate * (reward + gamma * q1exp - q2[state, action])

        total_reward += reward

        state = next_state
        action = next_action

    rewards.append(total_reward)

# Smooth the learning curve with a 10-episode moving average.
double_expected_sarsa_rewards = running_mean(rewards, 10)