In [None]:
from notebook_logging import log_progress

In [None]:
# %load "../statistics.py"
def ma(ts, q):
    """
    Centered moving average of a sequence.
    Parameters:
    ts -> sequence of numbers (list or array supporting len() and slicing)
    q -> half-width of the window; every window holds 2*q + 1 samples
    Returns:
    a list with len(ts) - 2*q averages (empty when ts is shorter than one
    window); element k is the mean of ts[k] ... ts[k + 2*q].
    Raises:
    ValueError if q is negative.
    """
    if q < 0:
        raise ValueError("q must be non-negative")
    window = 2 * q + 1
    # The slice ends at i + q + 1 so that the window really contains 2*q + 1
    # samples, i.e. as many as the divisor assumes.
    return [sum(ts[i - q:i + q + 1]) / window for i in range(q, len(ts) - q)]

def accuracy(results):
    """
    Percentage of victories among the counted outcomes.
    Parameters:
    results -> two-element sequence: [0] number of losses, [1] number of victories
    """
    losses, wins = results[0], results[1]
    win_ratio = wins / (losses + wins)
    return win_ratio * 100

In [None]:
# %load "../qlearning.py"
import numpy as np
import numpy.random as rn

def updateQ(Q, state, new_state, action, reward, alpha, gamma):
    """
    Apply one Q-Learning update to Q[state, action] and return Q.
    Parameters:
    Q -> Q matrix (indexed as Q[state, action]); it is modified in place
    state -> state at time t
    new_state -> state at time t+1
    action -> action taken at time t
    reward -> reward received for the transition
    alpha -> learning rate
    gamma -> discount factor
    """
    # Greedy action at time t+1, used to bootstrap the target value.
    greedy_next = np.argmax(Q[new_state])
    bootstrap_target = reward + gamma*Q[new_state, greedy_next]
    Q[state, action] = (1 - alpha)*Q[state, action] + alpha * bootstrap_target
    return Q

def next_action1(state):
    """
    Greedy action selection with random tie-breaking.
    Parameters:
    state -> array of action values for the current state.
    Returns the index of a maximal entry; when several entries share the
    maximum, the tied indexes are shuffled and the first one is returned.
    """
    tied_best = np.arange(len(state))[state == np.amax(state)]
    rn.shuffle(tied_best)
    return tied_best[0]

def next_action2(state,i_episode):
    """
    Pick the argmax of the action values after adding Gaussian noise.
    Parameters:
    state -> array of action values for the current state.
    i_episode -> episode index; the noise is scaled by 1/(i_episode+1).
    """
    noise_scale = 1./(i_episode+1)
    perturbation = np.random.randn(1,len(state))*noise_scale
    return np.argmax(state + perturbation)

def next_action3(state,epsilon):
    """
    Epsilon-greedy action selection.
    Parameters:
    state -> array of action values for the current state.
    epsilon -> exploration threshold: a uniform draw in [0, 1) that is
               greater than epsilon selects the greedy action, otherwise a
               random action is returned.
    Returns the index of the chosen action, always in range(len(state)).
    """
    if np.random.uniform() > epsilon:
        # Greedy branch: ties between maximal entries are broken by shuffling.
        max_value = np.amax(state)
        max_indexes = np.arange(len(state))[state == max_value]
        rn.shuffle(max_indexes)
        return max_indexes[0]
    # Exploratory branch: one uniform draw per available action, take the
    # argmax. The number of draws follows len(state) instead of a fixed 4, so
    # the result is a valid index for any action count (and the random stream
    # is unchanged when there are exactly 4 actions).
    return np.argmax(np.random.uniform(0, 1, size=len(state)))

def get_epsilon(k,n):
    """
    Linearly decreasing epsilon, (n - k) / n, never lower than 0.01.
    Parameters:
    k -> current step/episode index
    n -> value of k at which the linear term reaches zero
    """
    linear = (n - k) / n
    # max() keeps `linear` unless the floor is strictly greater.
    return max(linear, 0.01)


def get_epsilon_exp(n):
    """
    Decaying epsilon, 1 / (n + 1), never lower than 0.01.
    Parameters:
    n -> current step/episode index
    """
    decayed = 1 / (n + 1)
    # max() keeps `decayed` unless the floor is strictly greater.
    return max(decayed, 0.01)
    

In [None]:
import gym
import time

def experiment(alpha = 0.01, gamma = 0.5, n_episodes = 5000, max_action = 100000, final_pun = 0.5, step_pun = 0.07, default_policy = False, policy = np.zeros(64), render = False):
    """
    Execute an experiment on 'FrozenLake-v0' given a configuration.
    Parameters:
    alpha -> learning rate
    gamma -> discount factor
    n_episodes -> number of episodes to play
    max_action -> maximum number of actions per episode
    final_pun -> amount subtracted from the final reward when that reward is 0
    step_pun -> punishment subtracted from the reward of each non-final step
    default_policy -> if True, actions are read from `policy` instead of being
                      chosen epsilon-greedily from Q
    policy -> array indexed by state giving the action to take (used only
              when default_policy is True)
    render -> if True, render the environment and sleep one second before
              each action
    Returns:
    dict with
      "results" -> array [losses, victories] counted on episode end
      "steps"   -> for each finished episode, the zero-based index of its
                   final action
      "scores"  -> for each finished episode, the cumulative (adjusted) reward
      "Q"       -> the Q matrix after the last update
    Episodes that use up max_action without the environment signalling the
    end add nothing to "results", "steps" or "scores".
    The Q matrix is updated after every step, also when default_policy is True.
    """

    Res = [0,0] # array of results accumulator: {[0]: Loss, [1]: Victory}
    Scores = [] # Cumulative rewards
    Steps = [] # Steps per episode

    env = gym.make('FrozenLake-v0')
    try:
        # Set seeds
        np.random.seed(91)
        env.seed(91)

        n_actions = env.action_space.n
        Q = np.zeros([env.observation_space.n, n_actions])
        for i_episode in range(n_episodes):
            state = env.reset()
            cumulative_reward = 0
            # Epsilon depends only on the episode index, so compute it once.
            epsilon = get_epsilon_exp(i_episode)

            for t in range(max_action):
                if render:
                    env.render()
                    time.sleep(1)

                if default_policy:
                    # Cast to int: the default policy array holds floats,
                    # which cannot be used to index Q in updateQ.
                    next_action = int(policy[state])
                elif np.random.uniform() > epsilon:
                    next_action = next_action1(Q[state])
                else:
                    # Random action: argmax over one uniform draw per action.
                    next_action = np.argmax(np.random.uniform(0, 1, size=n_actions))
                new_state, reward, end, info = env.step(next_action)
                if end:
                    # Count the outcome before the final reward is adjusted.
                    Res[int(reward)] += 1
                    if reward == 0:
                        reward = reward - final_pun
                    Q = updateQ(Q, state, new_state, next_action, reward, alpha, gamma)
                    Steps.append(t)
                    cumulative_reward += reward
                    Scores.append(cumulative_reward)
                    break
                else:
                    Q = updateQ(Q, state, new_state, next_action, reward - step_pun, alpha, gamma)
                    state = new_state
                    cumulative_reward += reward - step_pun
    finally:
        # Release the environment even if an episode raised.
        env.close()
    return {"results": np.array(Res), "steps": np.array(Steps), "scores": np.array(Scores), "Q": Q}

In [None]:
# Training run: epsilon-greedy Q-learning with no step or final punishment.
config = dict(
    alpha=0.8,
    gamma=.95,
    n_episodes=50000,
    max_action=100,
    final_pun=0,
    step_pun=0,
)
res = experiment(**config)

In [None]:
q = 100

import matplotlib.pyplot as plt
%matplotlib inline

# Scores
x = range(len(res["scores"])-2*q)
plt.figure(figsize=(15,5))
plt.plot(x, ma(res["scores"], q))
#plt.errorbar(x, res["scores"], fmt='ro', label="data", xerr=0.75, ecolor='black')

# Steps
x = range(len(res["steps"])-2*q)
plt.figure(figsize=(15,5))
plt.plot(x, ma(res["steps"],q))

# Steps distribution
plt.figure(figsize=(15,5))
kwargs = dict(histtype='stepfilled', alpha=0.3, density=True, bins=40)
plt.hist(res["steps"],**kwargs)
#plt.hist(res["steps"], len(res["steps"]), density=0, facecolor='green')

In [None]:
# Greedy policy: for every state (row of Q) take the action with the highest value.
learnt_policy = res["Q"].argmax(axis=1)
print("Policy learnt: ",learnt_policy)

In [None]:
# Average cumulative reward over the finished episodes of the last run.
res["scores"].mean()

In [None]:
# Evaluation run: actions are read from the learnt policy instead of being explored.
config = dict(
    alpha=0.8,
    gamma=.95,
    n_episodes=5000,
    max_action=1000,
    final_pun=0,
    step_pun=0,
    default_policy=True,
    policy=learnt_policy,
)
res = experiment(**config)

In [None]:
# Percentage of victories among the counted outcomes of the last run.
win_percentage = accuracy(res["results"])
print(win_percentage)