In [None]:
from notebook_logging import log_progress

In [None]:
# %load "../qlearning.py"
import numpy as np
import numpy.random as rn

def updateQ(Q, s_t, s_tn, a, R, alpha, gamma):
    """
    Apply one Q-learning update to the entry Q[s_t, a] and return Q.

    The entry is assigned in place, so the returned object is the same
    table that was passed in.

    Parameters:
    Q -> Q matrix, indexed as Q[state, action]
    s_t -> state in which the action was taken
    s_tn -> new state
    a -> action
    R -> reward
    alpha -> weight of the new estimate; the old value keeps (1 - alpha)
    gamma -> factor applied to the best value found in the new state
    """
    # Value of the best action available from the new state.
    best_next = Q[s_tn, np.argmax(Q[s_tn])]
    previous = Q[s_t, a]
    Q[s_t, a] = (1 - alpha)*previous + alpha * (R + gamma*best_next)
    return Q

def choose_policy(state):
    """
    Return the index of the largest value in `state`, breaking ties at random.

    Parameters:
    state -> array holding one value per action for the current state.
    """
    # Every position that holds the maximum is a candidate.
    candidates = np.flatnonzero(state == np.amax(state))
    # Shuffle in place and take the first one, so ties are broken at random.
    rn.shuffle(candidates)
    return candidates[0]

In [None]:
def experiment(alpha = 0.01, gamma = 0.5, n_episodes = 5000, max_action = 100000, final_pun = 0.5, step_pun = 0.07):
    """
    Run tabular Q-learning on the 'FrozenLake8x8-v0' gym environment and
    collect one set of statistics per finished episode.

    Parameters:
    alpha -> forwarded to updateQ as its `alpha` argument
    gamma -> forwarded to updateQ as its `gamma` argument
    n_episodes -> number of episodes to run
    max_action -> maximum number of steps attempted in a single episode
    final_pun -> subtracted from the environment reward on the step that ends an episode
    step_pun -> subtracted from the environment reward on every other step

    Returns a dict of numpy arrays:
    "final rewards" -> shaped reward of the last step of each episode
    "steps" -> number of steps taken in each episode
    "cumulated rewards" -> sum of the shaped rewards of each episode

    An episode that is still running after max_action steps is abandoned and
    contributes no entry to the returned arrays.
    """
    # gym.make is called below, so the package itself has to be in scope,
    # not only its `wrappers` submodule.
    import gym
    from gym import wrappers

    Q = np.zeros((64, 4)) # indexed as Q[state, action]
    Rs = [] # Final rewards
    Crs = [] # Cumulative rewards
    Ts = [] # Steps per episode

    env = gym.make('FrozenLake8x8-v0')
    try:
        env = wrappers.Monitor(env, '/tmp/frozenlake-experiment-2', force=True)
        for i_episode in range(n_episodes):
            s_old = env.reset()
            acc_rew = 0

            for t in range(max_action):
                #env.render()
                policy = choose_policy(Q[s_old])
                s_new, reward, done, info = env.step(policy)
                # The step that ends the episode is shaped with final_pun,
                # every other step with step_pun.
                shaped = reward - (final_pun if done else step_pun)
                Q = updateQ(Q, s_old, s_new, policy, shaped, alpha, gamma)
                acc_rew += shaped
                if done:
                    Rs.append(shaped)
                    # t is a zero-based index, so the episode lasted t + 1 steps.
                    Ts.append(t + 1)
                    Crs.append(acc_rew)
                    break
                s_old = s_new
    finally:
        # Close the environment even if an episode raised.
        env.close()
    return {"final rewards": np.array(Rs), "steps": np.array(Ts), "cumulated rewards": np.array(Crs)}

In [None]:
# Hand-picked hyper-parameter sets. The only line that consumed them is the
# commented-out one below; the sweep that follows does not read this list.
configs = [
    dict(alpha=0.01, gamma=0.5, n_episodes=5000, max_action=100000, final_pun=0.5, step_pun=0.07),
    dict(alpha=0.05, gamma=0.3, n_episodes=10000, max_action=100000, final_pun=0.5, step_pun=0.007),
]

#res = [experiment(**config) for config in configs]

# Sweep alpha over a grid while every other hyper-parameter stays fixed.
# `config` is updated in place, so after the loop it holds the last alpha tried.
config = dict(alpha=0.01, gamma=0.5, n_episodes=5000, max_action=100000, final_pun=0.5, step_pun=0.07)
res = []

for alpha in np.arange(0.01, 0.99, 0.01):
    config.update(alpha=alpha)
    res.append(experiment(**config))

In [None]:
# Same alpha grid as the sweep that filled `res`.
alphas = np.arange(0.01, 0.99, 0.01)
# One mean of the "final rewards" array per experiment in `res`.
reward_means = [np.mean(r["final rewards"]) for r in res]

In [None]:
# Mean final reward of each experiment against its alpha.
fig, ax = plt.subplots(figsize=(15,5))
ax.plot(alphas, reward_means)

In [None]:
fig, ax = plt.subplots(figsize=(15,5))

# One curve per experiment: final reward divided by the recorded "steps" value.
# The `i * 0.` term is a per-curve vertical offset whose factor is currently 0,
# so all curves are drawn on top of each other.
for i, r in enumerate(res):
    reward_per_step = r["final rewards"] / r["steps"]
    ax.plot(reward_per_step + i * 0.)

In [None]:
# NOTE(review): Rs and Ts are not assigned at notebook level in any cell shown
# here (inside experiment() they are local lists) - confirm where these arrays
# are meant to come from before relying on this cell.
fig, ax = plt.subplots(figsize=(15,5))

# Element-wise ratio of Rs to Ts, summarised on one line and then plotted.
fit = Rs / Ts
print("Rs %f Ts %f mFit %f" % (Rs.sum(), Ts.sum(), fit.mean()))
ax.plot(fit)

In [None]:
# NOTE(review): neither Prs nor Ts is assigned at notebook level in any cell
# shown here - confirm where these arrays are meant to come from before
# relying on this cell.
fig, ax = plt.subplots(figsize=(15,5))

# Element-wise ratio of Prs to Ts; its mean is printed and the series plotted.
fit = Prs / Ts
print("mFit %f" % (fit.mean(),))
ax.plot(fit)

In [None]:
# NOTE(review): Q is not assigned at notebook level in any cell shown here
# (experiment() keeps its table local) - confirm which table this should print.
# For every row of Q, take the column index of its largest entry.
greedy_actions = np.argmax(Q, axis=1)
print("Policy learned: {}".format(greedy_actions))

## Policies

$\gamma = 0.5,\alpha = 0.01$

[3 3 3 2 2 0 0 2 2 2 3 3 0 1 0 1 2 0 0 0 2 3 2 2 1 0 0 1 0 0 2 0 0 3 0 0 2
 1 3 0 0 0 0 1 3 0 0 2 0 0 0 1 0 2 0 2 2 1 0 0 1 0 3 0] <- 5000 episodes  
