In [None]:
from notebook_logging import log_progress

In [None]:
# %load "../sarsa.py"
import numpy as np
import numpy.random as rn

def updateQ(Q, s_t, s_tn, a, a_n, R, alpha, gamma):
    """
    Apply one SARSA-style update to the Q matrix, in place.

    The bootstrap term uses Q[s_tn, a_n], i.e. the value of the next
    action passed in by the caller, not the maximum over next actions.

    Parameters:
    Q -> Q matrix, indexed as Q[state, action]
    s_t -> current state
    s_tn -> new state
    a -> action taken in s_t
    a_n -> action associated with s_tn
    R -> reward
    alpha -> learning rate
    gamma -> discount factor

    Returns the same Q object that was passed in (it is modified in place).
    """
    previous_estimate = Q[s_t, a]
    target = R + gamma*Q[s_tn, a_n]
    # Blend the old estimate with the new target.
    Q[s_t, a] = (1 - alpha) * previous_estimate + alpha * target
    return Q

def choose_policy(state):
    """
    It chooses the best action given the current state, breaking ties at random.
    Parameters:
    state -> 1-D sequence or array holding one value per possible action
             in the current state.

    Returns the index of a maximal entry. When several entries share the
    maximum, one of them is picked by shuffling the candidate indexes.
    """
    # Convert first: with a plain list/tuple, `state == v_max` is a single
    # bool instead of an element-wise mask and the selection below breaks.
    values = np.asarray(state)
    v_max = np.amax(values)
    indexes = np.flatnonzero(values == v_max)
    rn.shuffle(indexes)
    return indexes[0]

def choose_policy_greedy(state,env,i_episode):
    """
    Pick the action with the highest noisy value.

    Standard-normal noise, scaled by 1/(i_episode+1), is added to the
    action values before taking the argmax, so the noise shrinks as the
    episode index grows.

    Parameters:
    state -> array of action values for the current state
    env -> environment; only env.action_space.n is read
    i_episode -> index of the current episode
    """
    n_actions = env.action_space.n
    noise_scale = 1./(i_episode+1)
    noisy_values = state + np.random.randn(1,n_actions)*noise_scale
    return np.argmax(noisy_values)

def gen_policy(Q):
    """
    Build a policy from a Q matrix: one action per row (state) of Q,
    each selected with choose_policy.
    """
    policy = []
    for action_values in Q:
        policy.append(choose_policy(action_values))
    return policy

In [None]:
import gym

def experiment(alpha = 0.01, gamma = 0.5, n_episodes = 5000, max_action = 100000, final_pun = 0.5, step_pun = 0.07):
    """
    Execute an experiment on FrozenLake8x8-v0 given a configuration.

    Parameters:
    alpha -> learning rate
    gamma -> discount factor
    n_episodes -> number of episodes to play
    max_action -> maximum number of actions per episode
    final_pun -> amount subtracted from the reward of the terminating step
    step_pun -> amount subtracted from the reward of every other step

    Returns a dict with:
    "results" -> adjusted reward of the terminating step, per recorded episode
    "steps" -> loop index t of the terminating step, per recorded episode
    "scores" -> cumulative adjusted reward, per recorded episode
    "Q" -> the learned Q matrix (states x actions)
    "won" -> number of episodes whose terminating step had a raw reward of 1

    Episodes that use up max_action actions without the environment
    reporting `done` are not recorded in "results", "steps" or "scores".
    """
    Res = [] # Final results
    Scores = [] # Cumulative rewards (Scores)
    Steps = [] # Steps per episode

    won = 0

    env = gym.make('FrozenLake8x8-v0')
    # try/finally: release the environment even when an episode raises.
    try:
        # Every call reseeds NumPy's global RNG and the environment.
        np.random.seed(9)
        env.seed(9)
        # To record episodes, wrap the environment here:
        #   from gym import wrappers
        #   env = wrappers.Monitor(env, '/tmp/frozenlake-experiment-2', force=True)
        Q = np.zeros([env.observation_space.n,env.action_space.n])
        for _ in range(n_episodes):
            s_old = env.reset()
            acc_rew = 0
            # The policy is derived from Q once per episode and is not
            # refreshed while the episode is running.
            p = gen_policy(Q)
            for t in range(max_action):
                #env.render()
                policy = p[s_old]
                s_new, reward, done, info = env.step(policy)
                if done:
                    # Count the win on the raw reward, before the adjustment.
                    if(reward==1):
                        won+=1
                    reward = reward - final_pun
                    Q = updateQ(Q, s_old, s_new, policy, p[s_new], reward, alpha, gamma)
                    Res.append(reward)
                    Steps.append(t)
                    acc_rew += reward
                    Scores.append(acc_rew)
                    break
                else:
                    Q = updateQ(Q, s_old, s_new, policy, p[s_new], reward - step_pun, alpha, gamma)
                    s_old = s_new
                    acc_rew += reward - step_pun
    finally:
        env.close()

    return {"results": np.array(Res), "steps": np.array(Steps), "scores": np.array(Scores), "Q": Q, "won":won}

In [None]:
# Candidate configurations; only the commented-out list comprehension below uses them.
configs = [
    dict(alpha=0.01, gamma=0.5, n_episodes=5000, max_action=100000, final_pun=0.5, step_pun=0.07),
    dict(alpha=0.05, gamma=0.3, n_episodes=10000, max_action=100000, final_pun=0.5, step_pun=0.007),
]

np.random.seed(73)
# Alternative setting kept for reference:
#config = {"alpha": 0.5, "gamma": 1, "n_episodes": 50000, "max_action": 10000, "final_pun": 0.5, "step_pun": 0.07}
# Configuration actually run: no final or per-step punishment.
config = dict(alpha=0.8, gamma=.95, n_episodes=10000, max_action=200, final_pun=0, step_pun=0)
#res = [experiment(**config) for config in configs]
res = experiment(**config)

# Disabled sweep over the learning rate:
#for alpha in np.arange(0.01, 0.99, 0.01):
#    config["alpha"] = alpha
#    res.append(experiment(**config))

In [None]:
# Share of the configured episodes that were won, followed by the raw win count.
wins = res["won"]
print("Accuracy: ", wins / config["n_episodes"])
print(wins)

In [None]:
# %load "../mylibrary.py"
def ma(ts, q):
    """
    Centered moving average of a series.

    Parameters:
    ts -> sequence (list or 1-D array) of numbers
    q -> half-width of the window; every window spans 2*q + 1 samples

    Returns a list of len(ts) - 2*q averages (empty when ts is shorter than
    one full window). Element k is the mean of ts[k : k + 2*q + 1].
    """
    window = 2 * q + 1
    # The slice has to include ts[i + q]: stopping one sample earlier sums
    # only 2*q values while still dividing by 2*q + 1.
    return [sum(ts[i - q:i + q + 1]) / window for i in range(q, len(ts) - q)]

In [None]:
q = 48

import matplotlib.pyplot as plt
%matplotlib inline

# Scores
x = range(len(res["scores"])-2*q)
plt.figure(figsize=(15,5))
plt.plot(x, ma(res["scores"], q))

# Steps
x = range(len(res["scores"])-2*q)
plt.figure(figsize=(15,5))
plt.plot(x, ma(res["steps"],q))

# Steps
x = range(np.max(res["steps"]) + 1)
plt.figure(figsize=(15,5))
y = np.zeros(np.max(res["steps"]) + 1)
for i in range(len(res["steps"])):
    y[(res["steps"][i])] += 1 
plt.plot(x,y)

In [None]:
# Highest-valued action of every state (row) of the learned Q matrix.
learned_policy = np.argmax(res["Q"], axis=1)
print("Policy learned: {}".format(learned_policy))

In [None]:
# Learning rates for a sweep over alpha; the sweep itself is currently commented out.
alphas = np.arange(0.01, 0.99, 0.01)
# Meant to hold one mean reward per run; stays empty while the loop below is disabled.
reward_means = []
# NOTE(review): `res` is currently a single result dict rather than a list, and the
# dict returned by experiment() has no "final rewards" key ("results" looks like the
# intended one) - confirm before re-enabling.
#for r in res:
#    reward_means.append(np.mean(r["final rewards"]))

## Policies

Policy learned with $\gamma = 1$, $\alpha = 0.5$ after 10k episodes (one action index per state):

[2 1 1 1 0 0 3 0 1 1 1 1 1 1 1 0 1 1 2 0 1 1 1 1 1 2 1 2 1 0 1 1 1 2 2 0 1
 2 1 1 1 0 0 0 1 2 0 1 1 0 3 1 0 1 0 2 2 2 2 0 0 1 1 0]