In [None]:
from __future__ import print_function
import numpy as np
import sys
from collections import defaultdict

import gym
from gym import envs
from gym import wrappers

In [None]:
def epsilon_greedy_policy(Q, state, num_actions, epsilon):
    """Return the epsilon-greedy action distribution for ``state``.

    Args:
        Q: mapping from a state to an array-like of action values; it is
            indexed as ``Q[state]``.
        state: key used to look up the action values in ``Q``.
        num_actions: length of the returned probability vector.
        epsilon: exploration weight; ``epsilon / num_actions`` is given to
            every action and the remaining ``1 - epsilon`` is added to the
            greedy one.

    Returns:
        A float NumPy array of length ``num_actions`` holding the action
        probabilities.
    """
    # Start from a uniform share of the exploration weight for every action.
    probabilities = np.ones(num_actions, dtype=float)
    probabilities *= epsilon
    probabilities /= num_actions

    # np.argmax returns the first maximal index, so ties go to the lowest action.
    greedy_action = np.argmax(Q[state])
    probabilities[greedy_action] = probabilities[greedy_action] + (1 - epsilon)
    return probabilities

In [None]:
def learn(envi, num_episodes, Q=None, discount_factor = 0.8, lambda_factor = 0.9, replacing=True):
    """Train a tabular action-value function with SARSA(lambda).

    Every episode follows an epsilon-greedy policy derived from ``Q`` and
    updates ``Q`` online using eligibility traces. Both the step size
    ``alpha`` and the exploration rate ``epsilon`` are annealed from the
    episode index: ``alpha = 1 - i/num_episodes`` and
    ``epsilon = (1 - i/num_episodes) ** 2``. On the last episode both are 0,
    so that episode is greedy and leaves ``Q`` unchanged.

    Args:
        envi: environment exposing ``action_space.n``, ``reset()`` returning
            the initial state, and ``step(action)`` returning
            ``(next_state, reward, done, info)``. States must be hashable
            because they are used as dictionary keys.
        num_episodes: number of episodes to run.
        Q: optional existing table mapping ``state -> array of action
            values``; it is updated in place. When omitted, a fresh
            zero-initialised ``defaultdict`` is created for this call.
        discount_factor: discount applied to the next state-action value.
        lambda_factor: trace-decay parameter; traces are multiplied by
            ``discount_factor * lambda_factor`` after every step.
        replacing: if true use replacing traces (trace set to 1 on a visit),
            otherwise accumulating traces (trace incremented by 1).

    Returns:
        The (possibly newly created) ``Q`` table.
    """
    num_actions = envi.action_space.n
    if Q is None:
        # Built per call: a default-argument table would be shared between
        # calls, and its factory could not see this call's ``envi``.
        Q = defaultdict(lambda: np.zeros(num_actions))
    trace_decay = discount_factor * lambda_factor

    for i_episode in range(1,num_episodes+1):
        if (i_episode%1000)==0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        # (state, action) pairs seen so far in this episode; only these can
        # carry a non-zero trace, so only these are updated.
        visited = set()
        E = defaultdict(lambda: np.zeros(num_actions))
        # float() keeps this a true division on Python 2 as well.
        remaining = 1 - float(i_episode) / num_episodes
        alpha = remaining
        epsilon = remaining ** 2

        # Initial state and first action drawn from the epsilon-greedy policy.
        state = envi.reset()
        policy = epsilon_greedy_policy(Q, state, num_actions, epsilon)
        action = np.random.choice(num_actions, p=policy)
        while True:
            visited.add((state, action))
            next_state, reward, done, _ = envi.step(action)
            policy = epsilon_greedy_policy(Q, next_state, num_actions, epsilon)
            next_action = np.random.choice(num_actions, p=policy)
            # On-policy TD error: bootstraps from the action actually chosen next.
            # NOTE(review): the bootstrap term is also used when ``done`` is
            # True; confirm that is intended for a user-supplied ``Q``.
            delta = reward + discount_factor * Q[next_state][next_action] - Q[state][action]

            # Bump the trace of the current pair: replacing or accumulating.
            if replacing:
                E[state][action] = 1
            else:
                E[state][action] += 1

            # Credit the TD error to every visited pair in proportion to its
            # trace, then decay the trace. The pairs are independent, so the
            # iteration order does not affect the result.
            for (seen_state, seen_action) in visited:
                trace = E[seen_state][seen_action]
                Q[seen_state][seen_action] = Q[seen_state][seen_action] + alpha * delta * trace
                E[seen_state][seen_action] = trace_decay * trace
            state = next_state
            action = next_action
            if done:
                break
    return Q

In [None]:
def namelearn(env_name, num_episodes=1000000, outdir=None, seed=0):
    """Create the named Gym environment, train on it with ``learn`` and return Q.

    The environment is wrapped in ``wrappers.Monitor`` writing to ``outdir``
    and is always closed, even if training raises.

    Args:
        env_name: Gym environment id passed to ``envs.make``.
        num_episodes: number of training episodes forwarded to ``learn``.
        outdir: directory handed to the Monitor wrapper. When omitted, the
            original hard-coded location is used.
            NOTE(review): that default is an absolute, user-specific path and
            is labelled "Q_Learning" although training is done by ``learn``;
            callers on other machines should pass ``outdir`` explicitly.
        seed: value passed to ``env.seed``. NumPy's global RNG is not seeded
            here.

    Returns:
        The table returned by ``learn`` (trained with ``replacing=False``).
    """
    if outdir is None:
        outdir = "/Users/jacopo/openaigym/project/TD/results/Q_Learning/" + env_name + "/"

    env = envs.make(env_name)
    try:
        # video_callable=False / force=True are Monitor options; presumably
        # they disable video capture and overwrite earlier results -- confirm
        # against the Gym documentation.
        env = wrappers.Monitor(env, outdir, video_callable=False, force=True)
        env.seed(seed)
        return learn(envi=env, num_episodes=num_episodes, replacing=False)
    finally:
        # Release the environment whether or not training succeeded.
        env.close()

In [None]:
if __name__ == "__main__":
    # Script entry point: train on FrozenLake and keep the resulting table in Q1.
    Q1 = namelearn(env_name='FrozenLake-v0')