In [1]:
import gym
import numpy as np
import os

import matplotlib.pyplot as plt

In [2]:
def softmax(q_value, beta=1.0):
    """Return Boltzmann (softmax) weights for the given action values.

    Args:
        q_value: Array-like of action values. The maximum and the
            normalising sum are taken over the whole input.
        beta: Inverse temperature, must be non-negative. ``0`` yields a
            uniform distribution; larger values concentrate the weight on
            the largest entries.

    Returns:
        Array of non-negative weights that sum to one.
    """
    assert beta >= 0.0
    # Shifting by the maximum leaves the result unchanged mathematically but
    # keeps every exponent <= 0, so np.exp cannot overflow.
    shifted = np.subtract(q_value, np.max(q_value))
    weights = np.exp(np.multiply(beta, shifted))
    normaliser = np.sum(weights)
    return weights / normaliser

def select_a_with_softmax(curr_s, q_value, beta=1.0):
    """Sample an action for state ``curr_s`` from a softmax policy.

    Args:
        curr_s: Row index of the current state in ``q_value``.
        q_value: 2-D Q-value table indexed as ``q_value[state, action]``.
        beta: Inverse temperature forwarded to ``softmax``.

    Returns:
        Index of the sampled action (numpy integer).
    """
    prob_a = softmax(q_value[curr_s, :], beta=beta)
    cumsum_a = np.cumsum(prob_a)
    # Inverse-CDF sampling: side='right' yields the first index i with
    # u < cumsum_a[i], which is what the policy distribution prescribes.
    u = np.random.rand()
    a = np.searchsorted(cumsum_a, u, side='right')
    # Floating-point rounding can leave cumsum_a[-1] marginally below 1.0;
    # a draw at or above it would otherwise select no action at all, so
    # clamp to the last valid action index.
    return np.minimum(a, cumsum_a.size - 1)

def select_a_with_epsilon_greedy(curr_s, q_value, epsilon=0.1):
    """Pick an action for state ``curr_s`` with an epsilon-greedy policy.

    Args:
        curr_s: Row index of the current state in ``q_value``.
        q_value: 2-D Q-value table indexed as ``q_value[state, action]``.
        epsilon: Probability of replacing the greedy action with one drawn
            uniformly from all actions.

    Returns:
        Index of the chosen action.
    """
    # The greedy choice is evaluated first, then one uniform draw decides
    # whether to explore; a second draw happens only when exploring.
    greedy_a = np.argmax(q_value[curr_s, :])
    explore = np.random.rand() < epsilon
    if not explore:
        return greedy_a
    n_actions = q_value.shape[1]
    return np.random.randint(n_actions)

In [3]:
def _running_average(x, window_size, mode='valid'):
    """Return the moving average of ``x`` over ``window_size`` samples."""
    return np.convolve(x, np.ones(window_size) / window_size, mode=mode)


def _select_action(policy_type, state, q_table, beta, epsilon):
    """Choose an action for ``state`` with the configured policy.

    Raises:
        ValueError: If ``policy_type`` is neither 'softmax' nor
            'epsilon_greedy'.
    """
    if policy_type == 'softmax':
        return select_a_with_softmax(state, q_table, beta=beta)
    if policy_type == 'epsilon_greedy':
        return select_a_with_epsilon_greedy(state, q_table, epsilon=epsilon)
    raise ValueError("Invalid policy_type: {}".format(policy_type))


def _td_error(algorithm_type, q_table, state, action, reward,
              next_state, next_action, gamma):
    """Return the temporal-difference error for one transition.

    'sarsa' bootstraps from the action actually selected next, while
    'q_learning' bootstraps from the greedy value of the next state.

    Raises:
        ValueError: If ``algorithm_type`` is neither 'sarsa' nor
            'q_learning'.
    """
    if algorithm_type == 'sarsa':
        bootstrap = q_table[next_state, next_action]
    elif algorithm_type == 'q_learning':
        bootstrap = np.max(q_table[next_state, :])
    else:
        raise ValueError("Invalid algorithm_type: {}".format(algorithm_type))
    return reward + gamma * bootstrap - q_table[state, action]


def _plot_history(history, policy_type, out_path, window_size=100):
    """Plot per-episode statistics on a 2x2 grid and save the figure.

    Args:
        history: Array of shape (n_recorded_episodes, 5) with columns
            episode index, step index, cumulative reward, terminal reward
            and the exploration parameter (beta or epsilon).
        policy_type: Selects the y-label of the exploration-parameter panel.
        out_path: File path handed to ``fig.savefig``.
        window_size: Width of the running-average window.
    """
    fig, ax = plt.subplots(2, 2, figsize=[12, 8])
    episodes = history[:, 0]

    # In 'valid' mode the running average has len(history) - window_size + 1
    # points and therefore only lines up with episodes[window_size - 1:] when
    # at least window_size episodes were recorded. With fewer episodes the
    # lengths disagree and plotting would raise, so the curve is skipped.
    show_average = len(history) >= window_size

    panels = [
        (ax[0, 0], 1, 'Number of steps'),
        (ax[0, 1], 2, 'Cumulative rewards'),
        (ax[1, 0], 3, 'Terminal rewards'),
    ]
    for axis, column, label in panels:
        axis.plot(episodes, history[:, column], '.')
        axis.set_xlabel('Episode')
        axis.set_ylabel(label)
        if show_average:
            axis.plot(episodes[window_size - 1:],
                      _running_average(history[:, column], window_size))

    # Epsilon/Beta
    ax[1, 1].plot(episodes, history[:, 4], '.')
    ax[1, 1].set_xlabel('Episode')
    if policy_type == 'softmax':
        ax[1, 1].set_ylabel('Beta')
    elif policy_type == 'epsilon_greedy':
        ax[1, 1].set_ylabel('Epsilon')
    fig.savefig(out_path)


def main(arguments):
    """Train a tabular agent on a discrete gym environment and report results.

    Parses ``arguments`` with ``parse_arguements``, runs ``nepisode`` episodes
    of SARSA or Q-learning with a softmax or epsilon-greedy policy, saves a
    2x2 summary figure to ``./results-<env>-<algorithm>-<policy>.png`` and
    prints the learned Q-value table together with the resulting policy.

    Args:
        arguments: Passed unchanged to ``parse_arguements``.

    Raises:
        ValueError: If the policy or algorithm name is not recognised. The
            check happens when the first action is selected / the first
            update is computed, so it is not triggered when no episode runs.
    """
    args = parse_arguements(arguments)

    # General parameters
    env_type         = args.environment
    algorithm_type   = args.algorithm
    policy_type      = args.policy

    # Meta parameters for the reinforcement learning agent
    alpha            = args.alpha
    beta = init_beta = args.beta
    beta_inc         = args.betainc
    gamma            = args.gamma
    epsilon          = args.epsilon
    epsilon_decay    = args.epsilondecay
    n_episode        = args.nepisode
    max_step         = args.maxstep

    # Selection of the problem & constraints imposed by the environment
    env = gym.envs.make(env_type)
    n_a = env.action_space.n
    n_s = env.observation_space.n

    # Initialization and configuration
    q_table = np.zeros([n_s, n_a])  # Q-value table indexed [state, action]
    history = []  # One row per episode that terminated within max_step
    env.reset()
    np.set_printoptions(precision=3, suppress=True)
    result_dir = 'results-{0}-{1}-{2}'.format(env_type, algorithm_type, policy_type)

    # Print main params
    print("n_episode      : {}".format(n_episode))
    print("algorithm_type : {}".format(algorithm_type))
    print("policy_type    : {}".format(policy_type))

    for i_episode in range(n_episode):
        score = 0  # Reset the cumulative reward for this episode
        # NOTE(review): relies on reset() returning only the observation and
        # step() returning a 4-tuple; confirm against the installed gym.
        observation = env.reset()

        # Select the first action in this episode
        action = _select_action(policy_type, observation, q_table, beta, epsilon)

        for i_step in range(max_step):
            next_observation, reward, done, _ = env.step(action)

            # Reward shaping applied on top of the environment's reward.
            if done and reward == 0:
                # Episode ended with zero reward (falling into a hole, per
                # the original author); currently no extra penalty.
                reward = 0.0
            elif not done:
                # Small cost per non-terminal step
                reward = -0.001

            # Update the cumulative reward.
            # NOTE(review): this recurrence weights the latest reward by 1
            # and earlier rewards by growing powers of gamma; confirm that
            # this is the intended notion of "cumulative reward".
            score = reward + gamma * score

            # The next action is chosen before the update so that SARSA can
            # bootstrap from it; it is also the action executed next.
            next_a = _select_action(policy_type, next_observation, q_table,
                                    beta, epsilon)

            # Calculation of TD error and update of the Q-value table
            delta = _td_error(algorithm_type, q_table, observation, action,
                              reward, next_observation, next_a, gamma)
            q_table[observation, action] += alpha * delta

            observation = next_observation
            action = next_a

            if done:
                # policy_type is known to be valid here: _select_action
                # would already have raised otherwise.
                if policy_type == 'softmax':
                    print("Episode: {0}\t Steps: {1:>4}\tCumuR: {2:>5.2f}\tTermR: {3}\tBeta: {4:.3f}".format(i_episode, i_step, score, reward, beta))
                    history.append([i_episode, i_step, score, reward, beta])
                else:
                    print("Episode: {0}\t Steps: {1:>4}\tCumuR: {2:>5.2f}\tTermR: {3}\tEpsilon: {4:.3f}".format(i_episode, i_step, score, reward, epsilon))
                    history.append([i_episode, i_step, score, reward, epsilon])
                break

        if policy_type == 'epsilon_greedy':
            # epsilon is decayed exponentially
            epsilon = epsilon * epsilon_decay
        elif policy_type == 'softmax':
            # beta is increased linearly
            # NOTE(review): uses i_episode, so the first increment takes
            # effect only after the second episode; confirm this is intended.
            beta = init_beta + i_episode * beta_inc

    # Force a (n, 5) float array so column indexing also works when no
    # episode was recorded (np.array([]) would be 1-D).
    history = np.array(history, dtype=float).reshape(-1, 5)

    _plot_history(history, policy_type, './'+result_dir+'.png', window_size=100)

    print("Q value table:")
    print(q_table)

    if policy_type == 'softmax':
        print("Action selection probability:")
        print(np.array([softmax(q, beta=beta) for q in q_table]))
    elif policy_type == 'epsilon_greedy':
        print("Greedy action")
        # One-hot encoding of the greedy action for every state
        greedy_action = np.zeros([n_s, n_a])
        greedy_action[np.arange(n_s), np.argmax(q_table, axis=1)] = 1
        print(greedy_action)