In [188]:
#!pip install gym
#!pip install pygame

In [189]:
import gym
import numpy as np
import time

In [190]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = env.action_space.sample()
    else:
        action = np.argmax(Q[state])
    return action

In [191]:
def get_learning_rate(t, min_rate=0.1):
    learning_step = 25
    return max(min_rate, min(1, 1.0 - np.log10((t+1)/learning_step)))

In [192]:
def get_epsilon(t, min_epsilon=0.01):
    epsilon_step = 35
    return max(min_epsilon, min(1, 1.0 - np.log10((t+1)/epsilon_step)))

In [193]:
def q_learning(state, Q, episode):
    '''
    Inicializar Q(s, a) arbitrariamente ∀s ∈ S, a ∈ A(s)
    Repetir:
        Inicializar s
        done ← False
        Repetir hasta done:
            Con probabilidad ε hacer: (* estrategia ε-greedy *)
                explore: a ← sample(A(s))
                exploit: a ← arg m ́ax Q(s, ·)
            s′, r , done ← step(a)
            Q(s, a) ← Q(s, a) + α(r + γ max Q(s′, ·) − Q(s, a))
            s ← s′
    '''
   
    done = False
    gamma = 0.999
    while not done:
        alpha = get_learning_rate(episode)
        epsilon = get_epsilon(episode)
        action = epsilon_greedy_policy(state, Q, epsilon)
        obs, reward, done, info, _ = env.step(action)
        current_Q_value = Q[state][action]
        discrete_obs = get_state(obs)
        Q[state][action] = current_Q_value + alpha*(reward + gamma*np.max(Q[discrete_obs][:]) - current_Q_value)
        state = discrete_obs

In [194]:
def optimal_policy(state, Q):
    action = np.argmax(Q[state])
    return action

# Discretizacion de variables de la observación

Como la posicion del carrito y su aceleracion no son tan relevantes como la posicion de la barra y su velocidad, se discretizan utilizando una menor cantidad de contenedores de mayor tamaño Por otro lado, tanto la aceleración angular como el ángulo del poste son las variables de mayor importancia, son los que van a variar con una mayor velocidad. Por lo tanto, se discretizan con una mayor cantidad de contenedores de menor tamaño.

In [209]:
import sys
cart_position_bins, cart_pos_step = np.linspace(-2.4, 2.4, 6, retstep=True)
cart_acc_bins, cart_acc_step = np.linspace(-10, 10, 3, retstep=True)
pole_angle_bins, pole_angle_step = np.linspace(-.2095,.2095, 12, retstep=True)
angular_acc_bins, angular_acc_step = np.linspace(-1000, 1000, 6, retstep=True)
print("cart_pos_step: ", cart_pos_step)
print("cart_acc_step: ", cart_acc_step)
print("pole_angle_step: ", pole_angle_step)
print("angular_acc_step: ", angular_acc_step)

cart_pos_step:  0.96
cart_acc_step:  10.0
pole_angle_step:  0.03809090909090909
angular_acc_step:  400.0


In [196]:
def get_state(obs):
    cart_pos = np.digitize(obs[0], cart_position_bins)
    cart_acc = np.digitize(obs[1], cart_acc_bins)
    pole_ang = np.digitize(obs[2], pole_angle_bins)
    ang_acc = np.digitize(obs[3], angular_acc_bins)
    state = tuple([cart_pos, cart_acc, pole_ang, ang_acc])
    return state

In [197]:
state = get_state(np.array([-1.4, 0.11, -0.100, 200]))
state

(2, 3, 3, 4)

In [198]:
position_bins_count = cart_position_bins.size + 1
acc_bins_count = cart_acc_bins.size + 1
angle_bins_count = pole_angle_bins.size + 1
angular_acc_bins_count = angular_acc_bins.size + 1
print("Valid cart position bins: 0 - ", position_bins_count - 1)
print("Valid cart acceleration bins: 0 - ", acc_bins_count - 1)
print("Valid pole angle bins: 0 - ", angle_bins_count - 1)
print("Valid pole acceleration bins: 0 - ", angular_acc_bins_count - 1)
Q = np.random.random((position_bins_count,acc_bins_count,angle_bins_count,angular_acc_bins_count,2))
Q.shape

Valid cart position bins: 0 -  6
Valid cart acceleration bins: 0 -  6
Valid pole angle bins: 0 -  12
Valid pole acceleration bins: 0 -  8


(7, 7, 13, 9, 2)

In [210]:
env = gym.make('CartPole-v1', new_step_api=True)
done = False
max_episodes = 550
for i in range(max_episodes):
    if i % 100 == 0:
        print('episode: ', i)
    obs = env.reset()
    q_learning(get_state(obs), Q, i)
env.close()

episode:  0
episode:  100
episode:  200
episode:  300
episode:  400
episode:  500


# Ejecución con la policy óptima

In [211]:
time.sleep(1)
max_reward = 475
env = gym.make('CartPole-v1', new_step_api=True)
tries = 100
rewards = np.zeros(tries)
print("Playing optimal policy")
for i in range(tries):
    episode_reward= 0
    obs = env.reset()
    done = False
    while not done and episode_reward < max_reward:
        state = get_state(obs)
        action = optimal_policy(state, Q)
        obs, reward, done, info, _ = env.step(action)
        episode_reward += reward
        #print('->', reward, obs, done)
    rewards[i] = episode_reward
env.close()
print("Average reward: ", np.mean(rewards))

Playing optimal policy
Average reward:  198.36
