In [1]:
#!pip install gym
#!pip install pygame

In [2]:
import gym
import numpy as np

In [3]:
env = gym.make('CartPole-v1', render_mode='human', new_step_api=True)

In [4]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = env.action_space.sample()
        print('explore')
    else:
        action = np.argmax(Q[state])
        print('exploit')
    return action

In [5]:
def q_learning(state, Q, epsilon=0.1):
    '''
    Inicializar Q(s, a) arbitrariamente ∀s ∈ S, a ∈ A(s)
    Repetir:
        Inicializar s
        done ← False
        Repetir hasta done:
            Con probabilidad ε hacer: (* estrategia ε-greedy *)
                explore: a ← sample(A(s))
                exploit: a ← arg m ́ax Q(s, ·)
            s′, r , done ← step(a)
            Q(s, a) ← Q(s, a) + α(r + γ max Q(s′, ·) − Q(s, a))
            s ← s′
    '''
    
    done = False
    while not done:
        action = epsilon_greedy_policy(state, Q, epsilon)
        obs, reward, done, info, _ = env.step(action)
        Q

In [6]:
def optimal_policy(state, Q):
    action = np.argmax(Q[state])
    return action

In [7]:
bins = np.linspace(-0.000001, 100., 2)
bins

array([-1.e-06,  1.e+02])

In [8]:
def get_state(obs):
    d = np.digitize(obs, bins)
    state = tuple(d)
    return state

In [9]:
state = get_state(np.array([-1.4, -2., 0.23, 1.2]))
state

(0, 0, 1, 1)

In [10]:
Q = np.random.random((2,2,2,2,2))
Q

array([[[[[0.0609531 , 0.12940894],
          [0.97848808, 0.47370207]],

         [[0.99031082, 0.13319581],
          [0.38238955, 0.48738757]]],


        [[[0.29236919, 0.28220496],
          [0.44537984, 0.97694098]],

         [[0.66597042, 0.10600376],
          [0.31903103, 0.21593529]]]],



       [[[[0.19302581, 0.24601327],
          [0.24023452, 0.47183272]],

         [[0.49591997, 0.42115207],
          [0.23287445, 0.13396688]]],


        [[[0.1665551 , 0.21453539],
          [0.26411405, 0.12136745]],

         [[0.68144823, 0.16184743],
          [0.51716612, 0.95799866]]]]])

In [11]:
obs = env.reset()
print(obs)
done = False
while not done:
    state = get_state(obs)
    action = epsilon_greedy_policy(state, Q, 0.5)
    obs, reward, done, info, _ = env.step(action)
    print('->', state, action, reward, obs, done, info)
env.close()

[ 0.03211415  0.00114949 -0.02842947  0.01297775]
exploit
-> (1, 1, 0, 1) 0 1.0 [ 0.03213714 -0.19355345 -0.02816992  0.296557  ] False False
explore
-> (1, 0, 0, 1) 1 1.0 [ 0.02826607  0.00195852 -0.02223878 -0.00487545] False False
exploit
-> (1, 1, 0, 0) 1 1.0 [ 0.02830524  0.19739223 -0.02233629 -0.3044912 ] False False
exploit
-> (1, 1, 0, 0) 1 1.0 [ 0.03225309  0.39282525 -0.02842611 -0.6041339 ] False False
exploit
-> (1, 1, 0, 0) 1 1.0 [ 0.04010959  0.588333   -0.04050879 -0.9056332 ] False False
explore
-> (1, 1, 0, 0) 0 1.0 [ 0.05187625  0.3937823  -0.05862145 -0.6259529 ] False False
exploit
-> (1, 1, 0, 0) 1 1.0 [ 0.0597519   0.58967143 -0.07114051 -0.93650675] False False
explore
-> (1, 1, 0, 0) 1 1.0 [ 0.07154533  0.78567696 -0.08987065 -1.250669  ] False False
explore
-> (1, 1, 0, 0) 1 1.0 [ 0.08725887  0.9818283  -0.11488403 -1.570095  ] False False
exploit
-> (1, 1, 0, 0) 1 1.0 [ 0.10689543  1.178119   -0.14628592 -1.8962917 ] False False
exploit
-> (1, 1, 0, 0) 1 1.0 