In [1]:
import gym
import numpy as np

In [24]:
def eps_greedy_q_learning_with_table(env, num_episodes=500):
    q_table = np.zeros([env.observation_space.n,env.action_space.n])
    y = 0.95
    eps = 0.5
    lr = 0.8
    decay_factor = 0.999
    for i in range(num_episodes):
        s = env.reset()
        eps *= decay_factor
        done = False
        while not done:
            # select the action with highest cummulative reward
            if np.random.random() < eps or np.sum(q_table[s, :]) == 0:
                a = np.random.randint(0, env.action_space.n)
            else:
                a = np.argmax(q_table[s, :])
            # pdb.set_trace()
            new_s, r, done, _ = env.step(a)
            q_table[s, a] += r + lr * (y * np.max(q_table[new_s, :]) - q_table[s, a])
            s = new_s
    return q_table

In [25]:
env = gym.make('FrozenLake-v0')
eps_greedy_q_learning_with_table(env, 5000)

array([[0.00000000e+00, 3.60966371e-03, 1.92203940e-02, 2.21783612e-01],
       [0.00000000e+00, 1.96538216e-02, 1.47451826e-03, 1.51265079e-01],
       [0.00000000e+00, 5.65779180e-02, 6.51212261e-04, 2.27704080e-04],
       [0.00000000e+00, 3.44466770e-03, 1.22517089e-05, 1.32564044e-01],
       [0.00000000e+00, 4.02948567e-06, 1.13121729e-01, 1.21313510e-05],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.13694466e-05, 7.47672577e-03, 7.73663052e-08],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 6.34084707e-03, 1.70114161e-03, 2.98685203e-02],
       [0.00000000e+00, 6.08906096e-01, 3.62270373e-02, 4.01685386e-03],
       [0.00000000e+00, 1.20912449e-01, 1.04136284e-03, 3.73992576e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 5.24889325e-02, 2.99467007e

In [9]:
def eps_greedy_q_learning_with_neural_network(env, num_episodes=500):
    model = Sequential()
    model.add(InputLayer(batch_input_shape=(1, env.observation_space.n)))
    model.add(Dense(10, activation='sigmoid'))
    model.add(Dense(env.action_space.n, activation='linear'))
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    y = 0.95
    eps = 0.5
    decay_factor = 0.999
    r_avg_list = []
    for i in range(num_episodes):
        s = env.reset()
        eps *= decay_factor
        if i % 100 == 0:
            print("Episode {} of {}".format(i + 1, num_episodes))
        done = False
        r_sum = 0
        while not done:
            if np.random.random() < eps:
                a = np.random.randint(0, env.action_space.n)
            else:
                a = np.argmax(model.predict(np.identity(env.observation_space.n)[s:s + 1]))
            new_s, r, done, _ = env.step(a)
            target = r + y * np.max(model.predict(np.identity(env.observation_space.n)[new_s:new_s + 1]))
            target_vec = model.predict(np.identity(env.observation_space.n)[s:s + 1])[0]
            target_vec[a] = target
            model.fit(np.identity(env.observation_space.n)[s:s + 1], target_vec.reshape(-1, env.action_space.n), epochs=1, verbose=0)
            s = new_s
            r_sum += r
        r_avg_list.append(r_sum / 1000)
    return model

In [10]:
from keras.models import Sequential
from keras.layers import InputLayer
from keras.layers import Dense
env = gym.make('FrozenLake-v0')
eps_greedy_q_learning_with_neural_network(env)

Episode 1 of 500
Episode 101 of 500
Episode 201 of 500
Episode 301 of 500
Episode 401 of 500


<keras.engine.sequential.Sequential at 0x29c66f1ae08>