In [43]:
%matplotlib inline
import gym
import numpy as np
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import pandas_profiling as pdp
import matplotlib.pyplot as plt

In [44]:
# モデルのimport
env = gym.make("FrozenLake-v0")

In [45]:
# Qテーブル作成
Q = np.zeros([env.observation_space.n, env.action_space.n])

In [46]:
"""ハイパーパラメータ設定"""
# 学習率
lr = 0.8
# 割引率
gamma = 0.95
# 試行回数
num_episodes = 20000
# 報酬リスト
rList = []

In [47]:
for i in tqdm(range(num_episodes)):
    # stateの初期化
    state = env.reset()
    # 報酬の累積和の初期化
    rAll = 0
    # Holeに落ちたらTrue
    done = False
    # 1エピソードあたりのカウンター
    j = 0
    # １エピソードあたり 99回移動
    while j < 99:
        j += 1
        # グリーディー法, ノイズを加えることで学習しやすくする
        action = np.argmax(
            Q[state, :] + np.random.randn(1, env.action_space.n) * (1.0 / (i + 1))
        )
        # アクションの実行結果の取得
        next_state, reward, done, _ = env.step(action)

        # 更新, ベルマン方程式
        Q[state, action] = Q[state, action] + lr * (
            reward + gamma * np.max(Q[next_state, :] - Q[state, action])
        )
        rAll += reward
        state = next_state
        if done:
            break
    rList.append(rAll)




In [55]:
print("average_reward: {}".format(sum(rList)/num_episodes))
print("最終的なQテーブルの値: \n{}".format(Q))

average_reward: 0.6255
最終的なQテーブルの値: 
[[8.39683187e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.70117617e-04 0.00000000e+00 0.00000000e+00 7.83399796e-01]
 [1.06292106e-03 5.62918710e-01 0.00000000e+00 0.00000000e+00]
 [4.88970873e-04 0.00000000e+00 1.59054363e-03 6.06062083e-01]
 [7.77124132e-01 1.62587922e-02 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [7.59729029e-03 1.51318613e-05 5.83976432e-06 1.49929658e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 7.21417779e-01]
 [0.00000000e+00 7.22936569e-01 0.00000000e+00 0.00000000e+00]
 [2.03236661e-01 8.04525544e-04 5.35198164e-04 3.94408246e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 9.58096756e-01 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 1.04776446e+00 0.00000000e+00]
 [0.00000000e+00 0

In [54]:
print("\n Q_max=", [np.argmax(ii) for ii in Q])


 Q_max= [0, 3, 1, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 2, 0]


In [63]:
test_num_episodes = 1  # テストの総試行回数
max_number_steps = 100
# 学習したQ関数を使ってテスト
from IPython.display import clear_output
import time
for episode in range(test_num_episodes):
    state = env.reset()
    env.render()
    for t in range(max_number_steps):
        action = np.argmax(Q[state])
        state, reward, done, _ = env.step(action)
        rAll += reward
        env.render()
        time.sleep(0.5)
        clear_output()
        if done:
            print("{} time steps finished".format(t + 1))
            break

# テスト結果
print("\n q_table=\n", Q)
print("\n average_reward=", rAll / test_num_episodes)

28 time steps finished

 q_table=
 [[8.39683187e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.70117617e-04 0.00000000e+00 0.00000000e+00 7.83399796e-01]
 [1.06292106e-03 5.62918710e-01 0.00000000e+00 0.00000000e+00]
 [4.88970873e-04 0.00000000e+00 1.59054363e-03 6.06062083e-01]
 [7.77124132e-01 1.62587922e-02 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [7.59729029e-03 1.51318613e-05 5.83976432e-06 1.49929658e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 7.21417779e-01]
 [0.00000000e+00 7.22936569e-01 0.00000000e+00 0.00000000e+00]
 [2.03236661e-01 8.04525544e-04 5.35198164e-04 3.94408246e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 9.58096756e-01 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 1.04776446e+00 0.00000000e+00]
 [0.00000000e+00 0.0