# 論文

* [https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf)

# AIM

簡単な問題でテスト

In [1]:
import copy
import time
import gym
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
%matplotlib nbagg
import chainer
import chainer.links as L
import chainer.functions as F
from chainer import Chain, optimizers, Variable, serializers

In [2]:
from pkg_resources import get_distribution
import platform
print("python", platform.python_version())
print("")
libs = ["numpy", "matplotlib", "gym"]
for lib in libs:
    version = get_distribution(lib).version
    print(lib, version)

python 3.5.2

numpy 1.13.1
matplotlib 2.0.2
gym 0.9.2


In [3]:
env = gym.make("FrozenLake-v0")
print("observation_space.n:", env.observation_space.n)
print("action_space.n: ", env.action_space.n)
print("")

obs = env.reset()
print("obs: ", obs)
env.render()
print("")

act = env.action_space.sample()
obs = env.step(act)
print("act: ", act)
print("obs: ", obs)
env.render()
print("")

[2017-09-06 19:49:20,648] Making new env: FrozenLake-v0


observation_space.n: 16
action_space.n:  4

obs:  0

[41mS[0mFFF
FHFH
FFFH
HFFG

act:  0
obs:  (0, 0.0, False, {'prob': 0.3333333333333333})
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG



In [8]:
# 環境
env = gym.make("FrozenLake-v0")

# NNクラス定義
class NN(Chain):
    def __init__(self):
        super(NN, self).__init__(
            xc = L.Linear(16, 100),
            ch = L.Linear(100, 100),
            hy = L.Linear(100, 4)
        )
        
    def __call__(self, x, t=None, train=False):
        x = Variable(x)
        if train:
            t = Variable(t)
        h = F.relu(self.xc(x))
        h = F.relu(self.ch(h))
        y = F.softmax(self.hy(h))
        if train:
            return F.softmax_cross_entropy(y, t)
        else:
            return y
        
    def reset(self):
        self.zerograds()
        
# 状態変換関数定義
def convert(obs):
    tmp = np.zeros(16)
    tmp[obs] = 1
    obs = np.array(tmp, dtype="float32")
    return obs

# モデル
Q = NN() # 近似Q関数
#serializers.load_npz("./******.npz", q) # 重みファイル読み込み
Q_ast = copy.deepcopy(Q)
optimizer = optimizers.Adam()
optimizer.setup(Q)

# 定数
EPOCH_NUM = 3000 # エポック数
MEMORY_SIZE = 1000 # メモリサイズいくつで学習を開始するか
BATCH_SIZE = 200 # バッチサイズ
EPSILON = 1 # ε-greedy法
EPSILON_DECREASE = 0.0001 # εの減少値
EPSILON_MIN = 0.05 # εの下限
START_REDUCE_EPSILON = 1000 # εを減少させるステップ数
TRAIN_FREQ = 10 # Q関数の学習間隔
UPDATE_TARGET_Q_FREQ = 20 # Q関数の更新間隔
GAMMA = 0.99

total_step = 0 # 総ステップ（行動）数
memory = [] # メモリ
total_rewards = np.zeros(EPOCH_NUM) # 累積報酬記録用リスト

# 学習開始
print("Train")
print("\t".join(["epoch", "EPSILON", "total_reward", "total_step", "elapsed_time"]))
start = time.time()
for epoch in range(EPOCH_NUM):
    pobs = env.reset() # 環境初期化
    pobs = convert(pobs)
    done = False # ゲーム終了フラグ
    total_reward = 0 # 累積報酬
    while not done:
        # 行動選択
        pact = env.action_space.sample()
        if np.random.rand() > EPSILON: # ε-greedy法
            y = Q(pobs.reshape((1, 16))) # 最適な行動を予測 # batchsize, channel, height, width
        # 行動
        obs, reward, done, _ = env.step(pact)
        obs = convert(obs)
        # メモリに蓄積
        memory.append((pobs, pact, reward, obs, done)) # 変換済みの行動前状態ベクトル、未変換の行動ラベル、報酬、変換済みの行動後状態ベクトル、ゲーム終了フラグ
        if len(memory) > MEMORY_SIZE: # メモリサイズを超えていれば消していく
            memory.pop(0)
        # 学習
        if len(memory) == MEMORY_SIZE: # メモリサイズ分溜まっていれば学習
            # 経験リプレイ
            if total_step % TRAIN_FREQ == 0:
                np.random.shuffle(memory)
                memory_idx = range(len(memory))
                for i in memory_idx[::BATCH_SIZE]:
                    batch = np.array(memory[i:i+BATCH_SIZE]) # 経験ミニバッチ
                    pobss = np.array(batch[:,0].tolist(), dtype="float32").reshape((BATCH_SIZE, 16))
                    pacts = np.array(batch[:,1].tolist(), dtype="int32")
                    rewards = np.array(batch[:,2].tolist(), dtype="int32")
                    obss = np.array(batch[:,3].tolist(), dtype="float32").reshape((BATCH_SIZE, 16))
                    dones = np.array(batch[:,4].tolist(), dtype="bool")
                    # set y
                    q = Q(pobss)
                    maxq = list(map(np.max, Q_ast(obss).data)) # maxQ
                    maxq = np.asanyarray(maxq, dtype="float32")
                    target = copy.deepcopy(q.data)
                    target = np.asanyarray(target, dtype="float32")
                    for j in range(BATCH_SIZE):
                        target[j, pacts[j]] = rewards[j]+GAMMA*maxq[j]*(not dones[j])
                    # Perform a gradient descent step
                    Q.reset()
                    loss = F.mean_squared_error(q, Variable(target))
                    loss.backward()
                    optimizer.update()
            # Q関数の更新
            if total_step % UPDATE_TARGET_Q_FREQ == 0:
                Q_ast = copy.deepcopy(Q)
        # εの減少
        if EPSILON > EPSILON_MIN:
            if total_step > START_REDUCE_EPSILON:
                EPSILON -= EPSILON_DECREASE
        # 次の行動へ
        total_reward += reward
        total_step += 1
        pobs = obs
    total_rewards[epoch] = total_reward # 累積報酬を記録
    #serializers.save_npz("./******.npz", q) # 重みファイル出力
    if (epoch+1) % 100 == 0:
        elapsed_time = time.time()-start
        print("\t".join(map(str,[epoch+1, EPSILON, total_reward, total_step, str(elapsed_time)+"[sec]"]))) # ログ出力
        start = time.time()

[2017-09-06 19:55:40,997] Making new env: FrozenLake-v0


Train
epoch	EPSILON	total_reward	total_step	elapsed_time
100	1	0.0	780	0.030592918395996094[sec]
200	0.9492000000000056	0.0	1509	1.4598109722137451[sec]
300	0.8751000000000138	0.0	2250	2.0775279998779297[sec]
400	0.800600000000022	0.0	2995	2.1633191108703613[sec]
500	0.7240000000000304	0.0	3761	2.2294631004333496[sec]
600	0.6506000000000385	0.0	4495	2.1513779163360596[sec]
700	0.5790000000000464	0.0	5211	2.132861852645874[sec]
800	0.5074000000000543	0.0	5927	2.1199989318847656[sec]
900	0.4400000000000617	0.0	6601	2.046755790710449[sec]
1000	0.36660000000006976	0.0	7335	2.219085931777954[sec]
1100	0.2882000000000784	0.0	8119	2.419045925140381[sec]
1200	0.21480000000008648	0.0	8853	2.3922200202941895[sec]
1300	0.1446000000000942	0.0	9555	2.2295758724212646[sec]
1400	0.07180000000009484	0.0	10283	2.306668996810913[sec]
1500	0.049900000000094216	0.0	11009	2.3054778575897217[sec]
1600	0.049900000000094216	0.0	11953	3.032641887664795[sec]
1700	0.049900000000094216	0.0	12613	2.128807067871093

KeyboardInterrupt: 