# 論文

* [https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf)

# AIM

簡単な問題でテスト

In [1]:
import copy
import time
import gym
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
%matplotlib nbagg
import chainer
import chainer.links as L
import chainer.functions as F
from chainer import Chain, optimizers, Variable, serializers

In [2]:
from pkg_resources import get_distribution
import platform
print("python", platform.python_version())
print("")
libs = ["numpy", "matplotlib", "gym"]
for lib in libs:
    version = get_distribution(lib).version
    print(lib, version)

python 3.5.2

numpy 1.13.1
matplotlib 2.0.2
gym 0.9.2


In [9]:
env = gym.make("FrozenLake-v0")
print("observation_space.n:", env.observation_space.n)
print("action_space.n: ", env.action_space.n)
print("")

obs = env.reset()
print("obs: ", obs)
env.render()
print("")

act = env.action_space.sample()
obs = env.step(act)
print("act: ", act)
print("obs: ", obs)
env.render()
print("")

[2017-09-06 10:04:20,912] Making new env: FrozenLake-v0


observation_space.n: 16
action_space.n:  4

obs:  0

[41mS[0mFFF
FHFH
FFFH
HFFG

act:  0
obs:  (4, 0.0, False, {'prob': 0.3333333333333333})
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG



In [29]:
# 環境
env = gym.make("FrozenLake-v0")

# CNNクラス定義
class CNN(Chain):
    def __init__(self):
        super(CNN, self).__init__(
            xc = L.Convolution2D(None, 8, (1, 1)),
            ch = L.Linear(32, 10),
            hy = L.Linear(10, 4)
        )
        
    def __call__(self, x, t=None, train=False):
        x = Variable(x)
        if train:
            t = Variable(t)
        h = F.max_pooling_2d(F.relu(self.xc(x)), 2)
        h = F.relu(self.ch(h))
        y = F.softmax(self.hy(h))
        if train:
            return F.softmax_cross_entropy(y, t)
        else:
            return y
        
    def reset(self):
        self.zerograds()
        
# 画像変換関数定義
def convert(obs):
    tmp = np.zeros(16)
    tmp[obs] = 1
    obs = np.array(tmp, dtype="float32")
    obs = obs.reshape(4,4)
    return obs

# モデル
Q = CNN() # 近似Q関数
#serializers.load_npz("./******.npz", q) # 重みファイル読み込み
Q_ast = copy.deepcopy(Q)
optimizer = optimizers.Adam()
optimizer.setup(Q)

# 定数
EPOCH_NUM = 3000 # エポック数
MEMORY_SIZE = 1000 # メモリサイズいくつで学習を開始するか
BATCH_SIZE = 200 # バッチサイズ
EPSILON = 1 # ε-greedy法
EPSILON_DECREASE = 0.0001 # εの減少値
EPSILON_MIN = 0.05 # εの下限
START_REDUCE_EPSILON = 1000 # εを減少させるステップ数
TRAIN_FREQ = 10 # Q関数の学習間隔
UPDATE_TARGET_Q_FREQ = 20 # Q関数の更新間隔
GAMMA = 0.99

total_step = 0 # 総ステップ（行動）数
memory = [] # メモリ
total_rewards = np.zeros(EPOCH_NUM) # 累積報酬記録用リスト

# 学習開始
print("Train")
print("\t".join(["epoch", "EPSILON", "total_reward", "total_step", "elapsed_time"]))
start = time.time()
for epoch in range(EPOCH_NUM):
    pobs = env.reset() # 環境初期化
    pobs = convert(pobs)
    converted_height, converted_width = pobs.shape[0], pobs.shape[1] # 後で利用するので、変換後のサイズを取得
    done = False # ゲーム終了フラグ
    total_reward = 0 # 累積報酬
    while not done:
        # 行動選択
        pact = env.action_space.sample()
        if np.random.rand() > EPSILON: # ε-greedy法
            y = Q(pobs.reshape((1, 1, converted_height, converted_width))) # 最適な行動を予測 # batchsize, channel, height, width
        # 行動
        obs, reward, done, _ = env.step(pact)
        obs = convert(obs)
        # メモリに蓄積
        memory.append((pobs, pact, reward, obs, done)) # 変換済みの行動前状態ベクトル、未変換の行動ラベル、報酬、変換済みの行動後状態ベクトル、ゲーム終了フラグ
        if len(memory) > MEMORY_SIZE: # メモリサイズを超えていれば消していく
            memory.pop(0)
        # 学習
        if len(memory) == MEMORY_SIZE: # メモリサイズ分溜まっていれば学習
            # 経験リプレイ
            if total_step % TRAIN_FREQ == 0:
                np.random.shuffle(memory)
                memory_idx = range(len(memory))
                for i in memory_idx[::BATCH_SIZE]:
                    batch = np.array(memory[i:i+BATCH_SIZE]) # 経験ミニバッチ
                    pobss = np.array(batch[:,0].tolist(), dtype="float32").reshape((BATCH_SIZE, 1, converted_height, converted_width))
                    pacts = np.array(batch[:,1].tolist(), dtype="int32")
                    rewards = np.array(batch[:,2].tolist(), dtype="int32")
                    obss = np.array(batch[:,3].tolist(), dtype="float32").reshape((BATCH_SIZE, 1, converted_height, converted_width))
                    dones = np.array(batch[:,4].tolist(), dtype="bool")
                    # set y
                    q = Q(pobss)
                    maxq = list(map(np.max, Q_ast(obss).data)) # maxQ
                    target = copy.deepcopy(q.data)
                    for j in range(BATCH_SIZE):
                        target[j, pacts[j]] = rewards[j]+GAMMA*maxq[j]*(not dones[j])
                    # Perform a gradient descent step
                    Q.reset()
                    loss = F.mean_squared_error(q, Variable(target))
                    loss.backward()
                    optimizer.update()
            # Q関数の更新
            if total_step % UPDATE_TARGET_Q_FREQ == 0:
                Q_ast = copy.deepcopy(Q)
        # εの減少
        if EPSILON > EPSILON_MIN:
            if total_step > START_REDUCE_EPSILON:
                EPSILON -= EPSILON_DECREASE
        # 次の行動へ
        total_reward += reward
        total_step += 1
        pobs = obs
    total_rewards[epoch] = total_reward # 累積報酬を記録
    #serializers.save_npz("./******.npz", q) # 重みファイル出力
    if (epoch+1) % 100 == 0:
        elapsed_time = time.time()-start
        print("\t".join(map(str,[epoch+1, EPSILON, total_reward, total_step, str(elapsed_time)+"[sec]"]))) # ログ出力
        start = time.time()

[2017-09-06 11:05:07,613] Making new env: FrozenLake-v0


Train
epoch	EPSILON	total_reward	total_step	elapsed_time
100	1	0.0	737	0.03086709976196289[sec]
200	0.9520000000000053	0.0	1481	1.8411839008331299[sec]
300	0.8797000000000132	0.0	2204	2.562824010848999[sec]
400	0.8041000000000216	0.0	2960	2.7136969566345215[sec]
500	0.7233000000000305	0.0	3768	2.9716551303863525[sec]
600	0.6428000000000393	0.0	4573	3.0141799449920654[sec]
700	0.5685000000000475	0.0	5316	2.8471131324768066[sec]
800	0.49390000000005574	0.0	6062	2.8832509517669678[sec]
900	0.4171000000000642	0.0	6830	2.999462127685547[sec]
1000	0.33590000000007314	0.0	7642	3.675112009048462[sec]
1100	0.26320000000008115	0.0	8369	2.8910319805145264[sec]
1200	0.18940000000008927	0.0	9107	2.999579906463623[sec]
1300	0.11310000000009603	0.0	9870	3.245645046234131[sec]
1400	0.049900000000094216	0.0	10627	3.165088176727295[sec]
1500	0.049900000000094216	0.0	11370	3.1297669410705566[sec]
1600	0.049900000000094216	0.0	12084	3.0102670192718506[sec]
1700	0.049900000000094216	0.0	12871	3.31008911132