# ペッツを対象とした強化学習

## ライブラリのインポート

In [1]:
import numpy as np

## 環境クラス
行動を受け取り状態を遷移させ、報酬を与える

In [2]:
class Environment():
    """Three-state dispenser environment.

    States: 0 = closed, 1 = open with candy, 2 = open without candy.
    Actions: 0 = open, 1 = close, 2 = tilt.
    The only rewarded transition is tilting while the candy is present.
    """

    def __init__(self):
        self.Reset()

    def Reset(self):
        """Put the environment back into the closed state and return it."""
        self.state = 0
        return self.state

    def Step(self, action):
        """Apply ``action`` and return ``(new_state, reward)``.

        Actions that have no effect in the current state leave the state
        unchanged and yield a reward of 0.
        """
        current = self.state

        # Closed: only "open" does anything.
        if current == 0:
            if action == 0:
                self.state = 1
            return self.state, 0

        # Open with candy: "close" goes back, "tilt" pays out.
        if current == 1:
            if action == 1:
                self.state = 0
            elif action == 2:
                self.state = 2
                return self.state, 1
            return self.state, 0

        # Open without candy: only "close" does anything.
        if action == 1:
            self.state = 0
        return self.state, 0

## エージェントクラス
状態を観測し、行動を決定し、状態・行動・報酬からQ値を更新する

In [3]:
class Agent():
    """Tabular Q-learning agent with epsilon-greedy action selection.

    The Q-table ``QV`` has one row per state and one column per action.
    The agent remembers the last observed state and chosen action so that
    ``UpdateQValue`` can be called after the environment has responded.
    """

    def __init__(self, num_states=3, num_actions=3, alpha=0.5, gamma=0.9):
        """Create a zero-initialised Q-table.

        Args:
            num_states: Number of rows of the Q-table.
            num_actions: Number of columns of the Q-table.
            alpha: Learning rate used by ``UpdateQValue``.
            gamma: Discount factor used by ``UpdateQValue``.
        """
        self.alpha = alpha
        self.gamma = gamma
        self.QV = np.zeros((num_states, num_actions))

    def GetAction(self, state, epsilon):
        """Choose an action for ``state`` and remember both.

        With probability ``epsilon`` a uniformly random action is taken;
        otherwise one of the actions with the highest Q-value is taken,
        with ties broken at random.

        Returns:
            The index of the chosen action.
        """
        self.state = state
        if epsilon > np.random.uniform(0, 1):
            # Explore over *every* action column. Sampling from a fixed
            # [0, 1] list would make the third action unreachable here.
            self.action = np.random.choice(self.QV.shape[1])
        else:
            # Exploit: pick among all maximisers so ties are not biased
            # toward the lowest index.
            a = np.where(self.QV[state] == self.QV[state].max())[0]
            self.action = np.random.choice(a)
        return self.action

    def UpdateQValue(self, next_state, reward):
        """Apply one Q-learning update for the last (state, action) pair.

        Args:
            next_state: State reached after the remembered action.
            reward: Reward received for that transition.
        """
        next_maxQ = self.QV[next_state].max()
        target = reward + self.gamma * next_maxQ
        self.QV[self.state, self.action] = (
            (1 - self.alpha) * self.QV[self.state, self.action]
            + self.alpha * target
        )


## 定数の設定

In [4]:
# num_episodes: total number of episodes (trials) to run.
# num_steps: number of actions taken within a single episode.
num_episodes, num_steps = 5, 10

## 学習のための試行の繰り返し

In [5]:
# One environment and one agent are shared by all episodes, so the
# Q-table keeps accumulating what was learned earlier.
env, agent = Environment(), Agent()

for episode in range(num_episodes):
    # Exploration rate shrinks with the episode number: 0.5, 0.25, ...
    epsilon = 0.5 * (1 / (episode + 1))
    sum_reward = 0  # reward accumulated during this episode
    state = env.Reset()  # every episode starts from the initial state

    for t in range(num_steps):
        old_state = state  # kept only so the transition can be printed
        action = agent.GetAction(old_state, epsilon)
        state, reward = env.Step(action)
        print(old_state, action, reward)
        sum_reward += reward
        # Learn from the transition that was just observed.
        agent.UpdateQValue(state, reward)

    print('episode : %d total reward %d' % (episode + 1, sum_reward))
    print(agent.QV)

0 0 0
1 1 0
0 0 0
1 0 0
1 1 0
0 0 0
1 1 0
0 0 0
1 1 0
0 0 0
episode : 1 total reward 0
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
0 2 0
0 1 0
0 1 0
0 2 0
0 0 0
1 0 0
1 0 0
1 2 1
2 0 0
2 0 0
episode : 2 total reward 1
[[0.  0.  0. ]
 [0.  0.  0.5]
 [0.  0.  0. ]]
0 2 0
0 0 0
1 2 1
2 2 0
2 1 0
0 0 0
1 2 1
2 1 0
0 0 0
1 1 0
episode : 3 total reward 2
[[0.63925312 0.         0.        ]
 [0.         0.28766391 0.9205625 ]
 [0.         0.253125   0.        ]]
0 1 0
0 0 0
1 1 0
0 0 0
1 2 1
2 1 0
0 0 0
1 2 1
2 1 0
0 0 0
episode : 4 total reward 2
[[1.00049773 0.28766391 0.        ]
 [0.         0.47407781 1.25223845]
 [0.         0.63234105 0.        ]]
0 0 0
1 2 1
2 1 0
0 0 0
1 2 1
2 1 0
0 0 0
1 2 1
2 1 0
0 0 0
episode : 5 total reward 3
[[1.40682431 0.28766391 0.        ]
 [0.         0.47407781 1.6966084 ]
 [0.         1.04023386 0.        ]]
