# 三目並べ：協調動作（引き分けを目指す）

## ライブラリのインストール

In [8]:
import numpy as np

## 環境クラス
行動を受け取り状態を遷移させ、報酬を与える

In [9]:
class Environment():
    def __init__(self):
        self.Reset()
#初期化
    def Reset(self):
        self.state = np.zeros(9, dtype=np.int32)#マスの設定
        return self.state
#行動による状態変化
    def Step(self, action, turn):
        rewards = [0,0]
        done = False
        if self.state[action] != 0:#すでに〇か×が書かれているところに打った場合
            done = True
            rewards[turn] = -2#打ったエージェントだけマイナスの報酬
            return self.state, rewards, done
        self.state[action] = turn+1
        #3つ並んだかを判定
        ptn = [[0,1,2],[3,4,5],[6,7,8],[0,3,6],[1,4,7],[2,5,8],[0,4,8],[2,4,6]]
        for i in range(8):
            if self.state[ptn[i][0]] == turn+1 and \
               self.state[ptn[i][1]] == turn+1 and \
               self.state[ptn[i][2]] == turn+1:
                   rewards[turn] = -1#勝ったエージェントにもマイナスの報酬
                   rewards[(turn+1)%2] = -1#負けたエージェントにマイナスの報酬
                   done = True
                   return self.state, rewards, done            
        return self.state, rewards, done
    def ShowBoard(self):
        mb = {0:' ', 1:'O', 2:'X'}
        i=0
        print(mb[self.state[i*3]], "|", mb[self.state[i*3+1]], "|", mb[self.state[i*3+2]] )
        print("----------" )
        i=1
        print(mb[self.state[i*3]], "|", mb[self.state[i*3+1]], "|", mb[self.state[i*3+2]] )
        print("----------" )
        i=2
        print(mb[self.state[i*3]], "|", mb[self.state[i*3+1]], "|", mb[self.state[i*3+2]] )
    

## エージェントクラス
状態を観測し、行動を決定し、状態・行動・報酬からQ値を更新する

In [10]:
class Agent():
    def __init__(self, ID, train=True):
        self.ID = ID
        if train:#学習時のQ値
            print("Training")
            self.QV=np.zeros((3**9,9), dtype=np.float32)
        else:#対戦時のQ値の読み込み
            print("Game Start")
            fn = 'Q'+str(ID)+'value.txt'
            self.QV = np.loadtxt(fn)
#行動の選択
    def GetAction(self, state, epsilon):
        s = 0
        for i in range(9):
            s = s + state[i]*(3**i)
        if epsilon > np.random.uniform(0, 1):#徐々に最適行動のみをとる、ε-greedy法
            next_action = np.random.choice(range(9))
        else:
            a = np.where(self.QV[s]==self.QV[s].max())[0]
            next_action = np.random.choice(a)
        return next_action
    def UpdateQValue(self, action, reward, state, state_old):
        s = 0
        so = 0
        for i in range(9):
            s = s + state[i]*(3**i)
            so = so + state_old[i]*(3**i)
        alpha, gamma = 0.5, 0.9
        maxQ = np.max(self.QV[s])
        self.QV[so,action] = (1-alpha)*self.QV[so,action]+alpha*(reward + gamma*maxQ);
    def SaveQValue(self):
        fn = 'Q'+str(self.ID)+'value.txt'
        np.savetxt(fn, self.QV)


## 学習のための試行の繰り返し

In [4]:
np.random.seed(1)
num_episodes = 200000#100000  #総試行回数
env = Environment()
agent = [Agent(0), Agent(1)]
for episode in range(num_episodes):  #試行数分繰り返す
    if episode%10000==0:
        wins = [0,0,0]
    state = env.Reset()
    state_old = [state,state]
    actions = [0,0]
    epsilon = (1 / (episode + 1))+0.1
    done = False
    for step in range(9):
        s0 = step%2
        s1 = (step+1)%2
        actions[s0] = agent[s0].GetAction(state, epsilon)
        state_old[s0] = np.copy(state)
        state, rewards, done = env.Step(actions[s0], s0)
        agent[s1].UpdateQValue(actions[s1], rewards[s1], state, state_old[s1])
        if done==True:
            agent[s0].UpdateQValue(actions[s0], rewards[s0], state, state_old[s0])
            if rewards[s0]==-1:#1列できた場合
                wins[0]+=1
            if rewards[s0]==-2:#反則での勝敗数
                wins[1]+=1
            break
    if done==False:
        wins[2]+=1
        rewards=[1,1]
        agent[s1].UpdateQValue(actions[s1], rewards[s1], state, state_old[s1])
        agent[s0].UpdateQValue(actions[s0], rewards[s0], state, state_old[s0])
    if (episode+1)%10000==0:
        print(wins,sum(wins))

agent[0].SaveQValue()
agent[1].SaveQValue()


Training
Training
[847, 8811, 342] 10000
[376, 5166, 4458] 10000
[355, 4762, 4883] 10000
[322, 4474, 5204] 10000
[288, 4180, 5532] 10000
[255, 4044, 5701] 10000
[276, 3950, 5774] 10000
[299, 3961, 5740] 10000
[301, 3753, 5946] 10000
[303, 3773, 5924] 10000
[342, 3763, 5895] 10000
[313, 3659, 6028] 10000
[321, 3662, 6017] 10000
[304, 3629, 6067] 10000
[326, 3684, 5990] 10000
[342, 3658, 6000] 10000
[337, 3657, 6006] 10000
[355, 3636, 6009] 10000
[412, 3672, 5916] 10000
[349, 3507, 6144] 10000


## 対戦のための表示設定
最初の盤面の表示用

In [5]:
def show_InitBoard():
    i=0
    print(i*3, "|", i*3+1, "|", i*3+2 )
    print("----------" )
    i=1
    print(i*3, "|", i*3+1, "|", i*3+2 )
    print("----------" )
    i=2
    print(i*3, "|", i*3+1, "|", i*3+2 )


## 対戦（人間が先攻）

In [6]:
show_InitBoard()

env = Environment()
agent = Agent(1,False)
state = env.Reset()
state_old = [state,state]
actions = [0,0]
step = 0
while(1):
    actions[0] = int(input('[0-8]'))
    state, rewards, done = env.Step(actions[0], 0)
    env.ShowBoard()
    if done==True:
        if rewards[0]==-2:
            print('Penalty. You lose.')
        else:
            print('You win!!!')
        break
    step +=1
    if step==9:
        print('Draw!')
        break
    actions[1] = agent.GetAction(state, 0)
    print("Agent action:", actions[1])
    state, rewards, done = env.Step(actions[1], 1)
    env.ShowBoard()
    if done==True:
        if rewards[1]==-2:
            print('Penalty. You win.')
        else:
            print('You loose.')
        break
    step +=1

0 | 1 | 2
----------
3 | 4 | 5
----------
6 | 7 | 8
Game Start
[0-8]0
O |   |  
----------
  |   |  
----------
  |   |  
Agent action: 6
O |   |  
----------
  |   |  
----------
X |   |  
[0-8]4
O |   |  
----------
  | O |  
----------
X |   |  
Agent action: 1
O | X |  
----------
  | O |  
----------
X |   |  
[0-8]5
O | X |  
----------
  | O | O
----------
X |   |  
Agent action: 3
O | X |  
----------
X | O | O
----------
X |   |  
[0-8]7
O | X |  
----------
X | O | O
----------
X | O |  
Agent action: 8
O | X |  
----------
X | O | O
----------
X | O | X
[0-8]2
O | X | O
----------
X | O | O
----------
X | O | X
Draw!


## 対戦（人間が後攻）

In [11]:
show_InitBoard()

env = Environment()
agent = Agent(0,False)
state = env.Reset()
state_old = [state,state]
actions = [0,0]
step = 0
while(1):
    actions[0] = agent.GetAction(state, 0)
    print("Agent action:", actions[0])
    state, rewards, done = env.Step(actions[0], 0)
    env.ShowBoard()
    if done==True:
        if rewards[0]==-1:
            print('Penalty. You win.')
        else:
            print('You loose.')
        break
    step +=1
    if step==9:
        print('Draw!')
        break
    actions[1] = int(input('[0-8]'))
    state, rewards, done = env.Step(actions[1], 1)
    env.ShowBoard()
    if done==True:
        if rewards[1]==-2:
            print('Penalty. You lose.')
        else:
            print('You win!!!')
        break
    step +=1

0 | 1 | 2
----------
3 | 4 | 5
----------
6 | 7 | 8
Game Start
Agent action: 1
  | O |  
----------
  |   |  
----------
  |   |  
[0-8]0
X | O |  
----------
  |   |  
----------
  |   |  
Agent action: 8
X | O |  
----------
  |   |  
----------
  |   | O
[0-8]6
X | O |  
----------
  |   |  
----------
X |   | O
Agent action: 3
X | O |  
----------
O |   |  
----------
X |   | O
[0-8]2
X | O | X
----------
O |   |  
----------
X |   | O
Agent action: 4
X | O | X
----------
O | O |  
----------
X |   | O
[0-8]7
X | O | X
----------
O | O |  
----------
X | X | O
Agent action: 7
X | O | X
----------
O | O |  
----------
X | X | O
You loose.
