In [31]:
import numpy as np
import random
import copy

In [48]:
class Environment:
    '''三目並べを実行する環境のクラス'''
    
    def __init__(self):
        num_states = 9
        num_actions = 9
        self.game_board = np.zeros(num_states)
        self.agent = Agent(num_states, num_actions)
        
    
    def run(self):
        '''三目並べの実行'''
        
        win = 0 # 勝ち数
        lose = 0 # 負け数
        
        for episode in range(NUM_EPISODES):
            
            self.game_board = np.zeros(9) # 盤面をリセット
            observation = copy.deepcopy(self.game_board) # 盤面のコピー
            reward = 0 # 報酬をリセット
            winner = 0 # 勝利者をリセット
            pre = random.randint(1, 2) # 1→Q学習が先攻、2→Q学習が後攻
            
            for step in range(5):
                
                if pre == 1:
                    action, winner = self.q_learning_turn(observation, episode) # Q学習のターン
                    if (winner == 0 and step < 4):
                        pos, winner = self.random_player_turn() # ランダムプレイヤーのターン
                else:
                    pos, winner = self.random_player_turn() # ランダムプレイヤーのターン
                    if (winner == 0 and step < 4):
                        action, winner = self.q_learning_turn(observation, episode) # Q学習のターン
                        
                if winner == 1:
                    win += 1
                    reward = 1
                elif winner == 2:
                    lose += 1
                    reward = -1
                
                if(step==4 and pre == 2):
                    break
                
                observation_next = self.game_board # 両者が打ち込んだ後の状態
                
                self.agent.update_Q_function(observation, action, reward, observation_next) # Q関数の更新
                
                observation = copy.deepcopy(observation_next) # game_boardと一緒に更新されないようdeepcopyを使う
                
                if winner != 0 or step == 4:
                    break
        
        print("win:", win, "lose:", lose, "draw", NUM_EPISODES - win - lose)
        print("勝率", win / NUM_EPISODES)
    
    def q_learning_turn(self, observation, episode):
        '''Q学習の行動選択、盤面反映、勝利判定'''
        action = self.agent.get_action(observation, episode) # エージェントが行動を選択
        self.game_board[action] = 1 # 盤面に反映
        winner = self.win_func(1) # 勝っているか判定
        return action, winner # 行動、勝利者を返す
    
    def random_player_turn(self):
        '''ランダムプレイヤーの行動選択、盤面反映、勝利判定'''
        pos = self.random_player_action() # ランダムプレイヤーが行動を選択
        self.game_board[pos] = 2 # 盤面に反映
        winner = self.win_func(2) # 勝っているか判定
        return pos, winner # 行動、勝利者を返す
    
    def random_player_action(self):
        '''空いているマスのランダムな行動選択'''
        choices = []
        for i in range(9):
            if (self.game_board[i] == 0):
                choices.append(i)
        action = random.choice(choices)
        return action
        
    def win_func(self, player):
        '''勝利判定'''
        # 勝ち手を列挙
        lines = [
          [0, 1, 2],
          [3, 4, 5],
          [6, 7, 8],
          [0, 3, 6],
          [1, 4, 7],
          [2, 5, 8],
          [0, 4, 8],
          [2, 4, 6],
        ]
        for i in range(0, len(lines)):
            [a, b, c] = lines[i]
            if (self.game_board[a] and self.game_board[a] == self.game_board[b] and self.game_board[b] == self.game_board[c]):
                return player # 勝ち手に当てはまってたら勝利者の番号を返す
        return 0
    
    def show(self, action, player):
        print("player:", player, ", action:", action)
        board = np.reshape(self.game_board, [3,3])
        print(board)

In [49]:
class Agent:
    '''三目並べのエージェントクラス'''
    
    def __init__(self, num_states, num_actions):
        self.brain = Brain(num_states, num_actions)
        # エージェントが行動を決定するための頭脳を生成
        
    def update_Q_function(self, observation, action, reward, observation_next):
        '''Q関数の更新'''
        self.brain.update_Q_table(observation, action, reward, observation_next)
            
    def get_action(self, observation, episode):
        '''行動の決定'''
        action = self.brain.decide_action(observation, episode)
        return action

In [50]:
class Brain:
    '''エージェントが持つ脳となるクラスです。Q学習を実行します'''
    
    def __init__(self, num_states, num_actions):
        self.num_states = num_states
        self.num_actions = num_actions
        self.count = 0
        self.q_table = np.random.uniform(low=0, high=1, size=(3**num_states, num_actions))
    
    def digitize_state(self, observation):
        return int(sum([x * (3**i) for i, x in enumerate(observation)]))
    
    def update_Q_table(self, observation, action, reward, observation_next):
        '''QテーブルをQ学習より更新'''
        state = self.digitize_state(observation) # 状態を離散化
        state_next = self.digitize_state(observation_next) # 次の状態を離散化
        Max_Q_next = max(self.q_table[state_next][:])
        if (reward != 0):
            self.q_table[state,action] = self.q_table[state, action] + ETA * (reward - self.q_table[state, action])
        else:
            self.q_table[state, action] = self.q_table[state, action] + ETA * (reward + GAMMA * Max_Q_next - self.q_table[state, action])
        # 次の状態で選択できない行動の行動価値関数をゼロにしておく
        choices = []
        for i in range(9):
            if (observation_next[i] != 0):
                choices.append(i)
        for x in choices:
            self.q_table[state_next][x] = MINIMUM
        
    def decide_action(self, observation, episode):
        '''ε-greedy法で徐々に最適行動のみを採用する'''
        
        state = self.digitize_state(observation)
        
        epsilon = 0.5 * (1 / (episode + 1))

        choices = []
        for i in range(9):
            if (observation[i] == 0):
                choices.append(i)
                    
        if epsilon <= np.random.uniform(0, 1):
            action = np.argmax(self.q_table[state][:])
            if (action not in choices):
                action = random.choice(choices)
            return action
        else:
            choices = []
            for i in range(9):
                if (observation[i] == 0):
                    choices.append(i)
            action = random.choice(choices)
            return action
                
    def show(self, observation):
        board = np.reshape(observation, [3,3])
        print(board)

In [99]:
NUM_EPISODES = 1000000
COMPLETE_EPISODES = 100
MINIMUM = - 10 ** 10
GAMMA = 0.99 # 時間割引率
ETA = 0.5 # 学習係数
env = Environment()
env.run()

win: 869999 lose: 95972 draw 34029
勝率 0.869999


In [100]:
class VS_Q_Learning:
    
    def __init__(self):
        num_states = 9
        num_actions = 9
        self.game_board = np.zeros(num_states)
        self.agent = VS_Agent(num_states, num_actions)
        
    def play(self):
        
        pre = int(input("先攻→0, 後攻→1を入力してください"))
        print("ゲームスタート！")
        self.show()
        observation = copy.deepcopy(self.game_board)
        winner = 0
            
        for step in range(9):
            
            if pre == 0:
                winner = self.your_turn()
                if (winner == 0 and step < 4):
                    winner = self.q_learning_turn()
            else:
                winner = self.q_learning_turn()
                if (winner == 0 and step < 4):
                    winner = self.your_turn()

            if winner == 1:
                print("AIの勝ち")
                break
            elif winner == 2:
                print("あなたの勝ち")
                break

            if step == 4:
                print("引き分け")
                complete_episodes = 0
                break
    
    def your_turn(self):
        print("あなたのターン")
        pos = int(input("手を選んでください:"))
        self.game_board[pos] = 2
        self.show()
        winner = self.winner_func(2)
        return winner
            
    def q_learning_turn(self):
        print("AIのターン")
        action = self.agent.get_action(self.game_board)
        self.game_board[action] = 1
        self.show()
        winner = self.winner_func(1)
        return winner
    
    def winner_func(self, player):
        # 勝ち手を列挙
        lines = [
          [0, 1, 2],
          [3, 4, 5],
          [6, 7, 8],
          [0, 3, 6],
          [1, 4, 7],
          [2, 5, 8],
          [0, 4, 8],
          [2, 4, 6],
        ]
        for i in range(0, len(lines)):
            [a, b, c] = lines[i]
            
            if self.game_board[a] and self.game_board[a] == self.game_board[b] and self.game_board[a] == self.game_board[c]: 
                return player
        return 0
    
    def show(self):
        show_board = np.array(["","","","","","","","",""])
        for i in range(9):
            x = self.game_board[i]
            if (x==0):
                show_board[i] = str(i)
            elif (x==1):
                show_board[i] = "×"
            else:
                show_board[i] = "⚪︎"
        board = np.reshape(show_board, [3,3])
        print(board)

In [101]:
class VS_Agent:
    '''三目並べのエージェントクラス'''
    
    def __init__(self, num_states, num_actions):
        self.brain = VS_Brain(num_states, num_actions)
        # エージェントが行動を決定するための頭脳を生成
            
    def get_action(self, observation):
        '''行動の決定'''
        action = self.brain.decide_action(observation)
        return action

In [102]:
class VS_Brain:
    '''エージェントが持つ脳となるクラスです。Q学習を実行します'''
    
    def __init__(self, num_states, num_actions):
        self.num_states = num_states
        self.num_actions = num_actions
    
    def digitize_state(self, observation):
        return int(sum([x * (3**i) for i, x in enumerate(observation)]))
        
    def decide_action(self, observation):
        choices = []
        for i in range(9):
            if (observation[i] == 0):
                choices.append(i)
        state = self.digitize_state(observation)
        action = np.argmax(q_table[state][:])
        if (action not in choices):
            action = random.choice(choices)
        return action

In [103]:
import numpy as np
import random

In [110]:
q_table = env.agent.brain.q_table
game = VS_Q_Learning()
game.play()

先攻→0, 後攻→1を入力してください1
ゲームスタート！
[['0' '1' '2']
 ['3' '4' '5']
 ['6' '7' '8']]
AIのターン
[['0' '1' '2']
 ['3' '×' '5']
 ['6' '7' '8']]
あなたのターン
手を選んでください:1
[['0' '⚪' '2']
 ['3' '×' '5']
 ['6' '7' '8']]
AIのターン
[['×' '⚪' '2']
 ['3' '×' '5']
 ['6' '7' '8']]
あなたのターン
手を選んでください:8
[['×' '⚪' '2']
 ['3' '×' '5']
 ['6' '7' '⚪']]
AIのターン
[['×' '⚪' '2']
 ['×' '×' '5']
 ['6' '7' '⚪']]
あなたのターン
手を選んでください:6
[['×' '⚪' '2']
 ['×' '×' '5']
 ['⚪' '7' '⚪']]
AIのターン
[['×' '⚪' '2']
 ['×' '×' '×']
 ['⚪' '7' '⚪']]
AIの勝ち
