# 囚人のジレンマを対象とした強化学習（エージェントは1つだけ学習）

## ライブラリのインストール

In [1]:
!pip install axelrod

Collecting axelrod
  Using cached Axelrod-4.12.0-py2.py3-none-any.whl (191 kB)
Installing collected packages: axelrod
Successfully installed axelrod-4.12.0


## ライブラリのインポート

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML
import axelrod as axl

## 対戦数の設定

In [3]:
# Number of turns played in each training match.
NUM_MATCH = 5
# Action encoding used by the agent: 0 = C (cooperate), 1 = D (defect)

## エージェントクラス
状態を観測し、行動を決定し、状態・行動・報酬からQ値を更新する

In [4]:
class Agent():
    """Tabular Q-learning agent for the iterated prisoner's dilemma.

    States are derived from the opponent's most recent moves (at most two):
    state 0 before any move has been seen, 1 or 2 after a single move, and
    ``last + 3 * second_last`` afterwards, where a move is coded 1 for
    cooperation and 2 for defection. Actions are coded 0 = C (cooperate) and
    1 = D (defect). ``QV[state, action]`` is the Q-table.
    """
    def __init__(self, alpha=0.4, gamma=0.9):
        """Create an agent with an all-zero Q-table.

        alpha: learning rate applied in UpdateQValue.
        gamma: discount factor applied in UpdateQValue.
        """
        self.alpha = alpha
        self.gamma = gamma
        self.QV = np.zeros((3**2, 2))
    @staticmethod
    def _is_cooperation(move):
        """Return True when ``move`` is cooperation.

        Moves are identified by name so the class does not depend on a
        global ``C`` being defined by some other cell. This accepts axelrod
        ``Action`` members (an Enum whose member names are "C" and "D") as
        well as the plain strings "C" / "D".
        """
        return getattr(move, "name", move) == "C"
    @classmethod
    def _encode_state(cls, opponent_moves):
        """Map the opponent's moves seen so far to a row index of ``QV``.

        Only the length and the last two entries of ``opponent_moves`` are
        used; any other move than cooperation is coded as defection.
        """
        def code(move):
            return 1 if cls._is_cooperation(move) else 2
        seen = len(opponent_moves)
        if seen == 0:  # first turn: nothing observed yet
            return 0
        if seen == 1:  # second turn: only the opponent's first move
            return code(opponent_moves[0])
        # third turn onwards: the opponent's last two moves
        return code(opponent_moves[-1]) + code(opponent_moves[-2]) * 3
    def GetAction(self, epsilon, opponent_history, player_history):
        """Choose an action (0 = C, 1 = D) with the epsilon-greedy rule.

        epsilon: probability of picking a uniformly random action; values
            of 1 or more make every choice random.
        opponent_history: the opponent's moves so far (indexable, with len).
        player_history: accepted for interface compatibility; not used.

        Stores the observed state in ``self.state`` and the chosen action in
        ``self.action``, and returns the action.
        """
        self.state = self._encode_state(opponent_history)

        if epsilon > np.random.random():  # explore
            self.action = np.random.choice([0, 1])
        else:  # exploit; ties between equal Q-values are broken at random
            a = np.where(self.QV[self.state]==self.QV[self.state].max())[0]
            self.action = np.random.choice(a)
        return self.action
    def UpdateQValue(self, state, next_state, action, reward):
        """Apply one Q-learning update to ``QV[state, action]``."""
        next_maxQ = self.QV[next_state].max()
        target = reward + self.gamma * next_maxQ
        self.QV[state, action] = (1 - self.alpha) * self.QV[state, action] + self.alpha * target
    def Update(self, mp, ms):
        """Replay a finished match and update the Q-table turn by turn.

        mp: sequence of (own_move, opponent_move) pairs, one per turn.
        ms: final scores; ``ms[0]`` is this agent's total score.

        The number of turns is taken from ``mp`` itself. The reward is 0 on
        every turn except the last, where it is the agent's average score
        per turn. The last turn still bootstraps from the Q-value of the
        following state, exactly like every other turn.
        """
        num_turns = len(mp)
        opponent_moves = [turn[1] for turn in mp]
        state = 0  # the first turn is always played from state 0
        for i in range(num_turns):
            # State reached once the opponent's move on turn i is known.
            next_state = self._encode_state(opponent_moves[max(0, i - 1):i + 1])
            action = 0 if self._is_cooperation(mp[i][0]) else 1
            reward = 0
            if i == num_turns - 1:
                reward = float(ms[0]) / num_turns
                # reward = (float(ms[0]) / num_turns) ** 3  # alternative: cube the reward
            self.UpdateQValue(state, next_state, action, reward)
            state = next_state


## エージェント戦略の作成

In [5]:
from axelrod.strategy_transformers import *

# Learner shared by every MyStrategy instance; read as a module-level global.
A = Agent()


class MyStrategy(Player):
    """Axelrod player whose moves are chosen by the global agent ``A``."""

    name = "MyStrategy"

    def strategy(self, opponent):
        """Ask ``A`` for an action using the global ``epsilon`` and map it to C or D."""
        chosen = A.GetAction(epsilon, opponent.history, self.history)
        # Action code 0 means cooperate; anything else means defect.
        return C if chosen == 0 else D


## 定数の設定

In [6]:
num_episode=1000  # total number of training episodes

## 学習のための試行の繰り返し

In [7]:
# np.random.seed(1)  # uncomment to fix NumPy's random draws

# Training phase: play num_episode matches and learn from each one.
epsilon = 0
for episode in range(num_episode):
    # Exploration rate falls linearly from 5 towards 0 over the episodes.
    remaining = num_episode - episode
    epsilon = float(remaining) * 5 / num_episode
    match = axl.Match(players=[MyStrategy(), axl.Alternator()], turns=NUM_MATCH)  # opponent: Alternator
    # match = axl.Match(players=[MyStrategy(), axl.TitForTat()], turns=NUM_MATCH)  # opponent: Tit For Tat
    mp = match.play()
    ms = match.final_score()
    A.Update(mp, ms)

# Check the final result: one match with exploration switched off.
epsilon = 0
match = axl.Match(players=[MyStrategy(), axl.Alternator()], turns=NUM_MATCH)  # opponent: Alternator
# match = axl.Match(players=[MyStrategy(), axl.TitForTat()], turns=NUM_MATCH)  # opponent: Tit For Tat
mp = match.play()
ms = match.final_score()
print(mp, ms, A.QV, sep="\n")


[(D, C), (D, D), (D, C), (C, D), (D, C)]
(16, 6)
[[5.98921783 6.00020104]
 [6.60675105 6.60849787]
 [0.         0.        ]
 [0.         0.        ]
 [0.         0.        ]
 [6.09022848 7.16761956]
 [0.         0.        ]
 [5.9172964  5.90237197]
 [0.         0.        ]]


## 学習済みのQ値を用いて11回対戦した場合の結果

In [8]:
# Play an 11-turn match with the learned Q-values and no exploration.
epsilon = 0
match = axl.Match(players=[MyStrategy(), axl.Alternator()], turns=11)  # opponent: Alternator
# match = axl.Match(players=[MyStrategy(), axl.TitForTat()], turns=11)  # opponent: Tit For Tat
mp = match.play()
ms = match.final_score()
print(mp, ms, sep="\n")

[(D, C), (D, D), (D, C), (C, D), (D, C), (C, D), (D, C), (C, D), (D, C), (C, D), (D, C)]
(31, 21)


## 学習を100回行って最大の報酬を得られる結果とQ値を表示

In [9]:
#np.random.seed(1)
# Train 100 independent agents and remember the run whose greedy evaluation
# match gives the learner the highest score.
# Start below any possible score so the first run is always recorded and the
# stock_* variables are guaranteed to exist after the loop.
max_ms = float("-inf")
for n in range(100):
    A = Agent()  # fresh Q-table for this run
    epsilon = 0
    for episode in range(num_episode):
        # Exploration rate falls linearly from 5 towards 0 over the episodes.
        epsilon = float(num_episode-episode)*5/num_episode
        match = axl.Match([MyStrategy(), axl.Alternator()], NUM_MATCH)  # opponent: Alternator
#        match = axl.Match([MyStrategy(), axl.TitForTat()], NUM_MATCH)  # opponent: Tit For Tat
        mp = match.play()
        ms = match.final_score()
        A.Update(mp,ms)

    # Evaluate this run with exploration switched off.
    epsilon=0
    match = axl.Match([MyStrategy(), axl.Alternator()], NUM_MATCH)  # opponent: Alternator
#    match = axl.Match([MyStrategy(), axl.TitForTat()], NUM_MATCH)  # opponent: Tit For Tat
    mp = match.play()
    ms = match.final_score()
    print(mp)
    print(ms)
    if max_ms<ms[0]:  # strict comparison: the earliest best run is kept on ties
        max_ms = ms[0]
        stock_mp = mp
        stock_ms = ms
        # Snapshot the table so later use of this agent cannot alter the record.
        stock_QV = A.QV.copy()

print('最大得点')
print(stock_mp)
print(stock_ms)
print(stock_QV)


[(D, C), (C, D), (C, C), (C, D), (C, C)]
(11, 16)
[(C, C), (D, D), (D, C), (C, D), (D, C)]
(14, 9)
[(C, C), (C, D), (C, C), (D, D), (C, C)]
(10, 15)
[(C, C), (D, D), (C, C), (D, D), (C, C)]
(11, 11)
[(D, C), (C, D), (C, C), (D, D), (C, C)]
(12, 12)
[(D, C), (C, D), (C, C), (C, D), (C, C)]
(11, 16)
[(C, C), (D, D), (C, C), (C, D), (C, C)]
(10, 15)
[(D, C), (D, D), (D, C), (D, D), (D, C)]
(17, 2)
[(D, C), (C, D), (D, C), (C, D), (D, C)]
(15, 10)
[(D, C), (C, D), (C, C), (C, D), (C, C)]
(11, 16)
[(D, C), (C, D), (C, C), (C, D), (C, C)]
(11, 16)
[(D, C), (C, D), (D, C), (D, D), (D, C)]
(16, 6)
[(C, C), (D, D), (C, C), (C, D), (C, C)]
(10, 15)
[(D, C), (D, D), (D, C), (D, D), (D, C)]
(17, 2)
[(D, C), (C, D), (D, C), (C, D), (D, C)]
(15, 10)
[(D, C), (D, D), (D, C), (C, D), (D, C)]
(16, 6)
[(D, C), (C, D), (D, C), (C, D), (D, C)]
(15, 10)
[(D, C), (D, D), (D, C), (C, D), (D, C)]
(16, 6)
[(C, C), (D, D), (D, C), (D, D), (D, C)]
(15, 5)
[(D, C), (C, D), (D, C), (C, D), (D, C)]
(15, 10)
[(D, C)

## 繰り返しの試行で得られた学習済みのQ値を用いて11回対戦した場合の結果

In [10]:
# Load the stored best Q-table into the current agent and play an 11-turn
# match with exploration switched off.
A.QV = stock_QV
epsilon = 0
match = axl.Match(players=[MyStrategy(), axl.Alternator()], turns=11)
# match = axl.Match(players=[MyStrategy(), axl.TitForTat()], turns=11)
mp = match.play()
ms = match.final_score()
print(mp, ms, sep="\n")

[(D, C), (D, D), (D, C), (D, D), (D, C), (D, D), (D, C), (D, D), (D, C), (D, D), (D, C)]
(35, 5)
