# 0. 세팅

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

model_number = 2
learning_rate = 0.0002
gamma = 0.98
game_round = 10
strategies = ["Copycat", "All Cooperate", "All Cheat", "Grudger", "Detective",
              "Copykitten", "Simpleton", "Random", "Cheat-Downing", "Cooperate-Downing",
              "Joss", "Cheat-Tester", "Cooperate-Tester", "Tranquilizer", "Gradual",
              "Prober", "Pavlov", "Mistrust", "Per-Kind", "Per-Nasty"]

# 1. State, Action, Reward에 관한 환경 설정

## 1-1. REINFORCE 알고리즘

In [28]:
class REINFORCE(nn.Module):
    def __init__(self):
        super(REINFORCE, self).__init__()
        self.data = []

        self.fc1 = nn.Linear(game_round * 2, 128)
        self.fc2 = nn.Linear(128, 2)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.softmax(self.fc2(x), dim=0)
        return x

    def put_data(self, item):
        self.data.append(item)

    def train_net(self):
        R = 0
        policy_loss = []
        self.optimizer.zero_grad()
        for r, prob in self.data[::-1]:
            R = r + gamma * R
            loss = -torch.log(prob) * R
            policy_loss.append(loss.unsqueeze(0))
        policy_loss = torch.cat(policy_loss).sum()
        policy_loss.backward()
        self.optimizer.step()
        self.data = []

## 1-2. 점수 획득 규칙

In [29]:
def reward(my_action, opponent_action):
    if [my_action, opponent_action] == [1, 1]: # 협력, 협력
        return 2
    elif [my_action, opponent_action] == [1, 0]: # 협력, 배신
        return -1
    elif [my_action, opponent_action] == [0, 1]: # 배신, 협력
        return 3
    elif [my_action, opponent_action] == [0, 0]: # 배신, 배신
        return 0

## 1-3. 상대방의 전략과 현재 게임 상태에 기반하여 상대방의 행동 선택

In [30]:
class game():
    def reset():
        return [-1 for _ in range(game_round * 2)], strategies[random.randint(0, len(strategies)-1)]

    def step(state, strategy, my_action):
        my_state = []
        opp_state = []
        for i, s in enumerate(state):
            if s == -1:
                cur_game_round = i // 2
                break
            if i % 2 == 0:
                my_state.append(s)
            else:
                opp_state.append(s)

        if strategy == "Copycat":
            if cur_game_round == 0:
                opp_action = 1
            else:
                prev_my_action = my_state[-1]
                opp_action = prev_my_action

        elif strategy == "All Cooperate":
            opp_action = 1

        elif strategy == "All Cheat":
            opp_action = 0

        elif strategy == "Grudger":
            if cur_game_round == 0:
                opp_action = 1
            else:
                opp_action = 1 if 0 not in my_state else 0

        elif strategy == "Detective":
            if cur_game_round in [0, 2, 3]:
                opp_action = 1
            elif cur_game_round == 1:
                opp_action = 0
            else:
                if 0 in my_state[:4]:
                    prev_my_action = my_state[-1]
                    opp_action = prev_my_action
                else:
                    opp_action = 0

        elif strategy == "Copykitten":
            if cur_game_round in [0, 1]:
                opp_action = 1
            else:
                prev_prev_my_action, prev_my_action = my_state[-2], my_state[-1]
                opp_action = 0 if [prev_prev_my_action, prev_my_action] == [0, 0] else 1

        elif strategy == "Simpleton":
            if cur_game_round == 0:
                opp_action = 1
            else:
                prev_my_action, prev_opp_action = my_state[-1], opp_state[-1]
                if prev_my_action == 0:
                    if prev_opp_action == 0:
                        opp_action = 1
                    else:
                        opp_action = 0
                else:
                    opp_action = prev_opp_action

        elif strategy == "Random":
            opp_action = random.randint(0, 1)

        elif strategy == "Cheat-Downing":
            my_coop, my_cheat = 0, 0
            for ms in my_state:
                if ms == 0:
                    my_cheat += 1
                else:
                    my_coop += 1
            if my_cheat >= my_coop:
                opp_action = 0
            else:
                opp_action = 1

        elif strategy == "Cooperate-Downing":
            my_coop, my_cheat = 0, 0
            for ms in my_state:
                if ms == 0:
                    my_cheat += 1
                else:
                    my_coop += 1
            if my_cheat > my_coop:
                opp_action = 0
            else:
                opp_action = 1

        elif strategy == "Joss":
            if cur_game_round == 0:
                prob = random.randint(0, 9)
                if prob == 0:
                    opp_action = 0
                else:
                  opp_action = 1
            else:
                prev_my_action = my_state[-1]
                opp_action = prev_my_action

        elif strategy == "Cheat-Tester":
            if cur_game_round < game_round // 2:
                opp_action = random.randint(0, 1)
            else:
                my_coop, my_cheat = 0, 0
                for i in range(game_round // 2):
                    if my_state[i] == 0:
                        my_cheat += 1
                    else:
                        my_coop += 1
                if my_cheat >= my_coop:
                    opp_action = 0
                else:
                    opp_action = 1

        elif strategy == "Cooperate-Tester":
            if cur_game_round < game_round // 2:
                opp_action = random.randint(0, 1)
            else:
                my_coop, my_cheat = 0, 0
                for i in range(game_round // 2):
                    if my_state[i] == 0:
                        my_cheat += 1
                    else:
                        my_coop += 1
                if my_cheat > my_coop:
                    opp_action = 0
                else:
                    opp_action = 1

        elif strategy == "Tranquilizer":
            opp_coop, opp_cheat = 0, 0
            for os in opp_state:
                if os == 0:
                    opp_cheat += 1
                else:
                    opp_coop += 1
            if (opp_cheat + 1) / (opp_coop + opp_cheat + 1) < 0.25:
                opp_action = 0
            else:
                opp_action = 1

        elif strategy == "Gradual":
            if cur_game_round == 0:
                  opp_action = 1
            else:
                my_cheat = 0
                for ms in my_state:
                    if ms == 0:
                        my_cheat += 1
                x, cheat_sum = 1, 0
                for mc in range(my_cheat):
                    cheat_sum += x
                    x += 1
                opp_cheat = 0
                for os in opp_state:
                    if os == 0:
                        opp_cheat += 1
                if cheat_sum > opp_cheat:
                    opp_action = 0
                else:
                    opp_action = 1

        elif strategy == "Prober":
            if cur_game_round in [0, 1, 2]:
                if cur_game_round == 0:
                    opp_action = 1
                else:
                    opp_action = 0
            else:
                prev_three_round = [3, 4, 5]
                while True:
                    if cur_game_round in prev_three_round:
                        for i in range(3):
                            prev_three_round[i] -= 3
                        break
                    else:
                        for i in range(3):
                            prev_three_round[i] += 3
                if my_state[prev_three_round[1]] == 1 and my_state[prev_three_round[2]] == 0:
                    opp_action = my_state[-1]
                else:
                    if cur_game_round % 3 == 0:
                        opp_action = 0
                    else:
                        opp_action = 1

        elif strategy == "Pavlov":
            if cur_game_round == 0:
                opp_action = 1
            else:
                if opp_state[-1] == my_state[-1]:
                    opp_action = 1
                else:
                    opp_action = 0

        elif strategy == "Mistrust":
            if cur_game_round == 0:
                opp_action = 0
            else:
                prev_my_action = my_state[-1]
                opp_action = prev_my_action

        elif strategy == "Per-Kind":
            if cur_game_round % 3 in [0, 1]:
                opp_action = 1
            else:
                opp_action = 0

        elif strategy == "Per-Nasty":
            if cur_game_round % 3 == 0:
                opp_action = 1
            else:
                opp_action = 0

        state[cur_game_round * 2] = my_action
        state[cur_game_round * 2 + 1] = opp_action

        return state, reward(my_action, opp_action), (-1 not in state)

In [31]:
class selfplay:
    def reset():
        return [-1 for _ in range(game_round * 2)], [-1 for _ in range(game_round * 2)]

    def step(state_i, state_j, i_action, j_action):
        i_state_by_i = []
        j_state_by_i = []
        i_state_by_j = []
        j_state_by_j = []
        for i, s in enumerate(state_i):
            if s == -1:
                cur_game_round = i // 2
                break
            if i % 2 == 0:
                i_state_by_i.append(s)
            else:
                j_state_by_i.append(s)
        for i, s in enumerate(state_j):
            if s == -1:
                cur_game_round = i // 2
                break
            if i % 2 == 0:
                i_state_by_j.append(s)
            else:
                j_state_by_j.append(s)
        state_i[cur_game_round * 2] = i_action
        state_i[cur_game_round * 2 + 1] = j_action
        state_j[cur_game_round * 2] = j_action
        state_j[cur_game_round * 2 + 1] = i_action

        return state_i, state_j, reward(i_action, j_action), reward(j_action, i_action), (-1 not in state_i)


# 2. 강화학습

In [36]:
models = [REINFORCE() for _ in range(model_number)]
for model in models:
  score = 0.0
  print_interval = 1000

  X = []
  Y = []
  for n_epi in range(70001):
      s, opp_st = game.reset()
      done = False

      while not done:

          prob = model(torch.tensor(s).float())
          m = Categorical(prob)
          a = m.sample()
          s_prime, r, done = game.step(s, opp_st, a)
          model.put_data((r,prob[a]))
          s = s_prime
          score += r

      model.train_net()

      if n_epi%print_interval==0 and n_epi!=0:
          print("# of episode :{}, avg score : {}".format(n_epi, score/print_interval))
          X.append(n_epi)
          Y.append(score/print_interval)
          score = 0.0

# of episode :1000, avg score : 12.578
# of episode :2000, avg score : 13.575
# of episode :3000, avg score : 13.799
# of episode :4000, avg score : 13.538
# of episode :5000, avg score : 14.285
# of episode :6000, avg score : 13.892
# of episode :7000, avg score : 14.1
# of episode :8000, avg score : 14.645
# of episode :9000, avg score : 14.25
# of episode :10000, avg score : 14.066
# of episode :11000, avg score : 14.056
# of episode :12000, avg score : 14.574
# of episode :13000, avg score : 14.746
# of episode :14000, avg score : 14.567
# of episode :15000, avg score : 14.477
# of episode :16000, avg score : 14.109
# of episode :17000, avg score : 14.353
# of episode :18000, avg score : 14.244
# of episode :19000, avg score : 14.501
# of episode :20000, avg score : 14.433
# of episode :21000, avg score : 14.473
# of episode :22000, avg score : 14.446
# of episode :23000, avg score : 14.265
# of episode :24000, avg score : 14.844
# of episode :25000, avg score : 14.745
# of episode

In [59]:
def train_net(self):
    R = 0
    policy_loss = []
    self.optimizer.zero_grad()
    for r, prob in self.data[::-1]:
        R = r + gamma * R
        loss = -torch.log(prob) * R
        policy_loss.append(loss.unsqueeze(0))
    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()
    self.optimizer.step()
    self.data = []


In [60]:
for _ in range(10000):
    for i in range(model_number - 1):
        for j in range(i + 1, model_number):
            score_i = 0.0
            score_j = 0.0
            epinumber = 100

            listx = []
            listy = []
            for n_epi in range(epinumber):
                s_i, s_j = selfplay.reset()
                done = False

                # Train model[i] against model[j]
                while not done:
                    prob_i = models[i](torch.tensor(s_i).float())
                    prob_j = models[j](torch.tensor(s_j).float())
                    m_i = Categorical(prob_i)
                    m_j = Categorical(prob_j)
                    a_i = m_i.sample()
                    a_j = m_j.sample()
                    s_i_prime, s_j_prime, reward_i, reward_j, done = selfplay.step(s_i, s_j, a_i, a_j)
                    models[i].put_data((reward_i, prob_i[a_i]))
                    models[j].put_data((reward_j, prob_j[a_j]))
                    s_i = s_i_prime
                    s_j = s_j_prime
                    score_i += reward_i
                    score_j += reward_j

                models[i].train_net()  # Train model[i] after the episode

                if n_epi == epinumber-1:
                    print("# of episode :{}, score_i : {}, score_j : {}".format(n_epi, score_i / print_interval, score_j / print_interval))
                    listx.append(n_epi)
                    listy.append(score_i / print_interval)
                    score_i = 0.0
                    score_j = 0.0

            # Now, train model[j] against model[i] as the fixed opponent
            for n_epi in range(epinumber):
                s_i, s_j = selfplay.reset()
                done = False

                while not done:
                    prob_i = models[j](torch.tensor(s_j).float())
                    prob_j = models[i](torch.tensor(s_i).float())
                    m_i = Categorical(prob_i)
                    m_j = Categorical(prob_j)
                    a_i = m_i.sample()
                    a_j = m_j.sample()
                    s_i_prime, s_j_prime, reward_i, reward_j, done = selfplay.step(s_j, s_i, a_i, a_j)
                    models[j].put_data((reward_i, prob_i[a_i]))
                    models[i].put_data((reward_j, prob_j[a_j]))
                    s_i = s_i_prime
                    s_j = s_j_prime
                    score_i += reward_i
                    score_j += reward_j

                models[j].train_net()  # Train model[j] after the episode

                if n_epi == epinumber-1:
                    print("# of episode :{}, score_i : {}, score_j : {}".format(n_epi, score_i / print_interval, score_j / print_interval))
                    listx.append(n_epi)
                    listy.append(score_i / print_interval)
                    score_i = 0.0
                    score_j = 0.0


RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.