<a href="https://colab.research.google.com/github/joshuazhu17/Project_Studio/blob/main/HexGameQLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch import distributions, nn
import numpy as np
import random
import math

In [None]:
class HexEnvironment():
  """Two-player Hex board with win detection and move-selection policies.

  The board is stored as two ``size`` x ``size`` grids of 0/1 flags:
  ``grid[0]`` marks player 1's stones and ``grid[1]`` marks player 2's stones.
  Player 1 wins by connecting the bottom row (``x == size-1``) to the top row
  (``x == 0``); player 2 wins by connecting the left column (``y == 0``) to
  the right column (``y == size-1``).
  """

  def __init__(self, boardsize):
    """Create an empty ``boardsize`` x ``boardsize`` board with player 1 to move."""
    self.size = boardsize
    #the first is where p1 puts its pieces, the second is where p2 puts its pieces
    self.grid = [
      [[0] * boardsize for _ in range(boardsize)],
      [[0] * boardsize for _ in range(boardsize)],
    ]
    self.current_player = 1
    self.done = False

  def neighbors(self, x, y):
    """Return the on-board hex neighbours of (x, y) as a list of ``[a, b]`` pairs."""
    candidates = [[x-1, y+1], [x-1, y], [x, y-1], [x+1, y-1], [x+1, y], [x, y+1]]
    return [[a, b] for [a, b] in candidates
            if 0 <= a < self.size and 0 <= b < self.size]

  def obs(self):
    """Return the live grid (not a copy); later moves and resets mutate it in place."""
    return self.grid

  def is_done(self):
    """Return True once ``search_for_win`` has found a winner."""
    return self.done

  def get_player(self):
    """Return the player to move: 1 or 2."""
    return self.current_player

  def reset(self):
    """Clear the board in place and give the move back to player 1."""
    self.current_player = 1
    self.done = False
    #zero the existing rows so references handed out by obs() stay valid
    for i in range(self.size):
      for j in range(self.size):
        self.grid[0][i][j] = 0
        self.grid[1][i][j] = 0

  def print_self(self):
    """Print the board as a slanted grid: ``1``/``2`` for stones, ``_`` for empty."""
    for i in range(self.size):
      cells = []
      for j in range(self.size):
        if self.grid[0][i][j] == 1:
          cells.append("1")
        elif self.grid[1][i][j] == 1:
          cells.append("2")
        else:
          cells.append("_")
      #row i is shifted right by i spaces; every cell is followed by one space
      print(" "*i + "".join(cell + " " for cell in cells))

  def search_for_win(self):
    """Check both players for a winning chain.

    Runs a depth-first search per player from that player's starting edge.
    Sets ``self.done`` when a winner is found.

    Returns:
      1 if player 1 has won, 2 if player 2 has won, 0 if no one has won.
    """
    last = self.size - 1

    for player in range(2): #0 is p1, 1 is p2
      stones = self.grid[player]

      #Initialize search with the side corresponding to the player
      if player == 0:
        stack = [[last, i] for i in range(self.size) if stones[last][i] == 1]
      else:
        stack = [[i, 0] for i in range(self.size) if stones[i][0] == 1]

      #Mark cells when they are pushed so no cell is ever stacked twice
      visited = [[False] * self.size for _ in range(self.size)]
      for [x, y] in stack:
        visited[x][y] = True

      #Do the search
      while stack:
        [x, y] = stack.pop()
        for [newx, newy] in self.neighbors(x, y):
          if stones[newx][newy] == 1 and not visited[newx][newy]:
            visited[newx][newy] = True
            stack.append([newx, newy])

      #End the search by checking the other side corresponding to the player
      if player == 0:
        reached = any(visited[0])
      else:
        reached = any(row[last] for row in visited)
      if reached:
        self.done = True
        return player + 1

    #0 means no one has won
    return 0

  def _empty_cells(self):
    """Yield the (i, j) coordinates of every unoccupied cell in row-major order."""
    for i in range(self.size):
      for j in range(self.size):
        if self.grid[0][i][j] == 0 and self.grid[1][i][j] == 0:
          yield (i, j)

  def filter_illegal(self, qvals, transpose = False):
    """Return the legal moves paired with their q-values, best first.

    Args:
      qvals: 2D indexable of q-values. If ``transpose`` is true it is indexed
        as ``qvals[j][i]`` for the true cell (i, j).
      transpose: whether ``qvals`` describes the transposed board.

    Returns:
      A list of ``((i, j), q)`` tuples in true board coordinates, sorted by
      descending q. Ties keep row-major order.
    """
    valid_probs = {}
    for (i, j) in self._empty_cells():
      valid_probs[(i, j)] = qvals[j][i] if transpose else qvals[i][j]

    #the keys are the true (i, j) coordinates, the transpose is already dealt with
    return sorted(valid_probs.items(), key=lambda item: item[1], reverse=True)

  def greedy(self, qvals, transpose = False):
    """Return the legal move (x, y) with the highest q-value.

    If ``transpose`` is true the qvals are of the transposed board; the
    output is always in the true board's coordinates.
    """
    sorted_vals = self.filter_illegal(qvals, transpose)
    (best_x, best_y) = sorted_vals[0][0]
    return (best_x, best_y)

  def epsilon_greedy(self, qvals, epsilon, transpose = False):
    """Return the greedy move, or with probability ``epsilon`` a uniformly random legal move.

    If ``transpose`` is true the qvals are of the transposed board; the
    output is always in the true board's coordinates.
    """
    sorted_vals = self.filter_illegal(qvals, transpose)
    (best_x, best_y) = sorted_vals[0][0]

    if random.random() < epsilon:
      #explore: the random pick may still land on the greedy move
      (best_x, best_y) = sorted_vals[int(len(sorted_vals)*(random.random()))][0]

    return (best_x, best_y)

  def random(self):
    """Return a uniformly random legal move (x, y)."""
    #inside this body `random` is still the module; the method is only reachable via self
    valid_moves = list(self._empty_cells())
    index = random.randrange(0, len(valid_moves))
    (random_x, random_y) = valid_moves[index]
    return (random_x, random_y)

  #a stochastic (sampling) policy doesn't make sense for Q-learning, so there is none

  def step(self, policy, move = None, epsilon = 0, transpose = False):
    """Choose a move with ``policy``, place the current player's stone, and pass the turn.

    Args:
      policy: one of 'epsilon_greedy', 'greedy' or 'random'.
      move: 2D array with q values for each hexagon of the grid (unused by
        the 'random' policy).
      epsilon: exploration probability for 'epsilon_greedy'.
      transpose: whether ``move`` describes the transposed board.

    Returns:
      The (x, y) cell that was played, in true board coordinates.

    Raises:
      ValueError: if ``policy`` is not a known policy name.
      Exception: if the chosen cell is already occupied.
    """
    if policy == 'epsilon_greedy':
      (x, y) = self.epsilon_greedy(move, epsilon, transpose)
    elif policy == 'greedy':
      (x, y) = self.greedy(move, transpose)
    elif policy == 'random':
      (x, y) = self.random()
    else:
      #fail before the board is touched instead of hitting an unbound (x, y) below
      raise ValueError("Unknown policy: {!r}".format(policy))

    if self.grid[0][x][y] == 1 or self.grid[1][x][y] == 1:
      raise Exception("Tried to make an illegal move")
    self.grid[self.current_player - 1][x][y] = 1

    self.current_player = 2 if self.current_player == 1 else 1

    return (x, y)

In [None]:
class Agent():
  """Pairs a Q-network with the board size needed to reshape its output."""

  def __init__(self, model, size):
    self.size = size
    self.model = model

  def action(self, grid, transpose = False):
    """Run the model on ``grid`` and return its q-values in two forms.

    Args:
      grid: nested list convertible to a tensor; presumably the
        [p1 plane, p2 plane] pair returned by HexEnvironment.obs().
      transpose: if true, dims 1 and 2 are swapped and dim 0 is flipped
        before the model sees the input.

    Returns:
      (out, qvals): ``qvals`` is the raw model output (it keeps its grad
      history), ``out`` is a detached size x size nested list of the same
      values.
    """
    board = torch.tensor(grid).float()
    if transpose:
      board = torch.flip(torch.transpose(board, 1, 2), [0])
    qvals = self.model(board)
    shape = (self.size, self.size)
    as_lists = qvals.detach().clone().numpy().reshape(shape).tolist()
    return as_lists, qvals

In [None]:
class LinearModel(nn.Module):
  """Fully connected Q-network mapping a two-plane board to one q-value per cell.

  Args:
    boardsize: side length of the board. The input has ``2 * boardsize**2``
      values (one plane per player) and the output has ``boardsize**2``.
      The default of 5 reproduces the original 50 -> 25 network.
    hidden_size: width of the two hidden layers (default 128).
  """
  def __init__(self, boardsize = 5, hidden_size = 128):
    super().__init__()
    num_cells = boardsize * boardsize
    #the attribute stays `layer1` so existing state_dict keys keep loading
    self.layer1 = nn.Sequential(
      nn.Linear(2 * num_cells, hidden_size),
      nn.ReLU(),
      nn.Linear(hidden_size, hidden_size),
      nn.ReLU(),
      nn.Linear(hidden_size, num_cells)
      )
  def forward(self, x):
    """Flatten ``x`` to 1D and return the 1D tensor of q-values."""
    out = torch.flatten(x)
    out = self.layer1(out)
    return out

In [None]:
class Controller():
  """Drives training and evaluation games between agents on a HexEnvironment.

  Args:
    size: side length of the board.
    player1: agent evaluated as player 1 in ``singleplayer_p1_against_random``.
    player2: agent evaluated as player 2 in ``singleplayer_p2_against_random``.
    singleplayer: optional agent that plays BOTH sides during self-play
      training (``run_game_singleplayer_no_replay``) and ``test_game``. When
      it moves as player 2 it is shown the transposed board.
  """

  def __init__(self, size, player1, player2, singleplayer = None):
    self.size = size
    self.env = HexEnvironment(size)
    self.player1 = player1
    self.player2 = player2
    self.singleplayer = singleplayer

    #Rewards: win_reward for the move that wins, move_reward for every other move
    self.gamma = 0.9 #the discount factor, I should fiddle around with this
    self.move_reward = 0 #right now I have these set to zero
    self.win_reward = 10.0
    # there is no separate lose reward

    # Experience replay stores (state, action taken, reward for action, next state)
    # Not implemented yet: these lists are never filled
    self.p1replay = []
    self.p2replay = []
    self.winner_history = []

    self.p1_optimizer = torch.optim.Adam(self.player1.model.parameters())
    self.p2_optimizer = torch.optim.Adam(self.player2.model.parameters())
    self.singleplayer_optimizer = None
    if singleplayer is not None:
      self.singleplayer_optimizer = torch.optim.Adam(self.singleplayer.model.parameters())

  def _require_singleplayer(self):
    """Raise a clear error if no singleplayer agent was given to the constructor."""
    if self.singleplayer is None:
      raise ValueError("This method needs a Controller built with a singleplayer agent")

  def _singleplayer_move(self, policy, epsilon = 0):
    """Let the singleplayer agent move for whichever player is up.

    Returns:
      (player, movex, movey, qvals): the player that moved, the cell played in
      true coordinates, and the flat q-value tensor indexed by
      ``x*size + y`` in true coordinates.
    """
    player = self.env.current_player
    transpose = (player == 2) #player 2 sees the transposed board
    move, qvals = self.singleplayer.action(self.env.obs(), transpose)
    (movex, movey) = self.env.step(policy, move = move, epsilon = epsilon, transpose = transpose)
    if transpose:
      #transpose qvals to get the true coordinates
      qvals = torch.flatten(torch.transpose(qvals.reshape((self.env.size, self.env.size)), 0, 1))
    return player, movex, movey, qvals

  def run_game_singleplayer_no_replay(self, epsilon=0, printgame = False):
    """Play one self-play game, doing a gradient step after every move.

    No experience replay is used. The target for a move is the win reward if
    the move wins; otherwise it is ``move_reward - gamma * (best q-value the
    opponent has among its legal replies)``. Prints the target and loss for
    every move.

    Args:
      epsilon: exploration probability for the epsilon-greedy policy.
      printgame: if true, print the final board.

    Raises:
      ValueError: if the controller has no singleplayer agent.
    """
    self._require_singleplayer()
    self.env.reset()
    while not self.env.is_done():
      _, movex, movey, qvals = self._singleplayer_move('epsilon_greedy', epsilon)
      action = movex*self.size + movey

      #Run gradient descent
      #To get the next Q value, I look at the best move the opponent can make and take the opposite of that
      self.singleplayer.model.zero_grad()

      winner = self.env.search_for_win()
      if winner != 0:
        self.winner_history.append(winner)
        reward = self.win_reward
        nextq = 0
      else:
        reward = self.move_reward
        #evaluate the position once, from the point of view of the player now to move
        opponent_transpose = (self.env.current_player == 2)
        with torch.no_grad():
          nextmove, _ = self.singleplayer.action(self.env.obs(), opponent_transpose)
        legal_move_q_pairs = self.env.filter_illegal(nextmove, opponent_transpose)
        quality_for_enemy = legal_move_q_pairs[0][1]
        nextq = -1 * quality_for_enemy

      target = torch.tensor(reward + self.gamma*nextq)

      loss = (target - qvals[action])**2
      print("Target: ", target)
      print("Loss: ", loss)
      loss.backward()
      self.singleplayer_optimizer.step()

    if printgame:
      self.env.print_self()

  def test_game(self, printgame = False):
    """Play one greedy self-play game without gradients and record the q-values.

    Args:
      printgame: if true, print the final board.

    Returns:
      (p1_qvals, p2_qvals): lists of detached flat q-value tensors, one per
      move of the corresponding player, in true board coordinates.

    Raises:
      ValueError: if the controller has no singleplayer agent.
    """
    self._require_singleplayer()
    with torch.no_grad():
      self.env.reset()
      p1_qvals = []
      p2_qvals = []
      while not self.env.is_done():
        player, _, _, qvals = self._singleplayer_move('greedy')
        history = p1_qvals if player == 1 else p2_qvals
        history.append(qvals.detach().clone())

        winner = self.env.search_for_win()
        if winner != 0:
          print("Winner: ", winner)

      if printgame:
        self.env.print_self()
      return (p1_qvals, p2_qvals)

  def log(self):
    """Print the logging template; the statistics themselves are not computed yet."""
    print("-----------------------------------")
    print("P1 expected reward mean: ")
    print("P1 expected reward std: ")
    print("P2 expected reward mean: ")
    print("P2 expected reward std: ")
    print("-----------------------------------")

  def train_singleplayer(self, games, batch_size, verbose=False):
    """Run ``games`` self-play training games with a decaying epsilon.

    Epsilon falls linearly from 1 by 0.001 per game and is floored at 0.2.
    ``batch_size`` only sets how often ``log`` is called when ``verbose``.
    """
    for i in range(games):
      epsilon = max(1-(i/1000), 0.2)
      self.run_game_singleplayer_no_replay(epsilon)
      if i%batch_size == 0 and i != 0:
        if verbose:
          self.log()

  def _play_against_random(self, agent, agent_player, n):
    """Play ``n`` games of greedy ``agent`` (as ``agent_player``) against random moves.

    Prints the final board of every game and then the agent's win rate, or a
    short notice if no game was completed (for example when ``n`` is 0).
    """
    transpose = (agent_player == 2) #player 2 sees the transposed board
    with torch.no_grad():
      wins = 0.0
      total = 0.0
      for _ in range(n):
        self.env.reset()
        while not self.env.is_done():
          if self.env.current_player == agent_player:
            move, _ = agent.action(self.env.obs(), transpose)
            self.env.step('greedy', move=move, transpose=transpose)
          else:
            self.env.step('random')
          winner = self.env.search_for_win()
          if winner != 0:
            total += 1
            if winner == agent_player:
              wins += 1
        self.env.print_self()
      if total > 0:
        print(wins/total)
      else:
        #avoid dividing by zero when nothing was played
        print("No games were completed, so there is no win rate to report")

  def singleplayer_p1_against_random(self, n):
    """Evaluate ``player1`` playing greedily as player 1 against a random player 2."""
    self._play_against_random(self.player1, 1, n)

  def singleplayer_p2_against_random(self, n):
    """Evaluate ``player2`` playing greedily as player 2 against a random player 1."""
    self._play_against_random(self.player2, 2, n)

  def random_against_random(self, n):
    """Play ``n`` games of random against random; print each final board and the win counts."""
    p1_wins = 0.0
    p2_wins = 0.0
    for _ in range(n):
      self.env.reset()
      while not self.env.is_done():
        self.env.step('random')
        winner = self.env.search_for_win()
        if winner == 1:
          p1_wins += 1
        elif winner == 2:
          p2_wins += 1
      self.env.print_self()
    print("P1 wins: ", p1_wins)
    print("P2 wins: ", p2_wins)

In [None]:
# A single network on a 5x5 board; the cells below reuse it for every player slot.
doubleagent = Agent(model=LinearModel(), size=5)

In [None]:
# The same agent fills the player1, player2 and singleplayer roles.
yahaha = Controller(
  size=5,
  player1=doubleagent,
  player2=doubleagent,
  singleplayer=doubleagent,
)

In [None]:
# One self-play training game with epsilon 1, printing the final board.
yahaha.run_game_singleplayer_no_replay(printgame=True, epsilon=1)

Target:  tensor(-0.1050)
Loss:  tensor(0.0055, grad_fn=<PowBackward0>)
Target:  tensor(-0.1185)
Loss:  tensor(0.0567, grad_fn=<PowBackward0>)
Target:  tensor(-0.1017)
Loss:  tensor(0.0026, grad_fn=<PowBackward0>)
Target:  tensor(-0.1262)
Loss:  tensor(0.0398, grad_fn=<PowBackward0>)
Target:  tensor(-0.0960)
Loss:  tensor(0.0019, grad_fn=<PowBackward0>)
Target:  tensor(-0.1326)
Loss:  tensor(0.0352, grad_fn=<PowBackward0>)
Target:  tensor(-0.0979)
Loss:  tensor(0.0012, grad_fn=<PowBackward0>)
Target:  tensor(-0.1298)
Loss:  tensor(0.0284, grad_fn=<PowBackward0>)
Target:  tensor(-0.1329)
Loss:  tensor(1.4613e-05, grad_fn=<PowBackward0>)
Target:  tensor(-0.0998)
Loss:  tensor(0.0065, grad_fn=<PowBackward0>)
Target:  tensor(-0.1291)
Loss:  tensor(0.0051, grad_fn=<PowBackward0>)
Target:  tensor(-0.1002)
Loss:  tensor(0.0031, grad_fn=<PowBackward0>)
Target:  tensor(-0.1409)
Loss:  tensor(0.0085, grad_fn=<PowBackward0>)
Target:  tensor(-0.0717)
Loss:  tensor(2.7764e-05, grad_fn=<PowBackward0>

In [None]:
# Train on 1000 self-play games.
yahaha.train_singleplayer(games=1000, batch_size=1)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Target:  tensor(1.0941)
Loss:  tensor(0.0358, grad_fn=<PowBackward0>)
Target:  tensor(-0.7196)
Loss:  tensor(0.7756, grad_fn=<PowBackward0>)
Target:  tensor(1.1686)
Loss:  tensor(0.1289, grad_fn=<PowBackward0>)
Target:  tensor(-4.1510)
Loss:  tensor(0.7468, grad_fn=<PowBackward0>)
Target:  tensor(3.2165)
Loss:  tensor(2.0079, grad_fn=<PowBackward0>)
Target:  tensor(-4.9568)
Loss:  tensor(1.9234, grad_fn=<PowBackward0>)
Target:  tensor(3.3722)
Loss:  tensor(4.5263, grad_fn=<PowBackward0>)
Target:  tensor(-3.8973)
Loss:  tensor(0.0228, grad_fn=<PowBackward0>)
Target:  tensor(1.4414)
Loss:  tensor(8.2590, grad_fn=<PowBackward0>)
Target:  tensor(-4.5147)
Loss:  tensor(8.6848, grad_fn=<PowBackward0>)
Target:  tensor(2.5376)
Loss:  tensor(5.9811, grad_fn=<PowBackward0>)
Target:  tensor(-5.4557)
Loss:  tensor(6.5221, grad_fn=<PowBackward0>)
Target:  tensor(2.6425)
Loss:  tensor(11.0998, grad_fn=<PowBackward0>)
Target:  tensor(-4

In [None]:
# Greedy agent as player 1 against random moves, 1000 games.
yahaha.singleplayer_p1_against_random(n=1000)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 2 2 1 1 2 
  _ 2 1 _ _ 
   _ 1 2 1 _ 
    1 2 1 _ _ 
_ _ _ 1 _ 
 _ _ 1 _ _ 
  _ _ 1 2 _ 
   _ 1 _ 2 _ 
    2 1 2 _ _ 
_ _ _ 1 _ 
 2 _ 1 _ _ 
  2 _ 1 _ _ 
   _ 1 _ 2 2 
    _ 1 _ _ _ 
2 _ _ 1 _ 
 _ _ 1 _ _ 
  _ 2 1 _ _ 
   _ 1 _ _ _ 
    _ 1 2 2 _ 
_ _ 2 1 2 
 _ _ 1 _ _ 
  _ _ 1 _ 2 
   _ 1 2 _ _ 
    _ 1 _ _ _ 
_ 2 _ 1 2 
 _ _ 1 _ _ 
  _ _ 1 2 2 
   _ 1 _ _ _ 
    _ 1 _ _ _ 
2 2 _ 2 1 
 _ 2 1 1 2 
  _ 2 1 _ _ 
   _ 1 1 _ _ 
    _ 1 _ _ _ 
_ _ _ 1 _ 
 2 _ 1 2 _ 
  _ _ 1 _ _ 
   _ 1 2 2 _ 
    _ 1 _ _ _ 
2 _ _ 1 _ 
 _ _ 1 _ 2 
  _ _ 1 _ _ 
   _ 1 _ _ 2 
    2 1 _ _ _ 
2 2 1 1 1 
 2 2 1 1 1 
  _ 1 1 2 2 
   1 2 2 2 2 
    1 1 1 2 _ 
2 2 2 1 1 
 2 _ 1 1 2 
  2 2 1 1 _ 
   _ 1 1 1 _ 
    1 2 2 _ _ 
_ _ _ 1 _ 
 _ 2 1 _ _ 
  _ 2 1 _ _ 
   _ 2 1 _ _ 
    _ 1 _ _ 2 
_ 2 2 1 1 
 _ _ 1 1 _ 
  _ _ 1 _ 2 
   2 2 1 _ _ 
    _ 2 1 _ _ 
_ 2 _ 2 1 
 _ _ 1 1 _ 
  _ 2 1 2 _ 
   2 1 1 2 _ 
    _ 1 _ _ _ 
_ _ _ 1 _ 
 _ _ 1 _ _ 
  _ _ 1 2 2 


In [None]:
# Greedy agent as player 2 against random moves, 1000 games.
yahaha.singleplayer_p2_against_random(n=1000)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 _ _ 1 2 2 
  2 2 2 2 _ 
   1 1 1 _ 1 
    2 _ _ _ _ 
1 _ _ 1 2 
 1 _ _ 2 1 
  1 2 2 2 1 
   2 2 _ 2 _ 
    2 1 1 1 _ 
_ _ 1 1 2 
 _ _ _ 2 1 
  2 2 2 1 2 
   2 2 _ 1 1 
    1 1 _ _ _ 
_ 1 1 _ 1 
 _ 1 2 2 2 
  _ 2 1 2 _ 
   2 _ _ 1 _ 
    _ _ _ _ _ 
_ _ _ _ _ 
 1 1 _ 2 2 
  _ 2 2 _ _ 
   2 1 _ _ 1 
    1 _ _ _ _ 
1 1 _ 1 _ 
 _ _ _ 2 2 
  1 2 2 _ _ 
   2 _ _ _ _ 
    _ _ _ 1 _ 
_ _ _ _ 1 
 _ _ _ 2 2 
  1 2 2 2 1 
   2 _ _ 1 1 
    1 _ _ _ _ 
_ _ 1 _ 1 
 _ _ 1 2 2 
  _ _ 2 _ _ 
   2 2 1 _ _ 
    _ 1 _ _ _ 
_ 1 _ 1 _ 
 _ _ _ 2 2 
  1 2 2 _ _ 
   2 _ _ 1 _ 
    _ _ 1 _ _ 
_ 1 _ _ 2 
 _ _ 2 2 1 
  1 2 1 1 2 
   1 2 _ 2 _ 
    2 _ 1 1 _ 
_ _ _ 1 _ 
 _ _ _ 2 2 
  _ 1 2 1 _ 
   2 2 _ 1 1 
    _ _ _ _ _ 
_ _ _ _ _ 
 1 _ 1 2 2 
  2 2 2 2 1 
   1 1 _ _ _ 
    2 1 _ _ 1 
1 1 _ _ 1 
 _ _ 1 2 1 
  1 2 2 2 2 
   2 2 _ _ _ 
    2 1 1 _ _ 
_ _ _ 1 _ 
 _ _ _ 2 2 
  _ 2 2 _ _ 
   2 1 1 1 _ 
    _ _ _ _ 1 
_ _ _ _ _ 
 _ _ 1 2 2 
  _ 1 2 2 _ 


In [None]:
# Baseline: random moves on both sides, 1000 games.
yahaha.random_against_random(n=1000)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  2 _ 2 2 1 
   1 _ 2 2 1 
    2 1 2 1 1 
1 1 1 2 2 
 2 1 1 1 1 
  1 1 2 1 1 
   2 1 2 2 2 
    2 1 2 2 2 
1 _ 2 1 2 
 2 1 2 2 1 
  2 1 2 _ 1 
   2 2 2 _ 1 
    _ 1 _ 1 1 
1 2 1 2 1 
 1 2 1 2 1 
  1 1 2 2 2 
   1 2 _ 2 1 
    2 1 2 1 2 
1 2 1 1 _ 
 2 1 1 2 2 
  1 _ _ 1 2 
   _ 1 2 2 1 
    2 2 1 _ 2 
2 2 1 2 2 
 1 1 1 1 2 
  1 1 2 2 1 
   2 1 2 1 2 
    1 1 1 2 2 
2 1 _ 1 2 
 1 2 _ _ 1 
  1 2 2 2 1 
   1 1 2 1 2 
    _ 1 _ _ 2 
1 1 1 2 1 
 1 2 1 2 2 
  2 1 1 1 1 
   2 2 2 1 2 
    1 2 1 2 2 
2 1 2 2 1 
 2 1 1 _ 2 
  2 1 2 2 1 
   1 _ 2 1 1 
    1 2 1 1 2 
_ 2 1 1 1 
 _ _ 2 _ 2 
  _ 2 2 1 1 
   1 1 2 2 2 
    2 2 1 1 1 
1 _ 2 2 1 
 2 1 1 2 1 
  1 2 1 2 2 
   1 2 2 1 1 
    2 1 2 2 1 
_ 2 _ 1 1 
 2 2 2 1 2 
  2 2 1 2 _ 
   2 _ 1 1 1 
    _ 1 1 _ 1 
2 1 1 _ 2 
 1 1 2 1 2 
  1 2 2 2 2 
   2 _ 1 1 1 
    2 1 2 _ 1 
2 2 2 2 1 
 1 1 1 1 2 
  1 1 1 2 2 
   2 2 1 2 1 
    2 1 2 1 1 
1 2 1 2 1 
 1 1 1 2 2 
  2 2 1 1 2 
   1 2 1 2 2

In [None]:
# One greedy self-play game; keep the q-value tensors recorded for each player.
history1, history2 = yahaha.test_game(printgame=True)

Winner:  2
_ _ _ 1 _ 
 _ _ 2 1 2 
  _ 1 1 2 _ 
   2 2 2 _ _ 
    _ 1 _ 1 _ 


In [None]:
# Q-value tensors recorded at each of player 1's moves in the test game above.
history1

[tensor([-0.0708,  0.1481, -0.0765, -0.1040, -0.2387, -0.0567, -0.1883,  0.4186,
          0.4031, -0.3719,  0.0958,  0.1042,  0.7201, -0.0874, -0.1365, -0.1048,
          0.2360, -0.2983, -0.3949, -0.1941, -0.2685,  0.1183, -0.3013, -0.2227,
         -0.2641]),
 tensor([-0.4316, -0.1293, -0.5770, -0.4796, -0.4951, -0.3144, -0.6084,  0.7068,
          0.6435, -0.8161, -0.2362,  0.1342,  0.8738, -0.3244, -0.1657, -0.3304,
          0.5838, -0.5355, -0.5078, -0.5640, -0.8149,  0.1220, -1.2034, -0.2842,
         -0.5262]),
 tensor([ 0.1711,  0.4011, -0.0683,  0.1123,  0.1132,  0.3140, -0.1433,  1.2079,
          0.9828, -0.3655,  0.3677,  0.4984,  1.1354,  0.2926,  0.3638,  0.0169,
          0.8597, -0.0155, -0.1502,  0.1658, -0.5163,  0.5939, -0.6719,  0.3852,
          0.1788]),
 tensor([-0.3337, -0.2422, -0.8447, -0.3617, -0.4331, -0.2460, -0.7254,  0.5908,
          0.5387, -0.8049, -0.5289,  0.3284,  0.7560, -0.2426, -0.1627, -0.4478,
          0.8294, -0.2195, -0.5192, -0.2180, -1.0

In [None]:
# Q-value tensors recorded at each of player 2's moves in the test game above.
history2

[tensor([-1.8761, -1.9803, -1.9529, -1.7050, -2.1053, -2.0570, -2.2863, -0.6368,
         -1.1009, -1.9653, -1.9753, -1.3998,  0.1315, -1.9037, -1.8851, -1.6807,
         -0.7485, -1.7066, -2.2818, -2.2300, -2.1185, -1.9882, -2.0215, -2.2419,
         -2.4030]),
 tensor([-1.3577, -1.8880, -1.8752, -1.4706, -1.7011, -1.6995, -2.3054, -0.2278,
         -0.0135, -1.4393, -1.6421, -1.1862,  0.3255, -1.4621, -1.7804, -1.5178,
         -0.5008, -1.0029, -2.0551, -1.7237, -1.8585, -2.1237, -1.6651, -1.9767,
         -1.7530]),
 tensor([-1.3254, -1.8574, -1.8772, -1.2283, -1.6033, -1.6687, -2.2638, -0.2119,
          0.0230, -1.3823, -1.6027, -1.2968,  0.1609, -1.2872, -1.6218, -1.3450,
         -0.6663, -0.7960, -1.8821, -1.5363, -1.8322, -1.8867, -1.6333, -1.8197,
         -1.6636]),
 tensor([-0.3759, -0.4937, -0.4847, -0.3591, -0.6297, -0.2677, -0.7228,  0.1165,
          0.3285, -0.1231, -0.3901, -0.0215,  0.4420, -0.2654, -0.7491, -0.2970,
          0.1574, -0.2317, -0.5462, -0.3880, -0.4

In [None]:
# Dump every parameter tensor of the shared network.
for weights in doubleagent.model.parameters():
  print(weights)

Parameter containing:
tensor([[ 0.0033, -0.2526,  0.8168,  ...,  0.6886,  1.7328,  1.1039],
        [-0.4767, -0.0443,  0.0381,  ...,  1.5341,  1.4199, -1.3816],
        [ 1.3538, -3.2625, -0.0197,  ...,  1.0370, -2.3793,  1.0905],
        ...,
        [ 1.8331, -0.6412,  0.6454,  ..., -0.0077,  1.6195,  1.9308],
        [-1.6318, -0.0685, -1.6643,  ...,  1.2157, -0.0737, -1.3769],
        [ 1.0138, -1.2445,  1.9669,  ...,  0.8293,  1.6652, -0.0365]],
       requires_grad=True)
Parameter containing:
tensor([-2.8697, -1.6745, -2.0414, -2.6174, -1.9606, -1.6100, -2.6316, -1.5972,
        -2.3851, -0.7202, -2.0454, -1.2874, -2.1237, -2.8194, -2.0891, -1.5561,
        -1.9337, -2.7758, -1.7449, -2.7259, -1.6899, -1.6273, -2.4857, -1.8566,
        -2.6563], requires_grad=True)
