<a href="https://colab.research.google.com/github/joshuazhu17/Project_Studio/blob/expanded_rewards/HexGame.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import numpy as np
import random
import math

In [2]:
class HexEnvironment():
    """Two-player Hex board with move-selection policies and win detection.

    The board is a ``size`` x ``size`` rhombus. ``grid[0][row][col]`` is 1
    where player 1 has a stone and ``grid[1][row][col]`` is 1 where player 2
    has a stone; every other entry is 0. Player 1 wins by connecting the
    last row to row 0, player 2 by connecting column 0 to the last column.
    """

    def __init__(self, boardsize):
        """Create an empty ``boardsize`` x ``boardsize`` board, player 1 to move."""
        self.size = boardsize
        # grid[0] holds player 1's stones, grid[1] holds player 2's stones.
        self.grid = [
            [[0] * boardsize for _ in range(boardsize)],
            [[0] * boardsize for _ in range(boardsize)],
        ]
        self.current_player = 1
        self.done = False

    def neighbors(self, x, y):
        """Return the on-board hex neighbours of cell ``(x, y)`` as ``[row, col]`` lists."""
        candidates = [[x-1, y+1], [x-1, y], [x, y-1], [x+1, y-1], [x+1, y], [x, y+1]]
        return [
            [a, b]
            for a, b in candidates
            if 0 <= a < self.size and 0 <= b < self.size
        ]

    def obs(self):
        """Return the live grid (a reference, not a copy)."""
        return self.grid

    def is_done(self):
        """Return True once ``search_for_win`` has found a winner."""
        return self.done

    def get_player(self):
        """Return the player to move: 1 or 2."""
        return self.current_player

    def reset(self):
        """Clear the board in place and hand the move back to player 1."""
        self.current_player = 1
        self.done = False
        # Zero the existing rows rather than rebuilding them so that any
        # reference previously obtained from obs() stays valid.
        for plane in self.grid:
            for row in plane:
                for col in range(self.size):
                    row[col] = 0

    def print_self(self):
        """Print the board, shifting row ``i`` right by ``i`` spaces.

        Cells are shown as "1" (player 1), "2" (player 2) or "_" (empty).
        """
        for i in range(self.size):
            cells = []
            for j in range(self.size):
                if self.grid[0][i][j] == 1:
                    cells.append("1")
                elif self.grid[1][i][j] == 1:
                    cells.append("2")
                else:
                    cells.append("_")
            print(" " * i + "".join(cell + " " for cell in cells))

    def search_for_win(self):
        """Return 1 or 2 if that player has a winning chain, otherwise 0.

        Player 1 wants to connect the last row (size-1, *) to row 0.
        Player 2 wants to connect column 0 (*, 0) to the last column.
        Sets ``self.done`` to True when a winner is found.
        """
        last = self.size - 1
        for player in range(2):  # 0 is p1, 1 is p2
            stones = self.grid[player]
            # Seed the search with the player's stones on their starting edge.
            if player == 0:
                stack = [[last, col] for col in range(self.size) if stones[last][col] == 1]
            else:
                stack = [[row, 0] for row in range(self.size) if stones[row][0] == 1]

            visited = [[False] * self.size for _ in range(self.size)]
            for x, y in stack:
                visited[x][y] = True

            # Depth-first search. Cells are marked when pushed so that each
            # one enters the stack at most once.
            while stack:
                x, y = stack.pop()
                for newx, newy in self.neighbors(x, y):
                    if stones[newx][newy] == 1 and not visited[newx][newy]:
                        visited[newx][newy] = True
                        stack.append([newx, newy])

            # The player wins if the search reached their opposite edge.
            if player == 0:
                reached = any(visited[0]) if self.size > 0 else False
            else:
                reached = any(row[last] for row in visited)
            if reached:
                self.done = True
                return player + 1

        # 0 means no one has won
        return 0

    def _empty_cells(self):
        """Return every unoccupied ``(row, col)`` in row-major order."""
        return [
            (i, j)
            for i in range(self.size)
            for j in range(self.size)
            if self.grid[0][i][j] == 0 and self.grid[1][i][j] == 0
        ]

    def _ranked_moves(self, probs):
        """Return the legal moves sorted by descending ``probs[row][col]``.

        Ties keep row-major order because the sort is stable.

        Raises:
            ValueError: if the board has no empty cell.
        """
        cells = self._empty_cells()
        if not cells:
            raise ValueError("no legal moves: the board is full")
        return sorted(cells, key=lambda cell: probs[cell[0]][cell[1]], reverse=True)

    def greedy(self, probs):
        """Return the legal ``(row, col)`` with the highest score in ``probs``."""
        return self._ranked_moves(probs)[0]

    def epsilon_greedy(self, probs, epsilon):
        """Return a uniformly random legal move with probability ``epsilon``,
        otherwise the highest-scoring legal move."""
        ranked = self._ranked_moves(probs)
        if random.random() < epsilon:
            return ranked[random.randint(0, len(ranked) - 1)]
        return ranked[0]

    def random(self):
        """Return a uniformly random legal ``(row, col)``.

        Raises:
            ValueError: if the board has no empty cell.
        """
        valid_moves = self._empty_cells()
        if not valid_moves:
            raise ValueError("no legal moves: the board is full")
        # Inside this method `random` still resolves to the imported module.
        return valid_moves[random.randint(0, len(valid_moves) - 1)]

    def step(self, policy, probs=None, epsilon=0):
        """Choose a move with ``policy``, place it for the current player and
        pass the turn.

        Args:
            policy: 'epsilon_greedy', 'greedy' or 'random'.
            probs: 2D array of scores, one per cell; required for the two
                greedy policies.
            epsilon: exploration probability for 'epsilon_greedy'.

        Returns:
            The ``(row, col)`` that was played.

        Raises:
            ValueError: for an unknown policy, a greedy policy without
                ``probs``, or a full board.

        This method does not check for a win; call ``search_for_win`` afterwards.
        """
        if policy in ('epsilon_greedy', 'greedy'):
            if probs is None:
                raise ValueError("policy {!r} requires probs".format(policy))
            if policy == 'greedy':
                (best_x, best_y) = self.greedy(probs)
            else:
                (best_x, best_y) = self.epsilon_greedy(probs, epsilon)
        elif policy == 'random':
            (best_x, best_y) = self.random()
        else:
            raise ValueError("unknown policy: {!r}".format(policy))

        self.grid[self.current_player - 1][best_x][best_y] = 1
        self.current_player = 2 if self.current_player == 1 else 1

        return (best_x, best_y)

In [3]:
class Agent():
    """Wraps a model so it can be queried with a raw board grid."""

    def __init__(self, model, size):
        """Store the callable ``model`` and the board edge length ``size``."""
        self.size = size
        self.model = model

    def action(self, grid):
        """Run the model on ``grid`` and return its scores in two forms.

        The grid is converted to a float tensor and flattened to 1-D before
        being passed to the model.

        Returns:
            A pair ``(scores, preds)``: ``scores`` is a detached copy of the
            model output reshaped to ``size`` x ``size`` nested lists, and
            ``preds`` is the model's raw output tensor.
        """
        flat_board = torch.flatten(torch.tensor(grid).float())
        preds = self.model(flat_board)
        # Detach and copy so the list view shares nothing with the graph.
        scores = preds.detach().clone().numpy()
        board_scores = scores.reshape((self.size, self.size)).tolist()
        return board_scores, preds

In [11]:
# Edge length of the board both networks are built for.
BOARD_SIZE = 5


def make_policy_network(board_size, hidden_units=128):
    """Build a one-hidden-layer network mapping a flattened board to a
    log-softmax over its cells.

    The input has ``2 * board_size**2`` features (one plane per player) and
    the output has ``board_size**2`` entries (one per cell).
    """
    cells = board_size * board_size
    return torch.nn.Sequential(
        torch.nn.Linear(2 * cells, hidden_units),
        torch.nn.ReLU(),
        torch.nn.Linear(hidden_units, cells),
        # Normalise over the last axis explicitly. Relying on the implicit
        # dim is deprecated and emits a warning on every forward pass; for a
        # 1-D input the last axis is the same axis the implicit choice used.
        torch.nn.LogSoftmax(dim=-1),
    )


model1 = make_policy_network(BOARD_SIZE)
model2 = make_policy_network(BOARD_SIZE)
agent1 = Agent(model1, BOARD_SIZE)
agent2 = Agent(model2, BOARD_SIZE)

In [17]:
class Controller():
    """Plays two agents against each other on a Hex board and trains their
    models from the recorded games.

    Sign convention: ``backprop`` minimises ``sum(pred[tile] * value)`` over
    the recorded moves, so the per-move values stored here act as costs.
    Assuming the models emit log-probabilities (TODO confirm against the
    models passed in), a negative value makes the chosen move more likely
    and a positive value makes it less likely. That is why ``win_reward`` is
    negative and ``lose_reward`` is positive.
    """

    def __init__(self, size, player1, player2):
        """Create the environment, the per-batch histories and one Adam
        optimizer per player.

        Args:
            size: board edge length.
            player1, player2: objects exposing ``.model`` and
                ``.action(grid) -> (scores, preds)``.
        """
        self.size = size
        self.env = HexEnvironment(size)
        self.player1 = player1
        self.player2 = player2

        # One entry per recorded game; each game is a list with one
        # (tile_num, preds) pair per move that player made.
        self.p1preds = []
        self.p2preds = []

        # Per-move values: the winner's final move gets win_reward and their
        # other moves move_reward; the loser's final move gets lose_reward
        # and their other moves -move_reward.
        # e_rewards ("eventual rewards") are the discounted sums
        # R_t = sum_k gamma^k * r_{t+k} over that player's own moves.
        self.gamma = 0.99
        self.move_reward = 0.01  # sign is flipped for the loser
        self.win_reward = -1.0
        self.lose_reward = 1.0
        self.p1e_rewards = []
        self.p2e_rewards = []
        self.winner_history = []

        self.p1_optimizer = torch.optim.Adam(self.player1.model.parameters())
        self.p2_optimizer = torch.optim.Adam(self.player2.model.parameters())

    def _agent_for(self, player):
        """Return the agent object playing as ``player`` (1 or 2)."""
        return self.player1 if player == 1 else self.player2

    def _discounted_returns(self, rewards):
        """Return ``R_t = sum_k gamma^k * rewards[t+k]`` for every ``t``.

        Uses the backward recurrence ``R_t = r_t + gamma * R_{t+1}``.
        """
        returns = []
        running = 0.0
        for reward in reversed(rewards):
            running = reward + self.gamma * running
            returns.append(running)
        returns.reverse()
        return returns

    def run_game(self, epsilon=0, printgame=False):
        """Play one epsilon-greedy self-play game and record it for training.

        Appends each player's (tile, preds) history and discounted values to
        the batch buffers and the winner to ``winner_history``.
        """
        preds = {1: [], 2: []}
        rewards = {1: [], 2: []}
        self.env.reset()
        while not self.env.is_done():
            obs = self.env.obs()
            mover = self.env.current_player
            move, pred = self._agent_for(mover).action(obs)
            (movex, movey) = self.env.step('epsilon_greedy', move, epsilon)
            tile_num = movex * self.size + movey
            preds[mover].append((tile_num, pred))
            rewards[mover].append(self.move_reward)

            winner = self.env.search_for_win()
            if winner != 0:
                self.winner_history.append(winner)
                loser = 2 if winner == 1 else 1
                rewards[winner][-1] = self.win_reward
                rewards[loser] = [-value for value in rewards[loser]]
                # The loser may not have moved at all (e.g. a 1x1 board).
                if rewards[loser]:
                    rewards[loser][-1] = self.lose_reward
        if printgame:
            self.env.print_self()

        self.p1preds.append(preds[1])
        self.p2preds.append(preds[2])
        self.p1e_rewards.append(self._discounted_returns(rewards[1]))
        self.p2e_rewards.append(self._discounted_returns(rewards[2]))

    def test_game(self):
        """Play one greedy game without gradients, printing the board after
        every move and each player's (tile, preds) history at the end.

        Stores the winner in ``self.winner``.
        """
        with torch.no_grad():
            preds = {1: [], 2: []}
            self.env.reset()
            self.winner = 0
            while not self.env.is_done():
                obs = self.env.obs()
                mover = self.env.current_player
                move, pred = self._agent_for(mover).action(obs)
                (movex, movey) = self.env.step('greedy', probs=move)
                preds[mover].append((movex * self.size + movey, pred))
                winner = self.env.search_for_win()
                if winner != 0:
                    self.winner = winner
                # Since this is in the while loop, it will print the whole history of the game
                self.env.print_self()
            print(preds[1])
            print(preds[2])

    @staticmethod
    def _normalized(e_rewards):
        """Return ``e_rewards`` standardised across the whole batch.

        The sample std (ddof=1) is used. When it is undefined or
        effectively zero (fewer than two values, or all values equal) the
        values are only mean-centred, so no NaN/inf reaches the loss.
        """
        flat = [value for game in e_rewards for value in game]
        if not flat:
            return [list(game) for game in e_rewards]
        mean = np.mean(flat)
        std = np.std(flat, ddof=1) if len(flat) > 1 else 0.0
        if not np.isfinite(std) or std < 1e-8:
            std = 1.0
        return [[(value - mean) / std for value in game] for game in e_rewards]

    def _update_player(self, label, model, optimizer, preds, e_rewards, clip):
        """Run one optimizer step for one player from the recorded batch."""
        weights = self._normalized(e_rewards)

        total_loss = 0
        for game, game_weights in zip(preds, weights):
            batch_loss = 0
            for (tile_num, pred), weight in zip(game, game_weights):
                batch_loss += pred[tile_num] * weight
            total_loss += batch_loss

        print(label, total_loss)
        if not torch.is_tensor(total_loss):
            # No moves were recorded for this player: nothing to optimise.
            return
        total_loss.backward()

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        model.zero_grad()

    def backprop(self, clip=1):
        """Update both players' models from the recorded batch, then clear
        the batch buffers.

        Args:
            clip: maximum gradient norm passed to ``clip_grad_norm_``.
        """
        self._update_player("Player1 loss: ", self.player1.model,
                            self.p1_optimizer, self.p1preds,
                            self.p1e_rewards, clip)
        self._update_player("Player2 loss: ", self.player2.model,
                            self.p2_optimizer, self.p2preds,
                            self.p2e_rewards, clip)

        # reset histories
        self.p1preds = []
        self.p2preds = []
        self.p1e_rewards = []
        self.p2e_rewards = []

    def log(self):
        """Print the mean and sample std of each player's buffered values."""
        flat_p1 = [value for game in self.p1e_rewards for value in game]
        p1mean = np.mean(flat_p1)
        p1std = np.std(flat_p1, ddof=1)

        flat_p2 = [value for game in self.p2e_rewards for value in game]
        p2mean = np.mean(flat_p2)
        p2std = np.std(flat_p2, ddof=1)

        print("-----------------------------------")
        print("P1 expected reward mean: ", p1mean)
        print("P1 expected reward std: ", p1std)
        print("P2 expected reward mean: ", p2mean)
        print("P2 expected reward std: ", p2std)
        print("-----------------------------------")

    @staticmethod
    def _epsilon_for_game(game_index):
        """Exploration rate for the ``game_index``-th game of a run:
        ``exp(-game_index / 1000)``, floored at 0.1."""
        return max(math.exp(-game_index / 1000), 0.1)

    def train(self, games, batch_size, verbose=False):
        """Play ``games`` self-play games, updating after every ``batch_size``.

        Epsilon decays with the index of the current game, so early games
        explore more than late ones. Games beyond the last full batch stay
        buffered and are included in the next ``backprop``.
        """
        for i in range(games):
            epsilon = self._epsilon_for_game(i)
            self.run_game(epsilon)
            # (i + 1) counts games played so far, so every batch holds
            # exactly batch_size games.
            if (i + 1) % batch_size == 0:
                if verbose:
                    self.log()
                self.backprop()

    def _play_against_random(self, agent_player, n):
        """Play ``n`` games with ``agent_player`` moving greedily and the
        other side moving at random; print each final board and the win rate.

        Returns:
            The agent's win rate, or NaN when no game was completed.
        """
        agent = self._agent_for(agent_player)
        with torch.no_grad():
            wins = 0.0
            total = 0.0
            for _ in range(n):
                self.env.reset()
                self.winner = 0
                while not self.env.is_done():
                    obs = self.env.obs()
                    if self.env.current_player == agent_player:
                        move, _ = agent.action(obs)
                        self.env.step('greedy', probs=move)
                    else:
                        self.env.step('random')
                    winner = self.env.search_for_win()
                    if winner != 0:
                        self.winner = winner
                        total += 1
                        if winner == agent_player:
                            wins += 1
                self.env.print_self()
            win_rate = wins / total if total else float('nan')
            print(win_rate)
            return win_rate

    def p1_against_random(self, n):
        """Evaluate player 1 (greedy) against a random player 2 over ``n`` games."""
        return self._play_against_random(1, n)

    def p2_against_random(self, n):
        """Evaluate player 2 (greedy) against a random player 1 over ``n`` games."""
        return self._play_against_random(2, n)

In [19]:
# Controller for a size-5 board that pits agent1 (player 1) against agent2 (player 2).
test = Controller(5, agent1, agent2)

In [13]:
# Play 1000 games with player 1's model moving greedily against random moves for player 2.
test.p1_against_random(1000)

  input = module(input)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 1 _ 1 1 2 
  _ 2 1 1 1 
   2 2 2 2 1 
    1 2 2 1 _ 
2 _ 1 1 2 
 1 2 1 2 1 
  2 2 2 1 1 
   1 1 2 2 1 
    1 _ 2 2 _ 
_ 1 2 1 1 
 2 2 1 1 1 
  2 2 _ 1 2 
   2 2 1 2 1 
    1 1 2 _ _ 
2 1 1 1 1 
 2 2 1 1 1 
  2 2 2 1 1 
   2 2 2 2 1 
    1 1 2 2 1 
2 1 1 1 2 
 1 2 1 1 2 
  1 1 2 1 2 
   2 2 2 1 1 
    1 2 2 2 1 
2 1 2 2 2 
 1 2 1 1 _ 
  1 2 _ 1 1 
   2 1 2 2 1 
    2 1 1 2 _ 
2 _ _ 1 2 
 2 2 1 1 1 
  2 1 _ 1 1 
   2 1 2 2 1 
    1 _ 2 _ _ 
_ 2 1 2 2 
 1 2 1 1 1 
  1 2 _ 2 1 
   2 1 _ 2 1 
    1 2 2 1 _ 
2 1 2 1 1 
 1 2 1 1 1 
  1 2 2 2 1 
   1 2 _ 2 1 
    2 1 2 2 2 
2 _ 2 2 1 
 1 2 _ 1 1 
  _ 2 1 2 _ 
   _ 1 2 _ 1 
    _ 1 _ 1 2 
1 1 1 1 2 
 1 2 1 2 1 
  2 2 1 1 2 
   2 2 _ 2 1 
    1 2 2 1 2 
_ 2 1 2 1 
 1 _ 2 1 1 
  2 2 _ _ 1 
   2 1 _ _ 1 
    _ _ 2 1 2 
2 2 1 1 2 
 1 2 1 1 2 
  1 1 _ 1 2 
   2 1 2 2 1 
    2 2 1 1 2 
2 2 1 2 1 
 1 2 1 2 1 
  1 1 2 1 2 
   1 1 _ 2 1 
    2 2 2 1 2 
2 1 1 1 1 
 1 2 1 1 1 
  2 2 2 1 1 


In [14]:
# Self-play training: 10000 games, batch_size=100, logging reward statistics before each update.
test.train(10000, 100, verbose=True)

  input = module(input)


-----------------------------------
P1 expected reward mean:  -0.4280481774027693
P1 expected reward std:  0.8052192521908329
P2 expected reward mean:  0.39702045437879757
P2 expected reward std:  0.8281456216142914
-----------------------------------
Player1 loss:  tensor(4.0298, grad_fn=<AddBackward0>)
Player2 loss:  tensor(4.0967, grad_fn=<AddBackward0>)
-----------------------------------
P1 expected reward mean:  -0.3926910168086297
P1 expected reward std:  0.8278693099262141
P2 expected reward mean:  0.3581487869072861
P2 expected reward std:  0.850327863664875
-----------------------------------
Player1 loss:  tensor(2.1375, grad_fn=<AddBackward0>)
Player2 loss:  tensor(4.0167, grad_fn=<AddBackward0>)
-----------------------------------
P1 expected reward mean:  -0.2627436260301471
P1 expected reward std:  0.8729138575672261
P2 expected reward mean:  0.22370569897025133
P2 expected reward std:  0.889372179981338
-----------------------------------
Player1 loss:  tensor(-5.8563, 

In [16]:
# Repeat the 1000-game check of player 1's greedy play against random moves.
test.p1_against_random(1000)

  input = module(input)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 2 _ _ _ 1 
  _ _ _ 2 1 
   _ _ _ _ 1 
    _ _ 2 1 _ 
_ _ _ _ 1 
 _ _ 2 _ 1 
  2 _ _ _ 1 
   _ _ _ 2 1 
    _ _ _ 1 2 
_ 2 1 _ 1 
 2 2 1 1 1 
  _ 2 2 1 1 
   _ _ 2 1 2 
    2 1 2 1 _ 
_ _ 2 _ 1 
 _ _ 1 1 2 
  _ 2 _ 1 1 
   _ 2 _ _ 1 
    _ 2 2 1 _ 
_ _ 2 _ 1 
 _ 2 _ _ 1 
  _ 2 _ _ 1 
   _ _ _ _ 1 
    2 _ _ 1 _ 
_ 2 1 2 2 
 _ 2 1 1 1 
  2 _ 2 _ 1 
   _ _ _ _ 1 
    _ _ _ 1 _ 
_ 2 2 _ 1 
 _ _ _ _ 1 
  _ _ _ _ 1 
   _ 2 _ _ 1 
    _ _ 2 1 _ 
_ 2 2 2 1 
 2 1 1 1 1 
  _ 2 _ 2 1 
   _ _ 1 1 1 
    _ 1 2 2 2 
_ _ _ _ 1 
 _ _ _ _ 1 
  _ _ 2 _ 1 
   _ _ _ _ 1 
    2 _ 2 1 2 
_ 2 2 2 1 
 _ _ _ _ 1 
  _ _ _ _ 1 
   _ _ _ _ 1 
    _ _ 2 1 _ 
_ _ 2 _ 1 
 _ 2 _ _ 1 
  2 2 _ _ 1 
   _ _ _ _ 1 
    _ _ _ 1 _ 
_ _ 1 1 1 
 2 _ 2 1 1 
  2 _ _ 1 1 
   2 2 1 2 2 
    _ 2 1 1 2 
_ _ _ _ 1 
 _ _ _ 2 1 
  _ 2 _ _ 1 
   _ _ _ 2 1 
    2 _ _ 1 _ 
2 _ _ 2 1 
 _ 2 _ _ 1 
  _ _ _ _ 1 
   _ _ _ _ 1 
    _ _ _ 1 2 
1 1 2 1 1 
 2 2 1 2 1 
  2 1 2 2 1 


In [20]:
# Play 1000 games with player 2's model moving greedily against random moves for player 1.
test.p2_against_random(1000)

  input = module(input)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 _ 1 2 1 1 
  _ 1 2 2 2 
   2 2 2 2 1 
    1 _ _ 1 1 
_ _ 1 2 2 
 _ 1 2 2 _ 
  2 2 2 _ 1 
   1 _ 1 1 _ 
    _ _ 1 _ _ 
_ _ _ _ _ 
 1 _ 1 _ _ 
  _ 2 2 2 2 
   2 _ _ _ 1 
    1 1 _ _ _ 
_ 1 _ 2 2 
 _ _ _ _ 1 
  2 2 2 2 2 
   1 1 1 1 _ 
    _ 1 _ _ _ 
_ 1 _ 2 _ 
 _ _ 1 _ 1 
  1 2 2 2 2 
   2 _ _ 1 _ 
    _ _ _ _ 1 
1 _ 1 2 1 
 _ 1 _ _ _ 
  _ 2 2 2 2 
   2 _ _ _ _ 
    1 1 _ _ _ 
_ _ 1 _ 1 
 _ _ _ _ _ 
  _ 2 2 2 2 
   2 _ 1 _ _ 
    _ 1 _ _ 1 
_ _ _ 2 _ 
 _ 1 1 _ 1 
  _ 2 2 2 2 
   2 _ _ _ 1 
    1 1 _ _ _ 
1 _ 2 2 2 
 1 1 2 2 1 
  1 2 2 2 2 
   1 1 1 1 _ 
    _ _ _ 1 _ 
1 _ _ 2 _ 
 _ _ _ 1 _ 
  _ 2 2 2 2 
   2 _ 1 1 _ 
    1 1 _ _ _ 
1 _ 1 2 2 
 1 _ _ _ _ 
  1 2 2 2 2 
   2 _ 1 _ _ 
    _ 1 _ 1 _ 
_ _ 1 2 2 
 _ 1 2 _ _ 
  1 2 _ _ 1 
   2 _ 1 _ _ 
    _ _ _ _ _ 
_ 1 1 2 2 
 1 1 _ 1 _ 
  _ 2 2 2 2 
   2 _ _ 1 1 
    _ _ _ _ _ 
1 1 2 2 2 
 _ 1 1 1 2 
  1 2 2 2 2 
   1 1 2 2 2 
    1 _ 1 1 1 
_ 1 _ 2 1 
 1 _ 2 1 _ 
  1 1 2 2 2 
