In [1]:
# Needed if running on Colab
!pip3 install open-spiel
!pip3 install torch



In [23]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from go_search_problem import GoProblem, GoState
from heuristic_go_problems import GoProblemLearnedHeuristic, GoProblemSimpleHeuristic
from agents import GreedyAgent, RandomAgent, MCTSAgent, GameAgent
import matplotlib.pyplot as plt
from tqdm import tqdm
from game_runner import run_many
import pickle

torch.set_default_tensor_type(torch.FloatTensor)

In [3]:
def load_dataset(path: str):
    with open(path, 'rb') as f:
        dataset = pickle.load(f)
    return dataset

dataset_5x5 = load_dataset('dataset_5x5.pkl')
# dataset_9x9 = load_dataset('9x9_dataset.pkl')

In [4]:
def save_model(path: str, model):
    """
    Save model to a file
    Input:
        path: path to save model to
        model: Pytorch model to save
    """
    torch.save({
        'model_state_dict': model.state_dict(),
    }, path)

def load_model(path: str, model):
    """
    Load model from file

    Note: you still need to provide a model (with the same architecture as the saved model))

    Input:
        path: path to load model from
        model: Pytorch model to load
    Output:
        model: Pytorch model loaded from file
    """
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    return model

# Task 1: Convert GameState to Features

In [5]:
def get_features(game_state: GoState):
    """
    Map a game state to a list of features.

    Some useful functions from game_state include:
        game_state.size: size of the board
        get_pieces_coordinates(player_index): get coordinates of all pieces of a player (0 or 1)
        get_pieces_array(player_index): get a 2D array of pieces of a player (0 or 1)
        
        get_board(): get a 2D array of the board with 4 channels (player 0, player 1, empty, and player to move). 4 channels means the array will be of size 4 x n x n
    
        Descriptions of these methods can be found in the GoState

    Input:
        game_state: GoState to encode into a fixed size list of features
    Output:
        features: list of features
    """
    board_size = game_state.size
    features = []
    # for first 25 features, use just a 1 or 0 to indicate if a black piece is in the slot
    black_player_pieces = game_state.get_pieces_array(0)
    black_total_pieces = sum(sum(black_player_pieces))
    for row in black_player_pieces:
        for piece in row:
            features.append(piece)
    # for second 25 features, use just a 1 or 0 to indicate if a white piece is in the slot
    white_player_pieces = game_state.get_pieces_array(1)
    white_total_pieces = sum(sum(white_player_pieces))
    for row in white_player_pieces:
        for piece in row:
            features.append(piece)
            
    # add total number of pieces
    features.append(black_total_pieces)
    features.append(white_total_pieces)
    # finally append the player to move
    features.append(game_state.player_to_move())


    # the solution might just be calling getboard() and flattening, look into this:

    

    return features

In [9]:
# Print information about first data point
data_point = dataset_5x5[0]
features = get_features(data_point[0])
action = data_point[1]
result = data_point[2]
print(data_point[0])
print("features", features)
print("Action #", action)
print("Game Result", result)

GoState(komi=0.5, to_play=B, history.size()=0)

 5 +++++
 4 +++++
 3 +++++
 2 +++++
 1 +++++
   ABCDE

features [np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), 0]


# Task 2: Supervised Learning of a Value Network

In [7]:
class ValueNetwork(nn.Module):
    def __init__(self, input_size):
      super(ValueNetwork, self).__init__()

      # output size should be 1, as we are predicting a value in between [-1,1]
      output_size = 1

      # add more layers
      self.layer1 = nn.Linear(input_size, 64)
      self.layer2 = nn.Linear(64, 32)
      self.layer3 = nn.Linear(32, 10)
      self.layer4 = nn.Linear(10, output_size)
      
      self.tanh = nn.Tanh()
      self.sigmoid = nn.Sigmoid()


    def forward(self, x):
      """
      Run forward pass of network

      Input:
        x: input to network
      Output:
        output of network
      """
      # relu, tanh, relu, sigmoid
      z1 = self.layer1(x)
      a1 = torch.relu(z1)
      z2 = self.layer2(a1)
      a2 = self.sigmoid(z2)
      z3 = self.layer3(a2)
      a3 = torch.relu(z3)
      z4 = self.layer4(a3)
      output = self.sigmoid(z4)
      return output

In [12]:
# This will not produce meaningful outputs until trained, but you can test for syntax errors
features_tensor = torch.Tensor(features)
value_net = ValueNetwork(len(features))
print("predicted Value", value_net(features_tensor))

predicted Value tensor([0.4003], grad_fn=<SigmoidBackward0>)


In [13]:
def train_value_network(dataset, num_epochs, learning_rate):
    """
    Train a value network on the provided dataset.

    Input:
        dataset: list of (state, action, result) tuples
        num_epochs: number of epochs to train for
        learning_rate: learning rate for gradient descent
    Output:
        model: trained model
    """
    # Make sure dataset is shuffled for better performance
    random.shuffle(dataset)
    # You may find it useful to create train/test sets to better track performance/overfit/underfit
    # calculate these just to get the correct input size for the value network.
    state = dataset[0][0]
    features = get_features(state)
    features_tensor = torch.Tensor(features)

    model = ValueNetwork(len(features_tensor))
    model = model.float()
    # NEED TO SPLIT DATASET INTO TRAINING AND TESTING???

    # Use MSE as loss function
    loss_function = nn.MSELoss()

    # You can use Adam, which is stochastic gradient descent with ADAptive Momentum
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    batch_size = 32
    batch_loss = 0.0
    batch_counter = 0
    for epoch in range(num_epochs):
        
        for data_point in dataset:
            state = data_point[0]
            features = get_features(state)
            features_tensor = torch.tensor(features, dtype=torch.float32)

            # Note: You will have to convert the label to a torch tensor to use with torch's loss functions
            # make the label be the result of the game from this data point
            label = torch.tensor(data_point[2], dtype=torch.float32)

            # make a prediction with the given features tensor
            prediction = model(features_tensor)
            # compute loss
            loss = loss_function(prediction, label)
            batch_loss += loss
            batch_counter += 1
            if batch_counter % batch_size == 0:
                # Call backward to run backward pass and compute gradients
                batch_loss.backward()

                # Run gradient descent step with optimizer
                optimizer.step()

                # Reset gradient for next batch
                optimizer.zero_grad()
                batch_loss = 0.0

    return model

value_model = train_value_network(dataset_5x5, 10, 1e-4)
save_model("value_model.pt", value_model)

  return F.mse_loss(input, target, reduction=self.reduction)


## Comparing Learned Value function against other Agents

In [None]:
class GoProblemLearnedHeuristic(GoProblem):
    def __init__(self, model=None, state=None):
        super().__init__(state=state)
        self.model = model
        
    def __call__(self, model=None):
        """
        Use the model to compute a heuristic value for a given state.
        """
        return self

    def encoding(self, state):
        """
        Get encoding of state (convert state to features)
        Note, this may call get_features() from Task 1. 

        Input:
            state: GoState to encode into a fixed size list of features
        Output:
            features: list of features
        """
        return get_features(state)

    def heuristic(self, state, player_index):
        """
        Return heuristic (value) of current state

        Input:
            state: GoState to encode into a fixed size list of features
            player_index: index of player to evaluate heuristic for
        Output:
            value: heuristic (value) of current state
        """
        
        value = 0
        # get encoding for the state:
        state_encoding = self.encoding(state)
        features_tensor = torch.Tensor(state_encoding)
        value = self.model(features_tensor)
        # create heuristic value based on this state
        # use return value you get from value Network
        
        # Note, your agent may perform better if you force it not to pass
        # (i.e., don't select action #25 on a 5x5 board unless necessary)
        return value

    def __str__(self) -> str:
        return "Learned Heuristic"


def create_value_agent_from_model():
    """
    Create agent object from saved model. This (or other methods like this) will be how your agents will be created in gradescope and in the final tournament.
    """

    model_path = "value_model.pt"
    feature_size = 53
    model = load_model(model_path, ValueNetwork(feature_size))
    heuristic_search_problem = GoProblemLearnedHeuristic(model)

    # TODO: Try with other heuristic agents (IDS/AB/Minimax)
    learned_agent = GreedyAgent(heuristic_search_problem)

    return learned_agent

learned_agent = create_value_agent_from_model()
agent2 = GreedyAgent(GoProblemSimpleHeuristic)
print("Greedy Agent", agent2)
print("Learned Agent", learned_agent)

run_many(learned_agent, GreedyAgent(), 40)

  checkpoint = torch.load(path)


Greedy Agent GreedyAgent + <class 'heuristic_go_problems.GoProblemSimpleHeuristic'>
Learned Agent GreedyAgent + Learned Heuristic


100%|██████████| 20/20 [00:02<00:00,  7.33it/s]

Agent 1: GreedyAgent + Learned Heuristic Score: 40.0
Agent 2: GreedyAgent + Simple Heuristic Score: -40.0
Agent 1: GreedyAgent + Learned Heuristic Score with Black (first move): 20.0
Agent 2: GreedyAgent + Simple Heuristic Score with Black (first move): -20.0
Agent 1: GreedyAgent + Learned Heuristic Average Duration: 0.002678077366616991
Agent 2: GreedyAgent + Simple Heuristic Average Duration: 0.0004990768853355855
Agent 1: GreedyAgent + Learned Heuristic Longest Duration: 0.008513927459716797
Agent 2: GreedyAgent + Simple Heuristic Longest Duration: 0.003184080123901367
Agent 1: GreedyAgent + Learned Heuristic Average Time Remaining: 36.44346686005592
Agent 2: GreedyAgent + Simple Heuristic Average Time Remaining: 35.98968895077705
Agent 1: GreedyAgent + Learned Heuristic Min Time Remaining: 32.93003988265991
Agent 2: GreedyAgent + Simple Heuristic Min Time Remaining: 31.986804008483887





(40.0, -40.0)

# Task 3: Supervised Learning of a Policy Network

In [18]:
class PolicyNetwork(nn.Module):
    def __init__(self, input_size, board_size=5):
      super(PolicyNetwork, self).__init__()

      output_size = (board_size * board_size) + 1

      self.layer1 = nn.Linear(input_size, 64)
      self.layer2 = nn.Linear(64, 32)
      self.layer3 = nn.Linear(32, 32)
      self.layer4 = nn.Linear(32, output_size)
      self.tanh = nn.Tanh()
      self.sigmoid = nn.Sigmoid()

    def forward(self, x):
      z1 = self.layer1(x)
      a1 = self.tanh(z1)
      z2 = self.layer2(a1)
      a2 = torch.relu(z2)
      z3 = self.layer3(a2)
      a3 = torch.relu(z3)
      z4 = self.layer4(a3)
      output = self.sigmoid(z4)
      return output

In [19]:
# This will not produce meaningful outputs until trained, but you can test for syntax errors
features_tensor = torch.Tensor(features)
policy_net = PolicyNetwork(len(features))
print("Predicted Action Probabilities", policy_net(features_tensor))

Predicted Action Probabilities tensor([0.4676, 0.5095, 0.4832, 0.5046, 0.4911, 0.4817, 0.4840, 0.4762, 0.5102,
        0.5049, 0.5313, 0.5303, 0.4865, 0.5373, 0.5257, 0.4589, 0.4897, 0.4820,
        0.5280, 0.4546, 0.5054, 0.5383, 0.5070, 0.5451, 0.5027, 0.5505],
       grad_fn=<SigmoidBackward0>)


In [20]:
def train_policy_network(dataset, num_epochs, learning_rate):

    """
    Train a policy network on the provided dataset.

    Input:
        dataset: list of (state, action, result) tuples
        num_epochs: number of epochs to train for
        learning_rate: learning rate for gradient descent
    Output:
        model: trained model
    """
    random.shuffle(dataset)
    # needed to get the input size
    state = dataset[0][0]
    features = get_features(state)
    features_tensor = torch.Tensor(features)
    
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # create model
    model = PolicyNetwork(len(features_tensor))
    model = model.float()

    # TODO: Specify Loss Function
    loss_function = torch.nn.CrossEntropyLoss()

    # You can use Adam, which is stochastic gradient descent with ADAptive Momentum
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    batch_size = 32
    batch_loss = 0
    batch_counter = 0

    for epoch in range(num_epochs):
        for data_point in dataset:
            optimizer.zero_grad()

            # TODO: Get features from state and convert features to torch tensor
            state = data_point[0]
            features = get_features(state)
            features_tensor = torch.tensor(features, dtype=torch.float32)

            # TODO: What should the desired output of the value network be?
            # Note: You will have to convert the label to a torch tensor to use with torch's loss functions
            label = torch.tensor(data_point[1], dtype=torch.long)

            # TODO: Get model estimate of value
            prediction = model(features_tensor)

            # TODO: Compute Loss for data point
            loss = loss_function(prediction, label)
            batch_loss += loss
            batch_counter += 1
            if batch_counter % batch_size == 0:


                # Call backward to run backward pass and compute gradients
                loss.backward()

                # Run gradient descent step with optimizer
                optimizer.step()

                # Reset gradient
                optimizer.zero_grad()
                batch_loss = 0.0

    return model

policy_net = train_policy_network(dataset_5x5, 10, 1e-4)
save_model("policy_model.pt", policy_net)

## Comparing Learned Policy against other Agents

In [21]:
class PolicyAgent(GameAgent):
    def __init__(self, search_problem, model_path, board_size=5):
        super().__init__()
        self.search_problem = search_problem
        self.model = load_model(model_path, PolicyNetwork(53, 5))
        self.board_size = board_size

    def encoding(self, state):
        # get features from the state
        return get_features(state)

  # action = random.choice(self.search_problem.get_available_actions(game_state))
    def get_move(self, game_state, time_limit=1):
      """
      Get best action for current state using self.model

      Input:
        game_state: current state of the game
        time_limit: time limit for search (This won't be used in this agent)
      Output:
        action: best action to take
      """

      # TODO: Select LEGAL Best Action predicted by model
      # The top prediction of your model may not be a legal move!

      # get a tensor of the features from the current game state
      features_tensor = torch.tensor(self.encoding(game_state), dtype=torch.float32)
      model_output = self.model(features_tensor)
      legal_actions = self.search_problem.get_available_actions(game_state)
      #print(legal_actions)
      for i in range(len(model_output)):
          if i not in legal_actions:
              model_output[i] = float('-inf')
      # get best legal action
      action = torch.argmax(model_output).item()
      # if the selected action is to pass, but there are other options, choose other options
      if action == 25 and len(legal_actions) > 1:
          legal_actions.remove(25)
          action = legal_actions[torch.argmax(model_output[legal_actions])]
      #print(action)
      # Note, you may want to force your policy not to pass their turn unless necessary
      assert action in self.search_problem.get_available_actions(game_state)
      
      return action

    def __str__(self) -> str:
        return "Policy Agent"
    
def create_policy_agent_from_model():
    """
    Create agent object from saved model. This (or other methods like this) will be how your agents will be created in gradescope and in the final tournament.    
    """

    model_path = "policy_model.pt"
    agent = PolicyAgent(GoProblem(size=5), model_path)
    return agent

In [22]:
# policy_agent = PolicyAgent(GoProblem(size=5), "policy_model.pt")
policy_agent = create_policy_agent_from_model()
print("Policy Agent", policy_agent)
run_many(policy_agent, GreedyAgent(), 40)

  checkpoint = torch.load(path)


Policy Agent Policy Agent


100%|██████████| 20/20 [00:00<00:00, 20.12it/s]

Agent 1: Policy Agent Score: 40.0
Agent 2: GreedyAgent + Simple Heuristic Score: -40.0
Agent 1: Policy Agent Score with Black (first move): 20.0
Agent 2: GreedyAgent + Simple Heuristic Score with Black (first move): -20.0
Agent 1: Policy Agent Average Duration: 0.00041558361053466804
Agent 2: GreedyAgent + Simple Heuristic Average Duration: 0.0005225200653076172
Agent 1: Policy Agent Longest Duration: 0.0014526844024658203
Agent 2: GreedyAgent + Simple Heuristic Longest Duration: 0.0021543502807617188
Agent 1: Policy Agent Average Time Remaining: 39.989610409736635
Agent 2: GreedyAgent + Simple Heuristic Average Time Remaining: 39.986936998367305
Agent 1: Policy Agent Min Time Remaining: 39.984302043914795
Agent 2: GreedyAgent + Simple Heuristic Min Time Remaining: 39.97836089134216





(40.0, -40.0)

# Submitting

After you've completed all the tasks in this notebook, you'll want to add your agents to your agents.py file. You'll want to copy the necessary function and class definitions for PolicyAgent, GoProblemLearnedHeuristic, PolicyNetwork, ValueNetwork, and any other methods you referenced. Your agents will ultimately be tested on gradescope by calling create_value_agent_from_model or by create_policy_agent_from_model.