In [None]:
!pip install pettingzoo
!pip install pygame
from google.colab import drive
drive.mount('/content/drive')

from pettingzoo.classic import connect_four_v3
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.nn.modules.activation import LeakyReLU

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pettingzoo
  Downloading PettingZoo-1.19.0-py3-none-any.whl (807 kB)
[K     |████████████████████████████████| 807 kB 5.2 MB/s 
[?25hCollecting gym>=0.21.0
  Downloading gym-0.25.0.tar.gz (720 kB)
[K     |████████████████████████████████| 720 kB 50.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting gym-notices>=0.0.4
  Downloading gym_notices-0.0.7-py3-none-any.whl (2.7 kB)
Building wheels for collected packages: gym
  Building wheel for gym (PEP 517) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.25.0-py3-none-any.whl size=824431 sha256=aa8fc39e1a3ad6068f7674dd21712aa8c55ae93b46be978e836d996ee404f8e8
  Stored in directory: /root/.cache/pip/wheels/2c/58/d8/1590abcfe48cdf414681b1e2b6647045b85f7c924563b664ee
Successfully built gy

In [None]:
class Actor(nn.Module):
    """Convolutional policy network that returns a masked action distribution.

    The stack is four convolutions followed by a linear layer with
    ``n_actions`` outputs and a softmax.  The linear layer expects 256
    flattened features, so the last convolution must produce a 1x1 map;
    with these kernel sizes that corresponds to an 8x8 single-channel input.

    Args:
        n_actions: number of discrete actions (size of the output layer).
        drop_out: stored on the instance; no layer in this class uses it.
    """

    def __init__(self, n_actions, drop_out):
        super(Actor, self).__init__()
        self.n_actions = n_actions
        self.drop_out = drop_out

        self.actor = nn.Sequential(
                nn.Conv2d(1, 24, 3, padding=1),
                nn.LeakyReLU(),
                nn.Conv2d(24, 64, 3),
                nn.LeakyReLU(),
                nn.Conv2d(64, 128, 3),
                nn.LeakyReLU(),
                nn.Conv2d(128, 256, 4),
                nn.Flatten(start_dim=1, end_dim=-1),
                nn.LeakyReLU(),
                nn.Linear(256, self.n_actions),
                nn.Softmax(dim=-1)
                )

    def forward(self, state, action_mask):
        """Return a ``Categorical`` over actions with illegal actions removed.

        Args:
            state: tensor of shape (batch, 1, H, W); cast to float here.
            action_mask: 0/1 mask, either one mask of length ``n_actions``
                (broadcast over the batch) or one mask per row.

        Returns:
            torch.distributions.Categorical whose probabilities are exactly
            zero for masked actions and sum to one for every row.
        """
        eps = 1e-7
        probs = self.actor(state.float())
        mask = torch.as_tensor(action_mask, dtype=probs.dtype, device=probs.device)

        # epsilon goes on legal actions only: it keeps them sampleable if the
        # softmax underflows, while masked actions stay at exactly zero
        masked = (probs + eps) * mask

        # normalise each row on its own; a sum over the whole tensor would make
        # the result depend on how many states are in the batch
        row_total = masked.sum(dim=-1, keepdim=True)

        # a row without any legal action falls back to a uniform distribution
        # rather than producing NaNs
        uniform = torch.full_like(masked, 1.0 / masked.shape[-1])
        normalised = torch.where(row_total > 0,
                                 masked / row_total.clamp_min(eps),
                                 uniform)

        return Categorical(probs=normalised)


class Critic(nn.Module):
    """Convolutional state-value network producing one scalar per input state.

    Uses the same convolutional layout as the actor, ending in a linear layer
    with a single output.

    Args:
        drop_out: stored on the instance; no layer in this class uses it.
    """

    def __init__(self, drop_out):
        super().__init__()
        self.drop_out = drop_out

        # feature extractor: four convolutions with leaky ReLU in between
        feature_layers = [
            nn.Conv2d(1, 24, 3, padding=1),
            nn.LeakyReLU(),
            nn.Conv2d(24, 64, 3),
            nn.LeakyReLU(),
            nn.Conv2d(64, 128, 3),
            nn.LeakyReLU(),
            nn.Conv2d(128, 256, 4),
        ]

        # value head: flatten everything but the batch axis, then one output
        value_head = [
            nn.Flatten(),
            nn.LeakyReLU(),
            nn.Linear(256, 1),
        ]

        self.critic = nn.Sequential(*feature_layers, *value_head)

    def forward(self, state):
        """Return the value estimate for ``state`` (cast to float first)."""
        return self.critic(state.float())

In [None]:
def clean_memory():
    """Return a fresh agent memory: a dict with one empty list per field."""

    memory_fields = ('probs', 'values', 'obs', 'rewards',
                     'actions', 'action_masks', 'dones')

    # every field gets its own list object
    return {field: [] for field in memory_fields}


def choose_action(observation, action_mask, actor):
    """Sample an action from the actor's distribution for one observation.

    Args:
        observation: array-like (or tensor) passed to ``actor`` as the state.
        action_mask: array-like (or tensor) passed to ``actor`` as the mask.
        actor: callable ``actor(state, mask)`` returning a distribution with
            ``sample()`` and ``log_prob()``.

    Returns:
        (action, prob): the sampled action as a Python number and the log
        probability of that action as a Python float.
    """

    # acting is inference only: the results are converted to plain Python
    # numbers below, so there is no reason to record an autograd graph
    with torch.no_grad():
        dist = actor(torch.as_tensor(observation), torch.as_tensor(action_mask))

        # sample action from distribution
        action = dist.sample()
        log_prob = dist.log_prob(action)

    prob = torch.squeeze(log_prob).item()
    action = torch.squeeze(action).item()

    return action, prob


def advantage_calc(values, rewards, dones, lambda_, gamma):
    """Compute a generalised-advantage estimate for every stored timestep.

    For each step the TD error is ``reward - value`` when the step is terminal
    and ``reward + gamma * next_value - value`` otherwise.  The advantage is
    the sum of the following TD errors, discounted by ``lambda_ * gamma`` per
    step and cut off at the first terminal step so that nothing leaks across
    an episode boundary.

    Args:
        values: value estimate for each timestep.
        rewards: reward for each timestep (same length as ``values``).
        dones: truthy where the timestep ended an episode.
        lambda_: smoothing factor.
        gamma: discount factor.

    Returns:
        list with one advantage per timestep; empty for empty input.  If the
        final timestep is not terminal there is no next value to bootstrap
        from, so its advantage is 0.0.
    """

    n_steps = len(rewards)
    if n_steps == 0:
        return []

    advantages = [0.0] * n_steps
    decay = lambda_ * gamma

    # walk backwards so each advantage reuses the one that follows it:
    #   A_t = TD_t + (lambda * gamma) * A_{t+1}
    running = 0.0
    for step in reversed(range(n_steps)):

        if dones[step]:
            # terminal step: no future value and nothing carried over from
            # the next episode
            running = rewards[step] - values[step]

        elif step == n_steps - 1:
            # buffer was cut mid-episode: the next value is unknown
            running = 0.0

        else:
            td_error = rewards[step] + (gamma * values[step + 1]) - values[step]
            running = td_error + decay * running

        advantages[step] = running

    return advantages


def batch_builder(probs, values, obs, rewards, actions, advantages, action_masks, batch_size):
    """Split the stored memories into randomly ordered, full-size minibatches.

    The timestep indices are shuffled with ``np.random.shuffle`` and cut into
    consecutive slices of ``batch_size``.  A trailing slice shorter than
    ``batch_size`` is dropped.

    Returns:
        list of dicts with keys 'observations', 'actions', 'probs', 'values',
        'rewards', 'masks' and 'advantages'.  'observations' is a numpy array
        with an extra axis inserted at position 1; the others are lists.
    """

    n_steps = len(rewards)

    # random visiting order over all timesteps
    order = list(range(n_steps))
    np.random.shuffle(order)

    def gather(sequence, picks):
        # pull the chosen timesteps out of one memory list
        return [sequence[k] for k in picks]

    batches = []

    # only start positions that leave room for a complete batch
    for start in range(0, n_steps - batch_size + 1, batch_size):
        picks = order[start:start + batch_size]
        batches.append({
            'observations': np.expand_dims(np.array(gather(obs, picks)), 1),
            'actions':      gather(actions, picks),
            'probs':        gather(probs, picks),
            'values':       gather(values, picks),
            'rewards':      gather(rewards, picks),
            'masks':        gather(action_masks, picks),
            'advantages':   gather(advantages, picks),
        })

    return batches


def train(actor, critic, actor_optimiser, critic_optimiser, batches, clip, c1):
    """Apply one clipped-surrogate update per minibatch and return both models.

    Args:
        actor: callable ``actor(states, masks)`` returning a distribution.
        critic: callable ``critic(states)`` returning value estimates.
        actor_optimiser, critic_optimiser: optimisers stepped once per batch.
        batches: iterable of dicts as produced by ``batch_builder``.
        clip: the probability ratio is clamped to ``[1 - clip, 1 + clip]``.
        c1: weight of the critic loss in the combined loss.

    Returns:
        (actor, critic) after the updates.
    """

    ratio_floor, ratio_ceiling = 1 - clip, 1 + clip

    for minibatch in batches:

        states = torch.tensor(minibatch['observations'])
        advantages = torch.tensor(minibatch['advantages'])
        old_values = torch.tensor(minibatch['values'])
        old_log_probs = torch.tensor(minibatch['probs'])

        # critic: mean squared error against (advantage + stored value)
        predicted_values = torch.squeeze(critic(states))
        value_targets = torch.squeeze(advantages + old_values)
        critic_loss = (value_targets - predicted_values).pow(2).mean()

        # actor: ratio of new to stored action probability, clipped
        policy = actor(states, torch.tensor(minibatch['masks']))
        new_log_probs = policy.log_prob(torch.tensor(minibatch['actions']))
        ratio = new_log_probs.exp() / old_log_probs.exp()
        surrogate = advantages * ratio
        clipped_surrogate = torch.clamp(ratio, ratio_floor, ratio_ceiling) * advantages

        # negated because the optimiser minimises while the surrogate is maximised
        actor_loss = -torch.min(surrogate, clipped_surrogate).mean()

        total_loss = actor_loss + (c1 * critic_loss)

        # one backward pass feeds both optimisers
        actor_optimiser.zero_grad()
        critic_optimiser.zero_grad()
        total_loss.backward()
        actor_optimiser.step()
        critic_optimiser.step()

    return actor, critic


def reshape_image(observation):
    """Turn a (rows, cols, channels) observation into a (1, 1, H, W) array.

    Steps: zero-pad one row above and below and one column on the left,
    move the channel axis to the front, collapse the first two channels into
    one plane as ``channel_0 - channel_1``, then add two leading axes of
    size one.
    """

    # pad rows on both sides and columns on the left only
    padded = np.pad(observation, [(1, 1), (1, 0), (0, 0)])

    # channel axis first
    planes = padded.transpose(2, 0, 1)

    # single plane: +1 where channel 0 is set, -1 where channel 1 is set
    merged = planes[0, :, :] + (-1 * planes[1, :, :])

    # leading axes of size one in front of (H, W)
    return merged.reshape((1, 1) + merged.shape)


def rand_action_picker(n_actions, observation):
    """Pick a uniformly random action among those enabled in the action mask.

    Args:
        n_actions: size of the action space.
        observation: mapping holding the mask under the key 'action_mask'.

    Returns:
        the index of one action whose mask entry is positive.
    """

    # number the actions from 1 so that a masked action is the only way to get 0
    one_based = np.array(observation['action_mask']) * np.arange(1, n_actions + 1)

    # drop the zeros and shift back to zero-based action indices
    legal_actions = one_based[one_based > 0] - 1

    # uniform draw over the remaining actions
    pick = np.random.randint(0, len(legal_actions))

    return legal_actions[pick]
  

def algo_hyperparam_init(limits):
    """Draw one value per hyperparameter, uniformly between its two limits.

    Args:
        limits: mapping from hyperparameter name to a (low, high) pair.

    Returns:
        dict mapping each name to a sample from ``np.random.uniform(low, high)``.
    """

    return {name: np.random.uniform(limits[name][0], limits[name][1])
            for name in limits}


def score_dict_init(model_list):
    """Return two dicts (fractions, scores) with an empty list per model name."""

    names = list(model_list)

    # separate list objects for every entry of both dicts
    frac_dict = {name: [] for name in names}
    score_dict = {name: [] for name in names}

    return frac_dict, score_dict

In [None]:
def main():
    """Train a sequence of PPO agents on Connect Four by playing past agents.

    The training agent (``actor_1`` / ``critic_1``) plays episodes against an
    opponent picked at random for each episode: 'model_0' is a random-move
    player, every other entry is an actor previously saved by this function.
    Every ``T`` stored timesteps the agent is updated with ``train``.  After
    each episode the fraction of recent games lost against each opponent is
    tracked; once every tracked fraction is zero the actor is saved as a new
    opponent, the history is plotted and a fresh agent is created.  The loop
    ends when ``model_count`` reaches ``model_iterations``.

    Side effects: creates ``drive/MyDrive/models`` under the current working
    directory, writes model files there, prints progress and shows plots.
    """

    # initialise gym env
    env = connect_four_v3.env()
    env.reset()
    done = False

    # initialise memory lists and benchmarks
    AI1_mem = clean_memory()
    model_list = ['model_0']
    model_colours_dict = {'model_0': 'r'}
    frac_dict, score_dict = score_dict_init(model_list)
    fraction_calc_eps = 2000

    #define limits for hyperparameter choices
    limits = {'lambda_':(0.95, 0.96), 
              'gamma':(0.61, 0.62), 
              'clip':(0.14, 0.15), 
              'alpha_actor':(0.00025, 0.00026),
              'alpha_critic':(0.00025, 0.00026),
              'drop_out':(0.0, 0.2)}

    # initialise algorithm values
    AI1_params = algo_hyperparam_init(limits)
    model_iterations = 100
    lose_frac = 0
    best_frac = [0.01]
    model_count = 1
    save_count = 0
    ep_count = 0
    last_change = 0
    print_at = 10000
    n_epochs = 2
    n_actions = 7
    batch_size = 64
    T = 2048
    t = 0
    c1 = 0.5
    plot = True
    colour_plot_list = ['r', 'g', 'b', 'c', 'm', 'y']


    # setup initial optimisers and models
    actor_1 = Actor(n_actions, AI1_params['drop_out'])
    critic_1 = Critic(AI1_params['drop_out'])
    actor_1_optimiser = optim.Adam(actor_1.parameters(), lr=AI1_params['alpha_actor'])
    critic_1_optimiser = optim.Adam(critic_1.parameters(), lr=AI1_params['alpha_critic'])

    #check size of model
    pytorch_total_params = sum(p.numel() for p in actor_1.parameters() if p.requires_grad)
    print(pytorch_total_params)

    #make directories to store trained models
    # exist_ok so that re-running the cell does not fail on the existing folder
    cwd = os.getcwd()
    os.makedirs(f'{cwd}/drive/MyDrive/models', exist_ok=True)

    # iterate through episodes
    while model_count != model_iterations:

      # reset env at the end of an episode
      env.reset()
      done = False
      AI1_score = 0
      AI2_score = 0

      # if there is an agent available in memory
      # then randomly pick it for the episode 
      if model_count > 1:
          model_choice = f'model_{np.random.randint(0, model_count)}'
          if model_choice != 'model_0':
              # NOTE(review): torch.load unpickles a whole module; only safe
              # because these files are written by this function
              actor_2 = torch.load(f'drive/MyDrive/models/{model_choice}')
      else:
          model_choice = 'model_0'

      # select player agent will play as
      A1 = f'player_{np.random.randint(0, 2)}'

      # iterate until the end of the episode
      while not done:
          for agent in env.agent_iter():

              # get the last observation
              observation, reward, done, _ = env.last()

              # if the agent is AI1
              if agent == A1:
                  
                  #update score
                  AI1_score += reward

                  # NOTE(review): a reward of exactly 1 is zeroed before it is
                  # stored. The original comment here read "boost score if
                  # lose", which does not match the code - confirm the
                  # intended reward shaping.
                  if reward == 1:
                    reward = 0

                  #reshape observation for CNN
                  reshaped_observation = reshape_image(observation['observation'])

                  # choose new action if not done and get obs value for observation
                  if not done:
                      action, prob = choose_action(   reshaped_observation, 
                                                      observation['action_mask'], 
                                                      actor_1
                                                      )
                      value = torch.squeeze(critic_1(torch.tensor(reshaped_observation))).item()

                      # store in memory
                      AI1_mem['actions'].append(action)
                      AI1_mem['action_masks'].append(observation['action_mask'])
                      AI1_mem['obs'].append(np.squeeze(reshaped_observation))
                      AI1_mem['probs'].append(prob)
                      AI1_mem['values'].append(value)
                      AI1_mem['rewards'].append(reward)
                      AI1_mem['dones'].append(done)
                  
                  # if done then only append rewards
                  # the rewards should be as a result of taking the action and
                  # so should be staggered and appear one later than the action
                  else:
                      AI1_mem['rewards'].append(reward)
                      AI1_mem['dones'].append(done)

                      # to achieve the staggering if the episode has ended
                      # then delete a zero reward from the episode
                      if len(AI1_mem['rewards']) > 2:
                          del AI1_mem['rewards'][-2]
                          del AI1_mem['dones'][-2]

                      # if memory is too small then clear memory and restart new episode
                      else:
                          # clean memory
                          AI1_mem = clean_memory()
                          break

                  # if T timesteps is reached then train
                  if len(AI1_mem['rewards']) % T == 0:

                      #get lists
                      values = AI1_mem['values']
                      rewards = AI1_mem['rewards']
                      dones = AI1_mem['dones']
                      obs = AI1_mem['obs']
                      actions = AI1_mem['actions']
                      action_masks = AI1_mem['action_masks']
                      probs = AI1_mem['probs']

                      # calculate advantages
                      advantages = advantage_calc(values, 
                                                  rewards, 
                                                  dones, 
                                                  AI1_params['lambda_'], 
                                                  AI1_params['gamma'])

                      # create random batches
                      batches = batch_builder(probs, values, obs, 
                                              rewards, actions, advantages, 
                                              action_masks, batch_size
                                              )

                      # train for n epochs
                      for _ in range(n_epochs):
                          actor_1, critic_1 = train(actor_1, critic_1, 
                                                    actor_1_optimiser, 
                                                    critic_1_optimiser, 
                                                    batches, 
                                                    AI1_params['clip'], 
                                                    c1
                                                   )
                          
                      # clean memory
                      AI1_mem = clean_memory()
              
              # if agent is AI2
              else:

                  # reshape observation
                  reshaped_observation = reshape_image(observation['observation'])

                  # update score
                  AI2_score += reward

                  # choose new action if not done and get obs value for observation
                  if not done:

                      if model_choice != 'model_0':
                          action, prob = choose_action(   reshaped_observation, 
                                                          observation['action_mask'], 
                                                          actor_2
                                                          )
                      else:
                          action = rand_action_picker(n_actions, observation)
                  
              #step in environment
              if done:
                  env.step(None)
              else:
                  env.step(action)

      # append episode scores to score dict and add to ep count
      # a loss (negative AI1 score) is recorded as 1, anything else as 0
      #score_dict['training_AI'].append(max([0, AI1_score]))
      score_dict[model_choice].append(-1 * (min([0, AI1_score])))
      ep_count += 1

      # calculate the fraction of the last fraction_calc_eps games lost by the
      # training AI against each opponent
      # only compute this past 50 episodes to avoid variance of small sample size
      current_frac_list = []
      for model in list(score_dict.keys()):

          if len(score_dict[model]) > 50:

              frac_dict[model].append(np.sum(score_dict[model][-fraction_calc_eps:]) 
                                          / len(score_dict[model][-fraction_calc_eps:]))
              current_frac_list.append(frac_dict[model][-1])
      
      #if better than best fraction save model
      # compare only once every opponent has a fraction, so both lists line up
      # element by element (the previous check looked at whichever model the
      # loop above happened to visit last)
      if len(current_frac_list) == len(best_frac):
          if all(np.array(best_frac) - np.array(current_frac_list) >= 0):
            best_frac = current_frac_list
            save_count += 1
            # save model
            torch.save(actor_1, f'drive/MyDrive/models/model_{model_count}')
            if save_count % 5 == 0:
                print(f'Episode: {ep_count}, current best loss fractions: {best_frac}')
        
      # if every tracked loss fraction is zero then save model
      if all(np.array(current_frac_list) == 0) and len(current_frac_list) > 0:

        print(f'episode: {ep_count} , target loss fraction reached')

        # save model and add to list
        torch.save(actor_1, f'drive/MyDrive/models/model_{model_count}')
        model_list.append(f'model_{model_count}')
        model_colours_dict[f'model_{model_count}'] = colour_plot_list[np.random.randint(0, len(colour_plot_list))]

        plt.figure(figsize=(8, 5))
        plt.grid()
        plt.xlabel('Episodes')
        plt.ylabel(f'fraction of games lost over {fraction_calc_eps} MA')
        plt.title('fraction of games lost by the training agent against all trained models')
        for model in list(frac_dict.keys()):
            plt.plot(range(len(frac_dict[model])), np.array(frac_dict[model]), 
                      c=model_colours_dict[model],
                      label=model)
        plt.legend()
        plt.show()

        # generate fresh model to train
        # setup initial optimisers and models
        current_frac_list = []
        best_frac = []
        for _ in range(len(model_list)):
            best_frac.append(0.01)
        frac_dict, score_dict = score_dict_init(model_list)
        AI1_params = algo_hyperparam_init(limits)
        actor_1 = Actor(n_actions, AI1_params['drop_out'])
        critic_1 = Critic(AI1_params['drop_out'])
        actor_1_optimiser = optim.Adam(actor_1.parameters(), lr=AI1_params['alpha_actor'])
        critic_1_optimiser = optim.Adam(critic_1.parameters(), lr=AI1_params['alpha_critic'])

        print(f'saved model num: {model_count}')

        #add to model count
        model_count += 1
  
      # plot fractions of games won of both agents if plot is true
      #if plot and ep_count % print_at == 0 and ep_count > 100:
      #    plt.figure(figsize=(8, 5))
      #    plt.grid()
      #    plt.xlabel('Episodes')
      #    plt.ylabel(f'fraction of games lost over {fraction_calc_eps} MA')
      #    plt.title('fraction of games lost by the training agent against all trained models')
      #    for model in list(frac_dict.keys()):
      #        plt.plot(range(len(frac_dict[model])), np.array(frac_dict[model]), 
      #                 c=model_colours_dict[model],
      #                 label=model)
      #    plt.legend()
      #    plt.show()


# start training only when this code is executed directly (not when imported)
if __name__ == '__main__':
    main()

614327




Episode: 125163, current best loss fractions: [0.01]
Episode: 125168, current best loss fractions: [0.01]
Episode: 125173, current best loss fractions: [0.01]
Episode: 125178, current best loss fractions: [0.01]
Episode: 125183, current best loss fractions: [0.01]
Episode: 125188, current best loss fractions: [0.01]
Episode: 125193, current best loss fractions: [0.01]
Episode: 125198, current best loss fractions: [0.01]
Episode: 125203, current best loss fractions: [0.01]
Episode: 125208, current best loss fractions: [0.01]
Episode: 125213, current best loss fractions: [0.01]
Episode: 125218, current best loss fractions: [0.01]
Episode: 125223, current best loss fractions: [0.01]
Episode: 125228, current best loss fractions: [0.01]
Episode: 125233, current best loss fractions: [0.01]
Episode: 125238, current best loss fractions: [0.01]
Episode: 125243, current best loss fractions: [0.01]
Episode: 125248, current best loss fractions: [0.01]
Episode: 125253, current best loss fractions: 