## ConnectX Reinforcement Learning Agent with Self-Play

Build an agent to play ConnectX based on the Kaggle Simulation environment  
Train the agent using stable-baselines3 on a gym-compatible wrapper for the Kaggle env  
Use a callback to update the opponent agent periodically during training

### ConnectX Gym Environment

In [1]:
import numpy as np
import gym

from kaggle_environments import make, evaluate
from gym import spaces

class ConnectFourGym(gym.Env):
    """Gym-style wrapper around the Kaggle ``connectx`` environment.

    The learning agent fills the first slot of ``ks_env.train([None, agent2])``
    and ``agent2`` fills the second. Observations are the board reshaped to
    ``(1, rows, columns)``; actions are column indices.
    """

    def __init__(self, agent2="random"):
        """Create the Kaggle trainer and declare the action/observation spaces.

        :param agent2: opponent handed to the Kaggle trainer (an agent name
            such as "random", or an agent callable).
        """
        kaggle_env = make("connectx", debug=True)
        self.env = kaggle_env.train([None, agent2])
        configuration = kaggle_env.configuration
        self.rows = configuration.rows
        self.columns = configuration.columns
        # Learn about spaces here: http://gym.openai.com/docs/#spaces
        # One discrete action per column
        self.action_space = spaces.Discrete(self.columns)
        # Single-channel board whose cell values lie in [0, 2]
        self.observation_space = spaces.Box(
            low=0,
            high=2,
            shape=(1, self.rows, self.columns),
            dtype=int,
        )
        # (min, max) reward: -10 for an invalid move, 1 for a win
        self.reward_range = (-10, 1)
        # StableBaselines throws error if these are not defined
        self.spec = None
        self.metadata = None

    def _board_array(self):
        """Return the board of the latest observation as a (1, rows, columns) array."""
        return np.array(self.obs['board']).reshape(1, self.rows, self.columns)

    def reset(self):
        """Start a new game and return its initial board observation."""
        self.obs = self.env.reset()
        return self._board_array()

    def change_reward(self, old_reward, done):
        """Map the Kaggle reward onto the training reward.

        :param old_reward: reward reported by the Kaggle trainer for the last step.
        :param done: whether the episode ended on that step.
        :return: 1 when ``old_reward`` is 1, -1 when the episode ended with any
            other reward, otherwise ``1 / (rows * columns)`` for a game still in progress.
        """
        if old_reward == 1:
            # The agent won the game
            return 1
        if done:
            # The episode is over and the agent did not receive the winning reward
            return -1
        # Small per-move reward (1/42 on the default 6x7 board)
        return 1/(self.rows*self.columns)

    def step(self, action):
        """Play ``action`` (a column index) and return ``(observation, reward, done, info)``.

        A move into a column whose entry ``board[column]`` is non-zero is treated as
        invalid: the underlying environment is not stepped, the episode ends, and the
        reward is -10.
        """
        column = int(action)
        if self.obs['board'][column] != 0:
            # Invalid move: end the game and penalize the agent
            reward, done, info = -10, True, {}
        else:
            # Valid move: advance the Kaggle environment and reshape its reward
            self.obs, raw_reward, done, info = self.env.step(column)
            reward = self.change_reward(raw_reward, done)
        return self._board_array(), reward, done, info

Loading environment lux_ai_s2 failed: No module named 'vec_noise'


### Neural Network for Policy Representation

In [2]:
import torch as th
import torch.nn as nn

from stable_baselines3 import PPO 
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

# Neural network for predicting action values
class CustomCNN(BaseFeaturesExtractor):
    """Convolutional feature extractor for channels-first board observations.

    Two 3x3 convolutions (32 then 64 filters, no padding) with ReLU activations
    are flattened and projected to ``features_dim`` by a linear layer + ReLU.
    """

    def __init__(self, observation_space: gym.spaces.Box, features_dim: int=128):
        """
        :param observation_space: Box space whose first axis is the channel count.
        :param features_dim: size of the feature vector produced by ``forward``.
        """
        super().__init__(observation_space, features_dim)
        # CxHxW layout (channels first), so axis 0 gives the input channel count
        in_channels = observation_space.shape[0]
        conv_layers = [
            nn.Conv2d(in_channels, 32, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.Flatten(),
        ]
        self.cnn = nn.Sequential(*conv_layers)

        # Push one sampled observation (with a batch axis added) through the
        # conv stack to discover the flattened width for the linear layer
        with th.no_grad():
            probe = th.as_tensor(observation_space.sample()[None]).float()
            flat_width = self.cnn(probe).shape[1]

        self.linear = nn.Sequential(nn.Linear(flat_width, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        """Return the ``features_dim``-wide features for a batch of observations."""
        conv_features = self.cnn(observations)
        return self.linear(conv_features)

### Build and train agent with static opponent

In [3]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Training environment: the learner plays against the "random" opponent
env = ConnectFourGym(agent2="random")

# PPO agent whose policy extracts features with CustomCNN
policy_kwargs = {"features_extractor_class": CustomCNN}
model = PPO(
    "CnnPolicy",
    env,
    policy_kwargs=policy_kwargs,
    verbose=0,
)

# Short training run with a progress bar; the call is the cell's last
# expression, so the returned PPO object is echoed as the cell output
model.learn(total_timesteps=500, progress_bar=True)



Output()

<stable_baselines3.ppo.ppo.PPO at 0x17807fe80>

In [3]:
def get_win_percentages(agent1, agent2, n_rounds=100):
    """Play ``n_rounds`` ConnectX games between two agents and print win rates.

    Roughly half of the games are played with ``agent1`` in the first slot and
    the rest with ``agent2`` in the first slot; results of the second batch are
    swapped so that index 0 of every outcome always refers to ``agent1``.

    Prints the fraction of games won by each agent (rounded to 2 decimals) and,
    when present, the number of outcomes recorded as ``[None, 0]`` (invalid play
    by agent 1) or ``[0, None]`` (invalid play by agent 2).

    :param agent1: first agent (name or callable accepted by ``evaluate``).
    :param agent2: second agent (name or callable accepted by ``evaluate``).
    :param n_rounds: total number of games to play; must be at least 1.
    :raises ValueError: if ``n_rounds`` is smaller than 1 (there would be no
        outcomes to average over).
    :return: None; results are printed.
    """
    if n_rounds < 1:
        raise ValueError(f"n_rounds must be at least 1, got {n_rounds}")

    # Use default Connect Four setup
    config = {'rows': 6, 'columns': 7, 'inarow': 4}
    first_batch = n_rounds // 2

    # Agent 1 goes first (roughly) half the time
    outcomes = list(evaluate("connectx", [agent1, agent2], config, [], first_batch))
    # Agent 2 goes first for the remaining games; swap each pair back so that
    # position 0 is always agent 1's result
    outcomes += [
        [b, a]
        for a, b in evaluate("connectx", [agent2, agent1], config, [], n_rounds - first_batch)
    ]

    total = len(outcomes)
    print("Agent 1 Win Percentage:", np.round(outcomes.count([1, -1]) / total, 2))
    print("Agent 2 Win Percentage:", np.round(outcomes.count([-1, 1]) / total, 2))

    # Count each kind of invalid play once and reuse the count when reporting
    invalid_1 = outcomes.count([None, 0])
    invalid_2 = outcomes.count([0, None])
    if invalid_1 > 0:
        print("Number of Invalid Plays by Agent 1:", invalid_1)
    if invalid_2 > 0:
        print("Number of Invalid Plays by Agent 2:", invalid_2)

In [6]:
# NOTE(review): `agent_ppo_trained` and `agent_ppo_trained2` are not defined by any
# cell visible in this notebook, so this call raises NameError on a fresh kernel
# (see the recorded output of this cell). Define both agents in an earlier cell
# or remove this cell.
get_win_percentages(agent_ppo_trained, agent_ppo_trained2, n_rounds=100)

NameError: name 'agent_ppo_trained' is not defined

In [6]:
### Generate new agent function using model
def build_agent(model):
    """Wrap ``model`` as an agent callable ``agent(obs, config) -> int``.

    The returned function asks ``model.predict`` for a column using the board
    reshaped to ``(1, config.rows, config.columns)``. If the chosen column is
    not open (``board[col] != 0``), a random open column is returned instead.

    :param model: object exposing ``predict(observation)`` that returns a tuple
        whose first element is the chosen column.
    :return: function taking ``obs`` (must support ``obs['board']``) and
        ``config`` (must expose ``rows`` and ``columns``) and returning a column
        index as ``int``.
    :raises IndexError: from the returned function when the model picks a full
        column and no column is open (``random.choice`` on an empty list).
    """
    def _function(obs, config):
        # Read the board once, always via item access, so that the fallback
        # path works for every observation type the main path works for
        board = obs['board']
        # Use the model to select a column
        col, _ = model.predict(np.array(board).reshape(1, config.rows, config.columns))
        col = int(col)
        # A column is playable when its entry board[col] is 0
        if board[col] == 0:
            return col
        # Model chose a full column: fall back to a random open column
        open_columns = [c for c in range(config.columns) if board[c] == 0]
        return random.choice(open_columns)

    return _function

In [4]:
import random
import os
from stable_baselines3.common.callbacks import BaseCallback

class SelfPlayCallback(BaseCallback):
    """
    Self-play callback: after every rollout, save the current model to ``save_dir``
    and switch training to a new ``ConnectFourGym`` whose opponent is a randomly
    chosen model previously saved in that directory.

    Relies on ``PPO``, ``build_agent`` and ``ConnectFourGym`` being defined in
    other cells of the notebook.

    :param save_dir: Directory where per-rollout models are written. Existing
        entries in this directory are removed when the callback is initialised.
    :param verbose: Verbosity level: 0 for no output, 1 for info messages, 2 for debug messages
    """
    def __init__(self, save_dir: str, verbose=0):
        super().__init__(verbose)
        # Those variables will be accessible in the callback
        # (they are defined in the base class)
        # The RL model
        # self.model = None  # type: BaseAlgorithm
        # An alias for self.model.get_env(), the environment used for training
        # self.training_env = None  # type: Union[gym.Env, VecEnv, None]
        # Number of time the callback was called
        # self.n_calls = 0  # type: int
        # self.num_timesteps = 0  # type: int
        # local and global variables
        # self.locals = None  # type: Dict[str, Any]
        # self.globals = None  # type: Dict[str, Any]
        # The logger object, used to report things in the terminal
        # self.logger = None  # stable_baselines3.common.logger
        # # Sometimes, for event callback, it is useful
        # # to have access to the parent object
        # self.parent = None  # type: Optional[BaseCallback]
        # Number of rollouts started so far; also used to name saved models
        self.num_rollouts = 0
        self.save_dir = save_dir

    def _init_callback(self) -> None:
        """
        Create ``save_dir`` if needed and delete every entry already in it.

        Nothing is done when ``save_dir`` is None.
        NOTE(review): ``os.remove`` is applied to every directory entry, so any
        unrelated file in ``save_dir`` is deleted as well.
        """
        if self.save_dir is not None:
            os.makedirs(self.save_dir, exist_ok=True)
            for file in os.listdir(self.save_dir):
                os.remove(os.path.join(self.save_dir, file))

    def _on_training_start(self) -> None:
        """
        This method is called before the first rollout starts.

        Intentionally does nothing.
        """
        pass
    
    
    def _on_rollout_start(self) -> None:
        """
        Increment the rollout counter and, when ``verbose >= 1``, print it.

        The opponent is not changed here; that happens in ``_on_rollout_end``.
        """
        self.num_rollouts += 1
        if self.verbose >= 1:
            print(f"Start rollout {self.num_rollouts}")
            

    def _on_step(self) -> bool:
        """
        This method will be called by the model after each call to `env.step()`.

        For child callback (of an `EventCallback`), this will be called
        when the event is triggered.

        Always returns True, so this callback never aborts training.

        :return: If the callback returns False, training is aborted early.
        """
        return True

    def _on_rollout_end(self) -> None:
        """
        Save the current model to ``save_dir/model_{num_rollouts}``, then pick a
        random saved model as the next opponent and replace the training env.

        The opponent is drawn from everything currently listed in ``save_dir``,
        which includes the model saved by this very call.

        NOTE(review): ``save_dir`` is used unconditionally here, although
        ``_init_callback`` tolerates ``save_dir is None``; a None value would
        fail in ``os.path.join``.
        """
        save_path = os.path.join(self.save_dir, f"model_{self.num_rollouts}")
        self.model.save(path=save_path)
        if self.verbose >= 1:
            print(f"Saving model from rollout {self.num_rollouts} to {save_path}")

        # Choose a random directory entry and cut its file name at the first '.'
        # (presumably to drop the extension added when saving; only the file
        # name is split, not the directory part)
        opponent_path = os.path.join(
            self.save_dir, random.choice(os.listdir(self.save_dir)).split('.', 1)[0]
        )
        if self.verbose >= 1:
            print(f"Training versus model from {opponent_path}")
        opponent_model = PPO.load(opponent_path)
        opponent_agent = build_agent(opponent_model)
        #last_obs = self.model._last_obs
        # Fresh environment whose second player is the loaded opponent
        new_env = ConnectFourGym(agent2=opponent_agent)
        self.model.set_env(env=new_env, force_reset=True)
        # Overwrites the model's private `_last_obs` with the first observation
        # of the new env -- presumably so the next rollout starts from the new
        # game; TODO confirm against the stable-baselines3 version in use
        self.model._last_obs = self.model.env.reset()
        #self.model._last_obs = last_obs
        #self.model.env.reset()

    def _on_training_end(self) -> None:
        """
        Currently a no-op: the cleanup of intermediate models and ``save_dir``
        is commented out below, so saved models remain on disk after training.
        """
        #for file in os.listdir(self.save_dir):
        #    os.remove(os.path.join(self.save_dir, file))
        #os.removedirs(self.save_dir)

In [7]:
from stable_baselines3.common.callbacks import ProgressBarCallback

# Initial training environment: the first rollout is played against "random"
env = ConnectFourGym(agent2="random")

# PPO agent with the CustomCNN feature extractor
policy_kwargs = {"features_extractor_class": CustomCNN}
# Callback that saves a model after each rollout and swaps in a saved opponent
selfplay_callback = SelfPlayCallback(save_dir="opponents", verbose=1)

model = PPO(
    "CnnPolicy",
    env,
    policy_kwargs=policy_kwargs,
    verbose=0,
    n_steps=128,
)

# Train with self-play (close progress bar first if necessary)
model.learn(total_timesteps=50_000, callback=selfplay_callback)



Start rollout 1
Saving model from rollout 1 to opponents/model_1
Training versus model from opponents/model_1
Start rollout 2
Saving model from rollout 2 to opponents/model_2
Training versus model from opponents/model_1
Start rollout 3
Saving model from rollout 3 to opponents/model_3
Training versus model from opponents/model_2
Start rollout 4
Saving model from rollout 4 to opponents/model_4
Training versus model from opponents/model_2
Start rollout 5
Saving model from rollout 5 to opponents/model_5
Training versus model from opponents/model_2
Start rollout 6
Saving model from rollout 6 to opponents/model_6
Training versus model from opponents/model_3
Start rollout 7
Saving model from rollout 7 to opponents/model_7
Training versus model from opponents/model_2
Start rollout 8
Saving model from rollout 8 to opponents/model_8
Training versus model from opponents/model_1
Start rollout 9
Saving model from rollout 9 to opponents/model_9
Training versus model from opponents/model_6
Start roll

<stable_baselines3.ppo.ppo.PPO at 0x1792dca90>

In [8]:
# Wrap the trained PPO model as an agent callable via build_agent.
# NOTE(review): `model` is bound by both the static-opponent cell and the
# self-play cell; this line picks up whichever of them ran last.
agent_selfplay_trained = build_agent(model)

In [9]:
# Play a single game of the self-play agent against "random" and render it inline.
# NOTE(review): this rebinds `env`, which other cells use for the ConnectFourGym
# training environment; here it is a raw Kaggle environment instead.
env = make("connectx")
env.run(agents=[agent_selfplay_trained, "random"])
env.render(mode="ipython")

In [10]:
# Win rates of the self-play agent against the "random" opponent over 100 games
get_win_percentages(agent_selfplay_trained, "random", n_rounds=100)

Agent 1 Win Percentage: 0.77
Agent 2 Win Percentage: 0.23


In [15]:
# Baseline environment: the opponent stays "random" for the whole run
env = ConnectFourGym(agent2="random")

# Same policy configuration as the self-play model, but no self-play callback
policy_kwargs = {"features_extractor_class": CustomCNN}

model2 = PPO(
    "CnnPolicy",
    env,
    policy_kwargs=policy_kwargs,
    verbose=0,
    n_steps=128,
)

# Train the baseline for the same number of timesteps as the self-play model
model2.learn(total_timesteps=50_000, progress_bar=True)

Output()

<stable_baselines3.ppo.ppo.PPO at 0x178c00700>

In [12]:
# Wrap the baseline model (model2) as an agent and play it head-to-head against
# the self-play agent; "Agent 1" in the printed output is the self-play agent.
# NOTE(review): this cell's execution count (12) is lower than that of the cell
# defining `model2` (15), so the recorded output may come from an earlier model2.
agent_random_trained = build_agent(model2)
get_win_percentages(agent_selfplay_trained, agent_random_trained, n_rounds=100)

Agent 1 Win Percentage: 0.68
Agent 2 Win Percentage: 0.32


In [14]:
# Compare the final self-play agent with saved checkpoints 1, 11, 21, 31 and 41.
# NOTE(review): the "opponents/" prefix is hard-coded and must match the
# `save_dir` given to SelfPlayCallback; the range assumes at least 41 saved rollouts.
for i in range(1, 50, 10):
    model_path = f"opponents/model_{i}"
    # Load the checkpoint and wrap it as an agent callable
    intermediate_agent = build_agent(PPO.load(model_path))
    print(f"versus intermediate agent {i}:")
    # "Agent 1" in the printed output is the final self-play agent
    get_win_percentages(agent_selfplay_trained, intermediate_agent, n_rounds=50)

versus intermediate agent 1:
Agent 1 Win Percentage: 0.78
Agent 2 Win Percentage: 0.22
versus intermediate agent 11:
Agent 1 Win Percentage: 0.78
Agent 2 Win Percentage: 0.22
versus intermediate agent 21:
Agent 1 Win Percentage: 0.76
Agent 2 Win Percentage: 0.24
versus intermediate agent 31:
Agent 1 Win Percentage: 0.82
Agent 2 Win Percentage: 0.18
versus intermediate agent 41:
Agent 1 Win Percentage: 0.68
Agent 2 Win Percentage: 0.32
