In [133]:
import tianshou as ts
import numpy as np
from pettingzoo.classic.chess import chess
import torch as T
import torch.optim as optim
import torch.nn as nn
import gymnasium as gym

In [134]:
# Constants
STATE_SHAPE = (8, 8, 111)
ACTION_SHAPE = gym.spaces.Discrete(8*8*73)
DUMMY_CONV_INPUT = T.zeros(1, *STATE_SHAPE).permute(0, 3, 1, 2)

LEARNING_RATE = 1e-3
DISCOUNT_FACTOR = 0.9
NUM_STEPS_TO_LOOK_AHEAD = 3
NUM_STEPS_BETWEEN_TARGET_NETWORK_UPDATES = 320

TRAIN_ENV_COUNT = 2
TEST_ENV_COUNT = 10

In [135]:
def get_env():
    return ts.env.PettingZooEnv(chess.env())


In [141]:
class DeepQNetwork(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # Identify structural patterns using convolutional layers
        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels=STATE_SHAPE[-1], out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.ReLU()
        )

        # Calculate the size of the output from convolutional layers
        with T.no_grad():
            conv_output_size = self.conv_layers(DUMMY_CONV_INPUT).reshape(1, -1).size(1)


        # Feed that convolutional output into a fully connected linear net to parse lower-level details
        self.fc_layers = nn.Sequential(
            nn.Linear(conv_output_size, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, ACTION_SHAPE.n),
        )

    def forward(self, x, state=None, info={}) -> T.Tensor:
        x = x.permute(0, 3, 1, 2)  if isinstance(x, T.Tensor) else T.from_numpy(x).permute(0, 3, 1, 2)
        x = x.float()
        x = self.conv_layers(x)
        x = x.reshape(x.size(0), -1)
        q_values = self.fc_layers(x).reshape(-1, ACTION_SHAPE.n)
        return q_values, state

    def seed(self, seed):
        return np.random.seed(seed)

In [142]:
env = get_env()
q_network = DeepQNetwork()
optimizer = optim.Adam(q_network.parameters(), lr=LEARNING_RATE)

policy = ts.policy.DQNPolicy(
    q_network, 
    optimizer, 
    discount_factor=DISCOUNT_FACTOR, 
    estimation_step=NUM_STEPS_TO_LOOK_AHEAD, 
    target_update_freq=NUM_STEPS_BETWEEN_TARGET_NETWORK_UPDATES)

In [144]:
train_envs = ts.env.DummyVectorEnv([get_env for _ in range(TRAIN_ENV_COUNT)]) # Check these ranges if badness
test_envs = ts.env.DummyVectorEnv([get_env for _ in range(TEST_ENV_COUNT)])

In [145]:
train_collector = ts.data.Collector(policy, train_envs, ts.data.VectorReplayBuffer(20000, TRAIN_ENV_COUNT), exploration_noise=True)
test_collector = ts.data.Collector(policy, test_envs, exploration_noise=True)

In [146]:
result = ts.trainer.offpolicy_trainer(
    policy, train_collector, test_collector,
    max_epoch=10, step_per_epoch=10000, step_per_collect=10,
    update_per_step=0.1, episode_per_test=100, batch_size=64,
    train_fn=lambda epoch, env_step: policy.set_eps(0.1),
    test_fn=lambda epoch, env_step: policy.set_eps(0.05),)
print(f'Finished training! Use {result["duration"]}')

Epoch #1:   0%|          | 10/10000 [00:00<05:21, 31.06it/s]


ValueError: total size of new array must be unchanged