In [1]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
class PolicyNetwork(nn.Module):
    def __init__(self, board_size):
        super(PolicyNetwork, self).__init__()
        self.board_size = board_size
        self.conv1 = nn.Conv2d(1, 8, kernel_size=5, stride=1, padding=2)    # First convolutional layer with input channels=1 and output channels=32
        self.conv2 = nn.Conv2d(8, 4, kernel_size=3, stride=1, padding=1)   # Second convolutional layer with input channels=32 and output channels=64
        self.fc1 = nn.Linear(4 * (board_size ** 2), 64)                    # Fully connected layer with input features= flatten output of conv2 which is 64*(board_size^2) and output features=256; original code was (prev)//4
        self.fc2 = nn.Linear(64, board_size ** 2)                           # Fully connected layer with input features=256 and output features=board_size^2
        self.activation = torch.sigmoid
    def forward(self, x):
        x = torch.relu(self.conv1(x))        # Apply ReLU activation to the output of the first convolutional layer
        x = torch.relu(self.conv2(x))        # Apply ReLU activation to the output of the second convolutional layer
        x = x.view(x.size(0), -1)            # Reshape x into a 2D matrix with size (batch_size, -1)
        x = torch.relu(self.fc1(x))          # Apply ReLU activation to the output of the first fully connected layer
        x = self.fc2(x)                     # Output the final logits from the second fully connected layer
        return self.activation(x)

In [23]:
class PolicyNetwork(nn.Module):
    def __init__(self, board_size):
        super(PolicyNetwork, self).__init__()
        self.board_size = board_size
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5, stride=1, padding=2)    # First convolutional layer with input channels=1 and output channels=32
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)   # Second convolutional layer with input channels=32 and output channels=64
        self.fc1 = nn.Linear(64 * (board_size ** 2), 128)                    # Fully connected layer with input features= flatten output of conv2 which is 64*(board_size^2) and output features=256; original code was (prev)//4
        self.fc2 = nn.Linear(128, board_size ** 2)                           # Fully connected layer with input features=256 and output features=board_size^2
        self.activation = torch.sigmoid
    def forward(self, x):
        x = torch.relu(self.conv1(x))        # Apply ReLU activation to the output of the first convolutional layer
        x = torch.relu(self.conv2(x))        # Apply ReLU activation to the output of the second convolutional layer
        x = x.view(x.size(0), -1)            # Reshape x into a 2D matrix with size (batch_size, -1)
        x = torch.relu(self.fc1(x))          # Apply ReLU activation to the output of the first fully connected layer
        x = self.fc2(x)                     # Output the final logits from the second fully connected layer
        return self.activation(x)

class RenjuGame:
    def __init__(self, board_size):
        self.board_size = board_size
        self.board = np.zeros((board_size, board_size), dtype=np.int32)
        self.current_player = 1                                             # current_player = 1 or 2
        self.winner = None
        self.last_move_col = 0
        self.last_move_row = 0

    def is_valid_move(self, row, col):
        if row < 0 or row >= self.board_size or col < 0 or col >= self.board_size:
            return False
        return self.board[row][col] == 0

    def make_move(self, row, col):
        if self.is_valid_move(row, col):
            self.board[row][col] = self.current_player
            self.check_winner()
            self.current_player = 3 - self.current_player
            self.last_move_row = row
            self.last_move_col = col
        else:
            self.current_player = 3 - self.current_player


    def check_winner(self):
        directions = [(0, 1), (1, 0), (1, 1), (-1, 1)]  # horizontal, vertical, diagonal, anti-diagonal
        for dr, dc in directions:
            for row in range(self.board_size):
                for col in range(self.board_size):
                    if self.board[row][col] != 0:
                        color = self.board[row][col]
                        win = True
                        for i in range(5):
                            if row + i * dr < 0 or row + i * dr >= self.board_size or col + i * dc < 0 or col + i * dc >= self.board_size or self.board[row + i * dr][col + i * dc] != color:
                                win = False
                                break
                        if win:
                            self.winner = color
                            return
 
    

    def is_game_over(self):
        return np.count_nonzero(self.board) == self.board_size * self.board_size or self.winner is not None

    def get_state(self):
        return self.board.copy()

    def get_invalid_moves(self):
        invalid_moves = []
        for row in range(self.board_size):
            for col in range(self.board_size):
                if not self.is_valid_move(row, col):
                    invalid_moves.append(row * self.board_size + col)
                    #valid_moves.append((row, col))
        return np.array(invalid_moves)

    def print_board(self):
        for row in self.board:
            print(row)
        print()

In [None]:
import torch.optim as optim

dtype = torch.float32
device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')
print(device)


# Create game and network instances
board_size = 7
game = RenjuGame(board_size)
net = PolicyNetwork(board_size).to(device)

# Create optimizer
optimizer = optim.Adam(net.parameters(), lr=1e-2)

loss_f = nn.BCELoss()
# Training loop
loss_mean_lst = []

for i in range(10000):  # Number of games to play
    game.__init__(board_size)  # Reset the game
    action_prob_prev = None
    count = 0
    loss_lst = []
    while not game.is_game_over():
        count+=1
        # Convert the current game state to a tensor
        state = torch.Tensor(game.get_state()).unsqueeze(0).unsqueeze(0).to(device)  # Adds extra dimensions for batch and channels

        # Get the action probabilities from the network
        action_probs = net(state).view(game.board_size, game.board_size)

        # Select an action
        action_probs = action_probs.flatten()
        action_probs[game.get_invalid_moves()] = -1
        action = torch.argmax(action_probs) #torch.multinomial(action_probs.flatten(), 1)[0]
        
        
        # Convert the action back to row, col
        row, col = action // game.board_size, action % game.board_size

        # Make the move
        game.make_move(row.item(), col.item())

        #for train
        action_prob = action_probs.flatten()[action, None]

        if action_prob_prev is not None:
            optimizer.zero_grad()
            # Use a separate copy of action_prob for the loss calculation
            action_prob_for_loss = action_prob.clone().detach().requires_grad_(True)
            loss = torch.nn.ReLU()(action_prob_prev - action_prob_for_loss) #loss_f(action_prob_prev, action_prob_for_loss)
            loss.backward()
            optimizer.step() 
            loss_lst.append(loss.item())
            
            # Store action_probs
        action_prob_prev = action_prob.clone().detach().requires_grad_(True).to(device)
        
        # Let the opponent (which is also the network) make a move
        
        if not game.is_game_over():
            # Opponent's turn
            with torch.no_grad():
                state = torch.Tensor(game.get_state()).unsqueeze(0).unsqueeze(0).to(device)
                action_probs = net(state).view(game.board_size, game.board_size)
                action_probs = action_probs.flatten()
                action_probs[game.get_invalid_moves()] = 0
                action = torch.argmax(action_probs) #torch.multinomial(action_probs.flatten(), 1)[0]
                
                action = torch.multinomial(action_probs.flatten(), 1)[0]
                row, col = action // game.board_size, action % game.board_size
                game.make_move(row.item(), col.item())

    # Now, the game is over, so we can compute the reward
    reward = 1 if game.winner == 1 else 0  # Assuming the network plays as player 1
    if game.winner == None:
        reward = 0.5
    
    # Update the network
    optimizer.zero_grad()
    loss = 100*torch.nn.ReLU()(action_prob_prev - torch.tensor([reward], dtype=dtype).to(device)) #loss = loss_f(action_prob, torch.tensor([reward], dtype=dtype).to(device))
    loss.backward()
    optimizer.step()
    if i%400 ==0:
        print('epoch', i)
        print(np.mean(loss_lst), loss.item(), action_prob.detach()[0].cpu().numpy() ,count, game.winner)
        game.print_board()
        torch.save(net.state_dict(), 'policy_net.pth')
    loss_mean_lst.append(np.sum(loss_lst) + loss.item())

# Save the trained network


cuda:2
epoch 0
0.001484006643295288 49.64668273925781 0.49646682 23 2
[1 1 1 1 2 2 1]
[1 2 1 1 1 1 2]
[2 1 1 1 2 1 2]
[0 2 2 2 2 2 2]
[1 2 0 1 0 2 1]
[2 2 2 1 1 2 2]
[1 1 2 1 2 1 2]

epoch 400
0.001684549976797665 50.34335708618164 0.5034336 18 2
[1 1 0 1 1 0 1]
[2 0 2 1 2 0 0]
[0 1 0 2 2 1 1]
[2 0 1 2 2 0 1]
[1 0 2 1 2 2 2]
[1 2 2 1 1 0 2]
[1 2 2 1 0 2 0]

epoch 800
0.0019396367398175325 0.0 0.4890086 23 1
[1 2 2 1 2 1 2]
[2 2 2 1 1 0 2]
[2 1 2 1 2 1 1]
[2 2 1 1 2 1 1]
[1 2 2 1 2 2 2]
[1 0 1 1 1 2 2]
[1 1 1 1 0 2 0]



In [14]:
game.print_board()
torch.nn.Re

[2 1 2 2 0 2 0]
[2 0 2 2 0 2 2]
[0 0 0 0 2 0 0]
[0 2 2 2 2 0 0]
[0 0 0 0 0 2 0]
[2 2 2 2 0 0 2]
[2 2 0 2 2 2 0]



In [36]:
def cpu_move(game, net):
    net.eval()
    state = torch.Tensor(game.get_state()).unsqueeze(0).unsqueeze(0).to(device) 
    action_probs = net(state).view(game.board_size, game.board_size)
    action = torch.multinomial(action_probs.flatten(), 1)
    row, col = action // game.board_size, action % game.board_size
    game.make_move(row.item(), col.item())
    return None

In [37]:
from IPython.display import clear_output


game.__init__(board_size)  # Reset the game

while not game.is_game_over():
    clear_output(wait=True)

    user_input = input("Please enter your move: ")
    numbers = user_input.split(',')
    row = int(numbers[0])
    col = int(numbers[1])

    game.make_move(row,col)
    cpu_move(game,net)
    game.print_board()

print('winner is: ',game.winner)

Please enter your move:  4,2


[0 0 0 0 0 0 0 0]
[0 0 0 2 0 0 0 0]
[0 0 0 0 0 0 0 0]
[0 0 0 0 0 2 0 0]
[0 0 1 1 1 1 1 0]
[0 2 0 0 0 0 0 0]
[0 0 0 2 0 0 0 0]
[2 0 0 0 0 0 0 0]

winner is:  1


In [126]:
import torch.optim as optim

dtype = torch.float32
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')

# Create game and network instances
board_size = 9
game = RenjuGame(board_size)
net = PolicyNetwork(board_size).to(device)

# Create optimizer
optimizer = optim.Adam(net.parameters(), lr=1e-2)

loss_f = nn.BCELoss()
# Training loop

for i in range(100):  # Number of games to play
    game.__init__(board_size)  # Reset the game
    action_lst = []
    while not game.is_game_over():
        # Convert the current game state to a tensor
        state = torch.Tensor(game.get_state()).unsqueeze(0).unsqueeze(0).to(device)  # Adds extra dimensions for batch and channels

        # Get the action probabilities from the network
        action_probs = net(state).view(game.board_size, game.board_size)

        # Select an action
        action = torch.multinomial(action_probs.flatten(), 1)

        # Convert the action back to row, col
        row, col = action // game.board_size, action % game.board_size

        # Make the move
        game.make_move(row.item(), col.item())

        #for train
        action_prob = action_probs.flatten()[action, None]

        # Store action_probs
        action_lst.append( action_prob.clone()) #.detach().requires_grad_(True).to(device)
        
        # Let the opponent (which is also the network) make a move
        if not game.is_game_over():
            # Opponent's turn
            state = torch.Tensor(game.get_state()).unsqueeze(0).unsqueeze(0).to(device)
            action_probs = net(state).view(game.board_size, game.board_size)
            action = torch.multinomial(action_probs.flatten(), 1)
            row, col = action // game.board_size, action % game.board_size
            game.make_move(row.item(), col.item())

    # Now, the game is over, so we can compute the reward
    reward = 1 if game.winner == 1 else -1  # Assuming the network plays as player 1

    # Update the network
    optimizer.zero_grad()
    action_tensor = torch.concat(action_lst).to(device)
    # Update the network to increase the probability of actions that led to winning, and decrease the probability of actions that led to losing
    loss = torch.sum(torch.log(action_tensor) * reward )
    loss.backward()
    optimizer.step()
    if i%5 ==0:
        print('epoch', i)
        print(loss.item(), reward)

# Save the trained network
#torch.save(net.state_dict(), 'policy_net.pth')



epoch 0
167.86912536621094 -1


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 3; 31.74 GiB total capacity; 1.18 GiB already allocated; 3.62 MiB free; 1.45 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [115]:
action_prob

tensor([[0.5002]], device='cuda:0', grad_fn=<IndexBackward0>)

In [8]:
bchild = testnode.select_best_child()

#print(bchild.visits)
#print(testnode.visits)
#print(testnode.children[11].state.winner)
#print(testnode.children[11].player)

#print(testnode.children[11].wins)
#testnode.children[11].state.print_board()

testnode.select_best_child().state.print_board()

[1 0 0 0 0 0 0 0 0 0]
[0 1 1 1 1 0 0 0 0 0]
[0 2 2 2 2 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]

