In [1]:
import math
import random
import numpy as np
import torch 
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt
from copy import deepcopy
%matplotlib inline

In [2]:
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    print("Using GPU: GPU requested and available.")
    dtype = torch.cuda.FloatTensor
    dtypelong = torch.cuda.LongTensor
else:
    print("NOT Using GPU: GPU not requested or not available.")
    dtype = torch.FloatTensor
    dtypelong = torch.LongTensor

NOT Using GPU: GPU not requested or not available.


In [3]:
from collections import deque

class ReplayBuffer(object):
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, prob, value):
        state = np.expand_dims(state, 0)
        self.buffer.append((state, prob, value))

    def sample(self, batch_size):
        state, prob, value = zip(*random.sample(self.buffer, batch_size))
        return np.concatenate(state), prob, value

    def __len__(self):
        return len(self.buffer)
    
replay_size = 160000
replay_buffer = ReplayBuffer(replay_size)

In [4]:
from models import DualRes

dualres = DualRes(10, 4, USE_CUDA)
best_player = deepcopy(dualres)

In [5]:
class AlphaLoss(nn.Module):
    def __init__(self):
        super(AlphaLoss, self).__init__()
        
    def forward(self, pred_v, v, pred_p, p):
        value_error = (pred_v - v) ** 2
        policy_error = torch.sum((-p * (1e-15 + pred_p).log()))
        total_error = (value_error.view(-1) + policy_error).mean()
        return total_error

In [6]:
def compute_loss(net, batch_size, replay_buffer, optimizer, criterion):
    state, p, v = replay_buffer.sample(batch_size)
    state = torch.tensor(np.float32(state)).type(dtype)
    p = torch.tensor(np.float32(p)).type(dtype)
    v = torch.tensor(v).type(dtype)
    
    pred_p, pred_v = net(state)
    loss = criterion(pred_v, v, pred_p, p)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss

In [7]:
def hard_update(q_network, target_q_network):
    for t_param, param in zip(target_q_network.parameters(), q_network.parameters()):
        if t_param is param:
            continue
        new_param = param.data
        t_param.data.copy_(new_param)

In [8]:
file = open('seqs.txt', 'r')
seq_list = file.readlines()
max_length = 20
# Get sequences with length <= max_length
seqs = [s for s in seq_list if len(s) <= max_length]
test_seqs = random.sample(seqs, len(seqs) // 10)
train_seqs = [s for s in seqs if s not in test_seqs]

In [9]:
def train(num_timesteps, num_games, num_iter):
    criterion = AlphaLoss()
    optimizer = optim.SGD(dualres.parameters(), lr = 0.001, momentum = 0.9, weight_decay = 0.0001)
    
    losses = []
    batch_size = 256
    
    for ts in range(1, num_timesteps + 1):
        # Data generation
        for _ in range(num_games):
            # Pick random sequence to play
            seq = random.sample(train_seqs, 1)[0][:-1]
            
            play(seq, best_player, num_iter)
        # Update params
        loss = compute_loss(dualres, batch_size, replay_buffer, optimizer, criterion)
        losses.append(loss)
        print(loss)
        
        if ts % 1000 == 0:
            improved = evaluate(dualres, best_player, num_iter)
            if improved:
                hard_update(dualres, best_player)

In [10]:
from HP2D_Env import HP2D
from MCTS import MCTS

def play(seq, net, num_iter):
    '''
    Plays a game with the best player net and sequence seq.
    Adds len(seq) - 1 data points (s_t, pi_t, z_t) to replay buffer.
    '''
    print(seq)
    env = HP2D(seq, (10,31,31))
    states = []
    probs = []
    state = env.make_state()
    for t in range(len(seq) - 1):
        print(t)
        temp = int(t < len(seq) // 10)
        mcts = MCTS(env, net, num_iter, cpuct = 5)
        pi_t = mcts.get_prob(state, temp = temp)
        sym = get_syms(state, pi_t)
        for s, p in sym:
            states.append(s)
            probs.append(p)
        action = np.random.choice(len(pi_t), p = pi_t)
        state = env.next_state(state, action)
    reward = env.calc_score(states[-1])
    for i in range(len(states)):
        replay_buffer.push(states[i], probs[i], reward)

In [11]:
def evaluate(curr, best, num_iter):
    seqs = random.sample(test_seqs, 100)
    ctr = 0
    for s in seqs:
        env = HP2D(s[:-1], (10,31,31))
        state_c = env.make_state()
        state_b = env.make_state()
        for t in range(len(s) - 1):
            mcts_c   = MCTS(env, curr, num_iter, cpuct)
            pi_c     = mcts.get_prob(state_c, temp = 0)
            action_c = np.random.choice(len(pi_c), p = pi_c)
            state_c  = env.next_state(state_c, action)
        for t in range(len(s) - 1):
            mcts_b   = MCTS(env, best, num_iter, cpuct)
            pi_b     = mcts.get_prob(state_b, temp = 0)
            action_b = np.random.choice(len(pi_b), p = pi_b)
            state_b  = env.next_state(state_b, action)    
        if calc_score(state_c) > calc_score(state_b):
            ctr += 1
    return ctr >= 70

In [12]:
def get_syms(state, pi):
    """
    Input:
        board: current board
        pi: policy vector of size self.get_prob()
    Returns:
        symmForms: a list of [(board,pi)] where each tuple is a symmetrical
                   form of the board and the corresponding pi vector. This
                   is used when training the neural network from examples.
    """
    assert(len(pi) == 4)
    l = []
    
    def rotate(pi, n):
        for _ in range(n):
            temp = pi
            pi = [temp[1], temp[3], temp[0], temp[2]]
        return pi
    
    def flip(pi):
        return [pi[3], pi[1], pi[2], pi[0]]
        
    for i in range(1, 5):
        for j in [True, False]:
            new_state = np.rot90(state, i, (1, 2))
            new_pi = rotate(pi, i)
            if j:
                new_state = np.fliplr(new_state)
                new_pi = flip(new_pi)
            l += [(new_state, new_pi)]
    return l

In [15]:
train(10000, 9000, 300)

PHPHPHPHHPP
0
99.0
1
2
3
4
5
6
7
8
9
PHHHHPPHPHPPHPHP
0
99.0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
HHHHPPHHPP
0
99.0
1
2
3
4
5
6
7
8
HHPPHHHPPHHHPPPP
0
99.0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
HHHHPPHPHHHP
0
99.0
1
2
3
4
5
6
7
8
9
10
HHPPPPPPHPHHHPPH
0
99.0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
HHHPHPPPPPHPP
0
99.0
1
2
3
4
5
6
7
8
9
10
11
PHPPPPPP
0
1
2
3
4
5
6
PHPPHHHHPHHHPPPPHPH
0
99.0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
HHHPHPHPPP
0
99.0
1
2
3
4
5
6
7
8
tensor(55.8041, grad_fn=<MeanBackward0>)


2072