In [3]:
import math
import random
import numpy as np
import torch 
import torch.optim as optim
import torch.nn as nn
from copy import deepcopy
import asyncio
import nest_asyncio
from HP2D_Env import HP2D
from MCTS import MCTS
from tqdm import tqdm
import time
from IPython.display import clear_output
nest_asyncio.apply()

OSError: [WinError 127] The specified procedure could not be found

In [None]:
# Detect CUDA once and pick the matching legacy tensor types; later cells
# cast their inputs with `.type(dtype)`.
USE_CUDA = torch.cuda.is_available()
if not USE_CUDA:
    print("NOT Using GPU: GPU not requested or not available.")
    dtype, dtypelong = torch.FloatTensor, torch.LongTensor
else:
    print("Using GPU: GPU requested and available.")
    dtype, dtypelong = torch.cuda.FloatTensor, torch.cuda.LongTensor

In [None]:
from collections import deque

class ReplayBuffer(object):
    """Bounded FIFO store of (state, prob, value) training examples.

    Once `capacity` entries are held, every further push evicts the oldest
    entry (the behaviour of `deque(maxlen=capacity)`).
    """

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def push(self, state, prob, value):
        """Store one example; `state` gains a leading axis of size 1."""
        batched_state = np.expand_dims(state, 0)
        self.buffer.append((batched_state, prob, value))

    def sample(self, batch_size):
        """Draw `batch_size` distinct entries uniformly at random.

        Returns:
            (states, probs, values): `states` is one array built by
            concatenating the stored states along axis 0; `probs` and
            `values` are tuples in the same order.

        Raises:
            ValueError: propagated from `random.sample` when `batch_size`
                exceeds the number of stored entries.
        """
        picked = random.sample(self.buffer, batch_size)
        states, probs, values = zip(*picked)
        return np.concatenate(states), probs, values
    
# Module-level replay memory: the self-play code pushes examples into it and
# the training code samples minibatches from it.
replay_size = 160000
replay_buffer = ReplayBuffer(capacity=replay_size)

In [None]:
from models import DualRes

# Build the network and keep an independent deep copy as `best_player`.
# NOTE(review): the meaning of the positional arguments (10, 4) is defined by
# models.DualRes and is not visible here; presumably input planes and number
# of actions. Confirm against the model definition.
dualres = DualRes(10, 4, USE_CUDA)
best_player = deepcopy(dualres)

# Move both copies onto the GPU when one is available.
if USE_CUDA:
    dualres, best_player = dualres.cuda(), best_player.cuda()

In [None]:
class AlphaLoss(nn.Module):
    """Combined value + policy loss for a dual-headed (policy, value) network.

    For every sample the loss is the squared value error plus the
    cross-entropy between the target policy `p` and the predicted policy
    `pred_p`; the per-sample losses are then averaged over the batch.
    """

    def __init__(self):
        super(AlphaLoss, self).__init__()

    def forward(self, pred_v, v, pred_p, p):
        """Compute the mean per-sample loss.

        Args:
            pred_v: predicted values, one per sample (any shape that flattens
                to the batch size, e.g. (B,) or (B, 1)).
            v: target values, one per sample.
            pred_p: predicted action probabilities, actions on the last axis.
            p: target action probabilities, same shape as `pred_p`.

        Returns:
            A 0-dim tensor holding the batch-averaged loss.
        """
        # Flatten both sides so a (B, 1) prediction against a (B,) target is
        # compared element-wise instead of broadcasting to a (B, B) matrix.
        value_error = (pred_v.reshape(-1) - v.reshape(-1)) ** 2
        # Cross-entropy is summed over the action axis only, giving one value
        # per sample; summing over the whole batch would scale the policy term
        # by the batch size relative to the value term.
        policy_error = torch.sum(-p * (1e-15 + pred_p).log(), dim=-1).reshape(-1)
        return (value_error + policy_error).mean()

In [None]:
def compute_loss(net, batch_size, replay_buffer, optimizer, criterion):
    """Run one optimisation step on a random minibatch from the replay buffer.

    Args:
        net: network called as `net(state)` and returning `(pred_p, pred_v)`.
        batch_size: number of examples to draw from `replay_buffer`.
        replay_buffer: object whose `sample(batch_size)` returns
            `(state, p, v)`.
        optimizer: torch optimizer that is zeroed and stepped here
            (presumably built over `net`'s parameters; the caller decides).
        criterion: callable `(pred_v, v, pred_p, p) -> loss tensor`.

    Returns:
        The minibatch loss as a detached tensor.
    """
    state, p, v = replay_buffer.sample(batch_size)
    # `dtype` is the module-level tensor type (CPU or CUDA float) chosen in
    # the CUDA setup cell.
    state = torch.tensor(np.float32(state)).type(dtype)
    p = torch.tensor(np.float32(p)).type(dtype)
    v = torch.tensor(v).type(dtype)

    pred_p, pred_v = net(state)
    loss = criterion(pred_v, v, pred_p, p)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Callers collect the returned losses in a list; detach so each stored
    # value does not keep its autograd graph alive.
    return loss.detach()

In [None]:
def hard_update(q_network, target_q_network):
    """Overwrite the target network's parameters with the source network's.

    Parameters are paired positionally via `.parameters()`; a pair that is
    the very same object in both networks is left untouched. Only
    parameters are copied; anything not yielded by `.parameters()` is not
    touched by this function.

    Args:
        q_network: source network whose parameter values are read.
        target_q_network: destination network, modified in place.
    """
    pairs = zip(target_q_network.parameters(), q_network.parameters())
    for target_p, source_p in pairs:
        if target_p is not source_p:
            target_p.data.copy_(source_p.data)

In [None]:
# Read every line of the sequence file; the `with` block closes the handle
# even if reading fails.
with open('seqs.txt', 'r') as file:
    seq_list = file.readlines()
max_length = 100
# Keep only lines whose raw length is exactly max_length. Lines come from
# readlines(), so the count includes the trailing newline (later cells strip
# it with [:-1]).
# NOTE(review): the previous comment described this filter as "length at most
# max_length", but the code has always tested equality; confirm the intent.
seqs = [s for s in seq_list if len(s) == max_length]
# Hold out a random 10% of the sequences for evaluation.
test_seqs = random.sample(seqs, len(seqs) // 10)
# Set lookup keeps the train/test split linear instead of quadratic.
held_out = set(test_seqs)
train_seqs = [s for s in seqs if s not in held_out]

In [None]:
async def train():
    """Perform one gradient step on `dualres` and return the resulting loss.

    Sleeps for 5 seconds, then builds an SGD optimizer and an AlphaLoss
    criterion and delegates to `compute_loss`. `dualres`, `batch_size` and
    `replay_buffer` are read from module scope.

    NOTE(review): a fresh optimizer is created on every call, so the SGD
    momentum buffers never carry over between calls. Confirm this is intended.
    """
    await asyncio.sleep(5)
    sgd = optim.SGD(
        dualres.parameters(),
        lr=0.001,
        momentum=0.9,
        weight_decay=0.0001,
    )
    return compute_loss(dualres, batch_size, replay_buffer, sgd, AlphaLoss())

In [None]:
async def play(num_iter, num_games):
    '''
    Plays num_games self-play games with the current network (dualres).

    For each game, every one of the len(seq) - 1 moves contributes all
    symmetric variants returned by get_syms (8 per move), so
    8 * (len(seq) - 1) data points (s_t, pi_t, z) are pushed to the
    module-level replay_buffer, where z is the score of the game's final state.

    Args:
        num_iter: forwarded to MCTS as its third argument (presumably the
                  number of simulations per move; confirm against MCTS).
        num_games: number of games to play.
    '''
    # Dedicated name for the timer: the inner loop previously reused `s` and
    # clobbered the start time, making the reported duration meaningless.
    start = time.perf_counter()
    for g in range(num_games):
        # Entries of train_seqs still carry their trailing newline.
        seq = random.sample(train_seqs, 1)[0][:-1]
        clear_output(wait = True)
        print("Game {} / {}: {}".format(g + 1, num_games, seq))
        env = HP2D(seq, (10,31,31))
        states = []
        probs = []
        state = env.make_state()
        for t in range(len(seq) - 1):
            # temp is 1 for the first tenth of the moves and 0 afterwards.
            temp = int(t < len(seq) // 10)
            mcts = MCTS(env, dualres, num_iter, cpuct = 5)
            pi_t = mcts.get_prob(state, temp = temp)
            for sym_state, sym_pi in get_syms(state, pi_t):
                states.append(sym_state)
                probs.append(sym_pi)
            action = np.random.choice(len(pi_t), p = pi_t)
            state = env.next_state(state, action)
        # Score the state reached after the last move (as evaluate() does);
        # states[-1] is the position *before* the final move.
        reward = env.calc_score(state)
        for sym_state, sym_pi in zip(states, probs):
            replay_buffer.push(sym_state, sym_pi, reward)
    elapsed = time.perf_counter() - start
    print('MCTS took {} seconds'.format(elapsed))

In [None]:
async def evaluate(num_iter, num_rounds = 25, games_per_round = 100, cpuct = 5):
    '''
    Evaluates the current network (dualres) on held-out sequences.

    Runs num_rounds rounds; each round samples up to games_per_round
    sequences from test_seqs and plays every one with temp = 0.

    Args:
        num_iter: forwarded to MCTS as its third argument.
        num_rounds: number of sampling rounds (default 25).
        games_per_round: sequences sampled per round (default 100), capped
                         at len(test_seqs) so small test sets do not raise.
        cpuct: exploration constant forwarded to MCTS (default 5, the value
               used by play()).

    Returns:
        Mean of calc_score(final_state) / hyp_max() over all games played,
        or 0.0 if no game was played. Expected to lie in [0, 1] provided the
        score never exceeds the hypothetical maximum.
    '''
    # Accumulate across ALL rounds; resetting inside the loop discarded
    # every round but the last.
    total = 0.0
    games_played = 0
    for _ in range(num_rounds):
        batch = random.sample(test_seqs, min(games_per_round, len(test_seqs)))
        for line in batch:
            # Strip the trailing newline once so the environment and the move
            # loop agree on the sequence length.
            seq = line[:-1]
            env = HP2D(seq, (10,31,31))
            state = env.make_state()
            for t in range(len(seq) - 1):
                mcts   = MCTS(env, dualres, num_iter, cpuct = cpuct)
                pi     = mcts.get_prob(state, temp = 0)
                action = np.random.choice(len(pi), p = pi)
                state  = env.next_state(state, action)
            total += env.calc_score(state) / env.hyp_max()
            games_played += 1
    if games_played == 0:
        return 0.0
    return total / games_played

In [None]:
def get_syms(state, pi):
    """
    Build the symmetric variants of one training example.

    Input:
        state: array with at least 3 axes; axes 1 and 2 are rotated together
               and axis 1 is the one mirrored by np.fliplr
        pi: policy vector with exactly 4 entries
    Returns:
        A list of 8 (state, pi) tuples. For each of 1..4 quarter turns the
        mirrored variant comes first, followed by the plain rotation; the
        last entry (4 turns, not mirrored) equals the input. Every pi is
        returned as a plain list permuted to match its state.
    """
    assert len(pi) == 4

    def turn(vec, times):
        # One quarter turn re-indexes the four entries as (1, 3, 0, 2).
        for _ in range(times):
            vec = [vec[1], vec[3], vec[0], vec[2]]
        return vec

    def mirror(vec):
        # Mirroring swaps the first and last entries.
        return [vec[3], vec[1], vec[2], vec[0]]

    symmetries = []
    for quarter_turns in (1, 2, 3, 4):
        rotated_state = np.rot90(state, quarter_turns, (1, 2))
        rotated_pi = turn(pi, quarter_turns)
        symmetries.append((np.fliplr(rotated_state), mirror(rotated_pi)))
        symmetries.append((rotated_state, rotated_pi))
    return symmetries

In [None]:
async def main(num_timesteps, num_games, num_iter, batch_size):
    '''
    Schedules play, train and (every 1000th timestep) evaluate as asyncio
    tasks and waits for all of them before moving to the next timestep.

    Args:
        num_timesteps: number of play/train rounds to run.
        num_games: games per round, forwarded to play().
        num_iter: forwarded to play() and evaluate().
        batch_size: NOTE(review): not used inside this function; train()
                    takes no arguments, so this value is not passed on here.

    Returns:
        (losses, scores): one loss per timestep and one score per evaluation.
    '''

    losses = []
    scores = []

    for ts in range(1, num_timesteps + 1):
        # ensure_future schedules on the running event loop, so this function
        # does not depend on a module-level `loop` variable.
        # Data generation
        playing = asyncio.ensure_future(play(num_iter, num_games))

        # Update params
        training = asyncio.ensure_future(train())
        tasks = [playing, training]

        # Evaluate agent
        evaluating = None
        if ts % 1000 == 0:
            evaluating = asyncio.ensure_future(evaluate(num_iter))
            tasks.append(evaluating)

        await asyncio.wait(tasks)

        # asyncio.wait does not raise task exceptions; ask for the result so
        # a failure during self-play is surfaced rather than silently dropped.
        playing.result()
        if evaluating is not None:
            scores.append(evaluating.result())
        losses.append(training.result())

    return losses, scores

In [None]:
# Manual single-game self-play run: the same procedure as play(), with a tqdm
# progress bar over the moves and 100 passed as the MCTS third argument.
for g in range(1):
    # Entries of train_seqs still carry their trailing newline.
    seq = random.sample(train_seqs, 1)[0][:-1]
    clear_output(wait = True)
    print("Game {} / {}: {}".format(g + 1, 1, seq))
    env = HP2D(seq, (10,31,31))
    states = []
    probs = []
    state = env.make_state()
    for t in tqdm(range(len(seq) - 1)):
        # temp is 1 for the first tenth of the moves and 0 afterwards.
        temp = int(t < len(seq) // 10)
        mcts = MCTS(env, dualres, 100, cpuct = 5)
        pi_t = mcts.get_prob(state, temp = temp)
        sym = get_syms(state, pi_t)
        for s, p in sym:
            states.append(s)
            probs.append(p)
        action = np.random.choice(len(pi_t), p = pi_t)
        state = env.next_state(state, action)
    # Score the state reached after the last move; states[-1] is the position
    # *before* the final move was applied.
    reward = env.calc_score(state)
    for i in range(len(states)):
        replay_buffer.push(states[i], probs[i], reward)

In [None]:
# Smoke-test settings: one timestep, one game, 10 as the MCTS num_iter and a
# minibatch of one. train() reads `batch_size` from module scope, so it has
# to stay a global.
num_timesteps, num_games, num_iter, batch_size = 1, 1, 10, 1

# Kept as a module-level name; the training run is driven to completion on it.
loop = asyncio.get_event_loop()

losses, scores = loop.run_until_complete(
    main(
        num_timesteps=num_timesteps,
        num_games=num_games,
        num_iter=num_iter,
        batch_size=batch_size,
    )
)