In [1]:
from google.colab import drive
import os
import sys

# Mount Google Drive into the Colab runtime.
drive.mount('/content/drive');

# Add the project folder on Drive to the import path so that modules stored
# there can be imported. The membership check keeps the cell idempotent:
# re-running it does not append the same entry to sys.path again.
project_dir = '/content/drive/My Drive/Projects/TTT/'
if project_dir not in sys.path:
    sys.path.append(project_dir)

Mounted at /content/drive


In [2]:
from game import TicTacToe

In [17]:
import math
import random
import pickle as pkl
from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.utils.data import Dataset

### **Training data**
Each training example is a `Transition` namedtuple `(state, action, next_state, reward)` recorded from past play.


In [4]:
# One recorded step of experience, with the four fields
# state, action, next_state and reward (in that order).
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'next_state', 'reward'],
)

class ReplayMemory:
    """Bounded store of ``Transition`` records.

    The records live in a ``deque`` created with ``maxlen=capacity``, so once
    the store is full each new record pushes out the oldest one.
    """

    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Build a ``Transition`` from ``args`` and store it."""
        transition = Transition(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` stored records chosen by ``random.sample``."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of records currently stored."""
        return len(self.memory)


### **Define network**

In [5]:
class DQN(nn.Module):
    '''
    Fully connected network that maps a state vector to one value per action.

        n_observations : number of input features describing a state
        n_actions : number of actions; the output has one entry per action
        hidden_features : width of the two hidden layers (default 64)
    '''

    def __init__(self, n_observations, n_actions, hidden_features=64):
        # Initialise nn.Module first so its parameter / sub-module registries
        # exist before any attribute is assigned on the instance.
        super().__init__()

        self.features_A = hidden_features

        # The attribute names below are the prefixes of the state_dict keys,
        # so they are kept as-is (there is intentionally no ``layer3``).
        self.layer1 = nn.Linear(n_observations, self.features_A)
        self.layer2 = nn.Linear(self.features_A, self.features_A)
        self.layer4 = nn.Linear(self.features_A, n_actions)

    def forward(self, x):
        '''
        Called with either one state (to choose the next action) or a batch
        of states (during optimization). Returns a tensor whose last
        dimension has n_actions entries.
        '''
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer4(x)

### **Hyperparameters and  Utilities**

In [6]:
# Training hyperparameters
#   BATCH_SIZE : number of transitions in each optimisation batch
#   GAMMA      : discount factor applied to the next-state value
#   TAU        : update rate of the target network
#   LR         : learning rate passed to the ``AdamW`` optimizer
BATCH_SIZE = 128
GAMMA = 0.55
TAU = 0.005
LR = 1e-4

# Sizes of the network's input (observation) and output (action) vectors.
n_observations = 9
n_actions = 9

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Two networks with the same architecture; the target network starts out as
# an exact copy of the policy network's weights.
policy_net = DQN(n_observations, n_actions).to(device)
target_net = DQN(n_observations, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())

# Only the policy network's parameters are handed to the optimizer.
optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)

### **Helper tools**

In [18]:
def one_hot_and_tensify(position):
    """Encode ``position`` as a length-9 one-hot ``int64`` tensor on ``device``.

    Entry ``i`` is 1 where ``i`` equals ``position`` and 0 everywhere else; a
    ``position`` outside ``0..8`` therefore yields an all-zero tensor.
    """
    encoding = []
    for cell in range(9):
        encoding.append(1 if cell == position else 0)
    return torch.tensor(encoding, dtype=torch.int64, device=device)

def make_reward_tensor(reward):
    """Wrap ``reward`` in an ``int64`` tensor placed on ``device``."""
    reward_tensor = torch.tensor(reward, dtype=torch.int64, device=device)
    return reward_tensor

class MemoryDataset(Dataset):
    """``Dataset`` wrapper around an indexable collection of raw entries.

    Each entry is read positionally as (state, action, next_state, reward)
    and converted to tensors when it is fetched.
    """

    def __init__(self, raw_memory):
        self.raw_memory = raw_memory

    def __len__(self):
        return len(self.raw_memory)

    def __getitem__(self, idx):
        """Return (state, action, next_state, reward) tensors for entry ``idx``.

        state / next_state become float32 tensors, action is one-hot encoded
        by ``one_hot_and_tensify`` and reward goes through
        ``make_reward_tensor``.
        """
        entry = self.raw_memory[idx]
        return (
            torch.tensor(list(entry[0]), dtype=torch.float32),
            one_hot_and_tensify(entry[1]),
            torch.tensor(list(entry[2]), dtype=torch.float32),
            make_reward_tensor(entry[3]),
        )

### **Format incoming data for model**

- (state, action, next state, reward) -> (float32 tensor, one-hot int64 tensor, float32 tensor, int64 tensor)

In [19]:
# Load the pickled training memory from the project folder on Drive.
# The path is relative, so it resolves against the current working directory.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('drive/My Drive/Projects/TTT/data/memory_0.pkl', 'rb') as file:
    raw_train_memory = pkl.load(file)

# Report the size of the data that was just loaded (the variable defined
# above, not a name left over from an earlier kernel session).
print(f'Number of (state, action, next_state,  reward) items is {len(raw_train_memory)}')

Number of (state, action, next_state,  reward) items is 711130


#### **Sandbox to troubleshoot mask issues**

In [26]:
# Sandbox: build the training dataloader and print tensor shapes for the
# first few batches.
dataset = MemoryDataset(raw_train_memory)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

rounds = 5
i = 0

# NOTE(review): the counter is checked before the batch is inspected, so the
# loop stops on batch number `rounds` and only rounds - 1 batches are printed.
for batch in dataloader:
    state_batch, action_batch, next_state_batch, reward_batch = batch

    i += 1
    if i == rounds:
        break

    # Per-row "is not None" flags and the rows that pass that check.
    is_non_final = [s is not None for s in next_state_batch]
    non_final_mask = torch.tensor(is_non_final, device=device, dtype=torch.bool)

    kept_next_states = [s for s in next_state_batch if s is not None]
    non_final_next_states = torch.stack(kept_next_states, dim=0)

    golden_dim = state_batch.shape[0]

    print(f'dimension = {golden_dim}\tstate_batch shape {state_batch.shape}\t mask shape {non_final_mask.shape}\t next_states shape {non_final_next_states.shape}')



dimension = 128	state_batch shape torch.Size([128, 9])	 mask shape torch.Size([128])	 next_states shape torch.Size([128, 9])
dimension = 128	state_batch shape torch.Size([128, 9])	 mask shape torch.Size([128])	 next_states shape torch.Size([128, 9])
dimension = 128	state_batch shape torch.Size([128, 9])	 mask shape torch.Size([128])	 next_states shape torch.Size([128, 9])
dimension = 128	state_batch shape torch.Size([128, 9])	 mask shape torch.Size([128])	 next_states shape torch.Size([128, 9])


## **Training**


In [27]:
# training dataloader
dataset = MemoryDataset(raw_train_memory)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Huber loss. The module holds no per-batch state, so it is built once
# instead of on every iteration.
criterion = nn.SmoothL1Loss()

for state_batch, action_batch, next_state_batch, reward_batch in dataloader:

    state_batch = state_batch.to(device)
    action_batch = action_batch.to(device)
    next_state_batch = next_state_batch.to(device)
    reward_batch = reward_batch.to(device)

    # The last batch may be smaller than BATCH_SIZE.
    current_batch_size = state_batch.shape[0]

    # Mask of non-final states.
    # The DataLoader collates the next states into one dense tensor, so every
    # row is a tensor and a per-row `s is not None` test is True for all rows.
    # The mask is therefore all True and every next state is passed on.
    # NOTE(review): if the pickled data marks terminal transitions in some
    # other way (e.g. a sentinel board), they must be masked out here so that
    # their next-state value stays 0 - confirm against the data producer.
    non_final_mask = torch.ones(current_batch_size, device=device, dtype=torch.bool)
    non_final_next_states = next_state_batch

    # action_batch is one-hot; argmax turns it back into a column of action
    # indices with shape (batch, 1) as required by gather().
    actions_taken = action_batch.argmax(dim=1, keepdim=True)

    # Forward pass: compute Q(s_t, .) with policy_net and pick, for each row,
    # the value of the action that was actually taken.
    state_action_values = policy_net(state_batch).gather(1, actions_taken)
    state_action_values = state_action_values.squeeze(1)

    # Compute V(s_{t+1}) for all next states with the target network,
    # selecting the best action value with max(1)[0]. Rows not covered by the
    # mask keep the initial value 0.
    next_state_values = torch.zeros(current_batch_size, device=device)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0]

    # Compute the expected Q values (TD target).
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    loss = criterion(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()

    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()

    # Soft update of the target network: theta' <- TAU*theta + (1 - TAU)*theta'.
    # Without this step target_net keeps its initial weights for the whole
    # run and TAU is never used.
    target_net_state_dict = target_net.state_dict()
    policy_net_state_dict = policy_net.state_dict()
    for key in policy_net_state_dict:
        target_net_state_dict[key] = (
            policy_net_state_dict[key] * TAU
            + target_net_state_dict[key] * (1 - TAU)
        )
    target_net.load_state_dict(target_net_state_dict)

In [None]:
def pick_from_DQN(player, board, *, validate_move = False):
    """Choose a board position for ``player`` using ``policy_net``.

    Parameters
    ----------
    player : str
        When equal to ``'o'`` the sign of every board entry is flipped before
        the board is fed to the network (presumably so the network always
        sees the position from the mover's point of view - confirm).
    board : sequence of numbers
        Current board; entries equal to 0 are treated as empty cells.
    validate_move : bool, keyword-only
        If True, return the highest-valued position whose cell is empty.
        If False, return the highest-valued position regardless of occupancy.

    Returns
    -------
    int
        Index of the chosen position.

    Raises
    ------
    ValueError
        If ``validate_move`` is True and the board has no empty cell.
    """
    if player == 'o' :
        board = [x * -1 for x in board]

    state = torch.tensor(board, dtype=torch.float32, device=device)

    with torch.no_grad():
        # pass the state through the network
        q_values = policy_net(state)

    if not validate_move :
        # select the action with the highest Q-value
        return int(torch.argmax(q_values).item())

    # walk the positions from highest to lowest Q-value and take the first
    # one whose cell is still empty
    for index in torch.argsort(q_values, descending = True).tolist():
        if state[index] == 0:
            return index

    # Previously this path fell through to an unbound local variable.
    raise ValueError('pick_from_DQN: no empty cell left on the board')

In [None]:
from game import train_Q_table
from game import pick_from_greedy_heuristic
from game import pick_from_Q_table
from game import pick_from_random

### **Train Q Learning Table**

In [None]:
# Switch for (re)training the tabular Q-learning agent. While it is False the
# rest of this cell is skipped.
train_q = False

if train_q:
    competitor_policy_training = 'greedy'
    training_rounds = 50000

    q_learning_rate = 0.3
    q_discount_factor = 0.55

    q_table, memory = train_Q_table(
        rounds=training_rounds,
        learning_rate=q_learning_rate,
        discount_factor=q_discount_factor,
        competitor_policy=competitor_policy_training,
    )

    # Summary of the settings used for this training run.
    report_lines = [
        'Q Table training Information :',
        f'\tTraining rounds : {training_rounds}',
        f'\tCompetitor training policy : {competitor_policy_training}',
        f'\n\tQ Learning Rate : {q_learning_rate}',
        f'\tQ Discount Factor : {q_discount_factor}',
    ]
    print('\n'.join(report_lines))

## **Run Trial**

In [None]:
# Number of games to play in the trial.
trial_rounds = 5_000

In [None]:
# Result counters, keyed by the value of ttt.winner:
#   1 / -1 : wins per win direction ('h', 'v', 'd') plus a histogram of
#            ttt.age at the end of the won games (indices 0..9)
#   -2     : plain counter for games that end with ttt.winner == -2
stats = {
    winner_id: {'h': 0, 'v': 0, 'd': 0, 'win_lengths': [0] * 10}
    for winner_id in (1, -1)
}
stats[-2] = 0

for i in range(trial_rounds):

    # alternate who goes first
    first_move_x = (i % 2 == 0)
    ttt = TicTacToe(first_move_x=first_move_x)

    while ttt.win == False and ttt.age < 9:
        player = 'x' if ttt.turn == 1 else 'o'

        # Policy selection. Any of these can be swapped in for either side:
        #   pick_from_random(ttt.board)
        #   pick_from_greedy_heuristic(player, ttt.board)
        #   pick_from_Q_table(player, q_table, ttt.board)
        #   pick_from_DQN(player, ttt.board, validate_move = True)
        if player == 'x':
            position = pick_from_DQN(player, ttt.board, validate_move=True)
        else:
            position = pick_from_random(ttt.board)

        # The move is made outside the assert so it still runs under -O.
        # NOTE(review): 100 looks like the "move accepted" code of
        # TicTacToe.move - confirm in the game module.
        result = ttt.move(player, position)
        assert result == 100

    # keep track of stats
    winner = ttt.winner
    if winner == -2:
        stats[winner] += 1
    else:
        winner_stats = stats[winner]
        winner_stats[ttt.win_dir] += 1
        winner_stats['win_lengths'][ttt.age] += 1


In [None]:
# Total wins per player = sum of the per-direction win counters.
# Summing the named counters (rather than every int value in the dict) keeps
# this cell idempotent: on a re-run the previously stored 'total' is not
# added to itself.
for player_id in (1, -1):
    stats[player_id]['total'] = sum(
        stats[player_id][direction] for direction in ('h', 'v', 'd')
    )

In [None]:
# Trial summary: win counts, win rates (3 significant digits) and the
# win-length histogram for each player, followed by the draw count.
# The Player -1 line no longer prints a stray ')' after the histogram, so it
# has the same layout as the Player 1 line.
print(f'\n\tTrial Rounds : {trial_rounds}')
print(f'\tPlayer 1 wins : {stats[1]["total"]}\t ({stats[1]["total"] / trial_rounds : .3})\t {stats[1]["win_lengths"]}')
print(f'\tPlayer -1 wins : {stats[-1]["total"]}\t ({stats[-1]["total"] / trial_rounds : .3})\t {stats[-1]["win_lengths"]}')
print(f'\tDraws : {stats[-2]}\t\t ({stats[-2] / trial_rounds : .3})')

In [None]:
# Persist the trained policy network in two forms:
#   model.pth         - the whole pickled module (loading it requires the
#                       DQN class to be importable)
#   model_weights.pth - only the parameters, i.e. the state_dict, to be
#                       restored with DQN(...).load_state_dict(torch.load(...))
# Previously both files received the full module, so the "weights" file did
# not contain a state_dict.
torch.save(policy_net, 'drive/My Drive/Projects/TTT/model.pth')
torch.save(policy_net.state_dict(), 'drive/My Drive/Projects/TTT/model_weights.pth')