In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [2]:
import json
import re

In [3]:
from deep_rl import create_action_value_dict, create_value_action_dict, create_state, available_moves
from player import Player
from hand import YatzyHand

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
lr = 0.01

In [6]:
loss_fn = F.cross_entropy

In [7]:
value_dict = create_value_action_dict(Player())
action_dict = create_action_value_dict(value_dict)

## Setting Up Data

In [44]:
inputs = []
outputs = []

with open('300data.json', 'r') as file:
    data = json.load(file)

for key, value in data.items():
    state = re.findall(f'\d+', key)
    state = [float(x) for x in state]
    state = torch.tensor(state, device=device)
    inputs.append(state)

    move = action_dict[value]
    outputs.append(torch.tensor(move, device=device))


In [45]:
input_tensor = torch.stack(inputs)
output_tensor = torch.stack(outputs)

In [46]:
train_ds = TensorDataset(input_tensor, output_tensor)

In [47]:
train_dl = DataLoader(train_ds, batch_size=64)

## Helper Functions

## Model

In [48]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.softmax = nn.Softmax(dim=0)
        self.layer1 = nn.Linear(20, 64)
        self.layer2 = nn.Linear(64, 32)
        self.layer3 = nn.Linear(32, 46)
    
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.softmax(self.layer3(x))

In [49]:
def get_model():
    model = Model()
    opt = optim.SGD(model.parameters(), lr = lr)
    return model, opt

## Evaluation

In [206]:
class EvalGame():
    def __init__(self, player, num):
        self.player = player
        self.scores = []
        self.num = num

    def get_indices(self, action):
        temp = re.findall(f'\d', value_dict[action])
        return [int(x) for x in temp]

    
    def evaluate(self):
        player = self.player

        for i in range(self.num):
            player.reset_scoresheet()

            turns = 0
            while turns < 15:
                self.turn(player)
                turns += 1
            
            score = player.calculate_score()
            self.scores.append(score)

    def turn(self, player):
        hand = YatzyHand()

        rerolls = 0
        while rerolls < 2:

            move = player.choose_move(hand, reroll=True)

            if move >= 15:
                rerolls += 1
                indices = self.get_indices(move)

                hand = hand.reroll(indices)
            else:
                break

        move = player.choose_move(hand, reroll=False)

        action = value_dict[move]

        score = getattr(hand, action)()

        player.update_scoresheet(action, score)
        
    def results(self):
        return sum(self.scores) / len(self.scores)

In [207]:
class EvalBot(Player):
    def __init__(self):
        super().__init__()
        self.model = Model()
    
    def load(self, state_dict):
        self.model.load_state_dict(state_dict)
        self.model.eval()
        
    def available_moves(self, reroll=False):
        final = []
        i = 0
        for key, value in self.scoresheet.items():
            if value == None:
                final.append(i)
            i += 1
        
        if reroll:
            for i in range(15, 46):
                final.append(i)
                i += 1
                
        return torch.tensor(final)


    
    def choose_move(self, hand, reroll=False):
        state = create_state(self.scoresheet, hand)
        options = self.available_moves(reroll=reroll)

        with torch.no_grad():
            
            available_mask = torch.tensor(tuple(map(lambda x: x not in options, range(46))), device=device, dtype=torch.bool)
            
            results = model(state)
            results[available_mask] = 0
            
            return torch.argmax(results).view(1).item()
            

In [208]:
def evaluate(num, state_dict):
    # evaluates a model by having it play num games; pass in the state_dicts
    #returns the average score over those games
    
    player = EvalBot()
    player.load(state_dict)

    game = EvalGame(player, num)

    game.evaluate()

    return game.results()

## Training

In [209]:
def train(epochs):

    start_avg = evaluate(100, model.state_dict())
    print('Starting average: {}'.format(start_avg))
    
    for i in range(epochs):
        
        for x, y in train_dl:
            preds = model(x)
            loss = loss_fn(preds, y)

            opt.zero_grad()
            loss.backward()
            opt.step()
        
        print('Epoch {} done'.format(i))
        avg = evaluate(100, model.state_dict())
        print('Average score: {}'.format(avg))

    
        

In [215]:
model, opt = get_model()


In [216]:
train(100)

Starting average: 72.53
Epoch 0 done
Average score: 65.65
Epoch 1 done
Average score: 56.36
Epoch 2 done
Average score: 59.15
Epoch 3 done
Average score: 57.84
Epoch 4 done
Average score: 67.04
Epoch 5 done
Average score: 66.61
Epoch 6 done
Average score: 69.69
Epoch 7 done
Average score: 68.42
Epoch 8 done
Average score: 71.07
Epoch 9 done
Average score: 70.77
Epoch 10 done
Average score: 67.18
Epoch 11 done
Average score: 71.57
Epoch 12 done
Average score: 72.49
Epoch 13 done
Average score: 69.09
Epoch 14 done
Average score: 72.91
Epoch 15 done
Average score: 72.2
Epoch 16 done
Average score: 71.76
Epoch 17 done
Average score: 72.31
Epoch 18 done
Average score: 70.23
Epoch 19 done
Average score: 76.35
Epoch 20 done
Average score: 72.9
Epoch 21 done
Average score: 72.52
Epoch 22 done
Average score: 72.14
Epoch 23 done
Average score: 71.3
Epoch 24 done
Average score: 69.65
Epoch 25 done
Average score: 75.92
Epoch 26 done
Average score: 70.6
Epoch 27 done
Average score: 68.93
Epoch 28 d