In [345]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import time
import pickle
import copy
from collections import deque

# torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [346]:
class GameEnvironment:
    """Minimal single-player Yahtzee environment.

    Attributes:
        hand: list of die faces currently held (at most ``HAND_SIZE`` after a roll).
        rolls_left: rolls remaining in the current turn.
        score_card: category name -> recorded score, where ``-1`` marks a
            category that has not been scored yet; ``'total_score'`` holds the
            running total.
        label_score_dict: action index (0-12) -> category name.
    """

    # Number of dice in a complete hand.
    HAND_SIZE = 5

    def __init__(self):
        self.hand = []
        self.rolls_left = 3
        self.score_card = {
            'ones': -1, 'twos': -1, 'threes': -1,
            'fours': -1, 'fives': -1, 'sixes': -1,
            'three_of_a_kind': -1, 'four_of_a_kind': -1, 'full_house': -1,
            'small_straight': -1, 'large_straight': -1,
            'yahtzee': -1, 'chance': -1,
            'total_score': 0
        }
        self.label_score_dict = {
            0: 'ones', 1: 'twos', 2: 'threes', 3: 'fours', 4: 'fives', 5: 'sixes',
            6: 'three_of_a_kind', 7: 'four_of_a_kind', 8: 'full_house',
            9: 'small_straight', 10: 'large_straight', 11: 'yahtzee', 12: 'chance'
        }

    def get_state(self):
        """Return the flat state list: hand, then rolls_left, then every score-card value."""
        return self.hand + [self.rolls_left] + list(self.score_card.values())

    def get_score_card(self):
        """Return a shallow copy of the score card so callers can snapshot it."""
        return self.score_card.copy()

    def roll(self):
        """Top the hand up to ``HAND_SIZE`` dice with fresh rolls and use up one roll."""
        missing = self.HAND_SIZE - len(self.hand)
        self.hand = self.hand + [random.randint(1, 6) for _ in range(missing)]
        self.rolls_left -= 1

    def get_potential_scores(self):
        """Return what the current hand would score in every category.

        Categories that are already filled on the score card are reported as 0.
        An empty hand yields 0 everywhere instead of raising.
        """
        hand = self.hand
        faces = set(hand)
        hand_total = sum(hand)
        # Multiplicity of each distinct face, computed once for all "of a kind" checks.
        face_counts = sorted(hand.count(face) for face in faces)
        most_common = face_counts[-1] if face_counts else 0

        def has_run(start, length):
            # True when `length` consecutive faces beginning at `start` are all present.
            return all(face in faces for face in range(start, start + length))

        upper_names = ('ones', 'twos', 'threes', 'fours', 'fives', 'sixes')
        potential_scores = {
            name: face * hand.count(face)
            for face, name in enumerate(upper_names, start=1)
        }
        potential_scores.update({
            'three_of_a_kind': hand_total if most_common >= 3 else 0,
            'four_of_a_kind': hand_total if most_common >= 4 else 0,
            # A triple plus a pair; five of a kind does not count.
            'full_house': 25 if face_counts == [2, 3] else 0,
            'small_straight': 30 if any(has_run(start, 4) for start in (1, 2, 3)) else 0,
            'large_straight': 40 if any(has_run(start, 5) for start in (1, 2)) else 0,
            # Require a full hand so a partial hand of identical dice is not a Yahtzee.
            'yahtzee': 50 if most_common == self.HAND_SIZE else 0,
            'chance': hand_total,
        })

        # A category that has already been scored cannot be scored again.
        for score_type in potential_scores:
            if self.score_card[score_type] != -1:
                potential_scores[score_type] = 0
        return potential_scores

    def mark_score(self, chosen_score_type, chosen_score):
        """Record ``chosen_score`` under ``chosen_score_type`` and add it to the total."""
        self.score_card[chosen_score_type] = chosen_score
        self.score_card['total_score'] += chosen_score

    def reset(self):
        """Restore the initial state and return it."""
        self.__init__()
        return self.get_state()
    
class DiceModel(nn.Module):
    """Two-hidden-layer MLP whose outputs are squashed to (0, 1) by a sigmoid."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Layers are created in fc1 -> fc2 -> fc3 order and keep those names,
        # so seeded initialisation and state_dict keys are unaffected.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Apply ReLU after each hidden layer, then a sigmoid on the output layer."""
        for hidden_layer in (self.fc1, self.fc2):
            x = torch.relu(hidden_layer(x))
        return torch.sigmoid(self.fc3(x))
    
class ScoreModel(nn.Module):
    """Two-hidden-layer MLP that returns raw (unnormalised) output values."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Same layer names and creation order as before: fc1, fc2, fc3.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Run the two ReLU hidden layers and return the final linear output as-is."""
        first_hidden = self.fc1(x).relu()
        second_hidden = self.fc2(first_hidden).relu()
        return self.fc3(second_hidden)
    
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions for experience replay.

    Once ``capacity`` transitions are held, pushing a new one silently drops
    the oldest (behaviour of ``deque(maxlen=...)``).
    """

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        """Number of transitions currently stored; lets callers check before sampling."""
        return len(self.buffer)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` tuple."""
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        return random.sample(self.buffer, batch_size)

In [347]:
def select_dice(state, dice_model, EPSILON, THREHOLD = 0.5):
    """Choose which of the five dice to keep (1.0) or re-roll (0.0).

    Args:
        state: input passed unchanged to ``dice_model`` when exploiting.
        dice_model: callable returning one keep-score per die.
        EPSILON: probability of returning a uniformly random keep mask.
        THREHOLD: a die is kept when its model output is strictly greater
            than this value. (The parameter name is kept as-is so existing
            keyword callers continue to work.)

    Returns:
        Float tensor of 0.0/1.0 flags, one per die.
    """
    # Exploration
    if random.random() < EPSILON:
        # The upper bound of torch.randint is exclusive, so (0, 2) draws from {0, 1};
        # the previous (1, 2) could only ever produce ones, i.e. "keep everything".
        return torch.randint(0, 2, (5,)).float()
    # Exploitation: no gradient is needed just to pick an action.
    with torch.no_grad():
        probs = dice_model(state)
    return (probs > THREHOLD).float()

def evaluate_dice_reward(pre_potentials, post_potentials):
    """Score a keep/re-roll decision from the potentials before and after it.

    Args:
        pre_potentials: category -> potential score before the re-roll.
        post_potentials: category -> potential score after the re-roll.
            Both dicts must contain 'yahtzee', 'large_straight' and
            'small_straight', and ``pre_potentials`` must have every key of
            ``post_potentials``.

    Returns:
        ``value(post) - value(pre) + bonus`` where ``value`` is the summed
        potentials / 100 plus 5 per category with a positive potential.
    """
    def hand_value(potentials):
        # Summed potentials (scaled down) plus a flat 5 for each scorable category.
        scores = list(potentials.values())
        value = sum(scores) / 100
        value += len([score for score in scores if score > 0]) * 5
        return value

    pre_reward = hand_value(pre_potentials)
    post_reward = hand_value(post_potentials)

    # Bonuses for ending up with (or holding on to) the big combinations
    bonus = 0
    bonus += 50 if post_potentials['yahtzee'] > 0 else 0
    bonus += 25 if post_potentials['large_straight'] > 0 else 0
    bonus += 25 if pre_potentials['small_straight'] > 0 and post_potentials['small_straight'] > 0 else 0

    # Penalty for re-rolling away a category that was scorable before
    for score_type in post_potentials.keys():
        lost = pre_potentials[score_type] > 0 and post_potentials[score_type] == 0
        if not lost or score_type == 'chance':
            continue
        # Exactly one penalty tier applies per lost category. Previously the
        # first test was a separate `if`, so yahtzee/straights fell through to
        # the trailing `else` and were charged 50 + 5.
        if score_type in ['yahtzee', 'large_straight', 'small_straight']:
            bonus -= 50
        elif score_type in ['three_of_a_kind', 'four_of_a_kind', 'full_house']:
            bonus -= 10
        else:
            bonus -= 5

    # Final reward
    reward = post_reward - pre_reward + bonus
    return reward

def select_score(state, score_model, EPSILON):
    """Pick the index (0-12) of a still-open score category.

    Args:
        state: indexable state whose slice ``[6:-1]`` holds the 13 category
            entries, with ``-1`` meaning "not scored yet".
        score_model: callable mapping ``state`` to 13 logits.
        EPSILON: probability of choosing uniformly among the open categories.

    Returns:
        int index of the chosen category; never a category already filled.
    """
    card = state[6:-1]
    valid_actions = [idx for idx in range(13) if card[idx] == -1]

    # Exploration
    if random.random() < EPSILON:
        # Index a plain Python list so the result is an int, not a float
        # pulled out of a float tensor.
        pick = torch.randint(0, len(valid_actions), (1,)).item()
        return valid_actions[pick]

    # Exploitation
    with torch.no_grad():
        logits = score_model(state)
    blocked = torch.tensor(
        [idx not in valid_actions for idx in range(13)],
        dtype=torch.bool,
        device=logits.device,
    )
    # Mask before the softmax: masking afterwards can leave an all-zero vector
    # (and then NaNs) when a filled category dominates the logits.
    probs = torch.softmax(logits.masked_fill(blocked, float('-inf')), dim=0)
    return torch.multinomial(probs, 1).item()

def evaluate_score_reward(pre_score_card, post_score_card, potential_scores, score_decision, score_amount):
    """Shape the reward for writing ``score_amount`` into ``score_decision``.

    Starts from the scored amount and then:
      * subtracts the gap to the best potential score that was available,
      * adds 50 when an upper-section category was chosen and the positive
        entries among the first six score-card values now total at least 63,
      * subtracts a penalty for every category that went from -1 to 0,
      * subtracts 10 when the chosen category was filled below its floor.
    """
    upper_section = ('ones', 'twos', 'threes', 'fours', 'fives', 'sixes')
    # Penalty for zeroing out a category (categories not listed cost nothing).
    zeroed_penalty = {
        'yahtzee': 50,
        'three_of_a_kind': 30, 'small_straight': 30, 'full_house': 30,
        'four_of_a_kind': 15, 'large_straight': 15,
        'fours': 15, 'fives': 15, 'sixes': 15,
    }
    # Scores strictly below these floors count as "too low" for the category.
    low_score_floor = {
        'three_of_a_kind': 20, 'four_of_a_kind': 15,
        'sixes': 18, 'fives': 15, 'fours': 12, 'chance': 15,
    }

    # Base reward minus the regret against the best available option
    best_available = max(potential_scores.values())
    reward = score_amount - (best_available - score_amount)

    # Reward reaching the 63 mark for the top bonus
    if score_decision in upper_section:
        first_six = list(post_score_card.values())[:6]
        upper_total = sum(value if value > 0 else 0 for value in first_six)
        if upper_total >= 63:
            reward += 50

    # Penalize choosing 0 for important score types
    for score_type in potential_scores:
        newly_zeroed = pre_score_card[score_type] == -1 and post_score_card[score_type] == 0
        if newly_zeroed:
            reward -= zeroed_penalty.get(score_type, 0)

    # Penalize choosing low scores for important score types
    if score_decision in low_score_floor and score_amount < low_score_floor[score_decision]:
        reward -= 10

    return reward

def update_model(model, target_model, optimizer, experiences, GAMMA):
    """Run one Q-learning gradient step on a batch of transitions.

    Args:
        model: network being trained; maps a batch of states to one value per action.
        target_model: network used to evaluate next states (not updated here).
        optimizer: optimizer over ``model``'s parameters.
        experiences: iterable of ``(state, action, reward, next_state, done)``
            where states are tensors of equal shape and ``action`` is an index
            into the model output.
        GAMMA: discount factor applied to the next-state value.

    Returns:
        The batch MSE loss as a Python float.
    """
    # Unpack experiences into separate tuples, then convert into tensors
    states, actions, rewards, next_states, dones = zip(*experiences)
    states = torch.stack(states)
    next_states = torch.stack(next_states)
    # Explicit dtypes: gather() needs int64 indices even if actions arrive as
    # floats, and integer rewards must not produce an integer tensor.
    actions = torch.tensor(actions, dtype=torch.long)
    rewards = torch.tensor(rewards, dtype=torch.float32)
    dones = torch.tensor(dones, dtype=torch.float32)

    # Q(s, a) for the actions actually taken; squeeze only the gathered column
    # so a batch of one stays 1-D and matches the target's shape.
    current_q = model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # Bootstrapped target; terminal transitions contribute the reward only.
    with torch.no_grad():
        next_q = target_model(next_states).max(1)[0]
    target_q = rewards + GAMMA * next_q * (1 - dones)

    # Compute loss
    loss = F.mse_loss(current_q, target_q)

    # Backpropagate and update model
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

In [348]:
def train_models(num_episodes, dice_model, score_model, GAMMA=0.99, EPSILON=0.1, THRESOLD=0.5):
    """Play ``num_episodes`` games, choosing dice and scores with the two models.

    Each turn rolls up to three times (two keep decisions via ``select_dice``),
    picks a category via ``select_score``, computes shaped rewards and then
    attempts an optimizer step on both models.

    NOTE(review): a later cell in this notebook defines ``train_models`` again,
    so once that cell has run this definition is shadowed.
    NOTE(review): ``GAMMA`` and ``THRESOLD`` are accepted but never used in
    this body.
    """
    optimizer_dice = optim.Adam(dice_model.parameters())
    optimizer_score = optim.Adam(score_model.parameters())

    for episode in range(num_episodes):
        # Initialize environment
        env = GameEnvironment()
        state = torch.FloatTensor(env.reset())
        # NOTE(review): assigned here but never read or updated below.
        total_score_reward = 0

        # A game lasts until no score-card entry is still at its -1 placeholder.
        while any(value < 0 for value in env.score_card.values()):
            # Initialize variables for each turn
            states = []
            dice_decisions = []
            score_potentials = []
            pre_score_card = env.get_score_card()
            # NOTE(review): env.hand is not cleared here; roll() only draws
            # 5 - len(hand) dice, so after the first turn the first roll of a
            # turn appears to reuse the previous hand — confirm intent.
            env.rolls_left = 3

            # First two rolls
            for _ in range(2):
                # Roll dice
                env.roll()
                state = torch.FloatTensor(env.get_state())
                states.append(state)
                score_potentials.append(env.get_potential_scores())
                # Select dice to keep
                dice_decision = select_dice(state, dice_model, EPSILON)
                dice_decisions.append(dice_decision)
                # Keep only the dice flagged with 1; the rest are re-rolled next.
                env.hand = [env.hand[x] for x in range(5) if dice_decision[x] == 1]        
            
            # Final roll
            env.roll()
            # NOTE(review): `state` is not refreshed after this roll, so the
            # tensor appended here is the one captured after the second roll.
            states.append(state)
            score_potentials.append(env.get_potential_scores())

            # Calculate dice model rewards
            dice_rewards = [
                evaluate_dice_reward(score_potentials[0], score_potentials[1]),  
                evaluate_dice_reward(score_potentials[1], score_potentials[2])  
            ]

            # Select score
            state = torch.FloatTensor(env.get_state())
            score_decision = env.label_score_dict[select_score(state, score_model, EPSILON)]
            score_amount = env.get_potential_scores()[score_decision]
            env.mark_score(score_decision, score_amount)
            post_score_card = env.get_score_card()

            # Calculate score model rewards
            score_reward = evaluate_score_reward(pre_score_card, post_score_card, score_potentials[2], score_decision, score_amount)

            # Update models
            optimizer_dice.zero_grad()
            optimizer_score.zero_grad()
            total_reward = dice_rewards[0] + dice_rewards[1] + score_reward
            # NOTE(review): the reward helpers in this notebook return plain
            # Python numbers, so `total_reward` looks like a float with no
            # `.backward()` and no link to either model's parameters — confirm.
            total_reward.backward()
            optimizer_dice.step()
            optimizer_score.step()

        # Print results
        # `total_reward` here is the value from the last turn of the episode.
        if episode % 100 == 0:
            print(f"Episode {episode}, Total Reward: {total_reward}")

In [349]:
def train_models(num_episodes, dice_model, score_model, GAMMA=0.99, EPSILON=0.1, THRESOLD=0.5, BATCH_SIZE=32):
    """Train the dice-keeping and score-selection networks by self-play.

    Args:
        num_episodes: number of full games to play.
        dice_model: network mapping a state to five keep-probabilities.
        score_model: network mapping a state to 13 category values.
        GAMMA: discount factor used for the score model's Q-learning target.
        EPSILON: exploration probability for both action selectors.
        THRESOLD: keep-threshold forwarded to ``select_dice``.
        BATCH_SIZE: replay batch size for score-model updates.

    Learning scheme:
        * Score model: transitions between consecutive scoring decisions are
          stored in a replay buffer and fitted with ``update_model`` against a
          periodically refreshed target network.
        * Dice model: its five independent sigmoid outputs do not fit the
          single-action ``gather`` update in ``update_model``, so it is trained
          online with a reward-weighted Bernoulli log-likelihood instead
          (positive reward pulls the outputs toward the mask that was played,
          negative reward pushes them away).

    NOTE(review): the Q-learning target in ``update_model`` maximises over all
    13 outputs, including categories that are already filled in the next state.
    """
    TARGET_SYNC_EVERY = 10   # episodes between refreshes of the score target network
    BUFFER_CAPACITY = 10000

    # Initialize optimizers, target network and replay buffer
    dice_optimizer = optim.Adam(dice_model.parameters())
    score_optimizer = optim.Adam(score_model.parameters())
    score_target = copy.deepcopy(score_model)
    score_buffer = ReplayBuffer(BUFFER_CAPACITY)

    for episode in range(num_episodes):
        # Initialize environment
        env = GameEnvironment()
        total_reward = 0.0
        # Scoring transition from the previous turn, waiting for its next state.
        pending = None

        while any(value < 0 for value in env.score_card.values()):
            # Initialize variables for each turn
            states = []
            dice_decisions = []
            score_potentials = []
            pre_score_card = env.get_score_card()
            # Start every turn from an empty hand; otherwise roll() would draw
            # no dice and the previous turn's hand would be reused.
            env.hand = []
            env.rolls_left = 3

            # First two rolls
            for _ in range(2):
                # Roll dice
                env.roll()
                state = torch.FloatTensor(env.get_state())
                states.append(state)
                score_potentials.append(env.get_potential_scores())
                # Select dice to keep
                dice_decision = select_dice(state, dice_model, EPSILON, THRESOLD)
                dice_decisions.append(dice_decision)
                env.hand = [env.hand[x] for x in range(5) if dice_decision[x] == 1]

            # Final roll (refresh the state so it reflects the final hand)
            env.roll()
            state = torch.FloatTensor(env.get_state())
            states.append(state)
            score_potentials.append(env.get_potential_scores())

            # Calculate dice model rewards
            dice_rewards = [
                evaluate_dice_reward(score_potentials[0], score_potentials[1]),
                evaluate_dice_reward(score_potentials[1], score_potentials[2]),
            ]

            # Select score; int() guards against a float index from the selector
            score_action = int(select_score(state, score_model, EPSILON))
            score_decision = env.label_score_dict[score_action]
            score_amount = score_potentials[2][score_decision]
            env.mark_score(score_decision, score_amount)
            post_score_card = env.get_score_card()

            # Calculate score model rewards
            score_reward = evaluate_score_reward(pre_score_card, post_score_card, score_potentials[2], score_decision, score_amount)

            # Update dice model: reward-weighted negative log-likelihood of the
            # two keep masks that were actually played this turn.
            dice_loss = 0.0
            for dice_state, decision, reward in zip(states, dice_decisions, dice_rewards):
                keep_probs = dice_model(dice_state)
                nll = F.binary_cross_entropy(keep_probs, decision.detach().float(), reduction='sum')
                dice_loss = dice_loss + reward * nll
            dice_optimizer.zero_grad()
            dice_loss.backward()
            dice_optimizer.step()

            # Store score transitions: the previous turn's decision is completed
            # with this turn's scoring state as its successor.
            if pending is not None:
                score_buffer.push(*pending, state, False)
            game_over = all(value >= 0 for value in env.score_card.values())
            if game_over:
                score_buffer.push(state, score_action, float(score_reward), torch.FloatTensor(env.get_state()), True)
                pending = None
            else:
                pending = (state, score_action, float(score_reward))

            # Update score model once enough transitions are available
            if len(score_buffer.buffer) >= BATCH_SIZE:
                update_model(score_model, score_target, score_optimizer, score_buffer.sample(BATCH_SIZE), GAMMA)

            total_reward += dice_rewards[0] + dice_rewards[1] + score_reward

        # Refresh the target network
        if (episode + 1) % TARGET_SYNC_EVERY == 0:
            score_target.load_state_dict(score_model.state_dict())

        # Print results (shaped reward accumulated over the whole episode)
        if episode % 100 == 0:
            print(f"Episode {episode}, Total Reward: {total_reward}")

In [350]:
# Input size 20 matches GameEnvironment.get_state() for a full hand:
# 5 dice + rolls_left + 14 score-card values (13 categories and total_score).
# Outputs: one value per die (5) and one per score category (13).
dice_model = DiceModel(20, 64, 5)
score_model = ScoreModel(20, 64, 13)
# Exploration rate; select_dice / select_score compare random.random() against it.
EPSILON = 0.3


In [353]:
# Play one full game with untrained models and log every turn.
counter = 0
STATES_TESTING = []

dice_model = DiceModel(20, 64, 5)
score_model = ScoreModel(20, 64, 13)
EPSILON = 0.3

env = GameEnvironment()
state = torch.FloatTensor(env.reset())
total_score_reward = 0  # not updated in this loop

while any(value < 0 for value in env.score_card.values()):
    # Initialize variables for each turn
    states = []
    dice_decisions = []
    score_potentials = []
    pre_score_card = env.get_score_card()
    # Start each turn with an empty hand; roll() only draws the missing dice,
    # so without this the first roll of every later turn would draw nothing.
    env.hand = []
    env.rolls_left = 3

    # First two rolls, each followed by a keep/re-roll decision
    for roll_number in range(2):
        env.roll()
        state = torch.FloatTensor(env.get_state())
        states.append(state)
        score_potentials.append(env.get_potential_scores())

        dice_decision = select_dice(state, dice_model, EPSILON)
        dice_decisions.append(dice_decision)
        env.hand = [env.hand[x] for x in range(5) if dice_decision[x] == 1]

    # Final roll
    env.roll()
    state = torch.FloatTensor(env.get_state())
    states.append(state)
    score_potentials.append(env.get_potential_scores())

    # Dice rewards: each decision is judged on the potentials just before and
    # just after it (the second one previously reused indices 0 and 1).
    dice_rewards = [
        evaluate_dice_reward(score_potentials[0], score_potentials[1]),
        evaluate_dice_reward(score_potentials[1], score_potentials[2]),
    ]

    # Select score
    score_decision = env.label_score_dict[select_score(state, score_model, EPSILON)]
    score_amount = env.get_potential_scores()[score_decision]
    env.mark_score(score_decision, score_amount)
    post_score_card = env.get_score_card()

    # Calculate score model rewards
    score_reward = evaluate_score_reward(pre_score_card, post_score_card, score_potentials[2], score_decision, score_amount)

    # LOGGING FOR TESTS
    counter += 1
    STATES_TESTING.append(state)

    print(
        f'Turn {counter}:\n'
        f'Hand: {env.hand}\n'
        f'Chosen Score: {score_decision}\n'
        f'Score Amount: {score_amount}\n'
        f'Total score: {env.score_card["total_score"]}\n'
        f'Score Card: {env.score_card}\n'
    )

print(f'TOTAL NUMBER OF TURNS: {counter}')

Turn 1:
Hand: [1, 1, 6, 1, 2]
Chosen Score: ones
Score Amount: 3
Total score: 3
Score Card: {'ones': 3, 'twos': -1, 'threes': -1, 'fours': -1, 'fives': -1, 'sixes': -1, 'three_of_a_kind': -1, 'four_of_a_kind': -1, 'full_house': -1, 'small_straight': -1, 'large_straight': -1, 'yahtzee': -1, 'chance': -1, 'total_score': 3}

Turn 2:
Hand: [1, 6, 2, 1, 5]
Chosen Score: yahtzee
Score Amount: 0
Total score: 3
Score Card: {'ones': 3, 'twos': -1, 'threes': -1, 'fours': -1, 'fives': -1, 'sixes': -1, 'three_of_a_kind': -1, 'four_of_a_kind': -1, 'full_house': -1, 'small_straight': -1, 'large_straight': -1, 'yahtzee': 0, 'chance': -1, 'total_score': 3}

Turn 3:
Hand: [2, 3, 3, 5, 1]
Chosen Score: threes
Score Amount: 6
Total score: 9
Score Card: {'ones': 3, 'twos': -1, 'threes': 6, 'fours': -1, 'fives': -1, 'sixes': -1, 'three_of_a_kind': -1, 'four_of_a_kind': -1, 'full_house': -1, 'small_straight': -1, 'large_straight': -1, 'yahtzee': 0, 'chance': -1, 'total_score': 9}

Turn 4:
Hand: [3, 3, 5, 5,

In [354]:
# Show the dice rewards left over from the last turn of the simulated game above.
dice_rewards

[0.0, 0.0]

In [352]:
# counter = 0
# STATES_TESTING = []

# dice_model = DiceModel(20, 64, 5)
# score_model = ScoreModel(20, 64, 13)
# EPSILON = 0.3

# env = GameEnvironment()
# state = torch.FloatTensor(env.reset())
# total_score_reward = 0

# while any(value < 0 for value in env.score_card.values()):
#     # Initialize variables for each turn
#     states = []
#     dice_decisions = []
#     score_potentials = []
#     pre_score_card = env.get_score_card()
#     env.rolls_left = 3

#     # First roll
#     for _ in range(2):
#         # Roll dice
#         env.roll()
#         state = torch.FloatTensor(env.get_state())
#         states.append(state)
#         score_potentials.append(env.get_potential_scores())
#         # Select dice to keep
#         dice_decision = select_dice(state, dice_model, EPSILON)
#         dice_decisions.append(dice_decision)
#         env.hand = [env.hand[x] for x in range(5) if dice_decision[x] == 1]        
    
#     # Final roll
#     env.roll()
#     states.append(state)
#     score_potentials.append(env.get_potential_scores())

#     # Calculate dice model rewards
#     dice_rewards = [
#         evaluate_dice_reward(score_potentials[0], score_potentials[1]),  
#         evaluate_dice_reward(score_potentials[1], score_potentials[2])  
#     ]

#     # Select score
#     state = torch.FloatTensor(env.get_state())
#     score_decision = env.label_score_dict[select_score(state, score_model, EPSILON)]
#     score_amount = env.get_potential_scores()[score_decision]
#     env.mark_score(score_decision, score_amount)
#     post_score_card = env.get_score_card()

#     # Calculate score model rewards
#     score_reward = evaluate_score_reward(pre_score_card, post_score_card, score_potentials[2], score_decision, score_amount)

#     # LOGGING FOR TESTS
#     counter += 1
#     STATES_TESTING.append(state)

#     print(
#         f'Turn {counter}:\n'
#         f'Hand: {env.hand}\n'
#         f'Chosen Score: {score_decision}\n'
#         f'Score Amount: {score_amount}\n'
#         f'Total score: {env.score_card["total_score"]}\n'
#         f'Score Card: {env.score_card}\n'
#     )

# print(f'TOTAL NUMBER OF TURNS: {counter}')

Turn 1:
Hand: [4, 5, 2, 2, 6]
Chosen Score: threes
Score Amount: 0
Total score: 0
Score Card: {'ones': -1, 'twos': -1, 'threes': 0, 'fours': -1, 'fives': -1, 'sixes': -1, 'three_of_a_kind': -1, 'four_of_a_kind': -1, 'full_house': -1, 'small_straight': -1, 'large_straight': -1, 'yahtzee': -1, 'chance': -1, 'total_score': 0}

Turn 2:
Hand: [4, 5, 6, 4, 5]
Chosen Score: yahtzee
Score Amount: 0
Total score: 0
Score Card: {'ones': -1, 'twos': -1, 'threes': 0, 'fours': -1, 'fives': -1, 'sixes': -1, 'three_of_a_kind': -1, 'four_of_a_kind': -1, 'full_house': -1, 'small_straight': -1, 'large_straight': -1, 'yahtzee': 0, 'chance': -1, 'total_score': 0}

Turn 3:
Hand: [4, 5, 6, 2, 2]
Chosen Score: four_of_a_kind
Score Amount: 0
Total score: 0
Score Card: {'ones': -1, 'twos': -1, 'threes': 0, 'fours': -1, 'fives': -1, 'sixes': -1, 'three_of_a_kind': -1, 'four_of_a_kind': 0, 'full_house': -1, 'small_straight': -1, 'large_straight': -1, 'yahtzee': 0, 'chance': -1, 'total_score': 0}

Turn 4:
Hand: [4