# Yahtzee Dataset

In [5]:
import numpy as np
import pandas as pd

# Load the supplied dataset; later cells compare it against the generated throws.
df = pd.read_csv(filepath_or_buffer='yahtzee-dataset.csv')

In [2]:
# DICE: number of dice in a single throw.
# EYES: number of faces per die; die values are handled zero-based (0..EYES-1).
DICE, EYES = 5, 6


def inc_dice_cnt(i):
    """
    Advance the global dice_count to the next dice combination

    Works like an odometer in base EYES: the die at position i is incremented,
    and whenever a die rolls over to 0 the carry moves on to the next position.
    A carry past the last die (position DICE) is dropped.
    """
    pos = i
    while pos != DICE:
        dice_count[pos] += 1
        if dice_count[pos] != EYES:
            # No roll-over, so there is nothing to carry.
            return
        dice_count[pos] = 0
        pos += 1


def get_cnt(arr, eyes=None):
    """
    Get an occurrences array with the count of a value at the index of that value

    arr holds zero-based die values, each used as an index into the result.
    eyes is the length of the result (number of distinct die values); it
    defaults to the module-level EYES when omitted.

    Returns an integer numpy array of length eyes.

    get_cnt([0, 1, 1, 3, 3], eyes=6) == [1, 2, 0, 2, 0, 0]
    """
    if eyes is None:
        eyes = EYES
    # Integer dtype: these are counts, and membership tests such as
    # `5 in cnt` should not rely on float comparisons.
    cnt = np.zeros(eyes, dtype=int)
    for value in arr:
        cnt[value] += 1
    return cnt


def get_seq(arr):
    """
    Get the largest number of sequential numbers in the array

    Duplicates are ignored and the input order does not matter.
    Returns 0 for an empty array.

    get_seq([0, 5, 3, 2, 4]) == 4
    """
    # np.unique already returns the distinct values in sorted order.
    values = np.unique(arr)
    if len(values) == 0:
        return 0
    longest = current = 1
    for prev, cur in zip(values[:-1], values[1:]):
        if cur - prev == 1:
            current += 1
            longest = max(longest, current)
        else:
            # Gap in the values: a new run starts at cur.
            current = 1
    return longest


def get_label(arr):
    """
    Classify a dice combination array with its Yahtzee label

    The categories are tested in a fixed order and the first match wins;
    a throw that matches none of them is labelled 'nothing'.
    """
    occurrences = get_cnt(arr)
    longest_run = get_seq(arr)

    if 5 in occurrences:
        # intentional typo because teacher supplied dataset also includes this typo
        label = 'yathzee'
    elif longest_run == 5:
        label = 'large-straight'
    elif longest_run == 4:
        label = 'small-straight'
    elif 3 in occurrences and 2 in occurrences:
        label = 'full-house'
    elif 4 in occurrences:
        label = 'four-of-a-kind'
    elif 3 in occurrences:
        label = 'three-of-a-kind'
    else:
        label = 'nothing'
    return label

In [3]:
# Enumerate every possible throw (EYES**DICE combinations) with its label.
# `np.int` was removed in NumPy 1.24; the builtin `int` is the equivalent dtype.
all_throws = np.zeros((EYES**DICE, DICE), dtype=int)
# Current combination, zero-based per die; inc_dice_cnt mutates it in place.
dice_count = np.zeros(DICE, dtype=int)

labels = []
for i in range(all_throws.shape[0]):
    # Row assignment copies the values, so later mutation of dice_count is safe.
    all_throws[i] = dice_count
    labels.append(get_label(dice_count))
    inc_dice_cnt(0)

# Stored die values are shifted by one so the CSV holds 1..EYES instead of 0..EYES-1.
df_all = pd.DataFrame(all_throws + 1, columns=[f'dice{n}' for n in range(1, DICE + 1)])
df_all['label'] = labels
df_all.to_csv('yahtzee-all.csv', index=False)

In [4]:
# Stack the supplied dataset on top of the generated throws and discard every
# row that occurs more than once (keep=False drops all copies), leaving only
# the rows that appear exactly once in the combined frame.
df_other = (
    pd.concat([df, df_all])
    .drop_duplicates(keep=False)
)
df_other.to_csv('yahtzee-other.csv', index=False)

In [6]:
# Re-read the 'other' rows from disk, then remove them from the full set of
# throws by the same trick: concatenate and drop every row occurring more
# than once (keep=False drops all copies).
df_other_fixed = pd.read_csv('yahtzee-other.csv')
df_fixed = (
    pd.concat([df_other_fixed, df_all])
    .drop_duplicates(keep=False)
)
df_fixed.to_csv('yahtzee-fixed.csv', index=False)