# Experiments in Card Generation

Hearthcards contains a large database of community-approved cards in its gallery.

In [12]:
import pickle
# Load the scraped Hearthcards gallery. The context manager closes the file
# handle as soon as the data is read instead of leaking it until garbage collection.
# NOTE: pickle.load can execute arbitrary code -- only load a trusted hearthcards.pkl.
with open('hearthcards.pkl', 'rb') as cards_file:
    CARDS_DB = pickle.load(cards_file)

Let's convert it all into the Spellsource standard card format.

In [13]:
from spellsource.ext.hearthcards import from_hearthcard_to_spellsource
# Convert every Hearthcards record into a Spellsource card.
# NOTE(review): CARDS_DB is rebound in place, so re-running this cell feeds
# already-converted cards back into the converter -- presumably not idempotent; confirm.
CARDS_DB = list(map(from_hearthcard_to_spellsource, CARDS_DB))

An example card:

In [19]:
%matplotlib inline
import pandas as pd
from random import choice
# Column-width display option; presumably set so long values such as the
# description are not truncated -- TODO confirm for the installed pandas version.
pd.options.display.max_colwidth = 0
# Pick one random card and show it as a vertical field/value table.
pd.DataFrame([choice(CARDS_DB)]).T

Unnamed: 0,0
attributes,{'DEATHRATTLES': True}
baseAttack,5
baseHp,4
baseManaCost,5
collectible,True
deathrattle,"{'class': 'DamageSpell', 'value': 1, 'target': 'FRIENDLY_CHARACTERS'}"
description,"[b][b]Deathrattle:[/b][/b] If it's your opponent's turn, deal 2 damage to all enemies."
fileFormatVersion,1
heroClass,VIOLET
name,Wicked Watcher


Using the Hearthcards dataset, let's set up a benchmark: a super-naive "character RNN" that generates cards, using half of the real, coded cards as validation and the other half as test.

At a high level, a character RNN is a sequence-to-sequence LSTM neural network whose objective is to predict the next element in a sequence given the previous elements. This can be exploited to freely generate probable sequences, but it is not regarded as especially performant.

We'll implement the model using Keras.

In [35]:
# https://github.com/yxtay/char-rnn-text-generation/blob/master/keras_model.py
import os
import time

import numpy as np

from keras.callbacks import Callback, ModelCheckpoint, TensorBoard
from keras.layers import Dense, Dropout, Embedding, LSTM, TimeDistributed
from keras.models import load_model, Sequential
from keras.optimizers import Adam

class CharRNNModel(object):
    """
    placeholder for a class-based wrapper around the character RNN.

    not implemented yet: the model is built by the module-level
    build_model / build_inference_model functions instead.
    """

def build_model(batch_size, seq_len, vocab_size, embedding_size=32,
                rnn_size=128, num_layers=2, drop_rate=0.0,
                learning_rate=0.001, clip_norm=5.0):
    """
    assemble and compile the character-level LSTM text generation model.

    architecture, in order: character embedding -> dropout ->
    num_layers x (stateful LSTM returning full sequences -> dropout) ->
    per-timestep softmax over the vocabulary.

    the input shape is fixed to (batch_size, seq_len) because the LSTM layers
    are stateful. the model is compiled with Adam (gradient norm clipped to
    clip_norm) and categorical cross-entropy, and returned.
    """
    hyperparams = (batch_size, seq_len, vocab_size, embedding_size,
                   rnn_size, num_layers, drop_rate,
                   learning_rate, clip_norm)
    print("building model: batch_size=%s, seq_len=%s, vocab_size=%s, "
          "embedding_size=%s, rnn_size=%s, num_layers=%s, drop_rate=%s, "
          "learning_rate=%s, clip_norm=%s." % hyperparams)

    # (batch_size, seq_len) -> (batch_size, seq_len, embedding_size)
    layers = [
        Embedding(vocab_size, embedding_size,
                  batch_input_shape=(batch_size, seq_len)),
        Dropout(drop_rate),
    ]
    # each recurrent stage: (batch_size, seq_len, *) -> (batch_size, seq_len, rnn_size)
    for _ in range(num_layers):
        layers.append(LSTM(rnn_size, return_sequences=True, stateful=True))
        layers.append(Dropout(drop_rate))
    # (batch_size, seq_len, rnn_size) -> (batch_size, seq_len, vocab_size)
    layers.append(TimeDistributed(Dense(vocab_size, activation="softmax")))

    network = Sequential()
    for layer in layers:
        network.add(layer)

    network.compile(loss="categorical_crossentropy",
                    optimizer=Adam(learning_rate, clipnorm=clip_norm))
    return network


def build_inference_model(model, batch_size=1, seq_len=1):
    """
    build inference model from model config
    input shape modified to (batch_size, seq_len), which is (1, 1) by default.

    only the architecture is cloned: the returned model has freshly
    initialised weights, so callers must copy the trained weights over
    (e.g. inference_model.set_weights(model.get_weights())).

    model: the Sequential model whose config is reused.
    returns: a new, non-trainable Sequential model.
    """
    print("building inference model.")
    config = model.get_config()
    # Sequential.get_config() returns a bare list of layer configs in older
    # Keras releases and a dict with a "layers" list in newer ones; support both.
    layer_configs = config["layers"] if isinstance(config, dict) else config
    # edit batch_size and seq_len on the first layer, which carries the input shape
    layer_configs[0]["config"]["batch_input_shape"] = (batch_size, seq_len)
    inference_model = Sequential.from_config(config)
    inference_model.trainable = False
    return inference_model


In [36]:
from keras.preprocessing.sequence import pad_sequences
import re

# alphabet the model works with: space, period, comma and lowercase a-z.
# create_dictionary() builds the vocabulary from it and batch_generator()
# uses it to filter the input texts.
VALID_CHARS = ' .,abcdefghijklmnopqrstuvwxyz'

def create_dictionary(valid_chars=None):
    """
    create char2id, id2char and vocab_size from an alphabet.

    valid_chars: iterable of single characters to include; defaults to the
        module-level VALID_CHARS. vertical tab, form feed and carriage
        return are always excluded.

    returns (char2id, id2char, vocab_size):
        char2id maps each character to an id in 1..N (sorted order) and the
        empty string "" to the reserved id 0; id2char is the inverse
        mapping; vocab_size is N + 1.
    """
    if valid_chars is None:
        valid_chars = VALID_CHARS
    # de-duplicate first: a repeated character would otherwise leave a gap in
    # the ids and make vocab_size smaller than the largest id + 1.
    chars = sorted(set(valid_chars) - {"\x0b", "\x0c", "\r"})
    char2id = {ch: i for i, ch in enumerate(chars, start=1)}
    char2id[""] = 0
    id2char = {i: ch for ch, i in char2id.items()}
    vocab_size = len(char2id)
    return char2id, id2char, vocab_size

# module-level vocabulary: the default lookup tables for encode_text /
# decode_text, and the class count used for one-hot encoding.
CHAR2ID, ID2CHAR, VOCAB_SIZE = create_dictionary()


def encode_text(text, char2id=CHAR2ID):
    """
    turn text into an integer numpy array, one id per character.
    characters missing from char2id are encoded as 0.
    """
    ids = (char2id.get(symbol, 0) for symbol in text)
    return np.fromiter(ids, int)


def decode_text(int_array, id2char=ID2CHAR):
    """
    turn a sequence of integer ids back into a string via id2char.
    an id that is not in id2char raises KeyError.
    """
    symbols = [id2char[idx] for idx in int_array]
    return "".join(symbols)


def one_hot_encode(indices, num_classes):
    """
    one-hot encode integer indices.

    each index selects the matching row of a num_classes x num_classes
    identity matrix, so the result gains a trailing axis of size num_classes.
    """
    identity = np.eye(num_classes)
    return identity[indices]


def batch_generator(texts: [str], batch_size=64, one_hot_features=False, one_hot_labels=False):
    """
    batch generator for sequence
    ensures that batches generated are continuous along axis 1
    so that hidden states can be kept across batches and epochs

    texts: strings to train on. each one is lowercased, stripped of every
        character outside VALID_CHARS, encoded, and padded with 0 to the
        length of the longest filtered text (pad_sequences defaults, i.e.
        zeros are prepended); the padded rows are laid end to end to form
        one long id sequence.
    batch_size: number of parallel rows per batch.
    one_hot_features / one_hot_labels: one-hot encode x / y over VOCAB_SIZE.

    yields (x, y) forever, where y is x shifted one step ahead; each has
    shape (batch_size, seq_len), plus a trailing VOCAB_SIZE axis when one-hot.

    raises ValueError (on first iteration) when there is no usable text or
    not enough data to fill a single batch.
    """

    # filter: re.sub takes (pattern, repl, string); escape the alphabet so it
    # stays a literal character class whatever VALID_CHARS contains
    invalid_chars = re.compile('[^%s]' % re.escape(VALID_CHARS))
    texts = [invalid_chars.sub('', text.lower()) for text in texts]
    if not texts:
        raise ValueError("No texts provided.")

    # find the longest sequence
    seq_len = max(len(s) for s in texts)
    if seq_len == 0:
        raise ValueError("All texts are empty after filtering to VALID_CHARS.")

    # encode and pad to longest, concatenating to fit perfectly
    # pad_sequences expects a list of sequences and returns (num_texts, seq_len)
    padded = pad_sequences([encode_text(s) for s in texts], maxlen=seq_len)
    sequence = padded.reshape(-1)

    # calculate effective length of text to use
    # (the -1 leaves room for the one-step-ahead labels)
    num_batches = (len(sequence) - 1) // (batch_size * seq_len)
    if num_batches == 0:
        raise ValueError("No batches created. Use smaller batch size or sequence length.")
    print("number of batches: %s." % num_batches)
    rounded_len = num_batches * batch_size * seq_len

    x = np.reshape(sequence[: rounded_len], [batch_size, num_batches * seq_len])
    if one_hot_features:
        x = one_hot_encode(x, VOCAB_SIZE)

    y = np.reshape(sequence[1: rounded_len + 1], [batch_size, num_batches * seq_len])
    if one_hot_labels:
        y = one_hot_encode(y, VOCAB_SIZE)

    epoch = 0
    while True:
        # roll so that no need to reset rnn states over epochs
        x_epoch = np.split(np.roll(x, -epoch, axis=0), num_batches, axis=1)
        y_epoch = np.split(np.roll(y, -epoch, axis=0), num_batches, axis=1)
        for batch in range(num_batches):
            yield x_epoch[batch], y_epoch[batch]
        epoch += 1
        epoch += 1

In [37]:
# Where checkpoints and TensorBoard logs go; both point at the current directory.
# NOTE(review): CHECKPOINT_PATH is handed to ModelCheckpoint as its first
# argument; if that is the file to save to, './' (a directory) will presumably
# fail when the first checkpoint is written -- confirm and use a file name.
CHECKPOINT_PATH = './'
log_dir = './'

# Keras callbacks for training: a checkpoint after every epoch
# (save_best_only=False) plus TensorBoard logging with the graph and embeddings.
callbacks = [
        ModelCheckpoint(CHECKPOINT_PATH, verbose=1, save_best_only=False),
        # NOTE(review): the metadata path data/id2char.tsv is not written anywhere
        # in the visible notebook, and "embedding_1" assumes the default layer
        # name of the first Embedding layer -- verify both.
        TensorBoard(log_dir, write_graph=True, embeddings_freq=1,
                    embeddings_metadata={"embedding_1": os.path.abspath(os.path.join("data", "id2char.tsv"))}),
    ]

In [38]:
def generate_text(model, seed, length=512, top_n=10):
    """
    sample `length` characters from a trained model, starting from `seed`.

    model: stateful model fed one character id at a time, input shape (1, 1).
    seed: text used to prime the recurrent state; it is kept as the prefix
        of the result.
    top_n: forwarded to sample_from_probs for every sampled character.

    prints the generated text and returns it (seed + sampled characters).
    """
    seed_ids = encode_text(seed)
    model.reset_states()

    # prime the internal states with every seed character except the last;
    # the predictions themselves are discarded
    for warmup_id in seed_ids[:-1]:
        model.predict(np.array([[warmup_id]]))

    pieces = [seed]
    current_id = seed_ids[-1]
    for _ in range(length):
        # input shape: (1, 1) -> output shape: (1, 1, vocab_size)
        probs = model.predict(np.array([[current_id]]))
        current_id = sample_from_probs(probs.squeeze(), top_n)
        # the sampled character becomes the next input
        pieces.append(ID2CHAR[current_id])

    generated = "".join(pieces)
    print("generated text: \n%s\n" % generated)
    return generated


class LoggerCallback(Callback):
    """
    callback to log information.
    generates text at the end of each epoch and at the end of training.

    text: corpus passed to generate_seed to pick a seed for sampling.
    model: training model; its config is cloned into a separate inference
        model whose weights are refreshed from the training model
        (self.model) before every sample.
    """
    def __init__(self, text, model):
        super(LoggerCallback, self).__init__()
        self.text = text
        # build inference model using config from learning model
        self.inference_model = build_inference_model(model)
        self.time_train = self.time_epoch = time.time()

    def on_epoch_begin(self, epoch, logs=None):
        self.time_epoch = time.time()

    def on_epoch_end(self, epoch, logs=None):
        duration_epoch = time.time() - self.time_epoch
        # tolerate a missing logs dict instead of failing on logs["loss"]
        loss = (logs or {}).get("loss", float("nan"))
        # print like the other hooks: no `logger` object is defined in this notebook
        print("epoch: %s, duration: %ds, loss: %.6g." % (epoch, duration_epoch, loss))
        self._generate_sample()

    def on_train_begin(self, logs=None):
        print("start of training.")
        self.time_train = time.time()

    def on_train_end(self, logs=None):
        duration_train = time.time() - self.time_train
        print("end of training, duration: %ds." % duration_train)
        # longer, more conservative final sample: 1024 characters, top_n=3
        self._generate_sample(1024, 3)

    def _generate_sample(self, *generate_args):
        """sync the inference model with the training weights and sample text from a fresh seed."""
        # transfer weights from learning model
        self.inference_model.set_weights(self.model.get_weights())

        # generate text
        seed = generate_seed(self.text)
        return generate_text(self.inference_model, seed, *generate_args)

In [53]:
# Clean each card description: drop [b], [/b], [i], [/i] markup tags, then
# replace line breaks with '. ' so each description is a single line.
texts = [
    re.sub(
        pattern=r'(\r\n)|(\n)',
        repl='. ',
        string=re.sub(pattern=r'(\[/?[bi]\])', repl='', string=card['description']),
    )
    for card in CARDS_DB
]
# The model's sequence length is the longest cleaned description.
# NOTE(review): batch_generator recomputes its own seq_len after filtering to
# VALID_CHARS, which may be shorter than max_seq_len -- confirm the two agree.
max_seq_len = max(map(len, texts))
model = build_model(batch_size=8, seq_len=max_seq_len, vocab_size=VOCAB_SIZE)

building model: batch_size=8, seq_len=158, vocab_size=30, embedding_size=32, rnn_size=128, num_layers=2, drop_rate=0.0, learning_rate=0.001, clip_norm=5.0.
