In [1]:
!pip install transformers[torch]
!pip install numba
!pip install accelerate -U
!pip install wonderwords

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     

In [2]:
import os
import re
import numpy as np
import pandas as pd
import wonderwords

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from wonderwords import RandomWord

import torch
import transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration, TFT5ForConditionalGeneration
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

## Load data

In [3]:
## Load datasets
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')
model_checkpoints_path = '/content/drive/My Drive/ds266proj/model_checkpoints'

# Read data from to Google Drive
hs_all_data = pd.read_csv('/content/drive/My Drive/ds266proj/hs_cards_data_text.csv')
nr_all_data = pd.read_csv('/content/drive/My Drive/ds266proj/nr_cards_data_text.csv')

Mounted at /content/drive


In [4]:
## Create the pairs of text used in the model
np.random.seed(2319)
def createPairs(dataset, setting = 'card_generation'):
  text_pairs = []
  for index, row in dataset.iterrows():
    if setting == 'card_generation':
      prompt = row['t5generate']
      answer = row['description']
    if setting == 'card_generation_alt':
      prompt = row['t5prompt']
      answer = row['description']
    dict_entry = {'prompt': prompt, 'answer': answer}
    text_pairs.append(dict_entry)

  np.random.shuffle(text_pairs)
  num_valid_samples = int(0.15 * len(text_pairs))
  num_train_samples = len(text_pairs) - 2 * num_valid_samples
  train_pairs = text_pairs[:num_train_samples]
  valid_pairs = text_pairs[num_train_samples : num_train_samples + num_valid_samples]
  test_pairs = text_pairs[num_train_samples + num_valid_samples :]

  print(f"{len(text_pairs)} total pairs")
  print(f"{len(train_pairs)} training pairs")
  print(f"{len(valid_pairs)} validation pairs")
  print(f"{len(test_pairs)} test pairs")

  return text_pairs, train_pairs, valid_pairs, test_pairs

hs_pairs, hs_train, hs_valid, hs_test = createPairs(hs_all_data, setting = 'card_generation')
nr_pairs, nr_train, nr_valid, nr_test = createPairs(nr_all_data, setting = 'card_generation')

print(nr_pairs[:5])

7525 total pairs
5269 training pairs
1128 validation pairs
1128 test pairs
2123 total pairs
1487 training pairs
318 validation pairs
318 test pairs
[{'prompt': 'generate: A jinteki, asset card using seed abnormal harpooner.', 'answer': 'The card named Tenma Line is a 2 cost asset with the subtypes clone. It has an influence requirement of 3. It has a trash cost of 4. The card text says click: Swap 2 pieces of installed ice.'}, {'prompt': 'generate: A shaper, program card using seed understood pathology.', 'answer': 'The card named Refractor is a 1 cost 2 strength program that requires 1 memory with the subtypes decoder, icebreaker. It has an influence requirement of 2. The card text says Interface ability: 1 credit: Break 1 code gate subroutine. 1 credit: plus 3 strength. Use this ability only by spending a credit from a stealth card.'}, {'prompt': 'generate: A neutral-corp, agenda card using seed disagreeable ant.', 'answer': 'The card named Priority Requisition is a 5 advancement age

In [5]:
# Save splits to separate csv files, to load only part at a time later
hs_train_file = '/content/drive/My Drive/ds266proj/hs_train.csv'
hs_valid_file = '/content/drive/My Drive/ds266proj/hs_valid.csv'
hs_test_file = '/content/drive/My Drive/ds266proj/hs_test.csv'

nr_train_file = '/content/drive/My Drive/ds266proj/nr_train.csv'
nr_valid_file = '/content/drive/My Drive/ds266proj/nr_valid.csv'
nr_test_file = '/content/drive/My Drive/ds266proj/nr_test.csv'

pd.DataFrame(hs_train).to_csv(hs_train_file)
pd.DataFrame(hs_valid).to_csv(hs_valid_file)
pd.DataFrame(hs_test).to_csv(hs_test_file)

pd.DataFrame(nr_train).to_csv(nr_train_file)
pd.DataFrame(nr_valid).to_csv(nr_valid_file)
pd.DataFrame(nr_test).to_csv(nr_test_file)

## Modeling Prep

In [6]:
## Using the pre-processor from the t5 fine tuning notebook
def preprocess_data(text_pair, tokenizer, max_length=128):
    orig_text, target_text = text_pair
    orig_encoded = tokenizer.batch_encode_plus(
        [orig_text],
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    orig_input_ids = orig_encoded['input_ids'][0]
    orig_attention_mask = orig_encoded['attention_mask'][0]

    target_encoded = tokenizer.batch_encode_plus(
        [target_text],
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    label_ids = target_encoded['input_ids'][0]

    return {'input_ids': orig_input_ids,
            'attention_mask': orig_attention_mask,
            'labels': label_ids}

In [7]:
## Using the data iterator from the same notebook
class QADataIterator:

    def __init__(self,
                 tokenizer,
                 n_examples,
                 max_load_at_once,
                 data_filename,
                 max_length=128,
                 shuffle=True):

        self.tokenizer = tokenizer
        self.n_examples = n_examples
        self.max_load_at_once = max_load_at_once
        self.data_filename = data_filename
        self.max_length = max_length
        self.shuffle = shuffle

        # Initialize row order, call on_epoch_end to shuffle row indices
        self.row_order = np.arange(1, self.n_examples+1)
        self.on_epoch_end()

        # Load first chunk of max_load_at_once examples
        self.df_curr_loaded = self._load_next_chunk(0)
        self.curr_idx_in_load = 0

    def _load_next_chunk(self, idx):
        load_start = idx
        load_end = idx + self.max_load_at_once

        # Indices to skip are the ones in the shuffled row_order before and
        # after the chunk we'll use for this chunk
        load_idx_skip = self.row_order[:load_start] + self.row_order[load_end:]
        self.df_curr_loaded = pd.read_csv(self.data_filename, skiprows=load_idx_skip)
        self.df_curr_loaded = self.df_curr_loaded.sample(frac=1)

    def __len__(self):
        return self.n_examples

    def __getitem__(self, idx):
        if self.df_curr_loaded is None or self.curr_idx_in_load >= len(self.df_curr_loaded):
            self._load_next_chunk(idx)
            self.curr_idx_in_load = 0

        text_pair = self.df_curr_loaded[['prompt', 'answer']].values.astype(str)[self.curr_idx_in_load]
        self.curr_idx_in_load += 1

        item_data = preprocess_data(
            text_pair,
            self.tokenizer,
            self.max_length
        )

        return item_data

    def __call__(self):
        for i in range(self.__len__()):
            yield self.__getitem__(i)

            if i == self.__len__()-1:
                self.on_epoch_end()

    def on_epoch_end(self):
        if self.shuffle:
            self.row_order = list(np.random.permutation(self.row_order))

## Running the model

In [31]:
# Download tokenizer and model
model_name = 't5-base'
dir_path = '/content/drive/My Drive/ds266proj/model_checkpoints/'
file_path = dir_path + 't5base-finetuned-hs-generation'
modelsave = file_path + "/pt_model"
t5_tokenizer = T5Tokenizer.from_pretrained(model_name)

In [32]:
t5_model = T5ForConditionalGeneration.from_pretrained(model_name)

## Trying it out on hearthstone cards

In [33]:
# Create the data iterators for train and validation data, pytorch version

max_length = 64
max_load_at_once = 100

train_data_iterator = QADataIterator(
    tokenizer=t5_tokenizer,
    n_examples=len(hs_train),
    max_load_at_once=max_load_at_once,
    data_filename=hs_train_file,
    max_length=max_length
)

valid_data_iterator = QADataIterator(
    tokenizer=t5_tokenizer,
    n_examples=len(hs_valid),
    max_load_at_once=max_load_at_once,
    data_filename=hs_valid_file,
    max_length=max_length
)

In [34]:
# Specify batch size and other training arguments

batch_size = 24


args = Seq2SeqTrainingArguments(
    file_path,
    evaluation_strategy='epoch',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=8,
    save_strategy="epoch",
    save_total_limit=1,
)

In [None]:
# Define the trainer, passing in the model, training args, and data generators, again pulled from the example notebook
trainer = Seq2SeqTrainer(
    t5_model,
    args,
    train_dataset=train_data_iterator,
    eval_dataset=valid_data_iterator
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
## Train it!
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.138198
2,No log,1.011421
3,1.458000,0.952909
4,1.458000,0.916682
5,1.036400,0.894062
6,1.036400,0.879747
7,0.968900,0.871434
8,0.968900,0.868216


TrainOutput(global_step=1760, training_loss=1.1239051472056996, metrics={'train_runtime': 724.8098, 'train_samples_per_second': 58.156, 'train_steps_per_second': 2.428, 'total_flos': 3208599125360640.0, 'train_loss': 1.1239051472056996, 'epoch': 8.0})

In [None]:
# save the model
t5_model.save_pretrained(modelsave, from_pt=True)

In [35]:
# Load the model from the checkpoint
t5_model = T5ForConditionalGeneration.from_pretrained(modelsave)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
t5_model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
print(hs_all_data[''])

In [38]:
## Generate seeded prompts
np.random.seed(2319)
seedword = RandomWord()

hs_gen_prompts = []
hs_gen_references = []
classes = ['Priest', 'Hunter', 'Rogue', 'Paladin', 'Warlock', 'Neutral', 'Shaman', 'Druid', 'Mage', 'Warrior', 'Deathknight']
cardtypes = ['Spell'] * 100 + ['Minion'] * 125 + ['Hero'] * 5 + ['Weapon'] * 20
print(cardtypes)
## Fille in prompts and references
for thisclass in classes:
  for thistype in cardtypes:
    newprompt = f'generate: A {thisclass}, {thistype} card using seed {seedword.word(include_parts_of_speech=["adjectives"])} {seedword.word(include_parts_of_speech=["nouns"])}.'
    hs_gen_prompts.append(newprompt)
    # Take the first 5 references for each example
    ref_data = hs_all_data.loc[(hs_all_data['classes'] == thisclass.upper()) & (hs_all_data[thistype] == 1)]
    if len(ref_data) > 0:
      candidates = list(ref_data['description'].sample(n=5, replace=True))
    else:
      candidates = []
    if len(candidates) == 0:
      candidates = ["NONE"]
    hs_gen_references.append(candidates)

print(hs_gen_prompts)
print(hs_gen_references)

['Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Spell', 'Minion', 'Minion', 'Minion', 'Minion', 'Minion', 'Minion', 'Minion', 'Minion', 'Minion', 'Minion',

In [39]:
## Test some examples
transformers.logging.set_verbosity_error()
hs_gen_candidates = []
for i, test_input_text in enumerate(hs_gen_prompts):
    test_inputs = t5_tokenizer([test_input_text], return_tensors='pt')
    test_output_ids = t5_model.generate(test_inputs['input_ids'].cuda(),
                                        max_length=64,
                                        do_sample=True,
                                        top_k=50,
                                        temperature=1)
    for out_ids in test_output_ids:
      candidate = t5_tokenizer.decode(out_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
      hs_gen_candidates.append(candidate)
    if i % 10 == 0:
      print(f"Example #{i}: {hs_gen_candidates[-1]}")

# Save the generated dataset of fake cards for a starting point
candidates_df = pd.DataFrame(hs_gen_candidates, columns = ['description'])

Example #0: The card named Rift Reap is a 1 cost holy spell . The card text says: Refresh all minions in your hand. If they were minions, send them to their 'Muslims'
Example #10: The card named Revelation is a 5 cost spell , and includes the effects discover. The card text says: Discover two 3 dash 4 Dragons.
Example #20: The card named Unstable Fire is a 2 cost shadow spell , and includes the effects deal damage. The card text says: Deal 5 damage to an enemy minion. Then deal 1 damage to all other enemies.
Example #30: The card named Priest of the Storm is a 3 cost shadow spell . The card text says: Give a minion plus 2 Attack. Summon one that costs (3) or less instead.
Example #40: The card named Blood Tide is a 2 cost holy spell . The card text says: Summon 5 minion with Rush, Draw and Cast a card. (Used every turn!)
Example #50: The card named Rhythm, Inspire is a 3 cost frost spell . The card text says: Summon four 1 dash 1 Priests with Rush.
Example #60: The card named Reclaim t

In [40]:
hs_fakes_file = '/content/drive/My Drive/ds266proj/hs_fakes_orig.csv'
candidates_df.to_csv(hs_fakes_file)

## Score it!

In [None]:
!pip install evaluate
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=7d0fe284633ffc2f967f85cbd49a98310204611e6182d0266c8354c814cb78d9
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
# Load the ROUGE metric
import evaluate
from evaluate import load

rouge = evaluate.load('rouge')

In [None]:
results = rouge.compute(predictions=hs_gen_candidates, references=hs_gen_references)
print(results)

{'rouge1': 0.6311243358846487, 'rouge2': 0.44567162254257475, 'rougeL': 0.6027444540711853, 'rougeLsum': 0.6018941855280516}


## Next with netrunner

In [8]:
# Download tokenizer and model
model_name = 't5-base'
t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
t5_model_nr = T5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
# Create the data iterators for train and validation data, pytorch version

max_length = 64
max_load_at_once = 100

train_data_iterator = QADataIterator(
    tokenizer=t5_tokenizer,
    n_examples=len(nr_train),
    max_load_at_once=max_load_at_once,
    data_filename=nr_train_file,
    max_length=max_length
)

valid_data_iterator = QADataIterator(
    tokenizer=t5_tokenizer,
    n_examples=len(nr_valid),
    max_load_at_once=max_load_at_once,
    data_filename=nr_valid_file,
    max_length=max_length
)

In [10]:
# Specify batch size and other training arguments

batch_size = 16

# Modify this filepath to where you want to save the model after fine-tuning
dir_path = '/content/drive/My Drive/ds266proj/model_checkpoints/'
file_path = dir_path + 't5base-finetuned-nr-generation'
modelsave = file_path + "/pt_model"
args = Seq2SeqTrainingArguments(
    file_path,
    evaluation_strategy='epoch',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=8,
)

In [11]:
# Define the trainer, passing in the model, training args, and data generators, again pulled from the example notebook
trainer = Seq2SeqTrainer(
    t5_model_nr,
    args,
    train_dataset=train_data_iterator,
    eval_dataset=valid_data_iterator
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
## Train it!
trainer.train()

{'eval_loss': 1.8951164484024048, 'eval_runtime': 1.5641, 'eval_samples_per_second': 203.317, 'eval_steps_per_second': 12.787, 'epoch': 1.0}
{'eval_loss': 1.5983517169952393, 'eval_runtime': 1.5728, 'eval_samples_per_second': 202.191, 'eval_steps_per_second': 12.716, 'epoch': 2.0}
{'eval_loss': 1.4562593698501587, 'eval_runtime': 1.5892, 'eval_samples_per_second': 200.102, 'eval_steps_per_second': 12.585, 'epoch': 3.0}
{'eval_loss': 1.3704198598861694, 'eval_runtime': 1.5863, 'eval_samples_per_second': 200.464, 'eval_steps_per_second': 12.608, 'epoch': 4.0}
{'eval_loss': 1.3201392889022827, 'eval_runtime': 1.9606, 'eval_samples_per_second': 162.195, 'eval_steps_per_second': 10.201, 'epoch': 5.0}
{'loss': 1.9148, 'grad_norm': 1.2826539278030396, 'learning_rate': 1.639784946236559e-05, 'epoch': 5.38}
{'eval_loss': 1.2896738052368164, 'eval_runtime': 1.9965, 'eval_samples_per_second': 159.281, 'eval_steps_per_second': 10.018, 'epoch': 6.0}
{'eval_loss': 1.269858956336975, 'eval_runtime': 

TrainOutput(global_step=744, training_loss=1.7482112351284231, metrics={'train_runtime': 204.9556, 'train_samples_per_second': 58.042, 'train_steps_per_second': 3.63, 'train_loss': 1.7482112351284231, 'epoch': 8.0})

In [None]:
# save the model
t5_model_nr.save_pretrained(modelsave, from_pt=True)

In [14]:
# Load the model
t5_model_nr = T5ForConditionalGeneration.from_pretrained(modelsave)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
t5_model_nr.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [20]:
print(list(nr_all_data['t5generate'])[0:5])
print(set(list(nr_all_data['faction_code'])))

['generate: A neutral-corp, asset card using seed angry slice.', 'generate: A neutral-corp, asset card using seed gigantic min.', 'generate: A neutral-corp, identity card using seed cynical forage.', 'generate: A neutral-corp, operation card using seed wary trader.', 'generate: A neutral-corp, ice card using seed snotty mustache.']
{'neutral-corp', 'neutral-runner', 'jinteki', 'weyland-consortium', 'shaper', 'nbn', 'haas-bioroid', 'criminal', 'anarch'}


 'ice', 'upgrade', 'hardware', 'resource', 'identity',
                                                'operation', 'agenda', 'program', 'asset', 'event'

In [28]:
## Generate seeded prompts
np.random.seed(2319)
seedword = RandomWord()

nr_gen_prompts = []
nr_gen_references = []
corpclasses = ['neutral-corp', 'jinteki', 'weyland-consortium', 'nbn', 'haas-bioroid']
corpcardtypes = ['agenda'] * 15 + ['asset'] * 25 + ['upgrade'] * 15 + ['operation'] * 25 + ['ice'] * 25 + ['identity'] * 10
runnerclasses = ['shaper', 'neutral-runner', 'criminal', 'anarch']
runnercardtypes = ['event'] * 25 + ['program'] * 25 + ['resource'] * 25 + ['hardware'] * 25 + ['identity'] * 10

## Fill in prompts and references
for thisclass in corpclasses:
  for thistype in corpcardtypes:
    newprompt = f'generate: A {thisclass}, {thistype} card using seed {seedword.word(include_parts_of_speech=["adjectives"])} {seedword.word(include_parts_of_speech=["nouns"])}.'
    nr_gen_prompts.append(newprompt)
    # Take the first 5 references for each example
    ref_data = nr_all_data.loc[(nr_all_data['faction_code'] == thisclass) & (nr_all_data[thistype] == 1)]
    candidates = list(ref_data['description'].sample(n=5))
    if len(candidates) == 0:
      candidates = ["NONE"]
    nr_gen_references.append(candidates)

## Fill in prompts and references
for thisclass in runnerclasses:
  for thistype in runnercardtypes:
    newprompt = f'generate: A {thisclass}, {thistype} card using seed {seedword.word(include_parts_of_speech=["adjectives"])} {seedword.word(include_parts_of_speech=["nouns"])}.'
    nr_gen_prompts.append(newprompt)
    # Take the first 5 references for each example
    ref_data = nr_all_data.loc[(nr_all_data['faction_code'] == thisclass) & (nr_all_data[thistype] == 1)]
    candidates = list(ref_data['description'].sample(n=5))
    if len(candidates) == 0:
      candidates = ["NONE"]
    nr_gen_references.append(candidates)

print(nr_gen_prompts)
print(nr_gen_references)

['generate: A neutral-corp, agenda card using seed dangerous tomato.', 'generate: A neutral-corp, agenda card using seed direful sage.', 'generate: A neutral-corp, agenda card using seed successful creche.', 'generate: A neutral-corp, agenda card using seed aback helicopter.', 'generate: A neutral-corp, agenda card using seed hesitant catsup.', 'generate: A neutral-corp, agenda card using seed nosy vanity.', 'generate: A neutral-corp, agenda card using seed defiant hospital.', 'generate: A neutral-corp, agenda card using seed erect bottom-line.', 'generate: A neutral-corp, agenda card using seed chilly circuit.', 'generate: A neutral-corp, agenda card using seed lewd jeweller.', 'generate: A neutral-corp, agenda card using seed finicky chopsticks.', 'generate: A neutral-corp, agenda card using seed eminent target.', 'generate: A neutral-corp, agenda card using seed jobless saviour.', 'generate: A neutral-corp, agenda card using seed broad profession.', 'generate: A neutral-corp, agenda

In [29]:
transformers.logging.set_verbosity_error()
nr_gen_candidates = []
for i, test_input_text in enumerate(nr_gen_prompts):
    test_inputs = t5_tokenizer([test_input_text], return_tensors='pt')
    test_output_ids = t5_model_nr.generate(test_inputs['input_ids'].cuda(),
                                        max_length=100,
                                        do_sample=True,
                                        top_k=40,
                                        temperature=1)
    for out_ids in test_output_ids:
      candidate = t5_tokenizer.decode(out_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
      nr_gen_candidates.append(candidate)
    if i % 10 == 0:
      print(f"Example #{i}: {nr_gen_candidates[-1]}")

# Save the generated dataset of fake cards for a starting point
candidates_df_nr = pd.DataFrame(nr_gen_candidates, columns = ['description'])

Example #0: The card named Squida is a 3 advancement agenda worth 4 points with the subtypes expansion. It has an influence requirement of 0. It has a trash cost of 4. The card text says Subroutine When you score Squida, you may uninstall 1 key
Example #10: The card named Re-Stop is a 3 advancement agenda worth 2 points with the subtypes research. It has an influence requirement of 0. The card text says When you score this agenda, you may take 1 credit from your stack.
Example #20: The card named "Boyz" is a 3 cost asset with the subtypes character. It has an influence requirement of 0. It has a trash cost of 3. The card text says Subroutine Take 1 tag. For each tag you use, you may replace 1 tag. When this asset is rezzed, you may pay 1 tag
Example #30: The card named Virgil is a 7 cost asset with the subtypes run. It has an influence requirement of 0. It has a trash cost of 3. The card text says Run a deflector. Once each turn, the Corp gets 1 tag.
Example #40: The card named Xiaoyan

In [30]:
nr_fakes_file = '/content/drive/My Drive/ds266proj/nr_fakes_orig.csv'
candidates_df_nr.to_csv(nr_fakes_file)

In [16]:
## Test some examples
transformers.logging.set_verbosity_error()

test_inputs = list(nr_all_data['t5generate']).copy()
np.random.shuffle(test_inputs)

print('Prompts')
print("-----------------------")
for item in test_inputs[0:5]:
  print(item)

print('Gens')
print("-----------------------")
for test_input_text in test_inputs[0:5]:
    test_inputs = t5_tokenizer([test_input_text], return_tensors='pt')
    test_output_ids = t5_model_nr.generate(test_inputs['input_ids'].cuda(),
                                        max_length=100,
                                        do_sample=True,
                                        top_k=40,
                                        temperature=1)

    print([t5_tokenizer.decode(out_ids, skip_special_tokens=True,
                               clean_up_tokenization_spaces=False) for out_ids in test_output_ids])

Prompts
-----------------------
generate: A anarch, program card using seed jittery measurement.
generate: A shaper, hardware card using seed purring look.
generate: A haas-bioroid, operation card using seed faithful cassock.
generate: A criminal, hardware card using seed wary microwave.
generate: A anarch, event card using seed difficult surface.
Gens
-----------------------
['The card named Glow is a 1 cost program that requires 1 memory with the subtypes sabotage, icebreaker. It has an influence requirement of 3. The card text says If you make a successful run, you may trash this program. When the Corp trashes this program, pay the installed card']
['The card named Leapless is a 0 cost hardware with the subtypes chip, run. It has an influence requirement of 3. The card text says Run your run, unless you need to pay 5 credits or if you do it, only pay 2 credits. Subroutine Run this run, one of the corollaries. The Runner can do this']
['The card named Recruit is a 4 cost operation wi

## EXTRA CLEANUP FOR HUMAN SCORING

In [41]:
# Read data from to Google Drive
hs_fakes_data = pd.read_csv('/content/drive/My Drive/ds266proj/hs_fakes_orig.csv')
nr_fakes_data = pd.read_csv('/content/drive/My Drive/ds266proj/nr_fakes_orig.csv')




In [48]:
# Tack on the faction cols
classes = ['Priest', 'Hunter', 'Rogue', 'Paladin', 'Warlock', 'Neutral', 'Shaman', 'Druid', 'Mage', 'Warrior', 'Deathknight']
cardtypes = ['Spell'] * 100 + ['Minion'] * 125 + ['Hero'] * 5 + ['Weapon'] * 20
hs_classes = []
hs_types = []
for thisclass in classes:
  for thistype in cardtypes:
    hs_classes.append(thisclass)
    hs_types.append(thistype)


corpclasses = ['neutral-corp', 'jinteki', 'weyland-consortium', 'nbn', 'haas-bioroid']
corpcardtypes = ['agenda'] * 15 + ['asset'] * 25 + ['upgrade'] * 15 + ['operation'] * 25 + ['ice'] * 25 + ['identity'] * 10
runnerclasses = ['shaper', 'neutral-runner', 'criminal', 'anarch']
runnercardtypes = ['event'] * 25 + ['program'] * 25 + ['resource'] * 25 + ['hardware'] * 25 + ['identity'] * 10
nr_classes = []
nr_types = []
for thisclass in corpclasses:
  for thistype in corpcardtypes:
    nr_classes.append(thisclass)
    nr_types.append(thistype)
for thisclass in runnerclasses:
  for thistype in runnercardtypes:
    nr_classes.append(thisclass)
    nr_types.append(thistype)


print(len(hs_fakes_data))
print(len(hs_classes))
print(len(hs_types))

print(len(nr_fakes_data))
print(len(nr_classes))
print(len(nr_types))

hs_fakes_data['class'] = hs_classes
hs_fakes_data['type'] = hs_types

nr_fakes_data['faction'] = nr_classes
nr_fakes_data['type'] = nr_types

print(nr_fakes_data.head())

2750
2750
2750
1015
1015
1015
   Unnamed: 0                                        description  \
0           0  The card named Squida is a 3 advancement agend...   
1           1  The card named "The Legacy of the Nation" is a...   
2           2  The card named The Edge of the Community II is...   
3           3  The card named Grain is a 5 advancement agenda...   
4           4  The card named Interpolation is a 3 advancemen...   

        faction    type  
0  neutral-corp  agenda  
1  neutral-corp  agenda  
2  neutral-corp  agenda  
3  neutral-corp  agenda  
4  neutral-corp  agenda  


In [49]:
## Save
nr_fakes_tagging_file = '/content/drive/My Drive/ds266proj/nr_fakes_withcols.csv'
hs_fakes_tagging_file = '/content/drive/My Drive/ds266proj/hs_fakes_withcols.csv'
hs_fakes_data.to_csv(hs_fakes_tagging_file)
nr_fakes_data.to_csv(nr_fakes_tagging_file)