In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import random

random.seed(0)

cards_df = pd.read_csv('cards.csv')
cards_df = cards_df.drop('name', axis=1)
cards_df.head()

Unnamed: 0,rarity,color,type,mana_cost,power,toughness,loyalty,text
0,uncommon,W,Creature,WWXXXXX,4.0,4.0,,<begin_text> <new_line> <sentence> first strik...
1,uncommon,W,Creature,WXXXX,3.0,3.0,,<begin_text> <new_line> flying </new_line> <ne...
2,common,W,Creature,WXXX,2.0,2.0,,<begin_text> <new_line> <sentence> flying this...
3,uncommon,W,Creature,WXXX,2.0,2.0,,"<begin_text> <new_line> <precolon> <color> W,X..."
4,common,W,Instant,W,,,,<begin_text> <new_line> <sentence> prevent the...


In [2]:
# Generate text dataset
from sklearn.model_selection import train_test_split
cards_train, cards_val = train_test_split(cards_df, test_size=0.2)
cards_train.to_csv('training.txt', index=False, header=False, sep=' ', quotechar='"')
cards_val.to_csv('val.txt', index=False, header=False, sep=' ', quotechar='"')

In [3]:
import torch
from io import StringIO

class CardDataset(torch.utils.data.Dataset):
    def __init__(self, content, tokenizer):
        self.cards = []
        if isinstance(content, str):
            with open(content, "r", encoding="utf-8") as file:
                self.cards = list([line.strip() for line in file.readlines()])
                file.close()
        elif isinstance(content, pd.DataFrame):
            self.cards = self.convert_df_to_strs(content)
        else:
            assert False, "content is not a str or pd.DataFrame"
        self.tokenizer = tokenizer
    
    def convert_df_to_strs(self, df):
        if 'name' in df.columns:
            df = df.drop('name', axis=0)
        buffer = StringIO()
        df.to_csv(buffer, index=False, header=False, sep=" ")
        buffer.seek(0)
        return [line.strip() for line in buffer.readlines()]
        
    def __len__(self):
        return len(self.cards)
    
    def __getitem__(self, idx):
        return self.cards[idx]

In [4]:
custom_tokens = ["\"<card>", "</card>\"", "<line>", "<precolon>", "</precolon>", "<color>", "</color>", "<bullet>"]

from tokenizers import ByteLevelBPETokenizer
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Lowercase, NFD, StripAccents, Sequence

bpe_tokenizer = ByteLevelBPETokenizer()

bpe_tokenizer.pre_tokenizer = Whitespace()
bpe_tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
bpe_tokenizer.add_tokens(custom_tokens)

bpe_tokenizer.train(['training.txt', 'val.txt'], min_frequency=2)

vocab_file, merges_file = tuple(bpe_tokenizer.save_model('.'))

In [5]:
from transformers import CTRLLMHeadModel, CTRLConfig
  
config = CTRLConfig(
    vocab_size=bpe_tokenizer.get_vocab_size(), 
    n_positions=256,
    n_ctx=256,
    n_embd=256,
    dff=128,
    n_layer=12,
    n_head=12
)

model = CTRLLMHeadModel(config)

model.num_parameters()

4833343

In [6]:
# initialize dataset
train_dataset = CardDataset(cards_train, bpe_tokenizer)
validation_dataset = CardDataset(cards_val, bpe_tokenizer)

In [7]:
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=bpe_tokenizer, mlm=False)

args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=1,   # batch size per device during training
    per_device_eval_batch_size=1,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                     # the instantiated 🤗 Transformers model to be trained
    args=args,                       # training arguments, defined above
    data_collator=data_collator,     # data collator
    train_dataset=train_dataset,     # training dataset
    eval_dataset=validation_dataset  # evaluation dataset
)

In [8]:
trainer.train()

wandb: Currently logged in as: jaymatthewsherman (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.10.27 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


AttributeError: 'str' object has no attribute 'size'