In [66]:
import inspect
import os

import numpy as np
import requests
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollator,
    DataCollatorForLanguageModeling,
    PreTrainedModel,
    PreTrainedTokenizerBase,
    Trainer,
)
from trl import SFTTrainer, SFTConfig

# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU.
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    # Product name of GPU 0 (not the literal string "cuda").
    device_type = torch.cuda.get_device_name(0)
else:
    device = torch.device("cpu")
    device_type = "cpu"


# Load the GPT-2 tokenizer and the pretrained causal-LM weights, placing the
# model on the device selected above.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
# Sequence-length cap: the tokenizer's own limit, but never more than 1024.
max_seq_length = min(1024, tokenizer.model_max_length)

def get_shakespeare_dataset():
    """Return the tiny-Shakespeare corpus as GPT-2 token ids, split 80/20.

    Files are cached under ``<cwd>/datasets/shakespeare``:

    * ``raw.txt``   - the downloaded text.
    * ``train.npy`` - token ids of the first 80% of the characters.
    * ``test.npy``  - token ids of the remaining 20%.

    Despite the ``.npy`` extension the two token files are raw ``uint16``
    buffers (no NumPy header); read them with ``np.memmap``, not ``np.load``.

    Nothing is downloaded and no tokenizer is loaded when both token files
    already exist.

    Returns:
        dict: ``{"train": np.memmap, "val": np.memmap}``, both read-only
        1-D ``uint16`` arrays of token ids.

    Raises:
        requests.HTTPError: if the corpus download returns an error status.
    """
    DATA_PATH = os.path.join(os.getcwd(), "datasets", "shakespeare")
    raw_path = os.path.join(DATA_PATH, "raw.txt")
    train_path = os.path.join(DATA_PATH, "train.npy")
    test_path = os.path.join(DATA_PATH, "test.npy")

    if not (os.path.exists(train_path) and os.path.exists(test_path)):
        # Key the download on the raw file itself rather than on the directory,
        # so an existing-but-empty directory does not lead to a missing file.
        if not os.path.exists(raw_path):
            print("Downloading raw Shakespeare texts")
            url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
            os.makedirs(DATA_PATH, exist_ok=True)
            response = requests.get(url, timeout=60)
            # Fail loudly instead of caching an HTTP error page as the corpus.
            response.raise_for_status()
            with open(raw_path, "w", encoding="utf8") as f:
                f.write(response.text)

        print("Tokenizing Shakespeare texts")
        with open(raw_path, encoding="utf8") as f:
            text = f.read()

        # Loaded only when tokenization is actually needed. Encoding a whole
        # split at once makes the tokenizer warn about exceeding the model's
        # maximum sequence length; the ids are only stored here, never fed to
        # the model as one sequence.
        encode = AutoTokenizer.from_pretrained("gpt2", use_fast=True).encode

        # The split point is a character offset, not a token offset.
        i = int(0.8 * len(text))
        for path, chunk in ((train_path, text[:i]), (test_path, text[i:])):
            # tofile() writes the same raw native-endian buffer a "w+" memmap
            # would, and closes the file before returning.
            np.array(encode(chunk), dtype=np.uint16).tofile(path)

    # Both token files exist at this point; map them read-only.
    return {"train": np.memmap(train_path, dtype=np.uint16, mode="r"),
            "val": np.memmap(test_path, dtype=np.uint16, mode="r")}

# Build the tokenized Shakespeare splits (or load them from the on-disk cache).
# `dataset` is a plain dict with "train" and "val" keys, each a uint16 memmap
# of token ids.
dataset = get_shakespeare_dataset()
# NOTE(review): disabled SFTTrainer experiment kept for reference. It asks for
# a "text" field, whereas `dataset` above holds token-id arrays only, so it
# presumably cannot run against this data as written - confirm before reviving.
# sft_config = SFTConfig(
#     dataset_text_field="text",
#     max_seq_length=512,
#     output_dir="/tmp",
# )
# trainer = SFTTrainer(
#     "gpt2",
#     train_dataset=dataset,
#     args=sft_config,
# )
# trainer.train()

Downloading raw Shakespeare texts
Tokenizing Shakespeare texts


Token indices sequence length is longer than the specified maximum sequence length for this model (267688 > 1024). Running this sequence through the model will result in indexing errors


In [72]:
# Batching setup for later cells.
dataset_batch_size = 4
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # no masked-LM corruption of the inputs
)

# Quick look at the data: size of each split and a handful of raw token ids.
print(f"Num training tokens: {len(dataset['train'])}")
print(f"Num validation tokens: {len(dataset['val'])}")
print(dataset['train'][10:20])


Num training tokens: 267688
Num validation tokens: 70338
[3285  502 2740   13  198  198 3237   25  198 5248]


In [60]:
# Smoke test: sample a short continuation of a fixed prompt and print it.
prompt = "Hello. Ishiguro is"
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)
gen_tokens = model.generate(
    input_ids,
    max_length=20,
    do_sample=True,  # stochastic sampling: output differs from run to run
    temperature=0.9,
    pad_token_id=tokenizer.eos_token_id,  # use the EOS id as the padding id
)
# Only one sequence was generated, so decode the first (and only) row.
gen_text = tokenizer.decode(gen_tokens[0])
print(gen_text)

Hello. Ishiguro is back! If you need someone on the ship, please let me know


In [None]:
# Optimizer hyperparameters. `args` is a plain dict, so values are read back
# with [] indexing (attribute access such as `args.lr` raises AttributeError).
args = {}
args["lr"] = 1e-3
args["beta1"] = 0.9
args["beta2"] = 0.999
args["weight_decay"] = 0.01  # previously undefined; 0.01 is torch.optim.AdamW's own default

# Use the fused AdamW kernel when running on CUDA and this torch build exposes
# the `fused` argument. The check goes through `device.type` because
# `device_type` holds the GPU's product name and therefore never equals 'cuda'.
use_fused = (device.type == 'cuda') and ('fused' in inspect.signature(torch.optim.AdamW).parameters)
extra_args = dict(fused=True) if use_fused else dict()
# AdamW needs the parameters to optimize as its first argument.
opt = torch.optim.AdamW(model.parameters(), lr=args["lr"], betas=(args["beta1"], args["beta2"]),
                        weight_decay=args["weight_decay"], **extra_args)