In [1]:
#!pip install transformers
from transformers import pipeline

# Text-generation pipeline around the "gpt2-xl" checkpoint, used later as the baseline generator.
pipe = pipeline(task="text-generation", model="gpt2-xl")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import gc

# Imported locally: the notebook's main `import torch` lives in a later cell,
# so without this line the cell raises NameError on a fresh kernel.
import torch

# Release cached CUDA blocks held by PyTorch's allocator, then run the Python
# garbage collector (its return value is what the cell displays).
torch.cuda.empty_cache()
gc.collect()

0

In [18]:
import torch
import torch.nn as nn

# Preferred GPU for this notebook. Fall back to the CPU when the machine exposes
# fewer than GPU_INDEX + 1 CUDA devices, so later `.to(device)` calls still work.
GPU_INDEX = 3
device = torch.device(f"cuda:{GPU_INDEX}" if torch.cuda.device_count() > GPU_INDEX else "cpu")

device

device(type='cuda', index=3)

In [19]:
from hw_lm.tokenizer import Tokenizer
from hw_lm.model import HuYaLM

# Tokenizer built from the "lm.model" file.
# NOTE(review): presumably a SentencePiece model, given the `processor.bos_id()`
# style calls later in this notebook - confirm.
mytokenizer = Tokenizer("lm.model")

# Architecture hyper-parameters; the checkpoint loaded below has to match them
# for `load_state_dict` to succeed.
mymodel = HuYaLM(
    vocab_size=5000,
    embed_dim=512,
    feedforward_dim=2048,
    num_heads=8,
    num_layers=8,
    max_len=320
).to(device)

# map_location places the checkpoint tensors on `device` directly. Without it
# torch.load restores them to the device they were saved from, which may be a
# different (or missing) GPU.
checkpoint = torch.load("lm_epochs_21.pth", map_location=device)
mymodel.load_state_dict(checkpoint["state_dict"])
# Inference mode; the trailing semicolon suppresses the module repr in the cell output.
mymodel.eval();

In [4]:
# Sampling settings forwarded to `mymodel.inference` for the prompt below.
sampling_kwargs = dict(max_length=200, temperature=1.3, mode="nucleus", p=0.9)
text = mymodel.inference("Once upon a time", mytokenizer, device, **sampling_kwargs)

# Show the first returned text with one "."-separated fragment per line.
print("\n".join(text[0].split(".")))

Once upon a time Aiza killed all her friends
 One day, they decided to play a game
 They all went to the park, and when they arrived, the kit was empty
 There was nothing to do! Aiza started to cry
 "Why are you crying?" asked a voice
 Aiza looked up and saw a man smiling at her
 He said, "I'm so sorry
 I didn't know you were going to do something to make you happy"
 Aiza's friends were very compassionate, and they said, "It's ok! We were just playing a game
" The man smiled and said, "Well, I'm sorry I was being so mean
 I have a special surprise for you
" He then reached into his pocket and pulled out a shiny object
 Aiza looked at him and asked, "What is it?" The man said, "It's a toy car
 I put it in my pockets
 You can play with it if you want
"


In [42]:
# Baseline sample: continue the same prompt with the "gpt2-xl" pipeline, passing
# no generation arguments (so the pipeline's own defaults apply).
pipe("Once upon a time")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Once upon a time there was a girl that was like me, and my father wanted me to marry a rich and noble family because there was a chance I could become a powerful sorcerer and save the kingdom, but because he wanted to get me married to'}]

In [20]:
from torcheval.metrics.functional import perplexity
from torch.nn.utils.rnn import pad_sequence

In [21]:
def add_bos_eos(input_ids):
    """Return ``input_ids`` wrapped in the tokenizer's BOS and EOS ids.

    Args:
        input_ids: 1-D tensor of token ids (it is concatenated with two
            one-element tensors).

    Returns:
        A new 1-D tensor: ``[bos_id, *input_ids, eos_id]``.
    """
    sp = mytokenizer.processor
    bos = torch.tensor([sp.bos_id()])
    eos = torch.tensor([sp.eos_id()])
    return torch.cat((bos, input_ids, eos))

In [4]:
import json 
from tqdm.auto import tqdm
# Decode the file as UTF-8 explicitly, not via the platform's default
# encoding, so non-ASCII story text loads the same on every machine.
with open("json_data/data00.json", encoding="utf-8") as f:
    records = json.load(f)

# Keep only the whitespace-stripped "story" field of every record.
data = [record["story"].strip() for record in records]

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
def mycollate(batch):
    """Turn a list of token-id sequences into padded model inputs.

    Each sequence is cut to its first 256 ids, wrapped by ``add_bos_eos``,
    padded with 0 up to the longest sequence in the batch and moved to
    ``device``.

    Args:
        batch: iterable of token-id sequences (one per text).

    Returns:
        ``(input_ids, padding_mask)``: the padded int64 id tensor with the batch
        dimension first, and a boolean tensor of the same shape that is True
        where an id equals the tokenizer's pad id.
    """
    sequences = []
    for item in batch:
        truncated = torch.tensor(item[:256], dtype=torch.int64)
        sequences.append(add_bos_eos(truncated))

    # NOTE(review): padding uses the literal 0 while the mask compares against
    # pad_id(); the two agree only if pad_id() == 0 - confirm.
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    input_ids = padded.to(device)
    padding_mask = input_ids == mytokenizer.processor.pad_id()
    return input_ids, padding_mask

In [23]:
@torch.no_grad()
def mycalculate(data, bs=1):
    """Print the mean per-batch perplexity of ``mymodel`` over ``data``.

    Texts are encoded with ``mytokenizer``, collated by ``mycollate`` and fed to
    ``mymodel``. Logits at position ``t`` are scored against the token at
    ``t + 1``, skipping targets equal to 0 (the value ``mycollate`` pads with).
    The reported number is the average of per-batch perplexities weighted by the
    number of texts in each batch; it is not a corpus-level, token-weighted
    perplexity.

    Args:
        data: sequence of raw texts (must support ``len`` and slicing).
        bs: number of texts per forward pass. A final, smaller batch is
            evaluated too, so no text is dropped when ``len(data)`` is not a
            multiple of ``bs``.

    Raises:
        ValueError: if ``data`` is empty or ``bs`` is smaller than 1.
    """
    if bs < 1:
        raise ValueError("bs must be a positive integer")
    if len(data) == 0:
        raise ValueError("data must contain at least one text")

    mymetric = 0.0

    for start in tqdm(range(0, len(data), bs)):
        texts = data[start:start + bs]
        encoded_data = mytokenizer.encode(texts)
        batch = mycollate(encoded_data)
        mylogits = mymodel(*batch)["logits"]
        # Shift by one: the prediction at position t is compared with token t + 1.
        metric = perplexity(mylogits[:, :-1, :], batch[0][:, 1:], ignore_index=0)
        # Weight by the real batch size so a short last batch is not over-counted.
        mymetric += metric * len(texts)

    mymetric /= len(data)

    print("MINE", mymetric)
    
# Score only the first 500 stories of the loaded file.
mycalculate(data[:500])

100%|██████████| 500/500 [00:02<00:00, 169.79it/s]

MINE tensor(2.9433, device='cuda:3', dtype=torch.float64)





In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer for the "gpt2-xl" checkpoint; its pad token is set to the EOS token
# so that padded batches can be built.
tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
tokenizer.pad_token = tokenizer.eos_token

# Baseline language model in eval mode.
# NOTE(review): the model is not moved to `device` here, so it stays wherever
# `from_pretrained` places it - confirm this is intended.
model = AutoModelForCausalLM.from_pretrained("gpt2-xl").eval()

In [16]:
@torch.no_grad()
def calculate(data, bs=1):
    """Print the mean per-batch perplexity of the baseline ``model`` over ``data``.

    Texts are tokenized with ``tokenizer`` (truncated and padded to 256 tokens)
    and fed to ``model``. Logits at position ``t`` are scored against the token
    at ``t + 1``, skipping targets equal to ``tokenizer.eos_token_id``. The
    reported number is the average of per-batch perplexities weighted by the
    number of texts in each batch; it is not a corpus-level, token-weighted
    perplexity.

    NOTE(review): this value is per token of ``tokenizer``; it is presumably not
    directly comparable with a perplexity measured under a different
    tokenizer - confirm before drawing conclusions.

    Args:
        data: sequence of raw texts (must support ``len`` and slicing).
        bs: number of texts per forward pass. A final, smaller batch is
            evaluated too, so no text is dropped when ``len(data)`` is not a
            multiple of ``bs``.

    Raises:
        ValueError: if ``data`` is empty or ``bs`` is smaller than 1.
    """
    if bs < 1:
        raise ValueError("bs must be a positive integer")
    if len(data) == 0:
        raise ValueError("data must contain at least one text")

    # Follow the model to whatever device it currently lives on, so the function
    # keeps working if the model is moved to a GPU.
    model_device = next(model.parameters()).device
    mymetric = 0.0

    for start in tqdm(range(0, len(data), bs)):
        texts = data[start:start + bs]
        batch = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=256,
            return_tensors="pt",
        ).to(model_device)
        logits = model(**batch).logits
        # Shift by one: the prediction at position t is compared with token t + 1.
        metric = perplexity(
            logits[:, :-1, :],
            batch["input_ids"][:, 1:],
            ignore_index=tokenizer.eos_token_id,
        )
        # Weight by the real batch size so a short last batch is not over-counted.
        mymetric += metric * len(texts)

    mymetric /= len(data)

    print("THEIR", mymetric)
    
# Score only the first 500 stories of the loaded file.
calculate(data[:500])

100%|██████████| 500/500 [12:50<00:00,  1.54s/it]

THEIR tensor(6.7053, dtype=torch.float64)



