In [70]:
import csv
import numpy as np
from tqdm.notebook import tqdm

import datasets

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [72]:
device = 'cpu'
model_name = 'global_step1140'
max_length = 2048
tokenizer_known = False

In [73]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained(model_name, return_dict=True).to(device)

In [74]:
model.state_dict()

OrderedDict([('gpt_neox.embed_in.weight',
              tensor([[ 0.0160, -0.0115,  0.0320,  ...,  0.0037, -0.0319, -0.0293],
                      [-0.0070, -0.0187, -0.0127,  ..., -0.0054, -0.0186,  0.0241],
                      [-0.0028,  0.0041,  0.0321,  ...,  0.0052, -0.0043, -0.0318],
                      ...,
                      [ 0.0018,  0.0016,  0.0051,  ...,  0.0093, -0.0042,  0.0026],
                      [ 0.0005,  0.0298,  0.0151,  ...,  0.0371, -0.0245, -0.0028],
                      [-0.0058,  0.0096, -0.0058,  ..., -0.0361, -0.0064, -0.0289]])),
             ('gpt_neox.layers.0.input_layernorm.weight',
              tensor([1.0078, 1.0068, 0.9946,  ..., 1.0049, 1.0059, 0.9966])),
             ('gpt_neox.layers.0.input_layernorm.bias',
              tensor([ 0.0003, -0.0017, -0.0021,  ..., -0.0007, -0.0025,  0.0034])),
             ('gpt_neox.layers.0.post_attention_layernorm.weight',
              tensor([1.0010, 0.9932, 1.0020,  ..., 1.0088, 0.9941, 1.0146])),


In [75]:
# are layernorms being loaded in correctly?
model.eval()
for k, v in model.state_dict().items():
    print('%.4f %s' % (v.sum(), k))

210.3036 gpt_neox.embed_in.weight
1020.7588 gpt_neox.layers.0.input_layernorm.weight
-0.0048 gpt_neox.layers.0.input_layernorm.bias
1023.8032 gpt_neox.layers.0.post_attention_layernorm.weight
0.0367 gpt_neox.layers.0.post_attention_layernorm.bias
1.4622 gpt_neox.layers.0.attention.rotary_emb.inv_freq
1.0120 gpt_neox.layers.0.attention.query_key_value.weight
2.2219 gpt_neox.layers.0.attention.query_key_value.bias
0.2668 gpt_neox.layers.0.attention.dense.weight
0.0010 gpt_neox.layers.0.attention.dense.bias
1.0977 gpt_neox.layers.0.mlp.dense_h_to_4h.weight
-38.1030 gpt_neox.layers.0.mlp.dense_h_to_4h.bias
9.7015 gpt_neox.layers.0.mlp.dense_4h_to_h.weight
0.0010 gpt_neox.layers.0.mlp.dense_4h_to_h.bias
1010.5244 gpt_neox.layers.1.input_layernorm.weight
0.0073 gpt_neox.layers.1.input_layernorm.bias
1026.2705 gpt_neox.layers.1.post_attention_layernorm.weight
0.0618 gpt_neox.layers.1.post_attention_layernorm.bias
1.4622 gpt_neox.layers.1.attention.rotary_emb.inv_freq
-6.7809 gpt_neox.layers.1

# Generation

In [6]:
model.eval()
output_ids = model.generate(max_length=100, do_sample=True)
output_ids

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


tensor([[50256,    19,    13,    23,    13,    17,    13,    20,   921,   460,
           651,   257,  4866,   286,   262,  3452,  2196,   286,   428,  7552,
           543,  3407,   262,  1266,  1104,   329,   534,  4410,   994,    25,
          2638,  1378, 15002,    13,   260,  1073,   548,  4215,    13,   785,
            14, 15002,    82,    13,  6494,   198,   198,    19,    13,    23,
            13,    17,    13,    21,   383,  3452,  2196,   286,   428,  7552,
          3407,   262,   749,  5434,    12, 42624,   329,  4410,   625,   642,
            13,    15,    13,    18, 15885,   198,   198,    19,    13,    23,
            13,    17,    13,    22,   921,   460,   651,   262,  3452,  2196,
           286,   428,  7552,   543,  3407,   477,   262,   749,  3033,   329]])

In [7]:
tokenizer.decode(output_ids[0])

'<|endoftext|>4.8.2.5 You can get a copy of the latest version of this extension which includes the best support for your devices here: http://download.recoverysoft.com/downloads.html\n\n4.8.2.6 The latest version of this extension includes the most bug-fixes for devices over 5.0.3.*\n\n4.8.2.7 You can get the latest version of this extension which includes all the most features for'

# Loss

In [12]:
ds = datasets.load_dataset('wikitext', 'wikitext-103-raw-v1')

Found cached dataset wikitext (/home/johnny/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


  0%|          | 0/3 [00:00<?, ?it/s]

In [92]:
acc, n, tokens, words = 0, 0, 0, 0
cutoff = 2000
pbar = tqdm(ds['test'].select(range(0, cutoff)), total=cutoff)
for i in pbar:
    sentence = i['text']
    if sentence == '':
        continue
    
    words += len(sentence.split(' ')) 
    input_ids = tokenizer.encode(sentence, return_tensors='pt')
    
    model.eval()
    output = model.forward(input_ids, labels=input_ids)
    
    n += 1
    tokens += len(input_ids[0])
    acc += output.loss.item()
    pbar.set_description(f'Avg loss: {acc/n:.2f}, Token ppl: {np.exp(acc / tokens):.2f}, Word ppl: {np.exp(acc / words):.2f}')

  0%|          | 0/2000 [00:00<?, ?it/s]

# Accuracy

In [76]:
tokenizer.eos_token_id = 50256
tokenizer.pad_token_id = 50256
tokenizer.decode([198])

'\n'

In [None]:
acc, n = 0, 0
pbar = tqdm(ds['train'], total=len(ds['train']))
for i in pbar:
    sentence = i['text']
    if sentence == '':
        continue
    
    input_ids = tokenizer.encode(sentence, return_tensors='pt')
    
    cutoff = 100
    if len(input_ids[0]) <= cutoff+1:
        continue
    
    model.eval()
    output_ids = model.generate(input_ids[:,:cutoff], max_length=cutoff+1, do_sample=False)
    
    n += 1
    acc += 1 if output_ids[0,cutoff].item() == input_ids[0,cutoff].item() else 0
    
    pbar.set_description(f'Accuracy: {acc/n:.4f}')