In [1]:
import csv
import numpy as np
from tqdm.notebook import tqdm

import datasets

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
!nvidia-smi

Thu Aug 24 06:19:46 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.182.03   Driver Version: 470.182.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:88:00.0 Off |                  N/A |
| 39%   38C    P0    53W / 250W |      0MiB / 11178MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
device = 'cuda'
model_name = 'global_step2280'
max_length = 2048
tokenizer_known = False

In [6]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained(model_name, return_dict=True).to(device)

In [7]:
model.state_dict()

OrderedDict([('gpt_neox.embed_in.weight',
              tensor([[ 0.0160, -0.0115,  0.0320,  ...,  0.0037, -0.0319, -0.0293],
                      [-0.0070, -0.0187, -0.0127,  ..., -0.0054, -0.0186,  0.0241],
                      [-0.0028,  0.0041,  0.0321,  ...,  0.0052, -0.0043, -0.0318],
                      ...,
                      [ 0.0018,  0.0016,  0.0051,  ...,  0.0093, -0.0042,  0.0026],
                      [ 0.0005,  0.0298,  0.0151,  ...,  0.0371, -0.0245, -0.0028],
                      [-0.0058,  0.0096, -0.0058,  ..., -0.0361, -0.0064, -0.0289]],
                     device='cuda:0')),
             ('gpt_neox.layers.0.input_layernorm.weight',
              tensor([1.0225, 1.0215, 1.0322,  ..., 1.0283, 1.0205, 1.0020], device='cuda:0')),
             ('gpt_neox.layers.0.input_layernorm.bias',
              tensor([ 0.0049, -0.0098, -0.0223,  ...,  0.0134, -0.0051,  0.0237],
                     device='cuda:0')),
             ('gpt_neox.layers.0.post_attention_layer

In [8]:
# are layernorms being loaded in correctly?
model.eval()
for k, v in model.state_dict().items():
    print('%.4f %s' % (v.sum(), k))

146.9542 gpt_neox.embed_in.weight
1037.7881 gpt_neox.layers.0.input_layernorm.weight
-0.2250 gpt_neox.layers.0.input_layernorm.bias
999.5356 gpt_neox.layers.0.post_attention_layernorm.weight
0.0468 gpt_neox.layers.0.post_attention_layernorm.bias
1.4622 gpt_neox.layers.0.attention.rotary_emb.inv_freq
2.7440 gpt_neox.layers.0.attention.query_key_value.weight
7.8361 gpt_neox.layers.0.attention.query_key_value.bias
-0.6809 gpt_neox.layers.0.attention.dense.weight
0.0034 gpt_neox.layers.0.attention.dense.bias
-16.9326 gpt_neox.layers.0.mlp.dense_h_to_4h.weight
-42.5715 gpt_neox.layers.0.mlp.dense_h_to_4h.bias
16.7741 gpt_neox.layers.0.mlp.dense_4h_to_h.weight
0.0034 gpt_neox.layers.0.mlp.dense_4h_to_h.bias
1031.0425 gpt_neox.layers.1.input_layernorm.weight
-0.0388 gpt_neox.layers.1.input_layernorm.bias
1055.9355 gpt_neox.layers.1.post_attention_layernorm.weight
0.3230 gpt_neox.layers.1.post_attention_layernorm.bias
1.4622 gpt_neox.layers.1.attention.rotary_emb.inv_freq
-7.4484 gpt_neox.laye

# Generation

In [9]:
model.eval()
output_ids = model.generate(max_length=100, do_sample=True)
output_ids

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


tensor([[50256,   796,   796,   796,   796,   796,   352,   301,  4289,   796,
           796,   796,   796,   796,   220,   198, 50256]], device='cuda:0')

In [10]:
tokenizer.decode(output_ids[0])

'<|endoftext|> = = = = = 1st century = = = = = \n<|endoftext|>'

# Loss

In [11]:
ds = datasets.load_dataset('wikitext', 'wikitext-103-raw-v1')

Found cached dataset wikitext (/home/johnny/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


  0%|          | 0/3 [00:00<?, ?it/s]

In [13]:
acc, n, tokens, words = 0, 0, 0, 0
cutoff = 2000
pbar = tqdm(ds['test'].select(range(0, cutoff)), total=cutoff)
for i in pbar:
    sentence = i['text']
    if sentence == '':
        continue
    
    words += len(sentence.split(' ')) 
    input_ids = tokenizer.encode(sentence, return_tensors='pt').to(device)
    
    model.eval()
    output = model.forward(input_ids, labels=input_ids)
    
    n += 1
    tokens += len(input_ids[0])
    acc += output.loss.item()
    pbar.set_description(f'Avg loss: {acc/n:.2f}, Token ppl: {np.exp(acc / tokens):.2f}, Word ppl: {np.exp(acc / words):.2f}')

  0%|          | 0/2000 [00:00<?, ?it/s]

# Accuracy

In [16]:
tokenizer.eos_token_id = 50256
tokenizer.pad_token_id = 50256
tokenizer.mask_token_id = 50256
tokenizer.decode([198])

'\n'

In [35]:
acc, n = 0, 0
pbar = tqdm(enumerate(ds['train']), total=len(ds['train']))
for i, example in pbar:
    sentence = example['text']
    if sentence == '':
        continue
    
    context = ''
    rollback = 5 - 1
    for j in range(i - rollback, i + 1):
        context += ds['train'][j]['text']
        
    input_ids = tokenizer.encode(context, return_tensors='pt').to(device)
    
    cutoff = len(input_ids[0]) - 2
    if len(input_ids[0]) <= cutoff+1:
        continue
    
    model.eval()
    output_ids = model.generate(input_ids[:,:cutoff], max_length=cutoff+1, \
                                do_sample=False, pad_token_id=tokenizer.eos_token_id)
    
    n += 1
    acc += 1 if output_ids[0,cutoff].item() == input_ids[0,cutoff].item() else 0
    
    pbar.set_description(f'Accuracy: {acc/n:.4f}')

  0%|          | 0/1801350 [00:00<?, ?it/s]

KeyboardInterrupt: 