In [5]:
import torch
import numpy as np
from transformers.pytorch_utils import Conv1D

from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

In [2]:
tokenizer_file = "./tokenizers/tinystories.json"
model_dir = "./models/8c2cf5cf9d5840ec88304907a1e91f72/0"

In [3]:
tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file)
tokenizer.bos_token = "<|endoftext|>"
tokenizer.eos_token = "<|endoftext|>"

model = GPT2LMHeadModel.from_pretrained(model_dir, local_files_only=True)

In [22]:
decay = set()
no_decay = set()
whitelist_weight_modules = (torch.nn.Linear, Conv1D)
blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)

for mn, m in model.named_modules():
    for pn, p in m.named_parameters():
        fpn = '%s.%s' % (mn, pn) if mn else pn
        if pn.endswith('bias'):
            no_decay.add(fpn)
        elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
            decay.add(fpn)
        elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
            no_decay.add(fpn)

In [24]:
print("lm_head.weight" in decay)

True


In [9]:
for pn, p in model.named_parameters():
    p_numpy = p.detach().cpu().numpy()
    head_numpy = model.lm_head.weight.detach().cpu().numpy()
    if p_numpy.shape == head_numpy.shape:
        if np.allclose(p_numpy, head_numpy):
            print(pn)

transformer.wte.weight


In [8]:
fpn_set = set()

for mn, m in model.named_modules():
    for pn, p in m.named_parameters():
        fpn = '%s.%s' % (mn, pn) if mn else pn
        fpn_set.add(fpn)

In [14]:
list(fpn_set)[:5]

['transformer.h.2.mlp.c_proj.bias',
 'transformer.h.6.mlp.c_fc.bias',
 'transformer.h.4.mlp.c_fc.weight',
 'transformer.h.0.ln_2.bias',
 'transformer.h.1.mlp.c_fc.weight']

In [16]:
pn_set = set([pn for pn, p in model.named_parameters()])
list(pn_set)[:5]

['transformer.h.2.mlp.c_proj.bias',
 'transformer.h.6.mlp.c_fc.bias',
 'transformer.h.4.mlp.c_fc.weight',
 'transformer.h.0.ln_2.bias',
 'transformer.h.1.mlp.c_fc.weight']

In [18]:
out = fpn_set - pn_set
out

{'lm_head.weight'}

In [19]:
len(list(model.parameters()))

100

In [25]:
type(model.lm_head.weight)

torch.nn.parameter.Parameter

In [4]:
trainable_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(trainable_parameters)

total_parameters = sum(p.numel() for p in model.parameters())
print(total_parameters)

1105536
1105536


In [5]:
model.generation_config

GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 0,
  "eos_token_id": 0,
  "transformers_version": "4.32.1"
}

In [46]:
text_input = "Once upon a time,"

In [47]:
token_input = tokenizer(text_input, return_tensors="pt")
token_output = model.generate(**token_input, top_p=0.9, max_length=80, temperature=1.0)
text_output = tokenizer.batch_decode(token_output, skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


In [48]:
text_output

[' Once upon a time,,,,,,,']

In [22]:
for mn, m in model.named_modules():
    print(mn)


transformer
transformer.wte
transformer.wpe
transformer.drop
transformer.h
transformer.h.0
transformer.h.0.ln_1
transformer.h.0.attn
transformer.h.0.attn.c_attn
transformer.h.0.attn.c_proj
transformer.h.0.attn.attn_dropout
transformer.h.0.attn.resid_dropout
transformer.h.0.ln_2
transformer.h.0.mlp
transformer.h.0.mlp.c_fc
transformer.h.0.mlp.c_proj
transformer.h.0.mlp.act
transformer.h.0.mlp.dropout
transformer.h.1
transformer.h.1.ln_1
transformer.h.1.attn
transformer.h.1.attn.c_attn
transformer.h.1.attn.c_proj
transformer.h.1.attn.attn_dropout
transformer.h.1.attn.resid_dropout
transformer.h.1.ln_2
transformer.h.1.mlp
transformer.h.1.mlp.c_fc
transformer.h.1.mlp.c_proj
transformer.h.1.mlp.act
transformer.h.1.mlp.dropout
transformer.h.2
transformer.h.2.ln_1
transformer.h.2.attn
transformer.h.2.attn.c_attn
transformer.h.2.attn.c_proj
transformer.h.2.attn.attn_dropout
transformer.h.2.attn.resid_dropout
transformer.h.2.ln_2
transformer.h.2.mlp
transformer.h.2.mlp.c_fc
transformer.h.2.mlp