In [1]:
import os
import transformers

from medusa.train.train_legacy import ModelArguments, DataArguments, TrainingArguments


model_args = ModelArguments(
	model_name_or_path=os.path.expanduser('~/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6'),
)

data_args = DataArguments(
	data_path=os.path.expanduser('~/data/ShareGPT_Vicuna_unfiltered/shareGPT-llama3-8B.json'),
	lazy_preprocess=True,
)

training_args = TrainingArguments(
	output_dir='./train/test',
	medusa_num_heads=5,
	medusa_num_layers=1,
)

config = transformers.AutoConfig.from_pretrained(
	model_args.model_name_or_path,
	cache_dir=training_args.cache_dir,
)
config

  from .autonotebook import tqdm as notebook_tqdm


LlamaConfig {
  "_name_or_path": "/home/camus/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 128256
}

In [2]:
orig_ctx_len = getattr(config, "max_position_embeddings", None)
orig_ctx_len, training_args.model_max_length

(8192, 2048)

In [3]:
config.use_cache = False

In [4]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    model_max_length=training_args.model_max_length,
    padding_side="right",
    use_fast=True,
)

tokenizer.pad_token, tokenizer.unk_token, tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


(None, None, '<|end_of_text|>')

In [5]:
tokenizer.pad_token = tokenizer.eos_token

In [6]:
tokenizer(["This is a test", "secondary"], padding=True)

{'input_ids': [[128000, 2028, 374, 264, 1296], [128000, 19217, 128001, 128001, 128001]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 0, 0, 0]]}

In [7]:
tokenizer.apply_chat_template([{"role": "user", "content": "This is a test"}])

[128000,
 128006,
 882,
 128007,
 271,
 2028,
 374,
 264,
 1296,
 128009,
 128006,
 78191,
 128007,
 271]

In [8]:
import torch


model = transformers.LlamaForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    config=config,
    cache_dir=training_args.cache_dir,
    torch_dtype=torch.bfloat16,
)

for param in model.base_model.parameters():
    param.requires_grad = False

model

Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.09it/s]


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head)

In [9]:
from medusa.model.medusa_model_legacy import MedusaModel


medusa_lm_head = MedusaModel(
    model,
    medusa_num_heads=training_args.medusa_num_heads,
    medusa_num_layers=training_args.medusa_num_layers,
    base_model_name_or_path=model_args.model_name_or_path,
)
medusa_lm_head

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


MedusaModel(
  (base_model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(128256, 4096)
      (layers): ModuleList(
        (0-31): 32 x LlamaDecoderLayer(
          (self_attn): LlamaSdpaAttention(
            (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
            (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
            (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
            (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
            (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRMSNorm()
          (post_attention_layernorm): Llam

In [10]:
from medusa.train.train_legacy import make_supervised_data_module


data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
data_module

{'train_dataset': <medusa.train.train_legacy.LazySupervisedDataset at 0x723f580643d0>,
 'eval_dataset': None}

In [11]:
from torch.utils.data import DataLoader, RandomSampler
from transformers.trainer_utils import seed_worker
from transformers.data.data_collator import DataCollatorWithPadding


default_collator = DataCollatorWithPadding(tokenizer)

dataloader_params = {
	"batch_size": 1,
	"collate_fn": default_collator,
	"num_workers": 1,
	"pin_memory": training_args.dataloader_pin_memory,
	"persistent_workers": training_args.dataloader_persistent_workers,
	"sampler": RandomSampler(data_module['train_dataset']),
	"drop_last": training_args.dataloader_drop_last,
	"worker_init_fn": seed_worker,
	"prefetch_factor": training_args.dataloader_prefetch_factor,
}

loader = DataLoader(data_module['train_dataset'], **dataloader_params)
loader

<torch.utils.data.dataloader.DataLoader at 0x723f47d3fc10>

In [12]:
it = iter(loader)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
batch = next(it)
batch

{'input_ids': tensor([[128000, 128000, 128006,  ...,    775,    341,    415]]), 'labels': tensor([[  -100, 128000, 128006,  ...,    775,    341,    415]]), 'attention_mask': tensor([[True, True, True,  ..., True, True, True]])}

In [14]:
batch['input_ids'].shape, batch['labels'].shape, batch['attention_mask'].shape

(torch.Size([1, 2048]), torch.Size([1, 2048]), torch.Size([1, 2048]))

In [15]:
print(tokenizer.decode(batch['input_ids'][0], skip_special_tokens=False, spaces_between_special_tokens=False, clean_up_tokenization_spaces=True))

<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

How would I generate a trusted certificate?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Generating a trusted certificate involves creating a public key certificate that can be trusted by a browser or other application. Here are the general steps:

**Option 1: Self-Signed Certificate**

You can generate a self-signed certificate using a tool like OpenSSL. This certificate will be trusted by default by your own application, but it will not be trusted by browsers or other applications.

1. Install OpenSSL on your system.
2. Run the following command to generate a private key and a self-signed certificate:
```
openssl req -x509 -newkey rsa:2048 -nodes -keyout example.com.key -out example.com.crt -subj "/C=US/ST=State/L=Locality/O=Organization/CN=example.com"
```
This command generates a 2048-bit RSA private key and a self-signed certificate with the subject "example.com".

**Option 2: CA-Signed Certific

In [16]:
batch['labels'].tolist()

[[-100,
  128000,
  128006,
  -100,
  128007,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  128009,
  128006,
  -100,
  128007,
  -100,
  74414,
  264,
  22542,
  16125,
  18065,
  6968,
  264,
  586,
  1401,
  16125,
  430,
  649,
  387,
  22542,
  555,
  264,
  7074,
  477,
  1023,
  3851,
  13,
  5810,
  527,
  279,
  4689,
  7504,
  1473,
  334,
  5454,
  220,
  16,
  25,
  10323,
  6354,
  1571,
  32502,
  57277,
  2675,
  649,
  7068,
  264,
  659,
  93653,
  16125,
  1701,
  264,
  5507,
  1093,
  66717,
  13,
  1115,
  16125,
  690,
  387,
  22542,
  555,
  1670,
  555,
  701,
  1866,
  3851,
  11,
  719,
  433,
  690,
  539,
  387,
  22542,
  555,
  33957,
  477,
  1023,
  8522,
  382,
  16,
  13,
  19796,
  66717,
  389,
  701,
  1887,
  627,
  17,
  13,
  6588,
  279,
  2768,
  3290,
  311,
  7068,
  264,
  879,
  1401,
  323,
  264,
  659,
  93653,
  16125,
  512,
  14196,
  4077,
  54712,
  4326,
  482,
  87,
  12448,
  482,
  943,
  798,
  6967

In [17]:
medusa_lm_head.medusa

5

In [18]:
medusa_lm_head.to('cuda')

MedusaModel(
  (base_model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(128256, 4096)
      (layers): ModuleList(
        (0-31): 32 x LlamaDecoderLayer(
          (self_attn): LlamaSdpaAttention(
            (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
            (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
            (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
            (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
            (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRMSNorm()
          (post_attention_layernorm): Llam

In [19]:
logits = medusa_lm_head(input_ids=batch["input_ids"].to('cuda'), attention_mask=batch["attention_mask"].to('cuda'))
logits

tensor([[[[ 6.9062,  8.8125, 13.0000,  ..., -4.4375, -4.4375, -4.4375],
          [ 6.9062,  8.8125, 13.0000,  ..., -4.4375, -4.4375, -4.4375],
          [ 4.1250,  2.5781,  4.3438,  ..., -7.7188, -7.7188, -7.7188],
          ...,
          [ 7.8750,  7.5625,  2.5000,  ..., -1.7578, -1.7578, -1.7578],
          [ 6.0312,  8.0000,  6.9375,  ..., -4.1250, -4.1250, -4.1250],
          [ 0.3574, -2.2344, -1.0469,  ...,  0.3867,  0.3867,  0.3867]]],


        [[[ 6.9062,  8.8125, 13.0000,  ..., -4.4375, -4.4375, -4.4375],
          [ 6.9062,  8.8125, 13.0000,  ..., -4.4375, -4.4375, -4.4375],
          [ 4.1250,  2.5781,  4.3438,  ..., -7.7188, -7.7188, -7.7188],
          ...,
          [ 7.9062,  7.5625,  2.4844,  ..., -1.7578, -1.7578, -1.7578],
          [ 6.0312,  7.9688,  6.9375,  ..., -4.1250, -4.1250, -4.1250],
          [ 0.3672, -2.2344, -1.0547,  ...,  0.3848,  0.3848,  0.3848]]],


        [[[ 6.9375,  8.8125, 13.0000,  ..., -4.4375, -4.4375, -4.4375],
          [ 6.9375,  8.812

In [20]:
logits.shape

torch.Size([5, 1, 2048, 128256])

In [31]:
medusa_logits = logits[0, :, :-2].contiguous()
medusa_logits.shape, medusa_logits

(torch.Size([1, 2046, 128256]),
 tensor([[[ 6.9062,  8.8125, 13.0000,  ..., -4.4375, -4.4375, -4.4375],
          [ 6.9062,  8.8125, 13.0000,  ..., -4.4375, -4.4375, -4.4375],
          [ 4.1250,  2.5781,  4.3438,  ..., -7.7188, -7.7188, -7.7188],
          ...,
          [ 3.2969,  0.1553, -0.8789,  ...,  0.4414,  0.4414,  0.4414],
          [ 6.8750,  6.8125,  4.3438,  ..., -2.5156, -2.5156, -2.5156],
          [ 7.8750,  7.5625,  2.5000,  ..., -1.7578, -1.7578, -1.7578]]],
        device='cuda:0', dtype=torch.bfloat16, grad_fn=<SliceBackward0>))

In [27]:
medusa_labels = batch['labels'][..., 2:].contiguous()
medusa_labels.shape

torch.Size([1, 2046])

In [32]:
medusa_logits = medusa_logits.view(-1, logits.shape[-1])
medusa_logits.shape, medusa_logits

(torch.Size([2046, 128256]),
 tensor([[ 6.9062,  8.8125, 13.0000,  ..., -4.4375, -4.4375, -4.4375],
         [ 6.9062,  8.8125, 13.0000,  ..., -4.4375, -4.4375, -4.4375],
         [ 4.1250,  2.5781,  4.3438,  ..., -7.7188, -7.7188, -7.7188],
         ...,
         [ 3.2969,  0.1553, -0.8789,  ...,  0.4414,  0.4414,  0.4414],
         [ 6.8750,  6.8125,  4.3438,  ..., -2.5156, -2.5156, -2.5156],
         [ 7.8750,  7.5625,  2.5000,  ..., -1.7578, -1.7578, -1.7578]],
        device='cuda:0', dtype=torch.bfloat16, grad_fn=<ViewBackward0>))

In [28]:
medusa_labels = medusa_labels.view(-1)
medusa_labels = medusa_labels.to(medusa_logits.device)
medusa_labels.shape, medusa_labels

(torch.Size([2046]),
 tensor([128006,   -100, 128007,  ...,    775,    341,    415], device='cuda:0'))

In [29]:
from torch.nn import CrossEntropyLoss


loss_fct = CrossEntropyLoss()
loss_i = loss_fct(medusa_logits, medusa_labels)
loss_i

tensor(13., device='cuda:0', dtype=torch.bfloat16, grad_fn=<NllLossBackward0>)

In [33]:
from transformers.trainer_pt_utils import LabelSmoother

LabelSmoother.ignore_index

-100