In [1]:
# Expose only physical GPU 7 to this notebook's process; the model cell below
# then addresses it as "cuda:0". Run this cell before anything initialises CUDA.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

In [14]:
import datasets

# Request the "train" split directly so `dataset` is bound once, to the split
# itself, instead of first holding the dict of all splits.
dataset = datasets.load_dataset("lvwerra/needle-llama3-16x512", split="train")


In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights in bfloat16 onto the first visible GPU and select the
# FlashAttention-2 attention implementation.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": "cuda:0"},
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode; `eval()` returns the module, so as the last
# expression of the cell it also displays the architecture.
model.eval()

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:27<00:00,  6.83s/it]


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaFlashAttention2(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_hea

# dataset

In [3]:
# Tokenize every prompt into a single left-padded batch.
# The tokenizer expects a list of strings, not the `Dataset` object itself, so
# the `haystack_text` column is extracted and materialised as a plain list.
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
tokenizer.padding_side = "left"  # keep the end of every prompt at the right edge
inputs = tokenizer(list(dataset["haystack_text"]), padding=True, return_tensors="pt", truncation=False)
inputs


{'input_ids': tensor([[128001, 128001, 128001,  ...,   1522,   1401,    374],
        [128001, 128001, 128001,  ...,   1522,   1401,    374],
        [128001, 128001, 128001,  ...,   1522,   1401,    374],
        ...,
        [128000,   3947,    374,  ...,   1522,   1401,    374],
        [128000,   3947,    374,  ...,   1522,   1401,    374],
        [128000,   3947,    374,  ...,   1522,   1401,    374]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [4]:
# Number of rows in the train split.
len(dataset)

1408

In [10]:
# Greedy-decode up to 8 new tokens for every tokenized prompt, in mini-batches,
# and collect the decoded continuations (prompt stripped) in `generated`.
import torch
from tqdm import tqdm

generated = []
batch_size = 8

# Take the loop bound from the tokenized tensors themselves so it always
# matches what is being sliced, whatever object the prompts came from.
num_prompts = inputs["input_ids"].shape[0]

for i in tqdm(range(0, num_prompts, batch_size)):
    batch = {k: v[i : i + batch_size].to(model.device)
              for k, v in inputs.items()}
    # Width of the (padded) prompt; everything after it in `out` is newly generated.
    prompt_len = batch["input_ids"].shape[-1]
    with torch.no_grad():
        out = model.generate(
            **batch,
            max_new_tokens=8,
            num_return_sequences=1,
            do_sample=False,
            # Passing the pad id explicitly avoids the per-batch
            # "Setting `pad_token_id` to `eos_token_id`" message.
            pad_token_id=tokenizer.pad_token_id,
        )
    generated.extend(tokenizer.batch_decode(out[:, prompt_len:], skip_special_tokens=True))

  0%|          | 0/176 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 1/176 [00:02<07:09,  2.46s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 2/176 [00:04<07:05,  2.45s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 3/176 [00:07<07:04,  2.45s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 4/176 [00:09<07:02,  2.46s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 5/176 [00:12<07:00,  2.46s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 6/176 [00:14<06:58,  2.46s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▍         | 7/176 [00:17<06:55,  2.46s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  5%|▍         | 8/176 [00:19<06:53,  2.46s/it]S

In [8]:
# # clear memory
# torch.cuda.empty_cache()
# torch.cuda.memory_summary(device=None, abbreviated=False)

In [9]:
# Preview the first few continuations instead of dumping the whole list.
generated[:8]

[' 8941. Remember it. ',
 ' 857. Remember it. 857',
 ' 3984.',
 ' 3319. Remember it. ',
 ' 2260. Remember it. ',
 ' 2770. Remember it. ',
 ' 3196. Remember it. ',
 ' 4875. Remember it. ']

In [26]:
# Write the generated continuations to a one-column CSV file (no index column).
import pandas as pd

df = pd.DataFrame({"hf_llama3_8b_out": generated})
df.to_csv("hf_llama3_8b_out.csv", index=False)

In [11]:
# Also persist the raw `generated` list with joblib.
# `joblib.dump` returns the list of file names it wrote, which the cell displays.
import joblib
joblib.dump(generated, "generated.pkl")

['generated.pkl']

In [18]:
# Attach the model outputs to the dataset as a new column.
output_column = "hf_llama3_8b_out"

# Drop any copy of the column left over from an earlier run of this cell so
# that re-running it does not try to add a duplicate column.
if output_column in dataset.column_names:
    dataset = dataset.remove_columns(output_column)
dataset = dataset.add_column(output_column, generated)

dataset

Dataset({
    features: ['seed', 'needle', 'position', 'context_length', 'haystack_text', 'needle_token_position', 'hf_llama3_8b_out'],
    num_rows: 1408
})

In [21]:
# Persist the dataset to the local path "hf_llama3_8b_out".
dataset.save_to_disk("hf_llama3_8b_out")

Saving the dataset (1/1 shards): 100%|██████████| 1408/1408 [00:00<00:00, 51245.48 examples/s]


In [22]:
# Reload the copy saved under "hf_llama3_8b_out" and display its summary
# (features and row count).
dataset = datasets.load_from_disk("hf_llama3_8b_out")
dataset

Dataset({
    features: ['seed', 'needle', 'position', 'context_length', 'haystack_text', 'needle_token_position', 'hf_llama3_8b_out'],
    num_rows: 1408
})

In [24]:
# First five rows; slicing returns a dict mapping each column name to a list of values.
dataset[:5]

{'seed': [0, 1, 2, 3, 4],
 'needle': [8941, 857, 3984, 3319, 2260],
 'position': [0.0, 0.0, 0.0, 0.0, 0.0],
 'context_length': [512, 512, 512, 512, 512],
 'haystack_text': ['There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there. The pass key is 8941. Remember it. 8941 is the pass key. The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again. The mountain is high. The snow is cold. The air is thin. Upward we climb. From base to peak. The forest is dense. The trees are tall. The path is winding. Into the woods. Out and back. The river is wide. The current is strong. The banks are muddy. Down the stream. Back and forth. The ocean is vast. The waves are calming. The sand is warm. Onward we sail. From coast to coast. Here we go. There and back again. The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again. The mountain is high. The sn

In [28]:
# Last four rows, in the same column-name -> list-of-values layout.
dataset[-4:]

{'seed': [4, 5, 6, 7],
 'needle': [2167, 5666, 9996, 9871],
 'position': [1.0, 1.0, 1.0, 1.0],
 'context_length': [8192, 8192, 8192, 8192],
 'haystack_text': ['There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there. The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again. The mountain is high. The snow is cold. The air is thin. Upward we climb. From base to peak. The forest is dense. The trees are tall. The path is winding. Into the woods. Out and back. The river is wide. The current is strong. The banks are muddy. Down the stream. Back and forth. The ocean is vast. The waves are calming. The sand is warm. Onward we sail. From coast to coast. Here we go. There and back again. The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again. The mountain is high. The snow is cold. The air is thin. Upward we climb. From base to peak. The fo

# inputs

In [5]:
# Keep only the first two entries of `dummy_inputs`; they are passed to the
# tokenizer below, so presumably they are prompt strings.
# NOTE(review): `constantsaa` is a module not visible here — confirm the name is intended.
from constantsaa import dummy_inputs
dummy_inputs = dummy_inputs[:2]


In [6]:
# Tokenize the dummy prompts into one left-padded batch, reusing EOS as the
# padding token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(
    dummy_inputs,
    return_tensors="pt",
    padding=True,
    truncation=False,
)
inputs


{'input_ids': tensor([[128000,   3947,    374,  ...,   1522,   1401,    374],
        [128000,   3947,    374,  ...,   1522,   1401,    374]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [9]:
# Greedy-decode up to 8 new tokens for every tokenized prompt, in mini-batches,
# and collect the decoded continuations (prompt stripped) in `generated`.
import torch
from tqdm import tqdm

generated = []
batch_size = 8

# Take the loop bound from the tokenized tensors themselves so it always
# matches what is being sliced, whatever object the prompts came from.
num_prompts = inputs["input_ids"].shape[0]

for i in tqdm(range(0, num_prompts, batch_size)):
    batch = {k: v[i : i + batch_size].to(model.device)
              for k, v in inputs.items()}
    # Width of the (padded) prompt; everything after it in `out` is newly generated.
    prompt_len = batch["input_ids"].shape[-1]
    with torch.no_grad():
        out = model.generate(
            **batch,
            max_new_tokens=8,
            num_return_sequences=1,
            do_sample=False,
            # Passing the pad id explicitly avoids the per-batch
            # "Setting `pad_token_id` to `eos_token_id`" message.
            pad_token_id=tokenizer.pad_token_id,
        )
    generated.extend(tokenizer.batch_decode(out[:, prompt_len:], skip_special_tokens=True))
    

  0%|          | 0/1 [00:00<?, ?it/s]