In [2]:
import json
import argparse
import torch
import transformers
from typing import Dict
from collections import defaultdict
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq, EvalPrediction
from datasets import load_dataset, concatenate_datasets, Dataset, load_from_disk
from peft import get_peft_model, LoraConfig, TaskType, PeftModel,PeftConfig
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers.trainer_pt_utils import LabelSmoother

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model_id= "base/qwen/Qwen2-0_5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16, load_in_8bit=True, trust_remote_code=True)
model

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear8bitLt(in_features=896, out_features=896, bias=True)
          (k_proj): Linear8bitLt(in_features=896, out_features=128, bias=True)
          (v_proj): Linear8bitLt(in_features=896, out_features=128, bias=True)
          (o_proj): Linear8bitLt(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear8bitLt(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear8bitLt(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear8bitLt(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2R

In [None]:
data = load_from_disk("outputs/final/Qwen2-0_5B-instruct-lora/eval_data")
# data = concatenate_datasets([d for key, d in data.items() if isinstance(d, Dataset)])

In [None]:

IGNORE_TOKEN_ID = LabelSmoother.ignore_index
def preprocess(
    messages,
    tokenizer: transformers.PreTrainedTokenizer,
    max_len: int,
) -> Dict:
    """Preprocesses the data for supervised fine-tuning."""

    texts = []
    for i, msg in enumerate(messages):
        texts.append(
            tokenizer.apply_chat_template(
                msg,
                tokenize=True,
                add_generation_prompt=False,
                padding="max_length",
                max_length=max_len,
                truncation=True,
            )
        )
    input_ids = torch.tensor(texts, dtype=torch.int)
    target_ids = input_ids.clone()
    target_ids[target_ids == tokenizer.pad_token_id] = IGNORE_TOKEN_ID
    print(tokenizer.decode(target_ids))
    attention_mask = input_ids.ne(tokenizer.pad_token_id)

    return Dataset.from_dict({
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": target_ids
    })


In [None]:
messages = [[{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Tell me something about large language models."}, {"role": "assistant", "content": "Large language models are a type of language model that is trained on a large corpus of text data. They are capable of generating human-like text and are used in a variety of natural language processing tasks..."}]]
d = preprocess(messages,tokenizer,384)

In [None]:
from datasets import load_dataset, load_metric

d = preprocess(data["message"],tokenizer,584)
# Evaluate on test set
predictions = []
references = data["output"]

output = model.generate(d.to(model.device), max_length=584, num_beams=4, early_stopping=True)
prediction = tokenizer.decode(output[0], skip_special_tokens=True)
predictions.append(prediction)

# Calculate BLEU and ROUGE
rouge = load_metric("rouge")
bleu = load_metric("sacrebleu")

result_rouge = rouge.compute(predictions=predictions, references=references)
result_bleu = bleu.compute(predictions=predictions, references=[[ref] for ref in references])

print("ROUGE:", result_rouge)
print("BLEU:", result_bleu["score"])


In [None]:
prompt = "Give me a short introduction to large language model."
device = "cuda" # the device to load the model onto
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)