In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda" # the device to load the model onto

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen1.5-MoE-A2.7B",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B")

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
INFO:accelerate.utils.modeling:Based on the current allocation process, no modules could be assigned to the following devices due to insufficient memory:
  - 0: 1763451008 bytes required
These minimum requirements are specific to this allocation attempt and may vary. Consider increasing the available memory for these devices to at least the specified minimum, or adjusting the model config.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]



RuntimeError: Expected all tensors to be on the same device, but got index is on cuda:0, different from other tensors on cpu (when checking argument in method wrapper_CUDA__index_select)

In [7]:
response

'A large language model is a type of artificial intelligence that is designed to understand and generate human language. These models are trained on large amounts of text data, which allows them to learn patterns and relationships in language. This makes them useful for a variety of applications, such as natural language processing, chatbots, and language translation. They are also capable of generating text that is coherent and grammatically correct, which makes them useful for tasks such as content generation and summarization.'

In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, prepare_model_for_kbit_training
from trl import SFTTrainer
from peft import IA3Config, get_peft_model, TaskType

model = prepare_model_for_kbit_training(model)

config = LoraConfig(r = 4,
                    lora_alpha=4,
                    target_modules = ["gate", "q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
                    lora_dropout=0.1
                    )

lora_model = get_peft_model(model, config)

lora_model.print_trainable_parameters()

trainable params: 63,706,752 || all params: 14,379,490,944 || trainable%: 0.4430


In [12]:
from datasets import load_dataset

In [13]:
dataset = load_dataset("Na0s/sft-ready-Text-Generation-Augmented-Data", split="train")

README.md:   0%|          | 0.00/344 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/22 [00:00<?, ?files/s]

data/train-00000-of-00022.parquet:   0%|          | 0.00/332M [00:00<?, ?B/s]

data/train-00001-of-00022.parquet:   0%|          | 0.00/323M [00:00<?, ?B/s]

data/train-00002-of-00022.parquet:   0%|          | 0.00/183M [00:00<?, ?B/s]

data/train-00003-of-00022.parquet:   0%|          | 0.00/137M [00:00<?, ?B/s]

data/train-00004-of-00022.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

data/train-00005-of-00022.parquet:   0%|          | 0.00/331M [00:00<?, ?B/s]

data/train-00006-of-00022.parquet:   0%|          | 0.00/255M [00:00<?, ?B/s]

data/train-00007-of-00022.parquet:   0%|          | 0.00/256M [00:00<?, ?B/s]

data/train-00008-of-00022.parquet:   0%|          | 0.00/251M [00:00<?, ?B/s]

data/train-00009-of-00022.parquet:   0%|          | 0.00/316M [00:00<?, ?B/s]

data/train-00010-of-00022.parquet:   0%|          | 0.00/347M [00:00<?, ?B/s]

data/train-00011-of-00022.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

data/train-00012-of-00022.parquet:   0%|          | 0.00/476M [00:00<?, ?B/s]

data/train-00013-of-00022.parquet:   0%|          | 0.00/594M [00:00<?, ?B/s]

data/train-00014-of-00022.parquet:   0%|          | 0.00/252M [00:00<?, ?B/s]

data/train-00015-of-00022.parquet:   0%|          | 0.00/77.0M [00:00<?, ?B/s]

data/train-00016-of-00022.parquet:   0%|          | 0.00/92.4M [00:00<?, ?B/s]

data/train-00017-of-00022.parquet:   0%|          | 0.00/95.4M [00:00<?, ?B/s]

data/train-00018-of-00022.parquet:   0%|          | 0.00/99.7M [00:00<?, ?B/s]

data/train-00019-of-00022.parquet:   0%|          | 0.00/119M [00:00<?, ?B/s]

data/train-00020-of-00022.parquet:   0%|          | 0.00/98.5M [00:00<?, ?B/s]

data/train-00021-of-00022.parquet:   0%|          | 0.00/109M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7667416 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/22 [00:00<?, ?it/s]

In [None]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=2048,
    )

tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    num_proc=8,   # use 8 CPU cores
    remove_columns=["text"],
)

In [15]:
from trl import SFTConfig
import logging
logging.getLogger("trl.trainer.sft_trainer").setLevel(logging.ERROR)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", use_fast=True)

trainer = SFTTrainer(
    model = lora_model,
    train_dataset = dataset,
    processing_class = tokenizer,
    args = SFTConfig(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 16,
        packing = True,
        group_by_length = True,
        warmup_steps = 5,
        bf16 = True,
        max_steps=10000,
        learning_rate = 2e-4,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        eval_strategy="no",
        do_eval=False,
        output_dir = "./outputs",
        push_to_hub=True,
        remove_unused_columns=False,
    )
)

torch.cuda.empty_cache()

trainer.train()

trainer.model.save_pretrained("Qwen1.5_MoE_lora_model")

Tokenizing train dataset:   0%|          | 0/7667416 [00:00<?, ? examples/s]

KeyboardInterrupt: 