In [None]:
import torch
from datasets import load_dataset, Dataset
from transformers import ( 
    AutoTokenizer, 
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    )
from accelerate import init_empty_weights, infer_auto_device_map
from peft import LoraConfig, PeftModel, AutoPeftModelForCausalLM
from trl import SFTTrainer
import os
# Set environment variables to use GPU 1
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import requests



def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	
output = query({
	"inputs": "Can you please let us know more details about your ",
})

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from accelerate import init_empty_weights, infer_auto_device_map

def get_model_and_tokenizer(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=True
    )
    
    with init_empty_weights():
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map=None,
            low_cpu_mem_usage=True
        )
    
    device_map = infer_auto_device_map(model, max_memory={0: "16GiB", "cpu": "32GiB"})
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map=device_map,
        low_cpu_mem_usage=True
    )
    
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    model.gradient_checkpointing_enable()
    return model, tokenizer

In [4]:
model_id = "meta-llama/Meta-Llama-3-8B"
model, tokenizer = get_model_and_tokenizer(model_id)
model.to("cuda:0")

Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.91s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.89s/it]
You shouldn't move a model that is dispatched using accelerate hooks.


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps

In [5]:
glue_dataset = load_dataset("glue", "sst2")
print(glue_dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})


In [6]:
def preprocess_function(examples):
    if "sentence2" in examples:
        inputs = examples["sentence1"] + " [SEP] " + examples["sentence2"]
    else:
        inputs = examples["sentence"]
    labels = examples["label"]
    return {"input_text": inputs, "labels": labels}

tokenized_dataset = glue_dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.map(
    lambda examples: tokenizer(examples["input_text"], truncation=True, padding="max_length"),
    batched=True,
)

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 1821/1821 [00:00<00:00, 13876.64 examples/s]


In [7]:
# LoRA configuration
peft_config = LoraConfig(
    r=8,  # Low-rank adaptation size
    lora_alpha=16, 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./llama3-glue",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=32,
    num_train_epochs=3,
    logging_steps=10,
    learning_rate=2e-4,
    evaluation_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    fp16=True
)



In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_args
)

trainer.train()



OutOfMemoryError: CUDA out of memory. Tried to allocate 1.96 GiB. GPU 0 has a total capacity of 11.90 GiB of which 1022.88 MiB is free. Including non-PyTorch memory, this process has 10.89 GiB memory in use. Of the allocated memory 10.62 GiB is allocated by PyTorch, and 117.26 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

: 

In [None]:
results = trainer.evaluate()
print(results)

In [None]:
trainer.save_model("./llama3-glue-finetuned")
tokenizer.save_pretrained("./llama3-glue-finetuned")