In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the local scripts.fine_tuning helpers) are reloaded before each cell
# runs and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
import sagemaker
import sqlite3
from datasets import load_from_disk
from peft import LoraConfig, TaskType
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, default_data_collator, Trainer, TrainingArguments, AutoTokenizer
import torch

from scripts.fine_tuning import find_all_linear_names, create_peft_model

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


# PEFT: QLoRA fine-tuning of Qwen2-1.5B-Instruct

In [2]:
# Open a SageMaker session and look up its default S3 bucket; the bucket name
# is used further down to build the S3 paths of the train/valid datasets.
sess = sagemaker.Session()
sagemaker_session_bucket = sess.default_bucket()

In [3]:
# Hugging Face Hub identifier of the base model; used to load both the
# tokenizer and the model weights below.
model_id = "Qwen/Qwen2-1.5B-Instruct"

In [4]:
# Load the tokenizer that belongs to the base model; it is handed to the
# Trainer later as its processing_class.
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [5]:
# bitsandbytes quantization settings used when loading the base model.
bnb_config = BitsAndBytesConfig(
    # Store the base weights in 4-bit precision ...
    load_in_4bit=True,
    # ... using the NF4 (4-bit NormalFloat) data type ...
    bnb_4bit_quant_type="nf4",
    # ... and additionally quantize the quantization constants themselves.
    bnb_4bit_use_double_quant=True,
    # Computation on the quantized weights is carried out in bfloat16.
    # NOTE(review): this is unconditional, while the bf16 flags further down
    # depend on the GPU's compute capability -- confirm that is intended.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [6]:
# Load the base causal LM with the quantization settings defined above.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    # Let the library decide on which available device(s) to place the weights.
    device_map="auto",
    # Disable the generation KV cache; the model is only trained here, and
    # gradient checkpointing is switched on in later cells.
    use_cache=False,
)

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [7]:
# Ask the project helper for the module names to adapt; the result is passed
# to LoraConfig as target_modules below.
# Presumably these are the names of the model's linear layers -- the helper
# lives in scripts/fine_tuning.py, confirm there.
modules = find_all_linear_names(model)

In [8]:
# LoRA adapter configuration for causal language modelling.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    # Attach adapters to every module name collected by find_all_linear_names.
    target_modules=modules,
    # Adapter rank and scaling numerator.
    r=64,
    lora_alpha=16,
    # Dropout applied inside the adapter layers.
    lora_dropout=0.1,
    # Leave all bias terms untrained.
    bias="none",
)

In [9]:
# Enable gradients on the input embeddings' output. This is what lets
# adapter-only training work together with gradient checkpointing (enabled in
# the following cells) while the base weights stay fixed.
model.enable_input_require_grads()

In [10]:
# Wrap the quantized base model with the LoRA configuration using the project
# helper from scripts/fine_tuning.py, then report how many parameters train.
#
# NOTE: this cell rebinds `model`. Re-running it without first re-running the
# model-loading cell passes an already-wrapped model to the helper again.
model = create_peft_model(
    model=model,
    peft_config=peft_config,
    gradient_checkpointing=True,
    # bfloat16 is available on Ampere (compute capability 8.x) and on every
    # newer architecture (Hopper 9.x and later), so test for ">= 8". An
    # equality check against 8 would switch bf16 off on newer GPUs.
    bf16=torch.cuda.get_device_capability()[0] >= 8,
)
model.print_trainable_parameters()

Found 7 modules to quantize: ['q_proj', 'gate_proj', 'o_proj', 'up_proj', 'v_proj', 'k_proj', 'down_proj']
trainable params: 73,859,072 || all params: 1,617,573,376 || trainable%: 4.5660
trainable params: 73,859,072 || all params: 1,617,573,376 || trainable%: 4.5660


In [11]:
# Hyperparameters for the Hugging Face Trainer.
#
# NOTE(review): no evaluation strategy is configured, so with the Trainer
# defaults the eval_dataset passed to Trainer below is not evaluated during
# training -- confirm whether periodic evaluation is wanted.
training_args = TrainingArguments(
    output_dir="/tmp/qwen2",
    per_device_train_batch_size=3,
    # bfloat16 is available on Ampere (compute capability 8.x) and on every
    # newer architecture (Hopper 9.x and later), so test for ">= 8". An
    # equality check against 8 would switch bf16 off on newer GPUs.
    bf16=torch.cuda.get_device_capability()[0] >= 8,
    learning_rate=2e-4,
    num_train_epochs=3,
    # Trade extra compute for lower activation memory.
    gradient_checkpointing=True,
    # Log the training loss every 10 optimizer steps.
    logging_dir="/tmp/qwen2/logs",
    logging_strategy="steps",
    logging_steps=10,
    # Write no intermediate checkpoints; the model is saved explicitly with
    # save_pretrained() after training.
    save_strategy="no",
    # Fixed seed for reproducible runs.
    seed=3578,
)

In [12]:
# Load the train and validation splits from the session's default S3 bucket.
lm_dataset = load_from_disk(
    f's3://{sagemaker_session_bucket}/text_2_sql/dataset/train'
)
lm_dataset_eval = load_from_disk(
    f's3://{sagemaker_session_bucket}/text_2_sql/dataset/valid'
)

# Assemble the Trainer from the PEFT-wrapped model, the training arguments,
# both dataset splits and the tokenizer; batches are built with the default
# data collator.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    data_collator=default_data_collator,
    train_dataset=lm_dataset,
    eval_dataset=lm_dataset_eval,
)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



In [13]:
# Start training. The call is the last expression in the cell, so the
# returned TrainOutput (global step, average training loss, runtime metrics)
# is displayed as the cell's result.
trainer.train()

Step,Training Loss
10,0.307
20,0.091
30,0.0551
40,0.0487
50,0.0413
60,0.0347
70,0.0343
80,0.03
90,0.0291
100,0.0269


TrainOutput(global_step=132, training_loss=0.05916652851032488, metrics={'train_runtime': 397.8948, 'train_samples_per_second': 0.98, 'train_steps_per_second': 0.332, 'total_flos': 6633527810457600.0, 'train_loss': 0.05916652851032488, 'epoch': 3.0})

In [18]:
# Persist the fine-tuned model to a local directory. Training ran with
# save_strategy="no", so this is the only place the result is written.
# Presumably only the LoRA adapter weights are stored, since `model` is the
# PEFT-wrapped model -- confirm against create_peft_model.
# NOTE(review): the tokenizer is not saved alongside the model here.
model.save_pretrained("./finetuned_llm/qwen2")

In [19]:
# Release cached GPU memory that PyTorch's allocator holds but is not using.
# NOTE(review): `model` and `trainer` are still referenced at this point, so
# the memory they occupy is not returned by this call.
torch.cuda.empty_cache()