In [1]:
from dataclasses import dataclass, field
import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, HfArgumentParser, TrainingArguments
from trl import SFTTrainer, ModelConfig, get_kbit_device_map, get_peft_config, get_quantization_config

In [2]:
import logging
logging.basicConfig(format='%(message)s',
                    level=logging.INFO,
                    force=True)

In [3]:
@dataclass
class ScriptArguments:
    dataset_name: str = field(default="timdettmers/openassistant-guanaco", metadata={"help": "the dataset name"})
    dataset_text_field: str = field(default="text", metadata={"help": "the text field of the dataset"})
    max_seq_length: int = field(default=512, metadata={"help": "The maximum sequence length for SFT Trainer"})

In [8]:
args = ScriptArguments(dataset_name='timdettmers/openassistant-guanaco',
                       dataset_text_field='text',
                       max_seq_length=512)
trainingArgs = TrainingArguments(adafactor=False,
                                 adam_beta1=0.9,
                                 adam_beta2=0.999,
                                 adam_epsilon=1e-08,
                                 auto_find_batch_size=False,
                                 bf16=False,
                                 bf16_full_eval=False,
                                 data_seed=None,
                                 dataloader_drop_last=False,
                                 dataloader_num_workers=0,
                                 dataloader_pin_memory=True,
                                 ddp_backend=None,
                                 ddp_broadcast_buffers=None,
                                 ddp_bucket_cap_mb=None,
                                 ddp_find_unused_parameters=None,
                                 ddp_timeout=1800,
                                 debug=[],
                                 deepspeed=None,
                                 disable_tqdm=False,
                                 dispatch_batches=None,
                                 do_eval=False,
                                 do_predict=False,
                                 do_train=False,
                                 eval_accumulation_steps=None,
                                 eval_delay=0,
                                 eval_steps=None,
                                 fp16=False,
                                 fp16_full_eval=False,
                                 fp16_opt_level=1,
                                 fsdp=[],
                                 fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
                                 fsdp_min_num_params=0,
                                 fsdp_transformer_layer_cls_to_wrap=None,
                                 full_determinism=False,
                                 gradient_accumulation_steps=16,
                                 gradient_checkpointing=True,
                                 greater_is_better=None,
                                 group_by_length=False,
                                 hub_always_push=False,
                                 ignore_data_skip=False,
                                 include_inputs_for_metrics=False,
                                 include_tokens_per_second=False,
                                 jit_mode_eval=False,
                                 label_names=None,
                                 label_smoothing_factor=0.0,
                                 learning_rate=1.41e-05,
                                 load_best_model_at_end=False,
                                 local_rank=0,
                                 log_on_each_node=True,
                                 logging_dir="sft_openassistant-guanaco/runs/Jan30_18-27-15_kali",
                                 logging_first_step=False,
                                 logging_nan_inf_filter=True,
                                 logging_steps=1.0,
                                 max_grad_norm=1.0,
                                 max_steps=-1,
                                 metric_for_best_model=None,
                                 no_cuda=False,
                                 num_train_epochs=1.0,
                                 optim_args=None,
                                 output_dir="sft_openassistant-guanaco",
                                 overwrite_output_dir=False,
                                 past_index=-1,
                                 per_device_eval_batch_size=8,
                                 per_device_train_batch_size=64,
                                 prediction_loss_only=False,
                                 push_to_hub=False,
                                 push_to_hub_model_id=None,
                                 push_to_hub_organization=None,
                                 remove_unused_columns=True,
                                 resume_from_checkpoint=None,
                                 run_name="sft_openassistant-guanaco",
                                 save_on_each_node=False,
                                 save_safetensors=False,
                                 save_steps=500,
                                 save_total_limit=None,
                                 seed=42,
                                 sharded_ddp=[],
                                 skip_memory_metrics=True,
                                 tf32=None,
                                 torch_compile=False,
                                 torch_compile_backend=None,
                                 torch_compile_mode=None,
                                 torchdynamo=None,
                                 tpu_metrics_debug=False,
                                 tpu_num_cores=None,
                                 use_cpu=False,
                                 use_ipex=False,
                                 use_legacy_prediction_loop=False,
                                 use_mps_device=False,
                                 warmup_ratio=0.0,
                                 warmup_steps=0,
                                 weight_decay=0.0,)
model_config = ModelConfig(model_name_or_path='facebook/opt-350m',
                           model_revision='main',
                           torch_dtype=torch.float16,
                           trust_remote_code=False,
                           attn_implementation=None,
                           use_peft=True,
                           lora_r=64,
                           lora_alpha=16,
                           lora_dropout=0.05,
                           lora_target_modules=None,
                           lora_modules_to_save=None,
                           load_in_8bit=False,
                           load_in_4bit=True,
                           bnb_4bit_quant_type='nf4',
                           use_bnb_nested_quant=False)

In [9]:
model_config

ModelConfig(model_name_or_path='facebook/opt-350m', model_revision='main', torch_dtype=torch.float16, trust_remote_code=False, attn_implementation=None, use_peft=True, lora_r=64, lora_alpha=16, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, load_in_8bit=False, load_in_4bit=True, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)

In [10]:
torch_dtype = model_config.torch_dtype
        
logging.info(torch_dtype)

torch.float16


In [11]:
quantization_config = get_quantization_config(model_config)
quantization_config

BitsAndBytesConfig {
  "bnb_4bit_compute_dtype": "float16",
  "bnb_4bit_quant_type": "nf4",
  "bnb_4bit_use_double_quant": false,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}

In [12]:
model_kwargs = dict(
        revision=model_config.model_revision,
        trust_remote_code=model_config.trust_remote_code,
        # attn_implementation=model_config.attn_implementation,
        torch_dtype=torch_dtype,
        use_cache=False if trainingArgs.gradient_checkpointing else True,
        device_map=get_kbit_device_map() if quantization_config is not None else None,
        quantization_config=quantization_config,
    )

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_config.model_name_or_path, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

In [11]:
tokenizer.pad_token

'</s>'

In [14]:
raw_datasets = load_dataset(args.dataset_name)
train_dataset = raw_datasets["train"]
eval_dataset = raw_datasets["test"]

Found cached dataset json (/home/kamal/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
from peft import LoraConfig
# The LoraConfig is generated automatically below
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [15]:
get_peft_config(model_config)

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=64, target_modules=None, lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)

In [15]:
trainer = SFTTrainer(
    model=model_config.model_name_or_path,
    model_init_kwargs=model_kwargs,
    args=trainingArgs,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=args.max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    peft_config=get_peft_config(model_config),
)

Found cached dataset generator (/home/kamal/.cache/huggingface/datasets/generator/default-1fba4001f2c326ea/0.0.0)
Found cached dataset generator (/home/kamal/.cache/huggingface/datasets/generator/default-c0ff235749306824/0.0.0)


In [16]:
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 6.12 GiB (GPU 0; 11.73 GiB total capacity; 9.75 GiB already allocated; 592.00 MiB free; 10.27 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [32]:
torch.cuda.empty_cache()