In [1]:
# Install/upgrade the fine-tuning stack quietly (-qqq, progress bar off).
# NOTE(review): -U with no version pins pulls whatever is newest at run time,
# so a later re-run may resolve different library versions — consider pinning.
!pip install -qqq -U transformers datasets accelerate peft trl bitsandbytes deepspeed --progress-bar off

In [2]:
import gc
import os
import json
from kaggle_secrets import UserSecretsClient

In [3]:
# Get keys from Secrets
# The Hugging Face token is read from Kaggle's secret store rather than being
# hard-coded; later cells pass it as `token=` when downloading the tokenizer/model.
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

In [4]:
# Parse the DeepSpeed ZeRO-3 JSON config and expose it under the "zero3" key
with open('/kaggle/input/ds-config/ds_config_zero3.json') as config_file:
    ds_config = json.loads(config_file.read())
ds_config_dict = {"zero3": ds_config}

In [5]:
# Display the parsed ZeRO-3 configuration that main() hands to DeepSpeedPlugin
ds_config_dict["zero3"]

{'fp16': {'enabled': 'auto',
  'loss_scale': 0,
  'loss_scale_window': 1000,
  'initial_scale_power': 16,
  'hysteresis': 2,
  'min_loss_scale': 1},
 'bf16': {'enabled': 'auto'},
 'optimizer': {'type': 'AdamW',
  'params': {'lr': 'auto',
   'weight_decay': 'auto',
   'torch_adam': True,
   'adam_w_mode': True}},
 'scheduler': {'type': 'WarmupLR',
  'params': {'warmup_min_lr': 'auto',
   'warmup_max_lr': 'auto',
   'warmup_num_steps': 'auto'}},
 'zero_optimization': {'stage': 3,
  'offload_optimizer': {'device': 'cpu', 'pin_memory': True},
  'offload_param': {'device': 'cpu', 'pin_memory': True},
  'overlap_comm': True,
  'contiguous_gradients': True,
  'sub_group_size': 1000000000.0,
  'reduce_bucket_size': 'auto',
  'stage3_prefetch_bucket_size': 'auto',
  'stage3_param_persistence_threshold': 'auto',
  'stage3_max_live_parameters': 1000000000.0,
  'stage3_max_reuse_distance': 1000000000.0,
  'stage3_gather_16bit_weights_on_model_save': 'auto'},
 'gradient_accumulation_steps': 1,
 'gr

In [None]:
# One-time Accelerate setup: write a basic config file, then kill the kernel
# process so the notebook restarts.
# NOTE(review): os._exit() terminates the process immediately, so execution
# cannot continue past this cell in the same kernel session; the remaining
# cells have to be run after the restart.
import os
from accelerate.utils import write_basic_config

write_basic_config()  # Write a config file
os._exit(00)  # Restart the notebook

In [18]:
# Imported here so this cell runs on a fresh kernel: in notebook order it sits
# above the cell that imports torch.
import os

import torch

# Use the GPU when one is visible, otherwise fall back to the CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(f"Device: {DEVICE}")
print(f"CUDA Version: {torch.version.cuda}")
print(f"Pytorch {torch.__version__}")

# The CPU count does not depend on CUDA, so report it unconditionally
print('Num CPUs:', os.cpu_count())

# Check the type and quantity of GPUs
if torch.cuda.is_available():
    print('Num GPUs:', torch.cuda.device_count())
    print('GPU Type:', torch.cuda.get_device_name(0))


Device: cuda
CUDA Version: 12.1
Pytorch 2.1.2
Num CPUs: 4
Num GPUs: 2
GPU Type: Tesla T4


### Llama_3_8b

In [11]:
# Model
# Hub id of the base checkpoint; the merge section reloads this model in fp16
# before applying the fine-tuned adapter.
base_model = "meta-llama/Meta-Llama-3-8B"

In [None]:
# NOTE(review): this cell needs `load_dataset` and a `tokenizer` to already
# exist in the kernel; neither is defined above this cell in the visible
# notebook, and main() repeats the same preparation — confirm it is still needed.
dataset_name = "mlabonne/orpo-dpo-mix-40k"
dataset = load_dataset(dataset_name, split="all")
dataset = dataset.shuffle(seed=42).select(range(100))  # Only use 100 samples for quick demo


def format_chat_template(row):
    """Render the "chosen" and "rejected" fields of one row as chat-formatted text.

    Both fields are passed through ``tokenizer.apply_chat_template`` with
    ``tokenize=False`` and written back in place (presumably each field holds
    a list of chat messages — verify against the dataset card).

    Args:
        row: one dataset example exposing "chosen" and "rejected".

    Returns:
        The same row with both fields replaced by the rendered strings.
    """
    row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)
    row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
    return row


dataset = dataset.map(
    format_chat_template,
    num_proc=os.cpu_count(),
)
# Fixed seed so the 99/1 train/test partition is the same on every run
dataset = dataset.train_test_split(test_size=0.01, seed=42)

In [6]:
from accelerate import notebook_launcher
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
)

In [7]:

def main(num_samples: int = 900, seed: int = 42) -> None:
    """Fine-tune Llama-3-8B with ORPO and LoRA adapters on a 4-bit base model.

    Intended to be handed to ``accelerate.notebook_launcher`` so it runs once
    per worker process. The function builds a DeepSpeed ZeRO-3 ``Accelerator``,
    loads the quantized base model and tokenizer, prepares a subset of the
    ``mlabonne/orpo-dpo-mix-40k`` preference dataset, trains with
    ``ORPOTrainer`` and saves the result to ``Llama-3-8B_FT_ORPO_9k``.

    It reads these notebook-level names: ``ds_config_dict``, ``HF_TOKEN``,
    ``os``, ``torch``, ``AutoTokenizer``, ``AutoModelForCausalLM`` and
    ``load_dataset``.

    Args:
        num_samples: number of rows kept from the shuffled dataset
            (default 900, as in the original run).
        seed: seed for the dataset shuffle and for the train/test split, so
            the split is reproducible and identical in every worker process.
    """
    # Imports stay inside the function, as in the original notebook.
    from transformers import BitsAndBytesConfig
    from trl import ORPOConfig, ORPOTrainer, setup_chat_format
    from peft import LoraConfig, prepare_model_for_kbit_training
    from accelerate import Accelerator, DeepSpeedPlugin

    # ---- DeepSpeed ZeRO-3 / Accelerate setup ----
    deepspeed_plugin = DeepSpeedPlugin(
        hf_ds_config=ds_config_dict["zero3"],
        gradient_accumulation_steps=4,
        gradient_clipping=1.0,
        zero_stage=3,
        offload_optimizer_device=None,
        offload_param_device=None,
        zero3_save_16bit_model=True,
        zero3_init_flag=True,
    )

    # Concrete values passed to deepspeed_config_process; presumably these
    # resolve the "auto" entries of the JSON config — TODO confirm.
    ds_overrides = {
        "fp16.enabled": True,
        "fp16.auto_cast": False,
        "bf16.enabled": False,
        "optimizer.params.lr": 8e-6,
        "optimizer.params.weight_decay": 0.0,
        "scheduler.params.warmup_min_lr": 0.0,
        "scheduler.params.warmup_max_lr": 5e-5,
        "scheduler.params.warmup_num_steps": 0,
        "train_micro_batch_size_per_gpu": 1,
        "gradient_clipping": 1.0,
        "train_batch_size": 1,
        "zero_optimization.reduce_bucket_size": 5e5,
        "zero_optimization.stage3_prefetch_bucket_size": 5e5,
        "zero_optimization.stage3_param_persistence_threshold": 5e5,
    }
    deepspeed_plugin.deepspeed_config_process(**ds_overrides)

    accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin, mixed_precision="fp16")

    # Only used by the diagnostic print below; the model load is not given a device_map.
    device_map = {"": accelerator.process_index}

    # ---- Quantization and adapter configuration ----
    # QLoRA config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_storage=torch.float16,
    )

    # LoRA config
    peft_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'],
    )

    base_model = "meta-llama/Meta-Llama-3-8B"
    new_model = "Llama-3-8B_FT_ORPO_9k"

    # ---- Tokenizer and model ----
    tokenizer = AutoTokenizer.from_pretrained(base_model, token=HF_TOKEN)

    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        quantization_config=bnb_config,
        token=HF_TOKEN,
        attn_implementation="eager",
        torch_dtype=torch.float16,
    )

    model, tokenizer = setup_chat_format(model, tokenizer)
    model = prepare_model_for_kbit_training(model)

    # ---- Dataset ----
    dataset_name = "mlabonne/orpo-dpo-mix-40k"
    dataset = load_dataset(dataset_name, split="all")
    # Keep a small shuffled subset so the run finishes in reasonable time
    dataset = dataset.shuffle(seed=seed).select(range(num_samples))

    def format_chat_template(row):
        """Render the "chosen"/"rejected" fields of one row as chat-formatted text."""
        row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)
        row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
        return row

    dataset = dataset.map(
        format_chat_template,
        num_proc=os.cpu_count(),
    )
    # Seeded so every worker process (and every re-run) gets the same split
    dataset = dataset.train_test_split(test_size=0.01, seed=seed)

    # ---- Training ----
    orpo_args = ORPOConfig(
        learning_rate=8e-6,
        lr_scheduler_type="linear",
        max_length=1024,
        max_prompt_length=512,
        beta=0.1,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        num_train_epochs=1,
        # NOTE(review): the original kept `eval_strategy="steps"` as a
        # commented-out alternative — confirm which name the installed
        # transformers release expects.
        evaluation_strategy="steps",
        eval_steps=0.2,  # float below 1: a fraction of the total training steps
        logging_steps=1,
        warmup_steps=10,
        report_to="none",
        output_dir="./results/",
        remove_unused_columns=False,
        ddp_find_unused_parameters=False,
        gradient_checkpointing=True,
        # The original note says this must be False for DDP; it is True here.
        gradient_checkpointing_kwargs={"use_reentrant": True},
    )

    trainer = ORPOTrainer(
        model=model,
        args=orpo_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        peft_config=peft_config,
        tokenizer=tokenizer,
    )

    # Per-process diagnostics
    print(device_map)
    print(f'n_gpu: {orpo_args.n_gpu}; Mode: {orpo_args.parallel_mode}')
    print(f'Num Processes: {accelerator.num_processes}; Device: {accelerator.device}; Process Index: {accelerator.process_index}')
    print(f'Accel Type: {accelerator.distributed_type}')

    trainer.train()
    trainer.save_model(new_model)
    

In [8]:
%%time

# Run main() in 2 worker processes via Accelerate's notebook launcher;
# %%time reports the wall-clock cost of the whole training run.
notebook_launcher(main, num_processes=2)

Launching training on 2 GPUs.


2024-05-17 20:07:11.165676: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-17 20:07:11.165676: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-17 20:07:11.165739: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-17 20:07:11.165794: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-17 20:07:11.322840: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory

[2024-05-17 20:07:22,860] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-05-17 20:07:22,860] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)





/opt/conda/compiler_compat/ld: cannot find -laio: /No optsuch/ condafile/ compiler_compator/ lddirectory:
 cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
collect2: error: ld returned 1 exit status


[2024-05-17 20:07:23,445] [INFO] [comm.py:637:init_distributed] cdb=None
[2024-05-17 20:07:23,446] [INFO] [comm.py:637:init_distributed] cdb=None
[2024-05-17 20:07:23,447] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.92k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/44245 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/900 [00:00<?, ? examples/s]



Map:   0%|          | 0/891 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

{'': 0}
n_gpu: 1; Mode: ParallelMode.DISTRIBUTED
Num Processes: 2; Device: cuda:0; Process Index: 0
Accel Type: DEEPSPEED
{'': 1}
n_gpu: 1; Mode: ParallelMode.DISTRIBUTED
Num Processes: 2; Device: cuda:1; Process Index: 1
Accel Type: DEEPSPEED


Could not estimate the number of tokens of the input, floating-point operations will not be computed
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Runtime,Samples Per Second,Steps Per Second,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen,Nll Loss,Log Odds Ratio,Log Odds Chosen
23,3.5213,4.612779,32.9037,0.274,0.152,-0.365334,-0.322751,0.4,-0.042582,-3.227514,-3.653337,-1.626554,-1.480288,4.911569,-0.983836,-0.455553
46,3.0355,3.803152,32.8194,0.274,0.152,-0.309846,-0.27018,0.4,-0.039667,-2.701799,-3.098464,-1.863598,-1.721089,3.758012,-0.98111,-0.448183
69,2.7509,3.280714,33.0137,0.273,0.151,-0.27584,-0.240344,0.2,-0.035496,-2.403441,-2.758399,-2.068769,-1.894428,2.879775,-0.97635,-0.434299
92,2.9651,2.717235,33.026,0.273,0.151,-0.233753,-0.208966,0.6,-0.024787,-2.089662,-2.337535,-1.88988,-1.685683,2.470328,-0.900982,-0.305739


Step,Training Loss,Validation Loss,Runtime,Samples Per Second,Steps Per Second,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen,Nll Loss,Log Odds Ratio,Log Odds Chosen
23,3.5213,4.612779,32.9074,0.273,0.152,-0.433638,-0.425617,0.6,-0.008021,-4.256167,-4.336378,-1.928928,-1.985342,4.818929,-0.76259,-0.077687
46,3.0355,3.803152,32.8229,0.274,0.152,-0.376735,-0.366885,0.6,-0.00985,-3.668853,-3.76735,-2.447012,-2.318133,4.089953,-0.776574,-0.095637
69,2.7509,3.280714,33.0155,0.273,0.151,-0.369881,-0.349238,0.4,-0.020643,-3.49238,-3.698811,-2.561463,-2.327574,3.819503,-0.857361,-0.199253
92,2.9651,2.717235,33.0292,0.272,0.151,-0.299139,-0.290305,0.6,-0.008834,-2.903047,-2.991386,-2.227511,-1.786906,3.145942,-0.773504,-0.077684



Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3-8B.


CPU times: user 1.92 s, sys: 822 ms, total: 2.75 s
Wall time: 2h 27min 33s


### Merge Adapter with Base model

In [None]:
# Flush memory
# `trainer` and `model` are created inside main(), which runs in worker
# processes, so they may not exist in this kernel. Drop them only if present
# instead of failing with a NameError.
for stale_name in ("trainer", "model"):
    globals().pop(stale_name, None)
gc.collect()
torch.cuda.empty_cache()


In [12]:
# Reload the tokenizer and an fp16 copy of the base model, then apply the same
# chat-format setup that main() applies before training.

from trl import setup_chat_format

tokenizer = AutoTokenizer.from_pretrained(base_model, token=HF_TOKEN)

# Loading options for the half-precision base model
fp16_load_kwargs = dict(
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
    token=HF_TOKEN,
)
fp16_model = AutoModelForCausalLM.from_pretrained(base_model, **fp16_load_kwargs)

fp16_model, tokenizer = setup_chat_format(fp16_model, tokenizer)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [13]:
# Attach the fine-tuned adapter to the fp16 base model and fold it into the weights
from peft import PeftModel

# Directory the adapter is loaded from
new_model = '/kaggle/working/Llama-3-8B_FT_ORPO_9k'

# Load the adapter on top of the base model, then merge and drop the PEFT wrapper
model = PeftModel.from_pretrained(fp16_model, new_model).merge_and_unload()

In [14]:
# Display the merged model's module tree
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128258, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head)

### Inference with Fine-tuned model

In [20]:
%%time
question = 'What is the basic structure of a SQL query to join to tables on a field like ID'
# question = 'When is labor day celebrated in USA'

# Tokenize the prompt
inputs = tokenizer(question, return_tensors="pt").to(DEVICE)
# Generate the outputs from prompt
generate_ids = model.generate(**inputs, max_new_tokens=256)
# Decode the generated output
generated_text = tokenizer.batch_decode(generate_ids,
                                    skip_special_tokens=True,
                                    clean_up_tokenization_spaces=False)[0]

print('generated_text: ', generated_text)

generated_text:  What is the basic structure of a SQL query to join to tables on a field like ID?
I have two tables, let's call them A and B.
A has a column called ID.
B has a column called ID.
I want to join these two tables on the ID field. How do I do this?
The basic syntax is:
SELECT  * FROM  A
INNER JOIN  B  ON  A.ID = B.ID
This will return all rows from A and B where the ID field is the same in both tables.
If you want to return only rows where the ID field is the same in both tables, you can use INNER JOIN instead of JOIN .
To return only rows where the ID field is different, you can use LEFT JOIN instead of JOIN .
If you want to return all rows from A and only rows from B where the ID field is the same in both tables, you can use RIGHT JOIN instead of JOIN .
To return only rows where the ID field is different, you can use LEFT JOIN instead of JOIN .
To return only rows where the ID field is different, you can use RIGHT JOIN instead of JOIN .
To return only rows where the ID fie