# GPU Memory limits

## Goal

Find the maximum sequence length that can be used for training or inference.

## Code

- https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct
- https://huggingface.co/docs/transformers/en/main_classes/text_generation

In [None]:
# Notebook bootstrap: configure logging and choose the GPU this process may use.
import os
from arc25.utils import get_least_used_gpu_index
from arc25.logging import configure_logging, log_execution_time

configure_logging()
# Expose only the GPU index returned by get_least_used_gpu_index() to CUDA.
# This cell runs before the cell that imports torch, presumably so the variable
# is already set when CUDA is initialised -- TODO confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = str(get_least_used_gpu_index())

In [None]:
import logging
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from datasets import Dataset
import torch
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

import sys
# Make ../scripts importable; the relative path is resolved against the current
# working directory, so the notebook must be started from its own folder.
sys.path.append(os.path.realpath("../scripts"))
# NOTE(review): this import rebinds prepare_model_for_kbit_training, replacing the
# peft function imported above with the one defined in finetuning. Every later
# call in this notebook uses the finetuning version -- confirm that is intended.
from finetuning import get_data_collator, prepare_model_for_kbit_training

In [None]:
# Draw and immediately discard an empty figure before changing the defaults;
# presumably this makes the rcParams below stick in the notebook -- TODO confirm.
plt.plot()
plt.close('all')

# Notebook-wide plot defaults: wide figures, thick lines, larger font.
mpl.rcParams.update({
    "figure.figsize": (20, 5),
    "lines.linewidth": 3,
    "font.size": 16,
})

In [None]:
def generate_text_with_desired_length(model, tokenizer, sequence_length):
    """Generate exactly ``sequence_length`` new tokens and return them as text.

    A fixed essay request is rendered with the tokenizer's chat template and
    passed to ``model.generate`` with ``min_new_tokens`` and ``max_new_tokens``
    both set to ``sequence_length``. The achieved tokens/sec is logged at INFO
    level.

    Args:
        model: Object exposing ``device`` and ``generate(**inputs, ...)``.
        tokenizer: Object exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        sequence_length: Number of new tokens to generate.

    Returns:
        The decoded completion, without the prompt and without special tokens.

    Raises:
        AssertionError: If the number of generated tokens is not
            ``sequence_length``.
    """
    conversation = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": "Write a long essay about the impact of artificial intelligence on modern society."},
    ]
    chat_text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([chat_text], return_tensors="pt").to(model.device)

    started_at = time.time()
    output_ids = model.generate(
        **model_inputs,
        max_new_tokens=sequence_length,
        min_new_tokens=sequence_length,
    )
    elapsed = time.time() - started_at
    logging.info(f"Generation speed {sequence_length/ elapsed:.2f} tokens/sec for {sequence_length} tokens")

    # generate() returns prompt + completion; keep only the completion part.
    prompt_token_count = len(model_inputs.input_ids[0])
    completion_ids = output_ids[:, prompt_token_count:]
    n_generated = len(completion_ids[0])
    assert n_generated == sequence_length, f"Generated sequence length {n_generated} does not match the desired length {sequence_length}"

    decoded = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)
    return decoded[0]

In [None]:
def generate_text_with_long_prompt(model, tokenizer, input_length, sequence_length=128):
    """Generate ``sequence_length`` tokens after a prompt of about ``input_length`` tokens.

    The prompt is one fixed sentence repeated as many times as fits in
    ``input_length`` tokens, and at least once so the prompt is never empty.
    The measured tokens/sec covers the whole ``generate`` call, so it includes
    processing the prompt, and is logged at INFO level.

    Args:
        model: Object exposing ``device`` and ``generate(**inputs, ...)``.
        tokenizer: Object exposing ``__call__`` and ``batch_decode``.
        input_length: Target prompt length in tokens. The real length is logged
            next to it because the repeated text may tokenize differently.
        sequence_length: Number of new tokens to generate.

    Returns:
        The decoded completion, without the prompt and without special tokens.

    Raises:
        AssertionError: If the number of generated tokens is not
            ``sequence_length``.
    """
    prompt = "Write a long essay about the impact of artificial intelligence on modern society. "
    tokens_per_copy = len(tokenizer([prompt], return_tensors="pt").input_ids[0])

    # Without the lower bound, an input_length shorter than one copy of the
    # sentence would give zero repeats and therefore an empty prompt.
    n_repeats = max(1, input_length // tokens_per_copy)
    model_inputs = tokenizer([prompt * n_repeats], return_tensors="pt").to(model.device)
    real_input_length = len(model_inputs.input_ids[0])

    # perf_counter() is monotonic, unlike time.time(), so it suits interval timing.
    started_at = time.perf_counter()
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=sequence_length,
        min_new_tokens=sequence_length,
    )
    elapsed = time.perf_counter() - started_at
    logging.info(f"Generation speed {sequence_length/ elapsed:.2f} tokens/sec for {real_input_length} input tokens ({input_length})")

    # generate() returns prompt + completion; keep only the completion part.
    generated_ids = generated_ids[:, real_input_length:]
    n_generated = len(generated_ids[0])
    assert n_generated == sequence_length, f"Generated sequence length {n_generated} does not match the desired length {sequence_length}"
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [None]:
def train_on_prompt_with_desired_length(model, tokenizer, extra_training_arguments=None, sequence_length=2048):
    """Fine-tune ``model`` on one synthetic chat sample of about ``sequence_length`` tokens.

    A short system/user/assistant exchange is repeated until the rendered chat
    prompt reaches roughly ``sequence_length`` tokens. The model is then trained
    for 5 epochs on that single sample with batch size 1, and throughput and
    peak GPU memory are measured. The tokenized prompt length is printed.

    Args:
        model: Model passed to ``SFTTrainer``.
        tokenizer: Tokenizer used to render and encode the prompt.
        extra_training_arguments: Optional dict of additional ``SFTConfig``
            keyword arguments (for example ``use_liger_kernel`` or
            ``gradient_checkpointing``). ``None`` means no extra arguments.
        sequence_length: Target sample length in tokens; also passed as
            ``max_seq_length``.

    Returns:
        On success, a dict with keys ``training_speed`` (samples/sec),
        ``sequence_length``, ``gpu_max_memory`` (peak allocated memory on
        device 0 during this call, bytes / 1024**3), ``use_liger_kernel`` and
        ``gradient_checkpointing``. ``False`` if training ran out of CUDA memory.
    """
    # A None default avoids sharing one mutable dict object between calls.
    if extra_training_arguments is None:
        extra_training_arguments = {}

    log_gpu_memory()
    # max_memory_allocated() reports the peak since the last reset (or since
    # program start). Reset it so the value returned below belongs to this run
    # and not to an earlier, possibly larger, experiment in the same process.
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats(0)

    system_message = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
    user_message = "Fate whispers to the warrior, 'You cannot withstand the storm. "
    assistant_message = "The warrior whispers back, 'I am the storm.' "

    def render_chat(n_repeats):
        """Render the exchange with every message repeated ``n_repeats`` times."""
        messages = [
            {"role": "system", "content": system_message * n_repeats},
            {"role": "user", "content": user_message * n_repeats},
            {"role": "assistant", "content": assistant_message * n_repeats},
        ]
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)

    single_copy_tokens = len(tokenizer(render_chat(1))['input_ids'])
    # 18 tokens are subtracted before dividing; presumably they are the chat
    # template overhead that is not repeated -- TODO confirm.
    # At least one repeat is used so the messages are never empty.
    n_repeats = max(1, sequence_length // (single_copy_tokens - 18))

    prompt = render_chat(n_repeats)
    prompt_length = len(tokenizer(prompt)['input_ids'])
    print(prompt_length)
    train_dataset = Dataset.from_dict({'input_ids': tokenizer.encode(prompt, return_tensors='pt')})

    training_arguments = SFTConfig(
        output_dir=None,
        save_strategy='no',
        num_train_epochs=5,
        warmup_ratio=0.1,
        learning_rate=1e-5,
        optim="paged_adamw_8bit",
        max_grad_norm=1.0,
        max_seq_length=sequence_length,

        do_eval=False,
        eval_strategy="no",
        logging_strategy='no',
        report_to='none',

        # parameters added to make the code work with accelerate
        # https://huggingface.co/transformers/v4.9.1/main_classes/trainer.html#trainingarguments
        ddp_find_unused_parameters=False, # only used with accelerate, got a warning saying that it slows down if True

        ignore_data_skip=True, # otherwise it takes too long to start training when resuming from checkpoint

        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        **extra_training_arguments,
    )

    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=train_dataset,
        data_collator=get_data_collator(tokenizer),
        args=training_arguments,
    )

    # perf_counter() is monotonic, unlike time.time(), so it suits interval timing.
    started_at = time.perf_counter()
    try:
        trainer.train()
    except torch.cuda.OutOfMemoryError as e:
        logging.error(f"Out of memory error: {e}")
        logging.error(f"Failed to train on sequence length {sequence_length} with input length {prompt_length}")
        log_gpu_memory()
        return False
    elapsed = time.perf_counter() - started_at

    # Computed once so the logged and the returned speed are the same number.
    training_speed = len(train_dataset) * training_arguments.num_train_epochs / elapsed
    logging.info(f"Training speed {training_speed:.2f} samples/sec for {len(train_dataset)} samples with {sequence_length} tokens ({prompt_length})")
    log_gpu_memory()
    return dict(
        training_speed=training_speed,
        sequence_length=sequence_length,
        gpu_max_memory=torch.cuda.max_memory_allocated(0)/1024**3,
        use_liger_kernel=extra_training_arguments.get('use_liger_kernel', False),
        gradient_checkpointing=extra_training_arguments.get('gradient_checkpointing', False),
    )

def log_gpu_memory():
    """Log current and peak allocated memory for every visible CUDA device.

    Values are reported by ``torch.cuda`` in bytes and divided by 1024**3
    before being logged at INFO level with one decimal place.
    """
    bytes_per_gb = 1024**3
    for device in range(torch.cuda.device_count()):
        allocated_gb = torch.cuda.memory_allocated(device) / bytes_per_gb
        peak_gb = torch.cuda.max_memory_allocated(device) / bytes_per_gb
        logging.info(f'GPU {device} memory allocated: {allocated_gb:.1f} GB, max memory allocated: {peak_gb:.1f} GB')

## Inference Experiments

In [None]:
# Model size to load for the inference experiments; change it and re-run the cell.
parameters = '3B'
base_model_path = f'/home/gbarbadillo/models/Qwen2.5-Coder-{parameters}-Instruct'

# Tokenizer and unquantized model are loaded from the same local checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=None,
)

In [None]:
# Stress test: generate 32700 new tokens in a single call. Presumably chosen so
# that prompt + completion stays just under the 32768-token limit reported in
# the logs below -- TODO confirm.
generate_text_with_desired_length(model, tokenizer, sequence_length=32700)

In [None]:
# Sweep the number of generated tokens over powers of two, from 2**3 to 2**14.
for exponent in range(3, 15):
    generate_text_with_desired_length(model, tokenizer, sequence_length=2**exponent)

In [None]:
# Sweep the target prompt length over powers of two, from 2**10 to 2**19,
# generating the default number of new tokens each time.
for exponent in range(10, 20):
    generate_text_with_long_prompt(model, tokenizer, input_length=2**exponent)

```
# 0.5B
1468MiB
2025-06-26 17:43:40,263 - root - INFO - generate_text_with_desired_length - Generation speed 13.90 tokens/sec for 8 tokens
2025-06-26 17:43:40,583 - root - INFO - generate_text_with_desired_length - Generation speed 50.32 tokens/sec for 16 tokens
2025-06-26 17:43:41,201 - root - INFO - generate_text_with_desired_length - Generation speed 51.90 tokens/sec for 32 tokens
2025-06-26 17:43:42,433 - root - INFO - generate_text_with_desired_length - Generation speed 52.05 tokens/sec for 64 tokens
2025-06-26 17:43:44,979 - root - INFO - generate_text_with_desired_length - Generation speed 50.33 tokens/sec for 128 tokens
2025-06-26 17:43:49,982 - root - INFO - generate_text_with_desired_length - Generation speed 51.19 tokens/sec for 256 tokens
2025-06-26 17:44:00,041 - root - INFO - generate_text_with_desired_length - Generation speed 50.91 tokens/sec for 512 tokens
2025-06-26 17:44:20,302 - root - INFO - generate_text_with_desired_length - Generation speed 50.55 tokens/sec for 1024 tokens
2025-06-26 17:44:59,880 - root - INFO - generate_text_with_desired_length - Generation speed 51.75 tokens/sec for 2048 tokens
2025-06-26 17:46:19,066 - root - INFO - generate_text_with_desired_length - Generation speed 51.73 tokens/sec for 4096 tokens
2025-06-26 17:48:58,518 - root - INFO - generate_text_with_desired_length - Generation speed 51.38 tokens/sec for 8192 tokens
2025-06-26 18:18:20,443 - root - INFO - generate_text_with_desired_length - Generation speed 49.80 tokens/sec for 32700 tokens
1942MiB

2025-06-26 18:02:58,654 - root - INFO - generate_text_with_long_prompt - Generation speed 43.89 tokens/sec for 953 input tokens (1024)
2025-06-26 18:03:01,348 - root - INFO - generate_text_with_long_prompt - Generation speed 47.64 tokens/sec for 1905 input tokens (2048)
2025-06-26 18:03:04,092 - root - INFO - generate_text_with_long_prompt - Generation speed 46.84 tokens/sec for 3823 input tokens (4096)
2025-06-26 18:03:06,782 - root - INFO - generate_text_with_long_prompt - Generation speed 47.91 tokens/sec for 7645 input tokens (8192)
2025-06-26 18:03:09,736 - root - INFO - generate_text_with_long_prompt - Generation speed 44.17 tokens/sec for 15289 input tokens (16384)
2025-06-26 18:03:13,410 - root - INFO - generate_text_with_long_prompt - Generation speed 35.73 tokens/sec for 30577 input tokens (32768)
Token indices sequence length is longer than the specified maximum sequence length for this model (61167 > 32768). Running this sequence through the model will result in indexing errors
OOM

# 1.5B
3700MiB
2025-06-26 18:35:18,660 - root - INFO - generate_text_with_desired_length - Generation speed 39.62 tokens/sec for 32700 tokens
4648MiB

# 3B
6882MiB
2025-06-26 19:30:47,143 - root - INFO - generate_text_with_desired_length - Generation speed 27.40 tokens/sec for 32700 tokens
8002MiB

# 7B
14782MiB
2025-06-26 17:49:53,393 - root - INFO - generate_text_with_desired_length - Generation speed 12.97 tokens/sec for 8 tokens
2025-06-26 17:49:53,819 - root - INFO - generate_text_with_desired_length - Generation speed 37.77 tokens/sec for 16 tokens
2025-06-26 17:49:54,590 - root - INFO - generate_text_with_desired_length - Generation speed 41.66 tokens/sec for 32 tokens
2025-06-26 17:49:56,079 - root - INFO - generate_text_with_desired_length - Generation speed 43.03 tokens/sec for 64 tokens
2025-06-26 17:49:59,066 - root - INFO - generate_text_with_desired_length - Generation speed 42.88 tokens/sec for 128 tokens
2025-06-26 17:50:04,982 - root - INFO - generate_text_with_desired_length - Generation speed 43.28 tokens/sec for 256 tokens
2025-06-26 17:50:17,005 - root - INFO - generate_text_with_desired_length - Generation speed 42.59 tokens/sec for 512 tokens
2025-06-26 17:50:40,825 - root - INFO - generate_text_with_desired_length - Generation speed 42.99 tokens/sec for 1024 tokens
2025-06-26 17:51:30,304 - root - INFO - generate_text_with_desired_length - Generation speed 41.39 tokens/sec for 2048 tokens
15094MiB
2025-06-26 17:53:14,419 - root - INFO - generate_text_with_desired_length - Generation speed 39.34 tokens/sec for 4096 tokens
15768MiB
2025-06-26 19:25:33,969 - root - INFO - generate_text_with_desired_length - Generation speed 32.29 tokens/sec for 8000 tokens
16552MiB
2025-06-26 19:20:14,560 - root - INFO - generate_text_with_desired_length - Generation speed 26.03 tokens/sec for 16000 tokens
21088MiB
2025-06-26 18:37:14,045 - root - INFO - generate_text_with_desired_length - Generation speed 18.63 tokens/sec for 32700 tokens
20344MiB

2025-06-26 18:00:57,276 - root - INFO - generate_text_with_long_prompt - Generation speed 35.11 tokens/sec for 953 input tokens (1024)
2025-06-26 18:01:00,889 - root - INFO - generate_text_with_long_prompt - Generation speed 35.50 tokens/sec for 1905 input tokens (2048)
2025-06-26 18:01:05,198 - root - INFO - generate_text_with_long_prompt - Generation speed 29.77 tokens/sec for 3823 input tokens (4096)
2025-06-26 18:01:11,104 - root - INFO - generate_text_with_long_prompt - Generation speed 21.84 tokens/sec for 7645 input tokens (8192)
2025-06-26 18:01:20,953 - root - INFO - generate_text_with_long_prompt - Generation speed 13.04 tokens/sec for 15289 input tokens (16384)
2025-06-26 18:01:39,310 - root - INFO - generate_text_with_long_prompt - Generation speed 7.00 tokens/sec for 30577 input tokens (32768)
Token indices sequence length is longer than the specified maximum sequence length for this model (61167 > 32768). Running this sequence through the model will result in indexing errors
OOM
```

For short sequences, inference speed is almost the same for the 0.5B and 7B models (about 50 vs. 43 tokens/sec), but GPU utilization is much higher with the 7B model.
Both models run out of memory (OOM) when asked to handle more than 32k tokens, which is already beyond the models' maximum sequence length (32768 tokens).

Tricks to speed up inference: https://chatgpt.com/c/685d743d-c194-8012-a34d-8cc17e18c9d0

## Training experiments

### Base model

In [None]:
# Full fine-tuning sweep for one model size: train on growing sequence lengths
# and collect the metrics of every successful run.
parameters = '1.5B'
base_model_path = f'/home/gbarbadillo/models/Qwen2.5-Coder-{parameters}-Instruct'

tokenizer = AutoTokenizer.from_pretrained(base_model_path)
model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=None,
)

extra_training_arguments = {
    'use_liger_kernel': True,
    'gradient_checkpointing': True,
}

metrics = []
for i in range(9, 16):
    ret = train_on_prompt_with_desired_length(
        model,
        tokenizer,
        extra_training_arguments=extra_training_arguments,
        sequence_length=2**i,
    )
    if not ret:
        # A falsy result marks a failed (out-of-memory) run; longer sequences
        # are not attempted.
        break
    ret['parameters'] = parameters
    metrics.append(ret)
print(metrics)

In [None]:
# Recorded results for the 0.5B model; the keys match the dict returned by
# train_on_prompt_with_desired_length plus the 'parameters' tag added by the
# sweep loop. Rows are grouped by (use_liger_kernel, gradient_checkpointing):
# False/False, True/False, False/True, True/True.
# training_speed is samples/sec; gpu_max_memory is bytes / 1024**3.
metrics = [
    {'training_speed': 4.655445421911063, 'sequence_length': 512, 'gpu_max_memory': 2.462963104248047, 'use_liger_kernel': False, 'gradient_checkpointing': False, 'parameters': '0.5B'},
    {'training_speed': 3.9653569772744977, 'sequence_length': 1024, 'gpu_max_memory': 4.045621395111084, 'use_liger_kernel': False, 'gradient_checkpointing': False, 'parameters': '0.5B'},
    {'training_speed': 2.968316333629294, 'sequence_length': 2048, 'gpu_max_memory': 7.171669960021973, 'use_liger_kernel': False, 'gradient_checkpointing': False, 'parameters': '0.5B'},
    {'training_speed': 1.966691887013347, 'sequence_length': 4096, 'gpu_max_memory': 13.438164234161377, 'use_liger_kernel': False, 'gradient_checkpointing': False, 'parameters': '0.5B'},
    {'training_speed': 2.5164281932803156, 'sequence_length': 512, 'gpu_max_memory': 2.151851177215576, 'use_liger_kernel': True, 'gradient_checkpointing': False, 'parameters': '0.5B'},
    {'training_speed': 2.494196685444533, 'sequence_length': 1024, 'gpu_max_memory': 2.2452754974365234, 'use_liger_kernel': True, 'gradient_checkpointing': False, 'parameters': '0.5B'},
    {'training_speed': 2.332637597012394, 'sequence_length': 2048, 'gpu_max_memory': 3.2845101356506348, 'use_liger_kernel': True, 'gradient_checkpointing': False, 'parameters': '0.5B'},
    {'training_speed': 1.861314681655273, 'sequence_length': 4096, 'gpu_max_memory': 5.405813217163086, 'use_liger_kernel': True, 'gradient_checkpointing': False, 'parameters': '0.5B'},
    {'training_speed': 1.192822335023071, 'sequence_length': 8192, 'gpu_max_memory': 9.536457061767578, 'use_liger_kernel': True, 'gradient_checkpointing': False, 'parameters': '0.5B'},
    {'training_speed': 0.5998322364807198, 'sequence_length': 16384, 'gpu_max_memory': 17.890711307525635, 'use_liger_kernel': True, 'gradient_checkpointing': False, 'parameters': '0.5B'},
    {'training_speed': 3.608580192860072, 'sequence_length': 512, 'gpu_max_memory': 2.3958849906921387, 'use_liger_kernel': False, 'gradient_checkpointing': True, 'parameters': '0.5B'},
    {'training_speed': 3.3559291385392607, 'sequence_length': 1024, 'gpu_max_memory': 2.686540126800537, 'use_liger_kernel': False, 'gradient_checkpointing': True, 'parameters': '0.5B'},
    {'training_speed': 2.8218909039434448, 'sequence_length': 2048, 'gpu_max_memory': 4.455604553222656, 'use_liger_kernel': False, 'gradient_checkpointing': True, 'parameters': '0.5B'},
    {'training_speed': 1.8272229339675379, 'sequence_length': 4096, 'gpu_max_memory': 7.92591667175293, 'use_liger_kernel': False, 'gradient_checkpointing': True, 'parameters': '0.5B'},
    {'training_speed': 0.9762823414311411, 'sequence_length': 8192, 'gpu_max_memory': 14.867601871490479, 'use_liger_kernel': False, 'gradient_checkpointing': True, 'parameters': '0.5B'},
    {'training_speed': 2.4116993324550493, 'sequence_length': 512, 'gpu_max_memory': 2.1479249000549316, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '0.5B'},
    {'training_speed': 2.3112110518355933, 'sequence_length': 1024, 'gpu_max_memory': 2.1861486434936523, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '0.5B'},
    {'training_speed': 2.1304113427597624, 'sequence_length': 2048, 'gpu_max_memory': 2.1861486434936523, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '0.5B'},
    {'training_speed': 1.6853356769346826, 'sequence_length': 4096, 'gpu_max_memory': 2.1861486434936523, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '0.5B'},
    {'training_speed': 1.0546785281657463, 'sequence_length': 8192, 'gpu_max_memory': 2.299820899963379, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '0.5B'},
    {'training_speed': 0.5164192806887002, 'sequence_length': 16384, 'gpu_max_memory': 2.7489829063415527, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '0.5B'},
    {'training_speed': 0.19934655001806584, 'sequence_length': 32768, 'gpu_max_memory': 4.195651054382324, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '0.5B'},
]

df = pd.DataFrame(metrics)

# One figure, two panels: training speed on the left, peak GPU memory on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Draw one line per (model size, liger, checkpointing) configuration on both panels.
group_columns = ['parameters', 'use_liger_kernel', 'gradient_checkpointing']
for (param, use_liger, ckpt), group in df.groupby(group_columns):
    label = f"{param}, liger={use_liger}, ckpt={ckpt}"
    grp = group.sort_values('sequence_length')
    for ax, column in ((ax1, 'training_speed'), (ax2, 'gpu_max_memory')):
        ax.plot(grp['sequence_length'], grp[column], marker='o', label=label)

# Both panels share the same formatting apart from the y label and the title.
panels = (
    (ax1, 'Training Speed (steps/s)', 'Training Speed vs. Sequence Length'),
    (ax2, 'GPU Max Memory (GB)', 'GPU Max Memory vs. Sequence Length'),
)
for ax, ylabel, title in panels:
    ax.set_xlabel('Sequence Length')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xscale('log', base=2)
    ax.grid(which='both')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Recorded results with use_liger_kernel=True and gradient_checkpointing=True
# for the 0.5B, 1.5B, 3B and 7B models; same keys as the dict returned by
# train_on_prompt_with_desired_length plus the 'parameters' tag.
# NOTE(review): the 7B rows report less memory than the 3B rows; presumably they
# come from the 4-bit QLoRA cell further down rather than from full fine-tuning
# -- confirm before comparing model sizes.
metrics = [
    {'training_speed': 2.4116993324550493, 'sequence_length': 512, 'gpu_max_memory': 2.1479249000549316, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '0.5B'},
    {'training_speed': 2.3112110518355933, 'sequence_length': 1024, 'gpu_max_memory': 2.1861486434936523, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '0.5B'},
    {'training_speed': 2.1304113427597624, 'sequence_length': 2048, 'gpu_max_memory': 2.1861486434936523, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '0.5B'},
    {'training_speed': 1.6853356769346826, 'sequence_length': 4096, 'gpu_max_memory': 2.1861486434936523, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '0.5B'},
    {'training_speed': 1.0546785281657463, 'sequence_length': 8192, 'gpu_max_memory': 2.299820899963379, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '0.5B'},
    {'training_speed': 0.5164192806887002, 'sequence_length': 16384, 'gpu_max_memory': 2.7489829063415527, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '0.5B'},
    {'training_speed': 0.19934655001806584, 'sequence_length': 32768, 'gpu_max_memory': 4.195651054382324, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '0.5B'},

    {'training_speed': 0.9357929681571886, 'sequence_length': 512, 'gpu_max_memory': 6.255135536193848, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '1.5B'},
    {'training_speed': 1.5455698166270475, 'sequence_length': 1024, 'gpu_max_memory': 6.256848335266113, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '1.5B'},
    {'training_speed': 1.205905206620072, 'sequence_length': 2048, 'gpu_max_memory': 6.271763801574707, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '1.5B'},
    {'training_speed': 0.797322104364142, 'sequence_length': 4096, 'gpu_max_memory': 6.290440082550049, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '1.5B'},
    {'training_speed': 0.46114625260948194, 'sequence_length': 8192, 'gpu_max_memory': 6.542620658874512, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '1.5B'},
    {'training_speed': 0.20690177653984218, 'sequence_length': 16384, 'gpu_max_memory': 7.331274032592773, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '1.5B'},
    {'training_speed': 0.081450650173374, 'sequence_length': 32768, 'gpu_max_memory': 9.025110721588135, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '1.5B'},


    {'training_speed': 0.8638907205129681, 'sequence_length': 512, 'gpu_max_memory': 12.280466079711914, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '3B'}, 
     {'training_speed': 0.8811807386255249, 'sequence_length': 1024, 'gpu_max_memory': 12.284631729125977, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '3B'}, 
     {'training_speed': 0.5706510433354517, 'sequence_length': 2048, 'gpu_max_memory': 12.286478042602539, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '3B'}, 
     {'training_speed': 0.3893175661184382, 'sequence_length': 4096, 'gpu_max_memory': 12.286478042602539, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '3B'}, 
     {'training_speed': 0.23316316250353458, 'sequence_length': 8192, 'gpu_max_memory': 12.496784210205078, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '3B'}, 
     {'training_speed': 0.10437733898784911, 'sequence_length': 16384, 'gpu_max_memory': 13.497037887573242, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '3B'}, 
     {'training_speed': 0.043671263797107014, 'sequence_length': 32768, 'gpu_max_memory': 15.487967491149902, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '3B'},

    {'training_speed': 0.5432105493633811, 'sequence_length': 512, 'gpu_max_memory': 8.299020290374756, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '7B'},
    {'training_speed': 0.36647652800592556, 'sequence_length': 1024, 'gpu_max_memory': 8.36922025680542, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '7B'},
    {'training_speed': 0.19076424496290637, 'sequence_length': 2048, 'gpu_max_memory': 9.071717262268066, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '7B'},
    {'training_speed': 0.09205249217306562, 'sequence_length': 4096, 'gpu_max_memory': 10.451850414276123, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '7B'},
    {'training_speed': 0.040783707644366486, 'sequence_length': 8192, 'gpu_max_memory': 13.209789276123047, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '7B'},
    {'training_speed': 0.01631617233317498, 'sequence_length': 16384, 'gpu_max_memory': 18.84087610244751, 'use_liger_kernel': True, 'gradient_checkpointing': True, 'parameters': '7B'},
]

df = pd.DataFrame(metrics)

# One figure, two panels: training speed on the left, peak GPU memory on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Draw one line per (model size, liger, checkpointing) configuration on both panels.
group_columns = ['parameters', 'use_liger_kernel', 'gradient_checkpointing']
for (param, use_liger, ckpt), group in df.groupby(group_columns):
    label = f"{param}, liger={use_liger}, ckpt={ckpt}"
    grp = group.sort_values('sequence_length')
    for ax, column in ((ax1, 'training_speed'), (ax2, 'gpu_max_memory')):
        ax.plot(grp['sequence_length'], grp[column], marker='o', label=label)

# Panel settings: (axis, y label, title, use a logarithmic y axis).
# Only the speed panel gets a log y axis; both use a log x axis.
panels = (
    (ax1, 'Training Speed (steps/s)', 'Training Speed vs. Sequence Length', True),
    (ax2, 'GPU Max Memory (GB)', 'GPU Max Memory vs. Sequence Length', False),
)
for ax, ylabel, title, log_y in panels:
    ax.set_xlabel('Sequence Length')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xscale('log')
    if log_y:
        ax.set_yscale('log')
    ax.grid(which='both')
    ax.legend()

plt.tight_layout()
plt.show()

### QLora

In [None]:
# QLoRA setup: load the model with a 4-bit quantization config, wrap it with
# LoRA adapters and build the extra training arguments for the sweep below.
parameters = '7B'
base_model_path = f'/home/gbarbadillo/models/Qwen2.5-Coder-{parameters}-Instruct'

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.float16,
    bnb_4bit_use_double_quant= True,
    llm_int8_enable_fp32_cpu_offload= True,
    llm_int8_skip_modules=['gate', 'lm_head'],
)

peft_config = LoraConfig(
    # lora_alpha: LoRA scaling factor.
    lora_alpha=64, #64,
    lora_dropout=0.1, # 0.1, although Vaca suggested to use 0.05 for big models
    # r: the rank of the update matrices, expressed in int. Lower rank results in smaller update matrices with fewer trainable parameters.
    r=32, #16
    bias="none",
    task_type="CAUSAL_LM",
    # target_modules: The modules (for example, attention blocks) to apply the LoRA update matrices.
    target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj'],
    use_rslora=False,
    use_dora=False,
    init_lora_weights=True # bool | Literal['gaussian', 'olora', 'pissa', 'pissa_niter_[number of iters]', 'loftq'] = True,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_path, torch_dtype="auto", device_map="auto", quantization_config=bnb_config)
model = get_peft_model(model, peft_config)


gradient_checkpointing = True
# NOTE(review): the return value is discarded and the call happens after
# get_peft_model; the PEFT documentation prepares the quantized model before
# wrapping it with adapters. The name resolves to the version imported from
# finetuning, whose behaviour is not visible here -- confirm that this order
# leaves the LoRA weights trainable.
prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)
tokenizer = AutoTokenizer.from_pretrained(base_model_path)

# The same flag is used for model preparation and for the trainer arguments.
extra_training_arguments = dict(
    use_liger_kernel=True,
    gradient_checkpointing=gradient_checkpointing,
)


# Train on sequence lengths 2**9 .. 2**15 and collect the metrics of every
# successful run; the sweep stops at the first run that returns a falsy result.
metrics = []
for i in range(9, 16):
    ret = train_on_prompt_with_desired_length(
        model,
        tokenizer,
        extra_training_arguments=extra_training_arguments,
        sequence_length=2**i,
    )
    if not ret:
        break
    ret['parameters'] = parameters
    metrics.append(ret)
print(metrics)

https://docs.google.com/spreadsheets/d/1NmmCZA7gPOyoBypwvpw_JhYdjcvqNFHibX_WahwTHIM/edit?gid=0#gid=0&range=A341

Try with quantization.