In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
max_seq_length = 2048
dtype = torch.bfloat16  # Use bfloat16 for better performance on GPUs with Tensor Cores
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
model_id = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"

# 1) Load the meta-provided tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model_kwargs = {"torch_dtype": torch.bfloat16, "device_map": "auto"}

# 2) Tell HF to reserve two new specials
new_specials = ["<|OTHER|>", "<|ME|>", "<|DT_LONG|>", "<|DT_SHORT|>"]
num_added = tokenizer.add_special_tokens({"additional_special_tokens": new_specials})
print(f"Added {num_added} tokens (at IDs {tokenizer.convert_tokens_to_ids(new_specials)})")



Added 4 tokens (at IDs [128256, 128257, 128258, 128259])


In [2]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=dtype,
)

In [3]:
# 3) Load the model with the new special tokens
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config if load_in_4bit else None,
    trust_remote_code=True,
    **model_kwargs,
)
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(128260, 4096, padding_idx=128004)

In [4]:
def get_target_modules(model):
    # Initialize a Set to Store Unique Layers
    unique_layers = set()
    
    # Iterate Over All Named Modules in the Model
    for name, module in model.named_modules():
        # Check if the Module Type Contains 'Linear4bit'
        if any(layer in str(type(module)) for layer in ["Linear4bit", "Linear"]):
            # Extract the Type of the Layer
            layer_type = name.split('.')[-1]
            
            # Add the Layer Type to the Set of Unique Layers
            unique_layers.add(layer_type)

    # Return the Set of Unique Layers Converted to a List
    return list(unique_layers)

In [5]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# 1) Make sure all layers that need gradients are unfrozen, and any necessary buffers
#    (e.g. for layernorm) are adjust-for-training.
model = prepare_model_for_kbit_training(model)

# 2) Define your LoRA config
lora_config = LoraConfig(
    r=16,                        # bottleneck rank
    lora_alpha=32,               # scaling factor
    target_modules=get_target_modules(model),
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# 3) Wrap your model
model = get_peft_model(model, lora_config)


# 4) Print the trainable parameters
model.print_trainable_parameters() #6,815,744 8,037,109,760 0.0848

trainable params: 44,060,736 || all params: 8,074,354,752 || trainable%: 0.5457


In [None]:
# for param in model.parameters():
#     param.requires_grad = False

# embedding_weight = model.get_input_embeddings().weight
# embedding_weight.requires_grad = True

# # Register hook to zero out gradients for all but new token IDs
# def mask_gradients(grad):
#     mask = torch.zeros_like(grad)
#     mask[new_token_ids] = 1.0
#     return grad * mask

# embedding_weight.register_hook(mask_gradients)

<torch.utils.hooks.RemovableHandle at 0x7f85b420a830>

In [6]:
model.print_trainable_parameters() #6,815,744 8,037,109,760 0.0848

trainable params: 44,060,736 || all params: 8,074,354,752 || trainable%: 0.5457


In [8]:
from datasets import load_dataset, DatasetDict

# Load a local JSONL file 
raw_ds = load_dataset("json", data_files="train.jsonl", split="train")

# Hold out 10% for the test set

split1 = raw_ds.train_test_split(test_size=0.1, seed=3407)
# From the remaining 90%, hold out 10% for validation (i.e. 9% of original)
split2 = split1["train"].train_test_split(test_size=0.1, seed=3407)

# Combine the splits into a DatasetDict
dataset_splits = DatasetDict({
    "train": split2["train"],
    "val":   split2["test"],
    "test":  split1["test"]
})

print({k: len(v) for k, v in dataset_splits.items()})

# Make the training split available as `dataset` for SFTTrainer
dataset = dataset_splits["train"]

# (Optional) Save splits to disk for later reuse
dataset_splits.save_to_disk("dataset_splits")

{'train': 26208, 'val': 2913, 'test': 3236}


Saving the dataset (0/1 shards):   0%|          | 0/26208 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2913 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3236 [00:00<?, ? examples/s]

In [None]:
def tokenize_function(example):
    tokens = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",   # or "longest" if you’re not using packing
        return_tensors="pt"
    )
    tokens["labels"] = tokens["input_ids"].clone()  # 💡 critical for CLM
    return tokens

tokenized_dataset = dataset.map(tokenize_function, remove_columns=["text"])


In [None]:
tokenized_dataset = dataset.map(tokenize_function, remove_columns=["text"])


Map:   0%|          | 0/26208 [00:00<?, ? examples/s]

In [16]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        bf16 = True,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        report_to = "wandb", # Use this for WandB etc
    ),
)

TypeError: SFTTrainer.__init__() got an unexpected keyword argument 'tokenizer'

In [None]:
# load & serve via FastAPI instead of CLI
import os
from peft import PeftModel
import torch
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# uvicorn inference:app --host 0.0.0.0 --port 8000 --reload

# —– load config & pick latest checkpoint —–
base_model_id = os.getenv("BASE_MODEL_ID", "unsloth/Llama-3.2-3B-bnb-4bit")
model_dir     = os.getenv("LORA_WEIGHTS",   "outputs/checkpoint-2000")

if os.path.isdir(model_dir):
    subs = sorted(
        d for d in os.listdir(model_dir)
        if d.startswith("checkpoint") and os.path.isdir(os.path.join(model_dir, d))
    )
    if subs:
        model_dir = os.path.join(model_dir, subs[-1])

# —– tokenizer init (load from your fine-tuned folder so the new tokens & embeddings are preserved) —–
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# —– model init (load merged, fine-tuned weights + embeddings in one go) —–
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_cfg,
    trust_remote_code=True,
    device_map="auto",
)

print(base_model.get_input_embeddings().weight.shape)
print(base_model.get_input_embeddings().weight[-10:])
print(base_model.get_input_embeddings().weight[-10:])

base_model.resize_token_embeddings(len(tokenizer), )
# Load the fine-tuned model weights
model = PeftModel.from_pretrained(base_model, model_dir)
print(model.get_input_embeddings().weight.shape)


torch.Size([128256, 3072])
tensor([[-0.0083,  0.0036,  0.0050,  ..., -0.0024, -0.0020, -0.0052],
        [-0.0083,  0.0036,  0.0050,  ..., -0.0024, -0.0020, -0.0052],
        [-0.0083,  0.0036,  0.0050,  ..., -0.0024, -0.0020, -0.0052],
        ...,
        [-0.0083,  0.0036,  0.0050,  ..., -0.0024, -0.0020, -0.0052],
        [-0.0083,  0.0036,  0.0050,  ..., -0.0024, -0.0020, -0.0052],
        [-0.0083,  0.0036,  0.0050,  ..., -0.0024, -0.0020, -0.0052]],
       device='cuda:0', dtype=torch.float16, grad_fn=<SliceBackward0>)


SafetensorError: Error while deserializing header: MetadataIncompleteBuffer

In [9]:
base_model.dtype

torch.float16

In [7]:
from pydantic import BaseModel
class Args(BaseModel):
    model_id: str         = "unsloth/Llama-3.2-3B-bnb-4bit"
    train_file: str       = "train.jsonl"
    output_dir: str       = "outputs"
    batch_size: int       = 1
    epochs: int           = 5
    lr: float             = 2e-4
    seed: int             = 3407
    wandb_project: str    = "MeGPT"
    wandb_name: str       = f"MeGPT-{'timestamp'}"

args = Args()

In [29]:
"""
train.py - Fixed version
"""

import argparse
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from datasets import load_dataset, DatasetDict
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from transformers import Trainer, TrainingArguments
import os
import wandb
from datetime import datetime
import numpy as np
from torch.profiler import (
    profile as torch_profile,
    ProfilerActivity,
    schedule,
    tensorboard_trace_handler,
)
from transformers import TrainerCallback

timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

def get_lora_targets(model):
    unique = set()
    for name, module in model.named_modules():
        if any(key in type(module).__name__ for key in ("Linear4bit","Linear")):
            unique.add(name.split(".")[-1])
    return list(unique)

def prepare_tokenizer(model_id):
    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    specials = ["<|OTHER|>","<|ME|>","<|DT_LONG|>","<|DT_SHORT|>"]
    tok.add_special_tokens({"additional_special_tokens": specials})
    # ensure we have a pad token
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    return tok

# def tokenize_and_build_labels(examples, tokenizer=None):
#     # tokenize the "text" field
#     outputs = tokenizer(
#         examples["text"],
#         truncation=True,
#         padding="max_length",
#         max_length=2048    # or "longest" if you prefer dynamic padding
#     )
#     # for causal-LM, labels == input_ids
#     outputs["labels"] = outputs["input_ids"].copy()
#     return outputs


def resize_embeddings(model, tokenizer, target_weights: torch.Tensor = None):
    """
    Resize the input and output embeddings of the model to match the tokenizer size.
    This function also initializes the new embeddings with a mean and covariance
    based on the existing embeddings.
    """
    # 0) Ensure the model is in evaluation mode
    model.eval()

    # 1) Figure out sizes
    old_vocab = len(model.get_input_embeddings().weight)
    new_vocab = len(tokenizer)
    n_new     = new_vocab - old_vocab
    emb       = model.get_input_embeddings()
    device    = emb.weight.device
    dim       = emb.weight.size(1)
    generator = torch.Generator(device=device).manual_seed(0)

    # 2) Resize to grow the matrix (new rows init’d by HF default)
    model.resize_token_embeddings(new_vocab)

    if target_weights is not None:
        # If target_weights are provided, use them to initialize the new embeddings
        with torch.no_grad():
            emb.weight[old_vocab:] = target_weights.to(device=device, dtype=emb.weight.dtype)
            model.tie_weights()  # Re-tie input and output embeddings
        print(f"Resized embeddings with provided target weights: {target_weights.shape}")
        return
    else:
        # 3) Grab the “old” weights and compute mean + covariance
        with torch.no_grad():
            W_old = emb.weight[:old_vocab]                # shape [old_vocab, dim]
            mu     = W_old.mean(dim=0, keepdim=True)       # [1, dim]
            X     = W_old - mu                             # zero-centered
            # unbiased covariance matrix: [dim, dim]
            sigma     = (X.T @ X) / (old_vocab - 1)

            # regularize Σ for numerical stability
            eps = 1e-5
            sigma += torch.eye(dim, device=device) * eps

            # Cholesky factorization of Σ = L Lᵀ
            L32 = torch.linalg.cholesky(sigma.to(torch.float32))     # [dim, dim], float32
            L   = L32.to(device=device, dtype=torch.bfloat16)       # back to bfloat16 or float16

            # 4) Sample n_new vectors: z ~ N(0, I) →  μ + z @ Lᵀ
            z = torch.randn(n_new, dim, device=device, generator=generator).to(torch.bfloat16)    # [n_new, dim]
            new_embs = mu + (z @ L.T)                      # [n_new, dim]

            # 5) Overwrite the new rows
            model.get_output_embeddings().weight[old_vocab:] = new_embs

        # 6) Re-tie input and output embeddings
        model.tie_weights()

def preprocess_data(examples, tokenizer):
    return tokenizer(examples["text"], truncation=True)


def load_and_split(path, seed):
    raw = load_dataset("json", data_files=path, split="train")
    
    # Fix 1: Remove the input_ids mapping since SFTTrainer uses text field
    # The dataset already has "text" field from prep.py, so we don't need to map anything
    
    s = raw.train_test_split(test_size=0.05, seed=seed)
    splits = DatasetDict({
        "train": s["train"],
        "eval":   s["test"],
    })
    splits.save_to_disk(os.path.join("dataset_splits"))
    print("Dataset sizes:", {k: len(v) for k,v in splits.items()})
    return splits


# callback that advances the profiler on each step
class ProfCallback(TrainerCallback):
    def __init__(self, prof):
        self.prof = prof
    def on_step_end(self, args, state, control, **kwargs):
        self.prof.step()
    def on_train_end(self, args, state, control, **kwargs):
        self.prof.__exit__(None, None, None)  # stop profiling





torch.manual_seed(args.seed)
os.makedirs(args.output_dir, exist_ok=True)

# 1) tokenizer
tokenizer = prepare_tokenizer(args.model_id)
tokenizer.save_pretrained(args.output_dir)

# 2) model + 4-bit quantization
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    args.model_id,
    quantization_config=bnb_cfg,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto")

# Calculate the number of new tokens
num_new_tokens = len(tokenizer) - len(model.get_input_embeddings().weight)

# Resize embeddings to match tokenizer size
# resize_embeddings(model, tokenizer)
model.resize_token_embeddings(len(tokenizer))

# 3) prepare LoRA
model = prepare_model_for_kbit_training(model)
lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=get_lora_targets(model),
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()


# 4) data
splits = load_and_split(args.train_file, args.seed)
train_split = preprocess_data(splits["train"], tokenizer)
eval_split = preprocess_data(splits["eval"], tokenizer)




trainable params: 26,415,168 || all params: 3,239,177,280 || trainable%: 0.8155




Saving the dataset (0/1 shards):   0%|          | 0/30739 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1618 [00:00<?, ? examples/s]

Dataset sizes: {'train': 30739, 'eval': 1618}


In [30]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer,
    mlm=False,              # causal LM
    pad_to_multiple_of=8    # optional but can help performance
)

In [31]:

# 5) trainer
train_args = TrainingArguments(
    output_dir=args.output_dir,
    per_device_train_batch_size=args.batch_size,
    gradient_accumulation_steps=8,
    max_steps=5,
    # num_train_epochs=args.epochs,
    learning_rate=args.lr,
    warmup_ratio=0.03,
    bf16=True,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    fp16=False,  # bfloat16 is used
    dataloader_num_workers=4,
    weight_decay=0.01,
    seed=args.seed,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=500,
    gradient_checkpointing=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_dir="./logs",

    #wandb
    report_to="wandb",
    run_name=args.wandb_name,
)

trainer = Trainer(
    model=model,
    train_dataset=train_split,
    eval_dataset=eval_split,
    args=train_args,
    data_collator=data_collator
)
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` be

AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/cole_harrison/meGPT/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/home/cole_harrison/meGPT/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch
    return self.collate_fn(data)
  File "/home/cole_harrison/meGPT/.venv/lib/python3.10/site-packages/transformers/trainer_utils.py", line 872, in __call__
    return self.data_collator(features)
  File "/home/cole_harrison/meGPT/.venv/lib/python3.10/site-packages/transformers/data/data_collator.py", line 46, in __call__
    return self.torch_call(features)
  File "/home/cole_harrison/meGPT/.venv/lib/python3.10/site-packages/transformers/data/data_collator.py", line 1018, in torch_call
    "input_ids": _torch_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
  File "/home/cole_harrison/meGPT/.venv/lib/python3.10/site-packages/transformers/data/data_collator.py", line 437, in _torch_collate_batch
    length_of_first = examples[0].size(0)
AttributeError: 'tokenizers.Encoding' object has no attribute 'size'
