In [1]:
%pip install --upgrade pip
%pip install -q -U bitsandbytes accelerate peft transformers trl datasets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import wandb
from huggingface_hub import notebook_login

In [None]:
# ============================================================================
# CONFIGURATION
# ============================================================================

class Config:
    """Hyperparameters and configuration"""

    # Model settings
    MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"

    # Dataset
    DATASET_PATH = r"C:\Users\Anwender\humor-project\data\humor_training_data_25000.jsonl"

    # QLoRA settings
    LORA_R = 16                   # LoRA attention dimension
    LORA_ALPHA = 32                # Alpha parameter for LoRA scaling
    LORA_DROPOUT = 0.05            # Dropout probability for LoRA layers

    # Target modules for LoRA (Llama 3 specific)
    TARGET_MODULES = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ]

    # Training hyperparameters
    OUTPUT_DIR = "./llama3-humor-lora"
    NUM_EPOCHS = 3
    BATCH_SIZE = 20                 # Per device batch size
    GRADIENT_ACCUM_STEPS = 1       # Effective batch size = 20 * 1 = 20
    LEARNING_RATE = 2e-4
    MAX_SEQ_LENGTH = 256
    WARMUP_RATIO = 0.03
    LOGGING_STEPS = 10
    SAVE_STEPS = 100
    EVAL_STEPS = 50

    # Optimization
    OPTIM = "paged_adamw_8bit"     # Memory-efficient optimizer
    FP16 = False
    BF16 = True                     # Use bfloat16 if available
    GRADIENT_CHECKPOINTING = True  # DISABLED - only when we have enough VRAM (was True)

    # Data loading - MAXIMIZE THROUGHPUT
    DATALOADER_NUM_WORKERS = 10     # Increased for faster data loading (was 2)
    DATALOADER_PIN_MEMORY = True    # NEW - faster CPU->GPU transfer
    DATALOADER_PREFETCH_FACTOR = 4  # NEW - prefetch batches

    # Packing - EFFICIENCY BOOST
    USE_PACKING = False              # NEW - pack multiple samples per sequence

    # Data split
    TRAIN_SPLIT = 0.95

    # WandB (optional)
    USE_WANDB = False
    WANDB_PROJECT = "llama3-humor-generation"

    # Seed
    SEED = 42

config = Config()

In [4]:
def setup_environment():
    """Install required packages and login to Hugging Face"""

    print("üîß Setting up environment...")

    # Check for GPU
    if torch.cuda.is_available():
        print(f"‚úì GPU detected: {torch.cuda.get_device_name(0)}")
        print(f"  Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    else:
        print("‚ö†Ô∏è  No GPU detected. Training will be very slow!")

    # Login to Hugging Face (needed for Llama 3)
    print("\nüîê Please login to Hugging Face:")
    print("   You need access to Llama 3 model (request at: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)")
    notebook_login()

    # Optional: WandB login
    if config.USE_WANDB:
        print("\nüìä Logging into Weights & Biases...")
        wandb.login()

    print("‚úì Environment setup complete\n")

In [5]:
# ============================================================================
# DATA LOADING
# ============================================================================

def load_and_prepare_dataset(dataset_path):
    """Load dataset and split into train/test"""

    print(f"üìÇ Loading dataset from {dataset_path}...")

    # Load dataset
    dataset = load_dataset('json', data_files=dataset_path, split='train')

    print(f"  Total examples: {len(dataset)}")

    # Split into train/test
    dataset = dataset.train_test_split(
        test_size=1 - config.TRAIN_SPLIT,
        seed=config.SEED
    )

    print(f"  Train examples: {len(dataset['train'])}")
    print(f"  Test examples: {len(dataset['test'])}")

    # Show sample
    print("\nüìã Sample training example:")
    sample = dataset['train'][0]
    print(f"  User: {sample['messages'][0]['content'][:100]}...")
    print(f"  Assistant: {sample['messages'][1]['content'][:100]}...")

    return dataset

In [None]:
# ============================================================================
# MODEL LOADING
# ============================================================================

def load_model_and_tokenizer():
    """Load Llama 3 with 4-bit quantization for QLoRA"""

    print(f"\nü§ñ Loading model: {config.MODEL_NAME}")

    # Quantization config for 4-bit
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16 if config.BF16 else torch.float16,
        bnb_4bit_use_double_quant=True,  # Nested quantization for more memory savings
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        config.MODEL_NAME,
        trust_remote_code=True
    )

    # Set padding token (Llama 3 doesn't have one by default)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    tokenizer.padding_side = "right"  # Required for training

    print("  ‚úì Tokenizer loaded")

    # Load model with quantization
    model = AutoModelForCausalLM.from_pretrained(
        config.MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        max_memory={0: "22GiB", "cpu": "40GiB"},
        trust_remote_code=True,
        # NEW: Enable optimizations 
        torch_dtype=torch.bfloat16 if config.BF16 else torch.float16,
        # attn_implementation="flash_attention_2",  # Use FlashAttention if available
    )

    print("  ‚úì Model loaded with 4-bit quantization")

    # Prepare model for k-bit training
    model = prepare_model_for_kbit_training(
        model,
        use_gradient_checkpointing=config.GRADIENT_CHECKPOINTING  # Use config setting
    )

    print("  ‚úì Model prepared for k-bit training")

    return model, tokenizer

In [7]:
# ============================================================================
# LORA CONFIGURATION
# ============================================================================

def setup_lora(model):
    """Configure and apply LoRA to the model"""

    print("\nüîß Setting up LoRA adapters...")

    lora_config = LoraConfig(
        r=config.LORA_R,
        lora_alpha=config.LORA_ALPHA,
        target_modules=config.TARGET_MODULES,
        lora_dropout=config.LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, lora_config)

    # Print trainable parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    all_params = sum(p.numel() for p in model.parameters())
    trainable_percent = 100 * trainable_params / all_params

    print(f"  ‚úì LoRA adapters applied")
    print(f"  Trainable params: {trainable_params:,} ({trainable_percent:.2f}%)")
    print(f"  All params: {all_params:,}")

    return model


In [8]:
def format_chat_template(example, tokenizer):
    """
    Format examples using Llama 3's chat template.
    The dataset already has 'messages' field in the correct format.
    """
    # Apply chat template
    formatted_text = tokenizer.apply_chat_template(
        example['messages'],
        tokenize=False,
        add_generation_prompt=False
    )

    return {"text": formatted_text}

In [None]:
from trl import SFTTrainer, SFTConfig

def train_model(model, tokenizer, dataset):

    print("\nüöÄ Starting training with OPTIMIZED settings...")
    print(f"  Batch size: {config.BATCH_SIZE}")
    print(f"  Gradient accumulation: {config.GRADIENT_ACCUM_STEPS}")
    print(f"  Effective batch size: {config.BATCH_SIZE * config.GRADIENT_ACCUM_STEPS}")
    print(f"  Max sequence length: {config.MAX_SEQ_LENGTH}")
    print(f"  Packing enabled: {config.USE_PACKING}")
    print(f"  Gradient checkpointing: {config.GRADIENT_CHECKPOINTING}")


    # 1Ô∏è‚É£ Apply chat template once 
    def apply_template(example):
        text = tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
            add_generation_prompt=False
        )
        return {"text": text}

    print("üìù Applying chat template to dataset...")
    train_dataset = dataset["train"].map(
        apply_template,
        remove_columns=dataset["train"].column_names,
        desc="Formatting train dataset",
        num_proc=4,  # NEW - parallel processing
    )

    eval_dataset = dataset["test"].map(
        apply_template,
        remove_columns=dataset["test"].column_names,
        desc="Formatting eval dataset",
        num_proc=4,  # NEW - parallel processing
    )

    print("‚úì Datasets formatted")
    print(f"Sample:\n{train_dataset[0]['text'][:300]}")

    # 2Ô∏è‚É£ SFT config
    training_args = SFTConfig(
        output_dir=config.OUTPUT_DIR,
        num_train_epochs=config.NUM_EPOCHS,

        # Batch settings - OPTIMIZED
        per_device_train_batch_size=config.BATCH_SIZE,
        per_device_eval_batch_size=config.BATCH_SIZE,
        gradient_accumulation_steps=config.GRADIENT_ACCUM_STEPS,
        gradient_checkpointing=config.GRADIENT_CHECKPOINTING,  # DISABLED for speed

        # Optimizer settings
        optim=config.OPTIM,
        learning_rate=config.LEARNING_RATE,
        warmup_ratio=config.WARMUP_RATIO,
        lr_scheduler_type="cosine",
        max_grad_norm=1.0,  # Gradient clipping

        # Logging and saving
        logging_steps=config.LOGGING_STEPS,
        save_steps=config.SAVE_STEPS,
        eval_steps=config.EVAL_STEPS,
        eval_strategy="steps",
        save_strategy="steps",
        logging_strategy="steps",
        logging_first_step=True,
        log_level="info",
        disable_tqdm=False,   # progress bar

        # Precision
        fp16=config.FP16,
        bf16=config.BF16,

        # Model management
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,

        # Reporting
        report_to="wandb" if config.USE_WANDB else [],
        run_name=f"llama3-humor-r{config.LORA_R}-optimized" if config.USE_WANDB else None,

        # Reproducibility
        seed=config.SEED,

        # Data loading - PERFORMANCE CRITICAL
        dataloader_num_workers=config.DATALOADER_NUM_WORKERS,
        dataloader_pin_memory=config.DATALOADER_PIN_MEMORY,
        dataloader_prefetch_factor=config.DATALOADER_PREFETCH_FACTOR,
        group_by_length=True,  # Group similar lengths together

        # SFT-specific - OPTIMIZED
        max_length=config.MAX_SEQ_LENGTH,  # Reduced for humor
        packing=config.USE_PACKING,  # ENABLED - major speedup
        dataset_text_field="text",

        # NEW: Additional optimizations
        ddp_find_unused_parameters=False,  # Faster distributed training
        remove_unused_columns=True,
        label_names=["labels"],  # Explicit label column
    )

    # 3Ô∏è‚É£ Trainer ‚Äî NO tokenizer / processing_class
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    print("\n" + "=" * 60)
    print("TRAINING STARTED")
    print("=" * 60)

    trainer.train()

    print("\n‚úì TRAINING COMPLETE")

    trainer.save_model()
    tokenizer.save_pretrained(config.OUTPUT_DIR)

    return trainer

In [10]:
# ============================================================================
# INFERENCE TESTING
# ============================================================================

def test_model(model, tokenizer):
    """Test the fine-tuned model with sample inputs"""

    print("\nüß™ Testing fine-tuned model...")

    # Put model in evaluation mode
    model.eval()

    # Test cases
    test_cases = [
        ("telescope", "sandwich"),
        ("elephant", "programming"),
        ("coffee", "quantum"),
    ]

    for word1, word2 in test_cases:
        prompt = f"Generate a funny joke that naturally includes both of these words: '{word1}' and '{word2}'. The joke should be creative, humorous, and incorporate both words seamlessly."

        # Format with chat template
        messages = [{"role": "user", "content": prompt}]
        formatted_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Tokenize
        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

        # Generate
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract just the assistant's response
        if "assistant" in generated:
            joke = generated.split("assistant")[-1].strip()
        else:
            joke = generated[len(formatted_prompt):].strip()

        print(f"\n{'='*60}")
        print(f"Words: {word1}, {word2}")
        print(f"{'='*60}")
        print(f"Joke: {joke}")

    print("\n‚úì Testing complete")

In [11]:
# ============================================================================
# MAIN PIPELINE
# ============================================================================

"""Run the complete fine-tuning pipeline"""

print("="*60)
print("LLAMA 3 HUMOR GENERATION - QLORA FINE-TUNING")
print("="*60)

# Setup
setup_environment()

LLAMA 3 HUMOR GENERATION - QLORA FINE-TUNING
üîß Setting up environment...
‚úì GPU detected: NVIDIA GeForce RTX 5090 Laptop GPU
  Memory: 25.65 GB

üîê Please login to Hugging Face:
   You need access to Llama 3 model (request at: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
‚úì Environment setup complete



In [12]:
# Load data
dataset = load_and_prepare_dataset(config.DATASET_PATH)

üìÇ Loading dataset from humor_training_data_25000.jsonl...
  Total examples: 25000
  Train examples: 23749
  Test examples: 1251

üìã Sample training example:
  User: Can you create a joke that naturally incorporates 'angry' and 'late'? Make it humorous and creative....
  Assistant: My husband is like the New York subway... He makes me angry, frustrated, and late for work but I sti...


In [15]:
# Load model
model, tokenizer = load_model_and_tokenizer()


ü§ñ Loading model: meta-llama/Meta-Llama-3-8B-Instruct


`torch_dtype` is deprecated! Use `dtype` instead!


  ‚úì Tokenizer loaded


Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

  ‚úì Model loaded with 4-bit quantization
  ‚úì Model prepared for k-bit training


In [16]:
# Setup LoRA
model = setup_lora(model)


üîß Setting up LoRA adapters...
  ‚úì LoRA adapters applied
  Trainable params: 41,943,040 (0.92%)
  All params: 4,582,543,360


In [17]:
tokenizer.apply_chat_template(
    dataset["train"][0]["messages"],
    tokenize=False
)

"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nCan you create a joke that naturally incorporates 'angry' and 'late'? Make it humorous and creative.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy husband is like the New York subway... He makes me angry, frustrated, and late for work but I still can't help but ride him every day.<|eot_id|>"

In [18]:
# # Train
trainer = train_model(model, tokenizer, dataset)


üöÄ Starting training with OPTIMIZED settings...
  Batch size: 20
  Gradient accumulation: 1
  Effective batch size: 20
  Max sequence length: 256
  Packing enabled: False
  Gradient checkpointing: True
üìù Applying chat template to dataset...


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.


‚úì Datasets formatted
Sample:
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Can you create a joke that naturally incorporates 'angry' and 'late'? Make it humorous and creative.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

My husband is like the New York subway... He makes me angry, frustrated, and late fo


Tokenizing train dataset:   0%|          | 0/23749 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/23749 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1251 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1251 [00:00<?, ? examples/s]


TRAINING STARTED


The following columns in the Training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text. If text are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
skipped Embedding(128256, 4096): 501.0M params
skipped: 501.0M params
***** Running training *****
  Num examples = 23,749
  Num Epochs = 3
  Instantaneous batch size per device = 20
  Total train batch size (w. parallel, distributed & accumulation) = 20
  Gradient Accumulation steps = 1
  Total optimization steps = 3,564
  Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
50,1.630625,1.710829
100,1.240953,1.46298
150,1.197524,1.388717
200,1.163696,1.326078
250,1.183858,1.312957
300,1.116703,1.266122
350,1.11501,1.254115
400,1.102608,1.230322
450,1.089451,1.172595
500,1.056166,1.151025


The following columns in the Evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text. If text are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 1251
  Batch size = 20
The following columns in the Evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text. If text are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 1251
  Batch size = 20
Saving model checkpoint to ./llama3-humor-lora\checkpoint-100
loading configuration file config.json from cache at C:\Users\Anwender\.cache\huggingface\hub\models--meta-llama--Meta-Llama-3-8B-Instruct\snapshots\8afb486c1db24fe5011ec46dfbe5b5dccdb575c2\config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attent


‚úì TRAINING COMPLETE


loading configuration file config.json from cache at C:\Users\Anwender\.cache\huggingface\hub\models--meta-llama--Meta-Llama-3-8B-Instruct\snapshots\8afb486c1db24fe5011ec46dfbe5b5dccdb575c2\config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "dtype": "bfloat16",
  "eos_token_id": 128009,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": null,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_parameters": {
    "rope_theta": 500000.0,
    "rope_type": "default"
  },
  "tie_word_embeddings": false,
  "transformers_version": "5.1.0",
  "use_cache": true,
  "vocab_size": 128256
}

chat template saved in ./llama3-h

In [19]:
# # Test
test_model(model, tokenizer)

print("\n" + "="*60)
print("üéâ PIPELINE COMPLETE!")
print("="*60)
print(f"\nModel saved to: {config.OUTPUT_DIR}")
print("\nTo use your model:")
print("  1. Load with: model = AutoModelForCausalLM.from_pretrained(...)")
print("  2. Load LoRA: model = PeftModel.from_pretrained(model, 'path/to/lora')")
print("  3. Generate jokes!")


üß™ Testing fine-tuned model...

Words: telescope, sandwich
Joke: What do you call a sandwich made with a telescope? Telescopic.

Words: elephant, programming
Joke: Programming jokes are fun...... but only when executed properly.  Edit: Whoops! Looks like I accidentally posted this in the wrong place. Oops, wrong elephant.

Words: coffee, quantum
Joke: You can't pour coffee into a quantum superposition of cups......because you'd have to be in two states of mind.

‚úì Testing complete

üéâ PIPELINE COMPLETE!

Model saved to: ./llama3-humor-lora

To use your model:
  1. Load with: model = AutoModelForCausalLM.from_pretrained(...)
  2. Load LoRA: model = PeftModel.from_pretrained(model, 'path/to/lora')
  3. Generate jokes!


In [None]:
# !zip -r ./llama3-humor-lora/checkpoint-18.zip ./llama3-humor-lora/checkpoint-18