In [1]:
# ==========================================
# 1. INSTALL STABLE LOCAL DEPENDENCIES
# ==========================================
# Pinning dspy-ai to 2.4.17 for stability with local models
!pip install -q dspy-ai==2.4.17 bitsandbytes accelerate transformers datasets

import torch
import dspy
import gc
from dspy.teleprompt import BootstrapFewShot
from dspy.evaluate import Evaluate
from datasets import load_dataset
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer

# ==========================================
# 2. LOAD LOCAL MODEL (Memory Optimized)
# ==========================================
# Release cached CUDA allocations and collect unreachable Python objects left
# over from earlier runs of this cell, before loading a new copy of the model.
torch.cuda.empty_cache()
gc.collect()

# Hugging Face Hub repository to load the model and tokenizer from.
# NOTE(review): the repo name suggests the weights are already stored in
# bitsandbytes 4-bit form — confirm whether the explicit config below is
# redundant for this checkpoint.
model_id = "unsloth/llama-3-8b-Instruct-bnb-4bit"
print(f"Loading {model_id}...")

# 4-bit NF4 quantization with float16 compute, chosen (per the original
# author) so the model fits on a free Colab T4 GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# NOTE(review): trust_remote_code=True permits running Python code shipped in
# the model repository — confirm it is actually required for this checkpoint.
hf_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Reuse the EOS token as the padding token on both the tokenizer and the model
# config. The original author added this to stop runaway generation.
tokenizer.pad_token = tokenizer.eos_token
hf_model.config.pad_token_id = tokenizer.eos_token_id

# ==========================================
# 3. ROBUST CUSTOM WRAPPER (The Fix)
# ==========================================
class LocalLlamaWrapper(dspy.LM):
    """Expose a locally loaded Hugging Face causal LM through ``dspy.LM``.

    Generation is greedy (``do_sample=False``) and capped at
    ``max_new_tokens`` new tokens per request. The wrapper can be called with
    either a raw ``prompt`` string or a chat-style ``messages`` list; messages
    are rendered to text with the tokenizer's chat template.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``. It must provide
            ``apply_chat_template`` for message-based calls.
        max_new_tokens: Upper bound on generated tokens per request. The
            default of 100 keeps memory use low, but may truncate long
            chain-of-thought outputs.
    """

    def __init__(self, model, tokenizer, max_new_tokens=100):
        super().__init__("local-llama")
        # Overwrite whatever the base class stored under `model` with the
        # actual Hugging Face model object used for generation.
        self.model = model
        self.tokenizer = tokenizer
        self.max_new_tokens = max_new_tokens

    def basic_request(self, prompt, *, add_special_tokens=True, **kwargs):
        """Generate one completion for ``prompt``.

        Args:
            prompt: Text to feed to the model.
            add_special_tokens: Forwarded to the tokenizer. Pass ``False``
                when ``prompt`` already contains its special tokens (for
                example text produced by a chat template).
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            A single-element list holding the stripped completion text.
        """
        # Tokenize and move the tensors to the device the model lives on.
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            add_special_tokens=add_special_tokens,
        ).to(self.model.device)

        # `temperature` is deliberately not passed: with do_sample=False
        # decoding is greedy and transformers reports the flag as invalid.
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=self.max_new_tokens,
                do_sample=False,
                pad_token_id=self.tokenizer.eos_token_id,
            )

        # The output sequence starts with the prompt tokens; keep only the
        # newly generated tail.
        prompt_length = inputs["input_ids"].shape[1]
        generated_tokens = outputs[0][prompt_length:]
        response = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
        return [response.strip()]

    def __call__(self, prompt=None, messages=None, **kwargs):
        """Run the model on a prompt string or a list of chat messages.

        Args:
            prompt: Raw prompt text. Takes precedence over ``messages``.
            messages: Chat messages, rendered with the tokenizer's chat
                template when no ``prompt`` is given.
            **kwargs: Forwarded to ``basic_request``.

        Returns:
            A single-element list holding the completion text.
        """
        templated = False
        if messages and not prompt:
            # Convert chat messages to a single formatted string.
            prompt = self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )
            templated = bool(prompt)

        if not prompt:
            prompt = " "  # Safety fallback: never tokenize an empty prompt.

        # Chat templates already emit the special tokens they need, so adding
        # them again at tokenization time would duplicate them (e.g. BOS).
        request_kwargs = dict(kwargs)
        request_kwargs.setdefault("add_special_tokens", not templated)
        completions = self.basic_request(prompt, **request_kwargs)

        # Record the call so inspect_history() has something to display; this
        # override bypasses whatever bookkeeping the base __call__ performs.
        # NOTE(review): the entry keys mirror what dspy's history inspection
        # appears to read — confirm against the installed dspy version.
        history = getattr(self, "history", None)
        if isinstance(history, list):
            history.append(
                {
                    "prompt": prompt,
                    "messages": messages,
                    "kwargs": kwargs,
                    "outputs": completions,
                }
            )

        return completions

# Wrap the loaded model and tokenizer, then register the wrapper as the
# language model in DSPy's global settings.
lm = LocalLlamaWrapper(hf_model, tokenizer)
dspy.settings.configure(lm=lm)
print("✅ Local Model Connected.")

# ==========================================
# 4. DATA & TASK
# ==========================================
print("Loading Data...")
# Only the validation split of SQuAD is loaded; the train and dev examples
# used below are both taken from it.
dataset = load_dataset("squad", split="validation")

def convert_to_dspy(row):
    """Turn one dataset row into a ``dspy.Example`` for question answering.

    Args:
        row: Mapping with ``question`` and ``context`` entries, plus an
            ``answers`` entry whose ``text`` sequence holds the reference
            answers. Only the first reference answer is kept.

    Returns:
        A ``dspy.Example`` with ``question``, ``context`` and ``answer``
        fields, where ``question`` and ``context`` are marked as inputs.
    """
    fields = {
        "question": row["question"],
        "context": row["context"],
        # First reference answer only; indexing fails if there is none.
        "answer": row["answers"]["text"][0],
    }
    example = dspy.Example(**fields)
    return example.with_inputs("question", "context")

# Tiny, non-overlapping splits to keep the run from crashing: rows 0-4 become
# the 5 training examples and rows 5-14 the 10 dev examples.
trainset = [convert_to_dspy(x) for x in dataset.select(range(0, 5))]
devset = [convert_to_dspy(x) for x in dataset.select(range(5, 15))]

# Input/output contract for the QA task: two inputs (context, question) and
# one output (answer).
# NOTE(review): the class docstring is presumably used by DSPy as the task
# instruction in the prompt, so it is runtime text — do not reword it casually.
class QASignature(dspy.Signature):
    """Answer questions based on the context. Give short answers."""
    # Passage the answer should be drawn from.
    context = dspy.InputField(desc="facts")
    # Question to answer about the passage.
    question = dspy.InputField()
    # Model-produced answer, expected to be short.
    answer = dspy.OutputField(desc="short answer")

class QAModule(dspy.Module):
    """DSPy module that answers a question about a context passage.

    It holds a single ``dspy.ChainOfThought`` predictor built from
    ``QASignature`` and delegates every call to it.
    """

    def __init__(self):
        super().__init__()
        # The attribute name `prog` is part of the module's visible state,
        # so it is kept unchanged.
        self.prog = dspy.ChainOfThought(QASignature)

    def forward(self, question, context):
        """Run the predictor on one question/context pair and return its result."""
        inputs = {"question": question, "context": context}
        prediction = self.prog(**inputs)
        return prediction

# ==========================================
# 5. OPTIMIZATION LOOP
# ==========================================
def validate_answer(gold, pred, trace=None):
    """Return True when the gold answer appears inside the predicted answer.

    The comparison is a case-insensitive substring match on
    whitespace-stripped text.

    Args:
        gold: Reference example; its ``answer`` attribute is the expected text.
        pred: Prediction; its ``answer`` attribute is the model output.
        trace: Unused. Accepted so the function matches the metric signature
            it is called with.

    Returns:
        ``True`` if the gold answer is contained in the predicted answer.
        ``False`` if either answer is missing, not a string, or empty after
        stripping. An empty gold answer would otherwise match every
        prediction.
    """
    expected = getattr(gold, "answer", None)
    predicted = getattr(pred, "answer", None)

    # A failed parse can leave the prediction without a usable answer field;
    # score that as a miss instead of raising from inside the metric.
    if not isinstance(expected, str) or not isinstance(predicted, str):
        return False

    expected = expected.strip().casefold()
    predicted = predicted.strip().casefold()
    if not expected or not predicted:
        return False

    return expected in predicted

# Free up memory before evaluation
gc.collect()
torch.cuda.empty_cache()

print("\n--- PHASE 1: Baseline Evaluation ---")
# One evaluator is built here and reused for both the baseline and the
# optimized program, so both are scored on the same dev set with the same
# metric. It runs single-threaded with a progress bar.
evaluator = Evaluate(devset=devset, metric=validate_answer, num_threads=1, display_progress=True, display_table=0)
# Score a fresh, uncompiled QAModule; the result is printed as a percentage.
baseline_score = evaluator(QAModule())
print(f"Baseline Accuracy: {baseline_score}%")

print("\n--- PHASE 2: Optimization (Training Prompt) ---")
# BootstrapFewShot learns by example. max_bootstrapped_demos=1 is set for
# speed and memory (the original author's stated reason).
teleprompter = BootstrapFewShot(metric=validate_answer, max_bootstrapped_demos=1)

print("Compiling... (This will take ~1-2 mins)")
# Compile a second fresh QAModule against the training examples.
optimized_program = teleprompter.compile(QAModule(), trainset=trainset)

print("\n--- PHASE 3: Final Evaluation ---")
final_score = evaluator(optimized_program)

print(f"\n=== RESULTS ===")
print(f"Baseline: {baseline_score}%")
print(f"Optimized: {final_score}%")

print("\n--- The Optimized Prompt ---")
# Ask the LM wrapper to print its most recent recorded call — presumably the
# last dev-set prompt, including any bootstrapped demo.
# NOTE(review): this only prints something if the wrapper records its calls
# in its history — verify that it does.
lm.inspect_history(n=1)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.9/297.9 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.5/72.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h

  from google.cloud.aiplatform.utils import gcs_utils


Loading unsloth/llama-3-8b-Instruct-bnb-4bit...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]



model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

✅ Local Model Connected.
Loading Data...


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]


--- PHASE 1: Baseline Evaluation ---
  0%|          | 0/10 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Average Metric: 8 / 10  (80.0): 100%|██████████| 10/10 [01:14<00:00,  7.42s/it]
Baseline Accuracy: 80.0%

--- PHASE 2: Optimization (Training Prompt) ---
Compiling... (This will take ~1-2 mins)


 20%|██        | 1/5 [00:13<00:54, 13.55s/it]


Bootstrapped 1 full traces after 2 examples in round 0.

--- PHASE 3: Final Evaluation ---
Average Metric: 7 / 10  (70.0): 100%|██████████| 10/10 [02:37<00:00, 15.78s/it]

=== RESULTS ===
Baseline: 80.0%
Optimized: 70.0%

--- The Optimized Prompt ---




