In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

from transformers import BitsAndBytesConfig

In [2]:
question = """There are 87 oranges and 290 bananas in Philip's collection. If the bananas are organized into 2 groups and oranges are organized into 93 groups How big is each group of bananas?"""

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, prepare_model_for_kbit_training

# 1. Load quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# 2. Load base model in 4-bit
base = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    device_map="auto",
    quantization_config=bnb_config
)

print(sum(p.numel() for p in base.parameters()))

# 3. Prepare base for k-bit LoRA training
base = prepare_model_for_kbit_training(base)

# 4. Load LoRA adapter (this must point to your adapter subdir!)
peft_model = PeftModel.from_pretrained(
    base,
    "../outputs/tinyllama-gsm8k-lora/adapter",
    is_trainable=True  # 👈 **this is critical** to activate LoRA
)

# 5. Count parameters before merge
total_params = sum(p.numel() for p in peft_model.parameters())
trainable_params = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Trainable %: {100 * trainable_params / total_params:.4f}%")


615606272
Total parameters: 620,111,872
Trainable parameters: 4,505,600
Trainable %: 0.7266%


In [5]:
# BASE MODEL

# Load base model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    device_map="auto",
    torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
tokenizer.pad_token = tokenizer.eos_token  # pad token required for batching

# Inference prompt

# question above
prompt = f"### Question:\n{question}\n\n### Answer:\n"

# Tokenize and generate
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=False,      # greedy decoding
)

# Print result
total_params = sum(p.numel() for p in model.parameters())
print(total_params)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

1100048384
### Question:
There are 87 oranges and 290 bananas in Philip's collection. If the bananas are organized into 2 groups and oranges are organized into 93 groups How big is each group of bananas?

### Answer:
The bananas are organized into 93 groups, and the oranges are organized into 290 groups. Therefore, the bananas are organized into 93 groups, and the oranges are organized into 290 groups.


In [28]:
## eval
import argparse, re, torch
from pathlib import Path
from tqdm.auto import tqdm
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, prepare_model_for_kbit_training
import math

NUM_RE = re.compile(r"[-+]?\d*\.?\d+")

def last_number(text: str):
    nums = NUM_RE.findall(text)
    return nums[-1].lstrip("0") if nums else None        # strip leading zeros for fair match

def build_svamp_prompt(ex):
    if "question_concat" in ex:          # mirror with pre-joined field
        qtext = ex["question_concat"]
    else:                                # ChilleD mirror → join Body + Question
        qtext = f"{ex['Body'].strip()} {ex['Question'].strip()}"
    return f"### Question:\n{qtext}\n\n### Answer:\n"

logs = []
outs = []
@torch.inference_mode()
def accuracy(model, tok, dataset, prompt_fn, gold_fn, n=None):
    model.eval()
    device = next(model.parameters()).device
    total, correct = 0, 0

    iterator = dataset if not n else dataset.select(range(n))
    for ex in iterator:
        prompt = prompt_fn(ex)
        inputs = tok(prompt, return_tensors="pt").to(device)

        out_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,
        )[0]
        
        outs.append(tok.decode(out_ids, skip_special_tokens=True)) # take this out
        
        pred  = last_number(tok.decode(out_ids, skip_special_tokens=True))
        gold  = gold_fn(ex)

        logs.append((pred, gold))

        if pred is not None and gold is not None and math.isclose(float(pred), float(gold)):
            correct += 1
        total += 1
    return correct / total if total else 0.0

In [None]:
    
gsm_acc  = accuracy(model, tok, gsm8k,
                    prompt_fn=lambda ex: f"### Question:\n{ex['question'].strip()}\n\n### Answer:\n",
                    gold_fn   =lambda ex: last_number(ex["answer"]),
                    n=args.n)

In [29]:
svamp_acc = accuracy(base, base_tok, svamp,
                     prompt_fn=build_svamp_prompt,
                     gold_fn  =lambda ex: str(ex["Answer"]).strip()
                    )

ValueError: could not convert string to float: ''

In [30]:
logs

[('62', '27'),
 ('15', '4'),
 ('36', '16'),
 ('312', '64'),
 ('26', '31'),
 ('24', '720'),
 ('1', '21'),
 ('38', '64'),
 ('419', '450'),
 ('', '143550')]

In [21]:
gsm8k = load_dataset("gsm8k", "main", split="test")
svamp = load_dataset("ChilleD/SVAMP", split="test")     # ✅ works


In [24]:
# 2. base model (fp16)
print("🔹 Loading TinyLlama-1.1B base …")
base = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    torch_dtype=torch.float16,
    device_map="auto",
)
base_tok = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
base_tok.pad_token = base_tok.eos_token

# 3. LoRA‐fine-tuned (4-bit)
print("🔹 Loading LoRA adapter …")
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
lora_base = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    quantization_config=bnb_cfg,
    device_map="auto",
)
lora_base = prepare_model_for_kbit_training(lora_base)

lora_model = PeftModel.from_pretrained(
    lora_base,
    "../outputs/tinyllama-gsm8k-lora/adapter",   # ← your path
    is_trainable=False,  # inference only
)
lora_tok = AutoTokenizer.from_pretrained("../outputs/tinyllama-gsm8k-lora")
lora_tok.pad_token = lora_tok.eos_token

🔹 Loading TinyLlama-1.1B base …
🔹 Loading LoRA adapter …


In [None]:
# 4. evaluate
print("\n🚀 Evaluating BASE model …")
acc_base_gsm  = accuracy(base,  base_tok, gsm8k, args.n)
acc_base_svam = accuracy(base,  base_tok, svamp, args.n)

print("\n🚀 Evaluating LoRA-FINETUNED model …")
acc_lora_gsm  = accuracy(lora_model, lora_tok, gsm8k, args.n)
acc_lora_svam = accuracy(lora_model, lora_tok, svamp, args.n)

# 5. report
print("\n════════════ ACCURACY ════════════")
print(f"{'Model':<15} | {'GSM8K':>7} | {'SVAMP':>7}")
print("-" * 33)
print(f"{'BASE':<15} | {acc_base_gsm*100:6.2f}% | {acc_base_svam*100:6.2f}%")
print(f"{'LoRA-FT':<15} | {acc_lora_gsm*100:6.2f}% | {acc_lora_svam*100:6.2f}%")