In [None]:
# Colab cell 1: install dependencies
!pip install -q unsloth accelerate peft trl transformers bitsandbytes datasets

In [None]:
# Colab cell 2: imports & model/tokenizer load
import torch
from datasets import Dataset
from transformers import TrainingArguments, pipeline
from trl import SFTTrainer
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

# Load the pre-quantized 4-bit Llama-3-8B checkpoint and its tokenizer.
# NOTE(review): the repo name has no "-Instruct" suffix, so this appears to be
# the base model rather than a chat-tuned one - confirm that is intended.
MODEL_NAME = "unsloth/llama-3-8b-bnb-4bit"
MAX_SEQ_LENGTH = 2048  # maximum context length handed to Unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    # float16 because the recorded run is on a Tesla T4, which reports Bfloat16 = FALSE.
    dtype=torch.float16,
    load_in_4bit=True,
    device_map="auto",
)

==((====))==  Unsloth 2025.4.1: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Colab cell 3: define a tiny code-gen dataset
# Each record pairs an instruction with a reference solution; "input" stays
# empty because neither task needs extra context.
data = [
    {"instruction": task, "input": "", "output": solution}
    for task, solution in (
        (
            "Write a Python function to reverse a string.",
            "def reverse_string(s):\n    return s[::-1]",
        ),
        (
            "Write a function to check if a number is prime.",
            "\n".join(
                (
                    "def is_prime(n):",
                    "    if n <= 1:",
                    "        return False",
                    "    for i in range(2, int(n**0.5) + 1):",
                    "        if n % i == 0:",
                    "            return False",
                    "    return True",
                )
            ),
        ),
    )
]
dataset = Dataset.from_list(data)

In [None]:
# Colab cell 4: manual prompt formatting (no chat_templates)
def preprocess(example):
    """Add a ``"text"`` field holding the full training prompt for one record.

    Args:
        example: Mapping with ``"instruction"`` and ``"output"`` entries and an
            optional ``"input"`` entry carrying extra context for the task.

    Returns:
        The same mapping, with ``"text"`` set to a single user turn followed by
        a single assistant turn written with Llama-3 style header/end-of-turn
        markers.

    NOTE(review): the published Llama 3 chat format puts a blank line after
    each header and no newline after ``<|eot_id|>``; this layout differs -
    confirm it is intentional. It is left unchanged here so that it keeps
    matching the hand-built inference prompt elsewhere in this notebook.
    """
    instruction = example["instruction"]
    # Tolerate a missing or None "input" instead of raising / printing "None".
    extra_input = example.get("input") or ""
    # Put the optional input on its own line; without a separator it would be
    # glued directly onto the last word of the instruction.
    user_turn = f"{instruction}\n{extra_input}" if extra_input else instruction

    example["text"] = (
        "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n"
        f"{user_turn}<|eot_id|>\n"
        "<|start_header_id|>assistant<|end_header_id|>\n"
        f"{example['output']}<|eot_id|>"
    )
    return example

# Run preprocess over every record so each one gains the formatted "text" field.
dataset = dataset.map(preprocess)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [None]:
# Colab cell 9: robust inference extraction

# 1. Build the prompt with the same layout `preprocess` produces, ending on the
#    open assistant header so the model writes the assistant turn.
prompt = (
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n"
    "Write a Python function to calculate factorial.<|eot_id|>\n"
    "<|start_header_id|>assistant<|end_header_id|>\n"
)

# 2. Tokenize and generate
# The prompt already spells out <|begin_of_text|>, so the tokenizer must not
# prepend a second BOS token. Follow the model's device instead of assuming "cuda".
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(model.device)
prompt_length = inputs["input_ids"].shape[-1]

# Stop at the end-of-turn marker as well as the tokenizer's default EOS;
# otherwise generation can run straight past the end of the assistant turn.
stop_token_ids = []
for token_id in (tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")):
    if token_id is not None and token_id not in stop_token_ids:
        stop_token_ids.append(token_id)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        eos_token_id=stop_token_ids,
        pad_token_id=tokenizer.eos_token_id,
    )

# 3. Decode full text (prompt + completion) for inspection
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("=== Full decoded output ===\n", decoded, "\n")

# 4. Extract the assistant response from the token ids, not from the string.
#    Decoding with skip_special_tokens=True removes every <|...|> marker, so
#    searching `decoded` for the assistant header or <|eot_id|> can never match
#    (the previous string-splitting fallbacks always returned the whole text).
#    Everything after the prompt's token count is, by construction, the reply.
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print("=== Extracted assistant response ===\n", response.strip())

=== Full decoded output ===
 user
Write a Python function to calculate factorial.
assistant
Write a Python function to calculate factorial.
 

=== Extracted assistant response ===
 user
Write a Python function to calculate factorial.
assistant
Write a Python function to calculate factorial.


In [None]:
# Inference with pipeline + add_generation_prompt
from transformers import pipeline

# `model` is already loaded, quantized and placed on its device, so the
# loader-only options (device_map, torch_dtype, trust_remote_code) are not
# passed: they only matter when the pipeline loads a checkpoint by name, and
# trust_remote_code should not be enabled when nothing needs it.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Build prompt via the tokenizer helper
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Write a Python function to calculate factorial."}],
    tokenize=False,
    add_generation_prompt=True,
)

# Generate! return_full_text=False keeps the prompt out of the result so only
# the model's continuation is returned.
result = pipe(
    prompt,
    max_new_tokens=150,
    do_sample=True,
    temperature=0.7,
    return_full_text=False,
)
completion = result[0]["generated_text"]

# The recorded run shows the active template rendering "### Instruction:" /
# "### Response:" sections and the model inventing further instructions until
# it hits max_new_tokens. Keep only the first response; the split is a no-op
# when the marker does not occur.
completion = completion.split("### Instruction:", 1)[0]
print(completion.strip())

Device set to use cuda:0


<|begin_of_text|>Below are some instructions that describe some tasks. Write responses that appropriately complete each request.

### Instruction:
Write a Python function to calculate factorial.

### Response:
def factorial(n):
    total = 1
    for i in range(1, n + 1):
        total *= i
    return total

### Instruction:
Write a Python function to calculate the number of days in a given month.

### Response:
def days_in_month(month):
    if month in (1, 3, 5, 7, 8, 10, 12):
        return 31
    elif month in (4, 6, 9, 11):
        return 30
    else:
        return 28

### Instruction:
Write a Python function to calculate the number of days in a given month, given a year.

### Response:
def days_in_month(month, year


In [None]:
# Step 6: Extract clean assistant response
def _extract_assistant_text(full_text):
    """Return the first assistant turn found in ``full_text``.

    The turn starts right after the assistant header and ends at the next
    special-token opener (``<|``), e.g. ``<|eot_id|>``. Returns ``None`` when
    the header is absent, which happens when the text was decoded with special
    tokens stripped.
    """
    _, header, tail = full_text.partition("<|start_header_id|>assistant<|end_header_id|>")
    if not header:
        return None
    # Cut only at special-token markers. Splitting on plain words such as
    # "user", "assistant" or "content" would truncate any legitimate answer
    # that happens to contain them.
    return tail.split("<|", 1)[0].strip()


print("=== Assistant Response ===\n")
assistant_part = _extract_assistant_text(decoded)
if assistant_part is None:
    # Explicit fallback instead of a bare `except:` that would hide real errors.
    print("Couldn't parse the assistant's response. Here's the raw output instead:\n")
    print(decoded)
else:
    print(assistant_part)

=== Assistant Response ===

def factorial(n):
    if n == 0:
        return 1
    else:
        return n * factorial(n-1)
print(factorial(5))
