## Set OpenAI API Key for Teacher–Student Dataset Creation

In [None]:
# %%
import os
from ipywidgets import Password, Button, Output, VBox

pw = Password(description='API key:')
btn = Button(description='Set key')
out = Output()

def on_click(_):
    key = (pw.value or "").strip()
    os.environ["OPENAI_API_KEY"] = key
    with out:
        out.clear_output()
        if key.startswith("sk-"):
            print("✅ OPENAI_API_KEY set for this session.")
        else:
            print("⚠️ Set, but it doesn't look like a typical key (no 'sk-').")

btn.on_click(on_click)
VBox([pw, btn, out])


VBox(children=(Password(description='API key:'), Button(description='Set key', style=ButtonStyle()), Output())…

## Generate Instruction-Tuning Dataset from PDF Using OpenAI

In [27]:
# %% [markdown]
# Small dataset maker: PDF ➜ prompt/completion pairs ➜ HF DatasetDict (train)

# %% 
# !pip install -q pypdf datasets openai  # uncomment if needed

import os, json, re
from typing import List, Dict

from pypdf import PdfReader
from datasets import Dataset, DatasetDict
from openai import OpenAI

# ==== CONFIG ====
PDF_PATH = "./Hassan_Hamidi_v825.pdf"              # <-- put your PDF path here
N_EXAMPLES = 25                     # how many examples to generate
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o")  # pick a cheap, capable model
# You must have OPENAI_API_KEY set in your environment.
# e.g., in bash: export OPENAI_API_KEY="sk-..."

# (Optional) extra steerage for the kind of instructions you want:
SEED_INSTRUCTION = (
    "Current document is a CV, so focus on career, skills, and achievements. use firt person perspective and contextualize the answers as if you are the person in the CV."
    "Create diverse, helpful instruction-style Q&A from the document. "
    "Prefer short, practical prompts and concise, factual answers."
    
)

# ==== PDF → TEXT ====
def extract_pdf_text(path: str, max_chars: int = 20000) -> str:
    reader = PdfReader(path)
    chunks = []
    total = 0
    for page in reader.pages:
        t = page.extract_text() or ""
        if not t:
            continue
        chunks.append(t)
        total += len(t)
        if total >= max_chars:
            break
    return ("\n\n".join(chunks))[:max_chars]

# ==== CALL OPENAI TO GENERATE EXAMPLES ====
def gen_examples_from_text(text: str, n: int, model: str) -> List[Dict[str, str]]:
    client = OpenAI()

    system_msg = (
        "You are a data generation assistant for instruction-tuning datasets. "
        "Use ONLY the provided document text. Avoid fabricating facts."
    )
    user_msg = f"""
Given the document below, create {n} instruction-style prompt/completion pairs.

Requirements:
- Each item must be an object with keys "prompt" and "completion".
- Prompts should be diverse and useful (requests, how-tos, explanations).
- Completions must be concise (<= 120 words), correct, safe, and grounded in the document.
- Do NOT include any preamble or commentary.
- Return ONLY a JSON array (no markdown fences).

Extra steerage: {SEED_INSTRUCTION}

DOCUMENT:
<<<
{text}
>>>
"""

    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        temperature=0.7,
    )

    raw = resp.choices[0].message.content.strip()

    # Try to grab the JSON array if the model added anything extra.
    m = re.search(r"\[\s*{.*}\s*\]", raw, flags=re.S)
    json_str = m.group(0) if m else raw

    data = json.loads(json_str)
    if not isinstance(data, list):
        raise ValueError("Model did not return a JSON array.")
    cleaned = []
    for ex in data[:n]:
        p = (ex.get("prompt") or "").strip()
        c = (ex.get("completion") or "").strip()
        if p and c:
            cleaned.append({"prompt": p, "completion": c})
    if len(cleaned) != n:
        raise ValueError(f"Expected {n} items, got {len(cleaned)}.")
    return cleaned

# ==== RUN ====
doc_text = extract_pdf_text(PDF_PATH)
examples = gen_examples_from_text(doc_text, N_EXAMPLES, OPENAI_MODEL)

train = Dataset.from_list(examples)
dset = DatasetDict({"train": train})

# Save JSONL for training-friendly consumption
out_jsonl = "train.json"
train.to_json(out_jsonl, orient="records", lines=True, force_ascii=False)

print(dset)  # Should show: DatasetDict({ train: Dataset({ features: ['prompt','completion'], num_rows: 4 }) })
print(f"Saved {len(train)} rows to {out_jsonl}")


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 25
    })
})
Saved 25 rows to train.json


In [7]:
from transformers import pipeline
model_name = "Qwen/Qwen2.5-3B-Instruct"
ask_llm = pipeline(
    model = model_name,
    device ="cuda"

    
)
print(ask_llm('who am I?')[0]['generated_text'])

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda


who am I? You are referring to a well-known American singer, songwriter, and actress. You are Mariah Carey. Born on March 27, 1969, in Huntington, New York, she is one of the best-selling music artists of all time, with sales exceeding 200 million units worldwide. She has won numerous awards, including 5 Grammy Awards, and is known for her powerful vocals, distinctive voice, and hit songs like "All I Want for Christmas Is You," "Hero," and "Emotions." Mariah Carey is also a successful actress and entrepreneur. Is there anything specific you'd like to know about her?


In [30]:
from datasets import load_dataset
raw_data = load_dataset('json' , data_files='train.json')
print(raw_data)
x = raw_data['train'][2]
print(x)

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 25
    })
})
{'prompt': "What was your capstone project during your Bachelor's degree?", 'completion': "During my Bachelor's degree, my capstone project involved developing a license plates detection system using YOLOv3 and traditional image processing techniques."}


In [31]:

from transformers import AutoTokenizer
model_name = "Qwen/Qwen2.5-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(sample):
    sample = sample["prompt"] + "\n" + sample['completion']
    tokenized = tokenizer(
        sample , 
        max_length= 128,
        truncation = True,
        padding = "max_length",
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized



data = raw_data.map(preprocess)
print(data["train"][0])

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

{'prompt': 'What research are you currently involved in during your PhD?', 'completion': 'I am currently investigating diffusion models for generating high-quality and diverse synthetic medical images. This research supports data augmentation and interpretability, aiming to improve accuracy, fairness, and robustness in disease classification, particularly in rare or imbalanced imaging cases.', 'input_ids': [3838, 3412, 525, 498, 5023, 6398, 304, 2337, 697, 29561, 5267, 40, 1079, 5023, 23890, 57330, 4119, 369, 23163, 1550, 22092, 323, 16807, 27268, 6457, 5335, 13, 1096, 3412, 11554, 821, 78785, 323, 14198, 2897, 11, 37078, 311, 7269, 13403, 11, 50741, 11, 323, 21765, 2090, 304, 8457, 23850, 11, 7945, 304, 8848, 476, 732, 58402, 31658, 5048, 13, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643

## Load Qwen Model and Apply LoRA Configuration for Causal LM

In [32]:
from peft import LoraConfig ,get_peft_model ,TaskType
from transformers import AutoModelForCausalLM
import torch
model_name = "Qwen/Qwen2.5-3B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name , 
    device_map = "cuda" ,
    torch_dtype = torch.float16 , 
)

lora_config = LoraConfig(
    task_type = TaskType.CAUSAL_LM,
    target_modules = ["q_proj" , "k_proj" , "v_proj"]
    
)

model = get_peft_model(model , lora_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Define TrainingArguments and Fine-Tune Model

In [33]:
from transformers import TrainingArguments , Trainer 

training_args = TrainingArguments(
    num_train_epochs = 100,
    learning_rate = 0.0001,
    logging_steps = 50,
    
)


trainer = Trainer(
    model = model ,
    args = training_args,
    train_dataset = data['train'] 
)
trainer.train()

Step,Training Loss
50,5.2617
100,1.1624
150,0.8821
200,0.7963
250,0.7075
300,0.6591
350,0.6375
400,0.6156


TrainOutput(global_step=400, training_loss=1.3402782917022704, metrics={'train_runtime': 82.567, 'train_samples_per_second': 30.278, 'train_steps_per_second': 4.845, 'total_flos': 5332378583040000.0, 'train_loss': 1.3402782917022704, 'epoch': 100.0})

## Save model

In [40]:
trainer.save_model("./who_am_I_model")
tokenizer.save_pretrained("./who_am_I_model")

('./who_am_I_model/tokenizer_config.json',
 './who_am_I_model/special_tokens_map.json',
 './who_am_I_model/chat_template.jinja',
 './who_am_I_model/vocab.json',
 './who_am_I_model/merges.txt',
 './who_am_I_model/added_tokens.json',
 './who_am_I_model/tokenizer.json')

In [41]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import torch

MODEL_ADAPTER_PATH = "./who_am_I_model"

# Load LoRA config and base model
peft_cfg = PeftConfig.from_pretrained(MODEL_ADAPTER_PATH)
base_model = AutoModelForCausalLM.from_pretrained(
    peft_cfg.base_model_name_or_path,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto" if torch.cuda.is_available() else None,
    trust_remote_code=True
)

# Load adapter weights into base model
peft_model = PeftModel.from_pretrained(base_model, MODEL_ADAPTER_PATH)
peft_model.eval()

# Load tokenizer (uses saved tokenizer in adapter dir)
peft_tokenizer = AutoTokenizer.from_pretrained(MODEL_ADAPTER_PATH, trust_remote_code=True)
if peft_tokenizer.pad_token_id is None:
    peft_tokenizer.pad_token = peft_tokenizer.eos_token


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Ask some question

In [48]:
# ---------- Widgets ----------
system_box = Textarea(
    value="You are Hassan Hamidi. Answer all questions in my language based only on information you have. "
          "Answer short and do not continue the communication after a short answer.",
    description="System:",
    layout=dict(width="100%", height="80px"),
)


prompt_box = Textarea(
    value="What is your current research focus in his PhD program?",
    description="Prompt:",
    layout=dict(width="100%", height="90px"),
)

max_new_tokens = IntSlider(value=128, min=16, max=256, step=16, description="Max new")
temperature = FloatSlider(value=0.1, min=0.0, max=2.0, step=0.05, description="Temp")
top_p = FloatSlider(value=0.99, min=0.1, max=1.0, step=0.05, description="Top-p")
repetition_penalty = FloatSlider(value=1.1, min=1.0, max=2.0, step=0.05, description="Rep pen.")
use_chat_template = Checkbox(value=True, description="Use chat template (if available)")

generate_btn = Button(description="Generate", button_style="primary")
clear_btn = Button(description="Clear Output")
status_lbl = Label(value="")

out = Output()

controls_row1 = HBox([max_new_tokens, temperature, top_p, repetition_penalty])
controls_row2 = HBox([use_chat_template, generate_btn, clear_btn, status_lbl])
ui = VBox([system_box, prompt_box, controls_row1, controls_row2, out])

display(ui)

# ---------- Generation handler ----------
@torch.inference_mode()
def do_generate(_btn):
    generate_btn.disabled = True
    status_lbl.value = "Generating..."
    with out:
        clear_output(wait=True)
        try:
            system_prompt = system_box.value.strip()
            prompt = prompt_box.value.strip()
            if not prompt:
                print("Please enter a prompt.")
                return

            if use_chat_template.value and hasattr(peft_tokenizer, "apply_chat_template"):
                chat = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt}
                ]
                text = peft_tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
            else:
                text = system_prompt + "\n\n" + prompt

            inputs = peft_tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=min(getattr(peft_tokenizer, "model_max_length", 4096), 8192),
            )

            device = next(peft_model.parameters()).device
            inputs = {k: v.to(device) for k, v in inputs.items()}

            eos = peft_tokenizer.eos_token_id
            gen_ids = peft_model.generate(
                **inputs,
                max_new_tokens=int(max_new_tokens.value),
                do_sample=(temperature.value > 0),
                temperature=float(temperature.value),
                top_p=float(top_p.value),
                repetition_penalty=float(repetition_penalty.value),
                eos_token_id=eos,
                pad_token_id=peft_tokenizer.pad_token_id,
            )

            text_out = peft_tokenizer.decode(gen_ids[0], skip_special_tokens=True)
            print(text_out)
        except Exception as e:
            print(f"❌ Error: {e}")
        finally:
            status_lbl.value = "Done."
            generate_btn.disabled = False

def do_clear(_btn):
    with out:
        clear_output()
    status_lbl.value = ""

generate_btn.on_click(do_generate)
clear_btn.on_click(do_clear)


VBox(children=(Textarea(value='You are Hassan Hamidi. Answer all questions in my language based only on inform…