In [2]:
# Cell 1 — Install pinned deps (no `datasets`, so no pyarrow)
# Purpose: get a stable TRL/PEFT stack and disable W&B prompts.

!pip uninstall -y -q streamlit
!pip install -q --upgrade "transformers==4.43.3" "trl==0.9.6" "peft==0.11.1" \
                         "accelerate==0.33.0" "bitsandbytes==0.43.1" "evaluate==0.4.2"

import os
os.environ["WANDB_MODE"] = "disabled"     # no wandb login prompts


[0m

In [5]:
# --- Server snapshot ---
!whoami && echo "PWD: $PWD"
!python -V
!nvidia-smi || echo "No NVIDIA GPU"
!df -h | sed -n '1,12p'
!echo "Home dir: $HOME"


p017psy
PWD: /home/p017psy
Python 3.12.2
/bin/bash: nvidia-smi: command not found
No NVIDIA GPU
Filesystem      Size  Used Avail Use% Mounted on
devtmpfs         63G     0   63G   0% /dev
tmpfs            63G  5.2M   63G   1% /dev/shm
tmpfs            63G  4.1G   59G   7% /run
tmpfs            63G     0   63G   0% /sys/fs/cgroup
/dev/sda1       526G  145G  382G  28% /
/dev/sdb1       1.8T  1.4T  341G  81% /home
bcg2:/bcg2       33T   32T  1.4T  97% /bcg2
bcg2:/bcg        17T   15T  2.0T  89% /bcg
kwak:/kwak       17T   14T  2.9T  83% /kwak
tmpfs            13G     0   13G   0% /run/user/1000
tmpfs            13G     0   13G   0% /run/user/1014
Home dir: /home/p017psy


In [1]:
# --- Project setup (inside your home dir, not /mnt/data) ---
from pathlib import Path

BASE_DIR  = Path.home() / "unittime"
DATA_DIR  = BASE_DIR / "goal1_out"       # put taskA.jsonl, taskB.jsonl here
MODEL_DIR = BASE_DIR / "goal2_models"    # fine-tuned models will go here
CACHE_DIR = BASE_DIR / "cache"           # Hugging Face cache

# Create folders if not already present
for d in [DATA_DIR, MODEL_DIR, CACHE_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print("BASE_DIR :", BASE_DIR)
print("DATA_DIR :", DATA_DIR, "exists?", DATA_DIR.exists())
print("MODEL_DIR:", MODEL_DIR, "exists?", MODEL_DIR.exists())
print("CACHE_DIR:", CACHE_DIR, "exists?", CACHE_DIR.exists())



BASE_DIR : /home/p017psy/unittime
DATA_DIR : /home/p017psy/unittime/goal1_out exists? True
MODEL_DIR: /home/p017psy/unittime/goal2_models exists? True
CACHE_DIR: /home/p017psy/unittime/cache exists? True


In [2]:
# Cell A — locate Goal-1 JSONL files and stage them

import os, shutil
from pathlib import Path

BASE_DIR  = Path.home() / "unittime"
DATA_DIR  = BASE_DIR / "goal1_out"
MODEL_DIR = BASE_DIR / "goal2_models"
CACHE_DIR = BASE_DIR / "cache"
for d in (DATA_DIR, MODEL_DIR, CACHE_DIR):
    d.mkdir(parents=True, exist_ok=True)

def find_first(root: Path, name: str):
    for p in root.rglob(name):
        if p.is_file():
            return p
    return None

home = Path.home()
a_src = find_first(home, "taskA.jsonl")
b_src = find_first(home, "taskB.jsonl")

print("Found A at:", a_src)
print("Found B at:", b_src)

if a_src:
    shutil.copy2(a_src, DATA_DIR / "taskA.jsonl")
if b_src:
    shutil.copy2(b_src, DATA_DIR / "taskB.jsonl")

print("Now in DATA_DIR:", DATA_DIR)
print([p.name for p in DATA_DIR.glob("*")])


Found A at: None
Found B at: None
Now in DATA_DIR: /home/p017psy/unittime/goal1_out
[]


In [4]:
# Cell A — Locate taskA.jsonl/taskB.jsonl under your HOME and copy them to goal1_out
import os, shutil
from pathlib import Path

HOME = Path.home()
DEST = HOME / "unittime" / "goal1_out"
DEST.mkdir(parents=True, exist_ok=True)

candidates = []
for root, dirs, files in os.walk(HOME):
    for fn in files:
        if fn in ("taskA.jsonl", "taskB.jsonl"):
            candidates.append(Path(root) / fn)

print("Found candidates:")
for p in candidates:
    print("  ", p)

# Copy the first match of each file name
names = {"taskA.jsonl": None, "taskB.jsonl": None}
for p in candidates:
    if p.name in names and names[p.name] is None:
        names[p.name] = p

for name, src in names.items():
    if src is not None:
        shutil.copy2(src, DEST / name)
        print(f"Copied: {src} -> {DEST/name}")
    else:
        print(f"NOT FOUND: {name}")

print("Now in goal1_out:", list(DEST.glob("*.jsonl")))



Found candidates:
NOT FOUND: taskA.jsonl
NOT FOUND: taskB.jsonl
Now in goal1_out: []


In [5]:
# Cell B — Minimal ETL: UniTime XML -> taskA.jsonl / taskB.jsonl in goal1_out
from pathlib import Path
import json, xml.etree.ElementTree as ET
import pandas as pd

HOME = Path.home()
OUT = HOME / "unittime" / "goal1_out"
OUT.mkdir(parents=True, exist_ok=True)

# 1) Find a UniTime XML (root tag 'timetable')
def find_unittime_xml(start: Path):
    for p in start.rglob("*.xml"):
        try:
            r = ET.parse(str(p)).getroot()
            if r.tag == "timetable" and r.get("nrDays") and r.get("slotsPerDay"):
                return p
        except Exception:
            pass
    return None

XML = find_unittime_xml(HOME)
assert XML, "No UniTime XML found under your HOME. Place one under ~/unittime or HOME and rerun."
print("Using XML:", XML)

# 2) Parse XML -> lists
tree = ET.parse(str(XML)); root = tree.getroot()
nr_days = int(root.get("nrDays")); slots_per_day = int(root.get("slotsPerDay"))

classes, times, rooms, instr = [], [], [], []
for c in root.find("classes").findall("class"):
    cid = c.get("id")
    classes.append({
        "class_id": cid, "offering": c.get("offering"), "subpart": c.get("subpart"),
        "class_limit": int(c.get("classLimit")) if c.get("classLimit") else None,
        "dates_mask": c.get("dates")
    })
    for t in c.findall("time"):
        times.append({
            "class_id": cid,
            "days": t.get("days"),
            "start": int(t.get("start") or 0),
            "length": int(t.get("length") or 0),
            "pref": float(t.get("pref")) if t.get("pref") is not None else None
        })
    for r in c.findall("room"):
        rooms.append({
            "class_id": cid,
            "room_id": str(r.get("id")),
            "pref": float(r.get("pref")) if r.get("pref") is not None else None
        })
    for ins in c.findall("instructor"):
        instr.append({"class_id": cid, "instructor_id": str(ins.get("id"))})

df_cls   = pd.DataFrame(classes)
df_times = pd.DataFrame(times)
df_rooms = pd.DataFrame(rooms)
df_ins   = pd.DataFrame(instr)

# 3) Build Task A jsonl (per-class)
taskA = []
for _, row in df_cls.iterrows():
    cid = row["class_id"]
    tdf = df_times[df_times["class_id"] == cid]
    rdf = df_rooms[df_rooms["class_id"] == cid]
    idf = df_ins[df_ins["class_id"] == cid]
    sample = {
        "instruction": "Assign a feasible room and time for the class given the candidates and constraints.",
        "input": {
            "nr_days": nr_days, "slots_per_day": slots_per_day,
            "class_id": cid, "subpart": row.get("subpart"),
            "class_limit": int(row["class_limit"]) if pd.notna(row["class_limit"]) else None,
            "dates_mask": row.get("dates_mask"),
            "instructors": [int(x) if str(x).isdigit() else x for x in idf["instructor_id"].tolist()],
            "candidate_times": tdf[["days","start","length","pref"]].to_dict(orient="records"),
            "candidate_rooms": rdf[["room_id","pref"]].to_dict(orient="records")
        },
        "output": {"assignments": []}
    }
    taskA.append(sample)

with open(OUT/"taskA.jsonl", "w", encoding="utf-8") as f:
    for ex in taskA:
        f.write(json.dumps(ex, ensure_ascii=False) + "\n")
print("Wrote", OUT/"taskA.jsonl", "count:", len(taskA))

# 4) Build Task B jsonl (per-offering)
taskB = []
for offering, grp in df_cls.groupby("offering"):
    classes_payload = []
    for _, row in grp.iterrows():
        cid = row["class_id"]
        tdf = df_times[df_times["class_id"] == cid]
        rdf = df_rooms[df_rooms["class_id"] == cid]
        idf = df_ins[df_ins["class_id"] == cid]
        classes_payload.append({
            "class_id": cid, "subpart": row.get("subpart"),
            "class_limit": int(row["class_limit"]) if pd.notna(row["class_limit"]) else None,
            "dates_mask": row.get("dates_mask"),
            "instructors": [int(x) if str(x).isdigit() else x for x in idf["instructor_id"].tolist()],
            "candidate_times": tdf[["days","start","length","pref"]].to_dict(orient="records"),
            "candidate_rooms": rdf[["room_id","pref"]].to_dict(orient="records")
        })
    taskB.append({
        "instruction": "Assign feasible rooms and times for all classes in this offering.",
        "input": {"offering_id": str(offering), "nr_days": nr_days, "slots_per_day": slots_per_day,
                  "classes": classes_payload},
        "output": {"assignments": []}
    })

with open(OUT/"taskB.jsonl", "w", encoding="utf-8") as f:
    for ex in taskB:
        f.write(json.dumps(ex, ensure_ascii=False) + "\n")
print("Wrote", OUT/"taskB.jsonl", "count:", len(taskB))


Using XML: /home/p017psy/Jupyter_Notebook/pu-spr07-cs.xml
Wrote /home/p017psy/unittime/goal1_out/taskA.jsonl count: 521
Wrote /home/p017psy/unittime/goal1_out/taskB.jsonl count: 39


In [6]:
from pathlib import Path
DATA_DIR = Path.home()/ "unittime"/"goal1_out"
print("Goal1 files:", [p.name for p in DATA_DIR.glob("*.jsonl")])


Goal1 files: ['taskA.jsonl', 'taskB.jsonl']


In [7]:
# Cell 3 — Load JSONL & split
# Purpose: Read taskA/taskB into Python lists and make a 90/10 split.

from pathlib import Path
import json, random

DATA_DIR = Path.home() / "unittime" / "goal1_out"
SEED = 42
random.seed(SEED)

def load_jsonl_list(path: Path):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for ln in f:
            rows.append(json.loads(ln))
    return rows

def split_list(items, val_ratio=0.1, seed=SEED):
    items = items[:]  # copy
    random.Random(seed).shuffle(items)
    n = len(items)
    val = max(1, int(n * val_ratio))
    return items[:-val], items[-val:]

taskA_path = DATA_DIR / "taskA.jsonl"
taskB_path = DATA_DIR / "taskB.jsonl"

dsA = load_jsonl_list(taskA_path)
dsB = load_jsonl_list(taskB_path)

trainA, valA = split_list(dsA, 0.1)
trainB, valB = split_list(dsB, 0.1)

print("Stage A:", len(trainA), "train,", len(valA), "val")
print("Stage B:", len(trainB), "train,", len(valB), "val")


Stage A: 469 train, 52 val
Stage B: 36 train, 3 val


In [8]:
# Cell 4 — Prompt formatting
# Purpose: Convert each example to a single 'text' string for causal LM training.

SYSTEM_PROMPT = "You are a helpful timetable assistant. Always return strict JSON."

def fmt(ex):
    instr = ex.get("instruction","").strip()
    inp = json.dumps(ex.get("input", {}), ensure_ascii=False, sort_keys=True)
    out = json.dumps(ex.get("output", {}), ensure_ascii=False, sort_keys=True)
    return {"text": f"<|system|>\n{SYSTEM_PROMPT}\n<|user|>\n{instr}\nINPUT:\n{inp}\n<|assistant|>\n{out}"}

trainA_f = list(map(fmt, trainA))
valA_f   = list(map(fmt, valA))
trainB_f = list(map(fmt, trainB))
valB_f   = list(map(fmt, valB))

print(trainA_f[0]["text"][:400])


<|system|>
You are a helpful timetable assistant. Always return strict JSON.
<|user|>
Assign a feasible room and time for the class given the candidates and constraints.
INPUT:
{"candidate_rooms": [{"pref": 0.0, "room_id": "43"}], "candidate_times": [{"days": "0101000", "length": 12, "pref": 0.0, "start": 126}], "class_id": "205", "class_limit": null, "dates_mask": "0000000000000000000000000000000


In [9]:
# Cell 5 — Model & LoRA setup (CUDA-aware)
# Purpose: If GPU is available → QLoRA (4-bit). Else → CPU LoRA (fp32), tiny batches.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig

BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
USE_CUDA = torch.cuda.is_available()

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Model load
if USE_CUDA:
    bnb = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    print("Loaded base with QLoRA (4-bit).")
else:
    # CPU fallback (slow). Keep sequence lengths/batches small.
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float32,
        device_map="cpu",
    )
    print("Loaded base on CPU (no GPU detected).")

# LoRA config
peft_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    bias="none", task_type="CAUSAL_LM"
)


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loaded base on CPU (no GPU detected).


In [10]:
# Cell 5 — Model & LoRA setup (CUDA-aware)
# Purpose: If GPU is available → QLoRA (4-bit). Else → CPU LoRA (fp32), tiny batches.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig

BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
USE_CUDA = torch.cuda.is_available()

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Model load
if USE_CUDA:
    bnb = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    print("Loaded base with QLoRA (4-bit).")
else:
    # CPU fallback (slow). Keep sequence lengths/batches small.
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float32,
        device_map="cpu",
    )
    print("Loaded base on CPU (no GPU detected).")

# LoRA config
peft_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    bias="none", task_type="CAUSAL_LM"
)


Loaded base on CPU (no GPU detected).


In [12]:
# Cell 5 — Load base on CPU (no GPU detected)
from transformers import AutoTokenizer, AutoModelForCausalLM

BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # small-ish; still slow on CPU

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# CPU load, no quantization
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="cpu",
    torch_dtype=torch.float32
)

print("Loaded base on CPU (no GPU detected).")


Loaded base on CPU (no GPU detected).


In [15]:
# Cell 4b — Format JSONL into plain text training items

import json
from pathlib import Path

DATA_DIR = Path.home() / "unittime" / "goal1_out"

def load_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f]

rawA = load_jsonl(DATA_DIR / "taskA.jsonl")
rawB = load_jsonl(DATA_DIR / "taskB.jsonl")

def format_examples(raw):
    formatted = []
    for ex in raw:
        instr = ex.get("instruction", "")
        inp   = ex.get("input", "")
        out   = ex.get("output", "")
        # Turn the dict into a readable string
        if isinstance(inp, dict):
            inp = json.dumps(inp, ensure_ascii=False)
        if isinstance(out, dict):
            out = json.dumps(out, ensure_ascii=False)
        txt = f"Instruction:\n{instr}\n\nInput:\n{inp}\n\nOutput:\n{out}"
        formatted.append({"text": txt})
    return formatted

taskA_fmt = format_examples(rawA)
taskB_fmt = format_examples(rawB)

# Tiny train/val split
splitA = int(0.9 * len(taskA_fmt))
trainA_f, valA_f = taskA_fmt[:splitA], taskA_fmt[splitA:]

splitB = int(0.9 * len(taskB_fmt))
trainB_f, valB_f = taskB_fmt[:splitB], taskB_fmt[splitB:]

print("TaskA -> train:", len(trainA_f), "val:", len(valA_f))
print("TaskB -> train:", len(trainB_f), "val:", len(valB_f))
print("Sample:", trainA_f[0]["text"][:300], "...")


TaskA -> train: 468 val: 53
TaskB -> train: 35 val: 4
Sample: Instruction:
Assign a feasible room and time for the class given the candidates and constraints.

Input:
{"nr_days": 7, "slots_per_day": 288, "class_id": "1244", "subpart": "766", "class_limit": 22, "dates_mask": "00000000000000000000000000000000000000111111001111101111110111111011111101111110111111 ...


In [17]:
# Sanity check: what do the first 2 items look like?
def peek(items, n=2):
    import itertools, json
    for i, ex in enumerate(itertools.islice(items, n)):
        print(f"\n--- sample {i} type={type(ex)} keys={list(ex.keys()) if isinstance(ex, dict) else 'n/a'}")
        if isinstance(ex, dict):
            for k,v in ex.items():
                if isinstance(v, (dict, list)): 
                    print(f"  {k}: {str(v)[:160]}...")
                else:
                    print(f"  {k}: {str(v)[:160]}")

peek(trainA_f)




--- sample 0 type=<class 'dict'> keys=['text']
  text: Instruction:
Assign a feasible room and time for the class given the candidates and constraints.

Input:
{"nr_days": 7, "slots_per_day": 288, "class_id": "1244"

--- sample 1 type=<class 'dict'> keys=['text']
  text: Instruction:
Assign a feasible room and time for the class given the candidates and constraints.

Input:
{"nr_days": 7, "slots_per_day": 288, "class_id": "1245"


In [18]:
# Cell 2 — Imports & Config (server-safe, no /mnt/data)
from pathlib import Path
import json, random, os, sys, platform
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

# --- Project roots (under your home dir, writable)
BASE_DIR  = Path.home() / "unittime"
DATA_DIR  = BASE_DIR / "goal1_out"       # where taskA.jsonl/taskB.jsonl should be
MODEL_DIR = BASE_DIR / "goal2_models"    # where we will save models
CACHE_DIR = BASE_DIR / "cache"           # optional HF cache

# Make sure they exist
MODEL_DIR.mkdir(parents=True, exist_ok=True)
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Point HF cache to a writable spot (optional)
os.environ.setdefault("HF_HOME", str(CACHE_DIR))

TASKA = DATA_DIR / "taskA.jsonl"   # per-class dataset
TASKB = DATA_DIR / "taskB.jsonl"   # per-offering dataset

print("DATA_DIR :", DATA_DIR)
print("MODEL_DIR:", MODEL_DIR)
print("CACHE_DIR:", CACHE_DIR)
print("DATA_DIR exists:", DATA_DIR.exists())
print("Files in DATA_DIR:", [p.name for p in DATA_DIR.glob('*')])

# Hard checks (fail early if missing)
assert TASKA.exists(), f"Missing {TASKA} — place it there and re-run this cell."
assert TASKB.exists(), f"Missing {TASKB} — place it there and re-run this cell."

# Base model (small + chat tuned; good for QLoRA on modest GPUs)
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Reproducibility
SEED = 42
random.seed(SEED); torch.manual_seed(SEED)

# Env info
print("\nEnv info:")
print(" Python :", sys.version.split()[0], "|", platform.platform())
print(" CUDA   :", torch.cuda.is_available(), "| torch.cuda:", torch.version.cuda)
if torch.cuda.is_available():
    print(" GPU    :", torch.cuda.get_device_name(0))
print(" Base   :", BASE_MODEL)



DATA_DIR : /home/p017psy/unittime/goal1_out
MODEL_DIR: /home/p017psy/unittime/goal2_models
CACHE_DIR: /home/p017psy/unittime/cache
DATA_DIR exists: True
Files in DATA_DIR: ['taskA.jsonl', 'taskB.jsonl']

Env info:
 Python : 3.12.2 | Linux-4.18.0-348.el8.0.2.x86_64-x86_64-with-glibc2.28
 CUDA   : False | torch.cuda: None
 Base   : TinyLlama/TinyLlama-1.1B-Chat-v1.0


In [19]:
# -------- Stage A trainer with robust collator --------
from torch.utils.data import Dataset as TorchDataset
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig
from pathlib import Path
import json, torch

class TextListDataset(TorchDataset):
    def __init__(self, items): self.items = list(items)
    def __len__(self): return len(self.items)
    def __getitem__(self, i): return self.items[i]

trainA_ds = TextListDataset(trainA_f)
valA_ds   = TextListDataset(valA_f)

# LoRA (still fine on CPU; just slower)
peft_cfg = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    bias="none", task_type="CAUSAL_LM",
)

output_dir_A = (Path.home()/ "unittime"/"goal2_models"/"tinyllama_stageA").as_posix()

cfgA = SFTConfig(
    output_dir=output_dir_A,
    num_train_epochs=1,
    max_seq_length=1024,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    logging_steps=5,
    eval_strategy="steps",
    eval_steps=50,
    save_steps=100,
    save_total_limit=1,
    packing=False,
    fp16=False, bf16=False,
    max_steps=50,                 # small smoke test on CPU
)

def _to_text(ex):
    """Return a training string for one example, tolerant of different shapes."""
    if isinstance(ex, str):
        return ex
    if isinstance(ex, dict):
        if "text" in ex:
            return ex["text"]
        instr = ex.get("instruction", "")
        inp   = ex.get("input", "")
        out   = ex.get("output", "")
        if not isinstance(inp, str):
            inp = json.dumps(inp, ensure_ascii=False)
        if not isinstance(out, str):
            out = json.dumps(out, ensure_ascii=False)
        return f"Instruction:\n{instr}\n\nInput:\n{inp}\n\nOutput:\n{out}"
    # fallback
    return str(ex)

def collate_fn(batch):
    texts = [_to_text(ex) for ex in batch]
    enc = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=cfgA.max_seq_length,
        return_tensors="pt",
    )
    enc["labels"] = enc["input_ids"].clone()
    return enc

trainerA = SFTTrainer(
    model=model,                 # loaded earlier on CPU
    tokenizer=tokenizer,         # TRL 0.9.6 is fine with this
    peft_config=peft_cfg,
    train_dataset=trainA_ds,
    eval_dataset=valA_ds,
    args=cfgA,
    data_collator=collate_fn,    # <-- robust collator
)

trainerA.train()
trainerA.save_model(output_dir_A)
tokenizer.save_pretrained(output_dir_A)
print("Saved Stage A adapter →", output_dir_A)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss
50,0.0,1.2e-05


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Saved Stage A adapter → /home/p017psy/unittime/goal2_models/tinyllama_stageA


In [None]:
======================================================================================

In [1]:
# Cell 0: Install libs
# Purpose: Set up Transformers, PEFT (LoRA/QLoRA), TRL trainer, BitsAndBytes, Datasets, JSON schema.

%pip -q install -U transformers accelerate peft trl bitsandbytes datasets jsonschema


Note: you may need to restart the kernel to use updated packages.


In [19]:
# Cell 1: Imports & helpers
# Purpose: Import core libs and small utilities used throughout training/eval.

import os, json, random, re, textwrap
from pathlib import Path

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig, TrainingArguments, pipeline
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from jsonschema import validate, ValidationError


In [20]:
# Cell 2: Config
# Purpose: Set data paths, choose base model, pick LoRA/QLoRA and hyperparams.

# === Data from Goal 1 ===
PER_CLASS_JSONL    = "timetable_per_class.jsonl"       # SFT-A
PER_OFFERING_JSONL = "timetable_per_offering.jsonl"    # SFT-B

# === Base instruction model (choose one you can access) ===
# Examples:
# BASE_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"
# BASE_MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
BASE_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"

# === LoRA / QLoRA ===
USE_QLORA      = True          # QLoRA recommended for single GPU
LORA_R         = 16
LORA_ALPHA     = 16
LORA_DROPOUT   = 0.05
TARGET_MODULES = None          # let PEFT pick common Linear layers

# === Tokenization & generation ===
MAX_INPUT_TOKENS  = 4096
MAX_OUTPUT_TOKENS = 512
TEMPERATURE       = 0.2

# === Training hparams ===
EPOCHS_A = 2           # Stage A (per-class)
EPOCHS_B = 1           # Stage B (per-offering)
LR_A     = 1e-4
LR_B     = 5e-5
BATCH_SIZE_PER_DEVICE = 1
GRAD_ACCUM_STEPS      = 16
WARMUP_RATIO          = 0.03

SAVE_DIR_A = "checkpoints_sftA"
SAVE_DIR_B = "checkpoints_sftB_cont"
os.makedirs(SAVE_DIR_A, exist_ok=True)
os.makedirs(SAVE_DIR_B, exist_ok=True)


In [21]:
# Cell 2.1: Verify that JSONL files exist
import os

print("Looking for:", PER_CLASS_JSONL, PER_OFFERING_JSONL)
print("Current working dir:", os.getcwd())
print("Files in dir:", os.listdir("."))

# If files are missing, adjust paths here
# Example (Colab typical):
# PER_CLASS_JSONL    = "/content/timetable_per_class.jsonl"
# PER_OFFERING_JSONL = "/content/timetable_per_offering.jsonl"


Looking for: timetable_per_class.jsonl timetable_per_offering.jsonl
Current working dir: /content
Files in dir: ['.config', 'checkpoints_sftB_cont', 'checkpoints_sftA', 'sample_data']


In [23]:
# Parse UniTime XML (setup)
import xml.etree.ElementTree as ET

XML_FILE = "/content/pu-spr07-llr.xml"   # <-- update if your XML is named differently or in another folder
tree = ET.parse(XML_FILE)
root = tree.getroot()


In [25]:
# Rebuild JSONL files from XML
import json
import pandas as pd

def to_int(x, default=None):
    try: return int(x)
    except (TypeError, ValueError): return default

def to_float(x, default=None):
    try: return float(x)
    except (TypeError, ValueError): return default

# --- Extract classes, times, rooms ---
classes, class_times, class_rooms = [], [], []
for c in root.find("classes"):
    cid = to_int(c.get("id"))
    classes.append({
        "class_id": cid,
        "offering": to_int(c.get("offering")),
        "class_limit": to_int(c.get("classLimit")),
        "dates_mask": c.get("dates")
    })
    for t in c.findall("time"):
        class_times.append({
            "class_id": cid, "days": t.get("days"),
            "start": to_int(t.get("start")), "length": to_int(t.get("length")),
            "pref": to_float(t.get("pref"), 0.0)
        })
    for r in c.findall("room"):
        class_rooms.append({
            "class_id": cid, "room_id": to_int(r.get("id")),
            "pref": to_float(r.get("pref"), 0.0)
        })

df_classes     = pd.DataFrame(classes)
df_class_times = pd.DataFrame(class_times)
df_class_rooms = pd.DataFrame(class_rooms)

nr_days = to_int(root.get("nrDays"), 7)
slots_per_day = to_int(root.get("slotsPerDay"), 288)

# --- Simple pseudo-gold: pick top preference ---
gold_rows = []
for _, row in df_classes.iterrows():
    cid = row["class_id"]
    times = df_class_times[df_class_times.class_id == cid]
    rooms = df_class_rooms[df_class_rooms.class_id == cid]
    if times.empty or rooms.empty: continue
    best_t = times.sort_values("pref", ascending=False).iloc[0]
    best_r = rooms.sort_values("pref", ascending=False).iloc[0]
    gold_rows.append({
        "class_id": cid,
        "room_id": best_r["room_id"],
        "days": best_t["days"], "start": int(best_t["start"]), "length": int(best_t["length"])
    })
df_gold = pd.DataFrame(gold_rows)

# --- Write per-class JSONL ---
with open("timetable_per_class.jsonl", "w", encoding="utf-8") as f:
    for _, row in df_classes.iterrows():
        cid = row["class_id"]
        cand_t = df_class_times[df_class_times.class_id == cid].to_dict("records")
        cand_r = df_class_rooms[df_class_rooms.class_id == cid].rename(columns={"room_id":"id"}).to_dict("records")
        g = df_gold[df_gold.class_id == cid]
        if g.empty: continue
        gg = g.iloc[0]
        rec = {
            "instruction": "Assign a feasible room and time for the class given the candidates and constraints.",
            "input": {
                "nr_days": nr_days, "slots_per_day": slots_per_day,
                "classes": [{
                    "class_id": int(cid),
                    "class_limit": row.get("class_limit"),
                    "dates_mask": row.get("dates_mask"),
                    "candidate_times": cand_t,
                    "candidate_rooms": cand_r
                }]
            },
            "output": {
                "assignments": [{
                    "class_id": int(cid),
                    "room": int(gg["room_id"]),
                    "time": {"days": gg["days"], "start": int(gg["start"]), "length": int(gg["length"])}
                }]
            }
        }
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

# --- Write per-offering JSONL ---
with open("timetable_per_offering.jsonl", "w", encoding="utf-8") as f:
    for off_id, grp in df_classes.groupby("offering"):
        classes_bundle, gold_bundle = [], []
        for _, row in grp.iterrows():
            cid = row["class_id"]
            cand_t = df_class_times[df_class_times.class_id == cid].to_dict("records")
            cand_r = df_class_rooms[df_class_rooms.class_id == cid].rename(columns={"room_id":"id"}).to_dict("records")
            g = df_gold[df_gold.class_id == cid]
            if cand_t and cand_r and not g.empty:
                gg = g.iloc[0]
                classes_bundle.append({"class_id": int(cid),
                                       "class_limit": row.get("class_limit"),
                                       "dates_mask": row.get("dates_mask"),
                                       "candidate_times": cand_t,
                                       "candidate_rooms": cand_r})
                gold_bundle.append({"class_id": int(cid),
                                    "room": int(gg["room_id"]),
                                    "time": {"days": gg["days"], "start": int(gg["start"]), "length": int(gg["length"])}})
        if classes_bundle and gold_bundle:
            rec = {
                "instruction": f"Assign feasible rooms and times for all classes in offering {int(off_id)}.",
                "input": {"nr_days": nr_days, "slots_per_day": slots_per_day,
                          "offering": int(off_id), "classes": classes_bundle},
                "output": {"assignments": gold_bundle}
            }
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print("✅ JSONLs rebuilt: timetable_per_class.jsonl & timetable_per_offering.jsonl")


✅ JSONLs rebuilt: timetable_per_class.jsonl & timetable_per_offering.jsonl


In [26]:
# Peek first line of each JSONL
for p in ["timetable_per_class.jsonl","timetable_per_offering.jsonl"]:
    with open(p, "r", encoding="utf-8") as f:
        print(p, "→", f.readline()[:300], "…")


timetable_per_class.jsonl → {"instruction": "Assign a feasible room and time for the class given the candidates and constraints.", "input": {"nr_days": 7, "slots_per_day": 288, "classes": [{"class_id": 1, "class_limit": 66.0, "dates_mask": "000000000000000000000000000000000000001111110011111011111101111110111111011111101111110 …
timetable_per_offering.jsonl → {"instruction": "Assign feasible rooms and times for all classes in offering 1.", "input": {"nr_days": 7, "slots_per_day": 288, "offering": 1, "classes": [{"class_id": 1, "class_limit": 66.0, "dates_mask": "000000000000000000000000000000000000001111110011111011111101111110111111011111101111110111111 …


In [27]:
# Cell 3: Load JSONL datasets
# Purpose: Read Goal-1 JSONLs and create simple 90/10 train/val splits.

def read_jsonl(path):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

# Per-class (Task A)
data_A = read_jsonl(PER_CLASS_JSONL)
random.seed(17)
random.shuffle(data_A)
cutA = int(0.9 * len(data_A))
train_A, val_A = data_A[:cutA], data_A[cutA:]
ds_train_A = Dataset.from_list(train_A)
ds_val_A   = Dataset.from_list(val_A)

# Per-offering (Task B)
data_B = read_jsonl(PER_OFFERING_JSONL)
random.seed(23)
random.shuffle(data_B)
cutB = int(0.9 * len(data_B))
train_B, val_B = data_B[:cutB], data_B[cutB:]
ds_train_B = Dataset.from_list(train_B)
ds_val_B   = Dataset.from_list(val_B)

len(ds_train_A), len(ds_val_A), len(ds_train_B), len(ds_val_B)


(722, 81, 542, 61)

In [28]:
# Cell 4: Prompt formatting
import textwrap, json

SYSTEM_PROMPT = (
    "You are a scheduling assistant. Output ONLY JSON matching the requested schema. No extra text."
)
SCHEMA_DESC = textwrap.dedent("""
Return JSON:
{
  "assignments": [
    {"class_id": <int>, "room": <int>,
     "time": {"days": "<7-char 0/1 string>", "start": <int>, "length": <int>}}
  ]
}
""").strip()

def format_example(ex):
    prompt = (
        f"<system>\n{SYSTEM_PROMPT}\n</system>\n"
        f"<instruction>\n{ex['instruction']}\n</instruction>\n"
        f"<schema>\n{SCHEMA_DESC}\n</schema>\n"
        f"<input>\n{json.dumps(ex['input'], ensure_ascii=False)}\n</input>\n"
        f"<output>\n"
    )
    return {"text": prompt + json.dumps(ex["output"], ensure_ascii=False)}

ds_train_A_fmt = ds_train_A.map(format_example)
ds_val_A_fmt   = ds_val_A.map(format_example)
ds_train_B_fmt = ds_train_B.map(format_example)
ds_val_B_fmt   = ds_val_B.map(format_example)
len(ds_train_A_fmt), len(ds_val_A_fmt), len(ds_train_B_fmt), len(ds_val_B_fmt)


Map:   0%|          | 0/722 [00:00<?, ? examples/s]

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Map:   0%|          | 0/542 [00:00<?, ? examples/s]

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

(722, 81, 542, 61)

In [None]:
# Cell 5B: Public model fallbacks (no HF login needed)
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

CANDIDATE_MODELS = [
    "Qwen/Qwen2.5-7B-Instruct",   # strong & public
    "Qwen/Qwen2.5-3B-Instruct",
    "Qwen/Qwen2-1.5B-Instruct",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # very small, trains anywhere
]

USE_QLORA = True
bnb = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

BASE_MODEL_ID = None
last_err = None
for mid in CANDIDATE_MODELS:
    try:
        print(f"Trying {mid} …")
        tokenizer = AutoTokenizer.from_pretrained(mid, use_fast=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(mid, quantization_config=bnb, device_map="auto")
        if USE_QLORA:
            model = prepare_model_for_kbit_training(model)
        BASE_MODEL_ID = mid
        print(f"Loaded: {mid}")
        break
    except Exception as e:
        print(f"Skipping {mid}: {e}")
        last_err = e

if BASE_MODEL_ID is None:
    raise RuntimeError(f"Failed to load any candidate model. Last error: {last_err}")

# attach LoRA
LORA_R, LORA_ALPHA, LORA_DROPOUT = 16, 16, 0.05
TARGET_MODULES = None
lora_cfg = LoraConfig(
    r=LORA_R, lora_alpha=LORA_ALPHA, lora_dropout=LORA_DROPOUT,
    bias="none", task_type="CAUSAL_LM", target_modules=TARGET_MODULES
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()



Trying Qwen/Qwen2.5-7B-Instruct …


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]



model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]