In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

# Make the project folder on the mounted Drive the working directory.
PROJECT_DIR = '/content/drive/MyDrive/hafsa/'
os.chdir(PROJECT_DIR)

In [3]:
cd /content/drive/MyDrive/hafsa

/content


In [4]:
# Cell 1 — Install deps
# Comment: Transformers + TRL + PEFT + bitsandbytes; disable W&B.

!pip -q install --upgrade transformers accelerate datasets peft bitsandbytes trl evaluate packaging
import os
os.environ["WANDB_MODE"] = "disabled"   # avoid API key prompt


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m917.6 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.9/374.9 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m504.9/504.9 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.9/511.9 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
# Cell 1
# Install required libraries (HuggingFace, PEFT for LoRA, etc.)
!pip install transformers==4.40.0 datasets peft accelerate bitsandbytes


Collecting transformers==4.40.0
  Downloading transformers-4.40.0-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.6/137.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers==4.40.0)
  Downloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.40.0-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.4
    Uninstalling tokenizers-0.21.4:
      Successfully uninstal

In [10]:
# Cell 2
# Comment: Standard imports + define input/output paths
import os, json, random
from pathlib import Path
from typing import List, Dict, Any
import xml.etree.ElementTree as ET

# Working locations on the Colab VM: raw data and the generated SFT files.
BASE = Path("/content")
DATA_DIR = BASE / "data"
OUT_DIR = BASE / "goal2_sft"
for _work_dir in (DATA_DIR, OUT_DIR):
    _work_dir.mkdir(parents=True, exist_ok=True)

# Places to look for the UniTime XML, in priority order.
XML_CANDIDATES = [
    Path("/mnt/data/pu-spr07-cs.xml"),     # external upload location, checked first
    BASE / "pu-spr07-cs.xml",              # uploaded to Colab working dir
    BASE / "content" / "pu-spr07-cs.xml"   # legacy Colab path
]

# Take the first candidate that exists on disk; stay None when none does.
XML_PATH = None
for _candidate in XML_CANDIDATES:
    if _candidate.exists():
        XML_PATH = _candidate
        break

print("Resolved XML:", XML_PATH)
if XML_PATH is None:
    raise FileNotFoundError(
        "Could not find 'pu-spr07-cs.xml'. Please upload it, or place it under /mnt/data/."
    )


Resolved XML: /content/pu-spr07-cs.xml


In [11]:
# Cell 3
# Comment: Parse UniTime XML -> minimal per-class structures for SFT-A
# Load the UniTime problem file and read the global time-grid settings.
tree = ET.parse(str(XML_PATH))
root = tree.getroot()

# Fall back to 7 days / 288 slots per day when the root element omits them.
nr_days = int(root.attrib.get("nrDays", 7))
slots_per_day = int(root.attrib.get("slotsPerDay", 288))

# Index rooms by id and capacity.
# findall() returns an empty list when there is no <rooms> section, so such a
# file yields an empty index instead of a TypeError from iterating None; it
# also restricts the loop to <room> elements.
rooms_idx = {}
for r in root.findall("rooms/room"):
    rid = int(r.attrib["id"])
    rooms_idx[rid] = {
        "room_id": rid,
        "capacity": int(r.attrib.get("capacity", 0)),
    }

def parse_time_elem(t):
    """Convert a candidate <time> element into a plain dict.

    The candidate time is carried entirely in the element's attributes.
    Missing attributes fall back to "" for ``days``, 0 for ``start`` and
    ``length``, and 0.0 for ``pref``.

    Returns:
        dict with keys ``days`` (str), ``start`` (int), ``length`` (int)
        and ``pref`` (float), in that order.
    """
    attrs = t.attrib
    days = attrs.get("days", "")
    start = int(attrs.get("start", 0))
    length = int(attrs.get("length", 0))
    pref = float(attrs.get("pref", 0.0))
    return dict(days=days, start=start, length=length, pref=pref)

def parse_room_elem(r):
    """Convert a candidate <room> element into a plain dict.

    The room id is required (a missing ``id`` attribute raises KeyError).
    Capacity is looked up in the module-level ``rooms_idx`` and is 0 for an
    id that is not indexed; ``pref`` defaults to 0.0 when absent.

    Returns:
        dict with keys ``id`` (int), ``capacity`` and ``pref`` (float).
    """
    attrs = r.attrib
    room_id = int(attrs["id"])
    known_room = rooms_idx.get(room_id, {})
    capacity = known_room.get("capacity", 0)
    pref = float(attrs.get("pref", 0.0))
    return {"id": room_id, "capacity": capacity, "pref": pref}

# Build one plain-dict record per <class> element.
classes = []
classes_node = root.find("classes")
# A file without a <classes> section yields an empty list (which the summary
# expression at the end already tolerates) instead of a TypeError from
# iterating None; findall() also limits the loop to <class> children.
class_elems = classes_node.findall("class") if classes_node is not None else []
for c in class_elems:
    class_id = int(c.attrib["id"])
    class_limit = int(c.attrib.get("classLimit", 0))
    subpart = int(c.attrib.get("subpart", -1))          # -1 when the attribute is absent
    parent = int(c.attrib["parent"]) if "parent" in c.attrib else None
    committed = (c.attrib.get("committed", "false").lower() == "true")
    dates_mask = c.attrib.get("dates", "")

    # Candidate times & rooms appear as direct children
    cand_times = [parse_time_elem(t) for t in c.findall("time")]
    cand_rooms = [parse_room_elem(r) for r in c.findall("room")]
    instructors = [int(i.attrib["id"]) for i in c.findall("instructor")]

    classes.append({
        "class_id": class_id,
        "subpart": subpart,
        "parent": parent,
        "class_limit": class_limit,
        "dates_mask": dates_mask,
        "committed": committed,
        "candidate_times": cand_times,
        "candidate_rooms": cand_rooms,
        "instructors": instructors
    })

# Cell output: number of parsed classes and the first record (None if empty).
len(classes), classes[0] if classes else None


(521,
 {'class_id': 1244,
  'subpart': 766,
  'parent': None,
  'class_limit': 22,
  'dates_mask': '00000000000000000000000000000000000000111111001111101111110111111011111101111110111111011111101111110000000011111101111110111111011111101111110111111',
  'committed': False,
  'candidate_times': [{'days': '0000100',
    'start': 90,
    'length': 12,
    'pref': 0.0},
   {'days': '0000100', 'start': 102, 'length': 12, 'pref': 0.0},
   {'days': '0000100', 'start': 114, 'length': 12, 'pref': 0.0},
   {'days': '0000100', 'start': 126, 'length': 12, 'pref': 0.0},
   {'days': '0000100', 'start': 138, 'length': 12, 'pref': 0.0},
   {'days': '0000100', 'start': 150, 'length': 12, 'pref': 0.0},
   {'days': '0000100', 'start': 162, 'length': 12, 'pref': 0.0},
   {'days': '0000100', 'start': 174, 'length': 12, 'pref': 0.0},
   {'days': '0000100', 'start': 186, 'length': 12, 'pref': 0.0},
   {'days': '0000100', 'start': 198, 'length': 12, 'pref': 0.0}],
  'candidate_rooms': [],
  'instructors': []}

In [12]:
# Cell 4
# Comment: Choose a "gold" assignment heuristic (since many entries lack explicit solution flags)
# Rationale:
# - XML often lists candidate <time>/<room>. Some datasets also mark a chosen solution; if not present, we need a proxy.
# - Heuristic: pick the time with the BEST "pref" (highest value) and a room with capacity >= class_limit and BEST "pref".
# - This produces a weakly-supervised SFT-A dataset you can refine later when you have explicit gold.
# NOTE: This is a practical workaround to unblock Goal 2 training.

def pick_best_time(cand_times: List[Dict[str, Any]]) -> Dict[str, Any] | None:
    if not cand_times:
        return None
    # Higher 'pref' assumed better (XML shows mix of 0.0, positive, and negatives)
    return sorted(cand_times, key=lambda t: t.get("pref", 0.0), reverse=True)[0]

def pick_best_room(cand_rooms: List[Dict[str, Any]], class_limit: int) -> Dict[str, Any] | None:
    feasible = [r for r in cand_rooms if r.get("capacity", 0) >= class_limit]
    pool = feasible if feasible else cand_rooms
    if not pool:
        return None
    return sorted(pool, key=lambda r: r.get("pref", 0.0), reverse=True)[0]

# Attach a heuristic "gold" time and room to every class that has both.
usable = []
for row in classes:
    gold_time = pick_best_time(row["candidate_times"])
    gold_room = pick_best_room(row["candidate_rooms"], row["class_limit"])
    # Skip classes for which no complete (time, room) assignment can be formed.
    if gold_time is None or gold_room is None:
        continue
    usable.append({**row, "gold_time": gold_time, "gold_room": gold_room})

len(usable)


502

In [13]:
# Cell 5
# Comment: Convert parsed classes -> SFT-A JSONL samples (instruction/input/output)
# Shuffle a copy so `usable` keeps the order it was built in: re-running this
# cell then reproduces the same split instead of reshuffling an
# already-shuffled list. The first-run result is unchanged (same seed, same
# starting order).
random.seed(42)
shuffled = list(usable)
random.shuffle(shuffled)
split = max(1, int(0.1 * len(shuffled)))  # 90/10 split, at least one validation row
val = shuffled[:split]
train = shuffled[split:]

def sft_record(cls):
    """Build one instruction/input/output SFT sample for a single class.

    ``cls`` is a class record that already carries ``gold_time`` and
    ``gold_room``. The input section repeats the class description together
    with the module-level ``nr_days`` and ``slots_per_day``; the output
    section holds the chosen room id and the chosen time's days, start and
    length.
    """
    input_fields = (
        "class_id",
        "subpart",
        "class_limit",
        "dates_mask",
        "instructors",
        "candidate_times",
        "candidate_rooms",
    )
    class_view = {field: cls[field] for field in input_fields}

    chosen_time = cls["gold_time"]
    assignment = {
        "class_id": cls["class_id"],
        "room": cls["gold_room"]["id"],
        "time": {part: chosen_time[part] for part in ("days", "start", "length")},
    }

    return {
        "instruction": "Assign a feasible room and time for the class given the candidates and constraints.",
        "input": {
            "nr_days": nr_days,
            "slots_per_day": slots_per_day,
            "classes": [class_view],
        },
        "output": {"assignments": [assignment]},
    }

# Destination files for the two splits.
train_path = OUT_DIR / "sft_a.jsonl"
val_path   = OUT_DIR / "sft_a_val.jsonl"

# One JSON document per line; the training file is written first, then validation.
for _out_path, _rows in ((train_path, train), (val_path, val)):
    with _out_path.open("w", encoding="utf-8") as f:
        for row in _rows:
            f.write(json.dumps(sft_record(row), ensure_ascii=False) + "\n")

print("Wrote:", train_path, "and", val_path)
print("Counts -> train:", len(train), "val:", len(val))


Wrote: /content/goal2_sft/sft_a.jsonl and /content/goal2_sft/sft_a_val.jsonl
Counts -> train: 452 val: 50


In [14]:
# Cell 6
# Comment: Load freshly created JSONL files with HuggingFace Datasets
from datasets import load_dataset

# Map each split name to the JSONL file written for it.
jsonl_files = {
    "train": str(OUT_DIR / "sft_a.jsonl"),
    "validation": str(OUT_DIR / "sft_a_val.jsonl"),
}
dataset = load_dataset("json", data_files=jsonl_files)
dataset


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 452
    })
    validation: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 50
    })
})

In [15]:
# Cell 7
# Comment: Build training text -> tokenize for causal LM
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is used to encode the SFT text.
base_model = "HuggingFaceH4/zephyr-7b-beta"  # example 7B instruct model
tokenizer = AutoTokenizer.from_pretrained(base_model)
# tok() pads every example to a fixed length, which needs a pad token;
# reuse the EOS token when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def format_example(ex):
    """Flatten one SFT record into a single training string.

    The text is the instruction, then ``Input:`` followed by the JSON-encoded
    input, then ``Output:`` immediately followed by the JSON-encoded output.

    Returns:
        dict with the single key ``text``.
    """
    input_json = json.dumps(ex['input'])
    target_json = json.dumps(ex['output'])
    prompt = f"{ex['instruction']}\nInput: {input_json}\nOutput:"
    return {"text": f"{prompt}{target_json}"}

# Add a "text" column holding the flattened prompt + answer for every example.
formatted = dataset.map(format_example)
def tok(ex):
    """Tokenize one formatted example for causal-LM training.

    The text is truncated/padded to exactly 1024 tokens. ``labels`` mirror
    ``input_ids`` on real tokens and are -100 on padding positions, so the
    loss ignores padding. Copying ``input_ids`` verbatim would train the
    model on the pad token, which here may be the EOS token.

    NOTE(review): truncation may cut off the answer of very long prompts;
    confirm that the longest examples fit within 1024 tokens.

    Returns:
        The tokenizer output with an added ``labels`` list.
    """
    out = tokenizer(ex["text"], truncation=True, padding="max_length", max_length=1024)
    # attention_mask is 0 exactly on padding, whichever side the tokenizer pads on.
    out["labels"] = [
        token_id if is_real else -100
        for token_id, is_real in zip(out["input_ids"], out["attention_mask"])
    ]
    return out

# Drop the raw record columns and the intermediate "text" column so that only
# the tokenizer outputs and labels remain.
drop_columns = dataset["train"].column_names + ["text"]
tokenized = formatted.map(tok, batched=False, remove_columns=drop_columns)
tokenized


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Map:   0%|          | 0/452 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/452 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 452
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 50
    })
})