In [None]:
import shutil
import os

results_dir = "results"

#### Delete all checkpoint directories
for root, dirs, files in os.walk(results_dir):
    for d in dirs:
        if "checkpoint" in d:
            print(f"Deleting checkpoint directory: {os.path.join(root, d)}")
            shutil.rmtree(os.path.join(root, d))



In [1]:
import json
import os

columns = ["Edge Removal", "Type Semantic Removal", "Distance", "Cleansed", "Ordered"] + ["Loss", "Accuracy", "F1", "Precision", "Recall"]

conf_keys = ["edge_removal", "type_semantic_removal", "distance", "cleanse", "ordered"]
metrics = ["eval_loss", "eval_accuracy", "eval_macro_f1", "eval_macro_precision", "eval_macro_recall"]
archi_results_dir = os.path.join("results", "archi")

archi_results = list()
count = 0

for result_dir in os.listdir(archi_results_dir):
    if os.path.exists(os.path.join(archi_results_dir, result_dir, "trainer_state.json")):
        with open(os.path.join(archi_results_dir, result_dir, "trainer_state.json"), "r") as f:
            trainer_state = json.load(f)
        with open(os.path.join(archi_results_dir, result_dir, "run_config.json"), "r") as f:
            run_config = json.load(f)
        best_step = trainer_state["best_global_step"]
        best_result = [r for r in trainer_state["log_history"] if r["step"] == best_step and "eval_loss" in r.keys()][0]
        r = {
            k: best_result[k] for k in metrics
        }
        c = {
            k: run_config[k] for k in conf_keys
        }
        archi_results.append({**c, **r})

len(archi_results)

82

In [5]:
import pandas as pd


pd.DataFrame(archi_results).to_csv("archi_results.csv", index=False)
pd.DataFrame(archi_results).to_excel("archi_results.xlsx", index=False)

In [None]:
"""
Cases - 
x%Structure + y%Semantics 

x = % of edges removed (5 values)
y = % of type semantics removed (5 values)

semantics = (cleansed, ordered) (2x2)
distance = (0, 1, 2, 3) (4 values)
4x4x5x5 = 400
"""

from pydantic import BaseModel, Field

class CleansingConfig(BaseModel):
    min_edges: int = Field(default=5)
    min_enr: float = Field(default=1.2)
    duplicate_overlap_threshold: float = Field(default=0.9)
    dummy_ratio_threshold: float = Field(default=0.5)
    llm_filter_threshold: float = Field(default=0.5)


class ExtractionConfig(BaseModel):
    use_node_attributes: bool = Field(default=True)
    use_node_types: bool = Field(default=True)
    use_edge_types: bool = Field(default=True)
    use_edge_label: bool = Field(default=True)
    use_node_label: bool = Field(default=True)
    use_special_tokens: bool = Field(default=False)


class RunConfig(BaseModel):
    task_type: str = Field(default="node_cls")
    
    node_cls_label: str = Field(default="type")
    edge_cls_label: str = Field(default="type")
    
    edge_removal: float = Field(default=0.5, ge=0.0, le=1.0)
    type_semantic_removal: float = Field(default=0.5, ge=0.0, le=1.0)
    distance: int = Field(default=1, ge=0, le=3)
    cleanse: bool = Field(default=False)
    ordered: bool = Field(default=False)
    language: str = Field(default="en")
    cleansing_config: CleansingConfig = Field(default=CleansingConfig())
    extraction_config: ExtractionConfig = Field(default=ExtractionConfig())
    llm_cleansing: bool = Field(default=False)
    
    model: str = Field(default="bert-base-uncased")
    max_seq_length: int = Field(default=4096)
    
    learning_rate: float = Field(default=2e-4)
    weight_decay: float = Field(default=0.001)
    lr_scheduler_type: str = Field(default="linear")
    warmup_steps: int = Field(default=5)
    max_steps: int = Field(default=60)
    gradient_accumulation_steps: int = Field(default=4)
    
    seed: int = Field(default=3407)
    save_dir: str = Field(default="results")

config = RunConfig()

In [None]:
from architype.architype.dataset.build import ArchiMateDataset
from architype.architype.models.bert.trainer import BertTextClassifier

import os


dataset_dir = os.path.join("architype", "data", "raw", "eamodelset")

archimate_dataset = ArchiMateDataset(dataset_dir, language=config.language, config=config)

if config.cleanse:
    archimate_dataset.cleanse()

if not config.ordered:
    archimate_dataset.randomize_node_labels() \
    if config.task_type == "node_cls" else \
    archimate_dataset.randomize_edge_labels()

if config.task_type == "node_cls":
    dataset = archimate_dataset.get_node_texts()
elif config.task_type == "edge_cls":
    dataset = archimate_dataset.get_edge_texts()


classifier = BertTextClassifier(
    model_name=config.model,
    output_dir=config.save_dir,
    seed=config.seed,
)

classifier.train(dataset=dataset)

In [3]:
# from architype.architype.dataset.build import OntoUMLDataset
# import os

# config = RunConfig(node_cls_label="stereotype")
# dataset_dir = os.path.join("architype", "data", "raw", "ontouml")

# ontouml_dataset = OntoUMLDataset(dataset_dir, config=config)

# if config.cleanse:
#     ontouml_dataset.cleanse()

# if not config.ordered:
#     ontouml_dataset.randomize_node_labels() \
#     if config.task_type == "node_cls" else \
#     ontouml_dataset.randomize_edge_labels()

# if config.task_type == "node_cls":
#     dataset = ontouml_dataset.get_node_texts()
# elif config.task_type == "edge_cls":
#     dataset = ontouml_dataset.get_edge_texts()


In [4]:
from architype.architype.models.bert.trainer import BertTextClassifier

classifier = BertTextClassifier(
    model_name=config.model,
    output_dir=config.save_dir,
    seed=config.seed,
)


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!


In [5]:
classifier.train(dataset=dataset)

Map:   0%|          | 0/46664 [00:00<?, ? examples/s]

Map:   0%|          | 0/11396 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


OutOfMemoryError: Caught OutOfMemoryError in replica 1 on device 1.
Original Traceback (most recent call last):
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/torch/nn/parallel/parallel_apply.py", line 99, in _worker
    output = module(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 1482, in forward
    outputs = self.bert(
              ^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 1000, in forward
    encoder_outputs = self.encoder(
                      ^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 650, in forward
    layer_outputs = layer_module(
                    ^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 588, in forward
    layer_output = apply_chunking_to_forward(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/transformers/pytorch_utils.py", line 257, in apply_chunking_to_forward
    return forward_fn(*input_tensors)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 596, in feed_forward_chunk
    intermediate_output = self.intermediate(attention_output)
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 512, in forward
    hidden_states = self.dense(hidden_states)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sali/CMAI-Projects/archi-type-prediction/.venv/lib/python3.12/site-packages/torch/nn/modules/linear.py", line 134, in forward
    return F.linear(input, self.weight, self.bias)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 26.00 MiB. GPU 1 has a total capacity of 23.57 GiB of which 5.69 MiB is free. Process 3922865 has 22.07 GiB memory in use. Including non-PyTorch memory, this process has 1.48 GiB memory in use. Of the allocated memory 1012.80 MiB is allocated by PyTorch, and 87.20 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


In [None]:
from unsloth import FastLanguageModel


fourbit_models = [
    "unsloth/Qwen3-4B-Instruct-2507-unsloth-bnb-4bit", # Qwen 14B 2x faster
    "unsloth/Qwen3-4B-Thinking-2507-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    "unsloth/Qwen3-14B-unsloth-bnb-4bit",
    "unsloth/Qwen3-32B-unsloth-bnb-4bit",

    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/Phi-4",
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit" # [NEW] We support TTS models!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-4B-Instruct-2507",
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

==((====))==  Unsloth 2025.11.2: Fast Qwen3 patching. Transformers: 4.57.1.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 4. Max memory: 23.57 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [18]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [19]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "qwen3-instruct",
)

In [22]:
from datasets import load_from_disk

node_dataset = load_from_disk("archi_node_texts")
edge_dataset = load_from_disk("archi_edge_texts")

In [7]:
node_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'graph_id', 'node_id'],
        num_rows: 24148
    })
    test: Dataset({
        features: ['text', 'label', 'graph_id', 'node_id'],
        num_rows: 5887
    })
})

In [23]:
import json

node_types = list(set([i['label'] for i in node_dataset['train']] + [i['label'] for i in node_dataset['test']]))
node_types_str = "\n".join([f"- {i}" for i in node_types])

node_cls_label = "layer"

instruction = f"""You are an expert in Enterprise Architecture modeling with ArchiMate.
You are given some information about an Archimate class and some of its neighbours.
Your task is to predict the {node_cls_label} of the given class.
The types are:
{node_types_str}
"""
def format_chat_template(row):
    response = json.dumps({"label": row["label"]})
    row_json = [{"role": "system", "content": instruction },
               {"role": "user", "content": row["text"]},
               {"role": "assistant", "content": response}]
    
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

node_dataset["train"] = node_dataset["train"].map(
    format_chat_template,
    num_proc= 4,
)


Map (num_proc=4):   0%|          | 0/24148 [00:00<?, ? examples/s]

In [24]:
print(node_dataset["train"]["text"][0])

<|im_start|>system
You are an expert in Enterprise Architecture modeling with ArchiMate.
You are given some information about an Archimate class and some of its neighbours.
Your task is to predict the layer of the given class.
The types are:
- other
- strategy
- business
- motivation
- implementation_migration
- application
- technology
<|im_end|>
<|im_start|>user
get iterator from aggregate() <> next item in iterator(ApplicationProcess, application)<|im_end|>
<|im_start|>assistant
{"label": "application"}<|im_end|>



In [25]:
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = node_dataset["train"],
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.001,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use TrackIO/WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=64):   0%|          | 0/24148 [00:00<?, ? examples/s]

In [26]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|im_start|>user\n",
    response_part = "<|im_start|>assistant\n",
)

Map (num_proc=64):   0%|          | 0/24148 [00:00<?, ? examples/s]

In [27]:
tokenizer.decode(trainer.train_dataset[100]["input_ids"])

'<|im_start|>system\nYou are an expert in Enterprise Architecture modeling with ArchiMate.\nYou are given some information about an Archimate class and some of its neighbours.\nYour task is to predict the layer of the given class.\nThe types are:\n- other\n- strategy\n- business\n- motivation\n- implementation_migration\n- application\n- technology\n<|im_end|>\n<|im_start|>user\nMetadata Standard Discovery() <> Get Metadata Standard(BusinessProcess, business)\nMetadata Standard Discovery() <> Metadata Standard Finding service(ApplicationService, application)\nMetadata Standard Discovery() <> DMP App(ApplicationComponent, application)<|im_end|>\n<|im_start|>assistant\n{"label": "application"}<|im_end|>\n'

In [28]:
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[100]["labels"]]).replace(tokenizer.pad_token, " ")

'                                                                                                                         {"label": "application"}<|im_end|>\n'

In [29]:
trainer_stats = trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 24,148 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 66,060,288 of 4,088,528,384 (1.62% trained)


Step,Training Loss
1,6.7734
2,6.1246
3,6.3813
4,4.7196
5,2.8579
6,0.604
7,0.3179
8,0.1891
9,0.1111
10,0.1014


Unsloth: Will smartly offload gradients to save VRAM!


In [None]:
from transformers import TextStreamer
from tqdm.auto import tqdm

responses = []

for test_example in tqdm(node_dataset["test"], desc="Generating responses"):

    messages = [{"role" : "user", "content" : test_example["text"]}]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize = False,
        add_generation_prompt = True, # Must add for generation
    )

    
    response = model.generate(
        **tokenizer(text, return_tensors = "pt").to("cuda"),
        max_new_tokens = 1000, # Increase for longer outputs!
        temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
        streamer = TextStreamer(tokenizer, skip_prompt = True),
    )
    responses.append(response)

Generating responses:   0%|          | 0/5887 [00:00<?, ?it/s]

ValueError: TextStreamer only supports batch size 1