# Model fine-tuning
### XLM-RoBERTa and mDeBERTa models fine-tuning on MultiCW and OOD datasets

## Initialization

#### Setup project paths:

In [1]:
import os
import sys
from os.path import join, exists
from py_markdown_table.markdown_table import markdown_table

from tqdm.notebook import tqdm

# Enable tqdm progress bars for pandas operations. This uses the notebook
# flavour of tqdm imported above (`tqdm.notebook`).
tqdm.pandas()

# ANSI escape sequences used to highlight console output
# (background colours, see https://stackoverflow.com/a/21786287).
h_stop = '\x1b[0m'            # resets the formatting
h_red = '\x1b[1;30;41m'
h_green = '\x1b[1;30;42m'
h_yellow = '\x1b[1;30;43m'

# Project directory layout; everything is rooted at the current working directory.
project_path = os.getcwd()
models_path = os.path.join(project_path, "Models")
multicw_path = os.path.join(project_path, 'Final-dataset')

# Source datasets live in their own sub-tree.
datasets_path = os.path.join(project_path, "Source datasets")
multiclaim_path = os.path.join(datasets_path, "MultiClaim")
lesa_dst_dir = os.path.join(datasets_path, 'LESA-EACL-2021')

print('done')

done


## Datasets
Load the MultiCW and out-of-distribution (OOD) datasets used for model fine-tuning and evaluation.

In [2]:
# Load the MultiCW dataset splits
import pandas as pd
from os.path import join
# Directory holding the MultiCW CSV files (relative to the working directory).
multicw_path = "Final-dataset"


def _load_multicw_split(filename: str) -> pd.DataFrame:
    """Read one MultiCW CSV split and normalise the columns the models consume.

    :param filename: Name of the CSV file inside ``multicw_path``.
    :return: DataFrame whose ``label`` column is cast to int and whose ``text``
             column contains only strings (missing texts become empty strings,
             so no float NaN is ever handed to a tokenizer).
    """
    split = pd.read_csv(join(multicw_path, filename)).astype({'label': 'int'})
    split['text'] = split['text'].fillna("").astype(str)
    return split


# Languages present in the full dataset; only the `lang` column has to be parsed.
languages = pd.read_csv(join(multicw_path, 'multicw-full.csv'), usecols=['lang'])['lang'].unique()

multicw_train = _load_multicw_split("multicw-train.csv")
multicw_dev = _load_multicw_split("multicw-dev.csv")
multicw_test = _load_multicw_split("multicw-test.csv")
multicw_ood = _load_multicw_split("multicw-ood.csv")
# The out-of-distribution evaluation cell filters on the short tag 'struc'.
multicw_ood['style'] = multicw_ood['style'].replace('structured', 'struc')

print('Loaded MultiCW:')
print(f'Train set: {multicw_train.shape[0]}')
print(f'Dev set: {multicw_dev.shape[0]}')
print(f'Test set: {multicw_test.shape[0]}')
print(f'Out-of-dist set: {multicw_ood.shape[0]}')

Loaded MultiCW:
Train set: 86691
Dev set: 18491
Test set: 18540
Out-of-dist set: 29647


## Models
Implementation of the used models.

### XLM-RoBERTa

In [3]:
import sys
import os
import shutil
import warnings
from typing import Callable
import numpy as np
import pandas as pd

import keras_hub
import keras

from keras_preprocessing.sequence import pad_sequences
from pandas import DataFrame
from sklearn.metrics import classification_report
from keras.optimizers import Adam
from tqdm import tqdm
from transformers import BertTokenizer, AutoTokenizer


class XLMRobertaModel():
    """Claim detector built on the KerasHub ``xlm_roberta_base_multi`` preset.

    Offers fine-tuning (``train_model``), restoring fine-tuned weights
    (``load_model``), evaluation against labelled data (``detect_claims``) and
    plain prediction (``inference``). Class ``1`` is interpreted as "claim".
    """

    def __init__(self):
        """Create the text preprocessor; the classifier itself is built lazily."""
        self.max_len = 256     # sequence length handed to the preprocessor
        self.batch_size = 32   # batch size used by fit() and predict()

        # Stays None until load_model() or train_model() has produced a classifier.
        self.final_model = None

        self.preprocessor = keras_hub.models.TextClassifierPreprocessor.from_preset(
            'xlm_roberta_base_multi',
            sequence_length=self.max_len
        )

    def load_model(self, model_name) -> bool:
        """
        Build the classifier architecture and load fine-tuned weights into it.

        Weights are read from ``Models/<model_name>.weights.h5``.

        :param model_name: File stem of the weights file inside ``Models``.
        :return: True when the weights were loaded, False otherwise. On failure
                 ``self.final_model`` is left untouched, so a classifier without
                 fine-tuned weights is never exposed for inference.
        """
        try:
            # 1. Instantiate the model with the known preset (architecture)
            classifier = keras_hub.models.XLMRobertaTextClassifier.from_preset(
                'xlm_roberta_base_multi',
                num_classes=2,
                preprocessor=self.preprocessor,
                dropout=0.2
            )

            # 2. Load the weights manually from the path
            weights_path = os.path.join('Models', f'{model_name}.weights.h5')
            classifier.load_weights(weights_path)
        except Exception as e:
            print(f"Error loading model: {e}")
            return False

        self.final_model = classifier
        print('Model loaded successfully.')
        return True

    def detect_claims(self, test_set: DataFrame, verbose=False) -> tuple:
        """
        Classify ``test_set['text']`` and evaluate against ``test_set['label']``.

        :param test_set: DataFrame with a ``text`` and a ``label`` column.
        :param verbose: When True, print every text together with its verdict.
        :return: Tuple ``(report, report_str)`` holding the scikit-learn
                 classification report as a dict and as formatted text.
        """

        print('Running classification:')
        scores = self.final_model.predict(x=test_set['text'].to_numpy(), batch_size=self.batch_size)

        # The predicted class is the index of the highest score in each row.
        results = np.argmax(scores, axis=1)
        print('Done.')

        # Print sentences with classifications
        if verbose:
            for text, predicted in zip(test_set['text'], results):
                # Green highlight for claims, red for non-claims.
                verdict = h_green + "is" if predicted == 1 else h_red + "is not"
                print("LESA classification: '{text}' {c} a claim.".format(text=text, c=verdict) + h_stop)

        # Compare against ground-truth
        ground_truth = test_set['label'].to_numpy()
        report = classification_report(ground_truth, results, output_dict=True)
        report_str = str(classification_report(ground_truth, results))

        return report, report_str

    def train_model(self, train_set: DataFrame, dev_set: DataFrame, epochs=1, learn_rate=3e-5, model_name='', lang='en'):
        """
        Fine-tune XLM-RoBERTa with the given parameters and persist the result.

        The fitted classifier is stored in ``self.final_model``. It is saved as
        ``Models/LESA/models/<model_name>-<lang>-<epochs>e/model.keras`` and its
        weights are additionally written to
        ``Models/<model_name>-<lang>-<epochs>e.weights.h5``, the location
        ``load_model`` reads from (call it with ``'<model_name>-<lang>-<epochs>e'``).

        :param train_set: Training dataset with ``text`` and ``label`` columns.
        :param dev_set: Validation dataset with ``text`` and ``label`` columns.
        :param epochs: Number of training epochs.
        :param learn_rate: Learning rate of the Adam optimizer.
        :param model_name: Prefix of the names the model is saved under.
        :param lang: Training dataset language(s). Only used for naming.
        :return: None.
        """

        model_name_path = f"{model_name}-{lang}-{epochs}e"
        path = os.path.join('Models', 'LESA', 'models', model_name_path)
        # Create the target directory up front so a bad path fails before training.
        os.makedirs(path, exist_ok=True)

        self.final_model = keras_hub.models.XLMRobertaTextClassifier.from_preset(
            'xlm_roberta_base_multi',
            num_classes=len(train_set['label'].unique()),
            preprocessor=self.preprocessor,
            dropout=0.2
        )

        # Full fine-tuning: every layer stays trainable.
        for layer in self.final_model.layers:
            layer.trainable = True

        self.final_model.compile(
            loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            optimizer=keras.optimizers.Adam(learn_rate),
            jit_compile=True,
        )

        self.final_model.fit(x=train_set['text'].to_numpy(), y=train_set['label'].to_numpy(), batch_size=self.batch_size,
                             epochs=epochs, validation_data=(dev_set['text'].to_numpy(), dev_set['label'].to_numpy()))

        model_save_path = os.path.join(path, 'model.keras')
        self.final_model.save(model_save_path)

        # Also write the weights in the format and location load_model() expects,
        # so a fine-tuned model can be restored without retraining.
        self.final_model.save_weights(os.path.join('Models', f'{model_name_path}.weights.h5'))

    def inference(self, texts: list[str]) -> list[bool]:
        """
        Classify raw texts.

        :param texts: List of strings to classify.
        :return: One boolean per text: True if classified as a claim (class 1).
        :raises ValueError: If ``texts`` is not a list or holds non-string items.
        """
        if not isinstance(texts, list):
            raise ValueError("Input must be a list of strings.")
        if not all(isinstance(t, str) for t in texts):
            raise ValueError("All items in input list must be strings.")

        # Nothing to classify: skip the model call entirely.
        if not texts:
            return []

        predictions = self.final_model.predict(texts)  # Must be list[str]
        classifications = np.argmax(predictions, axis=1)  # shape (batch_size,)

        return (classifications == 1).tolist()


  from .autonotebook import tqdm as notebook_tqdm


### mDeBERTa

In [4]:
import sys
import os
import shutil
import warnings
from typing import Callable
import numpy as np
import pandas as pd

import keras_hub
import keras

from keras_preprocessing.sequence import pad_sequences
from pandas import DataFrame
from sklearn.metrics import classification_report
from keras.optimizers import Adam
from tqdm import tqdm
from transformers import BertTokenizer, AutoTokenizer
from tensorflow.keras.optimizers.schedules import CosineDecay


class MDeBertaModel():
    """Claim detector built on the KerasHub ``deberta_v3_base_multi`` preset.

    Offers fine-tuning with a cosine learning-rate decay (``train_model``),
    restoring a saved model (``load_model``), evaluation against labelled data
    (``detect_claims``) and plain prediction (``inference``). Class ``1`` is
    interpreted as "claim".
    """

    def __init__(self, model_name=''):
        """
        Create the text preprocessor; the classifier itself is built lazily.

        :param model_name: Unused; kept so existing callers keep working.
        """
        self.max_len = 256     # sequence length handed to the preprocessor
        self.batch_size = 32   # batch size used by fit() and predict()
        # Stays None until load_model() or train_model() has produced a classifier.
        self.final_model = None

        self.preprocessor = keras_hub.models.TextClassifierPreprocessor.from_preset(
            'deberta_v3_base_multi',
            sequence_length=self.max_len
        )

    def load_model(self, model_name) -> bool:
        """
        Load a complete saved model from ``Models/<model_name>.keras``.

        :param model_name: File stem of the ``.keras`` file inside ``Models``.
        :return: True when the model was loaded, False otherwise (the error is printed).
        """
        try:
            model_path = os.path.join('Models', model_name + '.keras')

            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Saved model not found at: {model_path}")

            print(f"Loading full model from: {model_path}")
            self.final_model = keras.models.load_model(model_path)
            print("Model loaded successfully.")
            return True
        except Exception as e:
            print(f"Error loading model: {str(e)}")
            return False

    def detect_claims(self, test_set: DataFrame, verbose=False) -> tuple:
        """
        Classify ``test_set['text']`` and evaluate against ``test_set['label']``.

        :param test_set: DataFrame with a ``text`` and a ``label`` column.
        :param verbose: When True, print every text together with its verdict.
        :return: Tuple ``(report, report_str)`` holding the scikit-learn
                 classification report as a dict and as formatted text.
        """

        print('Running classification:')
        scores = self.final_model.predict(x=test_set['text'].to_numpy(), batch_size=self.batch_size)

        # The predicted class is the index of the highest score in each row.
        results = np.argmax(scores, axis=1)
        print('Done.')

        # Print sentences with classifications
        if verbose:
            for text, predicted in zip(test_set['text'], results):
                # Green highlight for claims, red for non-claims.
                verdict = h_green + "is" if predicted == 1 else h_red + "is not"
                print("LESA classification: '{text}' {c} a claim.".format(text=text, c=verdict) + h_stop)

        # Compare against ground-truth
        ground_truth = test_set['label'].to_numpy()
        report = classification_report(ground_truth, results, output_dict=True)
        report_str = str(classification_report(ground_truth, results))

        return report, report_str

    def train_model(self, train_set: DataFrame, dev_set: DataFrame, epochs=1, learn_rate=3e-5, model_name='', lang='en', final_learn_rate_fraction=0.5):
        """
        Fine-tune mDeBERTa with the given parameters and persist the result.

        The fitted classifier is stored in ``self.final_model`` and saved as
        ``Models/<model_name>-<lang>-<epochs>e.keras``, the directory
        ``load_model`` reads from (call it with ``'<model_name>-<lang>-<epochs>e'``).

        :param train_set: Training dataset with ``text`` and ``label`` columns.
        :param dev_set: Validation dataset with ``text`` and ``label`` columns.
        :param epochs: Number of training epochs.
        :param learn_rate: Initial learning rate of the cosine decay schedule.
        :param model_name: Prefix of the file name the model is saved under.
        :param lang: Training dataset language(s). Only used for naming.
        :param final_learn_rate_fraction: Minimum learning rate reached by the
               schedule, as a fraction of ``learn_rate``.
        :return: None.
        """

        # Resolve and create the output location before training, so that a
        # path problem surfaces immediately instead of after the final epoch.
        save_dir = 'Models'
        os.makedirs(save_dir, exist_ok=True)
        model_save_path = os.path.join(save_dir, f"{model_name}-{lang}-{epochs}e.keras")

        self.final_model = keras_hub.models.DebertaV3Classifier.from_preset(
            'deberta_v3_base_multi',
            num_classes=len(set(train_set['label'].unique())),
            preprocessor=self.preprocessor,
            dropout=0.2
        )

        # Full fine-tuning: every layer stays trainable.
        for layer in self.final_model.layers:
            layer.trainable = True

        # Number of optimizer steps over which the decay is applied. Clamped to
        # at least 1 so a training set smaller than one batch cannot yield a
        # zero-length schedule.
        decay_steps = max(1, int(len(train_set) // self.batch_size * epochs))

        lr_schedule = CosineDecay(
            initial_learning_rate=learn_rate,
            decay_steps=decay_steps,
            alpha=final_learn_rate_fraction
        )

        self.final_model.compile(
            loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
            jit_compile=True
        )

        self.final_model.fit(x=train_set['text'].to_numpy(), y=train_set['label'].to_numpy(), batch_size=self.batch_size,
                             epochs=epochs, validation_data=(dev_set['text'].to_numpy(), dev_set['label'].to_numpy()))

        self.final_model.save(model_save_path)

    def inference(self, texts: list[str]) -> list[bool]:
        """
        Classify raw texts.

        :param texts: List of strings to classify.
        :return: One boolean per text: True if classified as a claim (class 1).
        :raises ValueError: If ``texts`` is not a list or holds non-string items.
        """
        if not isinstance(texts, list):
            raise ValueError("Input must be a list of strings.")
        if not all(isinstance(t, str) for t in texts):
            raise ValueError("All items in input list must be strings.")

        # Nothing to classify: skip the model call entirely.
        if not texts:
            return []

        predictions = self.final_model.predict(texts)  # Must be list[str]
        classifications = np.argmax(predictions, axis=1)  # shape (batch_size,)

        return (classifications == 1).tolist()

## Experiments

### Models fine-tuning on MultiCW dataset
- Fine-tuning of XLM-RoBERTa, mDeBERTa and LESA models on the MultiCW train set
- Evaluation on MultiCW test set

#### XLM-RoBERTa model:
<table>
<tr>
<td>

| Class            | Precision | Recall | F1-score | Support   |
| ---------------- | --------- | ------ | -------- | --------- |
| 0                | 0.87      | 0.88   | 0.87     | 9269      |
| 1                | 0.88      | 0.87   | 0.87     | 9175      |
| **Accuracy**     |           |        | **0.87** | **18444** |
| **Macro avg**    | 0.87      | 0.87   | 0.87     | 18444     |
| **Weighted avg** | 0.87      | 0.87   | 0.87     | 18444     |
                    MultiCW Overall.
</td>
<td>

| Class            | Precision | Recall | F1-score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | 0.81      | 0.82   | 0.81     | 4744     |
| 1                | 0.81      | 0.80   | 0.81     | 4650     |
| **Accuracy**     |           |        | **0.81** | **9394** |
| **Macro avg**    | 0.81      | 0.81   | 0.81     | 9394     |
| **Weighted avg** | 0.81      | 0.81   | 0.81     | 9394     |
                MultiCW Noisy Part.
</td>
<td>

| Class            | Precision | Recall | F1-score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | 0.93      | 0.94   | 0.94     | 4525     |
| 1                | 0.94      | 0.93   | 0.93     | 4525     |
| **Accuracy**     |           |        | **0.93** | **9050** |
| **Macro avg**    | 0.93      | 0.93   | 0.93     | 9050     |
| **Weighted avg** | 0.93      | 0.93   | 0.93     | 9050     |
                MultiCW Structured Part.
</td>
</tr>
</table>

#### mDeBERTa model:
<table>
<tr>
<td>

| Class            | Precision | Recall | F1-score | Support   |
| ---------------- | --------- | ------ | -------- | --------- |
| 0                | 0.89      | 0.86   | 0.87     | 9269      |
| 1                | 0.86      | 0.89   | 0.88     | 9175      |
| **Accuracy**     |           |        | **0.88** | **18444** |
| **Macro avg**    | 0.88      | 0.88   | 0.88     | 18444     |
| **Weighted avg** | 0.88      | 0.88   | 0.88     | 18444     |
                    MultiCW Overall.
</td>
<td>

| Class            | Precision | Recall | F1-score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | 0.84      | 0.80   | 0.82     | 4744     |
| 1                | 0.81      | 0.84   | 0.82     | 4650     |
| **Accuracy**     |           |        | **0.82** | **9394** |
| **Macro avg**    | 0.82      | 0.82   | 0.82     | 9394     |
| **Weighted avg** | 0.82      | 0.82   | 0.82     | 9394     |
                MultiCW Noisy Part.
</td>
<td>

| Class            | Precision | Recall | F1-score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | 0.95      | 0.91   | 0.93     | 4525     |
| 1                | 0.92      | 0.95   | 0.93     | 4525     |
| **Accuracy**     |           |        | **0.93** | **9050** |
| **Macro avg**    | 0.93      | 0.93   | 0.93     | 9050     |
| **Weighted avg** | 0.93      | 0.93   | 0.93     | 9050     |
                MultiCW Structured Part.
</td>
</tr>
</table>

In [6]:
# Evaluate every detector on the MultiCW test split: overall first, then the
# noisy and the structured part separately.
model = None
detector = None
models = ['xlm', 'mdb']

# Model key -> (console title, detector class).
detector_variants = {
    'xlm': ('XLM-RoBERTa model:', XLMRobertaModel),
    'mdb': ('mDBERTa model:', MDeBertaModel),
}

for model in models:
    title, detector_cls = detector_variants[model]
    print(f'{h_green}{title}{h_stop}')
    detector = detector_cls()

    # Prefer an already fine-tuned checkpoint; fine-tune only when none loads.
    # NOTE(review): the fallback saves under a '<model>-multicw-en-5e' style name,
    # which is not the name requested from load_model() here - confirm naming.
    checkpoint_loaded = detector.load_model(model_name=f'{model}-multicw-2e6-5e')
    if not checkpoint_loaded:
        print(f'{h_yellow}No model found. Initiating fine-tuning:{h_stop}')
        # Note: Works well with a small learning rate (e.g. 3e-6)
        detector.train_model(
            multicw_train,
            multicw_dev,
            epochs=5,
            learn_rate=2e-6,
            lang='en',
            model_name=f'{model}-multicw',
        )

    # Whole test split.
    print(f'{h_yellow}MultiCW overall:{h_stop}')
    report = detector.detect_claims(multicw_test)[1]
    print(report)

    # Noisy texts only.
    noisy_mask = multicw_test['style'] == 'noisy'
    test_noisy = multicw_test[noisy_mask]
    report = detector.detect_claims(test_noisy, verbose=False)[1]
    print(f'{h_yellow}MultiCW Noisy Part:{h_stop}')
    print(report)

    # Structured texts only.
    struct_mask = multicw_test['style'] == 'struct'
    test_strut = multicw_test[struct_mask]
    report = detector.detect_claims(test_strut, verbose=False)[1]
    print(f'{h_yellow}MultiCW Structured Part:{h_stop}')
    print(report)

[1;30;42mXLM-RoBERTa model:[0m


2025-09-01 07:29:50.732261: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-09-01 07:29:50.787211: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-09-01 07:29:50.791843: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Model loaded successfully.
[1;30;42mMultiCW overall:[0m
Running classification:


I0000 00:00:1756704598.034464 1441906 service.cc:145] XLA service 0x7fc4b002cdc0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1756704598.034524 1441906 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2025-09-01 07:29:58.117842: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-09-01 07:29:58.332846: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907



[1m  1/577[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m34:16[0m 4s/step

I0000 00:00:1756704600.414404 1441906 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m576/577[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 195ms/step




[1m577/577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 200ms/step
Done.
              precision    recall  f1-score   support

           0       0.87      0.88      0.87      9269
           1       0.88      0.87      0.87      9175

    accuracy                           0.87     18444
   macro avg       0.87      0.87      0.87     18444
weighted avg       0.87      0.87      0.87     18444

Running classification:
[1m293/294[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 198ms/step





[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 206ms/step
Done.
[1;30;43mMultiCW Noisy Part:[0m
              precision    recall  f1-score   support

           0       0.81      0.82      0.81      4744
           1       0.81      0.80      0.81      4650

    accuracy                           0.81      9394
   macro avg       0.81      0.81      0.81      9394
weighted avg       0.81      0.81      0.81      9394

Running classification:
[1m282/283[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 198ms/step





[1m283/283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 206ms/step
Done.
[1;30;43mMultiCW Structured Part:[0m
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      4525
           1       0.94      0.93      0.93      4525

    accuracy                           0.93      9050
   macro avg       0.93      0.93      0.93      9050
weighted avg       0.93      0.93      0.93      9050

[1;30;42mmDBERTa model:[0m
Loading full model from: Models/mdb-multicw-2e6-5e.keras


  instance.compile_from_config(compile_config)
2025-09-01 07:34:13.255647: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 771072000 exceeds 10% of free system memory.
  saveable.load_own_variables(weights_store.get(inner_path))


Model loaded successfully.
[1;30;42mMultiCW overall:[0m
Running classification:
[1m577/577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 339ms/step
Done.
              precision    recall  f1-score   support

           0       0.89      0.86      0.87      9269
           1       0.86      0.89      0.88      9175

    accuracy                           0.88     18444
   macro avg       0.88      0.88      0.88     18444
weighted avg       0.88      0.88      0.88     18444

Running classification:
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 341ms/step
Done.
[1;30;43mMultiCW Noisy Part:[0m
              precision    recall  f1-score   support

           0       0.84      0.80      0.82      4744
           1       0.81      0.84      0.82      4650

    accuracy                           0.82      9394
   macro avg       0.82      0.82      0.82      9394
weighted avg       0.82      0.82      0.82      9394

Running classification:
[1m283/283

### Out-of-domain evaluation
- Evaluation of the fine-tuned models on 4 more languages obtained from source datasets.

#### XLM-RoBERTa model:
<table>
<tr>
<td>

| Class            | Precision | Recall | F1-score | Support   |
| ---------------- | --------- | ------ | -------- | --------- |
| 0                | 0.94      | 0.75   | 0.84     | 16000     |
| 1                | 0.76      | 0.95   | 0.85     | 13647     |
| **Accuracy**     |           |        | **0.84** | **29647** |
| **Macro avg**    | 0.85      | 0.85   | 0.84     | 29647     |
| **Weighted avg** | 0.86      | 0.84   | 0.84     | 29647     |
                    MultiCW Overall.
</td>
<td>

| Class            | Precision | Recall | F1-score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | 0.91      | 0.70   | 0.79     | 8000     |
| 1                | 0.76      | 0.93   | 0.84     | 8000     |
| **Accuracy**     |           |        | **0.82** | **16000** |
| **Macro avg**    | 0.83      | 0.82   | 0.81     | 16000     |
| **Weighted avg** | 0.83      | 0.82   | 0.81     | 16000     |
                MultiCW Noisy Part.
</td>
<td>

| Class            | Precision | Recall | F1-score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | 0.97      | 0.80   | 0.88     | 8000     |
| 1                | 0.78      | 0.97   | 0.86     | 5647     |
| **Accuracy**     |           |        | **0.87** | **13647** |
| **Macro avg**    | 0.87      | 0.88   | 0.87     | 13647     |
| **Weighted avg** | 0.89      | 0.87   | 0.87     | 13647     |
                MultiCW Structured Part.
</td>
</tr>
</table>

#### mDeBERTa model:
<table>
<tr>
<td>

| Class            | Precision | Recall | F1-score | Support   |
| ---------------- | --------- | ------ | -------- | --------- |
| 0                | 0.97      | 0.74   | 0.84     | 16000     |
| 1                | 0.76      | 0.97   | 0.85     | 13647     |
| **Accuracy**     |           |        | **0.85** | **29647** |
| **Macro avg**    | 0.86      | 0.86   | 0.85     | 29647     |
| **Weighted avg** | 0.87      | 0.85   | 0.85     | 29647     |
                    MultiCW Overall.
</td>
<td>

| Class            | Precision | Recall | F1-score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | 0.95      | 0.68   | 0.80     | 8000     |
| 1                | 0.75      | 0.96   | 0.85     | 8000     |
| **Accuracy**     |           |        | **0.82** | **16000** |
| **Macro avg**    | 0.85      | 0.82   | 0.82     | 16000     |
| **Weighted avg** | 0.85      | 0.82   | 0.82     | 16000     |
                MultiCW Noisy Part.
</td>
<td>

| Class            | Precision | Recall | F1-score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | 0.98      | 0.79   | 0.88     | 8000     |
| 1                | 0.77      | 0.98   | 0.86     | 5647     |
| **Accuracy**     |           |        | **0.87** | **13647** |
| **Macro avg**    | 0.88      | 0.89   | 0.87     | 13647     |
| **Weighted avg** | 0.90      | 0.87   | 0.87     | 13647     |
                MultiCW Structured Part.
</td>
</tr>
</table>


In [5]:
# Evaluate every detector on the out-of-distribution split: overall first, then
# the noisy and the structured part separately.
model = None
detector = None
models = ['xlm', 'mdb']

for model in models:
    # Choose the detector implementation that belongs to the current key.
    if model == 'xlm':
        print(f'{h_green}XLM-RoBERTa model:{h_stop}')
        detector = XLMRobertaModel()
    elif model == 'mdb':
        print(f'{h_green}mDBERTa model:{h_stop}')
        detector = MDeBertaModel()

    # Prefer an already fine-tuned checkpoint; fine-tune only when none loads.
    # NOTE(review): the fallback saves under a '<model>-multicw-en-5e' style name,
    # which is not the name requested from load_model() here - confirm naming.
    checkpoint_loaded = detector.load_model(model_name=f'{model}-multicw-2e6-5e')
    if not checkpoint_loaded:
        print(f'{h_yellow}No model found. Initiating fine-tuning:{h_stop}')
        # Note: Works well with a small learning rate (e.g. 3e-6)
        detector.train_model(
            multicw_train,
            multicw_dev,
            epochs=5,
            learn_rate=2e-6,
            lang='en',
            model_name=f'{model}-multicw',
        )

    # Style-specific subsets ('struc' is the tag the OOD frame carries).
    test_noisy = multicw_ood[multicw_ood['style'] == 'noisy']
    test_strut = multicw_ood[multicw_ood['style'] == 'struc']

    # Same procedure for each section: heading, classification, report.
    sections = (
        ('MultiCW overall:', multicw_ood),
        ('MultiCW Noisy Part:', test_noisy),
        ('MultiCW Structured Part:', test_strut),
    )
    for heading, subset in sections:
        print(f'{h_yellow}{heading}{h_stop}')
        _, report = detector.detect_claims(subset, verbose=False)
        print(report)

[1;30;42mXLM-RoBERTa model:[0m


2025-09-09 15:50:19.039339: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-09-09 15:50:19.171982: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-09-09 15:50:19.175233: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Model loaded successfully.
[1;30;42mMultiCW overall:[0m
Running classification:


I0000 00:00:1757425826.950727    4114 service.cc:145] XLA service 0x7f0c38009bb0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1757425826.950786    4114 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2025-09-09 15:50:27.030880: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-09-09 15:50:27.253428: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907



[1m  1/927[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m58:27[0m 4s/step

I0000 00:00:1757425829.360802    4114 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m926/927[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 195ms/step




[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 199ms/step
Done.
              precision    recall  f1-score   support

           0       0.94      0.75      0.84     16000
           1       0.76      0.95      0.85     13647

    accuracy                           0.84     29647
   macro avg       0.85      0.85      0.84     29647
weighted avg       0.86      0.84      0.84     29647

[1;30;43mMultiCW Noisy Part:[0m
Running classification:
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 198ms/step
Done.
              precision    recall  f1-score   support

           0       0.91      0.70      0.79      8000
           1       0.76      0.93      0.84      8000

    accuracy                           0.82     16000
   macro avg       0.83      0.82      0.81     16000
weighted avg       0.83      0.82      0.81     16000

[1;30;43mMultiCW Structured Part:[0m
Running classification:
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

  instance.compile_from_config(compile_config)
2025-09-09 15:56:55.885372: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 771072000 exceeds 10% of free system memory.
  saveable.load_own_variables(weights_store.get(inner_path))


Model loaded successfully.
[1;30;42mMultiCW overall:[0m
Running classification:
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 337ms/step
Done.
              precision    recall  f1-score   support

           0       0.97      0.74      0.84     16000
           1       0.76      0.97      0.85     13647

    accuracy                           0.85     29647
   macro avg       0.86      0.86      0.85     29647
weighted avg       0.87      0.85      0.85     29647

[1;30;43mMultiCW Noisy Part:[0m
Running classification:
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 335ms/step
Done.
              precision    recall  f1-score   support

           0       0.95      0.68      0.80      8000
           1       0.75      0.96      0.85      8000

    accuracy                           0.82     16000
   macro avg       0.85      0.82      0.82     16000
weighted avg       0.85      0.82      0.82     16000

[1;30;43mMultiCW Structured Part:[