# Model fine-tuning
### Fine-tuning XLM-RoBERTa and mDeBERTa models on the MultiCW and OOD datasets

## Initialization

#### Setup project paths:

In [1]:
import os
import sys
from os.path import join, exists
from py_markdown_table.markdown_table import markdown_table

from tqdm.notebook import tqdm

# Hook tqdm progress bars into pandas.
tqdm.pandas()

# ANSI escape sequences used to highlight console output
# (see https://stackoverflow.com/a/21786287). The three colour codes differ
# only in their background component; `h_stop` resets the formatting.
h_red, h_green, h_yellow = (f'\x1b[1;30;{bg}m' for bg in (41, 42, 43))
h_stop = '\x1b[0m'

# Project directory layout, rooted at the current working directory.
project_path = os.getcwd()
models_path = os.path.join(project_path, "Models")
multicw_path = os.path.join(project_path, 'Final-dataset')

# Third-party source datasets live together under one folder.
datasets_path = os.path.join(project_path, "Source datasets")
multiclaim_path, lesa_dst_dir = (
    os.path.join(datasets_path, folder) for folder in ("MultiClaim", 'LESA-EACL-2021')
)
print('done')

done


## Datasets
Load the MultiCW and out-of-distribution (OOD) datasets used to fine-tune and evaluate the models.

In [2]:
# Load the MultiCW dataset splits (train / dev / test / out-of-distribution).
import pandas as pd
from os.path import join

# Dataset folder, relative to the current working directory.
multicw_path = join("Final-dataset")

# Language codes present in the full MultiCW dataset.
languages = pd.read_csv(join(multicw_path, 'multicw-full.csv'))['lang'].unique()

multicw_train = pd.read_csv(join(multicw_path, "multicw-train.csv")).astype({'label': 'int'})
multicw_dev = pd.read_csv(join(multicw_path, "multicw-dev.csv")).astype({'label': 'int'})

# Read the test split once and cast both columns together. The file used to be
# read twice, and the second read (casting only `text`) overwrote the first,
# silently dropping the integer cast of `label`.
multicw_test = pd.read_csv(join(multicw_path, "multicw-test.csv")).astype({'label': 'int', 'text': 'str'})

multicw_ood = pd.read_csv(join(multicw_path, "multicw-ood.csv")).astype({'label': 'int'})
# NOTE(review): this shortens 'structured' to 'struc', while the evaluation
# cell filters the test split on 'struct' - confirm which spelling the OOD
# evaluation expects.
multicw_ood['style'] = multicw_ood['style'].replace('structured', 'struc')
# Missing texts become empty strings so the model always receives str input.
multicw_ood['text'] = multicw_ood['text'].fillna("").astype(str)

print('Loaded MultiCW:')
print(f'Train set: {multicw_train.shape[0]}')
print(f'Dev set: {multicw_dev.shape[0]}')
print(f'Test set: {multicw_test.shape[0]}')
print(f'Out-of-dist set: {multicw_ood.shape[0]}')

Loaded MultiCW:
Train set: 88001
Dev set: 14823
Test set: 14823
Out-of-dist set: 27761


## Models
Implementations of the models used in the experiments.

### XLM-RoBERTa

In [3]:
import sys
import os
import shutil
import warnings
from typing import Callable
import numpy as np
import pandas as pd

import keras_hub
import keras

from keras_preprocessing.sequence import pad_sequences
from pandas import DataFrame
from sklearn.metrics import classification_report
from keras.optimizers import Adam
from tqdm import tqdm
from transformers import BertTokenizer, AutoTokenizer


class XLMRobertaModel():
    """Claim detector built on the KerasHub XLM-RoBERTa text classifier.

    Provides fine-tuning (`train_model`), loading of saved weights
    (`load_model`), evaluation against labelled data (`detect_claims`) and
    plain prediction (`inference`).
    """

    def __init__(self):
        self.max_len = 256     # sequence length passed to the preprocessor
        self.batch_size = 32   # batch size for fit() and for evaluation

        # Set by load_model() or train_model(); stays None until then.
        self.final_model = None

        self.preprocessor = keras_hub.models.TextClassifierPreprocessor.from_preset(
            'xlm_roberta_base_multi',
            sequence_length=self.max_len
        )

    def _require_model(self):
        """Return the active classifier, raising if none was loaded or trained."""
        if self.final_model is None:
            raise RuntimeError("No model available: call load_model() or train_model() first.")
        return self.final_model

    def load_model(self, model_name) -> bool:
        """
        Rebuild the classifier from the preset and load fine-tuned weights.

        :param model_name: Base name of the weights file; the weights are read
            from 'Models/<model_name>.weights.h5'.
        :return: True if the weights were loaded, False on any error (the
            error is printed, not raised).
        """
        try:
            # 1. Instantiate the model with the known preset (architecture)
            candidate = keras_hub.models.XLMRobertaTextClassifier.from_preset(
                'xlm_roberta_base_multi',
                num_classes=2,
                preprocessor=self.preprocessor,
                dropout=0.2
            )

            # 2. Load the weights manually from the path
            weights_path = os.path.join('Models', f'{model_name}.weights.h5')
            candidate.load_weights(weights_path)
        except Exception as e:
            print(f"Error loading model: {e}")
            return False

        # Publish the model only after the weights loaded, so a failed load
        # never leaves an un-fine-tuned classifier in `final_model`.
        self.final_model = candidate
        print('Model loaded successfully.')
        return True

    def detect_claims(self, test_set: DataFrame, verbose=False) -> tuple:
        """
        Classify `test_set['text']` and score it against `test_set['label']`.

        :param test_set: DataFrame with a 'text' and a 'label' column.
        :param verbose: If True, print every text with its predicted class.
        :return: Tuple (report, report_str): scikit-learn's classification
            report as a dict and as formatted text.
        :raises RuntimeError: If no model has been loaded or trained.
        """
        classifier = self._require_model()

        print('Running classification:')
        metrics = classifier.predict(x=test_set['text'].to_numpy(), batch_size=self.batch_size)

        # Predicted class = index of the highest score in each row.
        results = np.argmax(metrics, axis=1)
        print('Done.')
        # Print sentences with classifications
        if verbose:
            # h_green / h_red / h_stop are the notebook-level ANSI highlight
            # codes (the previous gh_start / rh_start names were never defined).
            for i, text in enumerate(test_set['text']):
                print("LESA classification: '{text}' {c} a claim.".
                      format(text=text, c=h_green + "is" if results[i] == 1 else h_red + "is not") + h_stop)

        # Compare against ground-truth
        ground_truth = test_set['label'].to_numpy()
        report = classification_report(ground_truth, results, output_dict=True)
        report_str = str(classification_report(ground_truth, results))

        return report, report_str

    def train_model(self, train_set: DataFrame, dev_set: DataFrame, epochs=1, learn_rate=3e-5, model_name='', lang='en'):
        """
        Fine-tune the XLM-RoBERTa classifier and save it to disk.

        :param train_set: Training dataset with 'text' and 'label' columns.
        :param dev_set: Validation dataset with 'text' and 'label' columns.
        :param epochs: Number of training epochs.
        :param learn_rate: Learning rate for the Adam optimizer.
        :param model_name: Prefix of the output directory name. The model is
            always saved, also when this is left blank.
        :param lang: Training dataset language(s); used only in the output name.
        :return: None. The trained model is kept in `self.final_model` and
            saved as
            'Models/LESA/models/<model_name>-<lang>-<epochs>e/model.keras'.
        """
        # NOTE(review): load_model() reads 'Models/<name>.weights.h5', so a
        # model saved here is not found by load_model() without being
        # moved/converted - confirm the intended workflow.
        model_name_path = f"{model_name}-{lang}-{epochs}e"
        path = os.path.join('Models', 'LESA', 'models', model_name_path)
        os.makedirs(path, exist_ok=True)

        # One output unit per distinct label value found in the training data.
        self.final_model = keras_hub.models.XLMRobertaTextClassifier.from_preset(
            'xlm_roberta_base_multi',
            num_classes=len(train_set['label'].unique()),
            preprocessor=self.preprocessor,
            dropout=0.2
        )

        # Full fine-tuning: every layer is trainable.
        for layer in self.final_model.layers:
            layer.trainable = True

        self.final_model.compile(
            loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            optimizer=keras.optimizers.Adam(learn_rate),
            jit_compile=True,
        )

        self.final_model.fit(x=train_set['text'].to_numpy(), y=train_set['label'].to_numpy(), batch_size=self.batch_size,
                             epochs=epochs, validation_data=(dev_set['text'].to_numpy(), dev_set['label'].to_numpy()))

        model_save_path = os.path.join(path, 'model.keras')
        self.final_model.save(model_save_path)

    def inference(self, texts: list[str]) -> list[bool]:
        """
        Classify raw texts.

        :param texts: List of strings to classify.
        :return: One boolean per text: True if classified as a claim
            (class 1), else False. An empty input gives an empty list.
        :raises ValueError: If `texts` is not a list of strings.
        :raises RuntimeError: If no model has been loaded or trained.
        """
        if not isinstance(texts, list):
            raise ValueError("Input must be a list of strings.")
        if not all(isinstance(t, str) for t in texts):
            raise ValueError("All items in input list must be strings.")

        classifier = self._require_model()
        if not texts:
            # Nothing to classify; skip the predict() call entirely.
            return []

        predictions = classifier.predict(texts)  # Must be list[str]
        classifications = np.argmax(predictions, axis=1)  # shape (batch_size,)

        return (classifications == 1).tolist()


2025-10-01 13:45:57.897642: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-01 13:45:57.922397: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-01 13:45:57.922454: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-01 13:45:57.951995: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


### mDeBERTa

In [4]:
import sys
import os
import shutil
import warnings
from typing import Callable
import numpy as np
import pandas as pd

import keras_hub
import keras

from keras_preprocessing.sequence import pad_sequences
from pandas import DataFrame
from sklearn.metrics import classification_report
from keras.optimizers import Adam
from tqdm import tqdm
from transformers import BertTokenizer, AutoTokenizer
from tensorflow.keras.optimizers.schedules import CosineDecay


class MDeBertaModel():
    """Claim detector built on the KerasHub multilingual DeBERTa-v3 classifier.

    Provides fine-tuning with a cosine-decay learning rate (`train_model`),
    loading of a saved model (`load_model`), evaluation against labelled data
    (`detect_claims`) and plain prediction (`inference`).
    """

    def __init__(self, model_name=''):
        # `model_name` is accepted for interface compatibility but not used here.
        self.max_len = 256     # sequence length passed to the preprocessor
        self.batch_size = 32   # batch size for fit() and for evaluation
        # Set by load_model() or train_model(); stays None until then.
        self.final_model = None

        self.preprocessor = keras_hub.models.TextClassifierPreprocessor.from_preset(
            'deberta_v3_base_multi',
            sequence_length=self.max_len
        )

    def _require_model(self):
        """Return the active classifier, raising if none was loaded or trained."""
        if self.final_model is None:
            raise RuntimeError("No model available: call load_model() or train_model() first.")
        return self.final_model

    def load_model(self, model_name) -> bool:
        """
        Load a complete saved model from 'Models/<model_name>.keras'.

        :param model_name: File name of the saved model without the extension.
        :return: True if the model was loaded, False on any error, including
            a missing file (the error is printed, not raised).
        """
        try:
            model_path = os.path.join('Models', model_name + '.keras')

            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Saved model not found at: {model_path}")

            print(f"Loading full model from: {model_path}")
            self.final_model = keras.models.load_model(model_path)
            print("Model loaded successfully.")
            return True
        except Exception as e:
            print(f"Error loading model: {str(e)}")
            return False

    def detect_claims(self, test_set: DataFrame, verbose=False) -> tuple:
        """
        Classify `test_set['text']` and score it against `test_set['label']`.

        :param test_set: DataFrame with a 'text' and a 'label' column.
        :param verbose: If True, print every text with its predicted class.
        :return: Tuple (report, report_str): scikit-learn's classification
            report as a dict and as formatted text.
        :raises RuntimeError: If no model has been loaded or trained.
        """
        classifier = self._require_model()

        print('Running classification:')
        metrics = classifier.predict(x=test_set['text'].to_numpy(), batch_size=self.batch_size)

        # Predicted class = index of the highest score in each row.
        results = np.argmax(metrics, axis=1)
        print('Done.')
        # Print sentences with classifications
        if verbose:
            # h_green / h_red / h_stop are the notebook-level ANSI highlight
            # codes (the previous gh_start / rh_start names were never defined).
            for i, text in enumerate(test_set['text']):
                print("LESA classification: '{text}' {c} a claim.".
                      format(text=text, c=h_green + "is" if results[i] == 1 else h_red + "is not") + h_stop)

        # Compare against ground-truth
        ground_truth = test_set['label'].to_numpy()
        report = classification_report(ground_truth, results, output_dict=True)
        report_str = str(classification_report(ground_truth, results))

        return report, report_str

    def train_model(self, train_set: DataFrame, dev_set: DataFrame, epochs=1, learn_rate=3e-5, model_name='', lang='en', final_learn_rate_fraction=0.5):
        """
        Fine-tune the mDeBERTa classifier and save it to disk.

        :param train_set: Training dataset with 'text' and 'label' columns.
        :param dev_set: Validation dataset with 'text' and 'label' columns.
        :param epochs: Number of training epochs.
        :param learn_rate: Initial learning rate of the cosine-decay schedule.
        :param model_name: Prefix of the saved file name. The model is always
            saved, also when this is left blank.
        :param lang: Training dataset language(s); used only in the file name.
        :param final_learn_rate_fraction: Lowest learning rate of the schedule,
            as a fraction of `learn_rate`.
        :return: None. The trained model is kept in `self.final_model` and
            saved as 'Models/<model_name>-<lang>-<epochs>e.keras'.
        """
        # Resolve and create the output location before training. The save
        # step used an undefined `path` variable and raised NameError only
        # after training had finished. 'Models' is the directory load_model()
        # reads from.
        save_dir = 'Models'
        os.makedirs(save_dir, exist_ok=True)
        model_save_path = os.path.join(save_dir, f"{model_name}-{lang}-{epochs}e.keras")

        # One output unit per distinct label value found in the training data.
        self.final_model = keras_hub.models.DebertaV3Classifier.from_preset(
            'deberta_v3_base_multi',
            num_classes=len(train_set['label'].unique()),
            preprocessor=self.preprocessor,
            dropout=0.2
        )

        # Full fine-tuning: every layer is trainable.
        for layer in self.final_model.layers:
            layer.trainable = True

        # Number of optimizer steps over which the rate decays. Clamped to at
        # least 1 so a training set smaller than one batch does not give a
        # zero-length schedule.
        decay_steps = max(1, int(len(train_set) // self.batch_size * epochs))

        lr_schedule = CosineDecay(
            initial_learning_rate=learn_rate,
            decay_steps=decay_steps,
            alpha=final_learn_rate_fraction  # minimum rate as a fraction of the initial one
        )

        self.final_model.compile(
            loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
            jit_compile=True
        )

        self.final_model.fit(x=train_set['text'].to_numpy(), y=train_set['label'].to_numpy(), batch_size=self.batch_size,
                             epochs=epochs, validation_data=(dev_set['text'].to_numpy(), dev_set['label'].to_numpy()))

        self.final_model.save(model_save_path)

    def inference(self, texts: list[str]) -> list[bool]:
        """
        Classify raw texts.

        :param texts: List of strings to classify.
        :return: One boolean per text: True if classified as a claim
            (class 1), else False. An empty input gives an empty list.
        :raises ValueError: If `texts` is not a list of strings.
        :raises RuntimeError: If no model has been loaded or trained.
        """
        if not isinstance(texts, list):
            raise ValueError("Input must be a list of strings.")
        if not all(isinstance(t, str) for t in texts):
            raise ValueError("All items in input list must be strings.")

        classifier = self._require_model()
        if not texts:
            # Nothing to classify; skip the predict() call entirely.
            return []

        predictions = classifier.predict(texts)  # Must be list[str]
        classifications = np.argmax(predictions, axis=1)  # shape (batch_size,)

        return (classifications == 1).tolist()

## Experiments

### Models fine-tuning on MultiCW dataset
- Fine-tuning of XLM-RoBERTa, mDeBERTa and LESA models on MultiCW train set
- Evaluation on MultiCW test set

#### XLM-RoBERTa model:
<table>
<tr>
<td>

| Class            | Precision | Recall | F1-score | Support   |
| ---------------- | --------- | ------ | -------- | --------- |
| 0                | 0.87      | 0.88   | 0.87     | 9269      |
| 1                | 0.88      | 0.87   | 0.87     | 9175      |
| **Accuracy**     |           |        | **0.87** | **18444** |
| **Macro avg**    | 0.87      | 0.87   | 0.87     | 18444     |
| **Weighted avg** | 0.87      | 0.87   | 0.87     | 18444     |
                    MultiCW Overall.
</td>
<td>

| Class            | Precision | Recall | F1-score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | 0.81      | 0.82   | 0.81     | 4744     |
| 1                | 0.81      | 0.80   | 0.81     | 4650     |
| **Accuracy**     |           |        | **0.81** | **9394** |
| **Macro avg**    | 0.81      | 0.81   | 0.81     | 9394     |
| **Weighted avg** | 0.81      | 0.81   | 0.81     | 9394     |
                MultiCW Noisy Part.
</td>
<td>

| Class            | Precision | Recall | F1-score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | 0.93      | 0.94   | 0.94     | 4525     |
| 1                | 0.94      | 0.93   | 0.93     | 4525     |
| **Accuracy**     |           |        | **0.93** | **9050** |
| **Macro avg**    | 0.93      | 0.93   | 0.93     | 9050     |
| **Weighted avg** | 0.93      | 0.93   | 0.93     | 9050     |
                MultiCW Structured Part.
</td>
</tr>
</table>

#### mDeBERTa model:
<table>
<tr>
<td>

| Class            | Precision | Recall | F1-score | Support   |
| ---------------- | --------- | ------ | -------- | --------- |
| 0                | 0.89      | 0.86   | 0.87     | 9269      |
| 1                | 0.86      | 0.89   | 0.88     | 9175      |
| **Accuracy**     |           |        | **0.88** | **18444** |
| **Macro avg**    | 0.88      | 0.88   | 0.88     | 18444     |
| **Weighted avg** | 0.88      | 0.88   | 0.88     | 18444     |
                    MultiCW Overall.
</td>
<td>

| Class            | Precision | Recall | F1-score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | 0.84      | 0.80   | 0.82     | 4744     |
| 1                | 0.81      | 0.84   | 0.82     | 4650     |
| **Accuracy**     |           |        | **0.82** | **9394** |
| **Macro avg**    | 0.82      | 0.82   | 0.82     | 9394     |
| **Weighted avg** | 0.82      | 0.82   | 0.82     | 9394     |
                MultiCW Noisy Part.
</td>
<td>

| Class            | Precision | Recall | F1-score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | 0.95      | 0.91   | 0.93     | 4525     |
| 1                | 0.92      | 0.95   | 0.93     | 4525     |
| **Accuracy**     |           |        | **0.93** | **9050** |
| **Macro avg**    | 0.93      | 0.93   | 0.93     | 9050     |
| **Weighted avg** | 0.93      | 0.93   | 0.93     | 9050     |
                MultiCW Structured Part.
</td>
</tr>
</table>

In [6]:
# Evaluate each fine-tuned model on the MultiCW test set: overall, then
# separately on its noisy and structured parts.
model = None
detector = None
models = ['xlm', 'mdb']

for model in models:
    if model == 'xlm':
        print(f'{h_green}XLM-RoBERTa model:{h_stop}')
        detector = XLMRobertaModel()
    elif model == 'mdb':
        print(f'{h_green}mDBERTa model:{h_stop}')
        detector = MDeBertaModel()
    else:
        # Never silently reuse the detector from the previous iteration.
        raise ValueError(f'Unknown model key: {model}')

    # NOTE(review): train_model() derives its output name from
    # model_name/lang/epochs ('<model>-multicw-en-5e'), which differs from the
    # '<model>-multicw-2e6-5e' name loaded here - a freshly trained model is
    # not found on the next run unless it is renamed. Confirm the intended
    # naming.
    if not detector.load_model(model_name=f'{model}-multicw-2e6-5e'):
        print(f'{h_yellow}No model found. Initiating fine-tuning:{h_stop}')
        # Note: Works well with a small learning rate (e.g. 3e-6)
        detector.train_model(multicw_train, multicw_dev, epochs=5, learn_rate=2e-6, lang='en', model_name=f'{model}-multicw')

    test_noisy = multicw_test.loc[multicw_test['style']=='noisy']
    test_strut = multicw_test.loc[multicw_test['style']=='struct']

    # Print each header before its run, so the prediction progress output
    # appears under the section it belongs to.
    evaluation_sets = (
        ('MultiCW overall:', multicw_test),
        ('MultiCW Noisy Part:', test_noisy),
        ('MultiCW Structured Part:', test_strut),
    )
    for title, subset in evaluation_sets:
        print(f'{h_yellow}{title}{h_stop}')
        _, report = detector.detect_claims(subset, verbose=False)
        print(report)

[1;30;42mXLM-RoBERTa model:[0m


2025-09-01 07:29:50.732261: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-09-01 07:29:50.787211: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-09-01 07:29:50.791843: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Model loaded successfully.
[1;30;42mMultiCW overall:[0m
Running classification:


I0000 00:00:1756704598.034464 1441906 service.cc:145] XLA service 0x7fc4b002cdc0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1756704598.034524 1441906 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2025-09-01 07:29:58.117842: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-09-01 07:29:58.332846: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907



[1m  1/577[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m34:16[0m 4s/step

I0000 00:00:1756704600.414404 1441906 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m576/577[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 195ms/step




[1m577/577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 200ms/step
Done.
              precision    recall  f1-score   support

           0       0.87      0.88      0.87      9269
           1       0.88      0.87      0.87      9175

    accuracy                           0.87     18444
   macro avg       0.87      0.87      0.87     18444
weighted avg       0.87      0.87      0.87     18444

Running classification:
[1m293/294[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 198ms/step





[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 206ms/step
Done.
[1;30;43mMultiCW Noisy Part:[0m
              precision    recall  f1-score   support

           0       0.81      0.82      0.81      4744
           1       0.81      0.80      0.81      4650

    accuracy                           0.81      9394
   macro avg       0.81      0.81      0.81      9394
weighted avg       0.81      0.81      0.81      9394

Running classification:
[1m282/283[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 198ms/step





[1m283/283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 206ms/step
Done.
[1;30;43mMultiCW Structured Part:[0m
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      4525
           1       0.94      0.93      0.93      4525

    accuracy                           0.93      9050
   macro avg       0.93      0.93      0.93      9050
weighted avg       0.93      0.93      0.93      9050

[1;30;42mmDBERTa model:[0m
Loading full model from: Models/mdb-multicw-2e6-5e.keras


  instance.compile_from_config(compile_config)
2025-09-01 07:34:13.255647: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 771072000 exceeds 10% of free system memory.
  saveable.load_own_variables(weights_store.get(inner_path))


Model loaded successfully.
[1;30;42mMultiCW overall:[0m
Running classification:
[1m577/577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 339ms/step
Done.
              precision    recall  f1-score   support

           0       0.89      0.86      0.87      9269
           1       0.86      0.89      0.88      9175

    accuracy                           0.88     18444
   macro avg       0.88      0.88      0.88     18444
weighted avg       0.88      0.88      0.88     18444

Running classification:
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 341ms/step
Done.
[1;30;43mMultiCW Noisy Part:[0m
              precision    recall  f1-score   support

           0       0.84      0.80      0.82      4744
           1       0.81      0.84      0.82      4650

    accuracy                           0.82      9394
   macro avg       0.82      0.82      0.82      9394
weighted avg       0.82      0.82      0.82      9394

Running classification:
[1m283/283

### Evaluation of models on each language of the MultiCW dataset

In [5]:
# Evaluate each fine-tuned model separately on every language of the MultiCW
# test set.
model = None
detector = None
models = ['xlm', 'mdb']

# Banner text and wrapper class for each model key.
detector_registry = {
    'xlm': ('XLM-RoBERTa model:', XLMRobertaModel),
    'mdb': ('mDBERTa model:', MDeBertaModel),
}

for model in models:
    banner, detector_cls = detector_registry[model]
    print(f'{h_green}{banner}{h_stop}')
    detector = detector_cls()

    # Fall back to fine-tuning when no saved model can be loaded.
    model_found = detector.load_model(model_name=f'{model}-multicw-2e6-5e')
    if not model_found:
        print(f'{h_yellow}No model found. Initiating fine-tuning:{h_stop}')
        # Note: Works well with a small learning rate (e.g. 3e-6)
        detector.train_model(multicw_train, multicw_dev, epochs=5, learn_rate=2e-6, lang='en', model_name=f'{model}-multicw')

    # One report per language code found in the test split.
    languages = multicw_test['lang'].unique()

    for lang in languages:
        print(f'{h_yellow}Language: {lang}:{h_stop}')
        lang_rows = multicw_test[multicw_test['lang'] == lang]
        _, report = detector.detect_claims(lang_rows)
        print(report)

[1;30;42mXLM-RoBERTa model:[0m


2025-09-25 13:48:52.439519: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-09-25 13:48:52.484092: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-09-25 13:48:52.488673: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Model loaded successfully.
[1;30;43mLanguage: sk:[0m
Running classification:


I0000 00:00:1758800941.258255   15687 service.cc:145] XLA service 0x7fbda802de10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1758800941.258297   15687 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2025-09-25 13:49:01.364506: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-09-25 13:49:01.617889: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907



[1m 1/38[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:39[0m 4s/step

I0000 00:00:1758800944.139366   15687 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m37/38[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 197ms/step









[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 313ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.85      0.94      0.89       596
         1.0       0.93      0.83      0.88       596

    accuracy                           0.89      1192
   macro avg       0.89      0.89      0.88      1192
weighted avg       0.89      0.89      0.88      1192

[1;30;43mLanguage: pl:[0m
Running classification:
[1m37/38[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 198ms/step




[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 257ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.91      0.95      0.93       597
         1.0       0.95      0.91      0.93       597

    accuracy                           0.93      1194
   macro avg       0.93      0.93      0.93      1194
weighted avg       0.93      0.93      0.93      1194

[1;30;43mLanguage: cs:[0m
Running classification:
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 204ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.83      0.93      0.88       596
         1.0       0.92      0.81      0.86       596

    accuracy                           0.87      1192
   macro avg       0.87      0.87      0.87      1192
weighted avg       0.87      0.87      0.87      1192

[1;30;43mLanguage: bg:[0m
Running classification:
[1m33/34[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 199ms/step









[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 303ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.68      0.88      0.76       530
         1.0       0.83      0.58      0.68       530

    accuracy                           0.73      1060
   macro avg       0.75      0.73      0.72      1060
weighted avg       0.75      0.73      0.72      1060

[1;30;43mLanguage: ru:[0m
Running classification:
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 204ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.89      0.96      0.92       597
         1.0       0.95      0.88      0.92       597

    accuracy                           0.92      1194
   macro avg       0.92      0.92      0.92      1194
weighted avg       0.92      0.92      0.92      1194

[1;30;43mLanguage: uk:[0m
Running classification:
[1m37/38[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 198ms/step









[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 290ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.83      0.95      0.89       595
         1.0       0.94      0.81      0.87       595

    accuracy                           0.88      1190
   macro avg       0.89      0.88      0.88      1190
weighted avg       0.89      0.88      0.88      1190

[1;30;43mLanguage: zh:[0m
Running classification:
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 207ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.88      0.91      0.90       517
         1.0       0.91      0.88      0.89       517

    accuracy                           0.89      1034
   macro avg       0.89      0.89      0.89      1034
weighted avg       0.89      0.89      0.89      1034

[1;30;43mLanguage: hi:[0m
Running classification:
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 207ms/step
Done




[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 265ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.79      0.71      0.74       599
         1.0       0.73      0.81      0.77       599

    accuracy                           0.76      1198
   macro avg       0.76      0.76      0.76      1198
weighted avg       0.76      0.76      0.76      1198

[1;30;43mLanguage: es:[0m
Running classification:
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 207ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.79      0.90      0.84       599
         1.0       0.88      0.76      0.82       599

    accuracy                           0.83      1198
   macro avg       0.84      0.83      0.83      1198
weighted avg       0.84      0.83      0.83      1198

[1;30;43mLanguage: fr:[0m
Running classification:
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 206ms/step
Done









[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 311ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.99      0.83      0.90       465
         1.0       0.85      0.99      0.92       465

    accuracy                           0.91       930
   macro avg       0.92      0.91      0.91       930
weighted avg       0.92      0.91      0.91       930

[1;30;43mLanguage: pt:[0m
Running classification:
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 208ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.99      0.96      0.98       597
         1.0       0.96      0.99      0.98       597

    accuracy                           0.98      1194
   macro avg       0.98      0.98      0.98      1194
weighted avg       0.98      0.98      0.98      1194

[1;30;43mLanguage: de:[0m
Running classification:
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 209ms/step
Done

  instance.compile_from_config(compile_config)
2025-09-25 13:52:08.348909: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 771072000 exceeds 10% of free system memory.
  saveable.load_own_variables(weights_store.get(inner_path))


Model loaded successfully.
[1;30;43mLanguage: sk:[0m
Running classification:
[1m37/38[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 334ms/step











[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 523ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.89      0.93      0.91       596
         1.0       0.93      0.88      0.90       596

    accuracy                           0.91      1192
   macro avg       0.91      0.91      0.91      1192
weighted avg       0.91      0.91      0.91      1192

[1;30;43mLanguage: pl:[0m
Running classification:
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 395ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.95      0.95      0.95       597
         1.0       0.95      0.95      0.95       597

    accuracy                           0.95      1194
   macro avg       0.95      0.95      0.95      1194
weighted avg       0.95      0.95      0.95      1194

[1;30;43mLanguage: cs:[0m
Running classification:
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 335ms/step
Do











[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 477ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.71      0.93      0.81       530
         1.0       0.90      0.62      0.73       530

    accuracy                           0.77      1060
   macro avg       0.80      0.77      0.77      1060
weighted avg       0.80      0.77      0.77      1060

[1;30;43mLanguage: ru:[0m
Running classification:
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 334ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.91      0.96      0.93       597
         1.0       0.95      0.90      0.93       597

    accuracy                           0.93      1194
   macro avg       0.93      0.93      0.93      1194
weighted avg       0.93      0.93      0.93      1194

[1;30;43mLanguage: uk:[0m
Running classification:
[1m37/38[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 341ms/step











[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 468ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.86      0.95      0.90       595
         1.0       0.94      0.85      0.89       595

    accuracy                           0.90      1190
   macro avg       0.90      0.90      0.90      1190
weighted avg       0.90      0.90      0.90      1190

[1;30;43mLanguage: zh:[0m
Running classification:
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 340ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.87      0.91      0.89       517
         1.0       0.90      0.87      0.89       517

    accuracy                           0.89      1034
   macro avg       0.89      0.89      0.89      1034
weighted avg       0.89      0.89      0.89      1034

[1;30;43mLanguage: hi:[0m
Running classification:
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 344ms/step
Do











[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 537ms/step
Done.
              precision    recall  f1-score   support

         0.0       1.00      0.84      0.91       465
         1.0       0.86      1.00      0.92       465

    accuracy                           0.92       930
   macro avg       0.93      0.92      0.92       930
weighted avg       0.93      0.92      0.92       930

[1;30;43mLanguage: pt:[0m
Running classification:
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 348ms/step
Done.
              precision    recall  f1-score   support

         0.0       0.99      0.96      0.98       597
         1.0       0.96      0.99      0.98       597

    accuracy                           0.98      1194
   macro avg       0.98      0.98      0.98      1194
weighted avg       0.98      0.98      0.98      1194

[1;30;43mLanguage: de:[0m
Running classification:
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 357ms/step
Do

### Out-of-distribution (OOD) evaluation
- Evaluation of the fine-tuned models on the out-of-distribution set, which covers 4 additional languages obtained from the source datasets.

#### XLM-RoBERTa model:
<table>
<tr>
<td>

| Class            | Precision | Recall | F1-score | Support   |
| ---------------- | --------- | ------ | -------- | --------- |
| 0                | 0.95      | 0.78   | 0.86     | 15997     |
| 1                | 0.76      | 0.94   | 0.84     | 11764     |
| **Accuracy**     |           |        | **0.85** | **27761** |
| **Macro avg**    | 0.86      | 0.86   | 0.85     | 27761     |
| **Weighted avg** | 0.87      | 0.85   | 0.85     | 27761     |
                    MultiCW Overall.
</td>
<td>

| Class            | Precision | Recall | F1-score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | 0.93      | 0.76   | 0.84     | 7997     |
| 1                | 0.75      | 0.92   | 0.83     | 6117     |
| **Accuracy**     |           |        | **0.83** | **14114** |
| **Macro avg**    | 0.84      | 0.84   | 0.83     | 14114     |
| **Weighted avg** | 0.85      | 0.83   | 0.83     | 14114     |
                MultiCW Noisy Part.
</td>
<td>

| Class            | Precision | Recall | F1-score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | 0.97      | 0.80   | 0.88     | 8000     |
| 1                | 0.78      | 0.97   | 0.86     | 5647     |
| **Accuracy**     |           |        | **0.87** | **13647** |
| **Macro avg**    | 0.87      | 0.88   | 0.87     | 13647     |
| **Weighted avg** | 0.89      | 0.87   | 0.87     | 13647     |
                MultiCW Structured Part.
</td>
</tr>
</table>

#### mDeBERTa model:
<table>
<tr>
<td>

| Class            | Precision | Recall | F1-score | Support   |
| ---------------- | --------- | ------ | -------- | --------- |
| 0                | 0.97      | 0.79   | 0.87     | 15997     |
| 1                | 0.77      | 0.97   | 0.86     | 11764     |
| **Accuracy**     |           |        | **0.86** | **27761** |
| **Macro avg**    | 0.87      | 0.88   | 0.86     | 27761     |
| **Weighted avg** | 0.89      | 0.86   | 0.86     | 27761     |
                    MultiCW Overall.
</td>
<td>

| Class            | Precision | Recall | F1-score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | 0.96      | 0.78   | 0.86     | 7997     |
| 1                | 0.77      | 0.96   | 0.85     | 6117     |
| **Accuracy**     |           |        | **0.86** | **14114** |
| **Macro avg**    | 0.86      | 0.87   | 0.86     | 14114     |
| **Weighted avg** | 0.88      | 0.86   | 0.86     | 14114     |
                MultiCW Noisy Part.
</td>
<td>

| Class            | Precision | Recall | F1-score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | 0.98      | 0.79   | 0.88     | 8000     |
| 1                | 0.77      | 0.98   | 0.86     | 5647     |
| **Accuracy**     |           |        | **0.87** | **13647** |
| **Macro avg**    | 0.88      | 0.89   | 0.87     | 13647     |
| **Weighted avg** | 0.90      | 0.87   | 0.87     | 13647     |
                MultiCW Structured Part.
</td>
</tr>
</table>


In [5]:
# Out-of-distribution evaluation: for every fine-tuned model, print the report
# returned by `detect_claims` for the whole MultiCW OOD set and, separately,
# for its 'noisy' and 'struc' style subsets.
model = None
detector = None
models = ['xlm', 'mdb']

for model in models:
    # Instantiate the wrapper class that belongs to the model key.
    if model == 'xlm':
        print(f'{h_green}XLM-RoBERTa model:{h_stop}')
        detector = XLMRobertaModel()
    elif model == 'mdb':
        print(f'{h_green}mDBERTa model:{h_stop}')
        detector = MDeBertaModel()
    else:
        # Without this guard an unknown key would silently re-use the detector
        # left over from the previous iteration.
        raise ValueError(f'Unknown model key: {model!r}')

    # Single checkpoint name used for BOTH loading and fine-tuning. Previously the
    # fallback training stored the model as '<key>-multicw' while the loader looked
    # for '<key>-multicw-2e6-5e', so a freshly trained model was never found again.
    # The suffix mirrors the fine-tuning setup below (learning rate 2e-6, 5 epochs).
    checkpoint_name = f'{model}-multicw-2e6-5e'

    if not detector.load_model(model_name=checkpoint_name):
        print(f'{h_yellow}No model found. Initiating fine-tuning:{h_stop}')
        # Note: Works well with a small learning rate (2e-6 is used here).
        # NOTE(review): assumes train_model persists the model under `model_name` - confirm.
        detector.train_model(multicw_train, multicw_dev, epochs=5, learn_rate=2e-6, lang='en', model_name=checkpoint_name)

    # Whole OOD set. The first element of the returned pair is not needed here.
    print(f'{h_yellow}MultiCW overall:{h_stop}')
    _, report = detector.detect_claims(multicw_ood)
    print(report)

    # Samples whose 'style' column equals 'noisy'.
    test_noisy = multicw_ood.loc[multicw_ood['style']=='noisy']
    print(f'{h_yellow}MultiCW Noisy Part:{h_stop}')
    _, report = detector.detect_claims(test_noisy, verbose=False)
    print(report)

    # Samples whose 'style' column equals 'struc' (renamed from 'structured' at load time).
    test_strut = multicw_ood.loc[multicw_ood['style']=='struc']
    print(f'{h_yellow}MultiCW Structured Part:{h_stop}')
    _, report = detector.detect_claims(test_strut, verbose=False)
    print(report)

[1;30;42mXLM-RoBERTa model:[0m


2025-10-01 13:46:26.855076: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-10-01 13:46:26.980819: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-10-01 13:46:26.983829: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Model loaded successfully.
[1;30;43mMultiCW overall:[0m
Running classification:


I0000 00:00:1759319194.657170   23559 service.cc:145] XLA service 0x7f7b980022e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1759319194.657218   23559 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2025-10-01 13:46:34.729265: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-10-01 13:46:34.936323: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907



[1m  1/868[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m53:01[0m 4s/step

I0000 00:00:1759319197.122821   23559 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m867/868[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 195ms/step





[1m868/868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 199ms/step
Done.
              precision    recall  f1-score   support

           0       0.95      0.78      0.86     15997
           1       0.76      0.94      0.84     11764

    accuracy                           0.85     27761
   macro avg       0.86      0.86      0.85     27761
weighted avg       0.87      0.85      0.85     27761

[1;30;43mMultiCW Noisy Part:[0m
Running classification:
[1m441/442[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 198ms/step









[1m442/442[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 205ms/step
Done.
              precision    recall  f1-score   support

           0       0.93      0.76      0.84      7997
           1       0.75      0.92      0.83      6117

    accuracy                           0.83     14114
   macro avg       0.84      0.84      0.83     14114
weighted avg       0.85      0.83      0.83     14114

[1;30;43mMultiCW Structured Part:[0m
Running classification:
[1m426/427[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 200ms/step




[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 205ms/step
Done.
              precision    recall  f1-score   support

           0       0.97      0.80      0.88      8000
           1       0.78      0.97      0.86      5647

    accuracy                           0.87     13647
   macro avg       0.87      0.88      0.87     13647
weighted avg       0.89      0.87      0.87     13647

[1;30;42mmDBERTa model:[0m
Loading full model from: Models/mdb-multicw-2e6-5e.keras


  instance.compile_from_config(compile_config)
2025-10-01 13:52:49.381804: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 771072000 exceeds 10% of free system memory.
  saveable.load_own_variables(weights_store.get(inner_path))


Model loaded successfully.
[1;30;43mMultiCW overall:[0m
Running classification:
[1m868/868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 345ms/step
Done.
              precision    recall  f1-score   support

           0       0.97      0.79      0.87     15997
           1       0.77      0.97      0.86     11764

    accuracy                           0.86     27761
   macro avg       0.87      0.88      0.86     27761
weighted avg       0.89      0.86      0.86     27761

[1;30;43mMultiCW Noisy Part:[0m
Running classification:
[1m441/442[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 350ms/step











[1m442/442[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 360ms/step
Done.
              precision    recall  f1-score   support

           0       0.96      0.78      0.86      7997
           1       0.77      0.96      0.85      6117

    accuracy                           0.86     14114
   macro avg       0.86      0.87      0.86     14114
weighted avg       0.88      0.86      0.86     14114

[1;30;43mMultiCW Structured Part:[0m
Running classification:
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 354ms/step
Done.
              precision    recall  f1-score   support

           0       0.98      0.79      0.88      8000
           1       0.77      0.98      0.86      5647

    accuracy                           0.87     13647
   macro avg       0.88      0.89      0.87     13647
weighted avg       0.90      0.87      0.87     13647



### Evaluation of models on each language of the Out-of-distribution dataset

In [5]:
# Per-language out-of-distribution evaluation: for every fine-tuned model, print
# the report returned by `detect_claims` for each language of the OOD set.
model = None
detector = None
models = ['xlm', 'mdb']

# Languages present in the OOD set. The value does not depend on the model, so it
# is computed once instead of on every loop iteration.
# Note: this rebinds the notebook-level `languages` variable, as the cell did before.
languages = multicw_ood['lang'].unique()

for model in models:
    # Instantiate the wrapper class that belongs to the model key.
    if model == 'xlm':
        print(f'{h_green}XLM-RoBERTa model:{h_stop}')
        detector = XLMRobertaModel()
    elif model == 'mdb':
        print(f'{h_green}mDBERTa model:{h_stop}')
        detector = MDeBertaModel()
    else:
        # Without this guard an unknown key would silently re-use the detector
        # left over from the previous iteration.
        raise ValueError(f'Unknown model key: {model!r}')

    # Single checkpoint name used for BOTH loading and fine-tuning. Previously the
    # fallback training stored the model as '<key>-multicw' while the loader looked
    # for '<key>-multicw-2e6-5e', so a freshly trained model was never found again.
    # The suffix mirrors the fine-tuning setup below (learning rate 2e-6, 5 epochs).
    checkpoint_name = f'{model}-multicw-2e6-5e'

    if not detector.load_model(model_name=checkpoint_name):
        print(f'{h_yellow}No model found. Initiating fine-tuning:{h_stop}')
        # Note: Works well with a small learning rate (2e-6 is used here).
        # NOTE(review): assumes train_model persists the model under `model_name` - confirm.
        detector.train_model(multicw_train, multicw_dev, epochs=5, learn_rate=2e-6, lang='en', model_name=checkpoint_name)

    for lang in languages:
        print(f'{h_yellow}Language: {lang}:{h_stop}')
        # Evaluate only the rows of the current language; the first element of
        # the returned pair is not needed here.
        _, report = detector.detect_claims(multicw_ood[multicw_ood['lang'] == lang])
        print(report)

[1;30;42mXLM-RoBERTa model:[0m


2025-09-25 19:50:35.876046: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-09-25 19:50:35.937876: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-09-25 19:50:35.940863: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Model loaded successfully.
[1;30;43mLanguage: nl:[0m
Running classification:


I0000 00:00:1758822643.508835   28888 service.cc:145] XLA service 0x7f791402d3e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1758822643.508880   28888 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2025-09-25 19:50:43.584956: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-09-25 19:50:43.783665: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907



[1m  1/226[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m13:30[0m 4s/step

I0000 00:00:1758822645.910936   28888 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m225/226[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 193ms/step





[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 208ms/step
Done.
              precision    recall  f1-score   support

           0       0.88      0.73      0.80      4000
           1       0.72      0.88      0.79      3227

    accuracy                           0.79      7227
   macro avg       0.80      0.80      0.79      7227
weighted avg       0.81      0.79      0.79      7227

[1;30;43mLanguage: my:[0m
Running classification:
[1m228/229[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 195ms/step










[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 208ms/step
Done.
              precision    recall  f1-score   support

           0       0.99      0.52      0.69      4000
           1       0.63      1.00      0.77      3297

    accuracy                           0.74      7297
   macro avg       0.81      0.76      0.73      7297
weighted avg       0.83      0.74      0.73      7297

[1;30;43mLanguage: it:[0m
Running classification:
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 197ms/step
Done.
              precision    recall  f1-score   support

           0       0.97      0.89      0.93      4000
           1       0.90      0.97      0.94      4000

    accuracy                           0.93      8000
   macro avg       0.94      0.93      0.93      8000
weighted avg       0.94      0.93      0.93      8000

[1;30;43mLanguage: mk:[0m
Running classification:
[1m222/223[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 197ms/st





[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 208ms/step
Done.
              precision    recall  f1-score   support

           0       0.94      0.86      0.90      4000
           1       0.84      0.93      0.88      3123

    accuracy                           0.89      7123
   macro avg       0.89      0.89      0.89      7123
weighted avg       0.89      0.89      0.89      7123

[1;30;42mmDBERTa model:[0m
Loading full model from: Models/mdb-multicw-2e6-5e.keras


  instance.compile_from_config(compile_config)
2025-09-25 19:54:16.975197: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 771072000 exceeds 10% of free system memory.
  saveable.load_own_variables(weights_store.get(inner_path))


Model loaded successfully.
[1;30;43mLanguage: nl:[0m
Running classification:
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 347ms/step
Done.
              precision    recall  f1-score   support

           0       0.93      0.67      0.78      4000
           1       0.69      0.94      0.80      3227

    accuracy                           0.79      7227
   macro avg       0.81      0.80      0.79      7227
weighted avg       0.82      0.79      0.79      7227

[1;30;43mLanguage: my:[0m
Running classification:
[1m228/229[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 332ms/step













[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 353ms/step
Done.
              precision    recall  f1-score   support

           0       0.99      0.55      0.71      4000
           1       0.65      1.00      0.78      3297

    accuracy                           0.75      7297
   macro avg       0.82      0.77      0.75      7297
weighted avg       0.84      0.75      0.74      7297

[1;30;43mLanguage: it:[0m
Running classification:
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 334ms/step
Done.
              precision    recall  f1-score   support

           0       0.98      0.90      0.94      4000
           1       0.90      0.98      0.94      4000

    accuracy                           0.94      8000
   macro avg       0.94      0.94      0.94      8000
weighted avg       0.94      0.94      0.94      8000

[1;30;43mLanguage: mk:[0m
Running classification:
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 343ms/s