# Model fine-tuning
### LESA model fine-tuning on MultiCW and OOD datasets

## Initialization

#### Setup project paths:

In [1]:
import os
import sys
from os.path import join, exists
from py_markdown_table.markdown_table import markdown_table

from tqdm.notebook import tqdm

# Enable tqdm for pandas
tqdm.pandas()

# ANSI escape sequences used to highlight console output
# (see https://stackoverflow.com/a/21786287).
h_stop = '\x1b[0m'            # resets all attributes
h_red = '\x1b[1;30;41m'       # bold black text on a red background
h_green = '\x1b[1;30;42m'     # bold black text on a green background
h_yellow = '\x1b[1;30;43m'    # bold black text on a yellow background

## Project layout -- every path is derived from the current working directory.
project_path = os.getcwd()

# Source corpora
datasets_path = join(project_path, "Source datasets")
multiclaim_path = join(datasets_path, "MultiClaim")
lesa_dst_dir = join(datasets_path, 'LESA-EACL-2021')

# Final MultiCW dataset
multicw_path = join(project_path, 'Final-dataset')

# Model files
models_path = join(project_path, "Models")
lesa_model_path = join(models_path, 'LESA')

print('done')

done


## Datasets
Load the MultiCW and out-of-distribution (OOD) datasets used to fine-tune and evaluate the models.

In [2]:
# Load the MultiCW dataset splits (train / dev / test / out-of-distribution)
import pandas as pd
from os.path import join

# Dataset directory, resolved relative to the current working directory.
multicw_path = join("Final-dataset")

# Languages present in the full dataset.
languages = pd.read_csv(join(multicw_path, 'multicw-full.csv'))['lang'].unique()

multicw_train = pd.read_csv(join(multicw_path, "multicw-train.csv")).astype({'label': 'int'})
multicw_dev = pd.read_csv(join(multicw_path, "multicw-dev.csv")).astype({'label': 'int'})

# Read the test split once and cast both columns together. Previously the file was read twice and
# the second read (text -> str) replaced the first, silently dropping the integer cast of `label`.
multicw_test = pd.read_csv(join(multicw_path, "multicw-test.csv")).astype({'label': 'int', 'text': 'str'})

multicw_ood = pd.read_csv(join(multicw_path, "multicw-ood.csv")).astype({'label': 'int'})
# NOTE(review): the OOD style is renamed to 'struc', while the evaluation cells filter the test split
# on 'struct' -- confirm which spelling downstream code expects for the OOD set.
multicw_ood['style'] = multicw_ood['style'].replace('structured', 'struc')
# Missing OOD texts become empty strings so every entry is a `str`.
multicw_ood['text'] = multicw_ood['text'].fillna("").astype(str)

print('Loaded MultiCW:')
print(f'Train set: {multicw_train.shape[0]}')
print(f'Dev set: {multicw_dev.shape[0]}')
print(f'Test set: {multicw_test.shape[0]}')
print(f'Out-of-dist set: {multicw_ood.shape[0]}')

Loaded MultiCW:
Train set: 88001
Dev set: 14823
Test set: 14823
Out-of-dist set: 27761


## Model

### LESA model (EACL-2021)

In [3]:
import sys
import os
import shutil
import warnings
from typing import Callable
import numpy as np
import pandas as pd

from keras_preprocessing.sequence import pad_sequences
from pandas import DataFrame
from sklearn.metrics import classification_report
from keras.optimizers import Adam
from tqdm import tqdm
from transformers import BertTokenizer, AutoTokenizer


sys.path.insert(1, lesa_model_path)

from loader import sent2feature2ngram, ParentPositions, tokenize_sentences, create_attention_masks, \
    load_embedding_matrix, load_tokenizer, ind_model_noisy, ind_model_semi, ind_model_structured, final

# ANSI escape sequences for highlighted console messages
# (see https://stackoverflow.com/a/21786287).
gh_start = "\x1b[1;30;42m"  # start of a green highlight
rh_start = "\x1b[1;30;41m"  # start of a red highlight
h_stop = "\x1b[0m"          # end of any highlight


class LESAClaimModel:
    """Claim detector built on the LESA architecture.

    LESA: Linguistic Encapsulation and Semantic Amalgamation Based Generalised Claim Detection from Online
    Content, accepted at EACL 2021. ArXiv paper [link](https://arxiv.org/abs/2101.11891).

    The class wires together three auxiliary sub-models (noisy, semi-noisy, structured) and a final
    combined model, all built by helpers imported from the project's `loader` module. Tokenizers and
    embedding matrices are read from `lesa_model_path`; trained weights are stored under `models_path`.
    """

    # Sequence length of the BERT inputs: handed to `final(...)` as `max_seq_length` and used when
    # tokenizing and padding the texts in `semantic_embeddings`.
    bert_max_seq_length = 60

    # (attribute holding the sub-model, weight file name) for the three auxiliary sub-models.
    _SEMANTIC_WEIGHT_FILES = (
        ('noisy_model', '_dep_noisy.h5'),
        ('semi_model', '_dep_semi.h5'),
        ('structured_model', '_dep_structured.h5'),
    )
    # Weight file name of the final combined model.
    _FINAL_WEIGHT_FILE = '_bert_comb.h5'

    def __init__(self):
        """Load tokenizers and embedding matrices, then build the (untrained) model architecture."""
        self.max_len = 30      # padded length of the DEP / parent-POS / TAG sequences
        self.batch_size = 32   # batch size used by `train_model`

        self.final_model = None
        self.noisy_model = None
        self.semi_model = None
        self.structured_model = None

        self.noisy = None
        self.noisy_dev = None
        self.semi = None
        self.semi_dev = None
        self.struct = None
        self.struct_dev = None

        # When False, `train_model` reuses the weights shipped in `<lesa_model_path>/semantic_base`;
        # when True, it saves the current weights of the auxiliary sub-models instead.
        self.update_semantic_dbs = False

        print('Loading tokenizers: ', end='')

        # Multilingual BERT tokenizer, resolved by name through `AutoTokenizer.from_pretrained`.
        model = "bert-base-multilingual-cased"
        self.bert_tokenizer_transformer = AutoTokenizer.from_pretrained(model)

        # EMBEDDINGS
        self.noisy_embedding_matrix_tag = load_embedding_matrix(lesa_model_path, 'embedding_matrix_tag_noisy.pickle')
        self.noisy_vocab_size_tag = self.noisy_embedding_matrix_tag.shape[0]  # 5363

        self.semi_embedding_matrix_tag = load_embedding_matrix(lesa_model_path, 'embedding_matrix_tag_semi.pickle')
        self.semi_vocab_size_tag = self.semi_embedding_matrix_tag.shape[0]  # 6137

        self.structured_embedding_matrix_tag = load_embedding_matrix(lesa_model_path,
                                                                     'embedding_matrix_tag_structured.pickle')
        self.structured_vocab_size_tag = self.structured_embedding_matrix_tag.shape[0]  # 6048

        # PARENT POS TOKENIZER
        self.tokenizer_dep_parent_noisy = load_tokenizer(lesa_model_path, 'tokenizer_dep_parent_noisy.pickle')
        self.num_words_dep_parent_noisy = self.tokenizer_dep_parent_noisy.num_words  # 100

        self.tokenizer_dep_parent_semi = load_tokenizer(lesa_model_path, 'tokenizer_dep_parent_semi.pickle')
        self.num_words_dep_parent_semi = self.tokenizer_dep_parent_semi.num_words  # 200

        self.tokenizer_dep_parent_structured = load_tokenizer(lesa_model_path,
                                                              'tokenizer_dep_parent_structured.pickle')
        self.num_words_dep_parent_structured = self.tokenizer_dep_parent_structured.num_words  # 200

        # LABEL TOKENIZER
        self.tokenizer_dep_noisy = load_tokenizer(lesa_model_path, 'tokenizer_dep_noisy.pickle')
        self.num_words_dep_noisy = self.tokenizer_dep_noisy.num_words  # 6300

        self.tokenizer_dep_semi = load_tokenizer(lesa_model_path, 'tokenizer_dep_semi.pickle')
        self.num_words_dep_semi = self.tokenizer_dep_semi.num_words  # 7300

        self.tokenizer_dep_structured = load_tokenizer(lesa_model_path, 'tokenizer_dep_structured.pickle')
        self.num_words_dep_structured = self.tokenizer_dep_structured.num_words  # 7400

        # TAG TOKENIZER
        self.tokenizer_tag_noisy = load_tokenizer(lesa_model_path, 'tokenizer_tag_noisy.pickle')
        self.tokenizer_tag_semi = load_tokenizer(lesa_model_path, 'tokenizer_tag_semi.pickle')
        self.tokenizer_tag_structured = load_tokenizer(lesa_model_path, 'tokenizer_tag_structured.pickle')
        print('ok')

        self.final_model = self._init_model()

    def load_model(self, model_name='lesa2021') -> bool:
        """Load the weights of all sub-models from `<models_path>/<model_name>`.

        :param model_name: Name of the directory (inside `models_path`) holding the weight files.
        :return: True when every weight file was loaded; False when the directory does not exist or any
                 weight file fails to load (the reason is printed).
        """

        os.makedirs(models_path, exist_ok=True)

        model_dir = os.path.join(models_path, model_name)
        print(os.path.join(os.getcwd(), models_path, model_name))
        if not os.path.exists(model_dir):
            print(rh_start + 'Invalid path!' + h_stop)
            return False

        print(f'Loading {model_name} model: ', end='')
        try:
            for attribute, filename in self._SEMANTIC_WEIGHT_FILES:
                getattr(self, attribute).load_weights(os.path.join(model_dir, filename))

            self.final_model.load_weights(os.path.join(model_dir, self._FINAL_WEIGHT_FILE))
        except Exception as e:
            # Loading stays best-effort (callers fall back to training), but the cause is reported
            # instead of being swallowed so a corrupt or incomplete checkpoint can be diagnosed.
            print(rh_start + " failed!" + h_stop)
            print(f'Reason: {e}')
            return False

        print(gh_start + " ok" + h_stop)
        return True

    def detect_claims(self, test_set: DataFrame, verbose=False) -> tuple:
        """Run inference on `test_set` and evaluate the predictions against its labels.

        :param test_set: DataFrame with a 'text' column (model input) and a 'label' column (ground truth).
        :param verbose: When True, print the predicted verdict for every text.
        :return: Tuple `(report, report_str)`: scikit-learn's classification report as a dict and as text.
        """

        x = self.semantic_embeddings(test_set)

        print('Running classification:')
        metrics = self.final_model.predict(x)
        # NOTE(review): this assumes `predict` returns a sequence of outputs whose last element holds
        # the per-sample class scores of the final classifier -- confirm against `loader.final`.
        results = [np.argmax(el) for el in metrics[-1]]
        print('Done.')

        # Print sentences with classifications
        if verbose:
            for text, predicted in zip(test_set['text'], results):
                verdict = gh_start + "is" if predicted == 1 else rh_start + "is not"
                print(f"LESA classification: '{text}' {verdict} a claim.{h_stop}")

        # Compare against ground truth
        ground_truth = test_set['label']
        report = classification_report(ground_truth, results, output_dict=True)
        report_str = str(classification_report(ground_truth, results))

        return report, report_str

    def train_model(self, train_set: DataFrame, dev_set: DataFrame, epochs=1, learn_rate=3e-5, model_name='', lang='en'):
        """
        Fine-tune the final LESA model on `train_set` and save all weights to disk.

        The auxiliary sub-models are not trained here: depending on `self.update_semantic_dbs`, their
        weights are either loaded from `<lesa_model_path>/semantic_base` (and copied next to the final
        model) or saved as they currently are.

        Weights are always written to `<models_path>/<model_name>-<lang>-<epochs>e`; pass that full
        directory name to `load_model` to restore them.

        :param train_set: Training dataset with 'text' and 'label' columns.
        :param dev_set: Validation dataset with 'text' and 'label' columns.
        :param epochs: Number of training epochs.
        :param learn_rate: Learning rate of the Adam optimizer.
        :param model_name: Prefix of the directory the weights are saved to.
        :param lang: Training dataset language(s); only used in the directory name.
        :return: None.
        """

        model_name = f"{model_name}-{lang}-{epochs}e"
        path = os.path.join(models_path, model_name)
        semantic_path = os.path.join(lesa_model_path, 'semantic_base')
        os.makedirs(path, exist_ok=True)

        for attribute, filename in self._SEMANTIC_WEIGHT_FILES:
            semantic_model = getattr(self, attribute)
            target = os.path.join(path, filename)
            if not self.update_semantic_dbs:
                # Use the stock semantic weights and keep a copy next to the final model so that
                # `load_model` finds every file in one directory.
                source = os.path.join(semantic_path, filename)
                shutil.copyfile(src=source, dst=target)
                semantic_model.load_weights(source)
            else:
                # Custom semantic models: store them in the same folder as the final model.
                semantic_model.save_weights(target)
                print('Model saved to: ', target)

        print('Train Final model:')
        x = self.semantic_embeddings(train_set)
        dev = self.semantic_embeddings(dev_set)

        # NOTE(review): the optimizer is swapped by attribute assignment rather than by re-compiling the
        # model -- confirm this takes effect for the Keras version in use.
        self.final_model.optimizer = Adam(learning_rate=learn_rate)
        self.final_model.fit(x=x, y=train_set['label'], batch_size=self.batch_size, epochs=epochs,
                             validation_data=(dev, dev_set['label']))

        final_weights = os.path.join(path, self._FINAL_WEIGHT_FILE)
        self.final_model.save_weights(final_weights)
        print('Model saved to: ', final_weights)

    def _init_model(self):
        """Build the three auxiliary sub-models and return the final combined model.

        Also stores the sub-models on `self.noisy_model`, `self.semi_model` and `self.structured_model`.
        """
        print("Initializing model architecture: ")

        # Auxiliary sub-models, one per text style
        self.noisy_model = ind_model_noisy(embed_dim=20, num_heads=5, ff_dim=128,
                                           maxlen=self.max_len, vocab_label=self.num_words_dep_noisy,
                                           vocab_parent_pos=self.num_words_dep_parent_noisy)

        self.semi_model = ind_model_semi(embed_dim=20, num_heads=5, ff_dim=128,
                                         maxlen=self.max_len, vocab_label=self.num_words_dep_semi,
                                         vocab_parent_pos=self.num_words_dep_parent_semi)

        self.structured_model = ind_model_structured(embed_dim=20, num_heads=5, ff_dim=128,
                                                     maxlen=self.max_len, vocab_label=self.num_words_dep_structured,
                                                     vocab_parent_pos=self.num_words_dep_parent_structured)

        def tag_parameters(vocab_size, embedding_matrix):
            # Parameter dictionary describing one TAG embedding, as expected by `final`.
            return {
                "vocab_size_tag": vocab_size,
                "EMBEDDING_DIM_TAG": 20,
                "embedding_matrix_tag": embedding_matrix,
                "maxlen_tag": self.max_len
            }

        parameters_dict_noisy = tag_parameters(self.noisy_vocab_size_tag, self.noisy_embedding_matrix_tag)
        parameters_dict_semi = tag_parameters(self.semi_vocab_size_tag, self.semi_embedding_matrix_tag)
        parameters_dict_structured = tag_parameters(self.structured_vocab_size_tag,
                                                    self.structured_embedding_matrix_tag)

        final_model = final(lesa_model_path, self.noisy_model, self.semi_model, self.structured_model,
                            parameters_dict_noisy, parameters_dict_semi, parameters_dict_structured,
                            max_seq_length=self.bert_max_seq_length)

        print('ok')

        return final_model

    def semantic_embeddings(self, dataset: DataFrame):
        """Convert the 'text' column of `dataset` into the input dictionary of the final model.

        The caller's DataFrame is not modified; the feature columns are added to a copy.

        :param dataset: DataFrame with a 'text' column.
        :return: Dict of numpy arrays keyed by 'label_*', 'parent_pos_*' and 'inp_*' (for noisy, semi and
                 structured) plus 'input_word_ids' and 'input_masks'.
        """
        dataset = dataset.copy()

        print("Getting dependency and POS tags...")

        # Processing DEP tags
        progress = tqdm(dataset['text'].copy())
        progress.set_description('sentence2pos tags:')
        dataset.loc[:, 'DEP'] = [sent2feature2ngram(row) for row in progress]

        # Processing parent POS tags
        progress = tqdm(dataset['text'].copy())
        progress.set_description('Parent POS tags:')
        dataset.loc[:, "parent_pos"] = [ParentPositions(row) for row in progress]

        # Processing TAG tags
        progress = tqdm(dataset['text'].copy())
        progress.set_description('sentence2tag tags:')
        dataset.loc[:, 'TAG'] = [sent2feature2ngram(row, feature="TAG") for row in progress]

        print("POS Tags: complete!")

        # DEP and parent-POS sequences, encoded with each style's tokenizer and padded to `max_len`
        print("Getting dependency and DEP tags...")
        noisy = pad_sequences(self.tokenizer_dep_noisy.texts_to_sequences(dataset['DEP']),
                              maxlen=self.max_len)
        semi = pad_sequences(self.tokenizer_dep_semi.texts_to_sequences(dataset['DEP']),
                             maxlen=self.max_len)
        structured = pad_sequences(self.tokenizer_dep_structured.texts_to_sequences(dataset['DEP']),
                                   maxlen=self.max_len)

        p_noisy = pad_sequences(self.tokenizer_dep_parent_noisy.texts_to_sequences(dataset['parent_pos']),
                                maxlen=self.max_len)
        p_semi = pad_sequences(self.tokenizer_dep_parent_semi.texts_to_sequences(dataset['parent_pos']),
                               maxlen=self.max_len)
        p_structured = pad_sequences(
            self.tokenizer_dep_parent_structured.texts_to_sequences(dataset['parent_pos']),
            maxlen=self.max_len)

        print("DEP Tags: complete!")

        # BERT inputs: token ids padded/truncated at the end, plus the matching attention masks
        print("Creating BERT Embeddings...")
        input_ids = tokenize_sentences(dataset['text'], self.bert_tokenizer_transformer,
                                       self.bert_max_seq_length)
        input_ids = pad_sequences(input_ids, maxlen=self.bert_max_seq_length, dtype="long", value=0,
                                  truncating="post", padding="post")
        attention_masks = create_attention_masks(input_ids)

        # TAG sequences
        tag_noisy = self.tokenizer_tag_noisy.texts_to_sequences(dataset['TAG'])
        tag_semi = self.tokenizer_tag_semi.texts_to_sequences(dataset['TAG'])
        tag_structured = self.tokenizer_tag_structured.texts_to_sequences(dataset['TAG'])

        # Clamp out-of-range indices to the last row of the noisy TAG embedding matrix.
        # NOTE(review): only the noisy inputs are clamped, and the DEP / parent-POS sequences are clamped
        # against the TAG vocabulary size -- confirm this is intended and whether the semi / structured
        # inputs need the same treatment.
        last_noisy_index = self.noisy_vocab_size_tag - 1
        noisy[noisy >= self.noisy_vocab_size_tag] = last_noisy_index
        p_noisy[p_noisy >= self.noisy_vocab_size_tag] = last_noisy_index
        tag_noisy = np.array([np.array(sequence) for sequence in tqdm(tag_noisy)], dtype=object)
        for sequence in tag_noisy:
            sequence[sequence >= self.noisy_vocab_size_tag] = last_noisy_index

        tag_noisy = pad_sequences(tag_noisy, maxlen=self.max_len)
        tag_semi = pad_sequences(tag_semi, maxlen=self.max_len)
        tag_structured = pad_sequences(tag_structured, maxlen=self.max_len)

        x = {"label_noisy": np.array(noisy), "parent_pos_noisy": np.array(p_noisy),
             "label_semi": np.array(semi), "parent_pos_semi": np.array(p_semi),
             "label_structured": np.array(structured),
             "parent_pos_structured": np.array(p_structured),
             "inp_noisy": np.array(tag_noisy),
             "inp_semi": np.array(tag_semi),
             "inp_structured": np.array(tag_structured),
             'input_word_ids': np.array(input_ids), 'input_masks': np.array(attention_masks)}

        return x




2025-10-01 14:03:38.037590: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-10-01 14:03:38.041216: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcufft.so.10'; dlerror: libcufft.so.10: cannot open shared object file: No such file or directory
2025-10-01 14:03:38.204706: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusparse.so.11'; dlerror: libcusparse.so.11: cannot open shared object file: No such file or directory
2025-10-01 14:03:38.204769: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the requir

## Experiments

### Models fine-tuning on MultiCW dataset
- Fine-tuning of XLM-RoBERTa, mDeBERTa and LESA models on the MultiCW train set
- Evaluation on MultiCW test set
#### LESA model:
<table>
<tr>
<td>

| Class            | Precision | Recall | F1‑score | Support   |
| ---------------- | --------- | ------ | -------- | --------- |
| 0                | **0.84**  | **0.72** | **0.78** | 9269      |
| 1                | **0.75**  | **0.87** | **0.80** | 9164      |
| **Accuracy**     |           |        | **0.79** | **18433** |
| **Macro avg**    | 0.80      | 0.79   | 0.79     | 18433     |
| **Weighted avg** | 0.80      | 0.79   | 0.79     | 18433     |
                    MultiCW Overall.
</td>
<td>

| Class            | Precision | Recall | F1‑score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | **0.78**  | **0.57** | **0.66** | 4744     |
| 1                | **0.66**  | **0.83** | **0.73** | 4639     |
| **Accuracy**     |           |        | **0.70** | **9383** |
| **Macro avg**    | 0.72      | 0.70   | 0.70     | 9383      |
| **Weighted avg** | 0.72      | 0.70   | 0.70     | 9383      |
                MultiCW Noisy Part.
</td>
<td>

| Class            | Precision | Recall | F1‑score | Support  |
| ---------------- | --------- | ------ | -------- | -------- |
| 0                | **0.89**  | **0.87** | **0.88** | 4525     |
| 1                | **0.88**  | **0.90** | **0.89** | 4525     |
| **Accuracy**     |           |        | **0.88** | **9050** |
| **Macro avg**    | 0.88      | 0.88   | 0.88     | 9050     |
| **Weighted avg** | 0.88      | 0.88   | 0.88     | 9050     |
                MultiCW Structured Part.
</td>
</tr>
</table>

In [14]:
print(f'{h_green}LESA model:{h_stop}')
detector = LESAClaimModel()

# Directories that may hold the fine-tuned weights, tried in order. `train_model` saves to
# "<model_name>-<lang>-<epochs>e" (i.e. 'lesa-multicw-en-5e' for the call below), which differs from
# the originally used 'lesa-multicw-2e6-5e'; checking both lets a re-run pick up a freshly trained
# model instead of fine-tuning again.
model_dirs = ('lesa-multicw-2e6-5e', 'lesa-multicw-en-5e')

if not any(detector.load_model(model_name=name) for name in model_dirs):
    print(f'{h_yellow}No model found. Initiating fine-tuning:{h_stop}')
    # Note: Works well with a small learning rate (e.g. 3e-6)
    detector.train_model(multicw_train, multicw_dev, epochs=5, learn_rate=2e-6, lang='en', model_name='lesa-multicw')

# Whole test split
print(f'{h_green}MultiCW overall:{h_stop}')
_, report = detector.detect_claims(multicw_test)
print(report)

# Noisy part of the test split
test_noisy = multicw_test.loc[multicw_test['style'] == 'noisy']
_, report = detector.detect_claims(test_noisy, verbose=False)
print(f'{h_yellow}MultiCW Noisy Part:{h_stop}')
print(report)

# Structured part of the test split
test_struc = multicw_test.loc[multicw_test['style'] == 'struct']
_, report = detector.detect_claims(test_struc, verbose=False)
print(f'{h_yellow}MultiCW Structured Part:{h_stop}')
print(report)

[1;30;42mLESA model:[0m
Loading tokenizers: ok
Initializing model architecture: 


All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at /home/hyben/KInIT/Projects/veraAI/T4.1-Claim detection/MultiCW/Models/LESA/bert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
sentence2pos tags::  14%|██             | 1269/9394 [1:18:57<8:25:30,  3.73s/it]
Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequence

ok
/home/hyben/KInIT/Projects/veraAI/T4.1-Claim detection/MultiCW/Models/lesa-multicw-2e6-5e
Loading lesa-multicw-2e6-5e model: [1;30;42m ok[0m
[1;30;42mMultiCW overall:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|████████████████| 18433/18433 [01:40<00:00, 182.53it/s]
Parent POS tags:: 100%|██████████████████| 18433/18433 [01:33<00:00, 197.40it/s]
sentence2tag tags:: 100%|████████████████| 18433/18433 [01:50<00:00, 166.29it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|███████████████████████████████████| 18433/18433 [00:03<00:00, 5535.35it/s]
100%|█████████████████████████████████| 18433/18433 [00:00<00:00, 271617.95it/s]


Running classification:
Done.
              precision    recall  f1-score   support

           0       0.84      0.72      0.78      9269
           1       0.75      0.87      0.80      9164

    accuracy                           0.79     18433
   macro avg       0.80      0.79      0.79     18433
weighted avg       0.80      0.79      0.79     18433

Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████| 9383/9383 [00:40<00:00, 231.20it/s]
Parent POS tags:: 100%|████████████████████| 9383/9383 [00:37<00:00, 253.46it/s]
sentence2tag tags:: 100%|██████████████████| 9383/9383 [00:36<00:00, 254.19it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|████████████████████████████████████| 9383/9383 [00:00<00:00, 10628.28it/s]
100%|███████████████████████████████████| 9383/9383 [00:00<00:00, 383593.46it/s]


Running classification:
Done.
[1;30;43mMultiCW Noisy Part:[0m
              precision    recall  f1-score   support

           0       0.78      0.57      0.66      4744
           1       0.66      0.83      0.73      4639

    accuracy                           0.70      9383
   macro avg       0.72      0.70      0.70      9383
weighted avg       0.72      0.70      0.70      9383

Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████| 9050/9050 [00:34<00:00, 262.05it/s]
Parent POS tags:: 100%|████████████████████| 9050/9050 [00:33<00:00, 266.80it/s]
sentence2tag tags:: 100%|██████████████████| 9050/9050 [00:34<00:00, 265.30it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|████████████████████████████████████| 9050/9050 [00:00<00:00, 12492.62it/s]
100%|███████████████████████████████████| 9050/9050 [00:00<00:00, 409896.35it/s]


Running classification:
Done.
[1;30;43mMultiCW Structured Part:[0m
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      4525
           1       0.88      0.90      0.89      4525

    accuracy                           0.88      9050
   macro avg       0.88      0.88      0.88      9050
weighted avg       0.88      0.88      0.88      9050



### Evaluation of LESA model on each language of the MultiCW dataset

In [4]:
print(f'{h_green}LESA model:{h_stop}')
detector = LESAClaimModel()

# Directories that may hold the fine-tuned weights, tried in order. `train_model` saves to
# "<model_name>-<lang>-<epochs>e" (i.e. 'lesa-multicw-en-5e' for the call below), which differs from
# the originally used 'lesa-multicw-2e6-5e'; checking both lets a re-run pick up a freshly trained
# model instead of fine-tuning again.
model_dirs = ('lesa-multicw-2e6-5e', 'lesa-multicw-en-5e')

if not any(detector.load_model(model_name=name) for name in model_dirs):
    print(f'{h_yellow}No model found. Initiating fine-tuning:{h_stop}')
    # Note: Works well with a small learning rate (e.g. 3e-6)
    detector.train_model(multicw_train, multicw_dev, epochs=5, learn_rate=2e-6, lang='en', model_name='lesa-multicw')

# Evaluate each language of the test split separately.
languages = multicw_test['lang'].unique()

for lang in languages:
    print(f'{h_yellow}Language: {lang}:{h_stop}')
    _, report = detector.detect_claims(multicw_test[multicw_test['lang'] == lang])
    print(report)

[1;30;42mLESA model:[0m
Loading tokenizers: ok
Initializing model architecture: 


2025-09-25 21:01:36.669532: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at /home/hyben/KInIT/Projects/veraAI/T4.1-Claim detection/MultiCW/Models/LESA/bert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another arch

ok
/home/hyben/KInIT/Projects/veraAI/T4.1-Claim detection/MultiCW/Models/lesa-multicw-2e6-5e
Loading lesa-multicw-2e6-5e model: [1;30;42m ok[0m
[1;30;43mLanguage: sk:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████| 1192/1192 [00:05<00:00, 206.90it/s]
Parent POS tags:: 100%|████████████████████| 1192/1192 [00:05<00:00, 225.29it/s]
sentence2tag tags:: 100%|██████████████████| 1192/1192 [00:06<00:00, 189.45it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|████████████████████████████████████| 1192/1192 [00:00<00:00, 11538.16it/s]
100%|███████████████████████████████████| 1192/1192 [00:00<00:00, 378185.35it/s]


Running classification:
Done.
              precision    recall  f1-score   support

         0.0       0.81      0.73      0.77       596
         1.0       0.75      0.82      0.79       596

    accuracy                           0.78      1192
   macro avg       0.78      0.78      0.78      1192
weighted avg       0.78      0.78      0.78      1192

[1;30;43mLanguage: pl:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████| 1194/1194 [00:05<00:00, 206.37it/s]
Parent POS tags:: 100%|████████████████████| 1194/1194 [00:04<00:00, 243.47it/s]
sentence2tag tags:: 100%|██████████████████| 1194/1194 [00:04<00:00, 257.21it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|████████████████████████████████████| 1194/1194 [00:00<00:00, 12512.21it/s]
100%|███████████████████████████████████| 1194/1194 [00:00<00:00, 338836.20it/s]


Running classification:
Done.
              precision    recall  f1-score   support

         0.0       0.87      0.77      0.82       597
         1.0       0.79      0.89      0.84       597

    accuracy                           0.83      1194
   macro avg       0.83      0.83      0.83      1194
weighted avg       0.83      0.83      0.83      1194

[1;30;43mLanguage: cs:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████| 1192/1192 [00:05<00:00, 211.37it/s]
Parent POS tags:: 100%|████████████████████| 1192/1192 [00:05<00:00, 222.38it/s]
sentence2tag tags:: 100%|██████████████████| 1192/1192 [00:04<00:00, 241.19it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|████████████████████████████████████| 1192/1192 [00:00<00:00, 10386.92it/s]
100%|███████████████████████████████████| 1192/1192 [00:00<00:00, 381271.29it/s]


Running classification:
Done.
              precision    recall  f1-score   support

         0.0       0.78      0.78      0.78       596
         1.0       0.78      0.78      0.78       596

    accuracy                           0.78      1192
   macro avg       0.78      0.78      0.78      1192
weighted avg       0.78      0.78      0.78      1192

[1;30;43mLanguage: bg:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████| 1060/1060 [00:05<00:00, 205.95it/s]
Parent POS tags:: 100%|████████████████████| 1060/1060 [00:04<00:00, 241.05it/s]
sentence2tag tags:: 100%|██████████████████| 1060/1060 [00:05<00:00, 206.81it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|█████████████████████████████████████| 1060/1060 [00:00<00:00, 2798.30it/s]
100%|███████████████████████████████████| 1060/1060 [00:00<00:00, 362432.73it/s]


Running classification:
Done.
              precision    recall  f1-score   support

         0.0       0.54      0.45      0.49       530
         1.0       0.53      0.61      0.57       530

    accuracy                           0.53      1060
   macro avg       0.53      0.53      0.53      1060
weighted avg       0.53      0.53      0.53      1060

[1;30;43mLanguage: ru:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████| 1194/1194 [00:04<00:00, 255.25it/s]
Parent POS tags:: 100%|████████████████████| 1194/1194 [00:04<00:00, 265.45it/s]
sentence2tag tags:: 100%|██████████████████| 1194/1194 [00:04<00:00, 265.42it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|████████████████████████████████████| 1194/1194 [00:00<00:00, 11253.60it/s]
100%|███████████████████████████████████| 1194/1194 [00:00<00:00, 406526.42it/s]


Running classification:
Done.
              precision    recall  f1-score   support

         0.0       0.86      0.85      0.86       597
         1.0       0.85      0.86      0.86       597

    accuracy                           0.86      1194
   macro avg       0.86      0.86      0.86      1194
weighted avg       0.86      0.86      0.86      1194

[1;30;43mLanguage: uk:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████| 1190/1190 [00:06<00:00, 191.15it/s]
Parent POS tags:: 100%|████████████████████| 1190/1190 [00:05<00:00, 236.81it/s]
sentence2tag tags:: 100%|██████████████████| 1190/1190 [00:07<00:00, 165.61it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|█████████████████████████████████████| 1190/1190 [00:00<00:00, 3818.37it/s]
100%|███████████████████████████████████| 1190/1190 [00:00<00:00, 398246.37it/s]


Running classification:
Done.
              precision    recall  f1-score   support

         0.0       0.81      0.81      0.81       595
         1.0       0.81      0.81      0.81       595

    accuracy                           0.81      1190
   macro avg       0.81      0.81      0.81      1190
weighted avg       0.81      0.81      0.81      1190

[1;30;43mLanguage: zh:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████| 1034/1034 [00:04<00:00, 255.00it/s]
Parent POS tags:: 100%|████████████████████| 1034/1034 [00:02<00:00, 371.56it/s]
sentence2tag tags:: 100%|██████████████████| 1034/1034 [00:03<00:00, 341.15it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|████████████████████████████████████| 1034/1034 [00:00<00:00, 11459.00it/s]
100%|███████████████████████████████████| 1034/1034 [00:00<00:00, 485113.01it/s]


Running classification:
Done.
              precision    recall  f1-score   support

         0.0       0.87      0.80      0.84       517
         1.0       0.82      0.88      0.85       517

    accuracy                           0.84      1034
   macro avg       0.85      0.84      0.84      1034
weighted avg       0.85      0.84      0.84      1034

[1;30;43mLanguage: hi:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████| 1194/1194 [00:05<00:00, 232.89it/s]
Parent POS tags:: 100%|████████████████████| 1194/1194 [00:04<00:00, 255.46it/s]
sentence2tag tags:: 100%|██████████████████| 1194/1194 [00:04<00:00, 257.13it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|████████████████████████████████████| 1194/1194 [00:00<00:00, 12070.75it/s]
100%|███████████████████████████████████| 1194/1194 [00:00<00:00, 402119.72it/s]


Running classification:
Done.
              precision    recall  f1-score   support

         0.0       0.93      0.79      0.86       597
         1.0       0.82      0.94      0.88       597

    accuracy                           0.87      1194
   macro avg       0.88      0.87      0.87      1194
weighted avg       0.88      0.87      0.87      1194

[1;30;43mLanguage: en:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████| 1200/1200 [00:08<00:00, 149.62it/s]
Parent POS tags:: 100%|████████████████████| 1200/1200 [00:08<00:00, 148.51it/s]
sentence2tag tags:: 100%|██████████████████| 1200/1200 [00:05<00:00, 213.03it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|████████████████████████████████████| 1200/1200 [00:00<00:00, 10665.16it/s]
100%|███████████████████████████████████| 1200/1200 [00:00<00:00, 379060.46it/s]


Running classification:
Done.
              precision    recall  f1-score   support

         0.0       0.69      0.67      0.68       600
         1.0       0.68      0.70      0.69       600

    accuracy                           0.68      1200
   macro avg       0.68      0.68      0.68      1200
weighted avg       0.68      0.68      0.68      1200

[1;30;43mLanguage: ar:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████| 1198/1198 [00:09<00:00, 131.85it/s]
Parent POS tags:: 100%|████████████████████| 1198/1198 [00:05<00:00, 218.49it/s]
sentence2tag tags:: 100%|██████████████████| 1198/1198 [00:05<00:00, 200.63it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|█████████████████████████████████████| 1198/1198 [00:00<00:00, 9699.31it/s]
100%|███████████████████████████████████| 1198/1198 [00:00<00:00, 382723.45it/s]


Running classification:
Done.
              precision    recall  f1-score   support

         0.0       0.77      0.48      0.59       599
         1.0       0.62      0.86      0.72       599

    accuracy                           0.67      1198
   macro avg       0.70      0.67      0.66      1198
weighted avg       0.70      0.67      0.66      1198

[1;30;43mLanguage: es:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████| 1198/1198 [00:08<00:00, 138.44it/s]
Parent POS tags:: 100%|████████████████████| 1198/1198 [00:07<00:00, 167.36it/s]
sentence2tag tags:: 100%|██████████████████| 1198/1198 [00:08<00:00, 134.66it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|█████████████████████████████████████| 1198/1198 [00:00<00:00, 3366.34it/s]
100%|███████████████████████████████████| 1198/1198 [00:00<00:00, 176648.84it/s]


Running classification:
Done.
              precision    recall  f1-score   support

         0.0       0.85      0.63      0.72       599
         1.0       0.71      0.89      0.79       599

    accuracy                           0.76      1198
   macro avg       0.78      0.76      0.76      1198
weighted avg       0.78      0.76      0.76      1198

[1;30;43mLanguage: fr:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████| 1194/1194 [00:07<00:00, 161.88it/s]
Parent POS tags:: 100%|████████████████████| 1194/1194 [00:05<00:00, 215.16it/s]
sentence2tag tags:: 100%|██████████████████| 1194/1194 [00:07<00:00, 150.10it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|████████████████████████████████████| 1194/1194 [00:00<00:00, 13575.53it/s]
100%|███████████████████████████████████| 1194/1194 [00:00<00:00, 442090.31it/s]


Running classification:
Done.
              precision    recall  f1-score   support

         0.0       0.93      0.83      0.88       597
         1.0       0.84      0.94      0.89       597

    accuracy                           0.88      1194
   macro avg       0.89      0.88      0.88      1194
weighted avg       0.89      0.88      0.88      1194

[1;30;43mLanguage: tr:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████| 1184/1184 [00:06<00:00, 190.06it/s]
Parent POS tags:: 100%|████████████████████| 1184/1184 [00:05<00:00, 236.55it/s]
sentence2tag tags:: 100%|██████████████████| 1184/1184 [00:05<00:00, 212.88it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|█████████████████████████████████████| 1184/1184 [00:00<00:00, 9649.72it/s]
100%|███████████████████████████████████| 1184/1184 [00:00<00:00, 388732.36it/s]


Running classification:
Done.
              precision    recall  f1-score   support

         0.0       0.79      0.48      0.60       592
         1.0       0.63      0.87      0.73       592

    accuracy                           0.68      1184
   macro avg       0.71      0.68      0.66      1184
weighted avg       0.71      0.68      0.66      1184

[1;30;43mLanguage: bn:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|████████████████████| 930/930 [00:05<00:00, 182.19it/s]
Parent POS tags:: 100%|██████████████████████| 930/930 [00:03<00:00, 271.75it/s]
sentence2tag tags:: 100%|████████████████████| 930/930 [00:03<00:00, 260.09it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|██████████████████████████████████████| 930/930 [00:00<00:00, 11298.56it/s]
100%|█████████████████████████████████████| 930/930 [00:00<00:00, 428931.46it/s]


Running classification:
Done.
              precision    recall  f1-score   support

         0.0       0.93      0.69      0.79       465
         1.0       0.75      0.95      0.84       465

    accuracy                           0.82       930
   macro avg       0.84      0.82      0.81       930
weighted avg       0.84      0.82      0.81       930

[1;30;43mLanguage: pt:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████| 1194/1194 [00:09<00:00, 126.76it/s]
Parent POS tags:: 100%|█████████████████████| 1194/1194 [00:12<00:00, 93.25it/s]
sentence2tag tags:: 100%|██████████████████| 1194/1194 [00:07<00:00, 158.00it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|████████████████████████████████████| 1194/1194 [00:00<00:00, 12515.49it/s]
100%|███████████████████████████████████| 1194/1194 [00:00<00:00, 394765.80it/s]


Running classification:
Done.
              precision    recall  f1-score   support

         0.0       0.96      0.86      0.91       597
         1.0       0.87      0.96      0.92       597

    accuracy                           0.91      1194
   macro avg       0.92      0.91      0.91      1194
weighted avg       0.92      0.91      0.91      1194

[1;30;43mLanguage: de:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████| 1192/1192 [00:05<00:00, 237.81it/s]
Parent POS tags:: 100%|████████████████████| 1192/1192 [00:04<00:00, 255.31it/s]
sentence2tag tags:: 100%|██████████████████| 1192/1192 [00:05<00:00, 228.22it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|████████████████████████████████████| 1192/1192 [00:00<00:00, 12520.03it/s]
100%|███████████████████████████████████| 1192/1192 [00:00<00:00, 390411.55it/s]


Running classification:
Done.
              precision    recall  f1-score   support

         0.0       0.91      0.85      0.88       596
         1.0       0.86      0.91      0.89       596

    accuracy                           0.88      1192
   macro avg       0.88      0.88      0.88      1192
weighted avg       0.88      0.88      0.88      1192



### Out-of-domain evaluation
- Evaluation of the fine-tuned models on manually collected samples from factcheck.afp.com, with the preprocessing applied.

In [4]:
# Evaluate the fine-tuned LESA model on the MultiCW out-of-distribution split:
# first on the whole split, then separately on its noisy and structured parts.
print(f'{h_green}LESA model:{h_stop}')
detector = LESAClaimModel()

# Reuse the saved checkpoint when it loads; fine-tune only as a fallback.
# NOTE(review): the checkpoint is loaded as 'lesa-multicw-2e6-5e' but training is
# started with model_name='lesa-multicw' -- presumably train_model() appends the
# learning-rate/epoch suffix itself; confirm, otherwise the freshly trained model
# may not be found by load_model() on the next run.
if not detector.load_model(model_name='lesa-multicw-2e6-5e'):
    print(f'{h_yellow}No model found. Initiating fine-tuning:{h_stop}')
    # Note: Works well with a small learning rate (e.g. 3e-6)
    detector.train_model(multicw_train, multicw_dev, epochs=5, learn_rate=2e-6, lang='en', model_name='lesa-multicw')

print(f'{h_green}MultiCW overall:{h_stop}')
_, report = detector.detect_claims(multicw_ood)
print(report)

# Per-style subsets of the OOD split ('structured' is shortened to 'struc' when
# the dataset is loaded). Kept as named variables so later cells can reuse them.
test_noisy = multicw_ood.loc[multicw_ood['style'] == 'noisy']
test_struc = multicw_ood.loc[multicw_ood['style'] == 'struc']

for title, subset in (('MultiCW Noisy Part:', test_noisy),
                      ('MultiCW Structured Part:', test_struc)):
    # Print the heading first (as done for the overall run) so that any log
    # lines emitted by detect_claims appear under the section they belong to.
    print(f'{h_yellow}{title}{h_stop}')
    _, report = detector.detect_claims(subset, verbose=False)
    print(report)

[1;30;42mLESA model:[0m
Loading tokenizers: ok
Initializing model architecture: 


2025-10-01 14:04:10.484076: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at /home/hyben/KInIT/Projects/veraAI/T4.1-Claim detection/MultiCW/Models/LESA/bert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
2025-10-01 14:04:16.227942: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 367248384 exceeds 10% of free system memory.
2025-10-01 14:04:16.387841: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 367248384 exceeds 10% of free syste

ok
/home/hyben/KInIT/Projects/veraAI/T4.1-Claim detection/MultiCW/Models/lesa-multicw-2e6-5e
Loading lesa-multicw-2e6-5e model: [1;30;42m ok[0m
[1;30;42mMultiCW overall:[0m
Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 27761/27761 [01:56<00:00, 237.97it/s]
Parent POS tags:: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 27761/27761 [01:50<00:00, 250.17it/s]
sentence2tag tags:: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 27761/27761 [01:54<00:00, 242.14it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27761/27761 [00:03<00:00, 7511.49it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27761/27761 [00:00<00:00, 409996.10it/s]


Running classification:
Done.
              precision    recall  f1-score   support

           0       0.86      0.64      0.73     15997
           1       0.64      0.86      0.73     11764

    accuracy                           0.73     27761
   macro avg       0.75      0.75      0.73     27761
weighted avg       0.77      0.73      0.73     27761

Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 14114/14114 [01:05<00:00, 215.14it/s]
Parent POS tags:: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 14114/14114 [01:04<00:00, 218.20it/s]
sentence2tag tags:: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 14114/14114 [01:05<00:00, 215.22it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14114/14114 [00:02<00:00, 5884.17it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14114/14114 [00:00<00:00, 377170.42it/s]


Running classification:
Done.
[1;30;43mMultiCW Noisy Part:[0m
              precision    recall  f1-score   support

           0       0.81      0.50      0.62      7997
           1       0.57      0.85      0.68      6117

    accuracy                           0.65     14114
   macro avg       0.69      0.67      0.65     14114
weighted avg       0.71      0.65      0.65     14114

Getting dependency and POS tags...


sentence2pos tags:: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 13647/13647 [00:50<00:00, 272.06it/s]
Parent POS tags:: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 13647/13647 [00:48<00:00, 278.57it/s]
sentence2tag tags:: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 13647/13647 [00:48<00:00, 278.98it/s]


POS Tags: complete!
Getting dependency and DEP tags...
DEP Tags: complete!
Creating BERT Embeddings...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13647/13647 [00:01<00:00, 9992.04it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13647/13647 [00:00<00:00, 412960.77it/s]


Running classification:
Done.
[1;30;43mMultiCW Structured Part:[0m
              precision    recall  f1-score   support

           0       0.90      0.78      0.83      8000
           1       0.73      0.87      0.80      5647

    accuracy                           0.82     13647
   macro avg       0.82      0.83      0.82     13647
weighted avg       0.83      0.82      0.82     13647

