## Imports & constants

In [2]:
import os
from typing import List
import json
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shutil
import sys
import logging 

logging.basicConfig(
     level=logging.INFO, 
     format= '[%(asctime)s|%(levelname)s|%(module)s.py:%(lineno)s] %(message)s',
     datefmt='%H:%M:%S'
 )
import tqdm.notebook as tq
from tqdm import tqdm
# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, 
    TrainingArguments, Trainer, EarlyStoppingCallback, IntervalStrategy
)

from defi_textmine_2025.data.utils import TARGET_COL, INTERIM_DIR, MODELS_DIR

In [4]:
RANDOM_SEED = 123  # random reproducibility
# Seed both NumPy and PyTorch: seeding NumPy alone leaves every torch-side
# random initialisation different on each kernel restart.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Alternative base checkpoints that were considered:
# BASE_CHECKPOINT = "bert-base-uncased"
# BASE_CHECKPOINT = "bert-base-multilingual-cased"
BASE_CHECKPOINT = "camembert/camembert-base"
TASK_NAME = "single_multilabel_model"  # used to name checkpoint / log directories
TASK_INPUT_COL = "input_text"  # dataframe column holding the text fed to the model

entity_classes = {'TERRORIST_OR_CRIMINAL', 'LASTNAME', 'LENGTH', 'NATURAL_CAUSES_DEATH', 'COLOR', 'STRIKE', 'DRUG_OPERATION', 'HEIGHT', 'INTERGOVERNMENTAL_ORGANISATION', 'TRAFFICKING', 'NON_MILITARY_GOVERNMENT_ORGANISATION', 'TIME_MIN', 'DEMONSTRATION', 'TIME_EXACT', 'FIRE', 'QUANTITY_MIN', 'MATERIEL', 'GATHERING', 'PLACE', 'CRIMINAL_ARREST', 'CBRN_EVENT', 'ECONOMICAL_CRISIS', 'ACCIDENT', 'LONGITUDE', 'BOMBING', 'MATERIAL_REFERENCE', 'WIDTH', 'FIRSTNAME', 'MILITARY_ORGANISATION', 'CIVILIAN', 'QUANTITY_MAX', 'CATEGORY', 'POLITICAL_VIOLENCE', 'EPIDEMIC', 'TIME_MAX', 'TIME_FUZZY', 'NATURAL_EVENT', 'SUICIDE', 'CIVIL_WAR_OUTBREAK', 'POLLUTION', 'ILLEGAL_CIVIL_DEMONSTRATION', 'NATIONALITY', 'GROUP_OF_INDIVIDUALS', 'QUANTITY_FUZZY', 'RIOT', 'WEIGHT', 'THEFT', 'MILITARY', 'NON_GOVERNMENTAL_ORGANISATION', 'LATITUDE', 'COUP_D_ETAT', 'ELECTION', 'HOOLIGANISM_TROUBLEMAKING', 'QUANTITY_EXACT', 'AGITATING_TROUBLE_MAKING'}
categories_to_check = ['END_DATE', 'GENDER_MALE', 'WEIGHS', 'DIED_IN', 'HAS_FAMILY_RELATIONSHIP', 'IS_DEAD_ON', 'IS_IN_CONTACT_WITH', 'HAS_CATEGORY', 'HAS_CONTROL_OVER', 'IS_BORN_IN', 'IS_OF_SIZE', 'HAS_LATITUDE', 'IS_PART_OF', 'IS_OF_NATIONALITY', 'IS_COOPERATING_WITH', 'DEATHS_NUMBER', 'HAS_FOR_HEIGHT', 'INITIATED', 'WAS_DISSOLVED_IN', 'HAS_COLOR', 'CREATED', 'IS_LOCATED_IN', 'WAS_CREATED_IN', 'IS_AT_ODDS_WITH', 'HAS_CONSEQUENCE', 'HAS_FOR_LENGTH', 'INJURED_NUMBER', 'START_DATE', 'STARTED_IN', 'GENDER_FEMALE', 'HAS_LONGITUDE', 'RESIDES_IN', 'HAS_FOR_WIDTH', 'IS_BORN_ON', 'HAS_QUANTITY', 'OPERATES_IN', 'IS_REGISTERED_AS']

# Fit the binarizer on a single "sample" holding every relation name, so that
# its classes are exactly the entries of `categories_to_check`.
mlb = MultiLabelBinarizer()
mlb.fit([categories_to_check])
# The logged classes_ come out in alphabetical order, not in the order of the list above.
logging.info(f"{mlb.classes_=}")

# One binary target column per relation type, in the binarizer's class order.
TASK_TARGET_COLS = mlb.classes_.tolist()
logging.info(f"{TASK_TARGET_COLS=}")

# Input: CSV files generated by an earlier step, split into train/ and test/ sub-folders.
generated_data_dir_path = os.path.join(INTERIM_DIR, "reduced_text_w_entity_bracket")
assert os.path.exists(generated_data_dir_path), (
    f"Missing input directory: {generated_data_dir_path}"
)
train_dir = os.path.join(generated_data_dir_path, "train")
test_dir = os.path.join(generated_data_dir_path, "test")

# Output: folder for the one-hot encoded version of the labeled data.
preprocessed_data_dir = os.path.join(INTERIM_DIR, "one_hot_reduced_text_w_entity_bracket")
labeled_preprocessed_data_dir_path = os.path.join(preprocessed_data_dir, "train")
# Pure-Python equivalent of `mkdir -p`: works on every OS and with paths containing spaces.
os.makedirs(labeled_preprocessed_data_dir_path, exist_ok=True)

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

[22:44:50|INFO|1991774308.py:14] mlb.classes_=array(['CREATED', 'DEATHS_NUMBER', 'DIED_IN', 'END_DATE', 'GENDER_FEMALE',
       'GENDER_MALE', 'HAS_CATEGORY', 'HAS_COLOR', 'HAS_CONSEQUENCE',
       'HAS_CONTROL_OVER', 'HAS_FAMILY_RELATIONSHIP', 'HAS_FOR_HEIGHT',
       'HAS_FOR_LENGTH', 'HAS_FOR_WIDTH', 'HAS_LATITUDE', 'HAS_LONGITUDE',
       'HAS_QUANTITY', 'INITIATED', 'INJURED_NUMBER', 'IS_AT_ODDS_WITH',
       'IS_BORN_IN', 'IS_BORN_ON', 'IS_COOPERATING_WITH', 'IS_DEAD_ON',
       'IS_IN_CONTACT_WITH', 'IS_LOCATED_IN', 'IS_OF_NATIONALITY',
       'IS_OF_SIZE', 'IS_PART_OF', 'IS_REGISTERED_AS', 'OPERATES_IN',
       'RESIDES_IN', 'STARTED_IN', 'START_DATE', 'WAS_CREATED_IN',
       'WAS_DISSOLVED_IN', 'WEIGHS'], dtype=object)
[22:44:50|INFO|1991774308.py:17] TASK_TARGET_COLS=['CREATED', 'DEATHS_NUMBER', 'DIED_IN', 'END_DATE', 'GENDER_FEMALE', 'GENDER_MALE', 'HAS_CATEGORY', 'HAS_COLOR', 'HAS_CONSEQUENCE', 'HAS_CONTROL_OVER', 'HAS_FAMILY_RELATIONSHIP', 'HAS_FOR_HEIGHT', 'HAS_FOR_LENGT

device(type='cuda')

In [39]:
def get_cat_var_distribution(cat_var: pd.Series | pd.DataFrame) -> pd.DataFrame:
    if isinstance(cat_var, pd.Series):
        return pd.concat(
            [cat_var.value_counts(), cat_var.value_counts(normalize=True)], axis=1
        )
    else:
        return cat_var.sum(axis=0).sort_values().to_frame().T

## Prepare the datasets for the multi-label text classification

### Load and process the target

In [31]:
def load_csv(dir_or_file_path: str, index_col=None, sep=',') -> pd.DataFrame:
    """Load one CSV file, or every ``*.csv`` file of a directory, into one frame.

    Args:
        dir_or_file_path: path to a ``.csv`` file or to a directory of CSV files.
        index_col: forwarded to ``pd.read_csv``.
        sep: field separator, forwarded to ``pd.read_csv``.

    Returns:
        The row-wise concatenation of all loaded files, with a fresh 0..n-1 index.

    Raises:
        AssertionError: if a file path does not end with ``.csv`` or if the
            directory contains no CSV file.
    """
    if os.path.isdir(dir_or_file_path):
        # glob returns files in arbitrary (filesystem) order; sorting makes the
        # row order, and therefore any seeded split done later, reproducible.
        all_files = sorted(glob.glob(os.path.join(dir_or_file_path, "*.csv")))
    else:
        assert dir_or_file_path.endswith(".csv"), f"Not a CSV file: {dir_or_file_path}"
        all_files = [dir_or_file_path]
    assert len(all_files) > 0, f"No CSV file found in {dir_or_file_path}"
    return pd.concat(
        [
            pd.read_csv(filename, index_col=index_col, header=0, sep=sep)
            for filename in all_files
        ],
        axis=0,
        ignore_index=True,
    )

def format_relations_str_to_list(labels_as_str: str) -> List[str]:
    """Parse the textual form of a collection of labels into a Python list.

    Curly braces are turned into square brackets and single quotes into double
    quotes so the string can be decoded as JSON. Null values (``None``/NaN)
    yield an empty list.
    """
    if pd.isnull(labels_as_str):
        return []
    json_text = labels_as_str
    for old, new in (("{", "["), ("}", "]"), ("'", '"')):
        json_text = json_text.replace(old, new)
    return json.loads(json_text)

def process_data(data: pd.DataFrame) -> pd.DataFrame:
    """Append one binary column per relation class to ``data``.

    The ``TARGET_COL`` column (lists of labels) is encoded with the fitted
    ``mlb`` binarizer; the original columns, including ``TARGET_COL``, are kept.
    """
    one_hot_targets = pd.DataFrame(
        mlb.transform(data[TARGET_COL]),
        columns=mlb.classes_,
        index=data.index,
    )
    return pd.concat([data, one_hot_targets], axis=1)

# 1. load the labeled CSV files,
# 2. build the model input "<e1_type> | <e2_type> | <reduced_text>",
# 3. parse the target column into lists of labels,
# 4. append the one-hot encoded targets.
labeled_df = process_data(
    load_csv(train_dir, index_col=0)
    .assign(
        **{
            TASK_INPUT_COL: lambda df: df[["e1_type", "e2_type", "reduced_text"]].apply(
                lambda row: ' | '.join(row.values.astype(str)), axis=1
            )
        }
    )
    .assign(
        **{TARGET_COL: lambda df: df[TARGET_COL].apply(format_relations_str_to_list)}
    )
)

In [32]:
labeled_df.to_parquet(os.path.join(INTERIM_DIR, "train-entities+reduced_text.parquet"), index=False)

In [40]:
get_cat_var_distribution(labeled_df[TASK_TARGET_COLS])
# labeled_df[TASK_TARGET_COLS].sum(axis=0).sort_values()

Unnamed: 0,HAS_LATITUDE,HAS_LONGITUDE,HAS_FOR_HEIGHT,WAS_DISSOLVED_IN,HAS_FOR_WIDTH,WAS_CREATED_IN,HAS_FOR_LENGTH,IS_BORN_ON,IS_REGISTERED_AS,WEIGHS,...,HAS_CATEGORY,GENDER_MALE,START_DATE,IS_PART_OF,IS_AT_ODDS_WITH,STARTED_IN,OPERATES_IN,IS_IN_CONTACT_WITH,HAS_CONTROL_OVER,IS_LOCATED_IN
0,10,12,12,14,14,15,16,20,34,41,...,894,908,1034,1462,1526,1860,2435,2919,4547,9025


### train-validation split

In [48]:
VALIDATION_RATE = 0.3  # share of the labeled rows held out for validation
# Plain shuffled split with a fixed seed; no stratification is applied.
train_df, val_df = train_test_split(
    labeled_df,
    test_size=VALIDATION_RATE,
    random_state=RANDOM_SEED,
    shuffle=True,
)
(train_df.shape, val_df.shape)

((85430, 46), (36614, 46))

In [49]:
get_cat_var_distribution(train_df[TASK_TARGET_COLS])

Unnamed: 0,HAS_LONGITUDE,HAS_LATITUDE,HAS_FOR_HEIGHT,WAS_DISSOLVED_IN,WAS_CREATED_IN,HAS_FOR_WIDTH,HAS_FOR_LENGTH,IS_BORN_ON,IS_REGISTERED_AS,WEIGHS,...,HAS_CATEGORY,GENDER_MALE,START_DATE,IS_PART_OF,IS_AT_ODDS_WITH,STARTED_IN,OPERATES_IN,IS_IN_CONTACT_WITH,HAS_CONTROL_OVER,IS_LOCATED_IN
0,8,8,9,10,11,11,14,16,19,30,...,626,647,733,1022,1076,1340,1747,2052,3166,6316


In [50]:
get_cat_var_distribution(val_df[TASK_TARGET_COLS])

Unnamed: 0,HAS_LATITUDE,HAS_FOR_LENGTH,HAS_FOR_WIDTH,HAS_FOR_HEIGHT,WAS_CREATED_IN,IS_BORN_ON,WAS_DISSOLVED_IN,HAS_LONGITUDE,WEIGHS,DIED_IN,...,GENDER_MALE,HAS_CATEGORY,START_DATE,IS_PART_OF,IS_AT_ODDS_WITH,STARTED_IN,OPERATES_IN,IS_IN_CONTACT_WITH,HAS_CONTROL_OVER,IS_LOCATED_IN
0,2,2,3,3,4,4,4,4,11,11,...,261,268,301,440,450,520,688,867,1381,2709


### Chances of having a class in the training batch

In [77]:
# One column per split ("train", "val"), one row per relation class, holding
# the number of positive examples of that class in the split.
train_val_category_sizes_df = pd.concat(
    [
        get_cat_var_distribution(split_df[TASK_TARGET_COLS]).T[0].rename(split_name)
        for split_name, split_df in {"train": train_df, "val": val_df}.items()
    ],
    axis=1,
)
train_val_category_sizes_df.T

Unnamed: 0,HAS_LONGITUDE,HAS_LATITUDE,HAS_FOR_HEIGHT,WAS_DISSOLVED_IN,WAS_CREATED_IN,HAS_FOR_WIDTH,HAS_FOR_LENGTH,IS_BORN_ON,IS_REGISTERED_AS,WEIGHS,...,HAS_CATEGORY,GENDER_MALE,START_DATE,IS_PART_OF,IS_AT_ODDS_WITH,STARTED_IN,OPERATES_IN,IS_IN_CONTACT_WITH,HAS_CONTROL_OVER,IS_LOCATED_IN
train,8,8,9,10,11,11,14,16,19,30,...,626,647,733,1022,1076,1340,1747,2052,3166,6316
val,4,2,3,4,4,3,2,4,15,11,...,268,261,301,440,450,520,688,867,1381,2709


In [88]:
train_df.shape[0]

85430

In [87]:
BATCH_SIZE = 8
# Probability that a batch of BATCH_SIZE rows drawn without replacement from
# the training split holds at least one positive example of a category:
# 1 - P(none of the BATCH_SIZE successive draws is a positive).
train_val_category_sizes_df = train_val_category_sizes_df.assign(
    proba_to_be_in_batch=lambda sizes_df: sizes_df["train"].map(
        lambda categ_size: 1
        - np.prod(
            [
                (train_df.shape[0] - categ_size - i) / (train_df.shape[0] - i)
                for i in range(BATCH_SIZE)
            ]
        )
    )
)
train_val_category_sizes_df.T

Unnamed: 0,HAS_LONGITUDE,HAS_LATITUDE,HAS_FOR_HEIGHT,WAS_DISSOLVED_IN,WAS_CREATED_IN,HAS_FOR_WIDTH,HAS_FOR_LENGTH,IS_BORN_ON,IS_REGISTERED_AS,WEIGHS,...,HAS_CATEGORY,GENDER_MALE,START_DATE,IS_PART_OF,IS_AT_ODDS_WITH,STARTED_IN,OPERATES_IN,IS_IN_CONTACT_WITH,HAS_CONTROL_OVER,IS_LOCATED_IN
train,8.0,8.0,9.0,10.0,11.0,11.0,14.0,16.0,19.0,30.0,...,626.0,647.0,733.0,1022.0,1076.0,1340.0,1747.0,2052.0,3166.0,6316.0
val,4.0,2.0,3.0,4.0,4.0,3.0,2.0,4.0,15.0,11.0,...,268.0,261.0,301.0,440.0,450.0,520.0,688.0,867.0,1381.0,2709.0
proba_to_be_in_batch,0.000749,0.000749,0.000843,0.000936,0.00103,0.00103,0.00131,0.001497,0.001778,0.002806,...,0.057142,0.059008,0.066617,0.091795,0.096433,0.118811,0.152359,0.176763,0.260753,0.459081


## Undersampling negative samples

In [84]:
# Number of training rows that carry none of the target relations.
n_out_scope = int((train_df[TASK_TARGET_COLS].sum(axis=1) == 0).sum())
n_out_scope

66864

In [85]:
# Number of training rows that carry at least one target relation.
n_in_scope = int((train_df[TASK_TARGET_COLS].sum(axis=1) > 0).sum())
n_in_scope

18566

In [86]:
n_in_scope / n_out_scope

0.27766810241684614

## Create the tokenized datasets for model input

### init the tokenizer

In [None]:
# Hyperparameters
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)
task_special_tokens = ["<", ">", "{", "}"] + [
    f"{entity_class}" for entity_class in entity_classes
]
# add special tokens to the tokenizer
num_added_tokens = tokenizer.add_tokens(task_special_tokens, special_tokens=True)
num_added_tokens, len(tokenizer)

### init the train-valid datasets from dataframe

In [None]:
# Convert both splits to Hugging Face datasets (the pandas index is not kept)
# and group them under the "train" / "validation" keys.
train_ds, val_ds = (
    Dataset.from_pandas(split_df, preserve_index=False)
    for split_df in (train_df, val_df)
)
task_datasets = DatasetDict({"train": train_ds, "validation": val_ds})
task_datasets

In [None]:
task_datasets["train"][0]

In [None]:
task_datasets["train"][1]

### Tokenize the datasets

In [None]:
def tokenize_function(example: dict):
    """Tokenize the input text and attach the multi-hot ``labels`` vector.

    Works on a single example or on a batch (dict of column lists, as passed by
    ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output plus a ``labels`` entry: for each example, a list
        of floats with one 0.0/1.0 flag per column of ``TASK_TARGET_COLS``.
        Float targets are what a multi-label (BCE-with-logits) head expects.
    """
    encoded = tokenizer(example[TASK_INPUT_COL], truncation=True)
    if isinstance(example[TASK_INPUT_COL], str):
        # single, non-batched example
        encoded["labels"] = [float(example[col]) for col in TASK_TARGET_COLS]
    else:
        # batched: transpose the per-column lists into one row of flags per example
        encoded["labels"] = [
            [float(flag) for flag in row_flags]
            for row_flags in zip(*(example[col] for col in TASK_TARGET_COLS))
        ]
    return encoded

# We’re using batched=True in our call to map so the function is applied to multiple elements of our dataset at once, and not on each element separately
# This is way faster
# Source https://huggingface.co/learn/nlp-course/chapter3/2?fw=pt
# The original dataframe columns are kept here; the tokenizer outputs and `labels` are added next to them.
tokenized_datasets = task_datasets.map(tokenize_function, batched=True)
tokenized_datasets

In [None]:
tokenized_datasets.column_names

In [None]:
type(tokenized_datasets["train"][1]['attention_mask'])

### Token numbers distribution

In [None]:
from typing import Any, Dict

def count_tokens(text: str) -> int:
    """Return the number of input ids the notebook tokenizer produces for ``text``.

    No truncation is requested here, so long texts are counted in full.
    """
    encoding = tokenizer(text)
    return len(encoding["input_ids"])


def count_token_in_dataset_element(example: Dict[str, Any]) -> Dict[str, int]:
    """Compute the ``n_tokens`` field of one dataset row from its input text."""
    n_tokens = count_tokens(example[TASK_INPUT_COL])
    return dict(n_tokens=n_tokens)


tokenized_datasets = tokenized_datasets.map(count_token_in_dataset_element)
tokenized_datasets

In [None]:
# One column per split with the token count of every row; the splits have
# different sizes, so the shorter column is padded with NaN.
split2ntokens_df = pd.DataFrame(
    {
        split_name: pd.Series(
            [row["n_tokens"] for row in tqdm(split_ds, split_name)],
            name=f"{split_name}_text_n_tokens",
        )
        for split_name, split_ds in tokenized_datasets.items()
    }
)
split2ntokens_df.describe()

In [None]:
split2ntokens_df.boxplot()

In [None]:
tokenized_datasets["train"].filter(lambda x: x['n_tokens'] > 200)

In [None]:
tokenized_datasets["train"].filter(lambda x: x['n_tokens'] == 19)[0]['input_text']

In [None]:
# tokenized_datasets["train"].filter(lambda x: x['n_tokens'] > 300)[0]['input_text']

### Test the batch-level padding with a data collator

In [None]:
# Take the first 8 training rows, keeping only the columns that were added
# after the dataframe-to-dataset conversion, and show their token-id lengths.
samples = dict(
    tokenized_datasets.remove_columns(task_datasets["train"].column_names)["train"][:8]
)
[len(token_ids) for token_ids in samples["input_ids"]]

In [None]:
task_datasets

In [None]:
# Pad the sample rows to a common length and inspect the resulting tensor shapes.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
batch = data_collator(samples)
{name: tensor.shape for name, tensor in batch.items()}

## Fine-tuning a model with the Trainer API

### Compute the weight of classes to handle imbalance

In [None]:
# Source: https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#calculate_class_weights
# Scaling by total/2 helps keep the loss to a similar magnitude.
# Adapted from: https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#calculate_class_weights
# Each label gets weight = n_examples / (n_classes * n_positive_examples), so
# the rarer a label, the larger its weight.
n_examples = train_df.shape[0]
n_classes = len(TASK_TARGET_COLS)


def compute_class_weights(lbl_df: pd.DataFrame) -> pd.Series:
    """Compute one weight per target label from its number of positive rows.

    Args:
        lbl_df: dataframe holding the binary columns listed in ``TASK_TARGET_COLS``.

    Returns:
        Series named ``weight`` indexed by label name. A label without any
        positive row gets an infinite weight.
    """
    positive_counts = lbl_df[TASK_TARGET_COLS].sum(axis=0)
    return ((1 / positive_counts) * (n_examples / n_classes)).rename("weight")


# Positive count and weight of every label, rarest labels first.
pd.concat(
    [
        train_df[TASK_TARGET_COLS].sum(axis=0).rename("count"),
        compute_class_weights(train_df),
    ],
    axis=1,
).sort_values("count")

### Init the model

In [None]:
# One output unit per relation type.
n_classes = len(TASK_TARGET_COLS)
print(f"{n_classes=}")
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_CHECKPOINT,
    num_labels=n_classes,
    # Several relations can hold for the same input: independent sigmoid
    # outputs trained with a binary cross-entropy loss, not a softmax.
    problem_type="multi_label_classification",
    # Store the label names in the model config so predictions are readable.
    id2label=dict(enumerate(TASK_TARGET_COLS)),
    label2id={label: idx for idx, label in enumerate(TASK_TARGET_COLS)},
)
# The tokenizer was extended with task-specific tokens: grow the embedding matrix accordingly.
model.resize_token_embeddings(len(tokenizer))

In [None]:
model

### Init the trainer

Source: https://stackoverflow.com/questions/69087044/early-stopping-in-bert-trainer-instances#69087153

1. Use `load_best_model_at_end = True` (EarlyStoppingCallback() requires this to be True).
2. `evaluation_strategy = 'steps'` or IntervalStrategy.STEPS instead of 'epoch'.
3. `eval_steps = 50` (evaluate the metrics after N steps).
4. `metric_for_best_model = 'f1'`

In [None]:
def compute_metrics(p):
    """Compute multi-label evaluation metrics from raw model outputs.

    Args:
        p: pair ``(logits, labels)``.
            Assumes both are arrays of shape (n_samples, n_labels), with
            0/1 flags in ``labels`` — TODO confirm against the evaluation loop.

    Returns:
        Dict with ``accuracy`` (exact match of the whole label vector) and the
        macro-averaged ``precision``, ``recall`` and ``f1`` over the labels.
    """
    logits, labels = p
    # Each label is decided independently: sigmoid(logit) > 0.5 <=> logit > 0.
    # (argmax would keep a single label per row, which is wrong for multi-hot targets.)
    pred = (np.asarray(logits) > 0).astype(int)
    labels = np.asarray(labels).astype(int)
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    # zero_division=0: a label never predicted (or absent from the evaluated
    # rows) scores 0 instead of triggering a warning.
    recall = recall_score(y_true=labels, y_pred=pred, average="macro", zero_division=0)
    precision = precision_score(y_true=labels, y_pred=pred, average="macro", zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=pred, average="macro", zero_division=0)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

training_args = TrainingArguments(
    output_dir=os.path.join(MODELS_DIR, f"{TASK_NAME}-byTrainerAPI-checkpoints"),
    # NOTE(review): the in-batch probability analysis above uses BATCH_SIZE = 8,
    # while training uses batches of 16 — confirm which one is intended.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=30,
    evaluation_strategy=IntervalStrategy.STEPS, # steps
    eval_steps=1000, # evaluation happens every 1000 steps
    learning_rate=1e-5,
    weight_decay=0.01,
    # Saving follows the same schedule as evaluation (required by load_best_model_at_end).
    save_strategy=IntervalStrategy.STEPS,
    save_steps=1000,
    logging_dir=os.path.join(MODELS_DIR, f"{TASK_NAME}-byTrainerAPI-tensorboard"),
    save_total_limit=2, # Only last 2 models are saved. Older ones are deleted
    push_to_hub=False,
    # The best checkpoint is the one with the highest evaluation F1.
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    # Use the notebook-wide seed instead of the Trainer's own default.
    seed=RANDOM_SEED,
)

# Early stopping: training halts after 4 consecutive evaluations without
# improvement of the monitored metric.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)

### Launch the training

In [None]:
trainer.callback_handler.callbacks[-2].__dict__

In [None]:
trainer.train()

## Evaluate

TODO...

In [None]:
trainer.state.best_metric

In [None]:
# After training, access the path of the best checkpoint like this
best_ckpt_path = trainer.state.best_model_checkpoint
best_ckpt_path

In [None]:
from transformers import pipeline

# Reuse the notebook-level `device` instead of hard-coding "cuda", so this cell
# also runs on a CPU-only machine. top_k=None returns the score of every label
# rather than only the best one, which is what a multi-label evaluation needs.
classifier = pipeline(
    "text-classification",
    model=best_ckpt_path,
    device=device,
    top_k=None,
)

In [None]:
classifier(val_ds["input_text"][:10])

In [None]:
# Input text and reference relations of the rows classified above; the one-hot
# targets live in TASK_TARGET_COLS, there is no single "label" column here.
val_df.iloc[:10][[TASK_INPUT_COL, TARGET_COL]]

In [None]:
val_ds["input_text"][5]

In [None]:
# Classify the same column the model was trained on.
classifier(val_ds[TASK_INPUT_COL][5])

In [None]:
# Reference one-hot targets of the first validation rows.
val_df[TASK_TARGET_COLS].iloc[:10]

In [None]:
val_ds["input_text"]

In [None]:
val_ds[0]

In [None]:
print(samples['input_ids'][2])

In [None]:
tokenizer.decode(samples['input_ids'][0])

In [None]:
tokenizer.decode(samples['input_ids'][1])

In [None]:
tokenizer.decode(samples['input_ids'][2])