In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers==4.44.2

import json
import numpy as np
import pickle
import sys

from sklearn.model_selection import train_test_split

from transformers import CamembertTokenizer, CamembertForSequenceClassification
from transformers import AdamW

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from tqdm import trange

from sklearn.metrics import f1_score

Collecting transformers==4.44.2
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers==4.44.2)
  Downloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
  

Open the data

In [3]:
f = open("drive/MyDrive/data/defi-text-mine-egc-2026/train_v2.jsonl", "r")

text = []
acronyms = []
options = []

for line in f:
  try:
      data = json.loads(line)

      text.append(data["text"])

      acronyms.append(data["acronym"])

      options.append(data["options"])

  except ValueError:
      print('Invalid input:',line)

f.close()

# Data preparation and model initialization

Split data into train and test

In [4]:
data_all = []
for i in range(len(acronyms)):
  data_all.append([text[i],acronyms[i],options[i]])

# Split data into train and test:
data_train_all, data_test = train_test_split(data_all, test_size=0.2, random_state=35)

# Split train data into train and validation
data_train, data_val = train_test_split(data_train_all, test_size=0.2, random_state=35)

Construct segments by merging together text, acronym and one option

In [5]:
X_train = []
y_train = []
X_test = []
y_test = []
X_val = []
y_val = []

for i in range(len(data_train)):
  for option,label in data_train[i][2].items():
    X_train.append(data_train[i][0].strip().replace("  "," ").replace(" -","")+"\n\n"+data_train[i][1]+" "+option)
    y_train.append(label)

for i in range(len(data_val)):
  for option,label in data_val[i][2].items():
    X_val.append(data_val[i][0].strip().replace("  "," ").replace(" -","")+"\n\n"+data_val[i][1]+" "+option)
    y_val.append(label)

for i in range(len(data_test)):
  for option,label in data_test[i][2].items():
    X_test.append(data_test[i][0].strip().replace("  "," ").replace(" -","")+"\n\n"+data_test[i][1]+" "+option)
    y_test.append(label)

Example of the segment

In [6]:
print(X_train[0])

AC attestation de compatibilité COGC centre opérationnel de gestion des circulations DT double traction EM engin moteur EP embranchement particulier

EP Embranchement particulier


Sizes of train, validation and test sets

In [7]:
print(len(X_train),len(X_val),len(X_test))

1434 325 418


Initialize the model and preprocess the data

In [8]:
# Fix random seed to make results reproducible:
torch.manual_seed(128)

# Select the model:
model_name = "camembert-base"

# Define tokenizer:
tokenizer = CamembertTokenizer.from_pretrained(model_name,do_lower_case=True)

# Tokenize training, validation and test sets:
tokenizer_train = tokenizer(X_train, padding="longest", truncation = True, return_tensors="pt")

tokenizer_val = tokenizer(X_val, padding="longest", truncation = True, return_tensors="pt")

tokenizer_test = tokenizer(X_test, padding="longest", truncation = True, return_tensors="pt")

# Define Dataloaders:
batch_size = 16

train_set = TensorDataset(tokenizer_train['input_ids'],
                          tokenizer_train['attention_mask'],
                          torch.tensor([int(y_train[i]) for i in range(len(y_train))]))

val_set = TensorDataset(tokenizer_val['input_ids'],
                          tokenizer_val['attention_mask'],
                          torch.tensor([int(y_val[i]) for i in range(len(y_val))]))

test_set = TensorDataset(tokenizer_test['input_ids'],
                          tokenizer_test['attention_mask'],
                          torch.tensor([int(y_test[i]) for i in range(len(y_test))]))

train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

test_dataloader = DataLoader(
            test_set,
            sampler = SequentialSampler(test_set),
            batch_size = batch_size
        )

# Define the model:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

model = CamembertForSequenceClassification.from_pretrained(model_name, num_labels=2)

model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CamembertForSequenceClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=Tru

# Learning and evaluation

Define a function for getting predicted labels

In [9]:
def flatten_preds(preds):
  preds = np.argmax(preds, axis = 1).flatten()

  return preds.tolist()

Define parameters and metrics to optimize

In [10]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
      'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
      'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=10e-8)



Fine-tuning and validation

In [11]:
epochs = 10

train_loss_set = []
count = 0
best_f1 = 0
best_results = []
# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
    # Tracking variables for training
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    # Train the model
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Add batch to device CPU or GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        outputs = model(b_input_ids,token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        # Get loss value
        loss = outputs[0]
        # Add it to train loss list
        train_loss_set.append(loss.item())
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()

        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("\nTrain loss: {}".format(tr_loss/nb_tr_steps))

    # Tracking variables for validation
    val_preds = []
    val_labels = []
    # Validation of the model
    model.eval()
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Add batch to device CPU or GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs =  model(b_input_ids,token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss, logits = outputs[:2]
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        labels = b_labels.to('cpu').numpy()
        # Update predicted and true labels
        val_preds += flatten_preds(logits)
        val_labels += labels.tolist()

    # Compute evaluaion metric:
    val_f1 = f1_score(val_labels,val_preds,average='binary')

    count+=1

    # Save best results:
    if val_f1 > best_f1:
        best_f1 = val_f1
        best_epoch = count
        # Save the best model:
        best_state = model.state_dict()

print("\n\nF-mesure: ",best_f1, "Best epoch: ",best_epoch)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]


Train loss: 0.47552925993998846


Epoch:  10%|█         | 1/10 [01:10<10:34, 70.51s/it]


Train loss: 0.31790724247694013


Epoch:  20%|██        | 2/10 [02:23<09:33, 71.69s/it]


Train loss: 0.20736450366675854


Epoch:  30%|███       | 3/10 [03:35<08:24, 72.01s/it]


Train loss: 0.15131545449710554


Epoch:  40%|████      | 4/10 [04:47<07:12, 72.08s/it]


Train loss: 0.1013892663642764


Epoch:  50%|█████     | 5/10 [06:00<06:00, 72.19s/it]


Train loss: 0.08260131656295723


Epoch:  60%|██████    | 6/10 [07:12<04:49, 72.27s/it]


Train loss: 0.07216554527791838


Epoch:  70%|███████   | 7/10 [08:24<03:36, 72.28s/it]


Train loss: 0.07356735006388691


Epoch:  80%|████████  | 8/10 [09:37<02:24, 72.29s/it]


Train loss: 0.061218944088452396


Epoch:  90%|█████████ | 9/10 [10:49<01:12, 72.27s/it]


Train loss: 0.05757785584363673


Epoch: 100%|██████████| 10/10 [12:01<00:00, 72.15s/it]



F-mesure:  0.8939393939393939 Best epoch:  8





Should be F1=0.8939393939393939, which is quite good so far!

# Evaluate on the test set

Predict labels

In [12]:
# Load the state of the model from best epoch:
model.load_state_dict(best_state)

test_preds = []
test_labels = []
# Evaluate data for one epoch
for batch in test_dataloader:
    # Add batch to device CPU or GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs =  model(b_input_ids,token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss, logits = outputs[:2]
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    labels = b_labels.to('cpu').numpy()
    # Update predicted and true labels
    test_preds += flatten_preds(logits)
    test_labels += labels.tolist()

# Compute evaluaion metric:
test_f1 = f1_score(test_labels,test_preds,average='binary')

print("F-mesure: ",test_f1)

F-mesure:  0.8059701492537313


Should be 0.8059701492537313, which is a great result for unseen data.

Let's compare this result with a baseline approach based on semantic similarities using same test set split

In [13]:
!pip install sentence_transformers

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('dangvantuan/french-document-embedding',trust_remote_code=True)



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/dangvantuan/bilingual_impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modelling.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/dangvantuan/bilingual_impl:
- modelling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Perform prediction

In [15]:
y_pred = []
y_true = []

for i in range(len(data_test)):
  text_embedding = model.encode(data_test[i][0])

  options_embedding = model.encode(list(data_test[i][2].keys()))

  scores = util.dot_score(text_embedding, options_embedding)

  labels_pred = np.zeros(len(data_test[i][2]),bool)

  index_max = int(np.argmax(scores.numpy()))
  labels_pred[index_max] = True

  y_pred.append(labels_pred)
  y_true.append(list(data_test[i][2].values()))

Perform evaluation

In [16]:
f1_list = []

for i in range(len(y_true)):
  f1_list.append(f1_score(y_true[i],y_pred[i],average='binary'))

print(np.mean(f1_list))

0.5791245791245792


Should be F1=5791245791245792, which is equal to the performance of the approach on a bigger set.

The CamemBERT based approach performs way better, let's use it to predict labels on the competition set

# Predict labels on the competition set

Open competition set

In [None]:
f = open("drive/MyDrive/data/defi-text-mine-egc-2026/test_v4.jsonl", "r")

ids = []
text = []
acronyms = []
options = []

for line in f:
  try:
      data = json.loads(line)

      ids.append(data["id"])

      text.append(data["text"])

      acronyms.append(data["acronym"])

      options.append(data["options"])

  except ValueError:
      print('Invalid input:',line)

f.close()

Construct segments

In [None]:
data_eval = []

for i in range(len(acronyms)):
  for j in range(len(options[i])):
    # Data in the format: (example_id, option_id, text_with_acronym_and_option)
    data_eval.append((ids[i],j,text[i].strip().replace("  "," ").replace(" -","")+"\n\n"+acronyms[i]+" "+options[i][j]))

In [None]:
data_eval[0]

(0,
 0,
 'o V3 RCI o V5 RCI A101.2 Caractéristiques de la section de ligne Ligne à grande vitesse de Mâcon TGV à Valence TGV et ses raccordements pouvant\n\nRCI Régulateur de Circulation Interconnecté')

Preprocess data

In [None]:
# Tokenize competition set:
tokenizer_eval = tokenizer([data_eval[i][2] for i in range(len(data_eval))], padding="longest", truncation = True, return_tensors="pt")

eval_set = TensorDataset(tokenizer_eval['input_ids'],
                          tokenizer_eval['attention_mask'])

Predict labels

In [None]:
pred_labels = []
for i in range(len(data_eval)):
    test_ids = []
    test_attention_mask = []

    # Apply the tokenizer
    encoding = tokenizer(data_eval[i][2], padding="longest", truncation = True, return_tensors="pt")

    # Extract IDs and Attention Mask
    test_ids.append(encoding['input_ids'])
    test_attention_mask.append(encoding['attention_mask'])
    test_ids = torch.cat(test_ids, dim = 0)
    test_attention_mask = torch.cat(test_attention_mask, dim = 0)

    # Forward pass, calculate logit predictions
    with torch.no_grad():
      output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

    pred_labels.append(np.argmax(output.logits.cpu().numpy()).flatten().item())

Convert results to the format of competition

In [None]:
eval_results = {}
for i in range(len(data_eval)):
  if pred_labels[i]:
    example_id = data_eval[i][0]
    if example_id not in eval_results.keys():
      eval_results[example_id] = []
    eval_results[example_id].append(data_eval[i][1])

Save to the CSV file

In [None]:
import csv

f_out = open('drive/MyDrive/data/defi-text-mine-egc-2026/submission.csv', 'w', newline='', encoding='UTF8')

tab = csv.writer(f_out,delimiter=',')

header = ['id','prediction']
tab.writerow(header)

for id in ids:
    if id not in eval_results.keys():
      tab.writerow([str(id),'[]'])
    else:
      tab.writerow([str(id),str(eval_results[id])])

f_out.close()