In [None]:
# Mount Google Drive at /content/drive (this import is only available on Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import numpy as np

!pip install openai

!pip install langchain
!pip install huggingface_hub
!pip install langchain-community langchain-core

!pip install -q langchain
!pip install -q torch
!pip install -q transformers
!pip install -q sentence-transformers
!pip install -q datasets
!pip install -q faiss-cpu
!pip install langchainhub

from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from langchain.schema.runnable import RunnablePassthrough
from langchain.schema import StrOutputParser
from langchain import hub
from langchain.chat_models import ChatOpenAI

Collecting langchain-core
  Using cached langchain_core-1.0.4-py3-none-any.whl.metadata (3.5 kB)
Collecting langchain-text-splitters<2.0.0,>=1.0.0 (from langchain-classic<2.0.0,>=1.0.0->langchain-community)
  Using cached langchain_text_splitters-1.0.0-py3-none-any.whl.metadata (2.6 kB)
Using cached langchain_core-1.0.4-py3-none-any.whl (471 kB)
Using cached langchain_text_splitters-1.0.0-py3-none-any.whl (33 kB)
Installing collected packages: langchain-core, langchain-text-splitters
  Attempting uninstall: langchain-core
    Found existing installation: langchain-core 0.3.79
    Uninstalling langchain-core-0.3.79:
      Successfully uninstalled langchain-core-0.3.79
  Attempting uninstall: langchain-text-splitters
    Found existing installation: langchain-text-splitters 0.3.11
    Uninstalling langchain-text-splitters-0.3.11:
      Successfully uninstalled langchain-text-splitters-0.3.11
[31mERROR: pip's dependency resolver does not currently take into account all the packages that 

Initialize the model

In [None]:
# Read the OpenAI API key from the shell environment when it is set, so the
# secret does not have to be written into the notebook. The "..." placeholder
# is only a fallback to be replaced by hand.
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "...") # Fallback: your API key here

# Select a model (uncomment exactly one line)
#OPENAI_CHAT_MODEL = "gpt-5-nano-2025-08-07"
#OPENAI_CHAT_MODEL = "gpt-5-mini-2025-08-07"
OPENAI_CHAT_MODEL = "gpt-5-2025-08-07"

# Alias of OPENAI_CHAT_MODEL under the shorter name `model`.
model = OPENAI_CHAT_MODEL

Open the data

In [None]:
# Load the training set: one JSON object per line, from which the "text",
# "acronym" and "options" fields are collected into three parallel lists.
text = []
acronyms = []
options = []

# The context manager closes the file even if a line raises; the explicit
# encoding keeps decoding independent of the platform default.
with open("drive/MyDrive/data/defi-text-mine-egc-2026/train_v2.jsonl", "r", encoding="utf-8") as f:
  for line in f:
    try:
      # Named `record` (not `data`) so it cannot be confused with, or
      # overwrite, the `data` list of documents built by the CSV loader cell.
      record = json.loads(line)
    except ValueError:
      # json.JSONDecodeError is a ValueError: report the line and move on.
      print('Invalid input:',line)
      continue

    text.append(record["text"])
    acronyms.append(record["acronym"])
    options.append(record["options"])

# An LLM-based approach

### Setup RAG environment

Load external knowledge source:

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader

fileName = "drive/MyDrive/data/defi-text-mine-egc-2026/lexique-des-acronymes-sncf.csv"

# Load the acronym lexicon from a ';'-separated CSV file. The value of the
# "Abréviation" column is used as each document's source.
loader = CSVLoader(
    file_path=fileName,
    csv_args={
        "delimiter": ";",
        "quotechar": '"'
    },
    source_column="Abréviation",
    # The file starts with a UTF-8 byte-order mark. "utf-8-sig" strips it so
    # the first column is named "id" instead of "\ufeffid" in page_content.
    encoding="utf-8-sig"
)

data = loader.load()

In [None]:
# Show only the first few documents; printing the whole list floods the output.
print(data[:3])



In [None]:
# Number of documents returned by loader.load().
len(data)

9502

In [None]:
# Inspect the text content of a single loaded document (index 10).
print(data[10].page_content)

﻿id: 4749.0
Abréviation: DET
Définition: Direction de l'exécution des contrats de travaux
empty: 


Compute the embeddings:

In [None]:
# Sentence-embedding model used to encode the documents (alternative kept
# below for reference).
#modelPath = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
modelPath = "intfloat/multilingual-e5-base"

# Model options: run the computations on the CPU.
model_kwargs = dict(device='cpu')

# Encoding options: the embeddings are left un-normalized.
# NOTE(review): the multilingual-e5 model card describes normalized embeddings
# and "query: " / "passage: " input prefixes — confirm that omitting both here
# is intentional.
encode_kwargs = dict(normalize_embeddings=False)

# Embedding wrapper built from the three settings above.
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=modelPath,
)

  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Define database for storing embeddings:

In [None]:
# Build a FAISS vector store from the loaded documents, embedding them with `embeddings`.
db = FAISS.from_documents(data, embeddings)

### Query GPT-5

Define query retriever:

In [None]:
# Retriever over the FAISS store, configured with k=3 results per query.
retriever = db.as_retriever(search_kwargs={"k": 3}) # Select top k similar documents

Define a RAG pipeline:

In [None]:
# RAG pipeline. The input string is given both to the retriever (filling
# "context") and, unchanged, to "question"; the two fill the "rlm/rag-prompt"
# template pulled from the hub, which is sent to the OpenAI chat model, and the
# model output is parsed into a plain string.
# NOTE(review): the retriever output reaches the prompt without an explicit
# formatting step — confirm that the rendered context is what is intended.
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | hub.pull("rlm/rag-prompt")
    | ChatOpenAI(model_name=OPENAI_CHAT_MODEL, temperature=1, openai_api_key=OPENAI_API_KEY)
    | StrOutputParser()
)

Function to post-process the results:

In [None]:
import re
import ast

def extract_bool_list(text):
    """Return the first list of booleans found in `text`, or None.

    All quotes are removed first. Every innermost bracketed span is then tried
    in order of appearance: spans that are not Python literals, or that hold
    anything other than booleans, are skipped. `true` / `false` are accepted in
    any letter case. A list may span several lines and may be wrapped in outer
    brackets. An empty list `[]` counts as a (vacuously) boolean list.

    Example: ' [ASSISTANT] [False, True]' -> [False, True]

    Args:
        text: Raw response string to scan.

    Returns:
        The first list made only of booleans, or None when there is none.
    """
    text = text.replace('"', '').replace("'", "")  # get rid of all quotes

    # A character class excluding brackets (rather than a non-greedy dot) lets a
    # candidate span newlines and always selects the innermost bracket pair, so
    # pretty-printed and nested lists are found as well.
    for cand in re.findall(r'\[[^\[\]]*\]', text):
        # normalize any-case true/false to Python booleans
        cand_norm = re.sub(r'\btrue\b', 'True', cand, flags=re.IGNORECASE)
        cand_norm = re.sub(r'\bfalse\b', 'False', cand_norm, flags=re.IGNORECASE)

        try:
            val = ast.literal_eval(cand_norm)
        except Exception:
            # not a Python literal (e.g. [ASSISTANT]) -> skip
            continue

        # keep only lists made entirely of booleans; the first one wins
        if isinstance(val, list) and all(isinstance(x, bool) for x in val):
            return val

    return None

Perform prediction (on the training set!)

In [None]:
# Query the RAG chain once per training example and collect the predictions.
# result_dic maps example index -> (predicted boolean list, attempt number).
result_dic = {}
# Work queue of (example index, attempt number) pairs.
examples_to_process = [(i,0) for i in range(len(text))]
# An example whose answer is unusable is re-queued at most this many times.
max_retries = 1

count = 0  # number of queries attempted, retries included
while examples_to_process:
  if count % 10 == 0:
    print(len(examples_to_process))  # progress: current queue length

  idx, attempt = examples_to_process.pop(0)

  # Query the LLM (only the option texts are sent, never their gold labels)
  prompt = (
    f"Find what is {acronyms[idx]} in this phrase: {text[idx]}"
    f". Evaluate the following options by giving the boolean list as an answer: {list(options[idx])}"
    ". Output only the boolean list. Do not output any explanation nor other text"
  )
  try:
    response = rag_chain.invoke(prompt)
  except Exception as err:
    # A failed call (network, rate limit, ...) must not abort the whole run and
    # throw away the answers already collected: report it and treat it like an
    # unusable answer, so it uses up one attempt.
    print(f"Query failed for example {idx} (attempt {attempt}): {err}")
    labels_pred = None
  else:
    # Post-process result
    labels_pred = extract_bool_list(response)

  # Accept the answer only if it has exactly one boolean per option;
  # otherwise retry while attempts remain.
  if labels_pred is not None and len(labels_pred) == len(options[idx]):
    result_dic[idx] = (labels_pred, attempt)
  elif attempt < max_retries:
    examples_to_process.append((idx, attempt + 1))
  count += 1

491
481
471
461
451
441
431
421
411
401
391
381
371
361
351
341
331
321
311
301
291
281
271
261
251
241
231
221
211
201
191
181
171
161
151
141
131
121
111
101
91
81
71
61
51
41
31
21
11
1


Predicted labels

In [None]:
# Number of iterations of the prediction loop, i.e. LLM queries including retries.
count

492

In [None]:
# Predictions keyed by example index: (boolean list, attempt number on which it was obtained).
result_dic

{0: ([False, True, False, False], 0),
 1: ([True, False, False, False, False, False, True], 0),
 2: ([False, True, False, False, False, False], 0),
 3: ([False, False, False, False, False, False, False, False, True, False], 0),
 4: ([False, False, True], 0),
 5: ([True, False], 0),
 6: ([False, True], 0),
 7: ([True, False], 0),
 8: ([False, False, False, False, False, False, True], 0),
 9: ([False, True, False, False], 0),
 10: ([False, True], 0),
 11: ([False, False, True, False], 0),
 12: ([False, True], 0),
 13: ([False, False, False, False, False, True], 0),
 14: ([True, False], 0),
 15: ([False, True, False, False], 0),
 16: ([True, False, False, False], 0),
 17: ([True, False, False], 0),
 18: ([True, False, False, False, False, False], 0),
 19: ([False, True], 0),
 20: ([True, False, False, False, False, False, False, False, False, False],
  0),
 21: ([True, False, False], 0),
 22: ([False, False, False, True, False, False], 0),
 23: ([False, True], 0),
 24: ([False, True, Fals

In [None]:
# Number of examples for which a usable prediction was stored.
len(result_dic)

492

In [None]:
# Save the prediction dictionary to disk with pickle.
import pickle

#with open("drive/MyDrive/data/defi-text-mine-egc-2026/val_results-gpt5.pkl", "wb") as f:
with open("drive/MyDrive/data/defi-text-mine-egc-2026/result_dic_train-gpt5_rag.pkl", "wb") as f:
  pickle.dump(result_dic,f)
# No explicit f.close() is needed: leaving the `with` block closes the file.

# Evaluation

In [None]:
from sklearn.metrics import f1_score

# Compute one binary F1 score per example (gold option labels vs. predicted
# boolean list) and report the mean over all examples.
f1_list = []

for i in range(len(acronyms)):
  if i in result_dic:
    y_pred = result_dic[i][0]
  else:
    # No usable prediction was stored: score the example as "all options False".
    y_pred = np.zeros(len(options[i]),bool)
  y_true = list(options[i].values())
  # zero_division=0 returns the same value as the default ("warn" acts as 0)
  # but without one UndefinedMetricWarning per affected example.
  # NOTE(review): an example with no True label and no True prediction scores
  # 0 here — confirm this matches the challenge's official metric.
  f1_list.append(f1_score(y_true,y_pred,average='binary',zero_division=0))

print(np.mean(f1_list))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0.7408536585365854


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


My run (gpt-5) resulted in F1=0.7408536585365854

# Predict labels for the test set

Open test set

In [None]:
# Load the test set: one JSON object per line, from which the "id", "text",
# "acronym" and "options" fields are collected into four parallel lists.
# This replaces the `text`, `acronyms` and `options` lists of the training set.
ids = []
text = []
acronyms = []
options = []

# The context manager closes the file even if a line raises; the explicit
# encoding keeps decoding independent of the platform default.
with open("drive/MyDrive/data/defi-text-mine-egc-2026/test_v4.jsonl", "r", encoding="utf-8") as f:
  for line in f:
    try:
      # Named `record` (not `data`) so the `data` list of documents built by
      # the CSV loader cell is not overwritten with a dict.
      record = json.loads(line)
    except ValueError:
      # json.JSONDecodeError is a ValueError: report the line and move on.
      print('Invalid input:',line)
      continue

    ids.append(record["id"])
    text.append(record["text"])
    acronyms.append(record["acronym"])
    options.append(record["options"])

Perform prediction

In [None]:
# Query the RAG chain once per test example and collect the predictions.
# val_results maps example position in the test file -> (predicted boolean
# list, attempt number). Note the key is the position, not the example id.
val_results = {}
# Work queue of (example position, attempt number) pairs.
examples_to_process = [(i,0) for i in range(len(text))]
# An example whose answer is unusable is re-queued at most this many times.
max_retries = 1

count = 0  # number of queries attempted, retries included
while examples_to_process:
  if count % 10 == 0:
    print(len(examples_to_process))  # progress: current queue length

  idx, attempt = examples_to_process.pop(0)

  # Query the LLM
  # Alternative French prompt, kept for reference:
  #response = rag_chain.invoke(
  #  "Trouve ce que signifie " + acronyms[idx] +
  #  " dans cette phrase : " + text[idx] +
  #  ". Évalue les options suivantes en donnant la liste booléenne comme réponse : " + str(options[idx]) +
  #  ". Ne produis que la liste booléenne. N’ajoute aucune explication ni autre texte."
  #)
  prompt = (
    f"Find what is {acronyms[idx]} in this phrase: {text[idx]}"
    f". Evaluate the following options by giving the boolean list as an answer: {options[idx]}"
    ". Output only the boolean list. Do not output any explanation nor other text"
  )
  try:
    response = rag_chain.invoke(prompt)
  except Exception as err:
    # A failed call (network, rate limit, ...) must not abort the whole run and
    # throw away the answers already collected: report it and treat it like an
    # unusable answer, so it uses up one attempt.
    print(f"Query failed for example {idx} (attempt {attempt}): {err}")
    labels_pred = None
  else:
    # Process the result
    labels_pred = extract_bool_list(response)

  # Accept the answer only if it has exactly one boolean per option;
  # otherwise retry while attempts remain.
  if labels_pred is not None and len(labels_pred) == len(options[idx]):
    val_results[idx] = (labels_pred, attempt)
  elif attempt < max_retries:
    examples_to_process.append((idx, attempt + 1))
  count += 1

518
508
498
488
478
468
458
448
438
428
418
408
398
388
378
368
358
348
338
328
318
308
298
288
278
268
258
248
238
228
218
208
198
188
178
168
158
148
138
128
118
108
98
88
78
68
58
48
38
28
18
8


In [None]:
# Number of test examples for which a usable prediction was stored.
len(val_results)

519

In [None]:
# Test predictions keyed by example position: (boolean list, attempt number on which it was obtained).
val_results

{0: ([False, True, False, False], 0),
 1: ([False,
   True,
   False,
   False,
   False,
   False,
   False,
   False,
   False,
   False,
   False,
   False,
   False],
  0),
 2: ([True, False], 0),
 3: ([False, True, False, False], 0),
 4: ([False, False, False, True, False], 0),
 5: ([False, False, True], 0),
 6: ([False, False, False, False, False, False], 0),
 7: ([False, True, True, False], 0),
 8: ([False, False, True], 0),
 9: ([False, False, True, False], 0),
 10: ([False, True, False, False], 0),
 11: ([False, True, False, False], 0),
 12: ([False, False, False, True], 0),
 13: ([False, True, False, False], 0),
 14: ([True, False, False, True], 0),
 15: ([False, True, False, False], 0),
 16: ([False, True], 0),
 17: ([False, False, True, False], 0),
 18: ([False, True, True, False], 0),
 19: ([False, False, True], 0),
 20: ([False, True, False, False, False, False, False], 0),
 21: ([False, True], 0),
 22: ([False, True, False, False, False, False], 0),
 23: ([True, True, Fa

Save the predictions to a CSV file

In [None]:
import csv

# Write the submission file: a header row, then one row per test example with
# its id and the list of indices of the options predicted True ('[]' when no
# usable prediction was stored for that example).
header = ['id','prediction']

# The context manager closes (and flushes) the file even if a row fails.
with open('drive/MyDrive/data/defi-text-mine-egc-2026/submission_gpt5_rag.csv', 'w', newline='', encoding='UTF8') as f_out:
    tab = csv.writer(f_out,delimiter=',')
    tab.writerow(header)

    # val_results is keyed by the example's position in the test file, not by
    # its id, so predictions are looked up by position. Looking them up by id
    # only works when every id happens to equal its position.
    for pos, example_id in enumerate(ids):
        if pos not in val_results:
            tab.writerow([str(example_id),'[]'])
        else:
            tab.writerow([str(example_id),str([i for i, val in enumerate(val_results[pos][0]) if val])])