In [32]:
%load_ext autoreload
%autoreload 2


from datetime import datetime
import os
from defi_textmine_2025.data.utils import TARGET_COL, LOGGING_DIR, INTERIM_DIR, MODELS_DIR, submission_path
import logging 

# Send INFO-and-above records to a timestamped log file under LOGGING_DIR.
# The file handler opens its target immediately, so the directory has to exist
# before basicConfig is called; otherwise a fresh checkout fails right here.
os.makedirs(LOGGING_DIR, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s|%(levelname)s|%(module)s.py:%(lineno)s] %(message)s',
    datefmt='%H:%M:%S',
    # One log file per configuration call, named after the current local time.
    filename=os.path.join(LOGGING_DIR, f'{datetime.now().strftime("%Y%m%dT%H%M%S")}.log'),
)

from sklearn.preprocessing import MultiLabelBinarizer
from defi_textmine_2025.modeling.data_preparation import load_csv, format_relations_str_to_list, one_hot_encode_relations, mlb


RANDOM_SEED = 1234 # fixed seed for reproducibility; passed as random_state to the shuffled K-fold split below

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [2]:
# Location of the labeled training data (reduced text with bracketed entities).
train_data_path = "data/defi-text-mine-2025/interim/reduced_text_w_entity_bracket/train"

# index_col=0 is forwarded to the project's load_csv helper.
labeled_df = load_csv(train_data_path, index_col=0)

# Quick size check: (rows, columns).
labeled_df.shape

(122044, 8)

In [5]:
labeled_df.head(2)

Unnamed: 0,text_index,e1_id,e2_id,e1_type,e2_type,text,relations,reduced_text
0,2576,1,0,FIRE,GATHERING,"Le matin du 10 janvier 2010, Arthur et Jacques...",,"Au milieu de l’[ interview ], un { incendie } ..."
1,2576,0,1,GATHERING,FIRE,"Le matin du 10 janvier 2010, Arthur et Jacques...",['HAS_CONSEQUENCE'],"Au milieu de l’{ interview }, un [ incendie ] ..."


# One-hot encoding of the relations (the task is multi-label)

In [18]:
# Step 1: convert the `relations` column with format_relations_str_to_list
# (the original frame is left untouched; assign returns a new one).
relations_as_lists_df = labeled_df.assign(
    relations=labeled_df.relations.apply(format_relations_str_to_list)
)

# Step 2: add one indicator column per relation label.
one_hot_df = one_hot_encode_relations(relations_as_lists_df)
one_hot_df.head(2)

Unnamed: 0,text_index,e1_id,e2_id,e1_type,e2_type,text,relations,reduced_text,CREATED,DEATHS_NUMBER,...,IS_OF_SIZE,IS_PART_OF,IS_REGISTERED_AS,OPERATES_IN,RESIDES_IN,STARTED_IN,START_DATE,WAS_CREATED_IN,WAS_DISSOLVED_IN,WEIGHS
0,2576,1,0,FIRE,GATHERING,"Le matin du 10 janvier 2010, Arthur et Jacques...",[],"Au milieu de l’[ interview ], un { incendie } ...",0,0,...,0,0,0,0,0,0,0,0,0,0
1,2576,0,1,GATHERING,FIRE,"Le matin du 10 janvier 2010, Arthur et Jacques...",[HAS_CONSEQUENCE],"Au milieu de l’{ interview }, un [ incendie ] ...",0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Spot check on the first two rows: only the second one is labeled
# HAS_CONSEQUENCE, so its indicator is expected to be 1 and the first 0.
one_hot_df["HAS_CONSEQUENCE"].head(2)

0    0
1    1
Name: HAS_CONSEQUENCE, dtype: int64

In [24]:
# Count the entity pairs that carry at least one relation:
# sum the indicator columns row-wise and keep rows whose total is positive.
has_relation_mask = one_hot_df[mlb.classes_].sum(axis=1).gt(0)
one_hot_df[has_relation_mask].shape

(26394, 45)

In [25]:
# Count the entity pairs that carry no relation:
# rows whose indicator columns sum to zero.
no_relation_mask = one_hot_df[mlb.classes_].sum(axis=1).eq(0)
one_hot_df[no_relation_mask].shape

(95650, 45)

# Cross-validation split

In [66]:
from sklearn.model_selection import KFold

# Shuffled K-fold over the rows of one_hot_df, seeded so the folds are reproducible.
# NOTE(review): the split is row-wise, so rows sharing a text_index can end up on
# both sides of a fold — confirm whether a grouped split is wanted here.
N_FOLDS = 5
kf = KFold(random_state=RANDOM_SEED, shuffle=True, n_splits=N_FOLDS)

# Materialise every (train, validation) pair of positional row indices, then
# report the sizes and indices of each fold.
fold_indices = list(kf.split(one_hot_df))
for train_index, validation_index in fold_indices:
    # After the loop, train_index / validation_index stay bound to the last fold.
    print(len(train_index), len(validation_index), train_index, validation_index)

97635 24409 [     0      1      2 ... 122041 122042 122043] [    11     12     13 ... 122028 122029 122038]
97635 24409 [     1      2      3 ... 122040 122042 122043] [     0      4     16 ... 122032 122035 122041]
97635 24409 [     0      1      2 ... 122040 122041 122043] [     8     18     37 ... 122027 122037 122042]
97635 24409 [     0      3      4 ... 122038 122041 122042] [     1      2      9 ... 122039 122040 122043]
97636 24408 [     0      1      2 ... 122041 122042 122043] [     3      5      6 ... 122033 122034 122036]


# Init model

In [64]:
from setfit import SetFitModel

# Multilingual sentence-transformer checkpoint used as the SetFit body.
model_id = "sentence-transformers/distiluse-base-multilingual-cased-v1"

# Number of relation labels, i.e. the width of the one-hot target.
num_classes = len(mlb.classes_)

# The targets built above are multi-label indicator vectors. SetFit's default
# head is a single-label classifier, so a multi-target strategy must be
# requested at load time; "one-vs-rest" fits one binary classifier per label.
# (Alternative, not used here: a differentiable head with
# use_differentiable_head=True and head_params={"out_features": num_classes}.)
model = SetFitModel.from_pretrained(
    model_id,
    multi_target_strategy="one-vs-rest",
)

# Inspect both halves of the model: the embedding body and the classification head.
print(model.model_body)
model.model_head

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 768, 'out_features': 512, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)


In [61]:
# Display the sentence-transformer body of the SetFit model on its own.
model.model_body

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 768, 'out_features': 512, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)

# Init dataset

In [65]:
from datasets import Dataset

In [69]:
# Build the Hugging Face training dataset from one fold of the K-fold split.
# KFold.split yields positional row indices, so rows are selected with .iloc;
# label-based .loc only gives the same rows when the index happens to be 0..n-1.
# NOTE: `train_index` is the loop variable left over from the cross-validation
# cell, i.e. the training indices of the last fold.
train_ds = Dataset.from_pandas(one_hot_df.iloc[train_index])
train_ds

Dataset({
    features: ['text_index', 'e1_id', 'e2_id', 'e1_type', 'e2_type', 'text', 'relations', 'reduced_text', 'CREATED', 'DEATHS_NUMBER', 'DIED_IN', 'END_DATE', 'GENDER_FEMALE', 'GENDER_MALE', 'HAS_CATEGORY', 'HAS_COLOR', 'HAS_CONSEQUENCE', 'HAS_CONTROL_OVER', 'HAS_FAMILY_RELATIONSHIP', 'HAS_FOR_HEIGHT', 'HAS_FOR_LENGTH', 'HAS_FOR_WIDTH', 'HAS_LATITUDE', 'HAS_LONGITUDE', 'HAS_QUANTITY', 'INITIATED', 'INJURED_NUMBER', 'IS_AT_ODDS_WITH', 'IS_BORN_IN', 'IS_BORN_ON', 'IS_COOPERATING_WITH', 'IS_DEAD_ON', 'IS_IN_CONTACT_WITH', 'IS_LOCATED_IN', 'IS_OF_NATIONALITY', 'IS_OF_SIZE', 'IS_PART_OF', 'IS_REGISTERED_AS', 'OPERATES_IN', 'RESIDES_IN', 'STARTED_IN', 'START_DATE', 'WAS_CREATED_IN', 'WAS_DISSOLVED_IN', 'WEIGHS', '__index_level_0__'],
    num_rows: 97636
})