# Benchmark des ajouts de variables catégorielles

## Environnement

In [None]:
import time
import glob
from pathlib import Path
import sys
import s3fs
from typing import List, Optional, Dict
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
import pyarrow.parquet as pq
import fasttext
import os
import warnings
import pytorch_lightning as pl
from pytorch_lightning.loggers import CSVLogger, TensorBoardLogger
from tensorboard.backend.event_processing import event_accumulator
import torch
from torch import nn
from torch.optim import Adam, SGD
from pytorch_lightning.callbacks import (
    EarlyStopping,
    LearningRateMonitor,
    ModelCheckpoint,
)
import unidecode
# from src.model_negsamp import FastTextModule_negsamp, FastTextModel_negsamp
from src.model import FastTextModule, FastTextModel
from src.dataset import FastTextModelDataset
from src.tokenizer import NGramTokenizer
from src.preprocess import clean_text_feature
import warnings
import nltk
nltk.download('stopwords')
from tensorboard.backend.event_processing import event_accumulator
import matplotlib.pyplot as plt

In [None]:
# Show every column when a wide DataFrame is rendered.
pd.set_option('display.max_columns', None)
# NOTE: this silences *all* warnings for the remainder of the notebook.
warnings.filterwarnings("ignore")

# Seed every random number generator used below (NumPy, the standard library
# and PyTorch) so that the PyTorch training runs are repeatable as well.
np.random.seed(0)
random.seed(0)
torch.manual_seed(0)

MLflow

In [None]:
# Experiment-tracking configuration: keep whichever tracking URI is already
# configured and select the experiment used by this notebook.
experiment_name = "benchmark_categorial"
run_name = ""
remote_server_uri = mlflow.get_tracking_uri()

mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment(experiment_name)


Logging local du modèle avec TensorBoard

In [None]:
# One TensorBoard logger per PyTorch variant, both rooted in the "logs" directory.
path_logger_torch, path_logger_torch_tweeked = "model_torch", "model_torch_tweeked"
tb_logger = TensorBoardLogger(save_dir="logs", name=path_logger_torch)
tb_logger_libelle_tweeked = TensorBoardLogger(
    save_dir="logs", name=path_logger_torch_tweeked
)

Hyper-paramètres

In [None]:
# Hyper-parameters shared by the three benchmarked variants.
params = dict(
    # --- data ---
    y_name="nace2025",
    text_feature="libelle",
    text_feature_tweeked="libelle_tweeked",
    df_sample_size=50000,
    train_proportion=0.8,
    categorical_features=["activ_nat_et", "liasse_type"],
    # --- settings used by both fasttext and the PyTorch model ---
    max_epochs=5,  # alternative value noted by the author: 50
    lr=0.004,
    buckets=2000000,
    dim=200,  # alternative value noted by the author: 180
    minCount=1,
    minn=3,
    maxn=5,
    wordNgrams=3,
    # --- fasttext-only settings ---
    ft_thread=100,
    ft_loss="softmax",  # options noted by the author: "softmax", "ova"
    ft_lrUpdateRate=100,
    ft_neg=5,
    # --- PyTorch-only settings ---
    torch_batch_size=64,
    torch_patience=3,
    torch_sparse=True,
    torch_num_workers=4,
)


Data

In [None]:
# S3 client for the MinIO endpoint; credentials are read from the environment
# (a missing variable raises KeyError).
fs = s3fs.S3FileSystem(
    client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"},
    key=os.environ["AWS_ACCESS_KEY_ID"],
    secret=os.environ["AWS_SECRET_ACCESS_KEY"],
    token=os.environ["AWS_SESSION_TOKEN"],
)

# Read the parquet dataset from the bucket and materialise it as a DataFrame.
parquet_path = "projet-ape/NAF-revision/relabeled-data/20241027_sirene4_nace2025.parquet"
df = pq.ParquetDataset(parquet_path, filesystem=fs).read_pandas().to_pandas()


In [None]:
# Preview only the first rows instead of rendering the whole DataFrame.
df.head()

In [None]:
# Report how many target values are empty strings or missing.
# The inner subscripts use single quotes so that the f-strings also parse on
# Python versions older than 3.12 (which reject nested identical quotes).
print(f"Nombre de valeurs vide : {(df[params['y_name']] == '').sum()}")
print(f"Nombre de valeurs NA : {df[params['y_name']].isna().sum()}")

# Rows without a target cannot be used for supervised training.
# NOTE(review): empty-string targets are counted above but not removed here.
df = df.dropna(subset=[params["y_name"]])

# Work on a reproducible random sample; the size is capped at the number of
# available rows so that the cell does not fail on a smaller dataset.
df = df.sample(min(params["df_sample_size"], len(df)), random_state=123)

# Keep only the classes observed at least 3 times in the sample
# (presumably so that the stratified split below has enough rows per class).
counts = df[params["y_name"]].value_counts()
modalites_suffisantes = counts[counts >= 3].index
df = df[df[params["y_name"]].isin(modalites_suffisantes)]

print(f"Shape of sampled df after removal of rare outcomes : {len(df)}")

In [None]:
# Clean the text feature with the project's preprocessing helper
# (the exact transformations are defined in src.preprocess.clean_text_feature).
df = clean_text_feature(df, text_feature=params["text_feature"])



Ajout d'une variable textuelle de concaténation du libellé textuel et des variables catégorielles (astuce utilisée avec la lib fasttext dans les modèles en prod)

In [None]:
# Build the "tweeked" text: the text feature followed by one
# " <feature>_<value>" token per categorical feature.
# This is done column-wise rather than with a row-by-row loop, and without
# printing every row (which flooded the cell output).
tweeked_text = df[params["text_feature"]]
for feature in params["categorical_features"]:
    tweeked_text = tweeked_text + f" {feature}_" + df[feature].astype(str)
df[params["text_feature_tweeked"]] = tweeked_text

# Show a few random examples of the result.
df[params["text_feature_tweeked"]].sample(10)


In [None]:
# Encode the target as integer class ids. `encoder` stays bound to the target
# encoder so that predicted ids can later be mapped back to the original codes
# (it used to be overwritten by the loop below).
encoder = LabelEncoder()
df[params["y_name"]] = encoder.fit_transform(df[params["y_name"]])

# Encode each categorical feature with its own encoder, kept by feature name.
categorical_encoders = {}
for var_categ_name in params["categorical_features"]:
    categorical_encoders[var_categ_name] = LabelEncoder()
    df[var_categ_name] = categorical_encoders[var_categ_name].fit_transform(
        df[var_categ_name]
    )


In [None]:
# Sanity check: number of classes that have fewer than 3 rows.
print(df[params["y_name"]].value_counts().lt(3).sum())

In [None]:
# Columns carried into the split: both text variants plus the categorical features.
feature_columns = [
    params["text_feature"],
    params["text_feature_tweeked"],
    *params["categorical_features"],
]
target = df[params["y_name"]]

# Shuffled split, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    df[feature_columns],
    target,
    test_size=1 - params["train_proportion"],
    random_state=0,
    shuffle=True,
    stratify=target,
)

# Side-by-side frames (features + target) for the fasttext text files.
df_train = pd.concat((X_train, y_train), axis="columns")
df_val = pd.concat((X_val, y_val), axis="columns")

In [None]:
# Number of distinct classes present in the training target.
num_classes = y_train.nunique(dropna=True)
print(f"Nombre de classes dans y_train : {num_classes}")

## Cas 1 : FastText 

On modifie (« tweak ») le libellé textuel fourni en entrée de FastText en y concaténant les variables catégorielles.

In [None]:
def write_training_data(
    df: pd.DataFrame,
    y: str,
    text_feature: str,
    categorical_features: Optional[List[str]],
    label_prefix: str = "__label__",
) -> str:
    """
    Write one training line per row to "data/training_data.txt".

    Each line has the form
    "<label_prefix><y value> <text> <feature>_<value> ...", with one
    "<feature>_<value>" token per categorical feature, in the given order.

    Args:
        df (pd.DataFrame): DataFrame holding the target, the text and the
            categorical columns.
        y (str): Output variable name.
        text_feature (str): Text feature column name.
        categorical_features (Optional[List[str]]): Categorical feature
            column names. None and an empty list both mean "no categorical
            feature".
        label_prefix (str, optional): Label prefix. Defaults to "__label__".

    Returns:
        str: Training data path (POSIX style).
    """
    training_data_path = Path("data/training_data.txt")
    # Create the output directory if needed so that a fresh checkout works.
    training_data_path.parent.mkdir(parents=True, exist_ok=True)

    # Normalise None to an empty list: iterating over None would raise.
    features = list(categorical_features or [])

    with open(training_data_path, "w", encoding="utf-8") as file:
        for _, item in df.iterrows():
            tokens = [f"{label_prefix}{item[y]}", f"{item[text_feature]}"]
            tokens.extend(f"{feature}_{item[feature]}" for feature in features)
            file.write(" ".join(tokens) + "\n")
    return training_data_path.as_posix()

In [None]:
# Dump the training split to the fasttext-specific .txt file.
training_data_path = write_training_data(
    df_train,
    params["y_name"],
    params["text_feature"],
    params["categorical_features"],
)


In [None]:
# Train the fasttext model and measure how long the call takes.
ft_training_args = dict(
    input=training_data_path,
    dim=params["dim"],
    lr=params["lr"],
    epoch=params["max_epochs"],
    lrUpdateRate=params["ft_lrUpdateRate"],
    neg=params["ft_neg"],
    wordNgrams=params["wordNgrams"],
    minn=params["minn"],
    maxn=params["maxn"],
    minCount=params["minCount"],
    bucket=params["buckets"],
    thread=params["ft_thread"],
    loss=params["ft_loss"],
    label_prefix="__label__",
    verbose=2,
)

start_time = time.time()
model_ft = fasttext.train_supervised(**ft_training_args)
end_time = time.time()

# Elapsed wall-clock time, in minutes.
elapsed_time_ft = (end_time - start_time) / 60
print("Temps écoulé pour entrainer la lib fasttext : ", elapsed_time_ft, " minutes")

Preprocess val data

In [None]:
# Build the texts to classify with the same layout as the training file
# written by write_training_data: the text followed by one
# "<feature>_<value>" token per categorical feature. The validation texts
# previously omitted these tokens, so the model was evaluated on inputs that
# differed from what it was trained on.
# The target label is deliberately left out of the text passed to predict().
val_input = []
for _, item in df_val.iterrows():
    formatted_item = f"{item[params['text_feature']]}"
    for feature in params["categorical_features"]:
        formatted_item += f" {feature}_{item[feature]}"
    val_input.append(formatted_item)

Calculate accuracy

In [None]:
# Top-1 predicted label for each validation text, with the label prefix removed.
predictions = [
    top_labels[0].replace("__label__", "")
    for top_labels in model_ft.predict(val_input, k=1)[0]
]

# A prediction is correct when it equals the string form of the encoded target.
booleans = [
    str(label) == prediction
    for prediction, label in zip(predictions, df_val[params["y_name"]])
]

# Share of correct predictions.
accuracy_ft = sum(booleans) / len(booleans)
accuracy_ft

## Cas 2 : Réimplémentation PyTorch avec intégration des variables catégorielles

In [None]:
# torch.set_num_threads(1)

Texte d'origine et variables additionnelles proprement intégrées au modèle

In [None]:
# Fit the n-gram tokenizer on the original (non-tweeked) training texts.
training_text = X_train[params["text_feature"]].tolist()
tokenizer = NGramTokenizer(
    params["minCount"],
    params["minn"],
    params["maxn"],
    params["buckets"],
    params["wordNgrams"],
    training_text,
)

In [None]:
# Datasets: the categorical features are passed as separate inputs,
# one list of values per feature.
train_dataset = FastTextModelDataset(
    categorical_variables=[
        X_train[column].to_list() for column in params["categorical_features"]
    ],
    texts=training_text,
    outputs=y_train.to_list(),
    tokenizer=tokenizer,
)
val_dataset = FastTextModelDataset(
    categorical_variables=[
        X_val[column].to_list() for column in params["categorical_features"]
    ],
    texts=X_val[params["text_feature"]].to_list(),
    outputs=y_val.to_list(),
    tokenizer=tokenizer,
)

# Both loaders share the same batch size and number of workers.
loader_kwargs = dict(
    batch_size=params["torch_batch_size"],
    num_workers=params["torch_num_workers"],
)
train_dataloader = train_dataset.create_dataloader(**loader_kwargs)
val_dataloader = val_dataset.create_dataloader(**loader_kwargs)


In [None]:
# Number of target classes over the whole (encoded) DataFrame.
num_classes = df[params["y_name"]].nunique()

# The categorical features were label-encoded on the full DataFrame, so their
# codes are integers from 0 up to the column maximum. The sizes are therefore
# taken from the full DataFrame: counting the distinct values of X_train alone
# gives a size that is too small whenever a code only occurs in the
# validation split.
# NOTE(review): assumes FastTextModel uses each size as the number of rows of a
# lookup table indexed by the encoded value — confirm in src.model.
categorical_vocabulary_sizes = [
    int(df[feature].max()) + 1 for feature in params["categorical_features"]
]
print(categorical_vocabulary_sizes)

In [None]:
# The padding index is placed right after the hashed buckets and the
# tokenizer's words; the vocabulary has one more entry than that index.
padding_idx = params["buckets"] + tokenizer.get_nwords()
model_torch = FastTextModel(
    embedding_dim=params["dim"],
    vocab_size=padding_idx + 1,
    num_classes=num_classes,
    categorical_vocabulary_sizes=categorical_vocabulary_sizes,
    padding_idx=padding_idx,
    sparse=params["torch_sparse"],
)

In [None]:
# Optimizer & scheduler: SGD when the embeddings are sparse, Adam otherwise
# (presumably because Adam does not accept sparse gradients).
optimizer = SGD if params["torch_sparse"] else Adam
optimizer_params = dict(lr=params["lr"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau
scheduler_params = dict(mode="min", patience=params["torch_patience"])

# Lightning module wrapping the model, the loss and the optimisation setup.
module_torch = FastTextModule(
    model=model_torch,
    loss=nn.CrossEntropyLoss(),
    optimizer=optimizer,
    optimizer_params=optimizer_params,
    scheduler=scheduler,
    scheduler_params=scheduler_params,
    scheduler_interval="epoch",
)

# Callbacks: keep the best checkpoint (lowest validation loss), stop early when
# the validation loss stalls, and record the learning rate at every step.
checkpoints = [
    dict(monitor="validation_loss", save_top_k=1, save_last=False, mode="min")
]
callbacks = [
    *(ModelCheckpoint(**checkpoint) for checkpoint in checkpoints),
    EarlyStopping(
        monitor="validation_loss",
        patience=params["torch_patience"],
        mode="min",
    ),
    LearningRateMonitor(logging_interval="step"),
]

# Strategy
strategy = "auto"

# Trainer, logging to the TensorBoard logger of this variant.
trainer_torch = pl.Trainer(
    callbacks=callbacks,
    max_epochs=params["max_epochs"],
    num_sanity_val_steps=2,
    strategy=strategy,
    log_every_n_steps=2,
    logger=tb_logger,
)

# Pre-training housekeeping; the last expression displays the thread count.
torch.cuda.empty_cache()
torch.set_float32_matmul_precision("medium")
torch.get_num_threads()


In [None]:
# Time the training of the PyTorch re-implementation (case 2). The thread
# count is printed before and after the fit so that any change is visible.
print(f"threads avant fit = {torch.get_num_threads()}")
start_time = time.time()

# Train on the training loader and validate on the validation loader.
trainer_torch.fit(module_torch, train_dataloader, val_dataloader)

end_time = time.time()
# Elapsed wall-clock time, converted from seconds to minutes.
elapsed_time_torch = (end_time - start_time) / 60
print(f"threads après fit = {torch.get_num_threads()}")
print("Temps écoulé pour entrainer la réimplementation PyTorch : ", elapsed_time_torch, " minutes")

In [None]:
# Switch the model to evaluation mode.
# NOTE(review): this evaluates the weights currently held by `model_torch`,
# not the best checkpoint saved by ModelCheckpoint.
model_torch.eval()

# Accumulators for the true labels and the predicted classes.
all_labels = []
all_preds = []

# Evaluation loop over the validation DataLoader, without gradient tracking.
with torch.no_grad():
    for batch in val_dataloader:
        # The last element of a batch is the label; everything before it is
        # the model input.
        inputs, labels = batch[:-1], batch[-1]
        outputs = model_torch(inputs)
        # Index of the highest score = predicted class.
        _, preds = torch.max(outputs, 1)

        # Move to CPU before converting to NumPy so that the conversion also
        # works when the tensors live on another device.
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

# Metrics computed with scikit-learn; 'weighted' averages the per-class scores
# weighted by class support.
accuracy_torch = accuracy_score(all_labels, all_preds)
precision_torch = precision_score(all_labels, all_preds, average='weighted')
recall_torch = recall_score(all_labels, all_preds, average='weighted')
f1_torch = f1_score(all_labels, all_preds, average='weighted')

print(f"Accuracy: {accuracy_torch:.4f}")
print(f"Precision: {precision_torch:.4f}")
print(f"Recall: {recall_torch:.4f}")
print(f"F1 Score: {f1_torch:.4f}")


## Cas 3 : Torch avec intégration des variables catégorielles dans le libellé textuel

In [None]:
# Fit a new n-gram tokenizer on the tweeked training texts.
# Note that this rebinds `tokenizer`, replacing the tokenizer of case 2.
training_text_tweeked = X_train[params["text_feature_tweeked"]].tolist()
tokenizer = NGramTokenizer(
    params["minCount"],
    params["minn"],
    params["maxn"],
    params["buckets"],
    params["wordNgrams"],
    training_text_tweeked,
)

In [None]:
# Datasets for case 3: the categorical information is already inside the text,
# so no separate categorical variable is passed.
train_dataset = FastTextModelDataset(
    categorical_variables=[],
    texts=training_text_tweeked,
    outputs=y_train.to_list(),
    tokenizer=tokenizer,
)
val_dataset = FastTextModelDataset(
    categorical_variables=[],
    texts=X_val[params["text_feature_tweeked"]].to_list(),
    outputs=y_val.to_list(),
    tokenizer=tokenizer,
)

# Both loaders share the same batch size and number of workers.
loader_kwargs = dict(
    batch_size=params["torch_batch_size"],
    num_workers=params["torch_num_workers"],
)
train_dataloader = train_dataset.create_dataloader(**loader_kwargs)
val_dataloader = val_dataset.create_dataloader(**loader_kwargs)


In [None]:
# Number of target classes over the whole (encoded) DataFrame.
num_classes = df[params["y_name"]].nunique()

# Case 3 feeds no separate categorical variable to the model.
categorical_vocabulary_sizes = []
print(categorical_vocabulary_sizes)

In [None]:
# The padding index is placed right after the hashed buckets and the
# tokenizer's words; the vocabulary has one more entry than that index.
padding_idx = params["buckets"] + tokenizer.get_nwords()
model_torch_libelle_tweeked = FastTextModel(
    embedding_dim=params["dim"],
    vocab_size=padding_idx + 1,
    num_classes=num_classes,
    categorical_vocabulary_sizes=categorical_vocabulary_sizes,
    padding_idx=padding_idx,
    sparse=params["torch_sparse"],
)

In [None]:
# Optimizer & scheduler: SGD when the embeddings are sparse, Adam otherwise
# (presumably because Adam does not accept sparse gradients).
optimizer = SGD if params["torch_sparse"] else Adam
optimizer_params = dict(lr=params["lr"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau
scheduler_params = dict(mode="min", patience=params["torch_patience"])

# Lightning module wrapping the tweeked-text model.
module_libelle_tweeked = FastTextModule(
    model=model_torch_libelle_tweeked,
    loss=nn.CrossEntropyLoss(),
    optimizer=optimizer,
    optimizer_params=optimizer_params,
    scheduler=scheduler,
    scheduler_params=scheduler_params,
    scheduler_interval="epoch",
)

# Callbacks: keep the best checkpoint (lowest validation loss), stop early when
# the validation loss stalls, and record the learning rate at every step.
checkpoints = [
    dict(monitor="validation_loss", save_top_k=1, save_last=False, mode="min")
]
callbacks = [
    *(ModelCheckpoint(**checkpoint) for checkpoint in checkpoints),
    EarlyStopping(
        monitor="validation_loss",
        patience=params["torch_patience"],
        mode="min",
    ),
    LearningRateMonitor(logging_interval="step"),
]

# Strategy
strategy = "auto"

# Trainer, logging to the TensorBoard logger of the tweeked variant.
trainer = pl.Trainer(
    callbacks=callbacks,
    max_epochs=params["max_epochs"],
    num_sanity_val_steps=2,
    strategy=strategy,
    log_every_n_steps=2,
    logger=tb_logger_libelle_tweeked,
)

# Pre-training housekeeping; the last expression displays the thread count.
torch.cuda.empty_cache()
torch.set_float32_matmul_precision("medium")
torch.get_num_threads()


In [None]:
# Time the training of the tweeked-text PyTorch model (case 3). The thread
# count is printed before and after the fit so that any change is visible.
print(f"threads avant fit = {torch.get_num_threads()}")
start_time = time.time()

# Train on the training loader and validate on the validation loader.
trainer.fit(module_libelle_tweeked, train_dataloader, val_dataloader)

end_time = time.time()
# Elapsed wall-clock time, converted from seconds to minutes.
elapsed_time_torch_tweeked = (end_time - start_time) / 60
print(f"threads après fit = {torch.get_num_threads()}")
print("Temps écoulé pour entrainer la réimplementation PyTorch tweeked : ", elapsed_time_torch_tweeked, " minutes")

In [None]:
# Switch the model to evaluation mode.
# NOTE(review): this evaluates the weights currently held by
# `model_torch_libelle_tweeked`, not the best checkpoint saved by ModelCheckpoint.
model_torch_libelle_tweeked.eval()

# Accumulators for the true labels and the predicted classes.
all_labels = []
all_preds = []

# Evaluation loop over the validation DataLoader, without gradient tracking.
with torch.no_grad():
    for batch in val_dataloader:
        # The last element of a batch is the label; everything before it is
        # the model input.
        inputs, labels = batch[:-1], batch[-1]
        outputs = model_torch_libelle_tweeked(inputs)
        # Index of the highest score = predicted class.
        _, preds = torch.max(outputs, 1)

        # Move to CPU before converting to NumPy so that the conversion also
        # works when the tensors live on another device.
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

# Metrics computed with scikit-learn; 'weighted' averages the per-class scores
# weighted by class support.
accuracy_libelle_tweeked = accuracy_score(all_labels, all_preds)
precision_libelle_tweeked = precision_score(all_labels, all_preds, average='weighted')
recall_libelle_tweeked = recall_score(all_labels, all_preds, average='weighted')
f1_libelle_tweeked = f1_score(all_labels, all_preds, average='weighted')

print(f"Accuracy: {accuracy_libelle_tweeked:.4f}")
print(f"Precision: {precision_libelle_tweeked:.4f}")
print(f"Recall: {recall_libelle_tweeked:.4f}")
print(f"F1 Score: {f1_libelle_tweeked:.4f}")


## Résultats

In [None]:
# print(f"Accuracy de la lib Fasttext: {accuracy_ft}")
# print(f"Accuracy de la réimplémentation PyTorch: {accuracy}")

Vérification sur la structure de chaque modèle

In [None]:
# total_params_expected = params["dim"] * (params["buckets"] + tokenizer.get_nwords() + 1) + ((params["dim"] * num_classes) + num_classes)
# torch_total_params = sum(p.numel() for p in model_torch.parameters())
# ft_embedding_dim = model_ft.get_input_matrix().shape[1]

# ft_nb_labels = len(model_ft.get_labels())
# ft_nb_words = len(model_ft.get_words())

# ft_vocab_size = model_ft.get_input_matrix().shape[0]
# torch_vocab_size = model_torch.embeddings.weight.shape[0]

# ft_total_params = ft_vocab_size * ft_embedding_dim + (ft_embedding_dim * ft_nb_labels + ft_nb_labels)

# print(f"Nombre de labels d'après FastText = {ft_nb_labels} ({num_classes} attendus)")
# print(f"Nombre de mots d'après FastText = {ft_nb_words} et d'après Torch = {tokenizer.get_nwords()}")

# print(f"Nombre de tokens d'après FastText = {ft_vocab_size}") 
# print(f"Nombre de tokens d'après Torch = {torch_vocab_size}") 

# print(f"Nombre total de paramètres dans Torch : {torch_total_params}") 
# print(f"Nombre total de paramètres dans Fasttext (attendu) : {ft_total_params}") 
# print(f"Nombre de paramètres attendus en théorie : {total_params_expected}")


Graphiques des accuracy d'entrainement et de validation au cours des epochs

In [None]:
def get_accuracy_chart(log_dir):
    """
    Plot train and validation accuracy from the newest TensorBoard event file.

    The function searches `log_dir` recursively for "*.tfevents.*" files, loads
    the most recently modified one and plots the 'train_accuracy_epoch' and
    'validation_accuracy' scalars against their recorded step.

    Args:
        log_dir: Directory containing the TensorBoard logs.

    Returns:
        The `matplotlib.pyplot` module, with the figure as current figure.

    Raises:
        ValueError: If no event file is found, or if one of the two accuracy
            scalars is missing (or empty) in the event file.
    """
    # Locate every event file below the log directory.
    pattern = os.path.join(log_dir, "**", "*.tfevents.*")
    event_files = glob.glob(pattern, recursive=True)
    if len(event_files) == 0:
        raise ValueError("No tfevents file found")

    # Keep only the most recently modified file.
    most_recent_file = max(event_files, key=os.path.getmtime)
    print(f"Most recent tfevent found: {most_recent_file}")

    # Load the events of that file.
    ea = event_accumulator.EventAccumulator(most_recent_file)
    ea.Reload()

    tags = ea.Tags()['scalars']
    print(f"Les tags disponibles : {tags}")

    def _scalars(tag):
        # Events recorded under `tag`, or an empty list when the tag is absent.
        return ea.Scalars(tag) if tag in tags else []

    train_events = _scalars('train_accuracy_epoch')
    val_events = _scalars('validation_accuracy')

    # Both series are required to draw the chart.
    if not train_events:
        raise ValueError("train_accuracy_epoch not found in tfevent tags")
    if not val_events:
        raise ValueError("validation_accuracy not found in tfevent tags")

    train_steps = [event.step for event in train_events]
    train_values = [event.value for event in train_events]
    val_steps = [event.step for event in val_events]
    val_values = [event.value for event in val_events]

    # Draw both curves on a fresh figure.
    plt.figure(figsize=(10, 5))
    plt.plot(train_steps, train_values, label='Train Accuracy', color='blue')
    plt.plot(val_steps, val_values, label='Validation Accuracy', color='red')

    # Axes, title, legend and grid.
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title("Evolution de l'accuracy au cours des epochs (Train vs Validation)")
    plt.legend()
    plt.grid(True)
    return plt


In [None]:
# Accuracy curves of the PyTorch model with separate categorical inputs (case 2).
log_dir = f"logs/{path_logger_torch}"
fig_torch = get_accuracy_chart(log_dir)
fig_torch.show()

In [None]:
# Accuracy curves of the PyTorch model trained on the tweeked text (case 3).
log_dir_tweek = f"logs/{path_logger_torch_tweeked}"
fig_torch_tweek = get_accuracy_chart(log_dir_tweek)
fig_torch_tweek.show()

## Logging sur MLflow

In [None]:
with mlflow.start_run(run_name=run_name):
    # Metrics of the three variants. The case 2 (`*_torch`) metrics used to be
    # missing: the `*_libelle_tweeked` metrics were logged twice instead.
    metrics = {
        # Case 2: PyTorch model with separate categorical inputs.
        "accuracy_torch": accuracy_torch,
        "precision_torch": precision_torch,
        "recall_torch": recall_torch,
        "f1_torch": f1_torch,
        # Case 3: PyTorch model trained on the tweeked text.
        "accuracy_libelle_tweeked": accuracy_libelle_tweeked,
        "precision_libelle_tweeked": precision_libelle_tweeked,
        "recall_libelle_tweeked": recall_libelle_tweeked,
        "f1_libelle_tweeked": f1_libelle_tweeked,
        # Case 1: fasttext library.
        "accuracy_fasttext": accuracy_ft,
        # Training times, in minutes.
        "time_fasttext": elapsed_time_ft,
        "time_torch": elapsed_time_torch,
        "time_torch_tweeked": elapsed_time_torch_tweeked,
    }
    for metric_name, metric_value in metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Hyper-parameters, logged in alphabetical order.
    for param_name in sorted(params):
        mlflow.log_param(param_name, params[param_name])

    # Artifacts needed to reproduce the run: dependencies, sources, notebook.
    mlflow.log_artifact("requirements.txt")
    mlflow.log_artifacts("src/", artifact_path="src")
    mlflow.log_artifact("./benchmark_test.ipynb", artifact_path="nb")

In [None]:
# Display the number of threads PyTorch currently reports, after both trainings.
torch.get_num_threads()

Questions : 

- Sur la réimplémentation torch :
    - Pourquoi pas de référence à softmax, ova, etc. dans la définition du modèle torch ?
- Sur le modèle de la lib FastText : 
    - comment on fait du negative sampling en même temps que le classifier ?

Todo : 
- loss curves (adaptation du module lightning pour conserver les loss des epochs)

## Tests