# Benchmark FastText vs PyTorch sur APE

## Environnement

In [None]:
import time
from pathlib import Path
import sys
import s3fs
from typing import List, Optional, Dict
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
import pyarrow.parquet as pq
import fasttext
import os
import warnings
import pytorch_lightning as pl
import torch
from torch import nn
from torch.optim import Adam, SGD
from pytorch_lightning.callbacks import (
    EarlyStopping,
    LearningRateMonitor,
    ModelCheckpoint,
)
import unidecode
from src.model import FastTextModule, FastTextModel
from src.dataset import FastTextModelDataset
from src.tokenizer import NGramTokenizer
from src.preprocess import clean_text_feature
import warnings
import nltk
nltk.download('stopwords')

In [None]:
# Notebook-wide display and warning configuration.
pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

# Seed every random number generator the notebook relies on. PyTorch is
# seeded as well so that the torch training run is repeatable, not only the
# numpy / stdlib parts.
np.random.seed(0)
random.seed(0)
torch.manual_seed(0)

MLflow

In [None]:
# Names under which the benchmark is recorded in MLflow.
experiment_name = "benchmark_fasttext"
run_name = ""

# Re-apply the tracking URI MLflow currently resolves, then select the experiment.
remote_server_uri = mlflow.get_tracking_uri()
mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment(experiment_name)


Hyper-paramètres

In [None]:
# Hyper-parameters of the benchmark. Keys prefixed with `ft_` are only passed to
# the fasttext library, keys prefixed with `torch_` only to the PyTorch pipeline;
# the remaining ones are read by both.
params = dict(
    # Data
    y_name="nace",
    text_feature="text",
    df_sample_size=50000,
    # Shared training settings
    max_epochs=50,
    train_proportion=0.8,
    lr=0.4,
    buckets=2000000,
    dim=180,
    minCount=1,
    minn=3,
    maxn=5,
    wordNgrams=3,
    # fasttext only
    ft_thread=100,
    ft_loss="ova",  # alternative: "softmax"
    ft_lrUpdateRate=0,  # alternative: 100
    ft_neg=1000,  # alternative: 5
    # PyTorch only
    torch_batch_size=64,
    torch_patience=3,
    torch_sparse=True,
    torch_num_workers=20,
)

# Categorical columns fed to the torch model (none in this benchmark).
categorical_features_torch = []

Data

In [None]:
# Anonymous access to the S3-compatible endpoint that hosts the dataset.
fs = s3fs.S3FileSystem(
    anon=True,
    client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"},
)

# Open the parquet dataset through that filesystem and load it as a pandas DataFrame.
firm_activity_dataset = pq.ParquetDataset(
    "projet-formation/diffusion/mlops/data/firm_activity_data.parquet",
    filesystem=fs,
)
df = firm_activity_dataset.read_pandas().to_pandas()


In [None]:
# Report missing outcomes (empty strings and NA are counted separately).
# The target column is always read from `params`, and the f-strings avoid
# nested same-type quotes so they also parse on Python versions before 3.12.
target_column = params["y_name"]
n_empty = (df[target_column] == "").sum()
n_missing = df[target_column].isna().sum()
print(f"Nombre de valeurs vide : {n_empty}")
print(f"Nombre de valeurs NA : {n_missing}")

# Draw the working sample. The size is capped at the number of available rows
# so that the cell does not raise when `df` is smaller than the requested
# sample (for instance when this cell is executed a second time).
sample_size = min(params["df_sample_size"], len(df))
df = df.sample(sample_size, random_state=123)

# Keep only the outcomes observed at least 3 times in the sample.
counts = df[target_column].value_counts()
modalites_suffisantes = counts[counts >= 3].index
df = df[df[target_column].isin(modalites_suffisantes)]

print(f"Shape of sampled df after removal of rare outcomes : {len(df)}")


In [None]:
# Clean the text feature. The column name comes from `params` so that it
# stays consistent with every other cell that reads params["text_feature"].
df = clean_text_feature(df, text_feature=params["text_feature"])

# Encode the outcome classes as integers (the encoder is fitted on the filtered sample).
encoder = LabelEncoder()
df[params["y_name"]] = encoder.fit_transform(df[params["y_name"]])

In [None]:
# Sanity check: number of encoded classes left with fewer than 3 observations.
rare_class_mask = df[params["y_name"]].value_counts() < 3
print(rare_class_mask.sum())

In [None]:
# List the encoded classes observed fewer than 3 times.
# They are stored under their own name so that `modalites_suffisantes`
# (the classes kept by the earlier filter) is not overwritten with the
# opposite selection.
counts = df[params["y_name"]].value_counts()
modalites_insuffisantes = counts[counts < 3].index
print(modalites_insuffisantes)

In [None]:
# Preview the first rows only instead of rendering the whole DataFrame.
df.head()


In [None]:
# Feature columns: the text plus every categorical variable requested for the
# torch model. With an empty `categorical_features_torch` this is the text
# column alone; with a non-empty list the columns are available when the
# datasets later index X_train / X_val by `categorical_features_torch`.
feature_columns = [params["text_feature"], *categorical_features_torch]

# Stratified, seeded split so that both splits follow the class distribution.
X_train, X_val, y_train, y_val = train_test_split(
    df[feature_columns],
    df[params["y_name"]],
    test_size=1 - params["train_proportion"],
    random_state=0,
    shuffle=True,
    stratify=df[params["y_name"]],
)

# Features and target side by side, as consumed by the fasttext cells.
df_train = pd.concat([X_train, y_train], axis=1)
df_val = pd.concat([X_val, y_val], axis=1)

In [None]:
# Number of distinct encoded classes present in the training split.
num_classes = y_train.nunique(dropna=True)
print(f"Nombre de classes dans y_train : {num_classes}")

## Cas 1 : FastText 

In [None]:
def write_training_data(
    df: pd.DataFrame,
    y: str,
    text_feature: str,
    categorical_features: Optional[List[str]],
    label_prefix: str = "__label__",
) -> str:
    """
    Write training data to a text file, one observation per line.

    Each line has the form
    ``<label_prefix><label> <text> [<feature>_<value> ...]``.

    Args:
        df (pd.DataFrame): DataFrame holding the label, text and categorical columns.
        y (str): Output variable name.
        text_feature (str): Text feature.
        categorical_features (Optional[List[str]]): Categorical features.
            ``None`` and an empty list both mean "no categorical feature".
        label_prefix (str, optional): Label prefix. Defaults to "__label__".

    Returns:
        str: Training data path (POSIX style).
    """
    training_data_path = Path("data/training_data.txt")
    # Create the output directory on first use instead of failing in open().
    training_data_path.parent.mkdir(parents=True, exist_ok=True)

    # Normalise None to "no categorical feature".
    features = list(categorical_features or [])

    # Iterate column-wise rather than with iterrows(): it is faster and keeps
    # each column's own dtype (iterrows builds one Series per row, which can
    # turn integer labels into floats when the frame is all numeric).
    columns = [df[y], df[text_feature]] + [df[feature] for feature in features]

    with open(training_data_path, "w", encoding="utf-8") as file:
        for label, text, *feature_values in zip(*columns):
            parts = [f"{label_prefix}{label} {text}"]
            parts.extend(
                f"{feature}_{value}"
                for feature, value in zip(features, feature_values)
            )
            file.write(" ".join(parts) + "\n")
    return training_data_path.as_posix()

In [None]:
# Dump the training split to the .txt file that fasttext trains from
# (no categorical feature is written for this benchmark).
training_data_path = write_training_data(
    df_train,
    params["y_name"],
    params["text_feature"],
    [],
)


In [None]:
# Train the fasttext model and measure how long it takes.
# time.perf_counter() is a monotonic, high-resolution clock: unlike
# time.time(), the measured duration cannot be distorted by a system clock
# adjustment happening during training.
start_time = time.perf_counter()

model_ft = fasttext.train_supervised(
    input=training_data_path,
    dim=params["dim"],
    lr=params["lr"],
    epoch=params["max_epochs"],
    lrUpdateRate=params["ft_lrUpdateRate"],
    neg=params["ft_neg"],
    wordNgrams=params["wordNgrams"],
    minn=params["minn"],
    maxn=params["maxn"],
    minCount=params["minCount"],
    bucket=params["buckets"],
    thread=params["ft_thread"],
    loss=params["ft_loss"],
    label_prefix="__label__",
    verbose=2
)

end_time = time.perf_counter()
# Elapsed training time, in minutes.
elapsed_time_ft = (end_time - start_time) / 60
print("Temps écoulé pour entrainer la lib fasttext : ", elapsed_time_ft, " minutes")

Preprocess val data

In [None]:
# Build the fasttext validation inputs from the text feature only.
# The gold label is deliberately kept out of the strings handed to `predict`:
# it is not a feature, and the accuracy cell reads the true labels from
# df_val directly. Iterating over the column also avoids iterrows() and the
# nested same-type quotes that only parse on Python 3.12+.
val_input = [str(text) for text in df_val[params["text_feature"]]]

Calculate accuracy

In [None]:
# Top-1 fasttext prediction for every validation text; the first element of
# the `predict` result holds the predicted labels, one sequence per input.
predicted_labels = model_ft.predict(val_input, k=1)[0]
predictions = [labels[0].replace("__label__", "") for labels in predicted_labels]

# Predictions are strings, so the true labels are compared through str().
true_labels = df_val[params["y_name"]]
booleans = [
    predicted == str(expected)
    for predicted, expected in zip(predictions, true_labels)
]

# Share of validation rows whose top-1 prediction matches the true label.
accuracy_ft = sum(booleans) / len(booleans)
accuracy_ft

## Cas 2 : Entraînement et évaluation avec la réimplémentation PyTorch

In [None]:
# torch.set_num_threads(1)

In [None]:
# Build the n-gram tokenizer from the training texts. Arguments are passed
# positionally, in the same order as before.
training_text = X_train[params["text_feature"]].to_list()
tokenizer = NGramTokenizer(
    params["minCount"],
    params["minn"],
    params["maxn"],
    params["buckets"],
    params["wordNgrams"],
    training_text,
)

In [None]:
def _build_dataset(features, texts, targets):
    """Wrap one split (features, texts, targets) in a FastTextModelDataset using the shared tokenizer."""
    return FastTextModelDataset(
        categorical_variables=[
            features[column].to_list()
            for column in features[categorical_features_torch]
        ],
        texts=texts,
        outputs=targets.to_list(),
        tokenizer=tokenizer,
    )


def _build_dataloader(dataset):
    """Create a dataloader for `dataset` with the batch size and worker count from `params`."""
    return dataset.create_dataloader(
        batch_size=params["torch_batch_size"],
        num_workers=params["torch_num_workers"],
    )


# Datasets first, then their dataloaders (same construction order as before).
train_dataset = _build_dataset(X_train, training_text, y_train)
val_dataset = _build_dataset(X_val, X_val[params["text_feature"]].to_list(), y_val)
train_dataloader = _build_dataloader(train_dataset)
val_dataloader = _build_dataloader(val_dataset)


In [None]:
# Embedding table size: one row per hash bucket and per tokenizer word, plus
# one extra last row whose index is passed as the padding index.
n_words = tokenizer.get_nwords()
padding_idx = params["buckets"] + n_words

model_torch = FastTextModel(
    embedding_dim=params["dim"],
    vocab_size=padding_idx + 1,
    num_classes=num_classes,
    categorical_vocabulary_sizes=[],  # use case without additional variables
    padding_idx=padding_idx,
    sparse=params["torch_sparse"],
)

In [None]:
# Optimizer & scheduler: SGD when the embeddings are sparse, Adam otherwise.
optimizer = SGD if params["torch_sparse"] else Adam
optimizer_params = {"lr": params["lr"]}
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau
scheduler_params = dict(mode="min", patience=params["torch_patience"])

# Lightning module bundling the model, its loss and the optimisation recipe.
module = FastTextModule(
    model=model_torch,
    loss=nn.CrossEntropyLoss(),
    optimizer=optimizer,
    optimizer_params=optimizer_params,
    scheduler=scheduler,
    scheduler_params=scheduler_params,
    scheduler_interval="epoch",
)

# Trainer callbacks: best-checkpoint saving, early stopping and LR monitoring,
# all driven by the validation loss.
# NOTE(review): early stopping and the LR scheduler share the same patience
# (params["torch_patience"]) — confirm this is intended.
checkpoints = [
    dict(monitor="validation_loss", save_top_k=1, save_last=False, mode="min"),
]
callbacks = [
    *(ModelCheckpoint(**checkpoint) for checkpoint in checkpoints),
    EarlyStopping(
        monitor="validation_loss",
        patience=params["torch_patience"],
        mode="min",
    ),
    LearningRateMonitor(logging_interval="step"),
]

# Strategy
strategy = "auto"

# Trainer
trainer = pl.Trainer(
    max_epochs=params["max_epochs"],
    callbacks=callbacks,
    strategy=strategy,
    num_sanity_val_steps=2,
    log_every_n_steps=2,
)

# Housekeeping before training; the last expression displays the thread count.
torch.cuda.empty_cache()
torch.set_float32_matmul_precision("medium")
torch.get_num_threads()


In [None]:
# Train the PyTorch re-implementation and measure how long it takes.
# time.perf_counter() is a monotonic, high-resolution clock: unlike
# time.time(), the measured duration cannot be distorted by a system clock
# adjustment happening during training.
print(f"threads avant fit = {torch.get_num_threads()}")
start_time = time.perf_counter()

trainer.fit(module, train_dataloader, val_dataloader)

end_time = time.perf_counter()
# Elapsed training time, in minutes.
elapsed_time_torch = (end_time - start_time) / 60
print(f"threads après fit = {torch.get_num_threads()}")
print("Temps écoulé pour entrainer la réimplementation PyTorch : ", elapsed_time_torch, " minutes")

In [None]:
# Switch the model to evaluation mode.
model_torch.eval()

# Accumulators for the true labels and the predicted classes.
all_labels = []
all_preds = []

# Inference over the validation DataLoader, with gradient tracking disabled.
with torch.no_grad():
    for batch in val_dataloader:
        # The last element of a batch is the label; everything before it is model input.
        inputs = batch[:-1]
        labels = batch[-1]

        # Predicted class = index of the highest score along dimension 1.
        outputs = model_torch(inputs)
        preds = torch.max(outputs, 1).indices

        # Tensors are converted with .numpy() directly, i.e. they are expected to be on CPU.
        all_labels.extend(labels.numpy())
        all_preds.extend(preds.numpy())

# scikit-learn metrics; precision, recall and F1 use the 'weighted' average.
weighted = {"average": "weighted"}
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, **weighted)
recall = recall_score(all_labels, all_preds, **weighted)
f1 = f1_score(all_labels, all_preds, **weighted)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")


## Résultats

In [None]:
# Validation accuracy of both implementations, one line each.
for implementation, score in (
    ("la lib Fasttext", accuracy_ft),
    ("la réimplémentation PyTorch", accuracy),
):
    print(f"Accuracy de {implementation}: {score}")

Vérification de la structure de chaque modèle

In [None]:
# Theoretical parameter count: the embedding table (dim x vocabulary rows)
# plus a linear output layer with bias (dim x classes + classes).
total_params_expected = params["dim"] * (params["buckets"] + tokenizer.get_nwords() + 1) + ((params["dim"] * num_classes) + num_classes)
torch_total_params = sum(p.numel() for p in model_torch.parameters())

# fastText side: the input matrix is large, so it is fetched once and both
# the number of rows and the embedding dimension are read from its shape.
ft_vocab_size, ft_embedding_dim = model_ft.get_input_matrix().shape

ft_nb_labels = len(model_ft.get_labels())
ft_nb_words = len(model_ft.get_words())

torch_vocab_size = model_torch.embeddings.weight.shape[0]

# NOTE(review): this applies the same layout as the torch model (embedding +
# linear layer with bias) to the fastText sizes — confirm that fastText's
# output layer actually has a bias term.
ft_total_params = ft_vocab_size * ft_embedding_dim + (ft_embedding_dim * ft_nb_labels + ft_nb_labels)

print(f"Nombre de labels d'après FastText = {ft_nb_labels} ({num_classes} attendus)")
print(f"Nombre de mots d'après FastText = {ft_nb_words} et d'après Torch = {tokenizer.get_nwords()}")

print(f"Nombre de tokens d'après FastText = {ft_vocab_size}")
print(f"Nombre de tokens d'après Torch = {torch_vocab_size}")

print(f"Nombre total de paramètres dans Torch : {torch_total_params}")
print(f"Nombre total de paramètres dans Fasttext (attendu) : {ft_total_params}")
print(f"Nombre de paramètres attendus en théorie : {total_params_expected}")


## Logging sur MLflow

In [None]:
with mlflow.start_run(run_name=run_name):
    # Metrics: accuracy and training time (minutes) of both implementations.
    for metric_name, metric_value in (
        ("accuracy_torch", accuracy),
        ("accuracy_fasttext", accuracy_ft),
        ("time_fasttext", elapsed_time_ft),
        ("time_torch", elapsed_time_torch),
    ):
        mlflow.log_metric(metric_name, metric_value)

    # Hyper-parameters, logged in alphabetical order of their name.
    for param_name in sorted(params):
        mlflow.log_param(param_name, params[param_name])
    mlflow.log_param("categorical_features_torch", categorical_features_torch)

    # Artifacts: dependencies, source package and the notebook itself.
    mlflow.log_artifact("requirements.txt")
    mlflow.log_artifacts("src/", artifact_path="src")
    mlflow.log_artifact("./benchmark_test.ipynb", artifact_path="nb")

In [None]:
# Display the number of threads PyTorch reports at the end of the run.
torch.get_num_threads()

Questions : 

- Sur la réimplémentation torch :
    - Pourquoi n'y a-t-il pas de référence à softmax, ova, etc. dans la définition du modèle torch ?
- Sur le modèle de la lib FastText : 
    - Comment fait-on du negative sampling en même temps que l'entraînement du classifieur ?