# Benchmark Torch reimplementation and meta's Fasttext 

We use the same model in both cases.
No categorical variables

## Training

General params

In [None]:
!pip install -r ../requirements.txt -q
!pip install fasttext -q

In [None]:
import os
import sys
import time
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import s3fs
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import fasttext
from typing import List, Optional, Dict
from pathlib import Path
sys.path.append("../")
from torchFastText import torchFastText
from torchFastText.preprocess import clean_text_feature
from torchFastText.datasets import NGramTokenizer
sys.path.append("./notebooks")

from utils import add_libelles, clean_and_tokenize_df, stratified_split_rare_labels

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Hyperparamètres

In [2]:
params={
    "y_name": "apet_finale",
    "text_feature": "libelle_processed",
    "categorical_features": [] ,
    "max_epochs": 10,
    "lr": 0.001,
    "buckets": 10000,
    "dim": 80,
    "minCount": 1,
    "minn": 3,
    "maxn": 6,
    "wordNgrams": 3,
    "ft_thread": 100,
    "ft_loss": "softmax", #"ova", #"softmax",
    "ft_lrUpdateRate": 0, #100
    "ft_neg": 5, # 5
    "torch_batch_size": 256,
    "torch_patience_train": 3,
    "torch_sparse": False,
}

Data

In [4]:
fs = s3fs.S3FileSystem(
    client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"},
    anon=True,
)
df = (
    pq.ParquetDataset(
        "projet-ape/extractions/20241027_sirene4.parquet",
        filesystem=fs,
    )
    .read_pandas()
    .to_pandas()
).sample(frac=0.001).fillna(np.nan)
print(f"df contains {df.shape[0]} rows")

with fs.open("projet-ape/data/naf2008.csv") as file:
    naf2008 = pd.read_csv(file, sep=";")

categorical_features = ["evenement_type", "cj",  "activ_nat_et", "liasse_type", "activ_surf_et", "activ_perm_et"]
text_feature = "libelle"
y = params["y_name"]
textual_features = None
df = add_libelles(df, naf2008, y, text_feature, textual_features, categorical_features)
df[params["text_feature"]] = clean_text_feature(df["libelle"])
df


2025-03-06 10:12:50 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-06 10:12:50 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-06 10:12:50 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-06 10:12:50 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-06 10:12:51 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-06 10:12:54 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].


df contains 2645 rows
	*** 732 codes have been added in the database...



Unnamed: 0,liasse_numero,id,evenement_type,categorie_demande,cj,domas,ssdom,liasse_type,activ_sec_agri_et,activ_nat_et,...,date_modification,mode_calcul_apet,mode_calcul_apen,apet_finale,codif_apet,codif_apen,emetteur,libelle,is_apet,libelle_processed
1069034,J00049205636,2468905.0,01P,CG,,,,X,,,...,2023-12-31 12:03:37.860131+00:00,AUTO,,9003B,True,False,Guichet unique,Productions journalistiques (rédaction d'artic...,1.0,"product journalist (redact d'articles, intervi..."
1238859,J00058119132,2935832.0,01P,CG,,,,X,,,...,2024-02-20 17:45:49.734603+00:00,FASTTEXT,,9602B,True,False,Guichet unique,"Dermopigmentation, microblading et tricopigmen...",1.0,"dermopigmentation, microblading tricopigment m..."
1410803,G03525201664,3443831.0,54M,CG,5710,,,C,,99,...,2024-04-03 14:06:13.590839+00:00,AUTO,,3511Z,True,False,Greffe,Production délectricité,1.0,product delectricit
2305151,J00091356543,6763530.0,01M,CG,5710,,,C,,99,...,2024-09-05 14:54:02.924691+00:00,FASTTEXT,,4791A,True,False,Guichet unique,Commerce de détail de tous types de produits n...,1.0,commerc detail tous typ produit non reglement ...
717709,J00032512733,1544400.0,01P,CG,,,,C,,99,...,2023-09-07 01:35:03.602498+00:00,AUTO,,4791B,True,False,Guichet unique,vente en ligne de prêt a porter et création de...,1.0,vent lign pret port creation model uniqu
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
727,,,,,,,,,,,...,NaT,,,9609Z,,,,Autres services personnels n.c.a.,,autr servic personnel n.c.a.
728,,,,,,,,,,,...,NaT,,,9700Z,,,,Activités des ménages en tant qu'employeurs de...,,activit menag tant qu'employeur personnel domest
729,,,,,,,,,,,...,NaT,,,9810Z,,,,Activités indifférenciées des ménages en tant ...,,activit indifferencie menag tant producteur bi...
730,,,,,,,,,,,...,NaT,,,9820Z,,,,Activités indifférenciées des ménages en tant ...,,activit indifferencie menag tant producteur se...


In [5]:
encoder = LabelEncoder()
df[params["y_name"]] = encoder.fit_transform(df[params["y_name"]])
df, _ = clean_and_tokenize_df(df, text_feature=params["text_feature"])

# X without categorical variables
# X = df[[params["text_feature"], "EVT", "CJ", "NAT", "TYP", "CRT", "SRF"]].values
X = df[[params["text_feature"]]].values
y = df[params["y_name"]].values

  df.fillna("nan", inplace=True)


In [6]:
X_train, X_test, y_train, y_test = stratified_split_rare_labels(X, y)
assert set(range(len(naf2008["code"]))) == set(np.unique(y_train))


### TorchFasttext

In [None]:
model = torchFastText(
    num_tokens=params["buckets"],
    embedding_dim=params["buckets"],
    categorical_embedding_dims=None,
    min_count=params["minCount"],
    min_n=params["minn"],
    max_n=params["maxn"],
    len_word_ngrams=params["wordNgrams"],
    sparse = params["torch_sparse"]
)
model.build(X_train, y_train, lightning=True, lr = params["lr"])

In [None]:
model.train(
    X_train,
    y_train,
    X_test,
    y_test,
    num_epochs=params["max_epochs"],
    batch_size=params["torch_batch_size"],
    patience_scheduler=params["torch_patience_train"],
    patience_train=params["torch_patience_train"],
    lr=params["lr"],
    verbose = True
)
model.load_from_checkpoint(model.best_model_path)

In [None]:
pred, conf = model.predict(X_test, top_k=1)
accurary_torch = accuracy_score(y_test, pred.numpy())


### Fasttext meta

In [None]:
df_train = pd.concat([pd.DataFrame(X_train, columns=[params["text_feature"]]), pd.DataFrame(y_train, columns=[params["y_name"]])], axis=1)
df_test = pd.concat([pd.DataFrame(X_test, columns=[params["text_feature"]]), pd.DataFrame(y_test, columns=[params["y_name"]])], axis=1)

In [None]:
df_train

In [None]:

def write_training_data(
    df: pd.DataFrame,
    y: str,
    text_feature: str,
    categorical_features: Optional[List[str]],
    label_prefix: str = "__label__",
) -> str:
    """
    Write training data to file.

    Args:
        df (pd.DataFrame): DataFrame.
        y (str): Output variable name.
        text_feature (str): Text feature.
        categorical_features (Optional[List[str]]): Categorical features.
        label_prefix (str, optional): Label prefix. Defaults to "__label__".

    Returns:
        str: Training data path.
    """
    training_data_path = Path("data/training_data.txt")

    with open(training_data_path, "w", encoding="utf-8") as file:
        for _, item in df.iterrows():
            formatted_item = f"{label_prefix}{item[y]} {item[text_feature]}"
            if categorical_features != []:
                for feature in categorical_features:
                    formatted_item += f" {feature}_{item[feature]}"
            file.write(f"{formatted_item}\n")
    return training_data_path.as_posix()

# Write training data in a .txt file (fasttext-specific)
training_data_path = write_training_data(
    df=df_train,
    y=params["y_name"],
    text_feature=params["text_feature"],
    categorical_features=params["categorical_features"],
)

In [None]:
# Train the fasttext model

start_time = time.time()

model_ft = fasttext.train_supervised(
    input=training_data_path,
    dim=params["dim"],
    lr=params["lr"],
    epoch=params["max_epochs"],
    lrUpdateRate=params["ft_lrUpdateRate"],
    neg=params["ft_neg"],
    wordNgrams=params["wordNgrams"],
    minn=params["minn"],
    maxn=params["maxn"],
    minCount=params["minCount"],
    bucket=params["buckets"],
    thread=params["ft_thread"],
    loss=params["ft_loss"],
    label_prefix="__label__",
    verbose=2
)

end_time = time.time()
elapsed_time_ft = (end_time - start_time) / 60
print("Temps écoulé pour entrainer la lib fasttext : ", elapsed_time_ft, " minutes")

In [None]:
test_input = []
for _, item in df_test.iterrows():
    formatted_item = f"{"__label__"}{item[params["y_name"]]} {item[params["text_feature"]]}"
    test_input.append(formatted_item)

In [None]:
predictions = model_ft.predict(test_input, k=1)
predictions = [x[0].replace("__label__", "") for x in predictions[0]]
booleans = [
    prediction == str(label)
    for prediction, label in zip(predictions, df_test[params["y_name"]])
]
accuracy_ft = sum(booleans) / len(booleans)


## Results

In [None]:
print(f"Fasttext meta accuracy: {accuracy_ft}")
print(f"Torch accuracy: {accurary_torch:.4f}")