# Benchmark FastText vs PyTorch sur APE

## Environnement

In [None]:
from pathlib import Path
import sys
import s3fs
from typing import List, Optional, Dict
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import mlflow
import pyarrow.parquet as pq
from src.preprocess import clean_text_feature
import fasttext
import warnings
import nltk
nltk.download('stopwords')

In [None]:
# Reproducibility: seed both random number generators used in this notebook.
random.seed(0)
np.random.seed(0)

# Notebook ergonomics: silence every warning and show all DataFrame columns.
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

MLflow

In [None]:
# remote_server_uri = sys.argv[1]
# experiment_name = sys.argv[2]
# run_name = sys.argv[3]


Hyper-paramètres

In [None]:
# Dataset settings: target column, text column and number of rows sampled.
dataset_settings = {
    "y_name": "nace",
    "text_feature": "text",
    "df_sample_size": 1000,
}

# Number of training epochs and share of rows kept for training.
split_settings = {
    "max_epochs": 50,
    "train_proportion": 0.8,
}

# Values forwarded to fasttext.train_supervised in the FastText section.
fasttext_settings = {
    "lr": 0.2,
    "buckets": 2000000,
    "dim": 180,
    "minCount": 3,
    "minn": 3,
    "maxn": 4,
    "wordNgrams": 3,
    "ft_thread": 100,
    "ft_loss": "ova",
}

# Single flat dictionary read by the rest of the notebook (key order kept).
params = {**dataset_settings, **split_settings, **fasttext_settings}

Data

In [None]:
# Anonymous S3 client pointed at the MinIO endpoint hosting the data.
minio_endpoint = {"endpoint_url": "https://minio.lab.sspcloud.fr"}
fs = s3fs.S3FileSystem(anon=True, client_kwargs=minio_endpoint)

# Open the parquet dataset through that filesystem and load it into pandas.
firm_activity_dataset = pq.ParquetDataset(
    "projet-formation/diffusion/mlops/data/firm_activity_data.parquet",
    filesystem=fs,
)
df = firm_activity_dataset.read_pandas().to_pandas()


In [None]:
# Report empty-string and missing labels for the target column. The column
# name always comes from params so the report matches the column filtered
# below. The counts are computed outside the f-strings because reusing double
# quotes inside an f-string replacement field only parses on Python 3.12+.
y_name = params["y_name"]
n_empty = (df[y_name] == "").sum()
n_missing = df[y_name].isna().sum()
print(f"Nombre de valeurs vide : {n_empty}")
print(f"Nombre de valeurs NA : {n_missing}")

# Keep only the classes observed at least 5 times in the loaded dataset.
counts = df[y_name].value_counts()
modalites_suffisantes = counts[counts >= 5].index
df = df[df[y_name].isin(modalites_suffisantes)]

# Fixed-seed subsample, capped at the number of remaining rows because
# DataFrame.sample raises ValueError when n exceeds the population size.
# NOTE(review): the >= 5 filter runs before sampling, so a class can still
# end up with fewer than 5 rows in the sample -- confirm this is intended.
df = df.sample(min(params["df_sample_size"], len(df)), random_state=123)


In [None]:
# Clean the text column named in params, i.e. the same column that is later
# used for the train/validation split, rather than a hard-coded name.
df = clean_text_feature(df, text_feature=params["text_feature"])

# Replace the class labels with the integer codes produced by LabelEncoder.
encoder = LabelEncoder()
df[params["y_name"]] = encoder.fit_transform(df[params["y_name"]])

In [None]:
# Cell output: display the DataFrame after text cleaning and label encoding.
df


In [None]:
# Hold out a validation set with a shuffled, fixed-seed split.
validation_share = 1 - params["train_proportion"]
X_train, X_val, y_train, y_val = train_test_split(
    df[params["text_feature"]],
    df[params["y_name"]],
    test_size=validation_share,
    shuffle=True,
    random_state=0,
)

# Put text and label back side by side for each split.
df_train, df_val = (
    pd.concat([texts, labels], axis=1)
    for texts, labels in ((X_train, y_train), (X_val, y_val))
)

## Cas 1 : FastText 

In [None]:
def write_training_data(
    df: pd.DataFrame,
    y: str,
    text_feature: str,
    categorical_features: Optional[List[str]] = None,
    label_prefix: str = "__label__",
) -> str:
    """
    Write training data to a text file, one example per line.

    Each line has the form
    ``<label_prefix><label> <text> [<feature>_<value> ...]``.

    Args:
        df (pd.DataFrame): DataFrame holding the label, text and feature columns.
        y (str): Output variable name.
        text_feature (str): Text feature.
        categorical_features (Optional[List[str]]): Categorical features
            appended to each line as ``<feature>_<value>`` tokens. ``None``
            and ``[]`` both mean "no categorical feature". Defaults to None.
        label_prefix (str, optional): Label prefix. Defaults to "__label__".

    Returns:
        str: Training data path (POSIX style).
    """
    training_data_path = Path("data/training_data.txt")
    # The output directory may not exist yet (e.g. on a fresh checkout).
    training_data_path.parent.mkdir(parents=True, exist_ok=True)

    # Normalise None to an empty list so the loop below never iterates None.
    features = categorical_features or []

    with open(training_data_path, "w", encoding="utf-8") as file:
        for _, item in df.iterrows():
            tokens = [f"{label_prefix}{item[y]}", f"{item[text_feature]}"]
            tokens.extend(f"{feature}_{item[feature]}" for feature in features)
            file.write(" ".join(tokens) + "\n")
    return training_data_path.as_posix()

In [None]:
# Export the training split to the plain-text file consumed by fastText
# (no categorical features are appended).
label_column = params["y_name"]
text_column = params["text_feature"]
training_data_path = write_training_data(
    df_train, label_column, text_column, categorical_features=[]
)


In [None]:
# fastText argument name -> key holding its value in `params`.
param_key_by_fasttext_arg = {
    "dim": "dim",
    "lr": "lr",
    "epoch": "max_epochs",
    "wordNgrams": "wordNgrams",
    "minn": "minn",
    "maxn": "maxn",
    "minCount": "minCount",
    "bucket": "buckets",
    "thread": "ft_thread",
    "loss": "ft_loss",
}
fasttext_kwargs = {
    arg: params[key] for arg, key in param_key_by_fasttext_arg.items()
}

# Fit the supervised fastText classifier on the exported training file.
model = fasttext.train_supervised(
    input=training_data_path,
    label_prefix="__label__",
    verbose=2,
    **fasttext_kwargs,
)