## Init

In [None]:
import time
import shutil
import glob
from pathlib import Path
import sys
import s3fs
from typing import List, Optional, Dict
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
import pyarrow.parquet as pq
import fasttext
import os
import warnings
import pytorch_lightning as pl
from pytorch_lightning.loggers import CSVLogger, TensorBoardLogger
from tensorboard.backend.event_processing import event_accumulator
import torch
from torch import nn
from torch.optim import Adam, SGD
from pytorch_lightning.callbacks import (
    EarlyStopping,
    LearningRateMonitor,
    ModelCheckpoint,
)
import unidecode
# from src.model_negsamp import FastTextModule_negsamp, FastTextModel_negsamp
from src.model import FastTextModule, FastTextModel
from src.dataset import FastTextModelDataset
from src.tokenizer import NGramTokenizer
from src.preprocess import clean_text_feature
import warnings
import nltk
nltk.download('stopwords')
from tensorboard.backend.event_processing import event_accumulator
import matplotlib.pyplot as plt

# Show every column when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)
# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Seed all random number generators used by the notebook. torch must be seeded
# too, otherwise the nn.Embedding / nn.Linear initialisations differ between runs.
SEED = 0
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible to torch.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device available: {device}")

In [None]:
# Central configuration of the notebook. Keys prefixed with "ft_" / "torch_"
# are presumably consumed by the fastText and PyTorch training code respectively
# (not all of them are read in the cells visible here).
params = {
    # --- data -------------------------------------------------------------
    "y_name": "nace2025",                       # target column
    "text_feature": "libelle",                  # raw text column
    "text_feature_tweeked": "libelle_tweeked",  # text + categorical tokens column
    "df_sample_size": 100000,                   # rows sampled from the full dataset
    "max_epochs": 50,
    "train_proportion": 0.8,                    # share of rows kept for training
    # --- tokenizer / embedding ----------------------------------------------
    "buckets": 2000000,
    "dim": 180,                                 # embedding dimension
    "minCount": 1,
    "minn": 3,
    "maxn": 6,
    "wordNgrams": 3,
    # --- fastText -----------------------------------------------------------
    "ft_lr": 0.4,
    "ft_thread": 100,
    "ft_loss": "softmax",                       # alternatives noted by the author: "softmax", "ova"
    "ft_lrUpdateRate": 100,
    "ft_neg": 5,
    # --- PyTorch ------------------------------------------------------------
    "torch_lr": 0.4,
    "torch_batch_size": 256,
    "torch_patience_scheduler": 1,
    "torch_patience_EarlyStopping": 5,
    "torch_sparse": False,
    "torch_num_workers": 100,
    # --- extra categorical inputs (use [] to rely on the text only) ---------
    "categorical_features": ["activ_nat_et", "liasse_type"],
}


In [None]:
# Connect to the S3-compatible MinIO storage; credentials are read from the
# environment so that nothing secret is stored in the notebook.
fs = s3fs.S3FileSystem(
    client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"},
    key=os.environ["AWS_ACCESS_KEY_ID"],
    secret=os.environ["AWS_SECRET_ACCESS_KEY"],
    token=os.environ["AWS_SESSION_TOKEN"],
)

# Read the relabelled dataset into a pandas DataFrame.
df = (
    pq.ParquetDataset(
        "projet-ape/NAF-revision/relabeled-data/20241027_sirene4_nace2025.parquet",
        filesystem=fs,
    )
    .read_pandas()
    .to_pandas()
)

# Hoisting the column name keeps the f-strings free of nested double quotes,
# which are a SyntaxError on Python versions older than 3.12.
target_name = params["y_name"]
print(f"Nombre de valeurs vide : {(df[target_name] == '').sum()}")
print(f"Nombre de valeurs NA : {df[target_name].isna().sum()}")

# Rows without a label cannot be used for supervised training.
df = df.dropna(subset=[target_name])

# Cap the sample size at the number of available rows: DataFrame.sample raises
# a ValueError when asked for more rows than the frame holds (without replacement).
sample_size = min(params["df_sample_size"], len(df))
df = df.sample(sample_size, random_state=123)

# Keep only labels seen at least 3 times in the sample (presumably so that the
# stratified train/validation split performed later has enough rows per class).
counts = df[target_name].value_counts()
modalites_suffisantes = counts[counts >= 3].index
df = df[df[target_name].isin(modalites_suffisantes)]

print(f"Shape of sampled df after removal of rare outcomes : {len(df)}")

In [None]:
# Clean text feature
# Runs the project helper `clean_text_feature` on the column named by
# params["text_feature"]; the value it returns replaces `df`.
df = clean_text_feature(df, text_feature=params["text_feature"])



Ajout d'une variable textuelle de concaténation du libellé textuel et des variables catégorielles (astuce utilisée avec la lib fasttext dans les modèles en prod)

In [None]:
# Build the "tweaked" text column: the text followed by one
# " <feature>_<value>" token per categorical feature.
# Vectorised column arithmetic replaces the former row-by-row iterrows()/df.at
# loop: it is much faster on large samples and does not depend on the index
# labels being unique.
tweaked_text = df[params["text_feature"]]
for feature in params["categorical_features"]:
    tweaked_text = tweaked_text + f" {feature}_" + df[feature].astype(str)
df[params["text_feature_tweeked"]] = tweaked_text

# Eyeball a few generated strings.
df[params["text_feature_tweeked"]].sample(10)


In [None]:
# Encode outputs and categorical variables
# `encoder` stays bound to the *target* encoder so that predicted class ids can
# be mapped back to the original labels with encoder.inverse_transform(...).
encoder = LabelEncoder()
df[params["y_name"]] = encoder.fit_transform(df[params["y_name"]])

# One fitted encoder per categorical feature, kept by feature name instead of
# being overwritten at every loop iteration.
categorical_encoders = {}
for var_categ_name in params["categorical_features"]:
    categorical_encoders[var_categ_name] = LabelEncoder()
    df[var_categ_name] = categorical_encoders[var_categ_name].fit_transform(df[var_categ_name])


In [None]:
# Columns handed to the models: both text variants plus the categorical features.
feature_columns = [
    params["text_feature"],
    params["text_feature_tweeked"],
    *params["categorical_features"],
]
target = df[params["y_name"]]

# Shuffled split with a fixed seed, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    df[feature_columns],
    target,
    test_size=1 - params["train_proportion"],
    random_state=0,
    shuffle=True,
    stratify=target,
)

# Re-attach the target column to each split (features first, target last).
df_train, df_val = (
    pd.concat([features, labels], axis=1)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [None]:
# Number of distinct encoded labels present in the training split.
num_classes = y_train.nunique()
print(f"Nombre de classes dans y_train : {num_classes}")

## Préparation des inputs du modele

Texte d'origine et variables additionnelles proprement intégrés au modèle

In [None]:
# Raw training texts, as a plain Python list.
training_text = X_train[params["text_feature"]].to_list()

# The tokenizer is built from the training texts only; arguments are passed
# positionally in the order used by the original call.
tokenizer = NGramTokenizer(
    params["minCount"],
    params["minn"],
    params["maxn"],
    params["buckets"],
    params["wordNgrams"],
    training_text,
)

In [None]:
def _categorical_value_lists(frame):
    """Return one list of values per categorical feature, in `params` order."""
    return [frame[name].to_list() for name in params["categorical_features"]]


# Same batch size and worker count for both dataloaders.
loader_options = {
    "batch_size": params["torch_batch_size"],
    "num_workers": params["torch_num_workers"],
}

# Training and validation datasets share the tokenizer built on the training texts.
train_dataset = FastTextModelDataset(
    categorical_variables=_categorical_value_lists(X_train),
    texts=training_text,
    outputs=y_train.to_list(),
    tokenizer=tokenizer,
)
val_dataset = FastTextModelDataset(
    categorical_variables=_categorical_value_lists(X_val),
    texts=X_val[params["text_feature"]].to_list(),
    outputs=y_val.to_list(),
    tokenizer=tokenizer,
)

train_dataloader = train_dataset.create_dataloader(**loader_options)
val_dataloader = val_dataset.create_dataloader(**loader_options)


In [None]:
# Number of target classes, counted on the whole (encoded) dataset.
num_classes = df[params["y_name"]].nunique()

# Size of each categorical embedding table. The codes were produced by encoders
# fitted on the whole `df`, so the table must cover the largest code of the whole
# dataset: counting the unique values of X_train alone yields a table that is too
# small (index out of range in nn.Embedding) as soon as one code is missing from
# the training split.
categorical_vocabulary_sizes = [
    int(df[feature].max()) + 1 for feature in params["categorical_features"]
]
print(categorical_vocabulary_sizes)

## Test modele

In [None]:

embedding_dim = params["dim"]
# Rows of the token embedding matrix: hash buckets + tokenizer words + 1 extra
# row; that last row is the one used for padding.
vocab_size = params["buckets"] + tokenizer.get_nwords() + 1
padding_idx = vocab_size - 1
sparse = params["torch_sparse"]

# `num_classes` and `categorical_vocabulary_sizes` come from the previous cells.
print("Model Initialization Parameters:")
for label, value in (
    ("embedding_dim", embedding_dim),
    ("vocab_size", vocab_size),
    ("num_classes", num_classes),
    ("categorical_vocabulary_sizes", categorical_vocabulary_sizes),
    ("padding_idx", padding_idx),
    ("sparse", sparse),
):
    print(f"{label}: {value}")


In [None]:
# Stand-alone copies named after `self.<attr>` -- presumably mirroring attributes
# of the model class being walked through (TODO confirm against src.model).
self_num_classes = num_classes
self_padding_idx = padding_idx


In [None]:

# Token embedding table: one row of size `embedding_dim` per index, with the
# row at `padding_idx` reserved for padding.
self_embeddings = nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=embedding_dim,
    padding_idx=padding_idx,
    sparse=sparse,
)


In [None]:

# One embedding table per categorical feature, keyed "emb_<position>".
# The loop variable is deliberately NOT called `vocab_size`: that name holds the
# size of the token embedding matrix and must not be overwritten, otherwise
# re-running the token-embedding cell would silently build a tiny matrix.
self_categorical_embeddings = {}
for var_idx, cat_vocab_size in enumerate(categorical_vocabulary_sizes):
    self_categorical_embeddings[f"emb_{var_idx}"] = nn.Embedding(
        embedding_dim=embedding_dim, num_embeddings=cat_vocab_size
    )

# Final linear layer mapping an embedding vector to one score per class.
self_fc = nn.Linear(embedding_dim, num_classes)


Forward

In [None]:
# Pull a single batch from the training dataloader to trace the forward pass by hand.
inputs = next(iter(train_dataloader))

Les inputs : dans chaque batch, chaque résumé textuel est transformé en un vecteur de taille « nombre maximum de tokens parmi les documents du batch ».
Le vecteur d'un document contient les index des tokens du document dans la matrice d'embedding.
Ainsi, les index vont de 0 à num_embeddings - 1 (le dernier index étant celui du token de padding).
Généralement, le vecteur se termine par plusieurs tokens de padding (le faux token qui sert à faire en sorte que, dans un batch, tous les vecteurs représentant un résumé textuel aient la même taille).

In [None]:
# First element of the batch: the token-index tensor of the text feature.
x_1 = inputs[0]
print(x_1)
print(f"shape: {x_1.shape}")


Sélection, pour chaque résumé textuel, du vecteur d'embedding de chacun de ses tokens.

In [None]:
# Look up the embedding vector of every token index.
# NOTE(review): `x_1` is rebound here, so running this cell twice in a row feeds
# embeddings back into the lookup; re-run from the cell that sets x_1 = inputs[0].
x_1 = self_embeddings(x_1)
print(x_1)
print(f"shape: {x_1.shape}")

In [None]:
# Display the (name, embedding layer) pairs built for the categorical features.
self_categorical_embeddings.items()


Pour chaque variable additionnelle catégorielle, on extrait le vecteur d'embedding de la valeur prise.
Remarque : on utilise "i+1" car le premier élément de inputs est le tensor des résumés textuels.

In [None]:
# Embed each categorical input with its own layer. Position 0 of `inputs` is the
# text tensor, hence the enumeration starting at 1.
x_cat = [
    layer(inputs[position])
    for position, layer in enumerate(self_categorical_embeddings.values(), start=1)
]

In [None]:
# Number of embedded categorical inputs, and the shape of the first one.
print(len(x_cat))
# `x_cat` is empty when params["categorical_features"] is []; indexing it
# unconditionally would raise an IndexError in that configuration.
if x_cat:
    print(x_cat[0].shape)


In [None]:
# Sum of each token's embedding over the embedding dimension (last axis).
embedding_sums = x_1.sum(-1)
print(embedding_sums)
print(embedding_sums.shape)

On cherche à connaître, pour chaque document textuel d'un batch, le nombre de vrais tokens composant le document textuel.
Ainsi, on cherche à connaître le nombre de tokens non-padding pour chaque document textuel.
Les tokens padding ont pour vecteur d'embedding un vecteur de zéros.


In [None]:
# Boolean mask: True where a token's embedding does not sum to exactly zero,
# i.e. where the token is treated as a real (non-padding) token.
non_zero_tokens = x_1.sum(-1) != 0
print(non_zero_tokens)
print(non_zero_tokens.shape)

In [None]:
# Count the True entries along the last axis: number of real tokens per document.
# NOTE(review): the name is rebound from the mask to the counts, so this cell is
# not safe to run twice in a row.
non_zero_tokens = non_zero_tokens.sum(-1)
print(non_zero_tokens)
print(non_zero_tokens.shape)

Pour chaque document textuel, on somme les vecteurs d'embedding de l'ensemble de ses tokens, dimension par dimension. À ce stade, chaque document du batch est représenté par un vecteur de taille dim (180 par exemple). Ensuite, pour normaliser ces sommes, on divise chaque vecteur par le nombre de « vrais tokens » du document, ce qui revient à calculer la moyenne des embeddings. À noter qu'on remplace a posteriori les éventuelles valeurs NaN par des 0 : c'est le cas d'un document textuel ne contenant aucun token de la matrice d'embedding (division 0/0).

In [None]:
# Preview of the per-document sum of token embeddings (token axis collapsed).
summed_embeddings = x_1.sum(dim=-2)
print(summed_embeddings)
print(summed_embeddings.shape)

In [None]:
# Collapse the second-to-last axis (the tokens): one summed vector per document.
x_1 = x_1.sum(dim=-2)


In [None]:
# In-place division by the token counts; unsqueeze(-1) adds a trailing axis so
# that the division broadcasts over the embedding dimension.
# A document with a count of zero produces 0/0 = NaN at this point.
x_1 /= non_zero_tokens.unsqueeze(-1)
print(x_1)
print(x_1.shape)

In [None]:
# Replace NaN entries with 0. With torch.nan_to_num's defaults, +/-inf would be
# mapped to the largest/smallest finite value of the dtype, not to 0.
x_1 = torch.nan_to_num(x_1)

Pour chaque input du jeu de données, on fait la somme du vecteur d'embedding du document textuel et des vecteurs d'embedding de chacune des variables catégorielles.

In [None]:
# Element-wise sum of the categorical embeddings, computed once.
# torch.stack raises on an empty list, so the preview is skipped when there are
# no categorical features (params["categorical_features"] == []).
if x_cat:
    categorical_sum = torch.stack(x_cat, dim=0).sum(dim=0)
    print(categorical_sum)
    print(categorical_sum.shape)

In [None]:

# Add the summed categorical embeddings to the text representation; without any
# categorical feature the text representation is used as is.
x_in = x_1 + torch.stack(x_cat, dim=0).sum(dim=0) if x_cat else x_1

print(x_in)
print(x_in.shape)

Enfin, on applique la couche linéaire (créée plus haut) pour obtenir un score par classe.

In [None]:
# Apply the linear layer built earlier (embedding_dim -> num_classes).
z = self_fc(x_in)
print(z)
print(z.shape)

In [None]:
   def forward(self, inputs: List[torch.LongTensor]) -> torch.Tensor:
        """
        Compute the model output for one batch.

        Args:
            inputs (List[torch.LongTensor]): ``inputs[0]`` holds the token
                indices; ``inputs[1:]`` hold one index tensor per categorical
                embedding, in the iteration order of
                ``self.categorical_embeddings``.

        Returns:
            torch.Tensor: Output of ``self.fc`` applied to the averaged token
            embeddings plus the categorical embeddings (if any).
        """
        # Token embeddings for every position of every document
        token_vectors = self.embeddings(inputs[0])

        # One embedded tensor per categorical input (inputs[1], inputs[2], ...)
        categorical_vectors = [
            layer(inputs[position])
            for position, layer in enumerate(
                self.categorical_embeddings.values(), start=1
            )
        ]

        # Average over the tokens whose embedding does not sum to zero;
        # a document without any such token yields 0/0 = NaN, reset to 0.
        token_counts = (token_vectors.sum(-1) != 0).sum(-1)
        pooled = token_vectors.sum(dim=-2)
        pooled /= token_counts.unsqueeze(-1)
        pooled = torch.nan_to_num(pooled)

        # Add the categorical embeddings element-wise when there are some
        if categorical_vectors:
            pooled = pooled + torch.stack(categorical_vectors, dim=0).sum(dim=0)

        # Linear layer
        return self.fc(pooled)
