# Explain how it works

- introduction to `torch` framework
- expose `torchFasttext` specificities 

## Environment and data 

In [None]:
!pip install -r ../requirements.txt -q

In [7]:
import os
import sys
import time
import s3fs
from typing import List, Optional, Dict
from pathlib import Path
from utils_describe import get_data

sys.path.append("../")
from torchFastText import torchFastText
from torchFastText.preprocess import clean_text_feature
from torchFastText.datasets import NGramTokenizer
sys.path.append("./notebooks")

import numpy as np
# import pandas as pd
# import pyarrow.parquet as pq

# from utils import add_libelles, clean_and_tokenize_df, stratified_split_rare_labels

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
X_train, X_test, y_train, y_test = get_data()

2025-03-06 11:05:20 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-06 11:05:21 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-06 11:05:21 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-06 11:05:21 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-06 11:05:22 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-06 11:05:25 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].


	*** 732 codes have been added in the database...



  df.fillna("nan", inplace=True)


In [4]:
print("Features for the 3 first training obs:\n")
print(X_train[:3])
print("\n")
print("NAF codes (labels) for the 3 first training obs:\n")
print(y_train[:3])

Features for the 3 first training obs:

[['fabriqu carton ondul' 21 23 8 7 0 0]
 ['defens' 21 23 8 7 0 0]
 ['heberg medicalis adult handicap autr' 21 23 8 7 0 0]]


NAF codes (labels) for the 3 first training obs:

[129 644 677]


Parameters : 
- example with a unique embedding dimension for categorical variables

In [16]:
# Parameters for model building
NUM_TOKENS= int(1e5) # Number of rows in the embedding matrix = size of the embedded vocabulary
EMBED_DIM = 50 # Dimension of the embedding = number of columns in the embedding matrix
SPARSE = False # Whether to use sparse Embedding layer for fast computation (see PyTorch documentation)
CAT_EMBED_DIM = 10 # Dimension of the embedding for categorical features

# Parameters for tokenizer
MIN_COUNT = 3 # Minimum number of occurrences of a word in the corpus to be included in the vocabulary
MIN_N = 3 # Minimum length of char n-grams
MAX_N = 6 # Maximum length of char n-grams
LEN_WORD_NGRAMS = 3 # Length of word n-grams

# Parameters for training - not useful immediately
LR = 4e-3 # Learning rate
NUM_EPOCHS = 1
BATCH_SIZE = 256
PATIENCE = 3

## Building the model

Explore what's happen during the build step.
Hypothesis:
- we use the `torchFastText.build_from_tokenizer` method to build: the tokenizer must be created first.
- we don't use json method to get parameters


In [17]:
training_text = X_train[:, 0].tolist()
categorical_variables = X_train[:, 1:]

NUM_CAT_VAR = categorical_variables.shape[1]
CAT_VOCAB_SIZE = (np.max(categorical_variables, axis=0) + 1).astype(int).tolist()
NUM_CLASSES = len(np.unique(y_train))

print(f"NUM_CAT_VAR: {NUM_CAT_VAR}")
print(f"CAT_VOCAB_SIZE: {CAT_VOCAB_SIZE}")
print(f"NUM_CLASSES: {NUM_CLASSES}")

NUM_CAT_VAR: 6
CAT_VOCAB_SIZE: [22, 24, 9, 11, 3, 4]
NUM_CLASSES: 732


### The tokenizer

Lets describe the constructor of `NGramTokenizer` class

```python
tokenizer = NGramTokenizer(
    min_n=MIN_N, 
    max_n=MAX_N, 
    num_tokens= NUM_TOKENS,
    len_word_ngrams=LEN_WORD_NGRAMS, 
    min_count=MIN_COUNT, 
    training_text=training_text
    )


In [20]:
min_n=MIN_N
max_n=MAX_N
num_tokens= NUM_TOKENS
len_word_ngrams=LEN_WORD_NGRAMS
min_count=MIN_COUNT
training_text=training_text

if min_n < 2:
    raise ValueError("`min_n` parameter must be greater than 1.")
if max_n > 6:
    raise ValueError("`max_n` parameter must be smaller than 7.")

word_counts = {}
for sentence in training_text:
    for word in sentence.split(" "):
        word_counts[word] = word_counts.setdefault(word, 0) + 1


word_id_mapping = {}
i = 1
for word, counts in word_counts.items():
    if word_counts[word] >= min_count:
        word_id_mapping[word] = i
        i += 1
nwords = len(word_id_mapping)

print(f"word_counts: {word_counts}")
print(f"word_id_mapping: {word_id_mapping}")
print(f"nwords: {nwords}")

word_counts: {'fabriqu': 216, 'carton': 5, 'ondul': 1, 'defens': 1, 'heberg': 24, 'medicalis': 4, 'adult': 8, 'handicap': 7, 'autr': 159, 'machin': 20, 'industr': 6, 'papi': 7, 'cultur': 31, 'riz': 1, 'repar': 44, 'meubl': 114, "d'equip": 30, 'foi': 2, 'commerc': 151, 'gros': 65, '(commerc': 44, 'interentreprises)': 44, 'cuir': 8, 'peau': 1, 'organis': 34, 'jeux': 9, 'hasard': 1, "d'argent": 1, 'commun': 28, 'construct': 52, 'reseau': 30, 'fluid': 1, 'transport': 56, 'ferroviair': 3, 'fret': 9, 'materiel': 39, 'distribu': 20, 'command': 4, 'electr': 36, 'edit': 20, 'video': 10, "d'instrument": 1, 'scientif': 5, 'techniqu': 28, 'activit': 202, 'professionnel': 135, 'prepar': 23, 'fibr': 12, 'textil': 18, 'filatur': 1, 'cafe,': 3, 'the,': 2, 'cacao': 1, 'epic': 3, 'tubes,': 1, 'tuyaux,': 1, 'profil': 3, 'creux': 2, 'accessoir': 31, 'correspond': 4, 'aci': 2, 'jus': 2, 'fruit': 14, 'legum': 6, 'volaill': 3, 'gibi': 1, 'pat': 2, 'produit': 130, 'chimiqu': 6, 'imprimer': 2, '(labeur)': 1, '

In [22]:
tokenizer = NGramTokenizer(
    min_n=MIN_N, 
    max_n=MAX_N, 
    num_tokens= NUM_TOKENS,
    len_word_ngrams=LEN_WORD_NGRAMS, 
    min_count=MIN_COUNT, 
    training_text=training_text
    )
tokenizer

<torchFastText.datasets.tokenizer.NGramTokenizer at 0x7fad9b0d3b90>

In [41]:
indices, id_to_token, all_tokens_id = tokenizer.indices_matrix("carrelage carré")
print("==== indices\n")
print(indices)
print("\n==== indices length\n")
print(len(indices))
print("\n==== id_to_token\n")
print(id_to_token)
print("\n==== id_to_token length\n")
print(len(id_to_token))
print("\n==== all_tokens_id\n")
print(all_tokens_id)
print("\n==== all_tokens_id length\n")
print(len(all_tokens_id))

==== indices

tensor([17781, 34827, 45506, 33418, 87582, 60521, 64421, 77222, 18351, 24293,
        67859, 41713, 58190, 91385, 75698, 24004,  5488, 86293, 38766,  6075,
        11945, 42818, 29025,  7542, 40020, 31010, 63130, 77394, 77489, 65269,
        17781, 34827, 45506, 67374, 27304, 24293, 67859, 65741, 74712, 86293,
        26794, 75377, 71248, 13540,     0, 37371, 93475, 65582])

==== indices length

48

==== id_to_token

{17781: '<ca', 34827: 'car', 45506: 'arr', 33418: 'rre', 87582: 'rel', 60521: 'ela', 64421: 'lag', 77222: 'age', 18351: 'ge>', 24293: '<car', 67859: 'carr', 41713: 'arre', 58190: 'rrel', 91385: 'rela', 75698: 'elag', 24004: 'lage', 5488: 'age>', 86293: '<carr', 38766: 'carre', 6075: 'arrel', 11945: 'rrela', 42818: 'relag', 29025: 'elage', 7542: 'lage>', 40020: '<carre', 31010: 'carrel', 63130: 'arrela', 77394: 'rrelag', 77489: 'relage', 65269: 'elage>', 67374: 'rré', 27304: 'ré>', 65741: 'arré', 74712: 'rré>', 26794: 'carré', 75377: 'arré>', 71248: '<carré', 

In [52]:
tokenized_text_tokens, tokenized_text, id_to_token_dicts, token_to_id_dicts = tokenizer.tokenize(text=["aa", "bbb aa"], text_tokens=True, preprocess=False)
print('tokenized_text_tokens:')
print(tokenized_text_tokens)
print('\ntokenized_text:')
print(tokenized_text)
print('\nid_to_token_dicts:')
print(id_to_token_dicts)
print('\ntoken_to_id_dicts:')
print(token_to_id_dicts)

print(len(tokenized_text_tokens[0]))
print(len(tokenized_text[0]))
print(len(id_to_token_dicts[0]))
print(len(token_to_id_dicts[0]))

tokenized_text_tokens:
[['<aa', 'aa>', '<aa>', '</s>', 'aa </s>'], ['<bb', 'bbb', 'bb>', '<bbb', 'bbb>', '<bbb>', '<aa', 'aa>', '<aa>', '</s>', 'bbb aa', 'aa </s>', 'bbb aa </s>']]

tokenized_text:
[tensor([43399, 82997, 54531,     0, 14832]), tensor([81541, 53519, 38283, 98789, 95499, 46537, 43399, 82997, 54531,     0,
        32296, 14832, 72845])]

id_to_token_dicts:
[{43399: '<aa', 82997: 'aa>', 54531: '<aa>', 0: '</s>', 14832: 'aa </s>'}, {81541: '<bb', 53519: 'bbb', 38283: 'bb>', 98789: '<bbb', 95499: 'bbb>', 46537: '<bbb>', 43399: '<aa', 82997: 'aa>', 54531: '<aa>', 0: '</s>', 32296: 'bbb aa', 14832: 'aa </s>', 72845: 'bbb aa </s>'}]

token_to_id_dicts:
[{'<aa': 43399, 'aa>': 82997, '<aa>': 54531, '</s>': 0, 'aa </s>': 14832}, {'<bb': 81541, 'bbb': 53519, 'bb>': 38283, '<bbb': 98789, 'bbb>': 95499, '<bbb>': 46537, '<aa': 43399, 'aa>': 82997, '<aa>': 54531, '</s>': 0, 'bbb aa': 32296, 'aa </s>': 14832, 'bbb aa </s>': 72845}]
5
5
5
5


In [34]:
text = ["aa", "bbb"]
text_tokens=True
preprocess=False

tokenized_text = []
id_to_token_dicts = []
token_to_id_dicts = []

sentence = text[0]
all_ind, id_to_token, token_to_id = tokenizer.indices_matrix(sentence)
tokenized_text.append(all_ind)
id_to_token_dicts.append(id_to_token)
token_to_id_dicts.append(token_to_id)

sentence = text[1]
all_ind, id_to_token, token_to_id = tokenizer.indices_matrix(sentence)
tokenized_text.append(all_ind)
id_to_token_dicts.append(id_to_token)
token_to_id_dicts.append(token_to_id)


In [35]:
print(tokenized_text)
print(id_to_token_dicts)
print(token_to_id_dicts)

[tensor([43399, 82997, 54531,     0, 14832]), tensor([81541, 53519, 38283, 98789, 95499, 46537,     0, 33450])]
[{43399: '<aa', 82997: 'aa>', 54531: '<aa>', 0: '</s>', 14832: 'aa </s>'}, {81541: '<bb', 53519: 'bbb', 38283: 'bb>', 98789: '<bbb', 95499: 'bbb>', 46537: '<bbb>', 0: '</s>', 33450: 'bbb </s>'}]
[{'<aa': 43399, 'aa>': 82997, '<aa>': 54531, '</s>': 0, 'aa </s>': 14832}, {'<bb': 81541, 'bbb': 53519, 'bb>': 38283, '<bbb': 98789, 'bbb>': 95499, '<bbb>': 46537, '</s>': 0, 'bbb </s>': 33450}]


In [39]:
padding_index=2009603
end_of_string_index=0
i=0
tokenized_sentence = tokenized_text[0]
print(tokenized_sentence)
token_id = tokenized_sentence[0]
token_id.item()
id_to_token_dicts[i][token_id.item()]

tensor([43399, 82997, 54531,     0, 14832])


'<aa'

In [None]:

 [
            id_to_token_dicts[i][token_id.item()]
            for token_id in tokenized_sentence
            if token_id.item() not in {padding_index}
        ]
        for i, tokenized_sentence in enumerate(tokenized_text)


In [None]:

for sentence in text:
    all_ind, id_to_token, token_to_id = self.indices_matrix(
        sentence
    )  # tokenize and convert to token indices
    tokenized_text.append(all_ind)
    id_to_token_dicts.append(id_to_token)
    token_to_id_dicts.append(token_to_id)

if text_tokens:
    tokenized_text_tokens = tokenized_text_in_tokens(tokenized_text, id_to_token_dicts)
    return tokenized_text_tokens, tokenized_text, id_to_token_dicts, token_to_id_dicts
else:
    return tokenized_text, id_to_token_dicts, token_to_id_dicts

In [None]:

def tokenized_text_in_tokens(
    tokenized_text, id_to_token_dicts, padding_index=2009603, end_of_string_index=0
):
    return [
        [
            id_to_token_dicts[i][token_id.item()]
            for token_id in tokenized_sentence
            if token_id.item() not in {padding_index}
        ]
        for i, tokenized_sentence in enumerate(tokenized_text)
    ]

In [42]:
all_tokens_id = {'bonjour': 1, 'monde': 2, '!': 1}

# Inversion du dictionnaire : IDs -> tokens
id_to_token = {v: k for k, v in all_tokens_id.items()}

print(id_to_token)

{1: '!', 2: 'monde'}
