# Explain how it works

- introduction to `torch` framework
- expose `torchFasttext` specificities 

## Environment and data 

In [1]:
!pip install -r ../requirements.txt -q

In [2]:
import os
import sys
import time
import s3fs
from typing import List, Optional, Dict
from pathlib import Path
from utils_describe import get_data

sys.path.append("../")
from torchFastText import torchFastText
from torchFastText.preprocess import clean_text_feature
from torchFastText.datasets import NGramTokenizer
sys.path.append("./notebooks")

import numpy as np
# import pandas as pd
# import pyarrow.parquet as pq

# from utils import add_libelles, clean_and_tokenize_df, stratified_split_rare_labels

%load_ext autoreload
%autoreload 2

In [3]:
X_train, X_test, y_train, y_test = get_data()

2025-03-10 16:42:43 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-10 16:42:43 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-10 16:42:44 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-10 16:42:45 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-10 16:42:46 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-10 16:42:50 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].


	*** 732 codes have been added in the database...



  df.fillna("nan", inplace=True)


In [4]:
print("Features for the 3 first training obs:\n")
print(X_train[:3])
print("\n")
print("NAF codes (labels) for the 3 first training obs:\n")
print(y_train[:3])

Features for the 3 first training obs:

[['affair etranger' 22 27 7 8 0 0]
 ['raffinag petrol' 22 27 7 8 0 0]
 ["fabriqu d'autr equip transport n.c.a." 22 27 7 8 0 0]]


NAF codes (labels) for the 3 first training obs:

[643 142 285]


Parameters : 
- example with a unique embedding dimension for categorical variables

In [5]:
# Parameters for model building
NUM_TOKENS= int(1e5) # Number of rows in the embedding matrix = size of the embedded vocabulary
EMBED_DIM = 50 # Dimension of the embedding = number of columns in the embedding matrix
SPARSE = False # Whether to use sparse Embedding layer for fast computation (see PyTorch documentation)
CAT_EMBED_DIM = 10 # Dimension of the embedding for categorical features

# Parameters for tokenizer
MIN_COUNT = 3 # Minimum number of occurrences of a word in the corpus to be included in the vocabulary
MIN_N = 2 # Minimum length of char n-grams
MAX_N = 4 # Maximum length of char n-grams
LEN_WORD_NGRAMS = 3 # Length of word n-grams

# Parameters for training - not useful immediately
LR = 4e-3 # Learning rate
NUM_EPOCHS = 1
BATCH_SIZE = 256
PATIENCE = 3

## Building the model

Explore what's happen during the build step.
Hypothesis:
- we use the `torchFastText.build_from_tokenizer` method to build: the tokenizer must be created first.
- we don't use json method to get parameters


In [6]:
training_text = X_train[:, 0].tolist()
categorical_variables = X_train[:, 1:]

NUM_CAT_VAR = categorical_variables.shape[1]
CAT_VOCAB_SIZE = (np.max(categorical_variables, axis=0) + 1).astype(int).tolist()
NUM_CLASSES = len(np.unique(y_train))

print(f"NUM_CAT_VAR: {NUM_CAT_VAR}")
print(f"CAT_VOCAB_SIZE: {CAT_VOCAB_SIZE}")
print(f"NUM_CLASSES: {NUM_CLASSES}")

NUM_CAT_VAR: 6
CAT_VOCAB_SIZE: [23, 28, 8, 12, 3, 4]
NUM_CLASSES: 732


### The tokenizer

Lets describe the constructor of `NGramTokenizer` class

```python
tokenizer = NGramTokenizer(
    min_n=MIN_N, 
    max_n=MAX_N, 
    num_tokens= NUM_TOKENS,
    len_word_ngrams=LEN_WORD_NGRAMS, 
    min_count=MIN_COUNT, 
    training_text=training_text
    )


Constructor steps : 
- Checks params min_n and max_n
- Creates a mapping of all unique words contained in `training_text` ==> `word_id_mapping`
- Counts the number of unique words in the all training text ==> `nwords`
- Counts the number of occurrences of each word in the all training text ==> `word_counts` 

In [7]:
min_n=MIN_N
max_n=MAX_N
num_tokens= NUM_TOKENS
len_word_ngrams=LEN_WORD_NGRAMS
min_count=MIN_COUNT
training_text=training_text

if min_n < 2:
    raise ValueError("`min_n` parameter must be greater than 1.")
if max_n > 6:
    raise ValueError("`max_n` parameter must be smaller than 7.")

word_counts = {}
for sentence in training_text:
    for word in sentence.split(" "):
        word_counts[word] = word_counts.setdefault(word, 0) + 1


word_id_mapping = {}
i = 1
for word, counts in word_counts.items():
    if word_counts[word] >= min_count:
        word_id_mapping[word] = i
        i += 1
nwords = len(word_id_mapping)

print(f"word_counts: {word_counts}")
print(f"word_id_mapping: {word_id_mapping}")
print(f"nwords: {nwords}")

word_counts: {'affair': 18, 'etranger': 4, 'raffinag': 1, 'petrol': 2, 'fabriqu': 220, "d'autr": 72, 'equip': 26, 'transport': 49, 'n.c.a.': 19, 'cafeteri': 1, 'autr': 149, 'libres-servic': 1, 'commerc': 149, 'gros': 57, '(commerc': 46, 'interentreprises)': 46, 'produit': 147, 'intermediair': 15, 'construct': 48, 'bateau': 4, 'plaisanc': 1, 'textil': 17, 'techniqu': 23, 'industriel': 21, 'cultur': 36, 'cann': 1, 'sucr': 2, 'machin': 25, 'specialise': 8, 'chimiqu': 6, "d'instrument": 1, 'scientif': 2, 'conduit': 7, 'fruit': 15, 'legum': 7, 'briques,': 1, 'tuil': 1, 'construction,': 32, 'terr': 4, 'cuit': 1, 'repar': 38, "d'articl": 25, "d'horloger": 3, 'bijouter': 5, 'inorgan': 1, 'bas': 16, 'prepar': 21, 'fibr': 11, 'filatur': 1, 'abras': 2, 'fil': 3, 'cabl': 3, 'electron': 18, 'electr': 29, 'beton': 4, 'pret': 5, "l'emploi": 2, 'materiel': 53, 'agricol': 14, 'distribu': 17, 'combustibl': 5, 'gazeux': 4, 'etirag': 1, 'froid': 6, 'barr': 1, 'tubes,': 1, 'tuyaux,': 1, 'profil': 2, 'creux

In [8]:
tokenizer = NGramTokenizer(
    min_n=MIN_N, 
    max_n=MAX_N, 
    num_tokens= NUM_TOKENS,
    len_word_ngrams=LEN_WORD_NGRAMS, 
    min_count=MIN_COUNT, 
    training_text=training_text
    )
tokenizer

<torchFastText.datasets.tokenizer.NGramTokenizer at 0x7fbd09f815b0>

Some internal functions : 

In [9]:
tokenizer.get_ngram_list("charcutier", n=3)

['cha', 'har', 'arc', 'rcu', 'cut', 'uti', 'tie', 'ier']

`get_hash` gives a unique hash (int) to a token

In [10]:
print(tokenizer.get_hash("uti")) 
print(tokenizer.get_hash("uti")) 
print(tokenizer.get_hash("data")) 
print(tokenizer.get_hash("scientist")) 

865323743
865323743
3631407781
2349978481


For a words ngram like "data scientist", `tokenizer.get_hash` give the position (=index) of this word ngram in the embedding matrix.

This index is between (nwords) and (nwords + bucket - 1).

This means that the indices of the embedding matrix are:

- [0, nwords-1] for vocabulary words.
- [nwords, nwords + bucket - 1] for n-grams (word or subword ngrams).

**Also note that the number of rows in the embedding matrix is : nword + num_tokens** 

In [11]:
hashes = (tokenizer.get_hash("data"), tokenizer.get_hash("scientist"))
tokenizer.get_word_ngram_id(hashes=hashes, bucket=num_tokens, nwords=4000)

51736

`get_subword_index` : Get the index of a subword ngram in the embedding matrix

In [None]:
tokenizer.get_subword_index(subword="cha")

73450

`get_word_index`: get the indice of a word (directly from `word_id_mapping`)

In [23]:
tokenizer.get_word_index(word="fil")

36

`get_subwords`: return all subword ngrams and indices from a word.

In all cases, also returns the entire word and its indice.

Tags "<" and ">" are added to the word

In [24]:
tokenizer.get_subwords(word="fil")

(['<f', 'fi', 'il', 'l>', '<fi', 'fil', 'il>', '<fil', 'fil>'],
 [42621, 90150, 66222, 9701, 66880, 73450, 2504, 47688, 9780])

`indices_matrix`

Tokenize a sentence and returns : 
- an tensor of indices

Adds and "end of string" tag: `</s>` with indice = 0.

Used in `collate_fn` (`FastTextModelDataset`).


In [29]:
indices, id_to_token, all_tokens_id = tokenizer.indices_matrix("fil carrelage materiel")
print("==== indices\n")
print(indices)
print("\n==== indices length\n")
print(len(indices))
print("\n==== id_to_token\n")
print(id_to_token)
print("\n==== id_to_token length\n")
print(len(id_to_token))
print("\n==== all_tokens_id\n")
print(all_tokens_id)
print("\n==== all_tokens_id length\n")
print(len(all_tokens_id))

==== indices

tensor([42621, 90150, 66222,  9701, 66880, 73450,  2504, 47688,  9780, 54526,
        20959, 14864, 32507, 31078, 94746, 23508, 40103, 59247, 59504, 17761,
        34807, 45486, 33398, 87562, 60501, 64401, 77202, 18331, 24273, 67839,
        41693, 58170, 91365, 75678, 23984,  5468,    42, 99288, 46577, 49150,
        72008, 60460, 62506, 22889, 94746,  9701, 92143,  8851, 53869, 49590,
        61475, 15705, 54435, 58116, 35309, 63486,  3509,  7841, 19534, 20851,
        49887,     0, 43957, 66827,  4667, 69631, 25158])

==== indices length

67

==== id_to_token

{42621: '<f', 90150: 'fi', 66222: 'il', 9701: 'l>', 66880: '<fi', 73450: 'fil', 2504: 'il>', 47688: '<fil', 9780: 'fil>', 54526: '<c', 20959: 'ca', 14864: 'ar', 32507: 'rr', 31078: 're', 94746: 'el', 23508: 'la', 40103: 'ag', 59247: 'ge', 59504: 'e>', 17761: '<ca', 34807: 'car', 45486: 'arr', 33398: 'rre', 87562: 'rel', 60501: 'ela', 64401: 'lag', 77202: 'age', 18331: 'ge>', 24273: '<car', 67839: 'carr', 41693: '

In [27]:
tokenizer.get_word_index("fil")
tokenizer.get_subwords("fil")

(['<f', 'fi', 'il', 'l>', '<fi', 'fil', 'il>', '<fil', 'fil>'],
 [42621, 90150, 66222, 9701, 66880, 73450, 2504, 47688, 9780])

In [28]:
tokenized_text_tokens, tokenized_text, id_to_token_dicts, token_to_id_dicts = tokenizer.tokenize(text=["aa fil", "bbb aa"], text_tokens=True, preprocess=False)
print('tokenized_text_tokens:')
print(tokenized_text_tokens)
print('\ntokenized_text:')
print(tokenized_text)
print('\nid_to_token_dicts:')
print(id_to_token_dicts)
print('\ntoken_to_id_dicts:')
print(token_to_id_dicts)

print(len(tokenized_text_tokens[0]))
print(len(tokenized_text[0]))
print(len(id_to_token_dicts[0]))
print(len(token_to_id_dicts[0]))

tokenized_text_tokens:
[['<a', 'aa', 'a>', '<aa', 'aa>', '<aa>', '<f', 'fi', 'il', 'l>', '<fi', 'fil', 'il>', '<fil', 'fil>', '</s>', 'aa fil', 'fil </s>', 'aa fil </s>'], ['<b', 'bb', 'bb', 'b>', '<bb', 'bbb', 'bb>', '<bbb', 'bbb>', '<a', 'aa', 'a>', '<aa', 'aa>', '<aa>', '</s>', 'bbb aa', 'aa </s>', 'bbb aa </s>']]

tokenized_text:
[tensor([ 9764, 95341, 15820, 43379, 82977, 54511, 42621, 90150, 66222,  9701,
        66880, 73450,  2504, 47688,  9780,     0, 24471, 15051, 58634]), tensor([32145, 27579, 27579, 47575, 81521, 53499, 38263, 98769, 95479,  9764,
        95341, 15820, 43379, 82977, 54511,     0, 32276, 14812, 72825])]

id_to_token_dicts:
[{9764: '<a', 95341: 'aa', 15820: 'a>', 43379: '<aa', 82977: 'aa>', 54511: '<aa>', 42621: '<f', 90150: 'fi', 66222: 'il', 9701: 'l>', 66880: '<fi', 73450: 'fil', 2504: 'il>', 47688: '<fil', 9780: 'fil>', 0: '</s>', 24471: 'aa fil', 15051: 'fil </s>', 58634: 'aa fil </s>'}, {32145: '<b', 27579: 'bb', 47575: 'b>', 81521: '<bb', 53499: 'bbb',

In [18]:
text = ["aa", "bbb"]
text_tokens=True
preprocess=False

tokenized_text = []
id_to_token_dicts = []
token_to_id_dicts = []

sentence = text[0]
all_ind, id_to_token, token_to_id = tokenizer.indices_matrix(sentence)
tokenized_text.append(all_ind)
id_to_token_dicts.append(id_to_token)
token_to_id_dicts.append(token_to_id)

sentence = text[1]
all_ind, id_to_token, token_to_id = tokenizer.indices_matrix(sentence)
tokenized_text.append(all_ind)
id_to_token_dicts.append(id_to_token)
token_to_id_dicts.append(token_to_id)


In [19]:
print(tokenized_text)
print(id_to_token_dicts)
print(token_to_id_dicts)

[tensor([ 9764, 95341, 15820, 43379, 82977, 54511,     0, 14812]), tensor([32145, 27579, 27579, 47575, 81521, 53499, 38263, 98769, 95479,     0,
        33430])]
[{9764: '<a', 95341: 'aa', 15820: 'a>', 43379: '<aa', 82977: 'aa>', 54511: '<aa>', 0: '</s>', 14812: 'aa </s>'}, {32145: '<b', 27579: 'bb', 47575: 'b>', 81521: '<bb', 53499: 'bbb', 38263: 'bb>', 98769: '<bbb', 95479: 'bbb>', 0: '</s>', 33430: 'bbb </s>'}]
[{'<a': 9764, 'aa': 95341, 'a>': 15820, '<aa': 43379, 'aa>': 82977, '<aa>': 54511, '</s>': 0, 'aa </s>': 14812}, {'<b': 32145, 'bb': 27579, 'b>': 47575, '<bb': 81521, 'bbb': 53499, 'bb>': 38263, '<bbb': 98769, 'bbb>': 95479, '</s>': 0, 'bbb </s>': 33430}]


In [20]:
padding_index=2009603
end_of_string_index=0
i=0
tokenized_sentence = tokenized_text[0]
print(tokenized_sentence)
token_id = tokenized_sentence[0]
token_id.item()
id_to_token_dicts[i][token_id.item()]

tensor([ 9764, 95341, 15820, 43379, 82977, 54511,     0, 14812])


'<a'

In [21]:

 [
            id_to_token_dicts[i][token_id.item()]
            for token_id in tokenized_sentence
            if token_id.item() not in {padding_index}
        ]
        for i, tokenized_sentence in enumerate(tokenized_text)


IndentationError: unexpected indent (1763970530.py, line 6)

In [None]:

for sentence in text:
    all_ind, id_to_token, token_to_id = self.indices_matrix(
        sentence
    )  # tokenize and convert to token indices
    tokenized_text.append(all_ind)
    id_to_token_dicts.append(id_to_token)
    token_to_id_dicts.append(token_to_id)

if text_tokens:
    tokenized_text_tokens = tokenized_text_in_tokens(tokenized_text, id_to_token_dicts)
    return tokenized_text_tokens, tokenized_text, id_to_token_dicts, token_to_id_dicts
else:
    return tokenized_text, id_to_token_dicts, token_to_id_dicts

In [None]:

def tokenized_text_in_tokens(
    tokenized_text, id_to_token_dicts, padding_index=2009603, end_of_string_index=0
):
    return [
        [
            id_to_token_dicts[i][token_id.item()]
            for token_id in tokenized_sentence
            if token_id.item() not in {padding_index}
        ]
        for i, tokenized_sentence in enumerate(tokenized_text)
    ]

In [None]:
all_tokens_id = {'bonjour': 1, 'monde': 2, '!': 1}

# Inversion du dictionnaire : IDs -> tokens
id_to_token = {v: k for k, v in all_tokens_id.items()}

print(id_to_token)