# Explain how it works

- introduction to `torch` framework
- expose `torchFastText` specificities

## Environment and data 

In [1]:
!pip install -r ../requirements.txt -q

In [2]:
import os
import sys
import time
import s3fs
from typing import List, Optional, Dict
from pathlib import Path
from utils_describe import get_data

sys.path.append("../")
from torchFastText import torchFastText
from torchFastText.preprocess import clean_text_feature
from torchFastText.datasets import NGramTokenizer
sys.path.append("./notebooks")

import numpy as np
# import pandas as pd
# import pyarrow.parquet as pq

# from utils import add_libelles, clean_and_tokenize_df, stratified_split_rare_labels

%load_ext autoreload
%autoreload 2

In [3]:
# Load the train/test features and labels through the `get_data` helper imported from `utils_describe`.
X_train, X_test, y_train, y_test = get_data()

2025-03-14 08:48:14 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-14 08:48:14 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-14 08:48:14 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-14 08:48:14 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-14 08:48:15 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-14 08:48:17 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].


	*** 732 codes have been added in the database...



  df.fillna("nan", inplace=True)


In [4]:
# Keep the first three observations aside so the prints below only show a small preview.
features_preview = X_train[:3]
labels_preview = y_train[:3]

print("Features for the 3 first training obs:\n")
print(features_preview)
print("\n")
print("NAF codes (labels) for the 3 first training obs:\n")
print(labels_preview)

Features for the 3 first training obs:

[["fabriqu d'emballag matier plastiqu" 21 22 7 7 0 0]
 ['activit organis polit' 21 22 7 7 0 0]
 ["fabriqu moteur turbines, l'except d'avion vehicul" 21 22 7 7 0 0]]


NAF codes (labels) for the 3 first training obs:

[165 711 251]


Parameters : 
- example with a unique embedding dimension for categorical variables

In [5]:
# --- Model-building parameters ---
NUM_TOKENS = 100_000  # Token budget given to the tokenizer (`num_tokens`); reused as `bucket` further down
EMBED_DIM = 50  # Embedding dimension = number of columns of the embedding matrix
SPARSE = False  # Whether to use a sparse Embedding layer (see the PyTorch documentation)
CAT_EMBED_DIM = 10  # Embedding dimension used for the categorical features

# --- Tokenizer parameters ---
MIN_COUNT = 3  # A word must occur at least this many times in the corpus to enter the vocabulary
MIN_N = 2  # Shortest character n-gram
MAX_N = 4  # Longest character n-gram
LEN_WORD_NGRAMS = 3  # Length of word n-grams

# --- Training parameters (not needed right away) ---
LR = 0.004  # Learning rate
NUM_EPOCHS = 1
BATCH_SIZE = 256
PATIENCE = 3

## Building the model

Explore what happens during the build step.
Hypothesis:
- we use the `torchFastText.build_from_tokenizer` method to build: the tokenizer must be created first.
- we don't use json method to get parameters


In [6]:
# The first column of X_train is the text; the remaining columns are the categorical variables.
training_text = X_train[:, 0].tolist()
categorical_variables = X_train[:, 1:]

# One vocabulary size per categorical column: largest observed value + 1.
max_value_per_variable = np.max(categorical_variables, axis=0)
NUM_CAT_VAR = categorical_variables.shape[1]
CAT_VOCAB_SIZE = (max_value_per_variable + 1).astype(int).tolist()
NUM_CLASSES = len(np.unique(y_train))  # number of distinct labels in the training set

print(f"NUM_CAT_VAR: {NUM_CAT_VAR}")
print(f"CAT_VOCAB_SIZE: {CAT_VOCAB_SIZE}")
print(f"NUM_CLASSES: {NUM_CLASSES}")

NUM_CAT_VAR: 6
CAT_VOCAB_SIZE: [22, 23, 8, 12, 3, 4]
NUM_CLASSES: 732


### The tokenizer

Let's describe the constructor of the `NGramTokenizer` class:

```python
tokenizer = NGramTokenizer(
    min_n=MIN_N, 
    max_n=MAX_N, 
    num_tokens= NUM_TOKENS,
    len_word_ngrams=LEN_WORD_NGRAMS, 
    min_count=MIN_COUNT, 
    training_text=training_text
    )
```


Constructor steps : 
- Checks the `min_n` and `max_n` params
- Counts the number of occurrences of each word in the whole training text ==> `word_counts` 
- Creates a mapping of all words seen at least `min_count` times in `training_text` ==> `word_id_mapping`
- Counts the number of unique words among those previously selected ==> `nwords`


In [7]:
# Replay by hand the constructor steps listed above, using the notebook-level parameters.
min_n = MIN_N
max_n = MAX_N
num_tokens = NUM_TOKENS
len_word_ngrams = LEN_WORD_NGRAMS
min_count = MIN_COUNT
# `training_text` is reused exactly as defined in an earlier cell.

# Step 1: validate the character n-gram bounds.
if min_n < 2:
    raise ValueError("`min_n` parameter must be greater than 1.")
if max_n > 6:
    raise ValueError("`max_n` parameter must be smaller than 7.")

# Step 2: count the occurrences of every space-separated word of the training text.
word_counts = {}
for sentence in training_text:
    for word in sentence.split(" "):
        word_counts[word] = word_counts.get(word, 0) + 1

# Step 3: give a 1-based id, in first-seen order, to each word seen at least `min_count` times.
frequent_words = [word for word, count in word_counts.items() if count >= min_count]
word_id_mapping = {word: word_id for word_id, word in enumerate(frequent_words, start=1)}
nwords = len(word_id_mapping)

print(f"word_counts: {word_counts}")
print(f"word_id_mapping: {word_id_mapping}")
print(f"nwords: {nwords}")

word_counts: {'fabriqu': 223, "d'emballag": 5, 'matier': 22, 'plastiqu': 12, 'activit': 206, 'organis': 33, 'polit': 4, 'moteur': 3, 'turbines,': 1, "l'except": 9, "d'avion": 2, 'vehicul': 72, 'commerc': 150, 'detail': 55, 'tapis,': 1, 'moquet': 2, 'revet': 11, 'mur': 5, 'sol': 11, 'magasin': 27, 'specialis': 39, "d'articl": 33, 'chauss': 1, 'maill': 3, 'meuner': 1, 'ennobl': 1, 'textil': 11, 'manutent': 4, 'portuair': 2, 'servic': 149, 'funerair': 2, 'extract': 13, 'pierr': 4, 'ornemental': 2, 'construction,': 32, 'calcair': 1, 'industriel,': 1, 'gypse,': 1, 'crai': 1, "d'ardois": 1, 'blanchisserie-teinturer': 2, 'gros': 62, '(commerc': 47, 'interentreprises)': 47, 'produit': 160, 'chimiqu': 6, "d'autr": 68, 'equip': 25, 'transport': 58, 'n.c.a.': 21, 'couteller': 1, 'trefilag': 1, 'froid': 4, 'articl': 20, 'robinetter': 1, 'fibr': 10, 'verr': 8, 'construct': 43, 'navir': 2, 'structur': 11, 'flott': 1, 'tissag': 1, 'trait': 17, 'elimin': 2, 'dechet': 8, 'danger': 4, 'refractair': 1, "

In [8]:
# Collect the constructor arguments first, then build the tokenizer from them.
tokenizer_params = dict(
    min_n=MIN_N,
    max_n=MAX_N,
    num_tokens=NUM_TOKENS,
    len_word_ngrams=LEN_WORD_NGRAMS,
    min_count=MIN_COUNT,
    training_text=training_text,
)
tokenizer = NGramTokenizer(**tokenizer_params)
tokenizer  # last expression of the cell: displays the object's repr

<torchFastText.datasets.tokenizer.NGramTokenizer at 0x7fb6cb422210>

Some internal functions : 

In [9]:
# List the character n-grams of length 3 of the word "charcutier".
tokenizer.get_ngram_list("charcutier", n=3)

['cha', 'har', 'arc', 'rcu', 'cut', 'uti', 'tie', 'ier']

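`get_hash` maps a token to an integer hash: the same token always yields the same hash, as the repeated call below shows (different tokens may still collide)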

In [10]:
# "uti" is hashed twice on purpose: both calls print the same value.
for token in ("uti", "uti", "data", "scientist"):
    print(tokenizer.get_hash(token))

865323743
865323743
3631407781
2349978481


For a word n-gram like "data scientist", `tokenizer.get_word_ngram_id` combines the hashes of its words (from `tokenizer.get_hash`) into the position (= index) of this word n-gram in the embedding matrix.

This index is between (nwords) and (nwords + bucket - 1).

This means that the indices of the embedding matrix are:

- [0, nwords-1] for vocabulary words.
- [nwords, nwords + bucket - 1] for n-grams (word or subword ngrams).

**Also note that the number of rows in the embedding matrix is: nwords + num_tokens** 

In [12]:
# Hash each word of the word n-gram "data scientist" separately.
hashes = (tokenizer.get_hash("data"), tokenizer.get_hash("scientist"))
print(num_tokens)
# NOTE(review): `nwords=4000` is a hard-coded value, not the `nwords` computed earlier in this notebook — confirm it is only illustrative.
tokenizer.get_word_ngram_id(hashes=hashes, bucket=num_tokens, nwords=4000)

100000


51736

`get_subword_index` : Get the index of a subword ngram in the embedding matrix

In [13]:
# Look up the index of the subword n-gram "cha".
tokenizer.get_subword_index(subword="cha")

91612

`get_word_index`: get the index of a word (directly from `word_id_mapping`)

In [14]:
# Look up the index of the word "fil".
tokenizer.get_word_index(word="fil")

193

`get_subwords`: returns all subword n-grams of a word, together with their indices.

It also returns the entire word and its index if this word is in the selected word vocabulary.

The tags "<" and ">" are added around the word before the n-grams are extracted.

In [15]:
# Returns a pair: the list of subword n-grams of the word and the list of their indices.
tokenizer.get_subwords(word="manufacturier")

(['<m',
  'ma',
  'an',
  'nu',
  'uf',
  'fa',
  'ac',
  'ct',
  'tu',
  'ur',
  'ri',
  'ie',
  'er',
  'r>',
  '<ma',
  'man',
  'anu',
  'nuf',
  'ufa',
  'fac',
  'act',
  'ctu',
  'tur',
  'uri',
  'rie',
  'ier',
  'er>',
  '<man',
  'manu',
  'anuf',
  'nufa',
  'ufac',
  'fact',
  'actu',
  'ctur',
  'turi',
  'urie',
  'rier',
  'ier>'],
 [99297,
  46586,
  41541,
  17707,
  75873,
  11111,
  50588,
  11921,
  13921,
  23493,
  62515,
  22898,
  60469,
  75376,
  92152,
  74574,
  15592,
  75709,
  50552,
  12912,
  43210,
  29732,
  9455,
  41052,
  15714,
  25874,
  43863,
  11508,
  17805,
  29500,
  45292,
  64653,
  92446,
  16281,
  51604,
  78330,
  6791,
  76098,
  89142])

`indices_matrix`

Tokenizes a sentence and returns: 
- indices: a tensor of indices
- id_to_token: a dict of id: token (**/!\ problem if 2 different tokens have the same id**)
- all_tokens_id: a dict of token: id 

Adds an "end of string" tag: `</s>` with index = 0.

Used in `collate_fn` (`FastTextModelDataset`).


In [16]:
# Tokenize one sentence, then print each of the three returned objects followed by its length.
indices, id_to_token, all_tokens_id = tokenizer.indices_matrix("fil carrelage materiel")
print("==== indices\n")
print(indices)
print("\n==== indices length\n")
print(len(indices))
print("\n==== id_to_token\n")
print(id_to_token)
print("\n==== id_to_token length\n")
print(len(id_to_token))
print("\n==== all_tokens_id\n")
print(all_tokens_id)
print("\n==== all_tokens_id length\n")
print(len(all_tokens_id))

==== indices

tensor([  193, 42630, 90159, 66231,  9710, 66889,  2513, 47697,  9789, 54535,
        20968, 14873, 32516, 31087, 94755, 23517, 40112, 59256, 59513, 17770,
        34816, 45495, 33407, 87571, 60510, 64410, 77211, 18340, 24282, 67848,
        41702, 58179, 91374, 75687, 23993,  5477,    47, 99297, 46586, 49159,
        72017, 60469, 62515, 22898, 94755,  9710, 92152,  8860, 53878, 49599,
        61484, 15714, 54444, 58125, 35318, 63495,  3518,  7850, 19543, 20860,
        49896,     0, 43966, 66836,  4676, 69640, 25167])

==== indices length

67

==== id_to_token

{193: 'fil', 42630: '<f', 90159: 'fi', 66231: 'il', 9710: 'l>', 66889: '<fi', 2513: 'il>', 47697: '<fil', 9789: 'fil>', 54535: '<c', 20968: 'ca', 14873: 'ar', 32516: 'rr', 31087: 're', 94755: 'el', 23517: 'la', 40112: 'ag', 59256: 'ge', 59513: 'e>', 17770: '<ca', 34816: 'car', 45495: 'arr', 33407: 'rre', 87571: 'rel', 60510: 'ela', 64410: 'lag', 77211: 'age', 18340: 'ge>', 24282: '<car', 67848: 'carr', 41702: 'ar

`tokenize`: applies `indices_matrix` to every sentence of a full training text (= a list of sentences) and collects the results.

/!\ problem with param `text_tokens=True`: only used for explainability.

In [18]:
# Tokenize two short sentences, without preprocessing and without asking for the text tokens.
demo_sentences = ["aa fil", "bbb aa"]
tokenized_text, id_to_token_dicts, token_to_id_dicts = tokenizer.tokenize(
    text=demo_sentences,
    text_tokens=False,
    preprocess=False,
)

print('\ntokenized_text:')
print(tokenized_text)
print('\nid_to_token_dicts:')
print(id_to_token_dicts)
print('\ntoken_to_id_dicts:')
print(token_to_id_dicts)

# Sizes of the three results for the first sentence.
print(len(tokenized_text[0]))
print(len(id_to_token_dicts[0]))
print(len(token_to_id_dicts[0]))


tokenized_text:
[tensor([ 9773, 15829, 43388, 82986,   193, 42630, 90159, 66231,  9710, 66889,
         2513, 47697,  9789,     0, 24480, 15060, 58643]), tensor([32154, 27588, 27588, 47584, 81530, 38272, 98778, 95488,  9773, 15829,
        43388, 82986,     0, 32285, 14821, 72834])]

id_to_token_dicts:
[{9773: '<a', 15829: 'a>', 43388: '<aa', 82986: 'aa>', 193: 'fil', 42630: '<f', 90159: 'fi', 66231: 'il', 9710: 'l>', 66889: '<fi', 2513: 'il>', 47697: '<fil', 9789: 'fil>', 0: '</s>', 24480: 'aa fil', 15060: 'fil </s>', 58643: 'aa fil </s>'}, {32154: '<b', 27588: 'bb', 47584: 'b>', 81530: '<bb', 38272: 'bb>', 98778: '<bbb', 95488: 'bbb>', 9773: '<a', 15829: 'a>', 43388: '<aa', 82986: 'aa>', 0: '</s>', 32285: 'bbb aa', 14821: 'aa </s>', 72834: 'bbb aa </s>'}]

token_to_id_dicts:
[{'<a': 9773, 'a>': 15829, '<aa': 43388, 'aa>': 82986, 'fil': 193, '<f': 42630, 'fi': 90159, 'il': 66231, 'l>': 9710, '<fi': 66889, 'il>': 2513, '<fil': 47697, 'fil>': 9789, '</s>': 0, 'aa fil': 24480, 'fil </s>

In [None]:
text = ["aa", "bbb"]
text_tokens = True
preprocess = False

tokenized_text = []
id_to_token_dicts = []
token_to_id_dicts = []

# Run `indices_matrix` on each sentence in turn and collect the three results side by side.
for sentence in text:
    all_ind, id_to_token, token_to_id = tokenizer.indices_matrix(sentence)
    tokenized_text.append(all_ind)
    id_to_token_dicts.append(id_to_token)
    token_to_id_dicts.append(token_to_id)


In [None]:
# Show the three lists filled by the manual tokenization cell.
print(tokenized_text)
print(id_to_token_dicts)
print(token_to_id_dicts)

In [None]:
# Walk through one lookup: first token id of the first sentence -> its token.
padding_index = 2009603
end_of_string_index = 0
i = 0
tokenized_sentence = tokenized_text[i]
print(tokenized_sentence)
token_id = tokenized_sentence[0]
first_token_value = token_id.item()  # plain Python number, usable as a dict key
id_to_token_dicts[i][first_token_value]

In [None]:

 # For every sentence, map each token id back to its token, skipping the padding id.
[
    [
        id_to_token_dicts[i][token_id.item()]
        for token_id in tokenized_sentence
        if token_id.item() not in {padding_index}
    ]
    for i, tokenized_sentence in enumerate(tokenized_text)
]


In [None]:

# NOTE(review): this cell cannot run as-is — it refers to `self` and uses `return` at the
# top level of a cell. It presumably mirrors the body of the tokenizer's `tokenize` method
# and is kept here for reading only — confirm.
for sentence in text:
    all_ind, id_to_token, token_to_id = self.indices_matrix(
        sentence
    )  # tokenize and convert to token indices
    tokenized_text.append(all_ind)
    id_to_token_dicts.append(id_to_token)
    token_to_id_dicts.append(token_to_id)

# With `text_tokens` set, the token strings are returned in addition to the ids and the mappings.
if text_tokens:
    tokenized_text_tokens = tokenized_text_in_tokens(tokenized_text, id_to_token_dicts)
    return tokenized_text_tokens, tokenized_text, id_to_token_dicts, token_to_id_dicts
else:
    return tokenized_text, id_to_token_dicts, token_to_id_dicts

In [None]:

def tokenized_text_in_tokens(
    tokenized_text, id_to_token_dicts, padding_index=2009603, end_of_string_index=0
):
    """Turn tokenized sentences (token ids) back into lists of tokens.

    Args:
        tokenized_text: sequence of sentences; each sentence is an iterable of
            elements exposing `.item()`, which gives the token id.
        id_to_token_dicts: one id -> token mapping per sentence, looked up by the
            position of the sentence in `tokenized_text`.
        padding_index: token id that is left out of the result.
        end_of_string_index: accepted but not used by this function.

    Returns:
        A list holding, for each sentence, the tokens of its non-padding ids.
    """
    sentences_as_tokens = []
    for sentence_pos, tokenized_sentence in enumerate(tokenized_text):
        kept_tokens = []
        for token_id in tokenized_sentence:
            id_value = token_id.item()
            if id_value == padding_index:
                continue  # padding carries no token
            kept_tokens.append(id_to_token_dicts[sentence_pos][id_value])
        sentences_as_tokens.append(kept_tokens)
    return sentences_as_tokens

In [None]:
# Toy token -> id mapping in which two tokens ('bonjour' and '!') share the same id.
all_tokens_id = {'bonjour': 1, 'monde': 2, '!': 1}

# Invert the dictionary: ids -> tokens. When an id is shared, only the last token seen is kept.
id_to_token = dict(zip(all_tokens_id.values(), all_tokens_id.keys()))

print(id_to_token)