# Explain how it works

- introduction to `torch` framework
- expose `torchFasttext` specificities 

## Environment and data 

In [1]:
!pip install -r ../requirements.txt -q

In [3]:
import os
import sys
import time
import s3fs
from typing import List, Optional, Dict
from pathlib import Path
from utils_describe import get_data

sys.path.append("../")
from torchFastText import torchFastText
from torchFastText.preprocess import clean_text_feature
from torchFastText.datasets import NGramTokenizer
sys.path.append("./notebooks")

import numpy as np
# import pandas as pd
# import pyarrow.parquet as pq

# from utils import add_libelles, clean_and_tokenize_df, stratified_split_rare_labels

%load_ext autoreload
%autoreload 2

In [4]:
X_train, X_test, y_train, y_test = get_data()

2025-03-14 16:21:09 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-14 16:21:09 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-14 16:21:09 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-14 16:21:10 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-14 16:21:10 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-14 16:21:14 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].


	*** 732 codes have been added in the database...



  df.fillna("nan", inplace=True)


In [5]:
print("Features for the 3 first training obs:\n")
print(X_train[:3])
print("\n")
print("NAF codes (labels) for the 3 first training obs:\n")
print(y_train[:3])

Features for the 3 first training obs:

[["fabriqu d'huil graiss brut" 23 25 7 8 0 0]
 ['siderurg' 23 25 7 8 0 0]
 ["organis jeux hasard d'argent" 23 25 7 8 0 0]]


NAF codes (labels) for the 3 first training obs:

[ 63 193 700]


Parameters : 
- example with a unique embedding dimension for categorical variables

In [6]:
# Parameters for model building
NUM_TOKENS= int(1e5) # Number of rows in the embedding matrix = size of the embedded vocabulary
EMBED_DIM = 50 # Dimension of the embedding = number of columns in the embedding matrix
SPARSE = False # Whether to use sparse Embedding layer for fast computation (see PyTorch documentation)
CAT_EMBED_DIM = 10 # Dimension of the embedding for categorical features

# Parameters for tokenizer
MIN_COUNT = 3 # Minimum number of occurrences of a word in the corpus to be included in the vocabulary
MIN_N = 2 # Minimum length of char n-grams
MAX_N = 4 # Maximum length of char n-grams
LEN_WORD_NGRAMS = 3 # Length of word n-grams

# Parameters for training - not useful immediately
LR = 4e-3 # Learning rate
NUM_EPOCHS = 1
BATCH_SIZE = 256
PATIENCE = 3

## Building the model

Explore what's happen during the build step.
Hypothesis:
- we use the `torchFastText.build_from_tokenizer` method to build: the tokenizer must be created first.
- we don't use json method to get parameters


In [7]:
training_text = X_train[:, 0].tolist()
categorical_variables = X_train[:, 1:]

NUM_CAT_VAR = categorical_variables.shape[1]
CAT_VOCAB_SIZE = (np.max(categorical_variables, axis=0) + 1).astype(int).tolist()
NUM_CLASSES = len(np.unique(y_train))

print(f"NUM_CAT_VAR: {NUM_CAT_VAR}")
print(f"CAT_VOCAB_SIZE: {CAT_VOCAB_SIZE}")
print(f"NUM_CLASSES: {NUM_CLASSES}")

NUM_CAT_VAR: 6
CAT_VOCAB_SIZE: [24, 26, 8, 12, 3, 4]
NUM_CLASSES: 732


### The tokenizer

Lets describe the constructor of `NGramTokenizer` class

```python
tokenizer = NGramTokenizer(
    min_n=MIN_N, 
    max_n=MAX_N, 
    num_tokens= NUM_TOKENS,
    len_word_ngrams=LEN_WORD_NGRAMS, 
    min_count=MIN_COUNT, 
    training_text=training_text
    )


Constructor steps : 
- Checks params min_n and max_n
- Counts the number of occurrences of each word in the all training text ==> `word_counts` 
- Creates a mapping of all words seen at leat `min_count` times in `training_text` ==> `word_id_mapping`
- Counts the number of unique words among previous selected ==> `nwords`


In [8]:
min_n=MIN_N
max_n=MAX_N
num_tokens= NUM_TOKENS
len_word_ngrams=LEN_WORD_NGRAMS
min_count=MIN_COUNT
training_text=training_text

if min_n < 2:
    raise ValueError("`min_n` parameter must be greater than 1.")
if max_n > 6:
    raise ValueError("`max_n` parameter must be smaller than 7.")

word_counts = {}
for sentence in training_text:
    for word in sentence.split(" "):
        word_counts[word] = word_counts.setdefault(word, 0) + 1


word_id_mapping = {}
i = 1
for word, counts in word_counts.items():
    if word_counts[word] >= min_count:
        word_id_mapping[word] = i
        i += 1
nwords = len(word_id_mapping)

print(f"word_counts: {word_counts}")
print(f"word_id_mapping: {word_id_mapping}")
print(f"nwords: {nwords}")

word_counts: {'fabriqu': 224, "d'huil": 3, 'graiss': 3, 'brut': 2, 'siderurg': 1, 'organis': 27, 'jeux': 10, 'hasard': 1, "d'argent": 2, 'commerc': 157, 'detail': 65, "d'appareil": 12, 'electromenager': 5, 'magasin': 36, 'specialis': 35, "d'isol": 2, 'piec': 8, 'isol': 5, 'ceram': 7, 'locat': 250, 'location-bail': 10, 'machin': 23, 'equip': 19, 'agricol': 20, "d'equip": 30, 'commun': 19, 'radiateur': 1, 'chaudier': 2, 'chauffag': 5, 'central': 7, "d'articl": 40, 'sport': 9, 'cacao,': 1, 'chocolat': 2, 'produit': 150, 'confiser': 4, "d'enregistr": 3, 'musical': 10, 'video': 10, 'activit': 219, "d'ordr": 1, 'public': 18, 'securit': 10, 'frapp': 1, 'monnai': 1, 'gestion': 139, 'muse': 1, 'defens': 2, "d'achat": 11, 'non': 213, 'alimentair': 34, 'gaz': 4, 'industriel': 18, 'enseign': 23, 'secondair': 4, 'techniqu': 21, 'professionnel': 167, 'construct': 45, 'entretien': 36, 'tunnel': 1, 'transform': 15, 'the': 6, 'caf': 5, 'coll': 1, 'livr': 9, 'carbur': 2, 'margarin': 1, 'comestibl': 2, '

In [9]:
tokenizer = NGramTokenizer(
    min_n=MIN_N, 
    max_n=MAX_N, 
    num_tokens= NUM_TOKENS,
    len_word_ngrams=LEN_WORD_NGRAMS, 
    min_count=MIN_COUNT, 
    training_text=training_text
    )
tokenizer

<torchFastText.datasets.tokenizer.NGramTokenizer at 0x7fb35c5bf710>

Some internal functions : 

In [10]:
tokenizer.get_ngram_list("charcutier", n=3)

['cha', 'har', 'arc', 'rcu', 'cut', 'uti', 'tie', 'ier']

`get_hash` gives a unique hash (int) to a token

In [11]:
print(tokenizer.get_hash("uti")) 
print(tokenizer.get_hash("uti")) 
print(tokenizer.get_hash("data")) 
print(tokenizer.get_hash("scientist")) 

865323743
865323743
3631407781
2349978481


For a words ngram like "data scientist", `tokenizer.get_hash` give the position (=index) of this word ngram in the embedding matrix.

This index is between (nwords) and (nwords + bucket - 1).

This means that the indices of the embedding matrix are:

- [0, nwords-1] for vocabulary words.
- [nwords, nwords + bucket - 1] for n-grams (word or subword ngrams).

**Also note that the number of rows in the embedding matrix is : nword + num_tokens** 

In [12]:
hashes = (tokenizer.get_hash("data"), tokenizer.get_hash("scientist"))
print(num_tokens)
tokenizer.get_word_ngram_id(hashes=hashes, bucket=num_tokens, nwords=4000)

100000


51736

`get_subword_index` : Get the index of a subword ngram in the embedding matrix

In [13]:
tokenizer.get_subword_index(subword="cha")

91599

`get_word_index`: get the indice of a word (directly from `word_id_mapping`)

In [14]:
tokenizer.get_word_index(word="fil")

212

`get_subwords`: return all subword ngrams and indices from a word.

Also returns the entire word and its indice if this word is in the selected word vocabulary.

Tags "<" and ">" are added to the word

In [15]:
tokenizer.get_subwords(word="manufacturier")

(['<m',
  'ma',
  'an',
  'nu',
  'uf',
  'fa',
  'ac',
  'ct',
  'tu',
  'ur',
  'ri',
  'ie',
  'er',
  'r>',
  '<ma',
  'man',
  'anu',
  'nuf',
  'ufa',
  'fac',
  'act',
  'ctu',
  'tur',
  'uri',
  'rie',
  'ier',
  'er>',
  '<man',
  'manu',
  'anuf',
  'nufa',
  'ufac',
  'fact',
  'actu',
  'ctur',
  'turi',
  'urie',
  'rier',
  'ier>'],
 [99284,
  46573,
  41528,
  17694,
  75860,
  11098,
  50575,
  11908,
  13908,
  23480,
  62502,
  22885,
  60456,
  75363,
  92139,
  74561,
  15579,
  75696,
  50539,
  12899,
  43197,
  29719,
  9442,
  41039,
  15701,
  25861,
  43850,
  11495,
  17792,
  29487,
  45279,
  64640,
  92433,
  16268,
  51591,
  78317,
  6778,
  76085,
  89129])

`indices_matrix`

Tokenize a sentence and returns : 
- indices: an tensor of indices
- id_to_token: a dict of tokens: id (**/!\ problem if 2 different tokens with the same id**)
- all_tokens_id : a dict of id: token 

Adds an "end of string" tag: `</s>` with indice = 0.

Used in `collate_fn` (`FastTextModelDataset`).


In [16]:
indices, id_to_token, all_tokens_id = tokenizer.indices_matrix("fil carrelage materiel")
print("==== indices\n")
print(indices)
print("\n==== indices length\n")
print(len(indices))
print("\n==== id_to_token\n")
print(id_to_token)
print("\n==== id_to_token length\n")
print(len(id_to_token))
print("\n==== all_tokens_id\n")
print(all_tokens_id)
print("\n==== all_tokens_id length\n")
print(len(all_tokens_id))

==== indices

tensor([  212, 42617, 90146, 66218,  9697, 66876,  2500, 47684,  9776, 54522,
        20955, 14860, 32503, 31074, 94742, 23504, 40099, 59243, 59500, 17757,
        34803, 45482, 33394, 87558, 60497, 64397, 77198, 18327, 24269, 67835,
        41689, 58166, 91361, 75674, 23980,  5464,    88, 99284, 46573, 49146,
        72004, 60456, 62502, 22885, 94742,  9697, 92139,  8847, 53865, 49586,
        61471, 15701, 54431, 58112, 35305, 63482,  3505,  7837, 19530, 20847,
        49883,     0, 43953, 66823,  4663, 69627, 25154])

==== indices length

67

==== id_to_token

{212: 'fil', 42617: '<f', 90146: 'fi', 66218: 'il', 9697: 'l>', 66876: '<fi', 2500: 'il>', 47684: '<fil', 9776: 'fil>', 54522: '<c', 20955: 'ca', 14860: 'ar', 32503: 'rr', 31074: 're', 94742: 'el', 23504: 'la', 40099: 'ag', 59243: 'ge', 59500: 'e>', 17757: '<ca', 34803: 'car', 45482: 'arr', 33394: 'rre', 87558: 'rel', 60497: 'ela', 64397: 'lag', 77198: 'age', 18327: 'ge>', 24269: '<car', 67835: 'carr', 41689: 'ar

`tokenize` : loops indices_matrix results for a full training texte (= a list of sentences).

/!\ problem with param `text_tokens=True`: only used for explanability.

In [17]:
( 
    tokenized_text, 
    id_to_token_dicts, 
    token_to_id_dicts
) = tokenizer.tokenize(
        text=["aa fil", "bbb aa"], 
        text_tokens=False, 
        preprocess=False
    )

print('\ntokenized_text:')
print(tokenized_text)
print('\nid_to_token_dicts:')
print(id_to_token_dicts)
print('\ntoken_to_id_dicts:')
print(token_to_id_dicts)

print(len(tokenized_text[0]))
print(len(id_to_token_dicts[0]))
print(len(token_to_id_dicts[0]))


tokenized_text:
[tensor([ 9760, 15816, 43375, 82973,   212, 42617, 90146, 66218,  9697, 66876,
         2500, 47684,  9776,     0, 24467, 15047, 58630]), tensor([32141, 27575, 27575, 47571, 81517, 38259, 98765, 95475,  9760, 15816,
        43375, 82973,     0, 32272, 14808, 72821])]

id_to_token_dicts:
[{9760: '<a', 15816: 'a>', 43375: '<aa', 82973: 'aa>', 212: 'fil', 42617: '<f', 90146: 'fi', 66218: 'il', 9697: 'l>', 66876: '<fi', 2500: 'il>', 47684: '<fil', 9776: 'fil>', 0: '</s>', 24467: 'aa fil', 15047: 'fil </s>', 58630: 'aa fil </s>'}, {32141: '<b', 27575: 'bb', 47571: 'b>', 81517: '<bb', 38259: 'bb>', 98765: '<bbb', 95475: 'bbb>', 9760: '<a', 15816: 'a>', 43375: '<aa', 82973: 'aa>', 0: '</s>', 32272: 'bbb aa', 14808: 'aa </s>', 72821: 'bbb aa </s>'}]

token_to_id_dicts:
[{'<a': 9760, 'a>': 15816, '<aa': 43375, 'aa>': 82973, 'fil': 212, '<f': 42617, 'fi': 90146, 'il': 66218, 'l>': 9697, '<fi': 66876, 'il>': 2500, '<fil': 47684, 'fil>': 9776, '</s>': 0, 'aa fil': 24467, 'fil </s>

In [18]:
# text = ["aa", "bbb"]
# text_tokens=True
# preprocess=False

# tokenized_text = []
# id_to_token_dicts = []
# token_to_id_dicts = []

# sentence = text[0]
# all_ind, id_to_token, token_to_id = tokenizer.indices_matrix(sentence)
# tokenized_text.append(all_ind)
# id_to_token_dicts.append(id_to_token)
# token_to_id_dicts.append(token_to_id)

# sentence = text[1]
# all_ind, id_to_token, token_to_id = tokenizer.indices_matrix(sentence)
# tokenized_text.append(all_ind)
# id_to_token_dicts.append(id_to_token)
# token_to_id_dicts.append(token_to_id)

# print(tokenized_text)
# print(id_to_token_dicts)
# print(token_to_id_dicts)
# padding_index=2009603
# end_of_string_index=0
# i=0
# tokenized_sentence = tokenized_text[0]
# print(tokenized_sentence)
# token_id = tokenized_sentence[0]
# token_id.item()
# id_to_token_dicts[i][token_id.item()]

#  [
#             id_to_token_dicts[i][token_id.item()]
#             for token_id in tokenized_sentence
#             if token_id.item() not in {padding_index}
#         ]
#         for i, tokenized_sentence in enumerate(tokenized_text)


# for sentence in text:
#     all_ind, id_to_token, token_to_id = self.indices_matrix(
#         sentence
#     )  # tokenize and convert to token indices
#     tokenized_text.append(all_ind)
#     id_to_token_dicts.append(id_to_token)
#     token_to_id_dicts.append(token_to_id)

# if text_tokens:
#     tokenized_text_tokens = tokenized_text_in_tokens(tokenized_text, id_to_token_dicts)
#     return tokenized_text_tokens, tokenized_text, id_to_token_dicts, token_to_id_dicts
# else:
#     return tokenized_text, id_to_token_dicts, token_to_id_dicts

# def tokenized_text_in_tokens(
#     tokenized_text, id_to_token_dicts, padding_index=2009603, end_of_string_index=0
# ):
#     return [
#         [
#             id_to_token_dicts[i][token_id.item()]
#             for token_id in tokenized_sentence
#             if token_id.item() not in {padding_index}
#         ]
#         for i, tokenized_sentence in enumerate(tokenized_text)
#     ]
# all_tokens_id = {'bonjour': 1, 'monde': 2, '!': 1}

# # Inversion du dictionnaire : IDs -> tokens
# id_to_token = {v: k for k, v in all_tokens_id.items()}

# print(id_to_token)

### Build model from tokenizer

Question: what happens under the hood when using `torchFastText.build_from_tokenizer` ?

In [19]:
EMBED_DIM
CAT_EMBED_DIM
SPARSE
LR
NUM_CLASSES
NUM_CAT_VAR
CAT_VOCAB_SIZE

[24, 26, 8, 12, 3, 4]

In [20]:
model = torchFastText.build_from_tokenizer(
    tokenizer, 
    embedding_dim=EMBED_DIM, 
    categorical_embedding_dims=CAT_EMBED_DIM, 
    sparse=SPARSE, 
    lr = LR, 
    num_classes=NUM_CLASSES, 
    num_categorical_features=NUM_CAT_VAR, 
    categorical_vocabulary_sizes=CAT_VOCAB_SIZE
    )

2025-03-14 16:21:23 - torchFastText.model.pytorch_model - num_rows is different from the number of tokens in the tokenizer. Using provided num_rows.
2025-03-14 16:21:23 - torchFastText.torchFastText - No scheduler parameters provided. Using default parameters (suited for ReduceLROnPlateau).


In [22]:
print(model.num_rows)
print(model.padding_idx)

101011
101010
