# Explain how it works

- introduction to `torch` framework
- present the specificities of `torchFastText`

## Environment and data 

In [1]:
!pip install -r ../requirements.txt -q

In [2]:
import os
import sys
import time
import s3fs
from typing import List, Optional, Dict
from pathlib import Path
from utils_describe import get_data

sys.path.append("../")
from torchFastText import torchFastText
from torchFastText.preprocess import clean_text_feature
from torchFastText.datasets import NGramTokenizer
sys.path.append("./notebooks")

import numpy as np
# import pandas as pd
# import pyarrow.parquet as pq

# from utils import add_libelles, clean_and_tokenize_df, stratified_split_rare_labels

%load_ext autoreload
%autoreload 2

In [3]:
# Fetch the train/test features and labels through the `get_data` helper
# imported from `utils_describe`.
X_train, X_test, y_train, y_test = get_data()

2025-03-13 09:50:07 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-13 09:50:07 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-13 09:50:07 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-13 09:50:07 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-13 09:50:08 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-03-13 09:50:11 - botocore.httpchecksum - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].


	*** 732 codes have been added in the database...



  df.fillna("nan", inplace=True)


In [4]:
# Show the first three feature rows, then the three matching labels.
# `sep="\n"` reproduces the line breaks of one print call per item.
print("Features for the 3 first training obs:\n", X_train[:3], "\n", sep="\n")
print("NAF codes (labels) for the 3 first training obs:\n", y_train[:3], sep="\n")

Features for the 3 first training obs:

[['terrain camping parc caravan vehicul loisir' 22 26 7 7 0 0]
 ['fabriqu vet travail' 22 26 7 7 0 0]
 ["commerc gros (commerc interentreprises) d'ordinateurs, d'equip informat peripher logiciel"
  22 26 7 7 0 0]]


NAF codes (labels) for the 3 first training obs:

[516 110 413]


Parameters : 
- example with a unique embedding dimension for categorical variables

In [5]:
# --- Model ---------------------------------------------------------------
# Rows of the embedding matrix, i.e. size of the embedded vocabulary.
NUM_TOKENS = 100_000
# Columns of the embedding matrix (dimension of each text embedding).
EMBED_DIM = 50
# Whether to use a sparse Embedding layer for fast computation (see PyTorch documentation).
SPARSE = False
# Embedding dimension shared by the categorical features.
CAT_EMBED_DIM = 10

# --- Tokenizer -----------------------------------------------------------
# A word must occur at least this many times in the corpus to enter the vocabulary.
MIN_COUNT = 3
# Shortest and longest character n-grams.
MIN_N = 2
MAX_N = 4
# Length of word n-grams.
LEN_WORD_NGRAMS = 3

# --- Training (not needed right away) --------------------------------------
LR = 0.004  # Learning rate
NUM_EPOCHS = 1
BATCH_SIZE = 256
PATIENCE = 3

## Building the model

Let's explore what happens during the build step.
Assumptions:
- we use the `torchFastText.build_from_tokenizer` method to build the model, so the tokenizer must be created first.
- we don't use the JSON method to get the parameters


In [6]:
# Column 0 of X_train holds the text; the remaining columns are the
# categorical variables.
training_text = X_train[:, 0].tolist()
categorical_variables = X_train[:, 1:]

NUM_CAT_VAR = categorical_variables.shape[1]

# Vocabulary size per categorical column = largest observed code + 1.
max_code_per_column = np.max(categorical_variables, axis=0)
CAT_VOCAB_SIZE = (max_code_per_column + 1).astype(int).tolist()

distinct_labels = np.unique(y_train)
NUM_CLASSES = len(distinct_labels)

print(f"NUM_CAT_VAR: {NUM_CAT_VAR}")
print(f"CAT_VOCAB_SIZE: {CAT_VOCAB_SIZE}")
print(f"NUM_CLASSES: {NUM_CLASSES}")

NUM_CAT_VAR: 6
CAT_VOCAB_SIZE: [23, 27, 8, 12, 3, 4]
NUM_CLASSES: 732


### The tokenizer

Let's describe the constructor of the `NGramTokenizer` class

```python
tokenizer = NGramTokenizer(
    min_n=MIN_N, 
    max_n=MAX_N, 
    num_tokens= NUM_TOKENS,
    len_word_ngrams=LEN_WORD_NGRAMS, 
    min_count=MIN_COUNT, 
    training_text=training_text
    )
```


Constructor steps: 
- Checks the parameters `min_n` and `max_n`
- Counts the number of occurrences of each word in the whole training text ==> `word_counts` 
- Creates a mapping of all words seen at least `min_count` times in `training_text` ==> `word_id_mapping`
- Counts the number of unique words among those selected ==> `nwords`


In [7]:
# Replay, outside the class, the constructor steps listed in the markdown above.
min_n, max_n = MIN_N, MAX_N
num_tokens = NUM_TOKENS
len_word_ngrams = LEN_WORD_NGRAMS
min_count = MIN_COUNT

# Step 1: validate the character n-gram bounds.
if min_n < 2:
    raise ValueError("`min_n` parameter must be greater than 1.")
if max_n > 6:
    raise ValueError("`max_n` parameter must be smaller than 7.")

# Step 2: count occurrences of every space-separated word in the corpus.
word_counts = {}
for sentence in training_text:
    for token in sentence.split(" "):
        word_counts[token] = word_counts.get(token, 0) + 1

# Step 3: keep words seen at least `min_count` times and number them from 1,
# in order of first appearance.
frequent_words = [
    token for token, n_occurrences in word_counts.items() if n_occurrences >= min_count
]
word_id_mapping = {token: rank for rank, token in enumerate(frequent_words, start=1)}

# Step 4: size of the retained vocabulary.
nwords = len(word_id_mapping)

print(f"word_counts: {word_counts}")
print(f"word_id_mapping: {word_id_mapping}")
print(f"nwords: {nwords}")

word_counts: {'terrain': 11, 'camping': 1, 'parc': 3, 'caravan': 1, 'vehicul': 83, 'loisir': 11, 'fabriqu': 221, 'vet': 21, 'travail': 11, 'commerc': 147, 'gros': 63, '(commerc': 47, 'interentreprises)': 46, "d'ordinateurs,": 3, "d'equip": 32, 'informat': 39, 'peripher': 4, 'logiciel': 32, 'recherche-developp': 3, 'biotechnolog': 1, 'caoutchouc': 3, 'synthet': 2, "d'autr": 65, 'produit': 173, 'chimiqu': 6, 'n.c.a.': 19, 'fromag': 1, 'extract': 13, 'houill': 1, 'pharmaceut': 7, 'bas': 14, 'cultur': 29, 'plant': 8, 'fibr': 15, 'machin': 21, "d'usag": 1, 'general': 44, 'textil': 14, "l'extraction,": 1, 'construct': 39, 'gen': 5, 'civil': 10, 'chasse,': 1, 'piegeag': 1, 'servic': 161, 'annex': 7, 'locat': 254, 'videocasset': 1, 'disqu': 1, 'video': 11, 'enseign': 24, 'conduit': 6, 'repar': 38, "d'articl": 35, "d'horloger": 3, 'bijouter': 5, 'heberg': 20, 'medicalis': 3, 'enfant': 9, 'handicap': 7, 'locomot': 1, 'materiel': 49, 'ferroviair': 3, 'roul': 2, 'imprimer': 2, 'journal': 2, 'artic

In [8]:
# Gather the constructor arguments in one place, then build the tokenizer.
tokenizer_params = {
    "min_n": MIN_N,
    "max_n": MAX_N,
    "num_tokens": NUM_TOKENS,
    "len_word_ngrams": LEN_WORD_NGRAMS,
    "min_count": MIN_COUNT,
    "training_text": training_text,
}
tokenizer = NGramTokenizer(**tokenizer_params)
tokenizer

<torchFastText.datasets.tokenizer.NGramTokenizer at 0x7f684a9307d0>

Some internal functions : 

In [9]:
# List the character n-grams of length n=3 contained in a single word.
tokenizer.get_ngram_list("charcutier", n=3)

['cha', 'har', 'arc', 'rcu', 'cut', 'uti', 'tie', 'ier']

`get_hash` maps a token to an integer hash; the same token always yields the same value (see the two `"uti"` calls below)

In [10]:
# "uti" is hashed twice on purpose, so the two outputs can be compared.
for token in ("uti", "uti", "data", "scientist"):
    print(tokenizer.get_hash(token))

865323743
865323743
3631407781
2349978481


For a word n-gram like "data scientist", `tokenizer.get_word_ngram_id` gives the position (= index) of this word n-gram in the embedding matrix, computed from the hashes of its words.

This index is between `nwords` and `nwords + bucket - 1`, where `bucket` is the number of hash buckets (`num_tokens` in the cell below).

This means that the indices of the embedding matrix are:

- [0, nwords-1] for vocabulary words.
- [nwords, nwords + bucket - 1] for n-grams (word or subword n-grams).

**Also note that the number of rows in the embedding matrix is: nwords + num_tokens** 

In [11]:
# Hash each word of the bigram separately, then turn the pair of hashes into
# an index; nwords=4000 is an arbitrary value chosen for this illustration.
data_hash = tokenizer.get_hash("data")
scientist_hash = tokenizer.get_hash("scientist")
hashes = (data_hash, scientist_hash)
tokenizer.get_word_ngram_id(hashes=hashes, bucket=num_tokens, nwords=4000)

51736

`get_subword_index` : Get the index of a subword ngram in the embedding matrix

In [12]:
# Index of the character n-gram "cha" in the embedding matrix.
tokenizer.get_subword_index(subword="cha")

91580

`get_word_index`: get the index of a word (directly from `word_id_mapping`)

In [13]:
# Index of the whole word "fil".
tokenizer.get_word_index(word="fil")

163

`get_subwords`: return all subword ngrams and indices from a word.

Also returns the entire word and its index if this word is in the selected word vocabulary.

Tags "<" and ">" are added to the word

In [21]:
# Returns a pair: the list of subword n-grams (with "<" / ">" boundary tags)
# and the list of their indices.
tokenizer.get_subwords(word="manufacturier")

(['<m',
  'ma',
  'an',
  'nu',
  'uf',
  'fa',
  'ac',
  'ct',
  'tu',
  'ur',
  'ri',
  'ie',
  'er',
  'r>',
  '<ma',
  'man',
  'anu',
  'nuf',
  'ufa',
  'fac',
  'act',
  'ctu',
  'tur',
  'uri',
  'rie',
  'ier',
  'er>',
  '<man',
  'manu',
  'anuf',
  'nufa',
  'ufac',
  'fact',
  'actu',
  'ctur',
  'turi',
  'urie',
  'rier',
  'ier>'],
 [99265,
  46554,
  41509,
  17675,
  75841,
  11079,
  50556,
  11889,
  13889,
  23461,
  62483,
  22866,
  60437,
  75344,
  92120,
  74542,
  15560,
  75677,
  50520,
  12880,
  43178,
  29700,
  9423,
  41020,
  15682,
  25842,
  43831,
  11476,
  17773,
  29468,
  45260,
  64621,
  92414,
  16249,
  51572,
  78298,
  6759,
  76066,
  89110])

`indices_matrix`

Tokenizes a sentence and returns: 
- indices: a tensor of indices
- id_to_token: a dict of id: token (**/!\ problem if 2 different tokens share the same id: only one of them is kept**)
- all_tokens_id: a dict of token: id 

Adds an "end of string" tag: `</s>` with index = 0.

Used in `collate_fn` (`FastTextModelDataset`).


In [None]:
indices, id_to_token, all_tokens_id = tokenizer.indices_matrix("fil carrelage materiel")

# (header, value) pairs printed one after the other: each returned object,
# followed by its length.
report = [
    ("==== indices\n", indices),
    ("\n==== indices length\n", len(indices)),
    ("\n==== id_to_token\n", id_to_token),
    ("\n==== id_to_token length\n", len(id_to_token)),
    ("\n==== all_tokens_id\n", all_tokens_id),
    ("\n==== all_tokens_id length\n", len(all_tokens_id)),
]
for header, value in report:
    print(header)
    print(value)

In [None]:
# Only the last expression of a cell is displayed, so both results are
# returned together as a tuple: (word index, subwords with their indices).
tokenizer.get_word_index("fil"), tokenizer.get_subwords("fil")

`tokenize`: applies `indices_matrix` to each sentence of a full training text (= a list of sentences) and collects the results.

/!\ problem with the parameter `text_tokens=True`: it is only used for explainability.

In [None]:
# With text_tokens=False the result is unpacked into three values. There is no
# `tokenized_text_tokens` in this mode, so it must not be referenced here
# (doing so raised a NameError on a fresh kernel).
(
    tokenized_text,
    id_to_token_dicts,
    token_to_id_dicts
) = tokenizer.tokenize(
        text=["aa fil", "bbb aa"],
        text_tokens=False,
        preprocess=False
    )

print('\ntokenized_text:')
print(tokenized_text)
print('\nid_to_token_dicts:')
print(id_to_token_dicts)
print('\ntoken_to_id_dicts:')
print(token_to_id_dicts)

# Sizes for the first sentence only.
print(len(tokenized_text[0]))
print(len(id_to_token_dicts[0]))
print(len(token_to_id_dicts[0]))

In [None]:
text = ["aa", "bbb"]
# These two flags are set but not read anywhere in this cell.
text_tokens = True
preprocess = False

tokenized_text, id_to_token_dicts, token_to_id_dicts = [], [], []

# Tokenize each sentence in turn and collect the three results side by side.
for sentence in text:
    all_ind, id_to_token, token_to_id = tokenizer.indices_matrix(sentence)
    tokenized_text.append(all_ind)
    id_to_token_dicts.append(id_to_token)
    token_to_id_dicts.append(token_to_id)


In [None]:
# Dump the three collections built in the previous cell, one per print call.
for collected in (tokenized_text, id_to_token_dicts, token_to_id_dicts):
    print(collected)

In [None]:
# Default values copied from the `tokenized_text_in_tokens` signature shown
# further down in this notebook.
padding_index = 2009603
end_of_string_index = 0

# Look at the first sentence (i = 0) and map its first token id back to a token.
i = 0
tokenized_sentence = tokenized_text[i]
print(tokenized_sentence)

token_id = tokenized_sentence[0]
first_raw_id = token_id.item()  # plain Python value used as the dict key
id_to_token_dicts[i][first_raw_id]

In [None]:

 [
            id_to_token_dicts[i][token_id.item()]
            for token_id in tokenized_sentence
            if token_id.item() not in {padding_index}
        ]
        for i, tokenized_sentence in enumerate(tokenized_text)


In [None]:

def tokenize_walkthrough(tokenizer, text, text_tokens=False):
    """Runnable version of the tokenization steps that were pasted in this cell.

    The original snippet used `self` and `return` at top level, which cannot
    run in a notebook cell. Here the tokenizer is passed in explicitly and the
    accumulators are created fresh on every call.

    Args:
        tokenizer: object exposing `indices_matrix(sentence)`, which returns a
            3-tuple (indices, id_to_token, token_to_id).
        text: iterable of sentences.
        text_tokens: when True, also convert the indices back to tokens with
            `tokenized_text_in_tokens` (defined in a later cell of this
            notebook; it must have been run before calling with True).

    Returns:
        (tokenized_text, id_to_token_dicts, token_to_id_dicts), preceded by
        `tokenized_text_tokens` when `text_tokens` is True.
    """
    tokenized_text = []
    id_to_token_dicts = []
    token_to_id_dicts = []

    for sentence in text:
        # tokenize and convert to token indices
        all_ind, id_to_token, token_to_id = tokenizer.indices_matrix(sentence)
        tokenized_text.append(all_ind)
        id_to_token_dicts.append(id_to_token)
        token_to_id_dicts.append(token_to_id)

    if text_tokens:
        tokenized_text_tokens = tokenized_text_in_tokens(tokenized_text, id_to_token_dicts)
        return tokenized_text_tokens, tokenized_text, id_to_token_dicts, token_to_id_dicts
    return tokenized_text, id_to_token_dicts, token_to_id_dicts

In [None]:

def tokenized_text_in_tokens(
    tokenized_text, id_to_token_dicts, padding_index=2009603, end_of_string_index=0
):
    """Translate sentences of token ids back into sentences of tokens.

    Args:
        tokenized_text: sequence of sentences; each sentence is an iterable of
            elements exposing `.item()` (the raw token id).
        id_to_token_dicts: one id -> token mapping per sentence, aligned by
            position with `tokenized_text`.
        padding_index: ids equal to this value are dropped from the output.
        end_of_string_index: accepted but not used by this implementation.

    Returns:
        A list with one list of tokens per input sentence.
    """
    sentences_as_tokens = []
    for sentence_no, sentence_ids in enumerate(tokenized_text):
        tokens = []
        for token_id in sentence_ids:
            raw_id = token_id.item()
            if raw_id in {padding_index}:
                continue  # padding carries no token
            tokens.append(id_to_token_dicts[sentence_no][raw_id])
        sentences_as_tokens.append(tokens)
    return sentences_as_tokens

In [None]:
all_tokens_id = {'bonjour': 1, 'monde': 2, '!': 1}

# Invert the dictionary: ids -> tokens.
# 'bonjour' and '!' share id 1, so the later entry overwrites the earlier one.
id_to_token = dict(zip(all_tokens_id.values(), all_tokens_id.keys()))

print(id_to_token)