# CBOW model trained on 20000 lieues sous les mers

## Needed libraries

You will need the following new libraries:

-   `spacy` for tokenizing
-   `gensim` for cosine similarities (use `gensim>=4.0.0`)

You will also need to download rules for tokenizing a french text.

``` bash
python -m spacy download fr_core_news_sm
```

In [21]:
import numpy as np
import torch
from torch import nn
import torch.optim as optim

import spacy
from gensim.models.keyedvectors import KeyedVectors

## Tokenizing the corpus

In [22]:
# Installe directement le modèle français depuis le dépôt officiel
!pip install https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl

Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m39.9 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m


In [23]:
# Use a french tokenizer to create a tokenizer for the french language
spacy_fr = spacy.load("fr_core_news_sm")
with open("data/20_000_lieues_sous_les_mers.txt", "r", encoding="utf-8") as f:
    document = spacy_fr.tokenizer(f.read())

# Define a filtered set of tokens by iterating on `document`. Define a
# subset of tokens that are
#
# - alphanumeric
# - in lower case
tokens = [
    token.text.lower()
    for token in document if token.is_alpha or token.is_digit and not token.is_space and not len(token.text) > 2 and not token.is_stop
]

# Make a list of unique tokens and dictionary that maps tokens to
# their index in that list.
idx2tok = list(set(tokens))
tok2idx = {token: idx for idx, token in enumerate(idx2tok)}

## The continuous bag of words model

In [24]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        # Define an Embedding module (`nn.Embedding`) and a linear
        # transform (`nn.Linear`) without bias.
        self.embeddings = nn.Embedding(self.vocab_size, self.embedding_size)
        self.U_transpose = nn.Linear(self.embedding_size, self.vocab_size, bias=False)

    def forward(self, context):
        # Implements the forward pass of the CBOW model
        # `context` is of size `batch_size` * NGRAMS

        # `e_i` is of size `batch_size` * NGRAMS * `embedding_size`
        e_i = self.embeddings(context)

        # `e_bar` is of size `batch_size` * `embedding_size`
        e_bar = torch.mean(e_i, dim=1)

        # `UT_e_bar` is of size `batch_size` * `vocab_size`
        UT_e_bar = self.U_transpose(e_bar)

        return UT_e_bar


# Set the size of vocabulary and size of embedding
VOCAB_SIZE = len(idx2tok)
EMBEDDING_SIZE = 128

# Create a Continuous bag of words model
cbow = CBOW(VOCAB_SIZE, EMBEDDING_SIZE)

# Send to GPU if any
device = "cuda:0" if torch.cuda.is_available() else "cpu"
cbow.to(device)

CBOW(
  (embeddings): Embedding(14558, 128)
  (U_transpose): Linear(in_features=128, out_features=14558, bias=False)
)

## Preparing the data

In [25]:
# Generate n-grams for a given list of tokens, use yield, use window length of n-grams
def ngrams_iterator(token_list, ngrams):
    """Generates successive N-grams from a list of tokens."""

    for i in range(len(token_list) - ngrams + 1):
        idxs = [tok2idx[tok] for tok in token_list[i:i+ngrams]]

        # Get center element in `idxs`
        center = idxs.pop(ngrams // 2)

        # Yield the index of center word and indexes of context words
        # as a Numpy array (for Pytorch to automatically convert it to
        # a Tensor).
        yield center, np.array(idxs)


# Create center, context data
NGRAMS = 9
ngrams = list(ngrams_iterator(tokens, NGRAMS))

BATCH_SIZE = 512
data = torch.utils.data.DataLoader(ngrams, batch_size=BATCH_SIZE, shuffle=True)

## Learn CBOW model

In [26]:
# Gradient descent algorithm to use
optimizer = optim.Adam(cbow.parameters(), lr=0.01)

# Use a cross-entropy loss from the `nn` submodule
ce_loss = nn.CrossEntropyLoss()

In [27]:
EPOCHS = 50
for epoch in range(1, EPOCHS + 1):
    total_loss = 0
    for i, (center, context) in enumerate(data):
        center, context = center.to(device), context.to(device)

        # Reset the gradients of the computational graph
        cbow.zero_grad()

        # Forward pass
        UT_ebar = cbow.forward(context)

        # Compute negative log-likelihood loss averaged over the
        # mini-batch
        loss = ce_loss(UT_ebar, center)

        # Backward pass to compute gradients of each parameter
        loss.backward()

        # Gradient descent step according to the chosen optimizer
        optimizer.step()

        total_loss += loss.data

        if i % 20 == 0:
            loss_avg = float(total_loss / (i + 1))
            print(
                f"Epoch ({epoch}/{EPOCHS}), batch: ({i}/{len(data)}), loss: {loss_avg}"
            )

    # Print average loss after each epoch
    loss_avg = float(total_loss / len(data))
    print("{}/{} loss {:.2f}".format(epoch, EPOCHS, loss_avg))

    # Predict if `predict_center_word` is implemented
    try:
        left_words = ["le", "capitaine"]
        right_words = ["me", "dit"]
        word = predict_center_word(word2vec, *left_words, *right_words)[0]
        print(" ".join(left_words + [word] + right_words))
    except NameError:
        pass

Epoch (1/50), batch: (0/271), loss: 9.589542388916016
Epoch (1/50), batch: (20/271), loss: 8.639681816101074
Epoch (1/50), batch: (40/271), loss: 8.157389640808105
Epoch (1/50), batch: (60/271), loss: 7.898660659790039
Epoch (1/50), batch: (80/271), loss: 7.733100891113281
Epoch (1/50), batch: (100/271), loss: 7.601471900939941
Epoch (1/50), batch: (120/271), loss: 7.50297212600708
Epoch (1/50), batch: (140/271), loss: 7.430117130279541
Epoch (1/50), batch: (160/271), loss: 7.369608402252197
Epoch (1/50), batch: (180/271), loss: 7.31596040725708
Epoch (1/50), batch: (200/271), loss: 7.260052680969238
Epoch (1/50), batch: (220/271), loss: 7.209349632263184
Epoch (1/50), batch: (240/271), loss: 7.1690568923950195
Epoch (1/50), batch: (260/271), loss: 7.133075714111328
1/50 loss 7.12
Epoch (2/50), batch: (0/271), loss: 5.37155818939209
Epoch (2/50), batch: (20/271), loss: 5.359081268310547
Epoch (2/50), batch: (40/271), loss: 5.3762946128845215
Epoch (2/50), batch: (60/271), loss: 5.38553

## Prediction functions

Now that the model is learned we can give it a context it has never seen
and see what center word it predicts.

In [33]:
def predict_center_word_idx(cbow, *context_words_idx, k=10):
    """Return k-best center words given indexes of context words."""

    # Create a fake minibatch containing just one example
    fake_minibatch = torch.LongTensor(context_words_idx).unsqueeze(0).to(device)

    # Forward propagate through the cbow model
    score_center = cbow.forward(fake_minibatch)

    # Retrieve top k-best indexes using `torch.topk`
    _, best_idxs = torch.topk(score_center, k)
    
    best_idxs_list = best_idxs[0].cpu().tolist()

    # Return actual tokens using `idx2tok`
    return [idx2tok[idx] for idx in best_idxs_list]


def predict_center_word(cbow, *context_words, k=10):
    """Return k-best centaer words given context words."""

    idxs = [tok2idx[tok] for tok in context_words]
    return predict_center_word_idx(cbow, *idxs, k=k)

In [35]:
predict_center_word(cbow, "vingt", "mille", "sous", "les")
predict_center_word(cbow, "mille", "lieues", "les", "mers")
predict_center_word(cbow, "le", "capitaine", "fut", "le")
predict_center_word(cbow, "le", "commandant", "fut", "le")

['farragut',
 'tint',
 'nautilus',
 'vivement',
 'faite',
 'canot',
 'tendre',
 'stewart',
 'vers',
 'partit']

## Testing the embedding

We use the library `gensim` to easily compute most similar words for the
embedding we just learned. Use `gensim>=4.0.0`.

In [36]:
m = KeyedVectors(vector_size=EMBEDDING_SIZE)
m.add_vectors(idx2tok, cbow.embeddings.weight.detach().cpu().numpy())

You can now test most similar words for, for example “lieues”, “mers”,
“professeur”… You can look at `words_decreasing_freq` to test most
frequent tokens.

In [37]:
unique, freq = np.unique(tokens, return_counts=True)
idxs = freq.argsort()[::-1]
words_decreasing_freq = list(zip(unique[idxs], freq[idxs]))

In [38]:
# Personnages et entités clés
print("--- Capitaine ---")
print(m.most_similar("capitaine"))

print("\n--- Nemo ---")
print(m.most_similar("nemo"))

print("\n--- Nautilus ---")
print(m.most_similar("nautilus"))

# Environnement
print("\n--- Mer ---")
print(m.most_similar("mer"))

print("\n--- Eau ---")
print(m.most_similar("eau"))

# Unités de mesure (très fréquentes chez Jules Verne)
print("\n--- Lieues ---")
print(m.most_similar("lieues"))

print("\n--- Mètres ---")
print(m.most_similar("mètres"))

--- Capitaine ---
[('82', 0.3960556387901306), ('lancez', 0.3935999870300293), ('mystère', 0.35987669229507446), ('canadien', 0.34933871030807495), ('taux', 0.34639009833335876), ('oppose', 0.33605027198791504), ('commencement', 0.3342539966106415), ('indifféremment', 0.3316226899623871), ('tropique', 0.32994717359542847), ('24', 0.3298708498477936)]

--- Nemo ---
[('bougea', 0.5895227789878845), ('acharnement', 0.4862517714500427), ('regagnâmes', 0.479692667722702), ('souriant', 0.4608738422393799), ('prévenus', 0.42575061321258545), ('nicolas', 0.3940410614013672), ('illuminer', 0.39238157868385315), ('savent', 0.3870413899421692), ('amabilité', 0.3739234507083893), ('tabac', 0.37359553575515747)]

--- Nautilus ---
[('coulant', 0.3906456232070923), ('tropique', 0.38268303871154785), ('correspondant', 0.3807491958141327), ('pereire', 0.37972718477249146), ('examina', 0.3782614469528198), ('paramatta', 0.37448400259017944), ('milliard', 0.3620046079158783), ('tabac', 0.354568749666214)