In [16]:
import numpy as np 
import pathlib
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from inlay.tokenizer import HyphenTokenizer, SentencePieceTokenizer

class GensimEmbedder:
    def __init__(self, tokenizer, size=10, window=3, min_count=1, iter=5, workers=2, prefix=""):
        self.tokenizer = tokenizer 
        self.size = size
        self.window = window
        self.min_count = min_count
        self.iter = iter 
        self.workers = workers
        self.prefix = prefix
    
    def fit(self, X, y=None):
        mod_name = f"{self.prefix}-{self.size}-{self.window}-{self.iter}.kv"
        tokens = self.tokenizer.transform(X)
        self.model_ = Word2Vec(sentences=tokens, 
                               size=self.size, 
                               window=self.window, 
                               min_count=self.min_count, 
                               iter=self.iter, 
                               workers=self.workers)
        return self
            
    def transform(self, X, y=None):
        result = np.zeros((len(X), self.model_.wv.vector_size))
        for idx_x, x in enumerate(X):
            tokens = self.tokenizer.transform([x])
            vectors = np.zeros((len(tokens[0]), self.model_.wv.vector_size))
            for idx_t, tok in enumerate(tokens[0]):
                try:
                    vectors[idx_t] = self.model_.wv[tok]
                except KeyError:
                    pass
            result[idx_x] = np.array(vectors).sum(axis=0)
        return result
        
    @classmethod
    def train_file(cls, tokenizer, input_file, size=10, window=3, min_count=1, iter=5, workers=2, mod_name=None):
        text = pathlib.Path(input_file).read_text()
        tokens = tokenizer.transform(text.split("\n"))
        model = Word2Vec(sentences=tokens, size=size, window=window, min_count=min_count, iter=iter, workers=workers)
        if not mod_name:
            mod_name = f"{pathlib.Path(input_file).stem}-{size}-{window}-{iter}.kv"
        model.wv.save(mod_name)
        return mod_name
    
    @classmethod
    def from_file(kv_file, tokenizer):
        self.tokenizer = tokenizer 
        self.kv = KeyedVectors.load(kv_file)
        raise NotImplementedError

In [2]:
texts = [t for t in pathlib.Path("tests/textdata/simpsons.txt").read_text().split("\n") if len(t) > 2]

In [3]:
# mod = GensimEmbedder(tokenizer=HyphenTokenizer(lang="nl_NL"), size=30, iter=100).fit(texts)

In [20]:
tokens = SentencePieceTokenizer(vocab_size=1600, model_type="word").fit(texts).transform(["hello world there"])
tokens

[['▁hello', '▁world', '▁there']]

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [30]:
class DeepAverageEmbedder(nn.Module):
    def __init__(self, vocab_size, hidden_size, emb_out_size, tokenizer):
        super(DeepAverageEmbedder, self).__init__()
        self.vocab_size = vocab_size
        self.tokenizer = tokenizer
        self.linear1 = nn.Linear(vocab_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, emb_out_size)

    def forward(self, x):
        return self.linear2(self.linear1(x))
    
    def tok_to_int(self, tokens):
        try:
            return [hash(tok) % self.vocab_size for tok in tokens]
        except TypeError:
            print(tokens)
    
    def texts_to_int_array(self, texts):
        output = np.zeros((len(texts), self.vocab_size))
        for i, tokens in enumerate(self.tokenizer.transform(texts)):
            for j in [self.tok_to_int(t) for t in tokens]:
                output[i, j] = 1
        return torch.FloatTensor(output)

In [31]:
tokenizer = HyphenTokenizer(lang="nl_NL").fit(texts)
model = DeepAverageEmbedder(vocab_size=100, hidden_size=10, emb_out_size=18, tokenizer=tokenizer)

## Script to Fetch Data

In [32]:
import pandas as pd 

label = ""
examples = []
for line in pathlib.Path("nlu.md").read_text().split("\n"):
    if len(line) > 0 and line.startswith("#"):
        label = line[line.find("# intent:"):]
    else:
        examples.append({
            'text': line.replace("- ", "").replace("["," ").replace("]"," ").replace("("," ").replace(")"," "),
            'label': label
        })

df = pd.DataFrame(examples)
texts = df['text'].to_list()
labels = pd.get_dummies(df['label']).values

We need to make a custom embedding layer.

In [53]:
torch.sum(torch.argmax(y_pred, axis=1) == torch.argmax(y, axis=1))

tensor(1407)

In [55]:
mod = SentencePieceTokenizer(vocab_size=1600, model_type="word").fit(texts)

model = DeepAverageEmbedder(vocab_size=1000, hidden_size=100, emb_out_size=18, tokenizer=mod)

x = model.tok_to_int(tokens)
y = torch.FloatTensor(labels)

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True, patience=5, factor=0.5)

for t in range(200):
    x = model.texts_to_int_array(texts)
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = nn.Softmax(0)(model(x))

    # Compute and print loss
    loss = criterion(y_pred, y)
    scheduler.step(loss)
    if t % 10 == 0:
        print(t, loss.item(), torch.sum(torch.argmax(y_pred, axis=1) == torch.argmax(y, axis=1)).item()/len(y))

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

[['▁hello', '▁world', '▁there']]
0 0.4285973608493805 0.062443845462713386
10 0.3879072666168213 0.531895777178796
20 0.3706926107406616 0.5543575920934412
30 0.3641543388366699 0.5741239892183289
40 0.3602246642112732 0.60062893081761
50 0.35816213488578796 0.6154537286612758
60 0.3569827675819397 0.6185983827493261
70 0.35626426339149475 0.6257861635220126
80 0.35580796003341675 0.6257861635220126
90 0.355506032705307 0.6271338724168913
100 0.3553053140640259 0.628032345013477
110 0.35516679286956787 0.6298292902066487
120 0.35506683588027954 0.6311769991015274
130 0.3549911379814148 0.6302785265049416
Epoch   140: reducing learning rate of group 0 to 5.0000e-03.
140 0.35493454337120056 0.6302785265049416
Epoch   147: reducing learning rate of group 0 to 2.5000e-03.
150 0.35491445660591125 0.6302785265049416
Epoch   153: reducing learning rate of group 0 to 1.2500e-03.
Epoch   159: reducing learning rate of group 0 to 6.2500e-04.
160 0.3549080491065979 0.6311769991015274
Epoch   165:

In [34]:
def encode_model(text):
    x = model.texts_to_int_array([text])
    return model(x)[0]

encode_model("rasa")

tensor([ -4.6035, -10.3357,  -7.8589, -12.1112,  -0.9519,   1.2320,   0.2778,
         -1.9073,  -1.4492, -29.9091,  -2.8240, -20.9509,  -6.1455,  -0.6653,
         -1.3931,  -5.1210,   0.5278,  -3.0557], grad_fn=<SelectBackward>)

In [35]:
from whatlies import EmbeddingSet, Embedding
from whatlies.transformers import Pca, Umap

In [36]:
embset = EmbeddingSet(*[Embedding(name=t, vector=encode_model(t).detach().numpy()) for t in set(df['text'])])

In [37]:
embset.transform(Pca(2)).plot_interactive(annot=False)

In [38]:
embset.transform(Umap(2)).plot_interactive(annot=False)