In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from whatlies import Embedding, EmbeddingSet
from whatlies.language import SpacyLanguage

In [59]:
# Language backend; indexing it with a list of words (`lang[words]`) yields the
# embeddings that the cells below transform and plot.
lang = SpacyLanguage('en_core_web_sm')

# First word list: mostly animal and food/drink nouns, plus "union".
words1 = [
    "dog", "cat", "mouse", "deer", "elephant", "zebra", "fish", "rabbit", "rat",
    "tomato", "banana", "coffee", "tea", "apple", "union",
]

# Second word list: verbs, each listed exactly once.
words2 = ["run", "swim", "dance", "sit", "eat", "hear", "look", "stand"]

## Stateful Transformations

Many of the transformations that we offer carry state: the first time one is applied, it learns from the data that it sees, and afterwards it can be re-used on other data.

Let's show a quick example below.

In [60]:
from whatlies.transformers import Pca

In [61]:
# One shared PCA transformer with two output components.
pca = Pca(2)

# Order matters: the transformer learns from the first set it is applied to
# (`words1`) and is then re-used for `words2`.
emb1 = lang[words1].transform(pca)
emb2 = lang[words2].transform(pca)

# Build one interactive plot per set on the two PCA axes.
p1, p2 = (
    embset.plot_interactive('pca_0', 'pca_1')
    for embset in (emb1, emb2)
)

# Combine both plots with `|` so they are displayed together.
p1 | p2

### Out of Scope Detection

We can improve this workflow, though: we can add properties to embeddings, and these properties can then be used for plotting.

To demonstrate this we're going to use `Umap`. We will train it on one set of words but apply it to both that set and a second one. Each set is tagged with a `set` property, so the plot can color every word by the set it came from.

In [145]:
# Import the transformer used in this cell so it runs on a fresh kernel.
from whatlies.transformers import Umap

umap = Umap(2)

# The same transformer instance is used for both sets: it learns from `words1`
# on the first call and is re-used for `words2`. Each result is tagged with a
# constant `set` property so the two groups can be told apart after merging.
emb1 = lang[words1].transform(umap).add_property('set', lambda d: 'set-one')
emb2 = lang[words2].transform(umap).add_property('set', lambda d: 'set-two')
both = emb1.merge(emb2)

# Plot the merged embeddings on the two UMAP axes, colored by the `set` property.
both.plot_interactive('umap_0', 'umap_1', color='set').properties(width=400, height=400)

  "n_neighbors is larger than the dataset size; truncating to "


In [117]:
# import pathlib

In [122]:
# texts = [t.lower() for t in pathlib.Path("untitled.txt").read_text().split("\n")]

The UMAP plot above demonstrates a tendency to place out-of-scope words (words from the set that the transformer was not trained on) on the outside.

In [123]:
# emb = lang[texts]

In [124]:
# NOTE(review): disabled scratch experiment, kept for reference only.
# It repeats the two-set UMAP comparison on `emb`, which comes from the
# commented-out `lang[texts]` cell above; `texts` in turn is read from a local
# `untitled.txt`. Re-enable all of those cells together or not at all.
# umap = Umap(2)

# emb_sara = emb.transform(umap).add_property('set', lambda d: 'set-sara')
# emb_other = lang[words1 + words2].transform(umap).add_property('set', lambda d: 'set-other')

# together = emb_sara.merge(emb_other)
# together.plot_interactive('umap_0', 'umap_1', color='set', annot=False).properties(width=700, height=700)