In [1]:
from adaptnlp import EasyWordEmbeddings, EasyStackedEmbeddings, EasyDocumentEmbeddings

# Easy Embeddings

### Pretrained model keys are listed in the Hugging Face Transformers documentation and in Flair's tutorials

## Example of producing embeddings using EasyWordEmbeddings

In [2]:
# Sample input reused by every embedding call in this notebook.
example_text = "This is Albert.  My last name is Einstein.  I like physics and atoms."

In [7]:
# Create the word-embeddings helper. No model is named here; each
# `embed_text` call in the following cells selects one via `model_name_or_path`.
embeddings = EasyWordEmbeddings()

In [4]:
# Embed the example text with the pretrained "gpt2" model.
# The call returns a list of flair Sentence objects.
sentences = embeddings.embed_text(example_text, model_name_or_path="gpt2")
# Only the first token of the first Sentence is shown, to keep the output short.
first_sentence = sentences[0]
first_token = next(iter(first_sentence), None)
if first_token is not None:
    print(first_token.get_embedding())

Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([ 0.0530, -0.0137, -0.2393,  ..., -1.2358, -0.9708,  0.6150],
       device='cuda:0')


In [5]:
# Repeat the embedding step, this time with the "bert-base-cased" model.
sentences = embeddings.embed_text(example_text, model_name_or_path="bert-base-cased")
# Only the first token of the first Sentence is shown, to keep the output short.
first_sentence = sentences[0]
first_token = next(iter(first_sentence), None)
if first_token is not None:
    print(first_token.get_embedding())

tensor([ 0.5918, -0.4142,  1.0203,  ...,  0.4004, -0.1586,  1.0107],
       device='cuda:0')


In [None]:
# Repeat the embedding step, this time with the "roberta-base" model.
sentences = embeddings.embed_text(example_text, model_name_or_path="roberta-base")
# Only the first token of the first Sentence is shown, to keep the output short.
first_sentence = sentences[0]
first_token = next(iter(first_sentence), None)
if first_token is not None:
    print(first_token.get_embedding())

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…

## Producing stacked embeddings with EasyStackedEmbeddings

In [9]:
# Instantiate stacked embeddings with a variable number of language models,
# one positional argument per pretrained model key.
# NOTE: this rebinds `embeddings`, replacing the EasyWordEmbeddings instance
# created earlier in the notebook.
embeddings = EasyStackedEmbeddings("bert-base-cased", "distilbert-base-cased")

May need a couple moments to instantiate...


In [10]:
# Embed the example text with the stacked (concatenated) word embeddings.
sentences = embeddings.embed_text(example_text)
# Only the first token of the first Sentence is shown, to keep the output short.
first_sentence = sentences[0]
first_token = next(iter(first_sentence), None)
if first_token is not None:
    print(first_token.get_embedding())

tensor([ 0.5918, -0.4142,  1.0203,  ...,  1.0985, -1.8035, -1.5887],
       device='cuda:0')


## Document Embeddings with EasyDocumentEmbeddings

In [3]:
# Instantiate document embeddings with a variable number of language models,
# one positional argument per pretrained model key.
# NOTE: this rebinds `embeddings` once more; the `embed_pool` and `embed_rnn`
# calls in the following cells need this instance.
embeddings = EasyDocumentEmbeddings("bert-base-cased", "distilbert-base-cased")

May need a couple moments to instantiate...
Pooled embedding loaded
RNN embeddings loaded


In [4]:
# Embed the example text using the pooled document embedding.
sentences = embeddings.embed_pool(example_text)
# Gather the embedding of every returned item, then print them one per line.
pooled_vectors = [sentence.get_embedding() for sentence in sentences]
for vector in pooled_vectors:
    print(vector)

tensor([ 0.4740,  0.0244,  0.3428,  ..., -0.0057, -0.1208, -0.0247],
       device='cuda:0', grad_fn=<CatBackward>)


In [5]:
# Embed the example text again, this time using the RNN document embedding.
sentences = embeddings.embed_rnn(example_text)
# Gather the embedding of every returned item, then print them one per line.
rnn_vectors = [sentence.get_embedding() for sentence in sentences]
for vector in rnn_vectors:
    print(vector)

tensor([ 8.6051e-01,  9.9875e-01, -5.1653e-01,  4.4028e-01, -2.6489e-01,
         8.1826e-01,  4.1479e-01,  9.7935e-01, -9.1401e-01, -5.3021e-01,
        -8.2195e-01,  6.7288e-01, -1.2646e-02,  8.5230e-01,  6.4977e-01,
        -9.6448e-01,  7.8760e-01,  9.4110e-01, -9.8675e-01,  4.4215e-01,
        -9.9318e-01,  4.2201e-01, -5.9397e-01, -4.1114e-01, -8.9010e-01,
        -2.6712e-02,  9.9513e-01, -7.9625e-01, -9.8152e-01, -3.6621e-01,
         8.4872e-01,  4.9597e-01,  4.6508e-01,  9.7859e-01,  3.7736e-01,
         3.5898e-01, -5.4604e-01, -4.8421e-01,  8.7586e-01,  6.1857e-01,
         8.5983e-01,  3.9903e-01,  5.5581e-01, -1.7296e-01, -8.6435e-01,
         9.9972e-02,  6.2743e-01,  9.3582e-01,  8.5841e-01,  7.8694e-02,
        -8.6842e-01, -8.7992e-01, -9.7008e-01, -5.9344e-01,  6.0902e-01,
        -9.7268e-01,  5.6673e-01, -5.6100e-01, -9.3629e-01,  7.8566e-01,
         3.7182e-01,  9.2551e-01,  7.4267e-01,  3.6521e-01, -1.0547e-02,
         9.1705e-01,  9.9012e-01, -1.1793e-01,  9.9