# Introduction  
Working with the Hugging Face `transformers` pipeline API.
(Examples gently stolen from Mariona Coll Ardanuy)

In [21]:
import transformers
from transformers import pipeline
import numpy as np

# Contextual Vectors and Ambiguity

Apple: computer or fruit?

In [30]:
def extract_vector(sentence, feature_pipeline, target_word, verbose=False):
    """Return the contextual vector of ``target_word`` inside ``sentence``.

    The sentence is tokenized, the position of the first token equal to
    ``target_word`` is looked up, and the row at that position is taken from
    the features returned by ``feature_pipeline(sentence)``.

    Args:
        sentence: Text to embed.
        feature_pipeline: Callable that maps the sentence to per-token
            feature vectors. If it exposes a ``tokenizer`` attribute, that
            tokenizer is used so token positions line up with the features.
        target_word: Token to look up. It must appear verbatim in the
            tokenized sentence (a word split into several word pieces will
            not match).
        verbose: When true, print the token ids and the tokens.

    Returns:
        The feature vector at the position of the first occurrence of
        ``target_word``.

    Raises:
        ValueError: If ``target_word`` is not one of the tokens.
    """
    # Use the pipeline's own tokenizer: a separately hard-coded checkpoint
    # name silently yields wrong token positions as soon as the pipeline is
    # built with a model that has a different vocabulary.
    tokenizer = getattr(feature_pipeline, "tokenizer", None)
    if tokenizer is None:
        # Plain callables without a tokenizer keep the previous default.
        tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert-base-uncased")

    # Encode the sentence into vocabulary ids, then map the ids back to tokens.
    token_ids = tokenizer.encode(sentence)
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    if verbose:
        print(token_ids)
        print(tokens)

    # Fail early, and with context, before paying for the forward pass.
    if target_word not in tokens:
        raise ValueError(
            f"{target_word!r} is not a token of the sentence; tokens are {tokens}"
        )

    # Drop single-dimensional axes so rows can be indexed by token position
    # (assumes the pipeline returns one vector per token -- TODO confirm for
    # pipelines other than feature-extraction).
    vectors = np.squeeze(feature_pipeline(sentence))
    return vectors[tokens.index(target_word)]

In [31]:
# Feature-extraction pipeline: BERT base (uncased) supplies both the model
# weights and the tokenizer.
feature_pipeline = pipeline(
    "feature-extraction",
    model="bert-base-uncased",
    tokenizer="bert-base-uncased",
)

In [24]:
# The word whose contextual vector is compared across sentences.
target = "apple"

# First context: "apple" used alongside breakfast and fruits.
sentence_1 = "i eat an apple for breakfast and i like other fruits as well"

# verbose=True also prints the token ids and tokens of the sentence.
v1 = extract_vector(sentence_1, feature_pipeline, target, verbose=True)

[101, 1045, 4521, 2019, 6207, 2005, 6350, 1998, 1045, 2066, 2060, 10962, 2004, 2092, 102]
['[CLS]', 'i', 'eat', 'an', 'apple', 'for', 'breakfast', 'and', 'i', 'like', 'other', 'fruits', 'as', 'well', '[SEP]']


In [25]:
# Show the dimensions of the vector extracted for the target token.
v1.shape

(768,)

In [26]:
# Two more contexts for the same target word: one mentioning a laptop and a
# computer, one mentioning pears.
# NOTE: the trailing space in sentence_3 is part of the original text.
sentence_2 = "i bought an apple laptop and keyboard but still have a computer at home"
sentence_3 = "i like to have one apple and two pears please "

In [27]:
# Vectors for the same target word in the two remaining sentences.
v2 = extract_vector(sentence_2, feature_pipeline, target_word=target)
v3 = extract_vector(sentence_3, feature_pipeline, target_word=target)

In [28]:
from scipy.spatial.distance import cosine


def cosine_similarity(vector_1, vector_2):
    """Return the cosine similarity of two vectors.

    SciPy's ``cosine`` computes a cosine *distance*; subtracting it from 1
    gives the similarity.
    """
    return 1 - cosine(vector_1, vector_2)


# sentence_1 vs sentence_2, then sentence_1 vs sentence_3.
print(cosine_similarity(v1, v2))
print(cosine_similarity(v1, v3))

0.6802836426267569
0.8602012913830954


# Mask Predict

In [29]:
# Fill-mask pipeline: DistilBERT base (uncased) for both model and tokenizer.
unmasker = pipeline(
    "fill-mask",
    model="distilbert-base-uncased",
    tokenizer="distilbert-base-uncased",
)

In [15]:
# Ask the model for candidates to fill the [MASK] position.
outputs = unmasker("The cell is guarded by a [MASK].")

# One "Prediction / Score" pair per candidate, each followed by a blank line.
for one_output in outputs:
    print("Prediction:", one_output["token_str"])
    print("Score:     ", round(one_output["score"], 4), end="\n\n")

Prediction: guard
Score:      0.142

Prediction: fence
Score:      0.0907

Prediction: wall
Score:      0.0531

Prediction: keeper
Score:      0.044

Prediction: gate
Score:      0.0372



In [18]:
# Two fill-mask pipelines that differ only in the model checkpoint; both use
# the stock bert-base-uncased tokenizer.
# NOTE(review): the model paths look like checkpoints trained on text from
# 1760-1850 and 1760-1900 respectively -- confirm where they come from.
unmasker_1850 = pipeline(
    "fill-mask",
    model="bert/bert_1760_1850",
    tokenizer="bert-base-uncased",
)

unmasker_1900 = pipeline(
    "fill-mask",
    model="bert/bert_1760_1900",
    tokenizer="bert-base-uncased",
)

In [19]:
# Candidates for the masked word according to the 1760-1850 checkpoint.
outputs = unmasker_1850("They were told that the [MASK] stopped working.")

# One "Prediction / Score" pair per candidate, each followed by a blank line.
for one_output in outputs:
    print("Prediction:", one_output["token_str"])
    print("Score:     ", round(one_output["score"], 4), end="\n\n")

Prediction: people
Score:      0.2001

Prediction: men
Score:      0.1234

Prediction: labourers
Score:      0.0655

Prediction: miners
Score:      0.0564

Prediction: horses
Score:      0.0328



In [20]:
# Same sentence, this time with the 1760-1900 checkpoint.
outputs = unmasker_1900("They were told that the [MASK] stopped working.")

# One "Prediction / Score" pair per candidate, each followed by a blank line.
for one_output in outputs:
    print("Prediction:", one_output["token_str"])
    print("Score:     ", round(one_output["score"], 4), end="\n\n")

Prediction: mill
Score:      0.2523

Prediction: engine
Score:      0.0847

Prediction: machine
Score:      0.0566

Prediction: men
Score:      0.0448

Prediction: miners
Score:      0.0354

