# Intro
- zero-shot background and motivation
- three aproaches to do it:  
    - Bert
    - NLI
    - Generative models (gpt)
- Small experiment + results


# Sentence Bert - Embedding based classification


In [None]:
!pip install transformers

In [123]:
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F
tokenizer = AutoTokenizer.from_pretrained('deepset/sentence_bert')
model = AutoModel.from_pretrained('deepset/sentence_bert')

In [124]:
sentence = 'The tool bar doesnt work on my mac, can you fix it?"'
labels = ['Feature-request', 'Bug', 'Other']

# run inputs through model and mean-pool over the sequence
# dimension to get sequence-level representations
inputs = tokenizer.batch_encode_plus([sentence] + labels,
                                     return_tensors='pt',
                                     pad_to_max_length=True)


We use the batch tokenizer to get the tokenized representation of the sentence and the labels (we use padding to match the max lenghe sequence)

In [125]:
inputs['input_ids'] ,inputs['input_ids'].shape

(tensor([[  101,  1996,  6994,  3347,  2987,  2102,  2147,  2006,  2026,  6097,
           1010,  2064,  2017,  8081,  2009,  1029,  1000,   102],
         [  101,  3444,  1011,  5227,   102,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0],
         [  101, 11829,   102,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0],
         [  101,  2060,   102,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0]]),
 torch.Size([4, 18]))

we got the input tokens for each element:
1 for the sentence and 3 for each label

In [126]:
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
output = model(input_ids, attention_mask=attention_mask)[0]
output.shape

torch.Size([4, 18, 768])

For each input token, we gen an embedding with size 768

In [127]:
sentence_rep = output[:1].mean(dim=1)
label_reps = output[1:].mean(dim=1)

# try using the pooling layer
# output_pooling = model(input_ids, attention_mask=attention_mask)[1]
# sentence_rep = output_pooling[0].unsqueeze(0)
# label_reps = output_pooling[1:]

sentence_rep.shape, label_reps.shape

(torch.Size([1, 768]), torch.Size([3, 768]))

We apply average pooling on the sequence dim to get the final embedding rep for each element (sentence and labels)

In [128]:
# now find the labels with the highest cosine similarities to
# the sentence
similarities = F.cosine_similarity(sentence_rep, label_reps)
closest = similarities.argsort(descending=True)
for ind in closest:
    print(f'label: {labels[ind]} \t similarity: {similarities[ind]}')

label: Bug 	 similarity: 0.27688243985176086
label: Feature-request 	 similarity: 0.13044913113117218
label: Other 	 similarity: 0.01148887537419796


The biggest disatvantage of using bert embedding for this task is the fact that it is uses contextual embedding, which make it harder to embed the single word (our label) in an informative way.

# NLI

In [146]:
# load model pretrained on MNLI
from transformers import BartForSequenceClassification, BartTokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-mnli')
model = BartForSequenceClassification.from_pretrained('facebook/bart-large-mnli')

In [150]:
# pose sequence as a NLI premise and label (politics) as a hypothesis
premise = "Thanks for the awesome demo!! loved it."
hypothesis = 'This text is Positive.'

# run through model pre-trained on MNLI
input_ids = tokenizer.encode(premise, hypothesis, return_tensors='pt')

In [151]:
input_ids, input_ids.shape

(tensor([[    0, 22086,    13,     5,  6344, 19592, 12846,  2638,    24,     4,
              2,     2,   713,  2788,    16, 25968,     4,     2]]),
 torch.Size([1, 18]))

The tokenizer encode both the premise and the hypothesis into a single vecore and uses the special token "2" has a seperator between the elements

In [152]:
logits = model(input_ids).logits
logits, logits.shape

(tensor([[-1.9128, -0.1560,  1.7016]], grad_fn=<AddmmBackward0>),
 torch.Size([1, 3]))

Using the logits layer, we can get the logits for each label: contradiction (0), neutral (1), entailment (2)

In [153]:
# we throw away "neutral" (dim 1) and take the probability of
# "entailment" (2) as the probability of the label being true
entail_contradiction_logits = logits[:,[0,2]]
probs = entail_contradiction_logits.softmax(dim=1)
true_prob = probs[:,1].item() * 100
print(f'Probability that the label is true: {true_prob:0.2f}%')

Probability that the label is true: 97.38%


Let's make it work on a given list of categories

In [200]:
import numpy as np

def generic_nli_classifier(premise, possible_cotegories, verbose=False):
    probas = []

    hypothesis_template = "This text is about {label}."

    for label in possible_cotegories:
        hypothesis = hypothesis_template.format(**{'label':label})
        input_ids = tokenizer.encode(premise, hypothesis, return_tensors='pt')
        logits = model(input_ids).logits

        entail_contradiction_logits = logits[:,[0,2]]
        probs = entail_contradiction_logits.softmax(dim=1)
        true_prob = probs[:,1].item()

        probas.append(true_prob)

    softmax_probas = np.exp(probas) / np.sum(np.exp(probas))
    predicted_label = possible_cotegories[np.argmax(softmax_probas)]

    if verbose:
        print(f'\nChosen category: {predicted_label}')
        for idx,label in enumerate(possible_cotegories):
            print(f'- label:{label} probability: {softmax_probas[idx]:0.2f}%')

    return predicted_label


In [215]:
examples = [
# Example 1
{'premise' : "The tool bar doesn't work on my mac, can you fix it?",
'possible_cotegories' : ['Feature-request', 'Bug', 'Other']},

# Example 2
{'premise' : "Can you add AI assistant button on the left bar as well?",
'possible_cotegories' : ['Feature-request', 'Bug', 'Other']},

# Example 3
{'premise' : "The tool bar doesn't work on my mac, its annoying",
'possible_cotegories' : ['Positive', 'Neutral', 'Negative']},

# Example 4
{'premise' : "Thanks for the awesome demo!! loved it",
'possible_cotegories' : ['Positive', 'Neutral', 'Negative']}
]


for idx, example in enumerate(examples):

    premise = example['premise']
    possible_cotegories = example['possible_cotegories']

    print(f'\n\nExample {idx + 1}')
    print(f'\nPremis: {premise}')
    _ = generic_nli_classifier(premise, possible_cotegories, verbose=True)



Example 1

Premis: The tool bar doesn't work on my mac, can you fix it?

Chosen category: Bug
- label:Feature-request probability: 0.30%
- label:Bug probability: 0.41%
- label:Other probability: 0.29%


Example 2

Premis: Can you add AI assistant button on the left bar as well?

Chosen category: Feature-request
- label:Feature-request probability: 0.50%
- label:Bug probability: 0.24%
- label:Other probability: 0.26%


Example 3

Premis: The tool bar doesn't work on my mac, its annoying

Chosen category: Negative
- label:Positive probability: 0.29%
- label:Neutral probability: 0.30%
- label:Negative probability: 0.41%


Example 4

Premis: Thanks for the awesome demo!! loved it

Chosen category: Positive
- label:Positive probability: 0.46%
- label:Neutral probability: 0.27%
- label:Negative probability: 0.27%


One advanatge of this approach over the embedding based is that it produce probability

# NLI using HG pipeline

In [202]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

In [213]:
sequence_to_classify = "The tool bar doesn't work on my mac, can you fix it?"
candidate_labels = ['Feature-request', 'Bug', 'Other']
hypothesis_template = "This text is about {}."
res = classifier(sequence_to_classify, candidate_labels, hypothesis_template=hypothesis_template)

print(res['labels'])
print(np.round(res['scores'],2))

['Bug', 'Feature-request', 'Other']
[0.4  0.31 0.29]


In [218]:
sequence_to_classify = "The tool bar doesn't work on my mac, its annoying"
candidate_labels = ['Positive', 'Neutral', 'Negative']

hypothesis_template = "This text is about {}."

res = classifier(sequence_to_classify, candidate_labels, hypothesis_template=hypothesis_template)

print(res['labels'])
print(np.round(res['scores'],2))

print(' \n -------------- \n')

hypothesis_template = "This text is {}."

res = classifier(sequence_to_classify, candidate_labels, hypothesis_template=hypothesis_template)

print(res['labels'])
print(np.round(res['scores'],2))


print(' \n -------------- \n')

hypothesis_template = "This sentiment of this text is {}."

res = classifier(sequence_to_classify, candidate_labels, hypothesis_template=hypothesis_template)

print(res['labels'])
print(np.round(res['scores'],2))

['Negative', 'Neutral', 'Positive']
[0.64 0.2  0.16]
 
 -------------- 

['Negative', 'Positive', 'Neutral']
[0.78 0.12 0.1 ]
 
 -------------- 

['Negative', 'Neutral', 'Positive']
[0.86 0.09 0.05]


# OpenAI GPT

In [None]:
# TODO