In [85]:
from transformers import pipeline
import numpy as np

# Example

In [255]:
# T5 example
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Das Haus ist wunderbar.




In [256]:
prompt = "Given the following Movie Review - sentiment pairs: "\
"'hide new secretions from the parental units' - negative, "\
"'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up' - negative, "\
"'that loves its characters and communicates something rather beautiful about human nature' - positive, "\
"'remains utterly satisfied to remain the same throughout' - negative, "\
"Classify the following movie review as positive or negative: "\
"'contains no wit , only labored gags' - negative"

In [257]:
prompt

"Given the following Movie Review - sentiment pairs: 'hide new secretions from the parental units' - negative, 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up' - negative, 'that loves its characters and communicates something rather beautiful about human nature' - positive, 'remains utterly satisfied to remain the same throughout' - negative, Classify the following movie review as positive or negative: 'contains no wit , only labored gags' - negative"

In [258]:
from transformers import GPT2Tokenizer, AutoModelForCausalLM
import numpy as np

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer.pad_token_id = tokenizer.eos_token_id
inputs = tokenizer(["Is this sentence positive or negative sentiment? 'Everything is fine'. The sentence's sentiment is "], return_tensors="pt")

# Example 1: Print the scores for each token generated with Greedy Search
outputs = model.generate(**inputs, max_new_tokens=5, return_dict_in_generate=True, output_scores=True)
transition_scores = model.compute_transition_scores(
    outputs.sequences, outputs.scores, normalize_logits=True
)
# input_length is the length of the input prompt for decoder-only models, like the GPT family, and 1 for
# encoder-decoder models, like BART or T5.
input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
generated_tokens = outputs.sequences[:, input_length:]
for tok, score in zip(generated_tokens[0], transition_scores[0]):
    # | token | token string | logits | probability
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


|  1849 |          | -0.375 | 68.72%
| 24561 | positive | -0.828 | 43.67%
|   393 |  or      | -1.523 | 21.81%
|  4633 |  negative | -0.323 | 72.40%
|    13 | .        | -0.814 | 44.32%


In [259]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")



In [260]:
#### T5 removes bits of the input prompt (text2text rather than causal language modelling)
prefix = "Movie Reviews: "\
        "'hide new secretions from the parental units' (negative) "\
        "'contains no wit , only labored gags' (negative) "\
        "'that loves its characters and communicates something rather beautiful about human nature' (positive) "\
        "'remains utterly satisfied to remain the same throughout' (negative) "\
        "Is the following movie review positive or negative? "

test_sample = "'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up''"

prompt = prefix + test_sample

input_ids = tokenizer(prompt, 
                return_tensors="pt").input_ids
outputs = model.generate(input_ids, max_length=input_ids.size(1) + 10, output_scores=True)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
print(generated_text.split()[-1])

KeyboardInterrupt: 

In [None]:
input_ids.size()

torch.Size([1, 113])

In [None]:
#### Easy example might work

from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the GPT2 model and tokenizer
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Define the prompt
prefix = "Given the following Movie Review - sentiment pairs: " \
         "'This movie was bad' - negative " \
         "'This movie was great' - positive " \
         "'This movie was great' - positive " \
         "'This movie was awsome' - positive " \
         "'This movie was shit' - negative " \
         ". Classify the following movie review as positive or negative: "
test_sample = "'The film was good at all' - "
prompt = prefix + test_sample

# Tokenize the prompt
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Generate text with prompt completion
outputs = model.generate(input_ids,
                         max_length=input_ids.size(1) + 2,
                         num_return_sequences=1
                         )

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Text:")
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text:
Given the following Movie Review - sentiment pairs: 'This movie was bad' - negative 'This movie was great' - positive 'This movie was great' - positive 'This movie was awsome' - positive 'This movie was shit' - negative. Classify the following movie review as positive or negative: 'The film was good at all' -  positive


In [None]:
#### bad performance for SST examples

from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the GPT2 model and tokenizer
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Define the prompt
prefix = "Given the following Movie Review - sentiment pairs: " \
         "'hide new secretions from the parental units' - negative " \
         "'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up' - negative " \
         "'that loves its characters and communicates something rather beautiful about human nature' - positive " \
         "'remains utterly satisfied to remain the same throughout' - negative " \
         ". Classify the following movie review as positive or negative: "
test_sample = "'contains no wit , only labored gags' - "
prompt = prefix + test_sample

# Tokenize the prompt
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Generate text with prompt completion
outputs = model.generate(input_ids,
                         max_length=input_ids.size(1) + 3,
                         num_return_sequences=1
                         )

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Text:")
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text:
Given the following Movie Review - sentiment pairs: 'hide new secretions from the parental units' - negative 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up' - negative 'that loves its characters and communicates something rather beautiful about human nature' - positive'remains utterly satisfied to remain the same throughout' - negative. Classify the following movie review as positive or negative: 'contains no wit, only labored gags' -  'a


In [None]:
#### GPT2: Demonstrating problems with prediction

from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the GPT2 model and tokenizer
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set the prompt
prompt = "This is a movie review dataset with associated positive or negative sentiment: \n'hide new secretions from the parental units' - negative, \n'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up' - negative,\n''that loves its characters and communicates something rather beautiful about human nature' - positive,\n'remains utterly satisfied to remain the same throughout' - negative,\n'I really don't like it' - "

# Tokenize the prompt
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Perform a single inference step
output = model.generate(input_ids, max_length=input_ids.size(1) + 2, num_return_sequences=1, do_sample=True)

# Extract and decode the generated token
generated_token = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated token
print("Generated Token:", generated_token)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Token: This is a movie review dataset with associated positive or negative sentiment: 
'hide new secretions from the parental units' - negative, 
'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up' - negative,
''that loves its characters and communicates something rather beautiful about human nature' - positive,
'remains utterly satisfied to remain the same throughout' - negative,
'I really don't like it' -  very


In [None]:
#### GPT2: Demonstrating problems with prediction

from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the GPT2 model and tokenizer
model_name = 'gpt2-large' # 3,2 GB 
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set the prompt
prompt = "This is a movie review dataset with associated positive or negative sentiment: \n'hide new secretions from the parental units' - negative, \n'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up' - negative,\n''that loves its characters and communicates something rather beautiful about human nature' - positive,\n'remains utterly satisfied to remain the same throughout' - negative,\n'I really don't like it' - "

# Tokenize the prompt
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Perform a single inference step
output = model.generate(input_ids, max_length=input_ids.size(1) + 2, num_return_sequences=1)

# Extract and decode the generated token
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated token
print("Generated Token:", generated_text)


Downloading model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Token: This is a movie review dataset with associated positive or negative sentiment: 
'hide new secretions from the parental units' - negative, 
'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up' - negative,
''that loves its characters and communicates something rather beautiful about human nature' - positive,
'remains utterly satisfied to remain the same throughout' - negative,
'I really don't like it' -  positive


In [None]:
# Another approach for any classification task (entailment corresponds to true, contradiction to false)

# load model pretrained on MNLI
from transformers import BartForSequenceClassification, BartTokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-mnli')
model = BartForSequenceClassification.from_pretrained('facebook/bart-large-mnli')

# pose sequence as a NLI premise and label (politics) as a hypothesis
premise = 'Who are you voting for in 2020?'
hypothesis = 'This text is about politics.'

# run through model pre-trained on MNLI
input_ids = tokenizer.encode(premise, hypothesis, return_tensors='pt')
logits = model(input_ids)[0]

# we throw away "neutral" (dim 1) and take the probability of
# "entailment" (2) as the probability of the label being true
entail_contradiction_logits = logits[:,[0,2]]
probs = entail_contradiction_logits.softmax(dim=1)
true_prob = probs[:,1].item() * 100
print(f'Probability that the label is true: {true_prob:0.2f}%')

In [None]:
from huggingface_hub import login
login(token="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")

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer_7 = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model_7 = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

In [None]:
tokenizer_13 = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-hf")
model_13 = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-hf")

In [None]:
from model import FewShotLearner

llama_7 = FewShotLearner('meta-llama/Llama-2-7b-hf')
llama_13 = FewShotLearner('meta-llama/Llama-2-13b-hf')
llama_70 = FewShotLearner('meta-llama/Llama-2-70b-hf')

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from lime.lime_text import LimeTextExplainer

# Load model and tokenizer
model_name = "cross-encoder/nli-deberta-v3-small"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Initialize explainer
explainer = LimeTextExplainer(class_names=['contradiction', 'neutral', 'entailment'])

# Create helper function for LIME
def predictor(texts):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1).detach().numpy()
    return probs

# Load a sample from MNLI dataset
from datasets import load_dataset
dataset = load_dataset('SetFit/mnli', split='validation')

# Select a sample
for i in range(3):
        
    sample = dataset[i]
    premise = sample['text1']
    hypothesis = sample['text2']
    text_instance = premise + " [SEP] " + hypothesis

    # Explain a prediction
    exp = explainer.explain_instance(text_instance, predictor, num_features=6, num_samples=50)

    # Visualize the explanation
    exp.show_in_notebook()
