<a href="https://colab.research.google.com/github/jpellicott/openprompt_example/blob/main/openprompt_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U openprompt




In [2]:
!pip install transformers==4.28.0




In [3]:
from openprompt.data_utils import InputExample
classes = [ # There are two classes in Sentiment Analysis, one for negative and one for positive
    "negative",
    "positive"
]
dataset = [ # For simplicity, there's only two examples
    # text_a is the input text of the data, some other datasets may have multiple input sentences in one example.
    InputExample(
        guid = 0,
        text_a = "Albert Einstein was one of the greatest intellects of his time.",
    ),
    InputExample(
        guid = 1,
        text_a = "The film was badly made.",
    ),
]



In [4]:
from openprompt.plms import load_plm
plm, tokenizer, model_config, WrapperClass = load_plm("bert", "bert-base-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

In [5]:
from openprompt.prompts import ManualTemplate
promptTemplate = ManualTemplate(
    text = '{"placeholder":"text_a"} It was {"mask"}',
    tokenizer = tokenizer,
)

In [6]:
from openprompt.prompts import ManualVerbalizer
promptVerbalizer = ManualVerbalizer(
    classes = classes,
    label_words = {
        "negative": ["bad"],
        "positive": ["good", "wonderful", "great"],
    },
    tokenizer = tokenizer,
)

In [7]:
from openprompt import PromptForClassification
promptModel = PromptForClassification(
    template = promptTemplate,
    plm = plm,
    verbalizer = promptVerbalizer,
)

In [8]:
from openprompt import PromptDataLoader
data_loader = PromptDataLoader(
    dataset = dataset,
    tokenizer = tokenizer,
    template = promptTemplate,
    tokenizer_wrapper_class=WrapperClass,
)

tokenizing: 2it [00:00, 326.13it/s]


In [9]:
import torch

# making zero-shot inference using pretrained MLM with prompt
promptModel.eval()
with torch.no_grad():
    for batch in data_loader:
        logits = promptModel(batch)
        preds = torch.argmax(logits, dim = -1)
        print(classes[preds])
# predictions would be 1, 0 for classes 'positive', 'negative'

positive
negative


In [31]:
# Cell Name: OpenPrompt with RoBERTa (Refined)

from openprompt.plms import load_plm
from openprompt.prompts import ManualTemplate, ManualVerbalizer
from openprompt import PromptForClassification, PromptDataLoader
import torch

# Load RoBERTa for sentiment classification
plm, tokenizer, model_config, WrapperClass = load_plm("roberta", "roberta-base")

# Define a refined prompt template to better align with RoBERTa's language
template = ManualTemplate(
    text = '{"placeholder":"text_a"} This statement is {"mask"}.',
    tokenizer=tokenizer,
)

# Expand verbalizer with more sentiment-related words
verbalizer = ManualVerbalizer(
    classes=["negative", "positive"],
    label_words={
        "negative": ["bad", "terrible", "poor", "awful"],
        "positive": ["good", "great", "excellent", "wonderful"]
    },
    tokenizer=tokenizer,
)

# Create the OpenPrompt model for classification
promptModel = PromptForClassification(plm=plm, template=template, verbalizer=verbalizer, freeze_plm=False)

# Create DataLoader to process the dataset
data_loader = PromptDataLoader(
    dataset=dataset,
    template=template,
    tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=128,
    batch_size=1,
    shuffle=False
)

# Run zero-shot inference and output results
promptModel.eval()
with torch.no_grad():
    for batch_idx, batch in enumerate(data_loader):
        logits = promptModel(batch)
        pred = torch.argmax(logits, dim=-1).item()
        print(f"Refined RoBERTa Prediction for '{dataset[batch_idx].text_a}': {classes[pred]}")


tokenizing: 2it [00:00, 686.02it/s]


Refined RoBERTa Prediction for 'Albert Einstein was one of the greatest intellects of his time.': positive
Refined RoBERTa Prediction for 'The film was badly made.': negative


In [32]:
# Cell Name: OpenPrompt with T5 for Sentiment Classification

from openprompt.data_utils import InputExample
from openprompt.plms import load_plm
from openprompt.prompts import MixedTemplate
from openprompt import PromptForClassification, PromptDataLoader
from openprompt.prompts import ManualVerbalizer
import torch

# Define a small example dataset
dataset = [
    InputExample(guid=0, text_a="Albert Einstein was one of the greatest intellects of his time.", label=1),
    InputExample(guid=1, text_a="The film was badly made.", label=0),
]

# Define classes and labels
classes = ["negative", "positive"]

# Load T5 model and tokenizer
plm, tokenizer, model_config, WrapperClass = load_plm("t5", "t5-small")

# Define a MixedTemplate for T5
template = MixedTemplate(
    model=plm,
    tokenizer=tokenizer,
    text='{"placeholder":"text_a"} {"soft"} {"soft"} {"soft"} sentiment is {"mask"}.'
)

# Define verbalizer for binary classification
verbalizer = ManualVerbalizer(
    tokenizer=tokenizer,
    num_classes=2,
    label_words=[["bad"], ["good"]]
)

# Wrap the tokenizer to work with OpenPrompt
wrapper = WrapperClass(max_seq_length=128, tokenizer=tokenizer, decoder_max_length=3, truncate_method="head")

# Create the OpenPrompt model for classification
prompt_model = PromptForClassification(plm=plm, template=template, verbalizer=verbalizer, freeze_plm=False)

# Create DataLoader to process the dataset
data_loader = PromptDataLoader(
    dataset=dataset,
    template=template,
    tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=128,
    decoder_max_length=3,
    batch_size=1,
    shuffle=False,
    predict_eos_token=False,
)

# Run zero-shot inference and output results
prompt_model.eval()
with torch.no_grad():
    for batch in data_loader:
        logits = prompt_model(batch)
        pred = torch.argmax(logits, dim=-1).item()
        print(f"T5 Prediction: {classes[pred]}")


tokenizing: 2it [00:00, 611.82it/s]

T5 Prediction: positive
T5 Prediction: positive





In [33]:
# Cell Name: OpenPrompt with GPT-2 for Sentiment Classification

from openprompt.data_utils import InputExample
from openprompt.plms import load_plm
from openprompt.prompts import ManualTemplate, ManualVerbalizer
from openprompt import PromptForClassification, PromptDataLoader
import torch

# Define a small example dataset for sentiment analysis
dataset = [
    InputExample(guid=0, text_a="Albert Einstein was one of the greatest intellects of his time.", label=1),
    InputExample(guid=1, text_a="The film was badly made.", label=0),
]

# Define classes and label words
classes = ["negative", "positive"]

# Load GPT-2 as the pretrained language model
plm, tokenizer, model_config, WrapperClass = load_plm("gpt2", "gpt2")

# Define the prompt template for GPT-2
template = ManualTemplate(
    text='{"placeholder":"text_a"} This statement is {"mask"}.',
    tokenizer=tokenizer,
)

# Define the verbalizer mapping for sentiment analysis
verbalizer = ManualVerbalizer(
    classes=classes,
    label_words={
        "negative": ["bad", "terrible", "poor"],
        "positive": ["good", "great", "excellent"]
    },
    tokenizer=tokenizer,
)

# Wrap the tokenizer to work with OpenPrompt
wrapper = WrapperClass(max_seq_length=128, tokenizer=tokenizer, truncate_method="head")

# Create the OpenPrompt model for classification
promptModel = PromptForClassification(plm=plm, template=template, verbalizer=verbalizer, freeze_plm=False)

# Create a DataLoader to process the dataset
data_loader = PromptDataLoader(
    dataset=dataset,
    template=template,
    tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=128,
    batch_size=1,
    shuffle=False
)

# Run zero-shot inference and output results
promptModel.eval()
with torch.no_grad():
    for batch_idx, batch in enumerate(data_loader):
        logits = promptModel(batch)
        pred = torch.argmax(logits, dim=-1).item()
        print(f"GPT-2 Prediction for '{dataset[batch_idx].text_a}': {classes[pred]}")


Using pad_token, but it is not set yet.
tokenizing: 2it [00:00, 719.25it/s]


GPT-2 Prediction for 'Albert Einstein was one of the greatest intellects of his time.': positive
GPT-2 Prediction for 'The film was badly made.': negative
