In [1]:
#@title Imports and Initialization
%pip install datasets
%pip install textwrap
%pip install openai

import collections
from abc import ABC
import datasets
import json
import openai
import numpy as np
from scipy.special import softmax
import textwrap
import matplotlib.pyplot as plt
from IPython.display import clear_output

OPENAI_SECRET_KEY = None

clear_output()

# Setup the OpenAI API

1. Go to www.openai.com and log into your account.
2. Go to https://platform.openai.com/account/api-keys and click the "Create new secret key" button. It doesn't matter what you name it.
3. Copy your API key, and then run the code block below. It will ask you to enter your secret key into a text box.

In [9]:
if OPENAI_SECRET_KEY is None:
  print("Please paste your OpenAI API key here:")
  OPENAI_SECRET_KEY = input().strip()
openai.api_key = OPENAI_SECRET_KEY
clear_output()

class OpenAIEngine():
  def __init__(self, model_name):
    self.model_name = model_name

  def score(self, text):
    """Tokenizes and scores a piece of text.

    This only works for the OpenAI models which support the legacy `Completion`
    API.

    The score is log-likelihood. A higher score means a token was more
    likely according to the model.

    Returns a list of tokens and a list of scores.
    """
    response = openai.Completion.create(
        engine=self.model_name,
        prompt=text,
        max_tokens=0,
        logprobs=1,
        echo=True)

    tokens = response["choices"][0]["logprobs"]["tokens"]
    logprobs = response["choices"][0]["logprobs"]["token_logprobs"]
    if logprobs and logprobs[0] is None:
      # GPT-3 API does not return logprob of the first token
      logprobs[0] = 0.0
    return tokens, logprobs

  def perplexity(self, text):
    """Compute the perplexity of the provided text."""
    completion = openai.Completion.create(
        model=self.model_name,
        prompt=text,
        logprobs=0,
        max_tokens=0,
        temperature=1.0,
        echo=True)
    token_logprobs = completion['choices'][0]['logprobs']['token_logprobs']
    nll = np.mean([i for i in token_logprobs if i is not None])
    ppl = np.exp(-nll)
    return ppl

  def generate(self,
               prompt,
               top_p=1.0,
               num_tokens=32,
               num_samples=1,
               frequency_penalty=0.0,
              presence_penalty=0.0):
    """Generates text given the provided prompt text.

    This only works for the OpenAI models which support the legacy `Completion`
    API.

    If num_samples is 1, a single generated string is returned.
    If num_samples > 1, a list of num_samples generated strings is returned.
    """
    response = openai.Completion.create(
      engine=self.model_name,
      prompt=prompt,
      temperature=1.0,
      max_tokens=num_tokens,
      top_p=top_p,
      n=num_samples,
      frequency_penalty=frequency_penalty,
      presence_penalty=presence_penalty,
      logprobs=1,
    )
    outputs = [r["text"] for r in response["choices"]]
    return outputs[0] if num_samples == 1 else outputs


  def chat_generate(self,
                    previous_messages,
                    top_p=1.0,
                    num_tokens=32,
                    num_samples=1,
                    frequency_penalty=0.0,
                    presence_penalty=0.0):
    response = openai.ChatCompletion.create(
      model=self.model_name,
      messages=previous_messages,
      temperature=1.0,
      max_tokens=num_tokens,
      top_p=top_p,
      frequency_penalty=frequency_penalty,
      presence_penalty=presence_penalty,
      n=num_samples,
    )
    return response

# Question 1: Observing the Impact of Decoding Strategy

## 1.1: Rolling a Twenty-Sided Die

In [25]:
MODEL_NAME = "davinci"
engine = OpenAIEngine(MODEL_NAME)

In [32]:
prompt = "Let's roll a D20. The die shows the number"
rolls = engine.generate(prompt, num_tokens=1, num_samples=128, top_p=1.0)
expected_number_of_outcomes = 20

In [34]:
rolls_counter = collections.Counter()
for roll in rolls:
  try:
    roll_num = int(roll)
    # Let's label invalid numbers as -1
    roll_num = roll_num if 1 <= roll_num <= 20 else -1
  except ValueError:
    # Let's just label invalid generation as a roll of -1.
    roll_num = -1
  rolls_counter[roll_num] += 1

print(rolls_counter)
print("Percentage of valid outcomes generated:",
      (len(rolls_counter)-1)/expected_number_of_outcomes)

Counter({-1: 21, 12: 13, 10: 9, 18: 9, 15: 9, 7: 8, 4: 8, 8: 7, 2: 7, 9: 5, 6: 5, 19: 4, 16: 4, 5: 4, 1: 4, 14: 4, 3: 3, 13: 2, 17: 1, 20: 1})
Percentage of valid outcomes generated: 0.95


## 1.2: Longform Generation

In [36]:
prompt = "Let me tell you the story about the morning when a hippopotamus ate my homework."
engine.generate(prompt, num_tokens=256, num_samples=1, top_p=0.8, frequency_penalty=0.0)

'” The Hippopotamus Who Swallowed a Watermelon was my favorite picture book as a child. It was such a silly story, but it still had the silliness and the adventure that I loved. Plus, it’s a great one to learn a few new words, as there are plenty in it.\n\nThe illustrations are adorable and full of life, making the book feel like it’s bursting with energy. It was a fun one to read to my kids and the tale will stick with them for a long time.\n\n3. Nighttime Nap – Babar\n\nThis book was one of the very first books my daughter learned to read by herself. She was around 3 or 4 at the time and still loved the beautiful illustrations in the Babar series. It’s also fun to read the few words that my daughter could read at the time, and then ask her to predict what the word might be. I still read this one to my kids before they go to bed.\n\nI’ve always been a fan of Babar and this series of books is no different. They are a fun and easy way to get kids interested in books. They are simple, bu

In [37]:
prompt = "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness,"
engine.generate(prompt, num_tokens=256, num_samples=1, top_p=0.8, frequency_penalty=0.0)

' it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way – in short, the period was so far like the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only.” ~ Charles Dickens, A Tale of Two Cities, 1859\n\nA few of my earliest memories are of lying awake and waiting for the monsters to come. I was absolutely convinced that my parents were asleep and unaware of what was happening to me in the dark, in my bed. I was certain that I was all alone.\n\nThat was true, but only because I didn’t understand what my mother was doing in the living room. I was three and a half years old.\n\nAs I grew older, and the monsters of childhood still lived in my 

# Question 2: Measuring Perplexity

In [38]:
MODEL_NAME = "davinci"
engine = OpenAIEngine(MODEL_NAME)

In [39]:
poem = """
’Twas brillig, and the slithy toves
      Did gyre and gimble in the wabe:
All mimsy were the borogoves,
      And the mome raths outgrabe.

“Beware the Jabberwock, my son!
      The jaws that bite, the claws that catch!
Beware the Jubjub bird, and shun
      The frumious Bandersnatch!”

He took his vorpal sword in hand;
      Long time the manxome foe he sought—
So rested he by the Tumtum tree
      And stood awhile in thought.

And, as in uffish thought he stood,
      The Jabberwock, with eyes of flame,
Came whiffling through the tulgey wood,
      And burbled as it came!

One, two! One, two! And through and through
      The vorpal blade went snicker-snack!
He left it dead, and with its head
      He went galumphing back.

“And hast thou slain the Jabberwock?
      Come to my arms, my beamish boy!
O frabjous day! Callooh! Callay!”
      He chortled in his joy.

’Twas brillig, and the slithy toves
      Did gyre and gimble in the wabe:
All mimsy were the borogoves,
      And the mome raths outgrabe.
"""

engine.perplexity(poem)

1.339475969608505

# Question 3: Experimenting with Few-Shot Prompting

## 3.1: Few-Shot Learning for the Choice of Plausible Alternatives Task

In [40]:
copa_dataset = datasets.load_dataset("super_glue", "copa")

# You may draw on these examples to produce few-shot prompts.
train_data = copa_dataset["train"].shuffle(seed=1).select(range(50))

# Use this development set to try out different few-shot prompts to see
# what works best.
dev_data = copa_dataset["train"].shuffle(seed=1).select(range(50, 150))

# You should only use this at the end during final evaluation to generate
# accuracies to put in your report.
test_data = copa_dataset["test"].shuffle(seed=1).select(range(100))

print("Some examples from the train set:")
for i in range(3):
  print(json.dumps(train_data[i], indent=2))

Downloading builder script:   0%|          | 0.00/30.7k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/38.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/400 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Some examples from the train set:
{
  "premise": "The woman spotted her friend from across the room.",
  "choice1": "The woman waved.",
  "choice2": "The woman escaped.",
  "question": "cause",
  "idx": 379,
  "label": 0
}
{
  "premise": "The girl made a wish.",
  "choice1": "She saw a black cat.",
  "choice2": "She saw a shooting star.",
  "question": "cause",
  "idx": 45,
  "label": 1
}
{
  "premise": "The woman hired a lawyer.",
  "choice1": "She decided to sue her employer.",
  "choice2": "She decided to run for office.",
  "question": "cause",
  "idx": 39,
  "label": 0
}


In [41]:
prompt = "Given the following premise and cause, label whether the cause seems correct\n\n"
eval_template = "Review: {review}\nSentiment: {sentiment}"

In [42]:
def classify_baseline(premise: str, choice1: str, choice2:str) -> str:
  """ Given a review, returns a sentiment prediction, 0 for negative, 1 for positive."""

  eval_template = """Which of the following makes more sense?

  Choice 1: {premise} This happened because: {choice1}
  Choice 2: {premise} This happened because: {choice2}

  {label} makes more sense.
  """
  label_map = {0: "Choice 1", 1: "Choice 2"}

  label_to_score = {}
  for label, label_str in label_map.items():
    label_prompt = prompt + eval_template.format(
        premise=premise, choice1=choice1, choice2=choice2, label=label_str)
    _, score = engine.score(label_prompt)
    llm_score_for_label = score[-1]

    label_to_score[label] = llm_score_for_label

  return max(label_to_score, key=label_to_score.get)


def evaluate(dataset, verbose: bool=False) -> float:
  """ Evaluate your prompt on the test set """
  correct = []
  for i, instance in enumerate(dataset):
    label = instance["label"]
    predicted = classify_baseline(
        instance["premise"], instance["choice1"], instance["choice2"])
    correct.append(1 if label == predicted else 0)

    if verbose:
      print(f"======== {i+1} / {len(dataset)} ========")
      print(f"PREMISE: {instance['premise']}")
      print(f"CHOICE 1 {'✅' if not label else '❌'}: {instance['choice1']}")
      print(f"CHOICE 2 {'✅' if label else '❌'}: {instance['choice2']}")
      print(f"PREDICTED: {'choice 2' if predicted else 'choice 1'}")

  acc = sum(correct) / len(correct)
  return acc

#  Once you have chosen your prompts, for final evaluation, replace dev_data
# with test_data.
acc = evaluate(dev_data, verbose=True)
print(f"Accuracy of your prompt on {len(test_data)} test examples: {acc:.0%}")

PREMISE: My car was towed.
CHOICE 1 ✅: I parked illegally.
CHOICE 2 ❌: I jumped the battery.
PREDICTED: choice 2
PREMISE: I rubbed the soap between my hands.
CHOICE 1 ✅: The soap foamed.
CHOICE 2 ❌: My hands went numb.
PREDICTED: choice 2
PREMISE: The stain came out of the shirt.
CHOICE 1 ❌: I patched the shirt.
CHOICE 2 ✅: I bleached the shirt.
PREDICTED: choice 2
PREMISE: The police closed the investigation.
CHOICE 1 ✅: They apprehended the suspect.
CHOICE 2 ❌: The victim recovered.
PREDICTED: choice 2
PREMISE: My foot went numb.
CHOICE 1 ❌: I put my shoes on.
CHOICE 2 ✅: I shook my foot.
PREDICTED: choice 2
PREMISE: The hospital sent the patient home.
CHOICE 1 ✅: The patient's symptoms cleared up.
CHOICE 2 ❌: The patient's family visited him.
PREDICTED: choice 2
PREMISE: The parents recognized their daughter's creativity.
CHOICE 1 ❌: They taught her how to ride a bike.
CHOICE 2 ✅: They encouraged her to become an artist.
PREDICTED: choice 2
PREMISE: The girl went down the hill on he