## 14.3 Semantic searching with OpenAI Embeddings

In [1]:
# Inspired by:
#  https://github.com/openai/openai-cookbook/blob/502429c7c85fe78e0bc481e02d0ca44e2b9ad2c1/examples/Obtain_dataset.ipynb
#  https://github.com/openai/openai-cookbook/blob/main/examples/Semantic_text_search_using_embeddings.ipynb

import os
from urllib.request import urlopen

import numpy as np
import openai
import requests
from bs4 import BeautifulSoup
from openai.embeddings_utils import get_embedding
from sentence_transformers import util
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
PERSON = 'Ling Huang'

# Note this is NOT an efficient way to search on google. This is done simply for education purposes
search_response = requests.get(
    'https://www.google.com/search',
    params={'q': PERSON},  # let requests URL-encode the query (PERSON contains a space)
    timeout=10,            # never hang the notebook on an unresponsive server
)

# Name the parser explicitly so the extracted text does not depend on which
# optional parsers happen to be installed; keep only the first 1024 characters.
google_html = BeautifulSoup(search_response.text, 'html.parser').get_text()[:1024]

# Extractive question-answering model, reused later in the notebook
nlp = pipeline('question-answering', 
               model='deepset/roberta-base-squad2', 
               tokenizer='deepset/roberta-base-squad2', 
               max_length=10)

nlp(f'Who is {PERSON}?', google_html)

{'score': 0.10722540318965912,
 'start': 833,
 'end': 883,
 'answer': 'aI am talented in teaching and motivating learners'}

In [3]:
# Our good old textbook about insects
# The context manager closes the HTTP response as soon as the body has been read
with urlopen('https://www.gutenberg.org/cache/epub/10834/pg10834.txt') as book_response:
    text = book_response.read().decode()

# Only keep paragraphs longer than 100 characters, same as before
documents = [paragraph for paragraph in text.split('\r\n\r\n') if len(paragraph) > 100]

print(f'There are {len(documents)} documents/paragraphs')

There are 70 documents/paragraphs


In [4]:
# Read the key from the OPENAI_API_KEY environment variable so that no secret is stored in the notebook
openai.api_key = os.environ.get('OPENAI_API_KEY')
if not openai.api_key:
    raise RuntimeError('Set the OPENAI_API_KEY environment variable before running this notebook')

In [5]:
# Request the engine list from the API and display its `data` field as the cell output
# NOTE(review): the Engines endpoint appears to be deprecated in favour of openai.Model.list() — confirm against the installed openai version
openai.Engine.list().data

[<Engine engine id=text-search-babbage-doc-001 at 0x1313c1fd0> JSON: {
   "created": null,
   "id": "text-search-babbage-doc-001",
   "object": "engine",
   "owner": "openai-dev",
   "permissions": null,
   "ready": true
 },
 <Engine engine id=gpt-4 at 0x1313c3dd0> JSON: {
   "created": null,
   "id": "gpt-4",
   "object": "engine",
   "owner": "openai",
   "permissions": null,
   "ready": true
 },
 <Engine engine id=gpt-3.5-turbo-16k at 0x1313c03b0> JSON: {
   "created": null,
   "id": "gpt-3.5-turbo-16k",
   "object": "engine",
   "owner": "openai-internal",
   "permissions": null,
   "ready": true
 },
 <Engine engine id=curie-search-query at 0x1313c2db0> JSON: {
   "created": null,
   "id": "curie-search-query",
   "object": "engine",
   "owner": "openai-dev",
   "permissions": null,
   "ready": true
 },
 <Engine engine id=text-davinci-003 at 0x1313c3ad0> JSON: {
   "created": null,
   "id": "text-davinci-003",
   "object": "engine",
   "owner": "openai-internal",
   "permissions": 

In [6]:
# Embedding model used for both the documents and the query below
ENGINE = 'text-embedding-ada-002'

# Show only the engines whose id mentions "embed" or "search"
[
    engine
    for engine in openai.Engine.list().data
    if any(keyword in engine.id for keyword in ('embed', 'search'))
]

[<Engine engine id=text-search-babbage-doc-001 at 0x1311f9cd0> JSON: {
   "created": null,
   "id": "text-search-babbage-doc-001",
   "object": "engine",
   "owner": "openai-dev",
   "permissions": null,
   "ready": true
 },
 <Engine engine id=curie-search-query at 0x1313c27b0> JSON: {
   "created": null,
   "id": "curie-search-query",
   "object": "engine",
   "owner": "openai-dev",
   "permissions": null,
   "ready": true
 },
 <Engine engine id=text-search-babbage-query-001 at 0x1313c1cd0> JSON: {
   "created": null,
   "id": "text-search-babbage-query-001",
   "object": "engine",
   "owner": "openai-dev",
   "permissions": null,
   "ready": true
 },
 <Engine engine id=babbage-search-query at 0x1313c06b0> JSON: {
   "created": null,
   "id": "babbage-search-query",
   "object": "engine",
   "owner": "openai-dev",
   "permissions": null,
   "ready": true
 },
 <Engine engine id=babbage-search-document at 0x1313c24b0> JSON: {
   "created": null,
   "id": "babbage-search-document",
   "o

In [7]:
# Embed one example string to inspect the raw API response.
# Reuse ENGINE so this demo always uses the same embedding model as the rest of the notebook.
response = openai.Embedding.create(
   model=ENGINE,
   input="Tiktoken library For Tokenization In OpenAI API"
)

In [8]:
# Display the object returned by openai.Embedding.create in the previous cell
response

<OpenAIObject list at 0x1313c1af0> JSON: {
  "data": [
    {
      "embedding": [
        -0.03512801229953766,
        0.0026946458965539932,
        -0.019008027389645576,
        -0.028217343613505363,
        0.011544798500835896,
        0.01855124533176422,
        -0.01688620075583458,
        -0.0013887648237869143,
        0.00020813054288737476,
        -0.02986765280365944,
        0.03798658400774002,
        0.004527299664914608,
        0.007853704504668713,
        -0.008767268620431423,
        -0.009496646001935005,
        0.0004618471721187234,
        0.003429549280554056,
        -0.01184686366468668,
        0.010049205273389816,
        -0.010594396851956844,
        -0.013769769109785557,
        0.014963296242058277,
        -0.009548218920826912,
        0.012583608739078045,
        -0.02867412567138672,
        -0.00643915357068181,
        0.019759507849812508,
        -0.05192580446600914,
        -0.0034553352743387222,
        -0.015869492664933205,
    

In [9]:
#  https://github.com/openai/openai-python#usage

# get_embedding is called once per document, so this can take a while
# with hundreds or thousands of documents
embeddings = []
for document in documents:
    embeddings.append(get_embedding(document, engine=ENGINE))

In [10]:
# Stack the per-document embedding lists into a single numpy array
document_embeddings = np.asarray(embeddings)

# One row per document
document_embeddings.shape

(70, 1536)

In [11]:
# This next part will look pretty familiar
# QUESTION is embedded and compared against the document embeddings in the following cells
QUESTION = 'How many horns does a flea have?'  # a natural language query

In [12]:
# Embed the question with the same OpenAI engine that was used for the documents
question_embedding = np.asarray(get_embedding(QUESTION, engine=ENGINE))

# Reuse the Sentence Transformers semantic search helper (it could be rewritten by hand).
# Only one query is passed, so keep the first (and only) result list.
hits = util.semantic_search(
    query_embeddings=question_embedding,
    corpus_embeddings=document_embeddings,
    top_k=3,
)[0]

hits

[{'corpus_id': 15, 'score': 0.8606122695413039},
 {'corpus_id': 17, 'score': 0.8235469862615471},
 {'corpus_id': 19, 'score': 0.7948092110171223}]

In [13]:
print(f'Question: {QUESTION}\n')

# Show each retrieved paragraph with its 1-based rank and similarity score
for rank, hit in enumerate(hits, start=1):
    score, paragraph = hit["score"], documents[hit["corpus_id"]]
    print(f'Document {rank} Cos_Sim {score:.3f}:\n\n{paragraph}')
    print('\n')

Question: How many horns does a flea have?

Document 1 Cos_Sim 0.861:

When examined by a microscope, the flea is a pleasant object. The body
is curiously adorned with a suit of polished armour, neatly jointed, and
beset with a great number of sharp pins almost like the quills of a
porcupine: it has a small head, large eyes, two horns, or feelers, which
proceed from the head, and four long legs from the breast; they are very
hairy and long, and have several joints, which fold as it were one
within another.


Document 2 Cos_Sim 0.824:

In examining the louse with a microscope, its external deformity strikes
us with disgust. It has six feet, two eyes, and a sort of sting,
proboscis, or sucker, with which it pierces the skin, and sucks the
blood. The skin of the louse is hard and transparent, with here and
there several bristly hairs: at the end of each leg are two claws, by
which it is enabled to lay hold of the hairs, on which it climbs. There
is scarcely any animal known to

In [14]:
# Run the extractive QA pipeline on the best-matching document
nlp(question=QUESTION, context=str(documents[hits[0]['corpus_id']]))

{'score': 0.852473795413971, 'start': 259, 'end': 262, 'answer': 'two'}

# Let's use GPT-3 to answer instead
![](../data/gptqa.png)

In [15]:
# The top-ranked document becomes the context handed to the language model
context = documents[hits[0]['corpus_id']]

# Instruction, then the context, then the question, ending on an open "Answer:" cue
PROMPT = (
    "Given this context, answer the question.\n\n"
    f"Context: {context}\n"
    f"Query: {QUESTION}\n"
    "Answer:"
)
print(PROMPT)

Given this context, answer the question.

Context: When examined by a microscope, the flea is a pleasant object. The body
is curiously adorned with a suit of polished armour, neatly jointed, and
beset with a great number of sharp pins almost like the quills of a
porcupine: it has a small head, large eyes, two horns, or feelers, which
proceed from the head, and four long legs from the breast; they are very
hairy and long, and have several joints, which fold as it were one
within another.
Query: How many horns does a flea have?
Answer:


In [16]:
# Send PROMPT (context + question) to the OpenAI completion endpoint

response = openai.Completion.create(
    prompt=PROMPT,
    model="text-davinci-003",
    max_tokens=25,  # upper bound on the length of the generated answer
    temperature=0.7,
    top_p=1,
    presence_penalty=0,
    frequency_penalty=0,
)

In [17]:
# Display the full completion object returned by openai.Completion.create
response

<OpenAIObject text_completion id=cmpl-8NkuarqJE4FjyvRItI4v4iUVAymur at 0x1313c1c70> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": " A flea has two horns."
    }
  ],
  "created": 1700672564,
  "id": "cmpl-8NkuarqJE4FjyvRItI4v4iUVAymur",
  "model": "text-davinci-003",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 7,
    "prompt_tokens": 143,
    "total_tokens": 150
  },
}

In [18]:
# Pull the generated text out of the first (and only) choice
response.choices[0].text

' A flea has two horns.'

In [19]:
# Same context and question as before, with a different instruction line
FUN_PROMPT = (
    "Given this context, answer the question in a fun way for a second grader.\n\n"
    f"Context: {context}\n"
    f"Query: {QUESTION}\n"
    "Answer:"
)
print(FUN_PROMPT)

Given this context, answer the question in a fun way for a second grader.

Context: When examined by a microscope, the flea is a pleasant object. The body
is curiously adorned with a suit of polished armour, neatly jointed, and
beset with a great number of sharp pins almost like the quills of a
porcupine: it has a small head, large eyes, two horns, or feelers, which
proceed from the head, and four long legs from the breast; they are very
hairy and long, and have several joints, which fold as it were one
within another.
Query: How many horns does a flea have?
Answer:


In [20]:
# Some more fun asking GPT to respond to a 2nd grader
# FUN_PROMPT already contains the retrieved context, so `context` is not recomputed here

response = openai.Completion.create(
  model="text-davinci-003",
  prompt=FUN_PROMPT,
  temperature=0.7,
  max_tokens=25,
  top_p=1,
  frequency_penalty=0,
  presence_penalty=0
)

In [21]:
# Pull the generated text out of the first choice, this time with some more flavor
response.choices[0].text

' A flea has two horns, like a little porcupine!'

https://github.com/openai/openai-python#usage