In [None]:
import pandas as pd
import openai
from elasticsearch import Elasticsearch, helpers
import getpass
from zipfile import ZipFile
import os
import json

Download the dataset and extract it

In [None]:
# Download the zipped dataset into the current working directory and unpack it there.
# --fail makes curl exit non-zero on an HTTP error instead of saving the error page
# under the archive's name.
download_status = os.system('curl --fail -O https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip')
if download_status != 0:
    # Stop here with a clear message rather than letting ZipFile fail on a missing/corrupt archive.
    raise RuntimeError(f'Dataset download failed (os.system returned {download_status})')

with ZipFile('vector_database_wikipedia_articles_embedded.zip', 'r') as zip_ref:
    # extractall() with no argument extracts into the current working directory.
    zip_ref.extractall()

Set `file` to the path of the extracted CSV file on your machine

In [None]:
# Path of the CSV to load. The download cell calls extractall() without a target
# directory, so the archive is unpacked into the current working directory and a
# relative path avoids a machine-specific absolute path.
# NOTE(review): assumes the archive contains a top-level CSV with this name — confirm,
# and change this value if your copy of the file lives elsewhere.
file = 'vector_database_wikipedia_articles_embedded.csv'

Read the data into a DataFrame

In [None]:
# Load the CSV at `file` into a DataFrame; later cells read its rows to build the index documents.
df = pd.read_csv(filepath_or_buffer=file)

The Cloud ID is found on the 'Manage Deployment' page

In [None]:
# Prompt for the Cloud ID without echoing it, so it is not stored in the notebook.
CLOUD_ID = getpass.getpass(prompt='Enter Elastic Cloud ID:  ')

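Credential used to authenticate with Elasticsearch. The client below receives it as `api_key`, so enter an API key rather than the password of the 'elastic' user.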

In [None]:
# Prompt for the API key without echoing it. The value is handed to the client as
# `api_key`, so the prompt asks for an API key rather than a password.
ELASTIC_API_KEY = getpass.getpass('Enter Elastic API key:  ')

Create the client instance

In [None]:
# Build the Elasticsearch client for the deployment identified by CLOUD_ID,
# authenticating with the API key collected above.
client = Elasticsearch(cloud_id=CLOUD_ID, api_key=ELASTIC_API_KEY)

Define the mapping

In [None]:
# Index mapping for the article documents.
# `title_vector` and `content_vector` are dense_vector fields that are indexed for
# kNN search and compared with cosine similarity. `dims` has to equal the length of
# the vectors stored in these fields and of the query vector used at search time.
mapping = {
    "mappings": {
        "properties": {
            "url": {
                "type": "text"
            },
            "title": {
                "type": "text"
            },
            "title_vector": {
                "type": "dense_vector",
                "dims": 1536,
                "index": True,  # a real boolean rather than the string "true"
                "similarity": "cosine"
            },
            "content_vector": {
                "type": "dense_vector",
                "dims": 1536,
                "index": True,  # a real boolean rather than the string "true"
                "similarity": "cosine"
            },
            "text": {
                "type": "text"
            },
            "vector_id": {
                "type": "keyword"
            }
        }
    }
}

  Create an index in Elasticsearch

In [None]:
# Create the "wiki-vector" index with the mapping defined above unless it already exists.
# The existence check replaces a bare `except`, which reported every failure (bad
# credentials, unreachable cluster, invalid mapping, ...) as "Index already exists".
# Any such error now propagates so the real cause is visible.
if client.indices.exists(index="wiki-vector"):
    print('Index already exists')
else:
    client.indices.create(
        index="wiki-vector",
        body=mapping
    )

 Index the data

In [None]:
def generate_actions(articles):
    """Yield one bulk-index action per row of ``articles``.

    Each action targets the 'wiki-vector' index and uses the row's ``id`` as the
    document ``_id``. The ``title_vector`` and ``content_vector`` cells are parsed
    with ``json.loads`` before being placed in ``_source``; the remaining fields are
    copied from the row unchanged.
    """
    for _, row in articles.iterrows():
        yield {
            "_index": 'wiki-vector',
            "_id": row['id'],
            "_source": {
                "url": row['url'],
                "title": row['title'],
                "text": row['text'],
                "title_vector": json.loads(row['title_vector']),
                "content_vector": json.loads(row['content_vector']),
                "vector_id": row['vector_id']
            }
        }


# Pass a generator rather than a fully built list: helpers.bulk iterates over the
# actions as it sends them, so the parsed vectors for every row never have to be
# held in memory at the same time.
actions = generate_actions(df)

helpers.bulk(client, actions)

Embed user query.

In [None]:
# Ask for the OpenAI key interactively so it is not stored in the notebook.
openai.api_key = getpass.getpass(prompt='Enter OpenAI API key:')

# The question to answer; its embedding is the vector used for the kNN search.
query = "who invented the compass?"

response = openai.Embedding.create(model="text-embedding-ada-002", input=query)

# The embedding for `query` is read from the first entry of the response's "data" list.
first_embedding = response['data'][0]
query_vector = first_embedding['embedding']

Perform a kNN search using the query vector.

In [None]:
# kNN clause: compare `query_vector` against the "content_vector" field and keep
# the top 3 matches (k) out of 100 candidates (num_candidates).
knn_clause = {
    "field": "content_vector",
    "query_vector": query_vector,
    "k": 3,
    "num_candidates": 100,
}

# Only the "title" and "text" fields of each hit are requested in `_source`.
search_body = {"knn": knn_clause, "_source": ["title", "text"]}

response = client.search(index="wiki-vector", body=search_body)

Extract the first hit from the response.

In [None]:
# Take the first hit of the search response and pull out its article text.
hits = response['hits']['hits']
if not hits:
    # Fail with an explanation instead of a bare "list index out of range".
    raise IndexError("The kNN search returned no hits; check that the 'wiki-vector' index contains documents")

first_hit = hits[0]
source = first_hit['_source']
text = source['text']

Now we can send the question and the text to OpenAI's chat completion API.
The model will generate a response to the question, using the top kNN hit as context.
Use the `messages` list to shape your prompt to the gen AI model.

In [None]:
# Build the user prompt with explicit separators. Plain concatenation glued the
# question, the instructions and the article text together without any whitespace
# (e.g. "...the compass?by using the following text:...").
# NOTE(review): `text` is sent in full; a very long article may exceed the model's
# context window — confirm whether truncation is needed.
user_prompt = (
    f"Answer the following question: {query}\n"
    f"by using the following text:\n{text}\n"
    "Please print each sentence on a new line."
)

summary = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_prompt},
    ]
)

choices = summary.choices

# Print every returned choice framed by separator lines.
separator = "------------------------------------------------------------"
for choice in choices:
    print(separator)
    print(choice.message.content)
    print(separator)