#  Import modules and connect to Elasticsearch

In [9]:
import os
from getpass import getpass
from elasticsearch import Elasticsearch, helpers
import wget, zipfile, pandas as pd, json, openai
import streamlit as st
from tqdm.notebook import tqdm
from dotenv import load_dotenv

load_dotenv()

openai_api_key=os.getenv('OPENAI_API_KEY')
elastic_user=os.getenv('ES_USER')
elastic_password=os.getenv('ES_PASSWORD')
elastic_endpoint=os.getenv("ES_ENDPOINT")

url = f"https://{elastic_user}:{elastic_password}@{elastic_endpoint}:9200"
client = Elasticsearch(url, ca_certs = "./http_ca.crt", verify_certs = True)
 
print(client.info())

{'name': 'liuxgm.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'h2QwONxsT4Kt-lTRKmPrhg', 'version': {'number': '8.12.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '1665f706fd9354802c02146c1e6b5c0fbcddfbc9', 'build_date': '2024-01-11T10:05:27.953830042Z', 'build_snapshot': False, 'lucene_version': '9.9.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


# Configure OpenAI connection

In [10]:
from openai import OpenAI

openai = OpenAI()
openai.models.retrieve("text-embedding-ada-002")

Model(id='text-embedding-ada-002', created=1671217299, object='model', owned_by='openai-internal')

# Download the dataset

In [7]:
with zipfile.ZipFile("vector_database_wikipedia_articles_embedded.zip",
"r") as zip_ref:
    zip_ref.extractall("data")

# Read CSV file into a Pandas DataFrame

In [11]:
wikipedia_dataframe = pd.read_csv("data/vector_database_wikipedia_articles_embedded.csv")

# Create index with mapping

In [12]:
index_mapping= {
    "properties": {
      "title_vector": {
          "type": "dense_vector",
          "dims": 1536,
          "index": "true",
          "similarity": "cosine"
      },
      "content_vector": {
          "type": "dense_vector",
          "dims": 1536,
          "index": "true",
          "similarity": "cosine"
      },
      "text": {"type": "text"},
      "title": {"type": "text"},
      "url": { "type": "keyword"},
      "vector_id": {"type": "long"}

    }
}
client.indices.create(index="wikipedia_vector_index", mappings=index_mapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'wikipedia_vector_index'})

# Index data into Elasticsearch

In [13]:
def dataframe_to_bulk_actions(df):
    for index, row in df.iterrows():
        yield {
            "_index": 'wikipedia_vector_index',
            "_id": row['id'],
            "_source": {
                'url' : row["url"],
                'title' : row["title"],
                'text' : row["text"],
                'title_vector' : json.loads(row["title_vector"]),
                'content_vector' : json.loads(row["content_vector"]),
                'vector_id' : row["vector_id"]
            }
        }

In [14]:
total_documents = len(wikipedia_dataframe)

progress_bar = tqdm(total=total_documents, unit="documents")
success_count = 0

for ok, info in helpers.streaming_bulk(client, actions=dataframe_to_bulk_actions(wikipedia_dataframe), raise_on_error=False, chunk_size=100):
  if ok:
    success_count += 1
  else:
    print(f"Unable to index {info['index']['_id']}: {info['index']['error']}")
  progress_bar.update(1)
  progress_bar.set_postfix(success=success_count)

  0%|          | 0/25000 [00:00<?, ?documents/s]

# Build application with Streamlit

In [15]:
!npm install localtunnel

[K[?25hm##################[0m) ⠧ reify:localtunnel: [32;40mhttp[0m [35mfetch[0m GET 200 https://registry.n[0m[K/regis[0m[K
added 22 packages, and audited 23 packages in 15s

3 packages are looking for funding
  run `npm fund` for details

2 [33m[1mmoderate[22m[39m severity vulnerabilities

To address all issues, run:
  npm audit fix

Run `npm audit` for details.
[37;40mnpm[0m [0m[36;40mnotice[0m[35m[0m 
[0m[37;40mnpm[0m [0m[36;40mnotice[0m[35m[0m New [31mmajor[39m version of npm available! [31m8.19.2[39m -> [32m10.4.0[39m
[0m[37;40mnpm[0m [0m[36;40mnotice[0m[35m[0m Changelog: [36mhttps://github.com/npm/cli/releases/tag/v10.4.0[39m
[0m[37;40mnpm[0m [0m[36;40mnotice[0m[35m[0m Run [32mnpm install -g npm@10.4.0[39m to update!
[0m[37;40mnpm[0m [0m[36;40mnotice[0m[35m[0m 
[0m

In [31]:
%%writefile app.py

import os
import streamlit as st
import openai
from elasticsearch import Elasticsearch
from dotenv import load_dotenv

from openai import OpenAI

openai = OpenAI()

load_dotenv()

openai_api_key=os.getenv('OPENAI_API_KEY')
elastic_user=os.getenv('ES_USER')
elastic_password=os.getenv('ES_PASSWORD')
elastic_endpoint=os.getenv("ES_ENDPOINT")

url = f"https://{elastic_user}:{elastic_password}@{elastic_endpoint}:9200"
client = Elasticsearch(url, ca_certs = "./http_ca.crt", verify_certs = True)

# Define model
EMBEDDING_MODEL = "text-embedding-ada-002"


def openai_summarize(query, response):
    context = response['hits']['hits'][0]['_source']['text']
    summary = openai.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Answer the following question:" + query + "by using the following text: " + context},
        ]
    )

    print(summary)
    return summary.choices[0].message.content


def search_es(query):
    # Create embedding
    question_embedding = openai.embeddings.create(input=query, model=EMBEDDING_MODEL)

    # Define Elasticsearch query
    response = client.search(
    index = "wikipedia_vector_index",
    knn={
        "field": "content_vector",
        "query_vector":  question_embedding.data[0].embedding,
        "k": 10,
        "num_candidates": 100
        }
    )
    return response


def main():
    st.title("Gen AI Application")

    # Input for user search query
    user_query = st.text_input("Enter your question:")

    if st.button("Search"):
        if user_query:

            st.write(f"Searching for: {user_query}")
            result = search_es(user_query)

            # print(result)
            openai_summary = openai_summarize(user_query, result)
            st.write(f"OpenAI Summary: {openai_summary}")

            # Display search results
            if result['hits']['total']['value'] > 0:
                st.write("Search Results:")
                for hit in result['hits']['hits']:
                    st.write(hit['_source']['title'])
                    st.write(hit['_source']['text'])
            else:
                st.write("No results found.")

if __name__ == "__main__":
    main()

Overwriting app.py


# Run the application

In [None]:
!streamlit run app.py

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8502[0m
[34m  Network URL: [0m[1mhttp://198.18.0.17:8502[0m
[0m
[34m[1m  For better performance, install the Watchdog module:[0m

  $ xcode-select --install
  $ pip install watchdog
            [0m
ChatCompletion(id='chatcmpl-8qJmP31VcFxCq0E2XBh5e4y3lDVpE', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Football is a popular sport that can refer to several different games. The most well-known type of football is association football, which is commonly known as soccer. In North America, South Africa, and Australia, it is called soccer to avoid confusion with other types of football played in those regions.\n\nThe name football comes from the combination of the words "foot" and "ball." The game is called football because the players primarily use their feet to kick, but it can also involve other parts of t