# Import modules and connect to Elasticsearch

In [None]:
import os
from getpass import getpass
from elasticsearch import Elasticsearch, helpers
import wget, zipfile, pandas as pd, json, openai
import streamlit as st
from tqdm.notebook import tqdm
from dotenv import load_dotenv

# Load settings from a local .env file (if one exists) into the process
# environment so the os.getenv() calls below can see them.
load_dotenv()

elastic_user = os.getenv('ES_USER')
elastic_password = os.getenv('ES_PASSWORD')
elastic_endpoint = os.getenv("ES_ENDPOINT")

# Credentials are handed to the client via basic_auth instead of being
# interpolated into the URL: a password containing URL-reserved characters
# (e.g. "@", ":", "/", "#") would otherwise corrupt the address, and the
# secret would live inside a string that is easy to print by accident.
url = f"https://{elastic_endpoint}:9200"
client = Elasticsearch(
    url,
    basic_auth=(elastic_user, elastic_password),
    ca_certs="./http_ca.crt",
    verify_certs=True,
)

# Round-trip to the cluster to confirm that the connection works.
print(client.info())

# Configure OpenAI connection

In [None]:
from openai import OpenAI
# from openai import AzureOpenAI

# openai = OpenAI()
# openai.models.retrieve("text-embedding-ada-002")

# Azure OpenAI settings, read from the process environment.
azure_api_key = os.getenv('AZURE_API_KEY')
# This notebook originally read the misspelled 'AZURE_EDNPOINT'. Prefer the
# correctly spelled variable, but keep honouring the old name so that existing
# .env files continue to work unchanged.
azure_endpoint = os.getenv('AZURE_ENDPOINT') or os.getenv('AZURE_EDNPOINT')
azure_api_version = os.getenv('AZURE_API_VERSION')
azure_deployment_id = os.getenv('AZURE_DEPLOYMENT_ID')

# Module-level configuration on the imported `openai` package.
# NOTE(review): `api_type` / `api_base` look like the pre-1.0 SDK setting
# names -- confirm they still take effect with the installed SDK version.
openai.api_type = "azure"
openai.api_base = azure_endpoint
openai.api_version = azure_api_version
openai.api_key = azure_api_key

# Download the dataset

In [4]:
# Unpack every member of the dataset archive (expected in the current working
# directory) into the "data" directory.
dataset_archive = zipfile.ZipFile(
    "vector_database_wikipedia_articles_embedded.zip", mode="r"
)
with dataset_archive as archive:
    archive.extractall(path="data")

# Read CSV file into a Pandas DataFrame

In [5]:
# Load the extracted CSV into memory as a DataFrame.
wikipedia_dataframe = pd.read_csv(
    filepath_or_buffer="data/vector_database_wikipedia_articles_embedded.csv",
)

# Create index with mapping

In [1]:
# Mapping for the Wikipedia index: two 1536-dimensional dense vectors (title
# and content) indexed with cosine similarity, the article text/title as
# full-text fields, the URL as an exact-match keyword and a numeric vector id.
index_mapping = {
    "properties": {
        "title_vector": {
            "type": "dense_vector",
            "dims": 1536,
            # A real boolean rather than the string "true".
            "index": True,
            "similarity": "cosine",
        },
        "content_vector": {
            "type": "dense_vector",
            "dims": 1536,
            "index": True,
            "similarity": "cosine",
        },
        "text": {"type": "text"},
        "title": {"type": "text"},
        "url": {"type": "keyword"},
        "vector_id": {"type": "long"},
    }
}

# Create the index only when it does not exist yet, so re-running this cell
# does not fail because the index is already there.
if client.indices.exists(index="wikipedia_vector_index"):
    print("Index 'wikipedia_vector_index' already exists; skipping creation.")
else:
    print(client.indices.create(index="wikipedia_vector_index", mappings=index_mapping))

# Index data into Elasticsearch

In [7]:
def dataframe_to_bulk_actions(df, index_name='wikipedia_vector_index'):
    """Yield one Elasticsearch bulk action per row of *df*.

    This is a generator, so actions are built lazily as the consumer iterates
    instead of materialising the whole dataset in memory.

    Args:
        df: DataFrame with the columns ``id``, ``url``, ``title``, ``text``,
            ``title_vector``, ``content_vector`` and ``vector_id``. The two
            ``*_vector`` columns must hold JSON-encoded strings; they are
            decoded with ``json.loads`` before being placed in the document.
        index_name: Target index written to each action's ``_index`` field.
            Defaults to the index name used throughout this notebook.

    Yields:
        dict: An action with ``_index``, ``_id`` (taken from the ``id``
        column) and ``_source`` (the document body).
    """
    # The positional row label returned by iterrows() is not needed.
    for _, row in df.iterrows():
        yield {
            "_index": index_name,
            "_id": row['id'],
            "_source": {
                'url': row["url"],
                'title': row["title"],
                'text': row["text"],
                'title_vector': json.loads(row["title_vector"]),
                'content_vector': json.loads(row["content_vector"]),
                'vector_id': row["vector_id"],
            },
        }

In [8]:
total_documents = len(wikipedia_dataframe)
success_count = 0

# Using the progress bar as a context manager guarantees it is closed when the
# loop finishes or is interrupted.
with tqdm(total=total_documents, unit="documents") as progress_bar:
    # raise_on_error=False is passed so that a failed document is reported
    # through the (ok, info) pair and printed below, while the remaining
    # documents keep being processed.
    bulk_results = helpers.streaming_bulk(
        client,
        actions=dataframe_to_bulk_actions(wikipedia_dataframe),
        raise_on_error=False,
        chunk_size=100,
    )
    for ok, info in bulk_results:
        if ok:
            success_count += 1
        else:
            print(f"Unable to index {info['index']['_id']}: {info['index']['error']}")
        # One tick per processed document, whether it succeeded or not.
        progress_bar.update(1)
        progress_bar.set_postfix(success=success_count)

  0%|          | 0/25000 [00:00<?, ?documents/s]

# Build application with Streamlit

In [None]:
# Shell command: install the `localtunnel` package with npm (npm must be
# available on the machine running this notebook).
!npm install localtunnel

In [None]:
%%writefile app.py

import os
import streamlit as st
# import openai
from openai import AzureOpenAI
from elasticsearch import Elasticsearch
from dotenv import load_dotenv

# from openai import OpenAI

# openai = OpenAI()

# Load settings from a local .env file (if one exists) into the process
# environment so the os.getenv() calls below can see them.
load_dotenv()

# --- Azure OpenAI: chat-completion client ---------------------------------
azure_api_key = os.getenv('AZURE_API_KEY')
# This app originally read the misspelled 'AZURE_EDNPOINT'. Prefer the
# correctly spelled variable, but keep honouring the old name so that existing
# .env files continue to work unchanged.
azure_endpoint = os.getenv('AZURE_ENDPOINT') or os.getenv('AZURE_EDNPOINT')
azure_api_version = os.getenv('AZURE_API_VERSION')
azure_deployment_id = os.getenv('AZURE_DEPLOYMENT_ID')

chat = AzureOpenAI(
    api_key=azure_api_key,
    api_version=azure_api_version,
    azure_endpoint=azure_endpoint,
)

# --- Azure OpenAI: embeddings client ---------------------------------------
# Configured from its own endpoint / key / API-version variables, separate
# from the chat client above.
model_name = os.getenv('MODEL_NAME')
azure_embedding_endpoint = os.getenv('AZURE_EMBEDDING_ENDPOINT')
azure_embedding_api_key = os.getenv('AZURE_EMBEDDING_API_KEY')
azure_embedding_api_version = os.getenv("AZURE_EMBEDDING_API_VERSION")

embeddings = AzureOpenAI(
    api_key=azure_embedding_api_key,
    api_version=azure_embedding_api_version,
    azure_endpoint=azure_embedding_endpoint,
)

# --- Elasticsearch ----------------------------------------------------------
elastic_user = os.getenv('ES_USER')
elastic_password = os.getenv('ES_PASSWORD')
elastic_endpoint = os.getenv("ES_ENDPOINT")

# Credentials are handed to the client via basic_auth instead of being
# interpolated into the URL: a password containing URL-reserved characters
# (e.g. "@", ":", "/", "#") would otherwise corrupt the address.
url = f"https://{elastic_endpoint}:9200"
client = Elasticsearch(
    url,
    basic_auth=(elastic_user, elastic_password),
    ca_certs="./http_ca.crt",
    verify_certs=True,
)

# Model name passed to the embeddings client when embedding user queries.
EMBEDDING_MODEL = "text-embedding-ada-002"

def openai_summarize(query, response):
    """Answer *query* with the chat model, using the top search hit as context.

    Args:
        query: The user's question.
        response: Elasticsearch search response. Only the ``text`` field of
            the first entry in ``response['hits']['hits']`` is used.

    Returns:
        str: The message content of the first chat-completion choice, or a
        fixed fallback message when the search response contains no hits.
    """
    hits = response['hits']['hits']
    if not hits:
        # No document means no context to answer from; return early instead of
        # failing with IndexError on hits[0].
        return "No matching documents were found to answer the question."

    context = hits[0]['_source']['text']
    # Spaces around the interpolated query keep it from being glued to the
    # surrounding instruction text.
    prompt = (
        f"Answer the following question: {query} "
        f"by using the following text: {context}"
    )
    summary = chat.chat.completions.create(
        model=azure_deployment_id,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    return summary.choices[0].message.content


def search_es(query):
    """Embed *query* and run a kNN search on the Wikipedia vector index.

    The query text is sent to the embeddings client using EMBEDDING_MODEL, and
    the resulting vector is matched against the ``content_vector`` field.

    Args:
        query: The text to search for.

    Returns:
        The response object returned by ``client.search``.
    """
    # Turn the query text into a vector.
    embedding_result = embeddings.embeddings.create(input=query, model=EMBEDDING_MODEL)
    query_vector = embedding_result.data[0].embedding

    # kNN clause: k=10 with num_candidates=100, matched on the content vector.
    knn_clause = {
        "field": "content_vector",
        "query_vector": query_vector,
        "k": 10,
        "num_candidates": 100,
    }
    return client.search(index="wikipedia_vector_index", knn=knn_clause)


def main():
    """Render the Streamlit page: question input, answer and search results.

    When the "Search" button is pressed with a non-empty question, the
    question is searched in Elasticsearch. If hits come back, a model-written
    answer is shown first, followed by the title and text of every hit;
    otherwise a "No results found." message is shown.
    """
    st.title("Gen AI Application")

    # Input for user search query
    user_query = st.text_input("Enter your question:", "what is football?")

    # Nothing to do until "Search" is clicked with a non-empty question.
    if not (st.button("Search") and user_query):
        return

    st.write(f"Searching for: {user_query}")
    result = search_es(user_query)

    # Check for hits *before* summarising: the summariser reads the first hit,
    # so calling it on an empty result set would fail.
    hits = result['hits']['hits']
    if not hits:
        st.write("No results found.")
        return

    openai_summary = openai_summarize(user_query, result)
    st.write(f"OpenAI Summary: {openai_summary}")

    # Display search results
    st.write("Search Results:")
    for hit in hits:
        st.write(hit['_source']['title'])
        st.write(hit['_source']['text'])


if __name__ == "__main__":
    main()

# Run the application

In [None]:
# Shell command: start Streamlit on app.py (the file written by the
# %%writefile cell).
!streamlit run app.py