In [None]:
!pip3 install -r ../backend/requirements.txt
!pip3 install -U beautifulsoup4 wikipedia

In [None]:
import os
import json

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    PrioritizedFields,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SemanticConfiguration,
    SemanticField,
    SemanticSettings,
    SimpleField,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
)

from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

import wikipedia
from bs4 import BeautifulSoup

## Load data

In [None]:
# Load the website records that the rest of the notebook embeds and indexes.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead
# of being inherited from the platform's locale default.
with open("../data/softserve.json", "r", encoding="utf-8") as f:
    website_data = json.load(f)


In [None]:
# Pretty-print the first loaded record to inspect the keys each entry carries.
print(json.dumps(website_data[0], indent=4))

## Initialize Cognitive Search

In [None]:
# Search service and index names; each falls back to a default when the
# corresponding environment variable is not set.
AZURE_SEARCH_SERVICE = os.environ.get("AZURE_SEARCH_SERVICE", "ai-assistant-search")
AZURE_SEARCH_INDEX = os.environ.get("AZURE_SEARCH_INDEX", "ai-assistant-idx")

# Admin key for the search service: export COGNITIVE_SEARCH_API_KEY, or replace
# the placeholder below with your own key.
COGNITIVE_SEARCH_API_KEY = os.environ.get(
    "COGNITIVE_SEARCH_API_KEY",
    "<FILL_IN_YOUR_COGNITIVE_SEARCH_API_KEY>",
)

In [None]:
# Both clients talk to the same search service, so the endpoint is built once
# and shared; previously the two URLs were spelled separately and only one of
# them carried a trailing slash.
search_endpoint = f"https://{AZURE_SEARCH_SERVICE}.search.windows.net"

azure_cognitive_search_embedding_key_credential = AzureKeyCredential(
    COGNITIVE_SEARCH_API_KEY
)

# Service-level client: used below to list index names and create the index.
embedding_index_client = SearchIndexClient(
    endpoint=search_endpoint,
    credential=azure_cognitive_search_embedding_key_credential,
)

# Client bound to AZURE_SEARCH_INDEX: used below to upload documents.
embedding_search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=AZURE_SEARCH_INDEX,
    credential=azure_cognitive_search_embedding_key_credential,
)

In [None]:
# Create the index only if the service does not already list one with this name.
if AZURE_SEARCH_INDEX not in embedding_index_client.list_index_names():
    embedding_index = SearchIndex(
        name=AZURE_SEARCH_INDEX,
        fields=[
            # Document key.
            SimpleField(name="Id", type="Edm.String", key=True),
            # Full-text fields, all using the same analyzer.
            *(
                SearchableField(
                    name=text_field,
                    type="Edm.String",
                    analyzer_name="en.microsoft",
                )
                for text_field in ("Content", "FileName", "Summary")
            ),
            # Vector fields: identical settings, 1536 dimensions each, all
            # bound to the "default" vector search configuration defined below.
            *(
                SearchField(
                    name=vector_field,
                    type=SearchFieldDataType.Collection(
                        SearchFieldDataType.Single
                    ),
                    hidden=False,
                    searchable=True,
                    filterable=False,
                    sortable=False,
                    facetable=False,
                    vector_search_dimensions=1536,
                    vector_search_configuration="default",
                )
                for vector_field in (
                    "content_embedding",
                    "title_embedding",
                    "summary_embedding",
                )
            ),
            # Publication timestamp: the only sortable metadata field.
            SimpleField(
                name="Published",
                type=SearchFieldDataType.DateTimeOffset,
                facetable=True,
                filterable=True,
                sortable=True,
            ),
            # String metadata fields, each filterable and facetable.
            *(
                SimpleField(
                    name=metadata_field,
                    type="Edm.String",
                    filterable=True,
                    facetable=True,
                )
                for metadata_field in (
                    "FileType",
                    "Category",
                    "LocationURL",
                    "Storage",
                )
            ),
        ],
        # Semantic configuration: FileName is the title field and is also
        # listed first among the prioritized content fields.
        semantic_settings=SemanticSettings(
            configurations=[
                SemanticConfiguration(
                    name="default",
                    prioritized_fields=PrioritizedFields(
                        title_field=SemanticField(field_name="FileName"),
                        prioritized_content_fields=[
                            SemanticField(field_name=content_field)
                            for content_field in (
                                "FileName",
                                "Content",
                                "Summary",
                            )
                        ],
                    ),
                )
            ]
        ),
        # HNSW configuration with cosine metric, referenced by name from the
        # vector fields above.
        vector_search=VectorSearch(
            algorithm_configurations=[
                HnswVectorSearchAlgorithmConfiguration(
                    name="default",
                    kind="hnsw",
                    parameters={"metric": "cosine"},
                )
            ]
        ),
    )
    embedding_index_client.create_index(embedding_index)

## Initialize OpenAI

In [None]:
# OpenAI settings; each falls back to a default when the corresponding
# environment variable is not set.
# NOTE(review): AZURE_OPENAI_SERVICE and OPENAI_API_KEY are not referenced by
# any other cell visible in this notebook — confirm how they reach the clients.
AZURE_OPENAI_SERVICE = os.environ.get("AZURE_OPENAI_SERVICE", "ai-assistant-gpt-4")

# Export OPENAI_API_KEY, or replace the placeholder below with your own key.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "<FILL_IN_YOUR_OPENAI_API_KEY>")

# Chat model (GPT-3.5) deployment and model name.
AZURE_OPENAI_CHATGPT_DEPLOYMENT = os.environ.get(
    "AZURE_OPENAI_CHATGPT_DEPLOYMENT", "ai-assistant-gpt-35-16k"
)
AZURE_OPENAI_CHATGPT_MODEL = os.environ.get(
    "AZURE_OPENAI_CHATGPT_MODEL", "gpt-35-turbo-16k"
)

# GPT-4 deployment and model name.
AZURE_OPENAI_GPT4_DEPLOYMENT = os.environ.get(
    "AZURE_OPENAI_GPT4_DEPLOYMENT", "ai-assistant-gpt-4"
)
AZURE_OPENAI_GPT4_MODEL = os.environ.get("AZURE_OPENAI_GPT4_MODEL", "gpt-4-32k")

# Embeddings deployment.
AZURE_OPENAI_EMB_DEPLOYMENT = os.environ.get(
    "AZURE_OPENAI_EMB_DEPLOYMENT", "ai-assistant-ada"
)

In [None]:
# Chat model clients, one per deployment configured above.
# NOTE(review): no API key, endpoint or API version is passed to these
# constructors; presumably they are read from environment variables by the
# library — confirm, since the OPENAI_API_KEY / AZURE_OPENAI_SERVICE values
# defined in this notebook are not forwarded here.
llm_gpt35 = AzureChatOpenAI(
    deployment_name=AZURE_OPENAI_CHATGPT_DEPLOYMENT,
    model_name=AZURE_OPENAI_CHATGPT_MODEL,
)
llm_gpt4 = AzureChatOpenAI(
    deployment_name=AZURE_OPENAI_GPT4_DEPLOYMENT,
    model_name=AZURE_OPENAI_GPT4_MODEL,
)

# Summaries are produced by the GPT-4 client using the "stuff" chain type.
summarize_chain = load_summarize_chain(llm_gpt4, chain_type="stuff")

# Embeddings client bound to the embeddings deployment.
embeddings_model = OpenAIEmbeddings(deployment=AZURE_OPENAI_EMB_DEPLOYMENT)

In [None]:
def embed_query(query_text):
    """Return the embedding that ``embeddings_model`` produces for ``query_text``."""
    embedding = embeddings_model.embed_query(query_text)
    return embedding

In [None]:
# Embed the first record's content and display the length of the resulting vector.
# NOTE(review): the index's vector fields are declared with 1536 dimensions, so
# this presumably should display 1536 — confirm.
content = website_data[0]["content"]

query_vector = embed_query(content)
len(query_vector)

In [None]:
# Run the summarize chain on `content`; the result is shown as the cell output.
summarize_chain.run([Document(page_content=content)])

## Index content using OpenAI embeddings

In [None]:
# Splitter for the website records: chunk_size=2000 with no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=2000)

In [None]:
# Index every website record: split its content into chunks, embed and
# summarize each chunk, and upload the resulting sections in batches of 100.
#
# The loop iterates `website_data`, the records loaded in the "Load data"
# section. No cell above defines `search_results`, so iterating that name
# raised NameError on a fresh kernel.
batch = []
for count, res in enumerate(website_data, start=1):
    # The title embedding depends only on the record, so it is computed once
    # and shared by all of the record's chunks.
    title_embedding = embed_query(res["FileName"])
    doc = [Document(page_content=res["content"])]
    splitted_doc = text_splitter.split_documents(doc)
    for i, page in enumerate(splitted_doc):
        section_id = f"{res['id']}-page-{i}"
        content = page.page_content
        content_embedding = embed_query(content)
        try:
            summary = summarize_chain.run([Document(page_content=content)])
        except Exception as e:
            # Best effort: a failed summary must not abort the whole indexing
            # run, so report which section failed and index it without one.
            print(f"Summarization failed for {section_id}: {e}")
            summary = ""
        summary_embedding = embed_query(summary)
        section = {
            "Id": section_id,
            "Content": content,
            "LocationURL": res["LocationURL"],
            "Published": res["Published"],
            "FileName": res["FileName"],
            "FileType": res["FileType"],
            "Category": res["Category"],
            "Storage": res["Storage"],
            "Summary": summary,
            "content_embedding": content_embedding,
            "title_embedding": title_embedding,
            "summary_embedding": summary_embedding,
        }
        batch.append(section)
        # Flush a full batch and start a new one.
        if len(batch) >= 100:
            results = embedding_search_client.upload_documents(documents=batch)
            succeeded = sum(1 for r in results if r.succeeded)
            print(f"\tIndexed {len(results)} sections, {succeeded} succeeded")
            batch = []
    if count % 10 == 0:
        print(f"Processed {count} documents")
# Upload whatever is left over after the last full batch.
if batch:
    results = embedding_search_client.upload_documents(documents=batch)
    succeeded = sum(1 for r in results if r.succeeded)
    print(f"\tIndexed {len(results)} sections, {succeeded} succeeded")

## Index Wikipedia Page

In [None]:
def strip_html_tags(html):
    """Parse ``html`` with the "html.parser" backend and return its text content."""
    return BeautifulSoup(html, "html.parser").get_text()

In [None]:
# Fetch the HTML of the Wikipedia page returned for "SoftServe company" and
# reduce it to plain text.
# NOTE(review): the indexing cell below hard-codes
# https://en.wikipedia.org/wiki/SoftServe as the source URL — confirm this
# query resolves to that article.
wiki_page = wikipedia.page("SoftServe company").html()
stripped_wiki_page = strip_html_tags(wiki_page)

In [None]:
# Splitter for the Wikipedia text: chunk_size=1000 with chunk_overlap=700.
# This rebinds `text_splitter`, replacing the splitter configured earlier in
# the notebook.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=700, chunk_size=1000)

In [None]:
# Index the Wikipedia text: split it into chunks, embed and summarize each
# chunk, and upload the resulting sections in batches of 100.
batch = []

# Every chunk shares the same title embedding.
title_embedding = embed_query("About SoftServe")

splitted_doc = text_splitter.split_documents(
    [Document(page_content=stripped_wiki_page)]
)
for i, page in enumerate(splitted_doc):
    content = page.page_content
    content_embedding = embed_query(content)
    # Summarize exactly once, inside the guard. An extra unguarded call before
    # the `try` doubled the number of summarization requests and let a failure
    # abort the loop before the fallback below could apply.
    try:
        summary = summarize_chain.run([Document(page_content=content)])
    except Exception as e:
        # Best effort: index the chunk with an empty summary rather than abort.
        print(e)
        summary = ""
    summary_embedding = embed_query(summary)
    section = {
        "Id": f"Wiki-page-{i}",
        "Content": content,
        "LocationURL": "https://en.wikipedia.org/wiki/SoftServe",
        "FileType": ".html",
        "Storage": "Wikipedia",
        "FileName": "Wikipedia",
        "Summary": summary,
        "content_embedding": content_embedding,
        "title_embedding": title_embedding,
        "summary_embedding": summary_embedding,
    }
    batch.append(section)
    # Flush a full batch and start a new one.
    if len(batch) >= 100:
        results = embedding_search_client.upload_documents(documents=batch)
        succeeded = sum(1 for r in results if r.succeeded)
        print(f"\tIndexed {len(results)} sections, {succeeded} succeeded")
        batch = []
# Upload whatever is left over after the last full batch.
if batch:
    results = embedding_search_client.upload_documents(documents=batch)
    succeeded = sum(1 for r in results if r.succeeded)
    print(f"\tIndexed {len(results)} sections, {succeeded} succeeded")