In [1]:
import os

from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.models import QueryType
from azure.search.documents.models import Vector

from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings import OpenAIEmbeddings

import openai
import tiktoken

## Initialize Cognitive Search Index

In [2]:
# Azure Cognitive Search settings. Every value is read from the environment;
# the service, index and field names fall back to the defaults given here.
AZURE_SEARCH_SERVICE = os.getenv("AZURE_SEARCH_SERVICE", "ai-assistant-search")
AZURE_SEARCH_INDEX = os.getenv("AZURE_SEARCH_INDEX", "ai-assistant-idx")

# SECURITY: the API key is read from the environment WITHOUT a fallback so
# that no credential is ever stored in the notebook. The value is None when
# the variable is unset. The key that used to be hardcoded here must be
# treated as leaked and rotated.
COGNITIVE_SEARCH_API_KEY = os.getenv("COGNITIVE_SEARCH_API_KEY")

# Names of the index fields used by the rest of the notebook.
KB_FIELDS_CONTENT = os.getenv("KB_FIELDS_CONTENT", "Content")
KB_FIELDS_CATEGORY = os.getenv("KB_FIELDS_CATEGORY", "Storage")
KB_FIELDS_SOURCEPAGE = os.getenv("KB_FIELDS_SOURCEPAGE", "LocationURL")

# Short aliases used when reading documents returned by the search client.
sourcepage_field = KB_FIELDS_SOURCEPAGE
content_field = KB_FIELDS_CONTENT

In [3]:
# Wrap the API key in a credential object, then build a search client bound
# to the configured service endpoint and index. `search_client` is the
# notebook-wide handle used by `search_idx` below.
azure_cognitive_search_embedding_key_credential = AzureKeyCredential(
    COGNITIVE_SEARCH_API_KEY
)
search_client = SearchClient(
    endpoint=f"https://{AZURE_SEARCH_SERVICE}.search.windows.net/",
    index_name=AZURE_SEARCH_INDEX,
    credential=azure_cognitive_search_embedding_key_credential,
)

## Initialize OpenAI

In [4]:
# Azure OpenAI connection settings, read from the environment.
AZURE_OPENAI_SERVICE = os.getenv(
    "AZURE_OPENAI_SERVICE", "ai-assistant-openai1"
)

# SECURITY: the API key is read from the environment WITHOUT a fallback so
# that no credential is stored in the notebook. The key that used to be
# hardcoded here must be treated as leaked and rotated.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # Fail here with a clear message; otherwise the os.environ assignment
    # below would raise an opaque TypeError for a None value.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before "
        "starting the notebook kernel."
    )

API_VERSION = "2023-05-15"
OPENAI_API_TYPE = "azure"
# Built once so the environment variable and the openai module setting below
# can never drift apart.
OPENAI_API_BASE = f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com"

# Export the settings as environment variables. The LangChain wrappers below
# are constructed without explicit credentials, so they presumably pick the
# connection details up from here -- TODO confirm.
os.environ["OPENAI_API_TYPE"] = OPENAI_API_TYPE
os.environ["OPENAI_API_BASE"] = OPENAI_API_BASE
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["OPENAI_API_VERSION"] = API_VERSION

# Configure the openai module itself for the direct ChatCompletion calls.
openai.api_type = OPENAI_API_TYPE
openai.api_base = OPENAI_API_BASE
openai.api_key = OPENAI_API_KEY
openai.api_version = API_VERSION

# Deployment names and model names for the chat and embedding models.
AZURE_OPENAI_CHATGPT_DEPLOYMENT = os.getenv(
    "AZURE_OPENAI_CHATGPT_DEPLOYMENT", "ai-assistant-gpt-35-16k"
)
AZURE_OPENAI_CHATGPT_MODEL = os.getenv(
    "AZURE_OPENAI_CHATGPT_MODEL", "gpt-35-turbo-16k"
)
AZURE_OPENAI_GPT4_DEPLOYMENT = os.getenv(
    "AZURE_OPENAI_GPT4_DEPLOYMENT", "ai-assistant-gpt-4"
)
AZURE_OPENAI_GPT4_MODEL = os.getenv(
    "AZURE_OPENAI_GPT4_MODEL",
    "gpt-4-32k",
)
AZURE_OPENAI_EMB_DEPLOYMENT = os.getenv(
    "AZURE_OPENAI_EMB_DEPLOYMENT", "ai-assistant-ada"
)

# LangChain chat model wrappers, one per deployment.
llm_gpt35 = AzureChatOpenAI(
    deployment_name=AZURE_OPENAI_CHATGPT_DEPLOYMENT,
    model_name=AZURE_OPENAI_CHATGPT_MODEL,
)
llm_gpt4 = AzureChatOpenAI(
    deployment_name=AZURE_OPENAI_GPT4_DEPLOYMENT,
    model_name=AZURE_OPENAI_GPT4_MODEL,
)

# Embedding model used by `embed_query`.
embeddings_model = OpenAIEmbeddings(deployment=AZURE_OPENAI_EMB_DEPLOYMENT)

# Lower-case aliases used by the cells further down.
chatgpt_deployment = AZURE_OPENAI_CHATGPT_DEPLOYMENT
chatgpt_model = AZURE_OPENAI_CHATGPT_MODEL
gpt4_deployment = AZURE_OPENAI_GPT4_DEPLOYMENT
gpt4_model = AZURE_OPENAI_GPT4_MODEL
embedding_deployment = AZURE_OPENAI_EMB_DEPLOYMENT

In [5]:
def embed_query(query_text):
    """Return the embedding vector for ``query_text``.

    Delegates to the notebook-level ``embeddings_model``.
    """
    vector = embeddings_model.embed_query(query_text)
    return vector


# Smoke test: embed a sample query and display the length of the vector.
query_vector = embed_query("retail")
len(query_vector)

1536

In [6]:
# Smoke test of the GPT-4 deployment: send a short multi-turn conversation
# and display the text of the first (and only, n=1) returned choice.
chat_completion = openai.ChatCompletion.create(
    deployment_id=AZURE_OPENAI_GPT4_DEPLOYMENT,
    model=AZURE_OPENAI_GPT4_MODEL,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who won the world series in 2020?"},
        {
            "role": "assistant",
            "content": "The Los Angeles Dodgers won the World Series in 2020.",
        },
        {"role": "user", "content": "Where was it played?"},
    ],
    temperature=0.0,
    max_tokens=1024,
    n=1,
)
chat_completion.choices[0].message.content

'The 2020 World Series was played at Globe Life Field in Arlington, Texas.'

## Test query

In [8]:
# Utility functions
# Role names used in chat messages.
SYSTEM = "system"
USER = "user"
ASSISTANT = "assistant"

# Token limit per supported model name; both the Azure spelling ("gpt-35-...")
# and the OpenAI spelling ("gpt-3.5-...") are listed. Read by get_token_limit
# and used by get_oai_chatmodel_tiktok to validate model names.
MODELS_2_TOKEN_LIMITS = {
    "gpt-35-turbo": 4000,
    "gpt-3.5-turbo": 4000,
    "gpt-35-turbo-16k": 16000,
    "gpt-3.5-turbo-16k": 16000,
    "gpt-4": 8100,
    "gpt-4-32k": 32000,
}

# Azure OpenAI model name -> OpenAI model name. Used to translate a model
# name before it is handed to tiktoken (see get_oai_chatmodel_tiktok).
AOAI_2_OAI = {
    "gpt-35-turbo": "gpt-3.5-turbo",
    "gpt-35-turbo-16k": "gpt-3.5-turbo-16k",
}


def get_token_limit(model_id: str) -> int:
    """Look up the token limit configured for ``model_id``.

    Args:
        model_id: A model name that is a key of ``MODELS_2_TOKEN_LIMITS``.

    Returns:
        The token limit stored for that model.

    Raises:
        ValueError: If ``model_id`` has no entry in ``MODELS_2_TOKEN_LIMITS``.
    """
    try:
        return MODELS_2_TOKEN_LIMITS[model_id]
    except KeyError:
        raise ValueError("Expected model gpt-35-turbo and above") from None


class MessageBuilder:
    """
    Builds the message list for a chat conversation and tracks its token count.

    Attributes:
        messages (list): Chat messages as ``{"role": ..., "content": ...}``
            dictionaries. Starts out holding only the system message.
        model (str): Model name passed to the token counter.
        token_length (int): Running total of tokens over all added messages.

    Methods:
        __init__(self, system_content: str, chatgpt_model: str): Starts the conversation with a system message.
        append_message(self, role: str, content: str, index: int = 1): Inserts a new message into the conversation.
    """

    def __init__(self, system_content: str, chatgpt_model: str):
        system_message = {"role": "system", "content": system_content}
        self.messages = [system_message]
        self.model = chatgpt_model
        self.token_length = num_tokens_from_messages(
            system_message, self.model
        )

    def append_message(self, role: str, content: str, index: int = 1):
        """Insert a message at ``index`` and add its tokens to ``token_length``.

        Args:
            role: Role of the message author.
            content: Text of the message.
            index: Position passed to ``list.insert``. Defaults to 1, i.e.
                directly after the first message.
        """
        message = {"role": role, "content": content}
        # Count tokens on the message object itself, not on
        # ``self.messages[index]``: after ``list.insert`` that lookup raises
        # IndexError when index > len and returns a different message for
        # negative indices. Counting first also leaves the list untouched if
        # tokenization fails.
        added_tokens = num_tokens_from_messages(message, self.model)
        self.messages.insert(index, message)
        self.token_length += added_tokens


def num_tokens_from_messages(message: dict[str, str], model: str) -> int:
    """
    Count the tokens needed to encode a single chat message.
    Args:
        message (dict): The message, e.g. {'role': 'user', 'content': 'Hello, how are you?'}.
        model (str): Model name; translated via get_oai_chatmodel_tiktok before
            the tiktoken encoding is looked up.
    Returns:
        int: 2 plus the number of tokens in every value of the message.
    """
    tokenizer = tiktoken.encoding_for_model(get_oai_chatmodel_tiktok(model))
    # The constant 2 accounts for the "role" and "content" keys.
    return 2 + sum(len(tokenizer.encode(text)) for text in message.values())


def get_oai_chatmodel_tiktok(aoaimodel: str) -> str:
    """Translate an Azure OpenAI model name into the name passed to tiktoken.

    Args:
        aoaimodel: Model name; must be a key of ``AOAI_2_OAI`` or of
            ``MODELS_2_TOKEN_LIMITS``.

    Returns:
        The mapped name from ``AOAI_2_OAI`` when there is one, otherwise
        ``aoaimodel`` unchanged.

    Raises:
        ValueError: If the name is empty, None, or not a known model.
    """
    error_text = "Expected Azure OpenAI ChatGPT model name"
    is_blank = aoaimodel == "" or aoaimodel is None
    if is_blank or not (
        aoaimodel in AOAI_2_OAI or aoaimodel in MODELS_2_TOKEN_LIMITS
    ):
        raise ValueError(error_text)
    mapped_name = AOAI_2_OAI.get(aoaimodel)
    return mapped_name if mapped_name else aoaimodel


def get_messages_from_history(
    system_prompt: str,
    model_id: str,
    history: list[dict[str, str]],
    user_conv: str,
    few_shots=None,
    max_tokens: int = 4096,
) -> list:
    """Assemble the chat message list for one model call.

    The result is ordered as: system prompt, few-shot examples (in the order
    given), earlier conversation turns (oldest first), and finally the
    current user message.

    Args:
        system_prompt: Content of the system message.
        model_id: Model name used for token counting.
        history: Conversation turns; each turn is a dict that may hold a
            "user" and/or a "bot" entry. The last turn is skipped here
            (``history[:-1]``); the current question arrives via ``user_conv``.
        user_conv: Content of the final user message.
        few_shots: Optional iterable of ``{"role": ..., "content": ...}``
            example messages. Defaults to no examples.
        max_tokens: Token budget. It is checked after each earlier turn has
            been inserted, so the result can exceed it by up to one turn.

    Returns:
        The list of message dictionaries.
    """
    # A None default avoids sharing one mutable list between calls; copying
    # also lets callers pass any iterable.
    few_shots = list(few_shots) if few_shots is not None else []
    message_builder = MessageBuilder(system_prompt, model_id)

    # Add examples to show the chat what responses we want. Each example is
    # inserted at its own position: inserting them all at the default index 1
    # would reverse their order and put assistant replies before the user
    # questions they answer.
    for position, shot in enumerate(few_shots, start=1):
        message_builder.append_message(
            shot.get("role"), shot.get("content"), index=position
        )

    # Everything added from here on goes right after the few-shot examples.
    append_index = len(few_shots) + 1
    message_builder.append_message(USER, user_conv, index=append_index)

    # Walk earlier turns newest-first, always inserting at the same index, so
    # older turns end up before newer ones. Within a turn the bot reply is
    # inserted first and the user message second, which places the user
    # message ahead of the reply.
    for turn in reversed(history[:-1]):
        if bot_msg := turn.get("bot"):
            message_builder.append_message(
                ASSISTANT, bot_msg, index=append_index
            )
        if user_msg := turn.get("user"):
            message_builder.append_message(USER, user_msg, index=append_index)
        if message_builder.token_length > max_tokens:
            break

    return message_builder.messages

In [9]:
# Resolve the token limits of the two configured models once, up front.
chatgpt_token_limit = get_token_limit(chatgpt_model)
gpt4_token_limit = get_token_limit(gpt4_model)

In [11]:
# The question used to exercise the pipeline end to end.
user_input = "Hello, tell me about our case studies around blockchain"

In [12]:
# Conversation history: one dict per turn. get_messages_from_history reads
# the "user" and "bot" entries of each turn; here there is only the first
# user turn.
history = [{"user": user_input}]

In [13]:
# System prompt for the first model call, which turns the conversation into a
# search query for the knowledge base.
search_query_prompt = """Below is a history of the conversation so far, and a new question asked by the user that needs to be answered by searching in a corporate knowledge base.
Generate a search query based on the conversation and the new question.
Do not include cited source filenames and document names e.g info.txt or doc.pdf in the search query terms.
Do no include any search operators like "site:" in the search query terms.
Do not include any text inside [] or <<>> in the search query terms.
Do not include any special characters like '+'.
If the question is not in English, translate the question to English before generating the search query.
If you cannot generate a search query, return just the number 0.
"""

# Few-shot examples: each user question is followed by the search query the
# assistant is expected to produce for it.
search_query_prompt_few_shots = [
    {
        "role": USER,
        "content": "What projects did SoftServe deliver in the retail industry?",
    },
    {
        "role": ASSISTANT,
        "content": "Show available case studies in the retail industry",
    },
    {"role": USER, "content": "What AI services does SoftServe offer?"},
    {
        "role": ASSISTANT,
        "content": "Describe AI services SoftServe provides",
    },
]

In [14]:
# The newest history entry holds the question being answered.
original_user_question = history[-1]["user"]

# Build the messages for the query-generation call: system prompt, few-shot
# examples, earlier turns, then the current question.
# NOTE(review): max_tokens subtracts len(search_query_prompt), a character
# count, from a token limit -- the units differ; confirm this is intended.
search_query_messages = get_messages_from_history(
    search_query_prompt,
    chatgpt_model,
    history,
    "Generate search query for: " + original_user_question,
    search_query_prompt_few_shots,
    max_tokens=chatgpt_token_limit - len(search_query_prompt),
)

In [15]:
# Ask the chat model for the search query. temperature=0.0 and a small
# max_tokens are used because only a short query string is expected back.
search_query_completion = openai.ChatCompletion.create(
    deployment_id=chatgpt_deployment,
    model=chatgpt_model,
    messages=search_query_messages,
    temperature=0.0,
    max_tokens=32,
    n=1,
)
# Text of the first (and only, n=1) choice is the generated search query.
search_query = search_query_completion.choices[0].message.content
search_query

'SoftServe blockchain case studies'

In [35]:
# System prompt for the data-source classifier. The numbers the model returns
# are translated to source names through `id_to_data_source` further down.
source_clf_prompt = """
You're an assistant for SoftServe employees, answering their corporate inquiries.
You have access to the following data sources:
1. SoftServe Website - contains SoftServe's case studies, whitepapers, and other marketing materials.
2. SoftServe Wikipedia page - contains general information about SoftServe, including history and leadership team.
Based on the User Question, classify the data sources that are most likely to contain the answer. Return the numbers of the data sources separated by commas. If you cannot classify the data sources, return just the number 0. Do not include any text before or after the numbers. If you need to return multiple numbers, return them in ascending order, separated by commas. For example, if you need to return 1 and 2, return "1,2".
"""

In [36]:
# Build the classifier messages: system prompt plus the generated search
# query. An empty history and no few-shot examples are passed.
# NOTE(review): max_tokens subtracts a character count (len of the prompt)
# from a token limit -- confirm this is intended.
source_clf_messages = get_messages_from_history(
    source_clf_prompt,
    chatgpt_model,
    [],
    "User Question: " + search_query,
    max_tokens=chatgpt_token_limit - len(source_clf_prompt),
)
source_clf_messages

[{'role': 'system',
  'content': '\nYou\'re an assistant for SoftServe employees, answering their corporate inquiries.\nYou have access to the following data sources:\n1. SoftServe Website - contains SoftServe\'s case studies, whitepapers, and other marketing materials.\n2. SoftServe Wikipedia page - contains general information about SoftServe, including history and leadership team.\nBased on the User Question, classify the data sources that are most likely to contain the answer. Return the numbers of the data sources separated by commas. If you cannot classify the data sources, return just the number 0. Do not include any text before or after the numbers. If you need to return multiple numbers, return them in ascending order, separated by commas. For example, if you need to return 1 and 2, return "1,2".\n'},
 {'role': 'user',
  'content': 'User Question: SoftServe blockchain case studies'}]

In [37]:
# Maps the numbers returned by the classifier to data-source names.
id_to_data_source = {
    "1": "SoftServe Website",
    "2": "Wikipedia",
    "0": "Unknown",
}

# Top-level `await` relies on the notebook's running event loop.
source_clf_completion = await openai.ChatCompletion.acreate(
    deployment_id=chatgpt_deployment,
    model=chatgpt_model,
    messages=source_clf_messages,
    temperature=0.0,
    max_tokens=32,
    n=1,
)
# The reply is a comma-separated list of numbers. Each is stripped and
# looked up; anything unrecognized becomes "Unknown", which makes search_idx
# drop its data-source filter altogether.
data_source_ids = source_clf_completion.choices[0].message.content
data_sources = [
    id_to_data_source.get(data_source_id.strip(), "Unknown")
    for data_source_id in data_source_ids.split(",")
]
data_sources

['SoftServe Website']

In [42]:
def search_idx(query_text, data_sources, top=5):
    """Run a combined text and vector search against the index.

    Args:
        query_text: Search text; also embedded once and used as the vector
            for the title, content and summary embedding fields.
        data_sources: Data-source names used to filter on the ``Storage``
            field. No filter is applied when "Unknown" is among them.
        top: Number of results requested, and ``k`` for each vector query.

    Returns:
        A 3-tuple ``(results_formatted, results, content)``:
        a list of dicts with "title", "content", "summary", "url" and
        "storage" keys; a list of "<url>: <content>" strings; and those
        strings joined with newlines.
    """

    def flatten(text: str) -> str:
        # Replace line breaks so each document stays on a single line.
        return text.replace("\n", " ").replace("\r", " ")

    query_vector = embed_query(query_text)
    vector_queries = [
        Vector(value=query_vector, k=top, fields=field_name)
        for field_name in (
            "title_embedding",
            "content_embedding",
            "summary_embedding",
        )
    ]

    if "Unknown" in data_sources:
        storage_filter = None
    else:
        storage_filter = f"search.in(Storage, '{','.join(data_sources)}', ',')"

    hits = search_client.search(
        query_text,
        filter=storage_filter,
        query_type=QueryType.SEMANTIC,
        query_language="en-us",
        query_speller="lexicon",
        semantic_configuration_name="default",
        top=top,
        query_caption=None,
        vectors=vector_queries,
    )

    results_formatted = []
    for doc in hits:
        results_formatted.append(
            {
                "title": doc["FileName"],
                "content": flatten(doc[content_field]),
                "summary": flatten(doc["Summary"]),
                "url": doc[sourcepage_field],
                "storage": doc["Storage"],
            }
        )

    results = [
        entry["url"] + ": " + entry["content"] for entry in results_formatted
    ]
    content = "\n".join(results)
    return results_formatted, results, content

In [43]:
# search_idx already returns the "<url>: <content>" strings (`results`) and
# their newline-joined form (`content`), so they are not rebuilt here; a
# second copy of that logic could silently drift from the function.
results_formatted, results, content = search_idx(
    search_query, data_sources, top=10
)

# Display one "<storage>: <title>: <url>" line per hit for a quick look.
[
    str(res["storage"]) + ": "
    + str(res["title"]) + ": " + str(res["url"])
    for res in results_formatted
]

['SoftServe Website: building-a-blockchain-based-exchange-platform-invoice-trading-case.pdf: https://www.softserveinc.com/files/cases/building-a-blockchain-based-exchange-platform-invoice-trading-case.pdf',
 'SoftServe Website: building-a-blockchain-based-exchange-platform-invoice-trading-case.pdf: https://www.softserveinc.com/files/cases/building-a-blockchain-based-exchange-platform-invoice-trading-case.pdf',
 'SoftServe Website: merging-iot-blockchain-smart-report-system.pdf: https://www.softserveinc.com/files/cases/merging-iot-blockchain-smart-report-system.pdf',
 'SoftServe Website: building-a-blockchain-based-exchange-platform-invoice-trading-case.pdf: https://www.softserveinc.com/files/cases/building-a-blockchain-based-exchange-platform-invoice-trading-case.pdf',
 'SoftServe Website: merging-iot-blockchain-smart-report-system.pdf: https://www.softserveinc.com/files/cases/merging-iot-blockchain-smart-report-system.pdf',
 'SoftServe Website: leading-coin-offering-asian-fintech-star

In [44]:
# System prompt template for the answer-generation call. It is filled in with
# str.format, which substitutes {follow_up_questions_prompt} and
# {injected_prompt}.
# The "Cite each source separately" rule used to list
# ((source: https://...)) as a valid example although the identical string
# appears under "Wrong:" below; the contradictory example has been removed.
ai_response_prompt_template = """
You're an assistant for SoftServe employees, answering their corporate inquiries.

In addition to the User Question, you receive a list of the most relevant Sources that may contain the answer. Each source has a URL followed by a colon and the actual source information. DO NOT include a list of Sources in your response.

Rules:
- Refer to SoftServe using terms like "we", "us", "SoftServe", or "SoftServe team". DO NOT use "they".
- DO NOT include any SoftServe employee personal information in your response or citations (e.g., names, titles, email addresses). If asked, reply with "I'm not allowed to share that information."
- You can mention names and titles if the information is from ((https://en.wikipedia.org/wiki/SoftServe)).
- Encourage precise queries for better answers. If necessary, request clarification.
- For data tables, use HTML format. No Markdown.
- Only use provided Sources for answers. If unsure, say "I don't know.".
- Use as much relevant information from ALL provided Sources as possible. But DO NOT include any irrelevant information from Sources.
- Cite each source separately with a full URL in double parentheses, e.g., ((https://softserveinc.com/items/60a77281))
- DO NOT include any additional text in parentheses.
- DO NOT combine multiple citations in parentheses.
- DO NOT combine multiple Sources in one citation.
- Always cite with the full source URL. DO NOT use abbreviations or incomplete URLs.
- DO NOT modify the original URLs when citing.
Examples of citations:
Correct: ((https://www.softserveinc.com/items/60a772914))
Wrong: ((softserveinc)), ((demand-analytics.pdf)), ((source)), ((source: https://softserveinc.highspot.com/items/64ce04cfc77183892e5ac400)), (source: ((https://softserveinc.highspot.com/items/6374afff5eca98c3bc16cf94))), ((https://softserveinc.highspot.com/items/60356d674cfd1a15042411ba), (https://www.softserveinc.com/resources/essential-data-engineering)).

{follow_up_questions_prompt}
{injected_prompt}
"""

In [45]:
# Fill both template placeholders with empty strings for this run.
ai_response_prompt = ai_response_prompt_template.format(
    injected_prompt="",
    follow_up_questions_prompt="",
)

# Build the answer-generation messages: system prompt, earlier turns, then
# the question followed by the retrieved sources.
# NOTE(review): max_tokens subtracts a character count (len of the prompt)
# from a token limit -- confirm this is intended.
ai_response_messages = get_messages_from_history(
    ai_response_prompt,
    gpt4_model,
    history,
    "User Question:\n" + original_user_question + "\n\nSources:\n" + content,
    max_tokens=gpt4_token_limit - len(ai_response_prompt),
)

In [46]:
# Generate the answer with the GPT-4 deployment. Top-level `await` relies on
# the notebook's running event loop.
ai_response_completion = await openai.ChatCompletion.acreate(
    deployment_id=gpt4_deployment,
    model=gpt4_model,
    messages=ai_response_messages,
    temperature=0.1,
    max_tokens=1024,
    n=1,
)

# Text of the first (and only, n=1) choice.
ai_response = ai_response_completion.choices[0].message.content

In [47]:
# print() is used so the line breaks in the answer are rendered.
print(ai_response)

SoftServe has been involved in several blockchain case studies, focusing on different aspects of this technology.

One case study involved building a blockchain-based exchange platform for invoice trading for a tech company based in Asia. The platform was designed to provide easy access to liquidity, accelerate interactions, and improve the world's trading infrastructure. It was built using blockchain technology, which guarantees fast settlement speed, asset tokenization, transparent rules of trade, and provides an asset management infrastructure to all ecosystem participants ((https://www.softserveinc.com/files/cases/building-a-blockchain-based-exchange-platform-invoice-trading-case.pdf)).

Another case study was a project for Carnegie Mellon, merging IoT and blockchain for a smart report system. The system was designed to manage data collected through IoT devices and stored securely using blockchain. The data could be used to create valuable reports for decision-making ((https://www.

In [None]:
# System prompt for the validation call, which checks a drafted Response
# against the rules below and rewrites only the parts that break them.
# The "Cite each source separately" rule used to list
# ((source: https://...)) as a valid example although the identical string
# appears under "Wrong:" below; the contradictory example has been removed.
response_val_prompt = """
You're an assistant for SoftServe employees, answering their corporate inquiries. 

You need to validate your past Response and ensure it complies with the Rules below. In addition to the Response, you receive a list of Source URLs used to generate the Response. DO NOT include a list of Source URLs in the new Response.

If the Response doesn't comply with the rules, rewrite ONLY the corresponding parts to meet the requirements and return the new Response. Make ONLY critical changes. If the Response does comply with the rules, return it without changes. DO NOT include any text before or after the new Response. 

Rules:
- Refer to SoftServe using terms like "we", "us", "SoftServe", or "SoftServe team". DO NOT use "they".
- Remove any SoftServe employee personal information from the new Response or citations (e.g., names, titles, email addresses).
- You can mention names and titles if the information is from ((https://en.wikipedia.org/wiki/SoftServe)).
- For data tables, use HTML format. No Markdown.
- Cite each source separately with a full URL in double parentheses, e.g., ((https://softserveinc.com/items/60a77281))
- DO NOT include any additional text in parentheses.
- DO NOT combine multiple citations in parentheses.
- DO NOT combine multiple sources in one citation.
- Always cite with the full source URL. DO NOT use abbreviations or incomplete URLs.
- DO NOT modify the original URLs when citing.
Examples of citations:
Correct: ((https://www.softserveinc.com/items/60a772914))
Wrong: ((softserveinc)), ((demand-analytics.pdf)), ((source)), ((source: https://softserveinc.highspot.com/items/64ce04cfc77183892e5ac400)), (source: ((https://softserveinc.highspot.com/items/6374afff5eca98c3bc16cf94))), ((https://softserveinc.highspot.com/items/60356d674cfd1a15042411ba), (https://www.softserveinc.com/resources/essential-data-engineering)).
"""

In [None]:
# Build the validation messages: system prompt plus the drafted answer and
# the URLs of the retrieved sources. An empty history is passed.
# The separator before "Source URLs:" used to be written "\n\n\Source", where
# "\S" is an invalid escape sequence: it put a literal backslash into the
# prompt and triggers a warning on newer Python versions.
# NOTE(review): max_tokens subtracts a character count (len of the prompt)
# from a token limit -- confirm this is intended.
response_val_messages = get_messages_from_history(
    response_val_prompt,
    chatgpt_model,
    [],
    "Response:\n"
    + ai_response
    + "\n\nSource URLs:\n"
    + "\n".join([res["url"] for res in results_formatted]),
    max_tokens=chatgpt_token_limit - len(response_val_prompt),
)

In [None]:
# Run the validation pass on the chat model deployment. Top-level `await`
# relies on the notebook's running event loop.
response_val_completion = await openai.ChatCompletion.acreate(
    deployment_id=chatgpt_deployment,
    model=chatgpt_model,
    messages=response_val_messages,
    temperature=0.1,
    max_tokens=1024,
    n=1,
)
# Text of the first (and only, n=1) choice is the validated answer.
validated_response = response_val_completion.choices[0].message.content

In [None]:
# print() is used so the line breaks in the answer are rendered.
print(validated_response)