### DESCRIPTION
Load tens of thousands of Wikipedia articles into Azure Data Explorer.
Harness its sub milisecond query capabilities to search your data and combine this with LLM to generate a response with Retrieval Augmented Generation pattern.
Use Azure Data Explorer vector store capabilities with embeddings together with Generative AI to generate answers.  


### PREPARATION
* An ADX (Azure Data Explorer or Kusto) cluster  
* In ADX, create a Database named "embeddings"  
    <img src="images/1.png" alt="Create Kusto cluster" /> 

* Create an AAD app registration for Authentication - see below   
    [Create an Azure Active Directory application registration in Azure Data Explorer](https://learn.microsoft.com/en-us/azure/data-explorer/provision-azure-ad-app)


In [1]:
from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
from azure.kusto.data.exceptions import KustoServiceError
from azure.kusto.data.helpers import dataframe_from_result_table

from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import AzureOpenAI

from IPython.display import display, HTML, JSON, Markdown

from dotenv import load_dotenv
import time
import tiktoken
import pandas as pd
from ast import literal_eval
import os
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Configure environment variables
load_dotenv()

AAD_TENANT_ID = os.getenv("AAD_TENANT_ID")
KUSTO_CLUSTER = os.getenv("KUSTO_CLUSTER")
KUSTO_DATABASE = os.getenv("KUSTO_DATABASE")
KUSTO_TABLE = os.getenv("KUSTO_TABLE")
KUSTO_MANAGED_IDENTITY_APP_ID = os.getenv("KUSTO_MANAGED_IDENTITY_APP_ID")
KUSTO_MANAGED_IDENTITY_SECRET = os.getenv("KUSTO_MANAGED_IDENTITY_SECRET")

# Configure OpenAI API
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_NAME = os.getenv("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.getenv("OPENAI_MODEL_NAME")
OPENAI_DEPLOYMENT_VERSION = os.getenv("OPENAI_DEPLOYMENT_VERSION")

OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_MODEL_NAME = os.getenv("OPENAI_ADA_EMBEDDING_MODEL_NAME")



In [2]:
embeddingmodel = AzureOpenAIEmbeddings(
    deployment=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    model=OPENAI_ADA_EMBEDDING_MODEL_NAME,
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    chunk_size = 1)


#### IMPORTANT!! Embeddings Creation Section - Run this only once !!!
You only need to run this once to create the embeddings and save them to Azure Data Explorer.   
Then you can use the already created database and table in Azure Data explorer for retrieval

In [3]:
# you can add as many urls as you want, but for this example we will only use one
# "moby dick" the book is available online at the URL below
urls = ["https://www.gutenberg.org/files/2701/2701-0.txt"]

loader = UnstructuredURLLoader(urls=urls)
documents = loader.load()

#we use chunk size of 1000 and 10% overlap to try not to cut sentences in the middle
#this regex separates by placing the sentence period when cutting a chunk at the end of that chunk
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100, separators=["\n\n", "\n", "(?<=\. )", " ", ""])
chunks = text_splitter.split_documents(documents)
len(chunks)

775

In [4]:
#we use the tenacity library to create delays and retries when calling openAI to avoid hitting throttling limits
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def calc_embeddings(text):
    deployment = OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME
    # replace newlines, which can negatively affect performance.
    text = text.replace("\n", " ")
    return embeddingmodel.embed_query(text)

In [5]:
#save all the chunks into a pandas dataframe
df = pd.DataFrame(columns=['document_name', 'content', 'embedding'])
for ch in chunks:
    dict = {'document_name': ch.metadata['source'],'content': ch.page_content, 'embedding': ""}
    temp_df = pd.DataFrame(dict, index=[0])
    df = pd.concat([df, temp_df], ignore_index=True)
df.head()

Unnamed: 0,document_name,content,embedding
0,https://www.gutenberg.org/files/2701/2701-0.txt,The Project Gutenberg eBook of Moby-Dick; or T...,
1,https://www.gutenberg.org/files/2701/2701-0.txt,Deck.\n\nCHAPTER 37. Sunset.\n\nCHAPTER 38. Du...,
2,https://www.gutenberg.org/files/2701/2701-0.txt,CHAPTER 92. Ambergris.\n\nCHAPTER 93. The Cast...,
3,https://www.gutenberg.org/files/2701/2701-0.txt,ETYMOLOGY.\n\n(Supplied by a Late Consumptive ...,
4,https://www.gutenberg.org/files/2701/2701-0.txt,EXTRACTS. (Supplied by a Sub-Sub-Librarian).\n...,


In [6]:
# calculate the embeddings using openAI
df["embedding"] = df.content.apply(lambda x: calc_embeddings(x))
df.to_csv('./data/adx/adx_embeddings.csv', index=False)
print(df.head(10))

                                     document_name  \
0  https://www.gutenberg.org/files/2701/2701-0.txt   
1  https://www.gutenberg.org/files/2701/2701-0.txt   
2  https://www.gutenberg.org/files/2701/2701-0.txt   
3  https://www.gutenberg.org/files/2701/2701-0.txt   
4  https://www.gutenberg.org/files/2701/2701-0.txt   
5  https://www.gutenberg.org/files/2701/2701-0.txt   
6  https://www.gutenberg.org/files/2701/2701-0.txt   
7  https://www.gutenberg.org/files/2701/2701-0.txt   
8  https://www.gutenberg.org/files/2701/2701-0.txt   
9  https://www.gutenberg.org/files/2701/2701-0.txt   

                                             content  \
0  The Project Gutenberg eBook of Moby-Dick; or T...   
1  Deck.\n\nCHAPTER 37. Sunset.\n\nCHAPTER 38. Du...   
2  CHAPTER 92. Ambergris.\n\nCHAPTER 93. The Cast...   
3  ETYMOLOGY.\n\n(Supplied by a Late Consumptive ...   
4  EXTRACTS. (Supplied by a Sub-Sub-Librarian).\n...   
5  “Now the Lord had prepared a great fish to swa...   
6  “Let us fl

In [7]:
#save to local file
df.to_csv('data/adx/adx_embeddings.csv', index=False)

### Ingest the embeddings into Azure Data Explorer


* Please use one click ingest in Azure Data explorer into a table called "books" by ingesting data from ["./data/adx/adx_embeddings.csv"](./data/wikipedia/vector_database_wikipedia_articles_embedded_1000.csv)   
    <img src="images/2.png" alt="Create Kusto cluster" /> 

In [11]:
# Connect to adx using AAD app registration
cluster = KUSTO_CLUSTER
kcsb = KustoConnectionStringBuilder.with_aad_application_key_authentication(cluster, KUSTO_MANAGED_IDENTITY_APP_ID, KUSTO_MANAGED_IDENTITY_SECRET,  AAD_TENANT_ID)
Kclient = KustoClient(kcsb)
kusto_db = KUSTO_DATABASE
table_name = "books"

In [13]:
#testing the connection to kusto works - sample query to get the top 10 results from wikipedia
query = table_name + " | take 10"

response = Kclient.execute(kusto_db, query)
for row in response.primary_results[0]:
    print("Title :{}".format(row["content"]))

ConnectionError: HTTPSConnectionPool(host='adxaidemo.swedencentral.kusto.windows.net', port=443): Max retries exceeded with url: /v1/rest/auth/metadata (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x2a77b6ac0>: Failed to resolve 'adxaidemo.swedencentral.kusto.windows.net' ([Errno 8] nodename nor servname provided, or not known)"))

In [None]:
def get_answer_from_adx(question, nr_of_answers=1):
        searchedEmbedding = calc_embeddings(question)
        kusto_query = table_name + " | extend similarity = series_cosine_similarity_fl(dynamic("+str(searchedEmbedding)+"), embedding,1,1) | top " + str(nr_of_answers) + " by similarity desc "
        print (kusto_query)
        response = Kclient.execute(kusto_db, kusto_query)

        for row in response.primary_results[0]:
                return row['content']

In [None]:
# here we get our answer but in a long and non concise way
get_answer_from_adx("Why does the coffin prepared for Queequeg become Ishmael's life buoy once the Pequod sinks?",1)

books | extend similarity = series_cosine_similarity_fl(dynamic([0.01388722391538789, -0.02785565915458321, 0.01125460704859909, -0.011153091962517278, -0.01700034452955324, 0.0024566581448270523, -0.020235282129429738, 0.012134402288222238, 0.0021724166488559975, 0.002757818511370902, 0.01796135202430634, 0.0334863457079393, 0.020993259452019217, 0.001238480487889455, -0.0024414307887825285, 0.0043008436754292155, 0.028776060987432597, 0.00841219302021106, 0.0006949533888357305, -0.015240754981629462, -0.0055393238140727255, 0.015389642719050595, 0.013602982419416834, -0.007153409119293172, -0.018286199182181113, 0.015592672891214217, 0.008452798682114777, -0.015633278553117934, 0.009007745832540635, 0.011999048529672316, 0.005251698823383868, -0.007945225114350763, -0.029263332655567274, -0.009224310914897988, -0.0313477684300433, -0.007985830776254478, 0.004629074793683049, -0.016729638875098436, 0.01312924659279841, -0.006033362471643008, -0.008303910013371123, -0.00270367747361219

'And thus the first man of the Pequod that mounted the mast to look out for the White Whale, on the White Whale’s own peculiar ground; that man was swallowed up in the deep. But few, perhaps, thought of that at the time. Indeed, in some sort, they were not grieved at this event, at least as a portent; for they regarded it, not as a foreshadowing of evil in the future, but as the fulfilment of an evil already presaged. They declared that now they knew the reason of those wild shrieks they had heard the night before. But again the old Manxman said nay.\n\nThe lost life-buoy was now to be replaced; Starbuck was directed to see to it; but as no cask of sufficient lightness could be found, and as in the feverish eagerness of what seemed the approaching crisis of the voyage, all hands were impatient of any toil but what was directly connected with its final end, whatever that might prove to be; therefore, they were going to leave the ship’s stern unprovided with a buoy, when by certain stran

In [None]:
# this is the question we want to ask and its embeddings
question = calc_embeddings("Why does the coffin prepared for Queequeg become Ishmael's life buoy once the Pequod sinks?")
print('Embeddings: {}'.format(question))

Embeddings: [0.01388722391538789, -0.02785565915458321, 0.01125460704859909, -0.011153091962517278, -0.01700034452955324, 0.0024566581448270523, -0.020235282129429738, 0.012134402288222238, 0.0021724166488559975, 0.002757818511370902, 0.01796135202430634, 0.0334863457079393, 0.020993259452019217, 0.001238480487889455, -0.0024414307887825285, 0.0043008436754292155, 0.028776060987432597, 0.00841219302021106, 0.0006949533888357305, -0.015240754981629462, -0.0055393238140727255, 0.015389642719050595, 0.013602982419416834, -0.007153409119293172, -0.018286199182181113, 0.015592672891214217, 0.008452798682114777, -0.015633278553117934, 0.009007745832540635, 0.011999048529672316, 0.005251698823383868, -0.007945225114350763, -0.029263332655567274, -0.009224310914897988, -0.0313477684300433, -0.007985830776254478, 0.004629074793683049, -0.016729638875098436, 0.01312924659279841, -0.006033362471643008, -0.008303910013371123, -0.0027036774736121936, -0.018245593520277395, 0.003080974154717402, -0.

In [None]:
# Configure OpenAI API
client = AzureOpenAI(
  azure_endpoint = OPENAI_DEPLOYMENT_ENDPOINT, 
  api_key=OPENAI_API_KEY,  
  api_version="2023-05-15"
)

In [None]:
def call_openAI(text, retrieved_answer):
    content = retrieved_answer + "\n\n" + text
    response = client.chat.completions.create(
        model=OPENAI_DEPLOYMENT_NAME,
        messages = [
            {"role": "system", "content": "You are a helpful assistant that answers questions only from the text provided in the Answers section below.Answer in a clear and concise manner in 1 sentence, providing answers only from the answers below. If the answer is not in the answers, please answer with 'I don't know'"},
            {"role": "user", "content": content}
        ],
        temperature=0.7,
        max_tokens=800,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )

    return response.choices[0].message.content


In [None]:
question = "Why does the coffin prepared for Queequeg become Ishmael's life buoy once the Pequod sinks?"
retrieved_answer_from_adx = get_answer_from_adx(question,1)
call_openAI(question, retrieved_answer_from_adx)

books | extend similarity = series_cosine_similarity_fl(dynamic([0.01388722391538789, -0.02785565915458321, 0.01125460704859909, -0.011153091962517278, -0.01700034452955324, 0.0024566581448270523, -0.020235282129429738, 0.012134402288222238, 0.0021724166488559975, 0.002757818511370902, 0.01796135202430634, 0.0334863457079393, 0.020993259452019217, 0.001238480487889455, -0.0024414307887825285, 0.0043008436754292155, 0.028776060987432597, 0.00841219302021106, 0.0006949533888357305, -0.015240754981629462, -0.0055393238140727255, 0.015389642719050595, 0.013602982419416834, -0.007153409119293172, -0.018286199182181113, 0.015592672891214217, 0.008452798682114777, -0.015633278553117934, 0.009007745832540635, 0.011999048529672316, 0.005251698823383868, -0.007945225114350763, -0.029263332655567274, -0.009224310914897988, -0.0313477684300433, -0.007985830776254478, 0.004629074793683049, -0.016729638875098436, 0.01312924659279841, -0.006033362471643008, -0.008303910013371123, -0.00270367747361219

'Because it floats and serves as a buoyant device.'

In [None]:
question = "Why does Ahab pursue Moby Dick?"
retrieved_answer_from_adx = get_answer_from_adx(question,1)
call_openAI(question, retrieved_answer_from_adx)

books | extend similarity = series_cosine_similarity_fl(dynamic([-0.007135458641673036, -0.027408593751558973, 0.001704801423539879, -0.019989824906102648, -0.035288566597174206, 0.032073327987012554, -0.030966439876747374, 0.025998632771504753, -0.013249683845805441, -0.001528556184617774, 0.024008875031449953, 0.03320656507685524, 0.02418017878896333, -0.0023060116188909395, 0.004967807570903933, 0.012228449476942191, 0.04535595178258036, -0.010284811764374234, -0.007715255867334924, -0.016431978780914246, 0.009144982773024068, -0.006964154884159661, 0.006997097627889888, -0.0035084317767624055, -0.006446949523511039, 0.008525654161753644, 0.0026963864382894354, -0.023165532596317075, 0.010324342684321457, 0.004618611879660187, 0.02817287248415216, -0.006529306848497916, -0.006736847903511327, 0.013901955200806091, -0.03141446566182705, 0.0025596727013911928, -0.015377802599643388, -0.013862423349536246, 0.020042534041129108, -0.02817287248415216, -0.007603249421065007, 0.00508969674

'Ahab pursues Moby Dick because he believes that the pursuit of the whale incites the hearts of his crew and breeds a generous knight-errantism in them, while also providing food for their daily appetites.'