In [None]:
# Import required libraries
import os
import json
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter

from langchain.document_loaders import PyPDFLoader
from tenacity import retry, wait_random_exponential, stop_after_attempt
from openai import AzureOpenAI
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters
)


import json

# Read key=value pairs from a local .env file (when one is present) into the
# process environment so the os.getenv() lookups in the configuration cell
# can resolve the Azure settings.
load_dotenv()

In [None]:
# Configure environment variables
# Azure AI Search endpoint and admin key; the key is used for both index
# management and document upload/query in this notebook.
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")

# Azure OpenAI deployment names, endpoint and key. os.getenv() returns None
# for a variable that is not set, so a missing value only surfaces at first use.
AZURE_OPENAI_EMBEDDINGS_ADA_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_EMBEDDINGS_ADA_DEPLOYMENT_NAME")
AZURE_OPENAI_GPT4_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_GPT4_DEPLOYMENT_NAME")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")

# Expected embedding vector length (1536 for text-embedding-ada-002); it must
# match the dimension declared on the index's vector field.
azure_openai_embedding_dimensions = 1536

# Azure AI Search index names may only contain lowercase letters, digits and
# dashes, so an underscore ("legal_clauses") is rejected by the service.
index_name = "legal-clauses"

In [None]:
# Azure OpenAI client used for every embedding request in this notebook
aoai_client = AzureOpenAI(
    api_version="2023-05-15",
    api_key=AZURE_OPENAI_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)

# Key credential shared by the Azure AI Search index and search clients
credential = AzureKeyCredential(key)


In [None]:
# Embedding helper backed by the Azure OpenAI embeddings deployment
@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(min=1, max=20))
def calc_embeddings(text):
    """Return the embedding vector for ``text``.

    Used for the document chunks that get indexed as well as for query text.
    A failing call is retried: at most 6 attempts, with waits chosen by
    ``wait_random_exponential(min=1, max=20)``.
    """
    response = aoai_client.embeddings.create(
        # On Azure the ``model`` argument carries the deployment name.
        model=AZURE_OPENAI_EMBEDDINGS_ADA_DEPLOYMENT_NAME,
        input=[text],
    )
    # A single input was sent, so the only item in ``data`` is the one we want.
    first_item = response.data[0]
    return first_item.embedding

In [None]:
import pandas as pd

# Split every clause PDF into chunks of at most 4000 characters with a
# 30-character overlap. RecursiveCharacterTextSplitter falls back through its
# default separators ["\n\n", "\n", " ", ""].
splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=30,
)

# os.scandir yields entries in arbitrary order; sorting by name keeps the
# generated ids stable between runs. Non-PDF files (e.g. .DS_Store) are
# skipped because PyPDFLoader cannot parse them.
with os.scandir("./data/clauses/") as entries:
    clause_files = sorted(
        (entry for entry in entries if entry.is_file() and entry.name.lower().endswith(".pdf")),
        key=lambda entry: entry.name,
    )

# Collect one record per chunk and build the DataFrame once at the end,
# instead of growing it row by row with df.loc[...].
records = []
for entry in clause_files:
    print(entry.name)
    loader = PyPDFLoader(entry.path)
    for chunk in loader.load_and_split(text_splitter=splitter):
        records.append({
            # Sequential string id across all files; used as the index key.
            "id": str(len(records)),
            "document_name": entry.name,
            "content": chunk.page_content,
            # Placeholder, filled in by the embedding cell.
            "embedding": "",
        })

# Passing `columns` keeps the expected schema even when no chunk was found.
df = pd.DataFrame(records, columns=['id', 'document_name', 'content', 'embedding'])
df.head()

In [None]:
# Embed every chunk with the Azure OpenAI ada deployment (one call per row)
df["embedding"] = df["content"].apply(calc_embeddings)

# Keep a CSV snapshot of the chunks together with their vectors
df.to_csv('./data/clauses_with_embeddings.csv', index=False)
print(df.head(2))

In [None]:
# Persist the chunks and their embeddings as a JSON array of records
output_path = os.path.join('.', 'data', 'clauses_with_embeddings.json')

with open(output_path, 'w') as f:
    # Serialise to a string first, then write it through the open handle.
    f.write(df.to_json(orient='records'))


In [None]:
# Create a search index
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# Index schema: a string key, two full-text searchable string fields and the
# vector field that stores the chunk embedding.
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    SearchableField(name="document_name", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchField(
        name="embedding",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        # Reuse the configured dimension instead of a second hard-coded 1536
        # so the field cannot drift from the embedding configuration.
        vector_search_dimensions=azure_openai_embedding_dimensions,
        vector_search_profile_name="myHnswProfile",
    ),
]

# Configure the vector search configuration
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw"
        )
    ],
    profiles=[
        # The profile ties the vector field to the HNSW algorithm and to the
        # vectorizer declared below.
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
            vectorizer="myVectorizer"
        )
    ],
    vectorizers=[
        AzureOpenAIVectorizer(
            name="myVectorizer",
            azure_open_ai_parameters=AzureOpenAIParameters(
                resource_uri=AZURE_OPENAI_ENDPOINT,
                deployment_id=AZURE_OPENAI_EMBEDDINGS_ADA_DEPLOYMENT_NAME,
                # NOTE(review): model_name receives the *deployment* name; this is
                # only right if the deployment is named after the model
                # (e.g. "text-embedding-ada-002") - confirm.
                model_name=AZURE_OPENAI_EMBEDDINGS_ADA_DEPLOYMENT_NAME,
                api_key=AZURE_OPENAI_API_KEY
            )
        )
    ]
)

# Semantic configuration: the short file name acts as the title and the chunk
# text is declared as the content field. (Previously the long chunk body was
# registered as the title and no content field was declared.)
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="document_name"),
        content_fields=[SemanticField(field_name="content")],
    )
)
# Create the semantic settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

# Azure AI Search index names may only contain lowercase letters, digits and
# dashes; "legal_clauses" (underscore) is rejected by the service.
index_name = "legal-clauses"

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

In [None]:
from azure.search.documents import SearchClient
import json

# Upload the embedded clause chunks to the index
output_path = os.path.join('.', 'data', 'clauses_with_embeddings.json')
with open(output_path, 'r') as file:
    documents = json.load(file)
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)

# Azure AI Search caps a single indexing request (1000 documents / 16 MB) and
# every document carries a full embedding vector, so send small batches.
# An empty document list results in no request at all.
UPLOAD_BATCH_SIZE = 100
uploaded_count = 0
for start in range(0, len(documents), UPLOAD_BATCH_SIZE):
    batch = documents[start:start + UPLOAD_BATCH_SIZE]
    result = search_client.upload_documents(documents=batch)
    # upload_documents returns one IndexingResult per document; count only
    # the ones the service actually accepted instead of assuming success.
    uploaded_count += sum(1 for item in result if item.succeeded)

print(f"Uploaded {uploaded_count} of {len(documents)} documents")

In [None]:
from azure.search.documents.models import VectorizedQuery


def do_search(query, fields, k_nearest_neighbors=3):
    """Embed ``query``, run a vector-only search and print the matches.

    Args:
        query: Text to embed with ``calc_embeddings`` and search for.
        fields: Name of the vector field(s) to search, passed straight to
            ``VectorizedQuery(fields=...)``.
        k_nearest_neighbors: Number of nearest neighbours requested from the
            vector query. Defaults to 3, the value that used to be hard-coded.

    Returns:
        None. Score, document name and content of every hit are printed.
    """
    embedding = calc_embeddings(query)
    vector_query = VectorizedQuery(
        vector=embedding,
        k_nearest_neighbors=k_nearest_neighbors,
        fields=fields,
    )

    # search_text=None makes this a pure vector query (no keyword matching).
    # Uses the module-level search_client created in another cell.
    results = search_client.search(
        search_text=None,
        vector_queries=[vector_query],
        select=["content", "document_name"],
    )

    for result in results:
        print(f"Score: {result['@search.score']}")
        print(f"Clause name: {result['document_name']}")
        print(f"content: {result['content']}")

In [None]:
# Check incoming documents for matching clauses.
# Each document is split into chunks of at most 4000 characters with a
# 30-character overlap, and every chunk is searched against the clause index.
from azure.search.documents import SearchClient
splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=30,
)
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)

# Sort for a reproducible output order (os.scandir order is arbitrary) and
# skip non-PDF files such as .DS_Store, which PyPDFLoader cannot parse.
with os.scandir("./data/documents/") as entries:
    incoming_files = sorted(
        (entry for entry in entries if entry.is_file() and entry.name.lower().endswith(".pdf")),
        key=lambda entry: entry.name,
    )

for entry in incoming_files:
    filename = entry.name
    loader = PyPDFLoader(entry.path)
    pages = loader.load_and_split(text_splitter=splitter)
    for page in pages:
        # Print the header before the matches it labels; it used to be
        # printed after do_search() had already written its results.
        print(f"Results for {filename}:")
        # do_search embeds the chunk and prints its nearest clause matches
        do_search(page.page_content, "embedding")