In [1]:
import os
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
import openai

# Define the target directory
target_directory = r'C:\Users\pablosal\Desktop\sharepoint-indexing-azure-cognitive-search'

# Load .env file
load_dotenv()

# Check if the directory exists
if os.path.exists(target_directory):
    # Change the current working directory
    os.chdir(target_directory)
    print(f"Directory changed to {os.getcwd()}")
else:
    print(f"Directory {target_directory} does not exist.")

Directory changed to C:\Users\pablosal\Desktop\sharepoint-indexing-azure-cognitive-search


In [2]:
from gbb_ai.azure_search_sharepoint_extension import retrieve_sharepoint_files_content

In [3]:
site_domain = 'mngenvmcap747548.sharepoint.com'
site_name = 'Contoso'

In [4]:
content_files = retrieve_sharepoint_files_content(site_domain=site_domain, site_name=site_name, minutes_ago=None,file_formats=["docx"])

2023-12-09 16:25:47,221 - micro - MainProcess - INFO     New access token retrieved.... (azure_search_sharepoint_extension.py:msgraph_auth:64)
2023-12-09 16:25:47,225 - micro - MainProcess - INFO     Decoded Access Token:
{
  "aud": "https://graph.microsoft.com",
  "iss": "https://sts.windows.net/9495d8c9-4ebb-4107-b905-c7b45d1b7b7a/",
  "iat": 1702160447,
  "nbf": 1702160447,
  "exp": 1702164347,
  "aio": "E2VgYCiPtH8W/rjJOujhzettlsbXAQ==",
  "app_displayname": "dev-graph",
  "appid": "118583ee-94ed-45dd-870b-73784045eb37",
  "appidacr": "1",
  "idp": "https://sts.windows.net/9495d8c9-4ebb-4107-b905-c7b45d1b7b7a/",
  "idtyp": "app",
  "oid": "4f614374-65fa-45fc-8369-cb616a6fe08f",
  "rh": "0.Ab0AydiVlLtOB0G5Bce0XRt7egMAAAAAAAAAwAAAAAAAAADLAAA.",
  "roles": [
    "TeamsActivity.Read.All",
    "SharePointTenantSettings.Read.All",
    "People.Read.All",
    "Sites.Read.All",
    "Sites.Manage.All",
    "Directory.Read.All",
    "OnlineMeetingTranscript.Read.All",
    "BrowserSiteLists.Re

In [5]:
type(content_files[0])

TypeError: 'NoneType' object is not subscriptable

In [None]:
OPENAI_KEY = os.getenv("OPENAI_KEY")

In [None]:
# split documents into text and embeddings
text_splitter = RecursiveCharacterTextSplitter(
   chunk_size=1000, 
   chunk_overlap=50
)

chunks = text_splitter.split_documents(content_files)

In [13]:
chunks

[Document(page_content='A large language model (LLM) is a type of language model notable for its ability to achieve general-purpose language understanding and generation. LLMs acquire these abilities by using massive amounts of data to learn billions of parameters during training and consuming large computational resources during their training and operation.[1] LLMs are artificial neural networks (mainly transformers[2]) and are (pre-)trained using self-supervised learning and semi-supervised learning.\nAs autoregressive language models, they work by taking an input text and repeatedly predicting the next token or word.[3] Up to 2020, fine tuning was the only way a model could be adapted to be able to accomplish specific tasks. Larger sized models, such as GPT-3, however, can be prompt-engineered to achieve similar results.[4] They are thought to acquire knowledge about syntax, semantics and "ontology" inherent in human language corpora, but also inaccuracies and biases present in the

In [14]:
openai.api_type = "azure"
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = os.getenv("OPENAI_ENDPOINT")
openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION") 
deployment="foundational-ada"
model_name="text-embedding-ada-002"
embeddings = OpenAIEmbeddings(
                deployment=deployment,
                model=model_name,
                openai_api_base=os.getenv("OPENAI_ENDPOINT"),
                openai_api_type="azure",
                show_progress_bar=True,
                chunk_size=1000,
            )

In [15]:
from langchain.vectorstores.azuresearch import AzureSearch

In [16]:
from azure.search.documents.indexes.models import (
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
    SemanticConfiguration,
    SemanticField,
    SemanticSettings,
    PrioritizedFields)
from azure.search.documents.models import Vector

In [17]:
embedding_function = embeddings.embed_query

In [18]:
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=len(embedding_function("Text")),
        vector_search_configuration="default",
    ),
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field for filtering on document source
    SimpleField(
        name="source",
        type=SearchFieldDataType.String,
        filterable=True,
    ),
    SimpleField(
        name="security_group",
        type=SearchFieldDataType.String,
        filterable=True,
    ),
]

index_name: str = "langchain-vector-demo-custom"

vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint= os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT"),
    azure_search_key= os.getenv("AZURE_SEARCH_ADMIN_KEY"),
    index_name=index_name,
    embedding_function=embedding_function,
    fields=fields,
    semantic_settings=SemanticSettings(
        default_configuration="config",
        configurations=[
            SemanticConfiguration(
                name="config",
                prioritized_fields=PrioritizedFields(
                    title_field=SemanticField(field_name="content"),
                    prioritized_content_fields=[
                        SemanticField(field_name="content")
                    ],
                    prioritized_keywords_fields=[
                        SemanticField(field_name="metadata")
                    ],
                ),
            )
        ],
    ),
)


  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 1/1 [00:01<00:00,  1.06s/it]
100%|██████████| 1/1 [00:00<00:00, 12.43it/s]


In [19]:
vector_store.add_documents(chunks)

100%|██████████| 1/1 [00:00<00:00, 11.21it/s]
100%|██████████| 1/1 [00:00<00:00, 12.29it/s]
100%|██████████| 1/1 [00:00<00:00, 11.44it/s]
100%|██████████| 1/1 [00:00<00:00, 10.82it/s]
100%|██████████| 1/1 [00:00<00:00, 10.86it/s]
100%|██████████| 1/1 [00:00<00:00, 12.03it/s]
100%|██████████| 1/1 [00:00<00:00, 12.46it/s]
100%|██████████| 1/1 [00:00<00:00, 11.35it/s]
100%|██████████| 1/1 [00:00<00:00, 12.02it/s]
100%|██████████| 1/1 [00:00<00:00, 11.88it/s]
100%|██████████| 1/1 [00:00<00:00, 11.77it/s]
100%|██████████| 1/1 [00:00<00:00,  4.43it/s]
100%|██████████| 1/1 [00:00<00:00, 10.89it/s]
100%|██████████| 1/1 [00:00<00:00, 10.90it/s]
100%|██████████| 1/1 [00:00<00:00, 11.48it/s]
100%|██████████| 1/1 [00:00<00:00, 10.67it/s]
100%|██████████| 1/1 [00:00<00:00, 11.11it/s]
100%|██████████| 1/1 [00:00<00:00, 12.18it/s]
100%|██████████| 1/1 [00:00<00:00, 11.05it/s]
100%|██████████| 1/1 [00:00<00:00, 12.80it/s]
100%|██████████| 1/1 [00:00<00:00, 10.39it/s]
100%|██████████| 1/1 [00:00<00:00,

['Yzg4NmI1YWYtMGI0YS00YTMzLTk4YjYtY2JiNDJkYjI3MWVj',
 'MjdkN2E3MjctNmU4Yi00NTcwLTg0YTEtM2Y5NjFhZDE3MDIz',
 'ZjgzMDgyMjktZTI1Mi00OTBiLTg2MmItNDAzY2YyMDQxYjU5',
 'ZWE5ZjBjNGUtN2M3Yy00OTQyLThjN2QtOTZkMjRmOGYzYmQ1',
 'MzEzOGQ3OTktZDc1OC00MDdkLWE2NmQtZGZkNTAwNDU1MGRm',
 'YzM3MDhmYzUtN2M5YS00ZTRlLThjYWUtNzFlZjQzZWNiZjYy',
 'ZWJmNzk5MDQtODEwZS00MDNjLWFhMjEtMmZhZDhhMTA2NzAx',
 'NDkwM2IzMjctODEwOS00ZjFkLTk2ZDYtNzk4NzNmNTQ5MmU3',
 'Y2UzYjM1OWUtZmNjNS00NmViLWE0MDYtMTBmZDUyZTllY2E0',
 'YWViNzc1ZmYtZmUyNi00N2M0LWIwODQtOTQ3OTA4MTE4NjMw',
 'YzczNGVlNjUtYTk4Yy00NTBmLWJkMTEtODIzYWFkNTExY2I2',
 'ZTRmNDI4ZTYtMzljYi00NzY3LWJkZDQtYWI0NWEwZTJiODRh',
 'MTRlZDkxMDAtNzUzOS00NGQ1LWI3NmYtYzU1YzljNDNhNTQy',
 'NmQ0YTlkM2YtMjhlYS00NDU4LThmMmYtNGU0NzUxMDhmZDcy',
 'NWRhY2E3MWUtNmY4OC00ZGQ0LTllYzItNjk4NmZhMWM0NGRh',
 'YjIzMDgxNjQtMzU2Mi00NGJkLThiYzktODY0MWQ2NWYxNTVl',
 'ZjU2N2ZjOWMtMDRiNi00NzkwLWJmMmEtMGIxZGNkNTNjMTU3',
 'ODM0NzhlYzctZjM5ZC00NzQ3LThkZTItNzFhYTMzY2IwOTZi',
 'ZjI2MmUyNDYtYzBmNi00NDBmLWI0ZjMtNWE4MTAyMGU1

## Search 

In [29]:
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential

In [31]:
search_client = SearchClient(endpoint=os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT"),
                      index_name=os.getenv("AZURE_COGNITIVE_SEARCH_INDEX_NAME"),
                      credential=AzureKeyCredential(os.getenv("AZURE_SEARCH_ADMIN_KEY")))

In [32]:
def get_embeddings(text: str):
    # There are a few ways to get embeddings. This is just one example.
    openai.api_type = "azure"
    openai.api_key = os.getenv("OPENAI_API_KEY")
    openai.api_base = os.getenv("OPENAI_ENDPOINT")
    openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION") 
    deployment="foundational-ada"
    model_name="text-embedding-ada-002"
    embeddings = OpenAIEmbeddings(
                    deployment=deployment,
                    model=model_name,
                    openai_api_base=os.getenv("OPENAI_ENDPOINT"),
                    openai_api_type="azure",
                    show_progress_bar=True,
                    chunk_size=1000,
                )
    embedding = embeddings.embed_query(text)
    return embedding

In [33]:
search_query = "LLM is a master of laws"

In [38]:
# hybrid retrieval + rerank 
r = search_client.search(
        search_text=search_query,
        top=5, 
        vectors=[Vector(value=get_embeddings(search_query), k=50, fields="content_vector")],
        query_type="semantic",
        semantic_configuration_name="config",
        query_language="en-us",
        filter="security_group eq 'group2'",)

for doc in r:
    content = doc["content"].replace("\n", " ")[:1000]
    print(f"score: {doc['@search.score']}, reranker: {doc['@search.reranker_score']}. {content}")

100%|██████████| 1/1 [00:00<00:00, 10.25it/s]


