## Demo of connectors as a pip package!

Users need to install our package to use connectors in GenAI flows (the install command below is commented out because the package is not published yet):

In [1]:
# %pip install elastic_connectors
from connectors.loader.loader import GoogleDriveLoader, SharepointOnlineLoader, MongoDBLoader

In [3]:
# Configure the SharePoint Online loader. The "..." values are placeholders
# that must be replaced with real tenant/app credentials before running.
sharepoint_loader = SharepointOnlineLoader(
    tenant_id="...",
    tenant_name="enterprisesearch",
    client_id="...",
    secret_value="...",
    # "*" presumably selects every site collection -- confirm against the loader.
    site_collections="*",
)

In [None]:
# Run the SharePoint loader and keep the documents it returns.
sharepoint_docs = sharepoint_loader.load()

In [5]:
# Number of documents returned by the SharePoint loader.
len(sharepoint_docs)

11133

In [9]:
# Configure the loader for the `products` collection of the local MongoDB
# `product_catalog` database.
mongo_loader = MongoDBLoader(
    host="mongodb://127.0.0.1:27017",
    database="product_catalog",
    collection="products",
    # Fields that carry the meaningful content to embed as a vector.
    content_keys=["name", "description"],
)

In [10]:
# Load the MongoDB documents in sync mode and keep the result.
mongo_docs = mongo_loader.load()

In [11]:
# Number of documents loaded from MongoDB.
len(mongo_docs)

3

In [12]:
# Display the loaded documents in full; there are only a few, so the dump stays readable.
mongo_docs

[Document(page_content='Gadget A useful gadget', metadata={'name': 'Gadget', 'description': 'A useful gadget', 'price': 19.99, 'stock_count': 100, 'id': '664cb39a0f970cbd1799ea72'}),
 Document(page_content='Widget An essential widget', metadata={'name': 'Widget', 'description': 'An essential widget', 'price': 29.99, 'stock_count': 150, 'id': '664cb39a0f970cbd1799ea73'}),
 Document(page_content='Doodad A fancy doodad', metadata={'name': 'Doodad', 'description': 'A fancy doodad', 'price': 49.99, 'stock_count': 200, 'id': '664cb39a0f970cbd1799ea74'})]

## Splitting documents into chunks with LangChain utility functions!

In [21]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter built via the tiktoken encoder. The limits are dummy values, kept
# deliberately tiny so that even the short demo documents are forced to split.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=1,
    chunk_size=4,
)

# Load the MongoDB documents again and split them with the splitter above.
docs = mongo_loader.load_and_split(text_splitter=text_splitter)

In [22]:
# Number of chunks returned by load_and_split.
len(docs)

6

## Using any embeddings from HuggingFace

In [23]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Instantiate the embeddings with the library defaults (no model name is passed).
embeddings = HuggingFaceEmbeddings()



## Using LangChain Elasticsearch utilities

In [29]:
import os

# Connection settings for the Elasticsearch deployment used by this demo.
# Credentials must never be stored in the notebook, so the API key is read
# from the environment: export ES_API_KEY (and optionally ES_URL) before
# starting the kernel.
# NOTE: an API key was previously committed in this cell; it should be revoked.
es_url = os.environ.get(
    "ES_URL",
    "https://ee846f61f7db43bd9b65a78c5c250a2d.es.us-east-1.aws.elastic.cloud:443",
)
# None when ES_API_KEY is unset; requests to a secured deployment will then be rejected.
es_api_key = os.environ.get("ES_API_KEY")
# Name of the index field that stores the embedding vectors.
vector_field = "vector"
# Index the chunks are written to and searched in.
index_name = "connectors-notebook-test"

In [30]:
from langchain_elasticsearch import ElasticsearchStore

# Embed the chunks in `docs` and write them to the Elasticsearch index,
# keeping a handle to the resulting store.
db = ElasticsearchStore.from_documents(
    docs,
    embeddings,
    es_url=es_url,
    es_api_key=es_api_key,
    index_name=index_name,
    vector_query_field=vector_field,
)

## Semantic search

In [34]:
from langchain_elasticsearch import ElasticsearchRetriever

def vector_query(search_query, k=1, num_candidates=20):
    """Build the Elasticsearch kNN search body for a free-text query.

    Args:
        search_query: Text to search for; it is embedded with the
            notebook-level ``embeddings`` object.
        k: Number of nearest neighbours to return.
        num_candidates: Size of the candidate pool for the kNN search.
            Raised to ``k`` when smaller, because Elasticsearch rejects
            requests where ``k`` exceeds ``num_candidates``.

    Returns:
        dict: A request body with a single ``knn`` clause that targets the
        notebook-level ``vector_field``.
    """
    return {
        "knn": {
            "field": vector_field,
            "query_vector": embeddings.embed_query(search_query),
            "k": k,
            # Elasticsearch requires k <= num_candidates.
            "num_candidates": max(k, num_candidates),
        }
    }


# Retriever that sends the body built by `vector_query` to the demo index and
# uses the "text" field of each hit as the document content.
retriever = ElasticsearchRetriever.from_es_params(
    url=es_url,
    api_key=es_api_key,
    index_name=index_name,
    content_field="text",
    body_func=vector_query,
)

In [46]:
# NOTE: this rebinds `docs`, which until now held the chunks that were indexed;
# re-run the splitting cell before re-running the indexing cell.
docs = retriever.invoke("gadget")

# Each hit holds only part of the original text because the documents were chunked.
for hit in docs:
    print(f"Content: {hit.page_content}")
    print(f"ES Index: {hit.metadata.get('_index', '')}")

Content: Gadget A
ES Index: connectors-notebook-test
