In [None]:
# Install python dependencies
# MongoDB reccomends `python -m pip install "pymongo[srv]"==3.11` or whatever version of python you are using
%pip install pymongo
%pip install pypdf
%pip install langchain
%pip install langchain_community
%pip install langchain_openai


In [1]:
import os

from pymongo.mongo_client import MongoClient

# Set up the MongoDB Atlas connection.
# You can find the connection URI by going to your cluster and clicking Connect > Drivers.
# The URI embeds credentials, so it is read from the MONGODB_ATLAS_CLUSTER_URI
# environment variable instead of being pasted into (and saved with) the notebook.
MONGODB_ATLAS_CLUSTER_URI = os.environ.get("MONGODB_ATLAS_CLUSTER_URI", "")
if not MONGODB_ATLAS_CLUSTER_URI:
    # Fail here with an actionable message rather than deeper inside the driver.
    raise ValueError(
        "MONGODB_ATLAS_CLUSTER_URI is not set. Export it in the environment "
        "(or assign the connection string above) before running this cell."
    )

# Create a new client and connect to the server
client = MongoClient(MONGODB_ATLAS_CLUSTER_URI)

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)
    # Re-raise so "Run All" stops here instead of continuing with an unusable client.
    raise

Pinged your deployment. You successfully connected to MongoDB!


In [2]:
## Names used for storage and retrieval throughout the notebook.
DB_NAME = "SYNTHETIC_DATA"
COLLECTION_NAME = "INSURANCE_CLAIMS"

## Name of the Atlas vector search index. It is created by hand in the MongoDB web UI:
## insert the documents (done in a later cell) and then create the search index under this same name.
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

## Handles to the database and collection, looked up on the connected client.
db = client[DB_NAME]
MONGODB_COLLECTION = db[COLLECTION_NAME]

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_openai import OpenAIEmbeddings

# Pipeline for this cell: load the PDF -> split it into chunks -> embed the chunks
# and persist them in MongoDB Atlas.

# 1. Load the source PDF from its URL.
pdf_uri = "https://comm.unc.edu/wp-content/uploads/sites/388/2018/12/UNC-Department-of-Communication-Policy-Manual-Draft-11-10-15-ph-pp.pdf"
loader = PyPDFLoader(pdf_uri)
data = loader.load()

# 2. Split the loaded documents into chunks of up to 1000 characters with a
#    150-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=150,
    chunk_size=1000,
)
docs = text_splitter.split_documents(data)

# 3. Embed every chunk with OpenAI embeddings and insert the chunks, together with
#    their embeddings, into the Atlas collection under the configured index name.
vector_search = MongoDBAtlasVectorSearch.from_documents(
    collection=MONGODB_COLLECTION,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
    embedding=OpenAIEmbeddings(disallowed_special=()),
    documents=docs,
)

In [5]:
from langchain_core.documents import Document

# Define helper method for RAG with AIConfig
def get_knn_context_from_query(query: str, k: int = 1) -> str:
    """Return the text of the stored chunks most similar to ``query``.

    Runs a similarity search against the module-level ``vector_search`` store
    and joins the ``page_content`` of the hits into a single string.

    Args:
        query: Text to search for.
        k: Number of nearest chunks to retrieve. This value is forwarded to the
            search; it was previously ignored in favour of a hard-coded 2.

    Returns:
        The matching chunks' text joined with newlines, in the order the
        search returned them. Empty string if the search returns no hits.
    """
    results: list[Document] = vector_search.similarity_search(query=query, k=k)
    return "\n".join(doc.page_content for doc in results)

In [10]:
# Use AIConfig to generate synthetic data

from aiconfig import AIConfigRuntime

cove_config = AIConfigRuntime.load("/Users/ankush/Downloads/cove.aiconfig.json")

baseline_prompt = "List all the policies from this PDF: "
context = get_knn_context_from_query(baseline_prompt)
baseline_prompt += context
response = await cove_config.run("baseline_response_gen", {"baseline_prompt": baseline_prompt})
print(response)

[ExecuteResult(output_type='execute_result', execution_count=0, data='1. Records Retention and Disposal Policy\n2. Facilities Use Policy\n3. No Smoking Policy\n4. Personal Use Policy\n5. Sustainability Policy', mime_type=None, metadata={'id': 'chatcmpl-8kJFxEki6ILdJulMblHuSJljrlrhT', 'created': 1706047801, 'model': 'gpt-4-0613', 'object': 'chat.completion.chunk', 'raw_response': {'content': '1. Records Retention and Disposal Policy\n2. Facilities Use Policy\n3. No Smoking Policy\n4. Personal Use Policy\n5. Sustainability Policy', 'role': 'assistant'}, 'role': 'assistant'})]
