## 0.1 imports 

In [3]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_milvus import Milvus
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain import hub
from langchain_community.llms import Ollama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

from pathlib import Path
import os

from experimental_data_digitalization.params import *

# Location of the paper used as the first test document, resolved under the
# project's LOCAL_DATA_PATH directory.
nchem_path = LOCAL_DATA_PATH / 'nchem.2771.pdf'
nchem_path

PosixPath('/home/jerome-roeser/code/jerome-roeser/11-Personal-Projects/git_repos/Experimental-Data-Digitalization/data/pdfs/cofs/nchem.2771.pdf')

`pseudo-code`
1. load one PDF paper first: the nchem SiCOF paper
2. load document
3. tokenize
4. embed
5. store in vector DB 
6. query info with similarity search
7. Is that it?

# 1 Website tutorial 

## 1.1 Preparation (data, embedding)

In [None]:
# Download the Milvus 2.4.x documentation archive and unpack it quietly into ./milvus_docs.
!wget https://github.com/milvus-io/milvus-docs/releases/download/v2.4.6-preview/milvus_docs_2.4.x_en.zip
!unzip -q milvus_docs_2.4.x_en.zip -d milvus_docs


In [2]:
from glob import glob


def load_markdown_sections(pattern, encoding="utf-8"):
    """Read every file matching ``pattern`` and split its text on ``"# "``.

    Args:
        pattern: Glob pattern selecting the markdown files to read.
        encoding: Text encoding used to decode the files. Passed explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        A flat list with the pieces of all files. Files are visited in sorted
        path order so the list (and any ids derived from positions in it) is
        the same on every run. Because the separator is the plain string
        ``"# "``, deeper headings such as ``"## "`` are split as well and the
        text before a file's first heading is kept as its own piece.
    """
    sections = []
    for file_path in sorted(glob(pattern)):
        # The context manager closes the file as soon as it has been read.
        with open(file_path, "r", encoding=encoding) as file:
            sections.extend(file.read().split("# "))
    return sections


# Heading-delimited chunks of the Milvus FAQ pages; embedded further down.
text_lines = load_markdown_sections("milvus_docs/en/faq/*.md")

In [3]:
from openai import OpenAI

# Client shared by the embedding and chat-completion calls in this section.
# NOTE(review): no credentials are passed here — presumably they are read from
# the environment (e.g. OPENAI_API_KEY); confirm before running.
openai_client = OpenAI()

In [4]:
def emb_text(text):
    """Return the embedding vector of ``text``.

    Sends ``text`` to the ``text-embedding-3-small`` model through the
    module-level ``openai_client`` and returns the ``embedding`` attribute of
    the first entry in the response's ``data``.
    """
    response = openai_client.embeddings.create(
        model="text-embedding-3-small", input=text
    )
    first_item = response.data[0]
    return first_item.embedding

In [None]:
# Embed a throwaway sentence to find out how long the embedding vectors are;
# `embedding_dim` is used as the collection dimension further down.
test_embedding = emb_text("This is a test")
embedding_dim = len(test_embedding)
print(embedding_dim)
# Peek at the first ten components only.
print(test_embedding[:10])

## 1.2 Load data into Milvus
### Create the Collection

In [None]:
from pymilvus import MilvusClient

# Open a client on "milvus_demo.db".
# NOTE(review): presumably a local Milvus Lite database file — confirm.
milvus_client = MilvusClient("milvus_demo.db")

# Name of the collection used throughout the tutorial section.
collection_name = "my_rag_collection"
# Show which collections are currently present.
milvus_client.list_collections()

In [7]:
# Start from a clean slate: remove the collection if an earlier run left it behind.
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

In [8]:
# Create the collection, sized to the embedding length measured with the test embedding.
milvus_client.create_collection(
    collection_name=collection_name,
    dimension=embedding_dim,
    metric_type="IP",  # Inner product distance
    consistency_level="Strong",  # Strong consistency level
)

### Insert data

In [None]:
from tqdm import tqdm

# One record per text chunk: its position as id, its embedding, and the raw text.
data = [
    {"id": idx, "vector": emb_text(chunk), "text": chunk}
    for idx, chunk in enumerate(tqdm(text_lines, desc="Creating embeddings"))
]

# Store all records in the tutorial collection in a single call.
milvus_client.insert(collection_name=collection_name, data=data)

## 1.3 Build RAG
### Retrieve data for a query
Let’s specify a frequent question about Milvus.

In [10]:
# Question used both for the vector search and for the final LLM prompt.
question = "How is data stored in milvus?"


In [11]:
# Embed the question once, then ask Milvus for the closest stored chunks.
question_vector = emb_text(question)

search_res = milvus_client.search(
    collection_name=collection_name,
    data=[question_vector],
    limit=3,  # keep the three best matches
    search_params={"metric_type": "IP", "params": {}},  # inner product, as in create_collection
    output_fields=["text"],  # return the stored chunk text with each hit
)

In [None]:
import json

# Pair the text of every hit for the (single) query with its distance score.
retrieved_lines_with_distances = []
for hit in search_res[0]:
    retrieved_lines_with_distances.append((hit["entity"]["text"], hit["distance"]))
print(json.dumps(retrieved_lines_with_distances, indent=4))


## 1.4 Use LLM to get a RAG response
### Convert the retrieved documents into a string format.

In [14]:
# Keep only the text of each (text, distance) pair, one retrieved chunk per line.
context = "\n".join(text for text, _distance in retrieved_lines_with_distances)

In [15]:
# System message: tells the model to answer from the supplied passages.
SYSTEM_PROMPT = """
Human: You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided.
"""
# User message. This is an f-string, so `context` and `question` are substituted
# at the moment this cell runs; re-run the cell after changing either of them.
USER_PROMPT = f"""
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>
"""


In [None]:
# Conversation sent to the chat model: instructions first, then the question with its context.
chat_messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": USER_PROMPT},
]
response = openai_client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=chat_messages,
)
# Show the text of the first returned choice.
print(response.choices[0].message.content)


# 2. RAG with PDF paper

In [6]:
from pymilvus import MilvusClient

# Handle on the same 'milvus_demo.db' file used in section 1 (there the client is named `milvus_client`).
client = MilvusClient('milvus_demo.db')
# Show which collections are currently present.
client.list_collections()

['digitalization']

In [7]:
# Load the PDF at `nchem_path`; the recorded output shows one Document per page with the PDF metadata attached.
loader = PyMuPDFLoader(file_path= nchem_path)
data = loader.load()
data

[Document(metadata={'source': '/home/jerome-roeser/code/jerome-roeser/11-Personal-Projects/git_repos/Experimental-Data-Digitalization/data/pdfs/cofs/nchem.2771.pdf', 'file_path': '/home/jerome-roeser/code/jerome-roeser/11-Personal-Projects/git_repos/Experimental-Data-Digitalization/data/pdfs/cofs/nchem.2771.pdf', 'page': 0, 'total_pages': 7, 'format': 'PDF 1.7', 'title': 'Anionic silicate organic frameworks constructed from hexacoordinate silicon centres', 'author': 'Jérôme Roeser', 'subject': 'Nature Chemistry (2017). doi:10.1038/nchem.2771', 'keywords': '', 'creator': 'Arbortext Advanced Print Publisher 10.0.1465/W Unicode', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creationDate': "D:20170505164942+05'30'", 'modDate': "D:20170508154851+01'00'", 'trapped': ''}, page_content='Anionic silicate organic frameworks constructed\nfrom hexacoordinate silicon centres\nJérôme Roeser1*, Dragica Prill2, Michael J. Bojdys3,4, Pierre Fayon5, Abbie Trewin5, Andrew N. Fitch6,\nMartin U. Schmi

In [12]:
from experimental_data_digitalization.utils import num_tokens_from_string

# Token count of the first page's text, using the project helper
# (NOTE(review): the tokenizer it uses is not visible here — confirm it matches the target model).
num_tokens_from_string(data[0].page_content)

1603

In [8]:
# Embedding model used for the PDF chunks and for queries against them.
# NOTE(review): the recorded output below echoes this line, which looks like a
# deprecation warning for this class — check the installed LangChain version.
embeddings = HuggingFaceEmbeddings(model_name= "all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(model_name= "all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange


In [9]:
# Cut every loaded page into overlapping chunks before embedding.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in characters
# (the splitter's default length function), not tokens — confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
docs = text_splitter.split_documents(data)

In [14]:
# Inspect the second chunk to eyeball the result of the split.
docs[1]

Document(metadata={'source': '/home/jerome-roeser/code/jerome-roeser/11-Personal-Projects/git_repos/Experimental-Data-Digitalization/data/pdfs/cofs/nchem.2771.pdf', 'file_path': '/home/jerome-roeser/code/jerome-roeser/11-Personal-Projects/git_repos/Experimental-Data-Digitalization/data/pdfs/cofs/nchem.2771.pdf', 'page': 0, 'total_pages': 7, 'format': 'PDF 1.7', 'title': 'Anionic silicate organic frameworks constructed from hexacoordinate silicon centres', 'author': 'Jérôme Roeser', 'subject': 'Nature Chemistry (2017). doi:10.1038/nchem.2771', 'keywords': '', 'creator': 'Arbortext Advanced Print Publisher 10.0.1465/W Unicode', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creationDate': "D:20170505164942+05'30'", 'modDate': "D:20170508154851+01'00'", 'trapped': ''}, page_content='demonstrate the simple one-pot synthesis of silicate organic frameworks based on octahedral dianionic SiO6 building units.\nClear evidence of the hexacoordinate environment around the silicon atoms is given

In [15]:
# Embed the PDF chunks and store them in the "digitalization" collection of the
# local Milvus database file.
# drop_old=True rebuilds the collection on every run. Without it the chunks are
# appended to the collection that already exists in the file, so each re-run of
# this cell would store the same chunks again and similarity searches would
# return duplicates.
vectorstore = Milvus.from_documents(
    documents=docs,
    embedding=embeddings,
    collection_name="digitalization",
    connection_args={"uri": './milvus_demo.db'},
    drop_old=True,
)

# Clean up a leftover collection, but only when it is actually present
# (same guard as used for the tutorial collection in section 1).
if client.has_collection('rag_milvus_webinar'):
    client.drop_collection('rag_milvus_webinar')
client.list_collections()

['digitalization']

In [16]:
# Sanity check of the vector store: fetch the single closest chunk (k=1) for a free-text question.
query = "What is a COvalent Organic Framework?"
res = vectorstore.similarity_search(query, k=1)
res

[Document(metadata={'author': 'Jérôme Roeser', 'creationDate': "D:20170505164942+05'30'", 'creator': 'Arbortext Advanced Print Publisher 10.0.1465/W Unicode', 'file_path': '/home/jerome-roeser/code/jerome-roeser/11-Personal-Projects/git_repos/Experimental-Data-Digitalization/data/pdfs/cofs/nchem.2771.pdf', 'format': 'PDF 1.7', 'keywords': '', 'modDate': "D:20170508154851+01'00'", 'page': 5, 'pk': 453558599605813312, 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'source': '/home/jerome-roeser/code/jerome-roeser/11-Personal-Projects/git_repos/Experimental-Data-Digitalization/data/pdfs/cofs/nchem.2771.pdf', 'subject': 'Nature Chemistry (2017). doi:10.1038/nchem.2771', 'title': 'Anionic silicate organic frameworks constructed from hexacoordinate silicon centres', 'total_pages': 7, 'trapped': ''}, page_content='423, 705–714 (2003).\n10. Côté, A. P. et al. Porous, crystalline, covalent organic frameworks. Science\n310, 1166–1170 (2005).\n11. El-Kaderi, H. M. et al. Designed synthesis of 3

In [19]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI


PROMPT_TEMPLATE = """
Human: You are an AI assistant, and provides answers to questions by using fact based and statistical information when possible.
Use the following pieces of information to provide a concise answer to the question enclosed in <question> tags.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
<context>
{context}
</context>

<question>
{question}
</question>

The response should be specific and use statistics or numbers when possible.

Assistant:"""

prompt = PromptTemplate(
    template=PROMPT_TEMPLATE, input_variables=["context", "question"]
)

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
retriever = vectorstore.as_retriever()


def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line."""
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

def count_tokens(docs) -> int:
    """Return the total token count over all documents.

    Each document's ``page_content`` is measured with
    ``num_tokens_from_string`` and the per-document counts are added up.
    An empty collection yields 0.
    """
    total = 0
    for doc in docs:
        total += num_tokens_from_string(doc.page_content)
    return total

# Join ALL chunks (not only retrieved ones) and count their tokens to gauge the size of the whole paper.
formatted_docs = format_docs(docs)
count_tokens(docs)

10055

In [24]:
# Map the incoming question onto the two prompt variables: the retrieved chunks
# joined by format_docs become the context, and the question is passed through unchanged.
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Fill the prompt, call the chat model, and reduce its message to plain text.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

# NOTE(review): the recorded answer below lists generic COF-1..COF-5 names with
# round surface areas — verify it against the paper before relying on it.
query = "HOw many distinct COFs are reported in this research study? if there are some list them in a structured output along with their name and their specific surface area"
res = rag_chain.invoke(query)
print(res)

In this research study, a total of 5 distinct COFs are reported. Here is a structured output of the COFs along with their names and specific surface areas:

1. COF-1: Specific surface area - 800 m^2/g
2. COF-2: Specific surface area - 750 m^2/g
3. COF-3: Specific surface area - 820 m^2/g
4. COF-4: Specific surface area - 780 m^2/g
5. COF-5: Specific surface area - 790 m^2/g
