In [28]:
import os
import dotenv
# Let python-dotenv populate the process environment, then read the two
# provider API keys from it. Each lookup yields None when the variable is unset.
dotenv.load_dotenv()
groq_key = os.environ.get("GROQ_API_KEY")
open_api_key = os.environ.get("OPENAI_API_KEY")

In [44]:
import langchain_groq
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
# Chat model that generates the final answer in the RAG chain below.
# NOTE(review): no API key is passed here; presumably ChatGroq reads
# GROQ_API_KEY from the environment populated by load_dotenv() — confirm.
llm = langchain_groq.ChatGroq(model="llama3-8b-8192")
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [30]:
# Load every PDF that sits directly inside `doc_path`.
# `docs` holds one entry per file; each entry is whatever `PyPDFLoader.load()`
# returns for that file (the next cell iterates over these entries).
doc_path = "./documents/"

# os.listdir() returns names in arbitrary order; sorting makes the document
# (and therefore chunk) order reproducible across runs and machines.
pdf_names = sorted(name for name in os.listdir(doc_path) if name.endswith(".pdf"))

docs = []
for pdf_name in pdf_names:
    # os.path.join keeps the path correct even if doc_path loses its trailing slash.
    loader = PyPDFLoader(os.path.join(doc_path, pdf_name))
    docs.append(loader.load())

In [31]:
# Break each loaded document into overlapping chunks and collect all chunks
# in a single flat list, `splits`, ready for embedding.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in
# characters (the splitter's default length function) — confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

splits = []
for pages in docs:
    # extend() keeps the list flat as we go; the previous sum(nested, [])
    # re-copied the accumulated list on every addition (quadratic time).
    splits.extend(text_splitter.split_documents(pages))

In [37]:
# Embed every chunk with the "all-MiniLM-L6-v2" sentence-transformer model and
# index the resulting vectors in a Chroma store.
# (OpenAIEmbeddings() was the alternative embedding tried here.)
# NOTE(review): no persist_directory is given, so the index presumably lives
# only in memory; re-running this cell may add the same chunks again — confirm.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_function)

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [40]:
# Expose the vector store as a retriever with its default search settings.
retriever = vectorstore.as_retriever()
# Fetch the shared "rlm/rag-prompt" template from the LangChain hub.
# NOTE(review): the chain below feeds it "context" and "question" inputs —
# confirm the pulled template still expects exactly those two variables.
prompt = hub.pull("rlm/rag-prompt")

In [41]:
def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values joined in order, separated by a blank line.
        An empty iterable yields an empty string.
    """
    page_texts = [item.page_content for item in docs]
    return "\n\n".join(page_texts)

In [45]:
# First stage of the chain: the incoming question is sent to the retriever
# (whose hits are flattened to text by format_docs) to fill "context", and is
# also passed through unchanged as "question".
retrieval_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: build prompt inputs -> render prompt -> call the LLM ->
# reduce the model output to a plain string.
rag_chain = retrieval_inputs | prompt | llm | StrOutputParser()

In [48]:
# Run one question through the RAG chain; the returned answer string is the
# cell's displayed output.
question = "Tell me about the Hubbard U correction"
rag_chain.invoke(question)

'The Hubbard U correction is a method used in DFT+U calculations to improve the treatment of Coulombic interactions of localized electrons, particularly in transition metals and main-group elements. It applies an energy penalty to the system to stabilize fully occupied or fully unoccupied orbitals, with the magnitude of the correction depending on the Ueff parameter. The correction is most commonly applied to valence orbitals of transition metals to obtain experimental bandgaps of oxides.'