In [1]:
from langchain.llms import Ollama

import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
PROJECT_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(PROJECT_DIR, 'data')
RAW_DIR = os.path.join(DATA_DIR, 'raw')
TEXT_DIR = os.path.join(DATA_DIR, 'text')

In [3]:
llm = Ollama(model="nous-hermes2")
# llm = Ollama(model="llama3", format='json')

In [4]:
files = {
    'pdf': [],
    'md': [],
    'txt': [],
    'json': []
}

for file in os.listdir(RAW_DIR):
    if file.endswith(".pdf"):
        files['pdf'].append(os.path.join(RAW_DIR, file))

    if file.endswith(".md"):
        files['md'].append(os.path.join(RAW_DIR, file))

    if file.endswith(".txt"):
        files['txt'].append(os.path.join(RAW_DIR, file))

    if file.endswith(".json"):
        files['json'].append(os.path.join(RAW_DIR, file))

In [5]:
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from typing import List
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

In [6]:
loader = PyMuPDFLoader(files['pdf'][0])
doc = loader.load()
# docs_list = [page for doc in docs for page in doc]
doc_list = [item for item in doc]

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=200
)
doc_splits = text_splitter.split_documents(doc_list)

embeddings = OllamaEmbeddings(model="nous-hermes2")

# Add to vectorDB
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma",
    embedding=embeddings,
)

In [7]:
retriever = vectorstore.as_retriever()

In [8]:
# Define your desired data structure.
class ExInfo(BaseModel):
    name: str = Field(description="research paper title, usually should be in the first page.")
    authors: str = Field(description="research paper authors, usually should be in the first page.")
    date: int = Field(description="research paper publication date in year-month format. look for the keyword 'published'")
    summary_abstract: str = Field(description="research paper summary abstract, usually should be in the first page.")
    limitation: str = Field(description="research paper limitation, usually should be at the end of the article.")

# And a query intented to prompt a language model to populate the data structure.
paper_query = f"Extract the information from the research paper"

# Set up a parser + inject instructions into the prompt template.
parser = JsonOutputParser(pydantic_object=ExInfo)

prompt = PromptTemplate(
    template="""
    <|begin_of_text|>
        <|start_header_id|>
            system
        <|end_header_id|> 
    You are an research assistant for extracting information from pdf file. 
    Use the following pieces of retrieved context to answer the question.
    Output should be in json.
    <|eot_id|>

    <|start_header_id|>user<|end_header_id|>
    {question}

    Here are the pdf reference:
    \n ------- \n
    {document}
    \n ------- \n

    Follow this output format:
    \n ------- \n
    \n{format_instructions}\n
    \n ------- \n
    <|eot_id|>
    
    <|start_header_id|>assistant<|end_header_id|>

    """,
    input_variables=["question", "document"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | llm | parser

docs = retriever.invoke(paper_query)
chain.invoke({
    "question": paper_query,
    "document": docs  
})

{'name': 'Fish Behavior Detection Using Convolutional Neural Networks',
 'authors': 'Rui-Jie Yuan, Zhi-Yu Duan, Jia-Xin Lin and Peng-Song Gao',
 'date': 20200814,
 'summary_abstract': 'This paper proposes a method for fish behavior detection based on convolutional neural networks. The method is composed of two parts: feature extraction and behavior recognition. The feature extraction part uses the visual geometry group (VGG) network to extract features from input images. Behavior recognition is achieved by using an SVM classifier with the extracted features as inputs.',
 'limitation': 'One limitation of our method is that it requires large amounts of labeled training data for behavior recognition. Additionally, the computation time and storage space requirements for the VGG network are relatively high.'}