In [1]:
from langchain.llms import Ollama

import os
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
PROJECT_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(PROJECT_DIR, 'data')
RAW_DIR = os.path.join(DATA_DIR, 'raw')
TEXT_DIR = os.path.join(DATA_DIR, 'text')

In [3]:
llm = Ollama(model="nous-hermes2")
# llm = Ollama(model="llama3", format='json')

In [4]:
files = {
    'pdf': [],
    'md': [],
    'txt': [],
    'json': []
}

for file in os.listdir(RAW_DIR):
    if file.endswith(".pdf"):
        files['pdf'].append(os.path.join(RAW_DIR, file))

    if file.endswith(".md"):
        files['md'].append(os.path.join(RAW_DIR, file))

    if file.endswith(".txt"):
        files['txt'].append(os.path.join(RAW_DIR, file))

    if file.endswith(".json"):
        files['json'].append(os.path.join(RAW_DIR, file))

In [5]:
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from typing import List
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

### PDF --> Markdown --> Embedding

In [6]:
import pymupdf4llm

md_text = pymupdf4llm.to_markdown(files['pdf'][0], pages=None)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024, 
    chunk_overlap=256
)
doc_splits = text_splitter.create_documents([md_text])

embeddings = OllamaEmbeddings(model="nous-hermes2")

# Add to vectorDB
vectorstore_md = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma",
    embedding=embeddings,
)

retriever_md = vectorstore_md.as_retriever()

### PDF --> Embedding

In [7]:
loader = PyMuPDFLoader(files['pdf'][0])
doc = loader.load()
# docs_list = [page for doc in docs for page in doc]
doc_list = [item for item in doc]

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1024, chunk_overlap=256
)
doc_splits = text_splitter.split_documents(doc_list)

embeddings = OllamaEmbeddings(model="nous-hermes2")

# Add to vectorDB
vectorstore_pdf = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma",
    embedding=embeddings,
)

retriever_pdf = vectorstore_pdf.as_retriever()

In [8]:
# Define your desired data structure.
class ExInfo(BaseModel):
    name: str = Field(description="research paper title, usually should be in the first page.")
    authors: str = Field(description="research paper authors, usually should be in the first page.")
    date: int = Field(description="research paper publication date in year-month format. look for the keyword 'published'")
    summary_abstract: str = Field(description="research paper summary abstract, usually should be in the first page.")
    limitation: str = Field(description="research paper limitation, usually should be at the end of the article.")

# And a query intented to prompt a language model to populate the data structure.
paper_query = f"Extract the information from the research paper"

# Set up a parser + inject instructions into the prompt template.
parser = JsonOutputParser(pydantic_object=ExInfo)

prompt = PromptTemplate(
    template="""
    <|begin_of_text|>
        <|start_header_id|>
            system
        <|end_header_id|> 
    You are an research assistant for extracting information from pdf file. 
    Use the following pieces of retrieved context to answer the question.
    Output should be in json.
    <|eot_id|>

    <|start_header_id|>user<|end_header_id|>
    {question}

    Here are the pdf reference:
    \n ------- \n
    {document}
    \n ------- \n

    Follow this output format:
    \n ------- \n
    \n{format_instructions}\n
    \n ------- \n
    <|eot_id|>
    
    <|start_header_id|>assistant<|end_header_id|>

    """,
    input_variables=["question", "document"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | llm | parser

### LLM + PDF

In [9]:
docs = retriever_pdf.invoke(paper_query)
chain.invoke({
    "question": paper_query,
    "document": docs  
})

{'properties': {'name': 'Fish Behavior Analysis in an Aquaculture Environment by Using Computer Vision Techniques',
  'authors': 'Zhang, Qi; Zhang, Wei; Li, Wen; et al.',
  'date': 20200814,
  'summary_abstract': 'This paper proposes a method for fish behavior analysis in an aquaculture environment by using computer vision techniques. Firstly, the video is recorded by the underwater camera in the fish tank, which is placed at the center of the tank and faces downward. The camera has a resolution of 1920 × 1080 pixels with a frame rate of 30 fps. Three experiments are carried out to analyze the behavior of the fish in the aquaculture environment. In the first experiment, 50 crucian carp are placed into the tank. The ammonia concentration is maintained at around 0.25 mg/L for 10 h. In the second experiment, 40 common carp are placed into the tank, and the ammonia concentration is increased to 8 mg/L in the first hour, then decreased to 3 mg/L in the next hour, and finally maintained at a

### LLM + Markdown

In [11]:
docs = retriever_md.invoke(paper_query)
chain.invoke({
    "question": paper_query,
    "document": docs  
})

{'name': 'Fish Behavior Under Diverse Environmental Conditions: A Study of Betta Fish',
 'authors': 'Shi, Yi; Zhang, Xin; Li, Qian; Zhou, Jun; Tao, Feng',
 'date': 202008,
 'summary_abstract': "In order to study the behavior of betta fish under different environmental conditions, an experimental device is designed to observe their behavior in real-time. The device includes a high-speed camera and a control system with sensors to monitor temperature, pH, dissolved oxygen (DO), and ammonia concentration. The fish's behavior is analyzed by image processing methods based on machine learning algorithms. The results show that betta fish can adapt to different environmental conditions, but under abnormal conditions such as high ammonia concentration, their behavior becomes more chaotic.",
 'limitation': 'The study has some limitations, including the small sample size of the fish (10 individuals), and the fact that the experiments were conducted in a laboratory setting rather than in natural e