In [2]:
from langchain.llms import Ollama

import os
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.chat_models import ChatOllama
from dotenv import load_dotenv

# Read key=value pairs from a .env file (if one is found) into the process
# environment; the call returns True when a file was loaded.
load_dotenv()

True

In [3]:
# Directory layout, resolved relative to the parent of the current working
# directory: <project>/data/raw and <project>/data/text.
PROJECT_DIR = os.path.split(os.getcwd())[0]
DATA_DIR = os.path.join(PROJECT_DIR, 'data')
RAW_DIR, TEXT_DIR = (
    os.path.join(DATA_DIR, subdir) for subdir in ('raw', 'text')
)

In [4]:
# Model name handed to the Ollama wrapper below; the same name is reused for
# the embedding model later in the notebook.
LLM_MODEL = "nous-hermes2"

# LLM handle that the prompt chains in this notebook pipe into.
llm = Ollama(model=LLM_MODEL)
# Alternative configurations that were tried:
# llm = ChatOllama(model=LLM_MODEL, temperature=0)
# llm = Ollama(model="llama3", format='json')


In [5]:
def _collect_files_by_extension(directory, extensions=('pdf', 'md', 'txt', 'json')):
    """Group the entries of ``directory`` by file extension.

    Args:
        directory: Path of the directory to scan (not recursive).
        extensions: Extensions (without the leading dot) to collect.

    Returns:
        dict: Maps every requested extension to a list of full paths whose
        name ends with ``.<extension>``. Extensions with no match map to an
        empty list.
    """
    grouped = {ext: [] for ext in extensions}
    # os.listdir() returns entries in arbitrary order; sorting makes
    # index-based access such as files['pdf'][0] pick the same file on
    # every run.
    for entry in sorted(os.listdir(directory)):
        for ext in extensions:
            if entry.endswith(f".{ext}"):
                grouped[ext].append(os.path.join(directory, entry))
                break
    return grouped


files = _collect_files_by_extension(RAW_DIR)

In [6]:
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from typing import List
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

### PDF --> Embedding

In [7]:
# Chunking parameters passed to the tiktoken-based splitter below.
CHUNK_SIZE = 1024
CHUNK_OVERLAP = 256

# Fail early with a clear message instead of a bare IndexError on
# files['pdf'][0] when the raw directory contains no PDF.
if not files['pdf']:
    raise FileNotFoundError(f"No .pdf files found in {RAW_DIR}")

# Only the first discovered PDF is loaded and indexed.
loader = PyMuPDFLoader(files['pdf'][0])
doc = loader.load()
doc_list = list(doc)

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)
doc_splits = text_splitter.split_documents(doc_list)

# NOTE(review): the embedding model is the same model used for generation
# (LLM_MODEL) -- confirm this is intended rather than a dedicated
# embedding model.
embeddings = OllamaEmbeddings(model=LLM_MODEL)

# Add to vectorDB
# NOTE(review): re-running this cell in the same kernel presumably adds the
# chunks to the existing "rag-chroma" collection again -- confirm, and
# restart the kernel before re-indexing if so.
vectorstore_pdf = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma",
    embedding=embeddings,
)

retriever_pdf = vectorstore_pdf.as_retriever()

In [8]:
# Define your desired data structure.
# Schema of the fields to extract from a research paper. It is handed to
# JsonOutputParser, whose format instructions are injected into the prompt, so
# every Field description below is prompt text read by the model.
# Documented with comments on purpose: a class docstring would become part of
# the generated schema and therefore change the prompt.
class ExInfo(BaseModel):
    name: str = Field(description="research paper title, usually should be in the first page.")
    authors: str = Field(description="research paper authors, usually should be in the first page.")
    # The field is typed int, so the description spells out the integer layout;
    # a bare "year-month format" invites answers such as "2020-06".
    date: int = Field(description="research paper publication date as an integer in year-month (YYYYMM) format, e.g. 202006. look for the keyword 'published'")
    summary_abstract: str = Field(description="research paper summary abstract, usually should be in the first page.")
    limitation: str = Field(description="research paper limitation, usually should be at the end of the article.")

# And a query intended to prompt a language model to populate the data structure.
# The same string is used both as the retrieval query and as the question.
paper_query = "Extract the information from the research paper"

# Set up a parser + inject instructions into the prompt template.
parser = JsonOutputParser(pydantic_object=ExInfo)

# NOTE(review): the template uses Llama-3 style header tokens while LLM_MODEL
# is "nous-hermes2" -- confirm the template matches the model being served.
prompt = PromptTemplate(
    template="""
    <|begin_of_text|><|start_header_id|>system<|end_header_id|> 
    You are an research assistant for extracting information from pdf file. 
    Use the following pieces of retrieved context to answer the question.
    <|eot_id|>

    <|start_header_id|>user<|end_header_id|>
    {question}

    Here are the pdf reference:
    {document}

    Follow this output format:
    {format_instructions}
    <|eot_id|>
    
    <|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["question", "document"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | llm | parser

# `docs` stays a list of retrieved Document objects (it is reused when the
# answer is graded); only the chunk text is put into the prompt, not the
# repr of the Document list with its metadata.
docs = retriever_pdf.invoke(paper_query)
answer = chain.invoke({
    "question": paper_query,
    "document": "\n\n".join(chunk.page_content for chunk in docs),
})

answer

[{'name': 'A Novel Approach for Automatic Fish Behavior Recognition Based on Deep Learning',
  'authors': 'Xiaoyan Zhang, Ming Xu, Feng Gao, Yongjun Ma, Kai Wang\nAffiliation:\n1School of Electronic Information Engineering, Nanjing University of Science and Technology, Nanjing 210094, China',
  'date': 202006,
  'summary_abstract': "The behavior recognition of fish is an important method to understand the life habits of fish. However, because of the complexity of aquatic environments and limited visibility of fish's motion behavior, it is a great challenge to recognize fish behavior accurately.",
  'limitation': 'This paper proposes a novel approach for automatic fish behavior recognition based on deep learning. Firstly, a new feature extraction method is proposed to extract the feature of the ﬁsh image. Secondly, the Faster R-CNN algorithm is used to classify and locate the ﬁsh in the aquarium. Thirdly, K-means clustering is used to cluster the feature vectors of ﬁsh images into diffe

In [15]:
def reformat_output(answer):
    """Render an extraction result as plain ``key: value`` lines.

    Args:
        answer: Either a mapping of field name to value, or a list/tuple
            whose first element is such a mapping (the shape the extraction
            chain is shown returning). Only the first element is used.

    Returns:
        str: One newline-prefixed ``key: value`` segment per field, in the
        mapping's iteration order. An empty string for empty or None input.
    """
    if not answer:
        return ""
    record = answer[0] if isinstance(answer, (list, tuple)) else answer
    # Previously the text was assembled but never returned, so every caller
    # received None.
    return "".join(f"\n{key}: {value}" for key, value in record.items())

In [48]:
### Hallucination Grader
# llm = ChatOllama(model=LLM_MODEL, format='json', temperature=0)
# Output schema for the grader: a yes/no verdict plus a 1-10 score.
# NOTE(review): this redefines `ExInfo`, shadowing the extraction schema of the
# same name declared earlier in the notebook; after this cell runs, `ExInfo`
# refers to the grader schema only.
# The key `relevent` (sic) is matched by name in `extractor` further down, so
# its spelling must stay in sync with that function.
class ExInfo(BaseModel):
    relevent: str = Field(description="whether an answer is grounded in / supported by a set of facts, give a binary answer of 'yes' or 'no'.")
    score: int = Field(description="the answer revelent score from 1 to 10. 10 indicate strong relevent, 1 indicate weak relevent answer.")

# Set up a parser + inject instructions into the prompt template.
# Note: `parser`, `prompt` and `chain` are rebound here; from this cell on they
# refer to the grader, not to the extraction chain.
parser = JsonOutputParser(pydantic_object=ExInfo)

# Prompt
# The instructions name the same two keys as the schema injected through
# {format_instructions}. The previous wording asked for a single yes/no
# 'score' key, which contradicted that schema, and repeated the whole
# instruction paragraph in the user turn.
prompt = PromptTemplate(
    template=""" 
    <|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You are a grader assessing whether an answer is grounded in / supported by a set of facts.
    Give a binary 'yes' or 'no' verdict under the key 'relevent' and a relevance score 
    from 1 to 10 under the key 'score'. Reply with JSON only, with no preamble or explanation.
    <|eot_id|>
    
    <|start_header_id|>user<|end_header_id|>
    Here are the facts:
    {documents} 

    Here is the answer: 
    {answer}
    
    Follow this output format:
    {format_instructions}

    <|eot_id|>
    
    <|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["answer", "documents"],
    partial_variables={"format_instructions": parser.get_format_instructions()}

)

# Reuse the schema-aware parser built above instead of a second, bare
# JsonOutputParser().
chain = prompt | llm | parser
rating = chain.invoke({
    # Give the grader the chunk text rather than the repr of Document objects.
    "documents": "\n\n".join(chunk.page_content for chunk in docs),
    "answer": reformat_output(answer)
})

rating



OutputParserException: Invalid json output: I am sorry, but the provided text is in Chinese and I don't have access to a reliable Chinese-to-English translation tool at this moment. Please provide the text in English or let me know if you need assistance with any other task in English.

In [47]:
import pandas as pd

# Flatten `rating` with pandas and keep the first record as a plain dict.
# NOTE(review): `rating` is assigned by the grader cell, whose recorded output
# is an OutputParserException -- this line only works after a successful
# grader run in the same kernel.
item = pd.json_normalize(rating).to_dict('records')[0]

def extractor(answer):
    """Pick the grader fields out of a (possibly flattened) result mapping.

    A key is selected when its name contains ``"relevent"`` or ``"score"``
    as a substring, so prefixed keys also match. If several keys match the
    same field, the value of the last one wins.

    Args:
        answer: Mapping with string keys.

    Returns:
        dict: Contains ``'relevent'`` and/or ``'score'`` for the fields that
        were found; empty when nothing matched.
    """
    wanted_fields = ("relevent", "score")
    return {
        field: answer[key]
        for key in answer
        for field in wanted_fields
        if field in key
    }

# Show the fields extracted from the flattened rating as the cell output.
extractor(item)

{'relevent': 'yes', 'score': 5}