In [73]:
from langchain_groq import ChatGroq
# Chat model (temperature 0) used by extract_relevant_context to grade each document.
llm_relevancy = ChatGroq(
    model="llama-3.3-70b-versatile",
    temperature=0,
)
# Same model and temperature, capped at 3000 output tokens; used by generate_response.
llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    temperature=0,
    max_tokens=3000,
)

In [44]:
from pydantic import BaseModel,Field
from typing import List
from langchain_core.output_parsers import JsonOutputParser

# Shape of the per-document relevancy verdict requested from the grader model.
class ResponseSchema(BaseModel):
    # Excerpt of the page that helps answer the question.
    content: str = Field(
        ...,
        description="The page content of the document that is relevant or sufficient to answer the question asked",
    )
    # Model's justification for the excerpt it selected.
    reasoning: str = Field(
        ...,
        description="The reasoning for selecting The page content with respect to the question asked",
    )
    # True when the page cannot answer the question, False when it can.
    is_irrelevant: bool = Field(
        ...,
        description="Specify 'True' if the content in the document is not sufficient or relevant to answer the question asked otherwise specify 'False' if the context or page content is relevant to answer the question asked",
    )


# Wrapper model that nests a ResponseSchema verdict under the "source" field.
class RelevancySchemaMessage(BaseModel):
    source: ResponseSchema

# JSON output parser configured with the wrapper model; extract_relevant_context
# uses it to parse the grader model's replies.
relevancy_parser = JsonOutputParser(pydantic_object=RelevancySchemaMessage)

In [56]:
from langchain_community.document_loaders import PyMuPDFLoader

file_path = "./data/FWG.pdf"
loader = PyMuPDFLoader(file_path)
# Load the PDF into a list of documents (the recorded output shows 7 entries for this 7-page file).
docs = loader.load()
print(len(docs))
print(docs[0].metadata)

7
{'source': './data/FWG.pdf', 'file_path': './data/FWG.pdf', 'page': 0, 'total_pages': 7, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'PDF-XChange (xcpro40.DLL v4.0.0316.0001) (Windows)', 'creationDate': '', 'modDate': '', 'trapped': ''}


In [57]:
from langchain.schema import Document
def format_doc(doc: Document) -> str:
    """Render a document as the text block shown to the relevancy grader.

    Args:
        doc: Object exposing ``metadata`` (a mapping) and ``page_content``.

    Returns:
        A string with three labelled lines: ``Document_Title``, ``Page`` and
        ``Content``. A ``title`` or ``page`` entry that is absent from the
        metadata is rendered as an empty value rather than raising
        ``KeyError``, so documents from loaders that do not set those keys can
        still be formatted.
    """
    metadata = doc.metadata
    title = metadata.get("title", "")
    page = metadata.get("page", "")
    return f"Document_Title: {title}\nPage: {page}\nContent: {doc.page_content}"

In [58]:
### Helper function to extract relevant context

# System prompt for the relevancy grader. The flag is named 'is_irrelevant' here
# because that is the key extract_relevant_context reads from the parsed reply.
REAG_SYSTEM_PROMPT = """
# Role and Objective
You are an intelligent knowledge retrieval assistant. Your task is to analyze provided documents or URLs to extract the most relevant information for user queries.

# Instructions
1. Analyze the user's query carefully to identify key concepts and requirements.
2. Search through the provided sources for relevant information and output the relevant parts in the 'content' field.
3. If you cannot find the necessary information in the documents, return 'is_irrelevant: true', otherwise return 'is_irrelevant: false'.

# Constraints
- Do not make assumptions beyond available data
- Clearly indicate if relevant information is not found
- Maintain objectivity in source selection
"""

from langchain_core.prompts import PromptTemplate
def _marks_relevant(flag):
    """Return True when an ``is_irrelevant`` value from the grader means "relevant"."""
    if isinstance(flag, str):
        # The model often emits the flag as a quoted string; accept any
        # casing or surrounding whitespace ("False", "FALSE", " false ").
        return flag.strip().lower() == "false"
    # JSON booleans/numbers: only a falsy value means "not irrelevant".
    # Anything else (None, lists, ...) is treated as irrelevant.
    return isinstance(flag, (bool, int, float)) and not flag


def extract_relevant_context(question,documents):
    """Ask the grader model, one document at a time, whether it can answer the question.

    For each document a system message (REAG_SYSTEM_PROMPT plus the formatted
    document) and a user message (the question plus the JSON reply format) are
    sent to ``llm_relevancy``. The raw reply is printed, parsed with
    ``relevancy_parser`` and kept only when its ``is_irrelevant`` flag says the
    page is relevant.

    Args:
        question: The user question to answer.
        documents: Iterable of documents accepted by ``format_doc``.

    Returns:
        A tuple ``(final_context, final_reasons)``: two parallel lists holding
        the ``content`` and ``reasoning`` fields of every reply judged relevant,
        in document order. Both lists are empty when nothing is relevant or
        when ``documents`` is empty.

    Raises:
        KeyError: If a parsed reply lacks ``is_irrelevant``, or a relevant
            reply lacks ``content`` or ``reasoning``.
        Whatever ``llm_relevancy.invoke`` or ``relevancy_parser.parse`` raise.
    """
    # The user prompt depends only on the question, so it is built once
    # instead of once per document.
    prompt = f"""Determine if the 'Available source' content supplied is sufficient and relevant to ANSWER the QUESTION asked.
        QUESTION: {question}
        #INSTRUCTIONS TO FOLLOW
        1. Analyze the context provided thoroughly to check its relevancy to help formulizing a response for the QUESTION asked.
        2, STRICTLY PROVIDE THE RESPONSE IN A JSON STRUCTURE AS DESCRIBED BELOW:
            ```json
               {{"content":<<The page content of the document that is relevant or sufficient to answer the question asked>>,
                 "reasoning":<<The reasoning for selecting The page content with respect to the question asked>>,
                 "is_irrelevant":<<Specify 'True' if the content in the document is not sufficient or relevant.Specify 'False' if the page content is sufficient to answer the QUESTION>>
                 }}
            ```
         """
    final_context = []
    final_reasons = []
    for doc in documents:
        # The document under review travels in the system message, under the
        # same 'Available source' heading the user prompt refers to.
        system = f"{REAG_SYSTEM_PROMPT}\n\n# Available source\n\n{format_doc(doc)}"
        messages = [
            {"role": "system", "content": system},
            {"role": "user", "content": prompt},
        ]
        response = llm_relevancy.invoke(messages)
        print(response.content)
        verdict = relevancy_parser.parse(response.content)
        if _marks_relevant(verdict['is_irrelevant']):
            final_context.append(verdict['content'])
            final_reasons.append(verdict['reasoning'])
    return final_context, final_reasons

In [59]:
### NOTE: every page is currently reviewed in a loop; this should be improved to run a semantic search first and only loop over those search results.

question = "What is the type of fw generator?"
# Alternative sample query in Korean ("ChatGPT usage policies of major companies"):
# question = "주요 기업의 챗지피티 사용 정책"
final_context, final_reasons = extract_relevant_context(question, docs)
print(len(final_context))

```json
{
  "content": "PURCHASE ORDER SPECIFICATION FOR F.W. GENERATOR",
  "reasoning": "The provided content mentions 'F.W. GENERATOR' in the context of a purchase order specification, but it does not explicitly state the type of F.W. generator. However, it implies that the document is related to the procurement of a specific F.W. generator for a crude oil carrier. The lack of detailed specifications or descriptions of the F.W. generator in the given content limits its relevance to fully answering the question.",
  "is_irrelevant": "True"
}
```
```json
{
  "content": "F.W. GENERATOR EVAPORATING TYPE",
  "reasoning": "The question asks for the type of FW generator. The provided document contains a description of 'F.W. GENERATOR EVAPORATING TYPE' under the 'DESCRIPTION' column, which directly answers the question.",
  "is_irrelevant": "False"
}
```
```json
{
  "content": "1. Type Low pressure evaporating type. (M/E jacket water heating) Shell & Tube type",
  "reasoning": "The question 

In [60]:
# Display the passages that extract_relevant_context kept as relevant.
final_context

['F.W. GENERATOR EVAPORATING TYPE',
 '1. Type Low pressure evaporating type. (M/E jacket water heating) Shell & Tube type']

In [61]:
# Display the grader's reasoning for each kept passage (parallel to final_context).
final_reasons

["The question asks for the type of FW generator. The provided document contains a description of 'F.W. GENERATOR EVAPORATING TYPE' under the 'DESCRIPTION' column, which directly answers the question.",
 "The question asks for the type of FW generator. The provided document contains a section titled '1. Type' which describes the FW generator as a 'Low pressure evaporating type' that is also a 'Shell & Tube type'. This information directly answers the question asked."]

In [None]:
# Answer-generation prompt template; generate_response fills in {question} and {context}.
rag_prompt = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.
You Must Generate the answer in Korean or Han-gul.
Question: {question} 
Context: {context} 
Answer:
"""

def generate_response(question,final_context):
    """Generate the final answer from the question and the retrieved passages.

    Args:
        question: The user question.
        final_context: Either a list/tuple of passages or an already formatted
            string. A sequence is joined with blank lines so the model sees
            plain text instead of a Python list ``repr`` (brackets, quotes and
            escaped newlines).

    Returns:
        The last blank-line-separated paragraph of the model's reply, as a
        string.
    """
    if isinstance(final_context, (list, tuple)):
        context_text = "\n\n".join(str(passage) for passage in final_context)
    else:
        context_text = final_context
    prompt = PromptTemplate(template=rag_prompt,
                            input_variables=["question","context"],)
    chain = prompt | llm
    response = chain.invoke({"question":question,"context":context_text})
    # Strip first: a reply that ends with a blank line would otherwise make the
    # last split element (and therefore the returned answer) an empty string.
    # NOTE(review): keeping only the final paragraph presumably drops any
    # preamble the model adds before the answer - confirm this is intended.
    return response.content.strip().split("\n\n")[-1]

In [77]:
# Generate the final answer from the question and the passages kept as relevant.
final_response = generate_response(question,final_context)
# Bare expression: shows the answer string as the cell output.
final_response

'저압 증발형(F.W. GENERATOR EVAPORATING TYPE)입니다. 이 유형은 저압 증발형이며, 주 엔진 자켓 물을 가열하는 방식입니다. 셸과 튜브 형태를 가지고 있습니다.'

In [78]:
from pprint import pprint
# pprint breaks the long answer string across several lines (see the recorded output).
pprint(final_response)

('저압 증발형(F.W. GENERATOR EVAPORATING TYPE)입니다. 이 유형은 저압 증발형이며, 주 엔진 자켓 물을 가열하는 '
 '방식입니다. 셸과 튜브 형태를 가지고 있습니다.')
