In [2]:
from langchain_groq import ChatGroq
# Temperature-0 model used for the per-document relevancy verdicts.
llm_relevancy = ChatGroq(
    model="llama-3.3-70b-versatile",
    temperature=0,
)

# Higher-temperature model, capped at 3000 output tokens, used to write the final answer.
llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    temperature=0.6,
    max_tokens=3000,
)

In [None]:
# System prompt for the per-document relevancy check. The flag name used in
# instruction 3 must match the `is_irrelevant` key declared on ResponseSchema
# and read back in extract_relevant_context; it previously said `isIrrelevant`,
# which contradicted the JSON structure requested in the user prompt.
REAG_SYSTEM_PROMPT = """
# Role and Objective
You are an intelligent knowledge retrieval assistant. Your task is to analyze provided documents or URLs to extract the most relevant information for user queries.

# Instructions
1. Analyze the user's query carefully to identify key concepts and requirements.
2. Search through the provided sources for relevant information and output the relevant parts in the 'content' field.
3. If you cannot find the necessary information in the documents, return 'is_irrelevant: true', otherwise return 'is_irrelevant: false'.

# Constraints
- Do not make assumptions beyond available data
- Clearly indicate if relevant information is not found
- Maintain objectivity in source selection
"""



In [4]:
from pydantic import BaseModel,Field
from typing import List
from langchain_core.output_parsers import JsonOutputParser

class ResponseSchema(BaseModel):
    # Flat relevancy verdict for a single document page. The three keys match
    # the JSON structure requested in extract_relevant_context's prompt.
    # (Plain comments are used instead of a docstring so the generated
    # pydantic JSON schema stays unchanged.)

    # Excerpt of the page that helps answer the question.
    content: str = Field(
        ...,
        description="The page content of the document that is relevant or sufficient to answer the question asked",
    )

    # Model's justification for the excerpt it selected.
    reasoning: str = Field(
        ...,
        description="The reasoning for selecting The page content with respect to the question asked",
    )

    # True when the page does NOT help answer the question.
    is_irrelevant: bool = Field(
        ...,
        description="Specify 'True' if the content in the document is not sufficient or relevant to answer the question asked otherwise specify 'False' if the context or page content is relevant to answer the question asked",
    )


class RelevancySchemaMessage(BaseModel):
    # Wrapper that nests one ResponseSchema verdict under a `source` key.
    # NOTE(review): the prompt and the consumer in extract_relevant_context use
    # the flat ResponseSchema keys, not this nested shape - confirm whether
    # this wrapper is still needed.
    source: ResponseSchema

# Parser for the relevancy LLM's JSON reply. It is bound to the flat
# ResponseSchema because that is the structure the prompt asks for and the
# structure extract_relevant_context reads (`content`, `reasoning`,
# `is_irrelevant`). Binding it to the nested RelevancySchemaMessage would make
# any generated format instructions describe a shape the code cannot consume.
relevancy_parser = JsonOutputParser(pydantic_object=ResponseSchema)

In [6]:
from langchain_community.document_loaders import PyMuPDFLoader

# Location of the source PDF, relative to the notebook's working directory.
file_path = "./data/FWG.pdf"
loader = PyMuPDFLoader(file_path)
# Load the PDF into a list of Document objects.
docs = loader.load()
# Sanity check: number of loaded documents and the metadata of the first one.
print(len(docs))
print(docs[0].metadata)

7
{'source': './data/FWG.pdf', 'file_path': './data/FWG.pdf', 'page': 0, 'total_pages': 7, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'PDF-XChange (xcpro40.DLL v4.0.0316.0001) (Windows)', 'creationDate': '', 'modDate': '', 'trapped': ''}


In [7]:
from langchain.schema import Document
def format_doc(doc: "Document") -> str:
    """Render one document as the text block embedded in the relevancy prompt.

    Args:
        doc: Object exposing ``metadata`` (a mapping) and ``page_content``.

    Returns:
        A three-line string with the document title, page and content.
        A missing ``title`` or ``page`` metadata entry is rendered as an
        empty value instead of raising ``KeyError``.
    """
    # Loaders differ in which metadata keys they populate, so read defensively.
    metadata = doc.metadata or {}
    title = metadata.get("title", "")
    page = metadata.get("page", "")
    return f"Document_Title: {title}\nPage: {page}\nContent: {doc.page_content}"

In [24]:
### Helper function to extract relevant context
from langchain_core.prompts import PromptTemplate
def _build_relevancy_messages(question, doc):
    """Build the system/user chat messages asking whether ``doc`` answers ``question``."""
    formatted_documents = format_doc(doc)
    system = f"{REAG_SYSTEM_PROMPT}\n\n# Available source\n\n{formatted_documents}"
    # The quoted section name must match the '# Available source' heading above.
    prompt = f"""Determine if the 'Available source' content supplied is sufficient and relevant to ANSWER the QUESTION asked.
        QUESTION: {question}
        #INSTRUCTIONS TO FOLLOW
        1. Analyze the context provided thoroughly to check its relevancy to help formulizing a response for the QUESTION asked.
        2, STRICTLY PROVIDE THE RESPONSE IN A JSON STRUCTURE AS DESCRIBED BELOW:
            ```json
               {{"content":<<The page content of the document that is relevant or sufficient to answer the question asked>>,
                 "reasoning":<<The reasoning for selecting The page content with respect to the question asked>>,
                 "is_irrelevant":<<Specify 'True' if the content in the document is not sufficient or relevant.Specify 'False' if the page content is sufficient to answer the QUESTION>>
                 }}
            ```
         """
    return [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ]


def _flag_means_relevant(flag):
    """Return True when an ``is_irrelevant`` value says the document IS relevant."""
    if isinstance(flag, str):
        # The model often returns the flag as text ("True"/"False"/"false").
        return flag.strip().lower() == "false"
    # A real JSON boolean (or numeric 0); a missing flag (None) is not relevant.
    return isinstance(flag, (bool, int, float)) and not flag


def extract_relevant_context(question, documents, verbose=True):
    """Ask the relevancy LLM, one document at a time, which documents answer ``question``.

    Args:
        question: The user question.
        documents: Iterable of documents accepted by ``format_doc``.
        verbose: When True (default), print each raw model reply.

    Returns:
        A ``(final_context, final_reasons)`` tuple of two parallel lists: the
        ``content`` excerpts of the documents judged relevant, and the model's
        ``reasoning`` for each of them. Both are empty when ``documents`` is
        empty or nothing is judged relevant.

    A reply that cannot be parsed as a JSON object is reported and skipped, so
    one bad reply does not discard the verdicts already collected.
    """
    final_context = []
    final_reasons = []
    for doc in documents:
        response = llm_relevancy.invoke(_build_relevancy_messages(question, doc))
        if verbose:
            print(response.content)
        try:
            verdict = relevancy_parser.parse(response.content)
        except ValueError as exc:
            # The parser's OutputParserException is a ValueError subclass.
            print(f"Skipping document: could not parse the model reply ({exc})")
            continue
        if not isinstance(verdict, dict):
            print("Skipping document: the model reply is not a JSON object")
            continue
        if _flag_means_relevant(verdict.get("is_irrelevant")):
            # Append to both lists together so they stay index-aligned.
            final_context.append(verdict.get("content", ""))
            final_reasons.append(verdict.get("reasoning", ""))
    return final_context, final_reasons

In [None]:
# TODO: every page is currently reviewed in a loop; this should be improved so
# that a semantic search runs first and only its results are reviewed.

# question = "What is the type of fw generator?"
question = "main features of fw generator"
final_context, final_reasons = extract_relevant_context(question, docs)
print(len(final_context))

```json
{
  "content": "F.W. GENERATOR",
  "reasoning": "The provided document content does not thoroughly describe the main features of the FW generator. It only mentions 'F.W. GENERATOR' as part of the 'PURCHASE ORDER SPECIFICATION FOR' section, without providing any detailed information about its features.",
  "is_irrelevant": "True"
}
```
```json
{
  "content": "F.W. GENERATOR EVAPORATING TYPE",
  "reasoning": "The provided content mentions 'F.W. GENERATOR EVAPORATING TYPE' which seems to be related to the main features of the FW generator. However, the information is limited and does not provide a detailed description of the main features. The content only provides a brief description of the generator type.",
  "is_irrelevant": "True"
}
```
```json
{
  "content": "1. Type: Low pressure evaporating type. (M/E jacket water heating) Shell & Tube type. 2. Particular: 1) Q'ty/ship: One(1) Set, 2) Capacity: 25 ton/day with 15% fouling margin, 3) Shell vacuum: Maker's standard, 4) Salini

In [27]:
# Display the excerpts that were judged relevant to the question.
final_context

["1. Type: Low pressure evaporating type. (M/E jacket water heating) Shell & Tube type. 2. Particular: 1) Q'ty/ship: One(1) Set, 2) Capacity: 25 ton/day with 15% fouling margin, 3) Shell vacuum: Maker's standard, 4) Salinity of distillate water: Max. 10 PPM, 5) Design condition: HEATING MEDIUM: M/E jacket cooling F.W., COOLING MEDIUM: S.W. 3. Construction and Material: 1) Construction: Fresh water generator to consist of evaporating chamber, condensing chamber, distillate pump, salinity indicator, brine/air ejector, and other accessories.",
 "4. Accessory and fittings To be in accordance with the requirements of Classification Society/Regulatory Bodies and maker's standard including following. 1 set - S.W. ejector for brine/air extraction 1 set - Thermometer and bulb with name plate(S.W. in/outlet, J.W. in/outlet, Condenser) 1 set - Pressure/Compound gauge with root valve and name plate. 1     - Motor driven distillate pump. 1     - Vacuum gauge with root valve and name plate. 1     - 

In [28]:
# Display the model's reasoning for each excerpt it judged relevant.
final_reasons

['The provided content is relevant to the question about the main features of the FW generator because it explicitly mentions the type, particular specifications, and construction and material details of the FW generator. The content includes key information such as the type of generator, its capacity, and the materials used, which are essential features of the FW generator.',
 'The provided content includes a list of accessories and fittings for the F.W. generator, which are essential features of the system. The list includes items such as thermometers, pressure gauges, pumps, valves, and other components that are crucial for the operation of the F.W. generator. This information is relevant to understanding the main features of the F.W. generator.',
 "The provided content mentions 'F.W. GENERATOR' and provides some technical specifications related to it, such as the power plug type and the rated evaporation capacity. Although the content does not provide an exhaustive list of main fea

In [None]:
# Answer-generation prompt template; `{question}` and `{context}` are filled in
# by generate_response.
rag_prompt = """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:
"""

def generate_response(question, final_context):
    """Answer ``question`` with the generation LLM, grounded in ``final_context``.

    Args:
        question: The user question.
        final_context: The relevant excerpts, either a list/tuple of strings
            (as returned by ``extract_relevant_context``) or one string.

    Returns:
        The last blank-line-separated paragraph of the model's reply, with
        surrounding whitespace removed.
    """
    # Join the excerpts so the prompt contains plain text rather than the
    # Python repr of a list (brackets, quotes, escaped characters).
    if isinstance(final_context, (list, tuple)):
        context_text = "\n\n".join(str(part) for part in final_context)
    else:
        context_text = final_context

    prompt = PromptTemplate(
        template=rag_prompt,
        input_variables=["question", "context"],
    )
    chain = prompt | llm
    response = chain.invoke({"question": question, "context": context_text})

    # Keep only the final paragraph of the reply (presumably to drop any
    # preamble - TODO confirm). Strip first: a reply ending in a blank line
    # would otherwise yield an empty string.
    return response.content.strip().split("\n\n")[-1]

In [20]:
# Generate the final answer from the relevant excerpts and display it.
final_response = generate_response(question,final_context)
final_response

'The main features of the FW generator include a low-pressure evaporating type with a shell and tube design, a capacity of 25 tons/day, and a salinity of distillate water of max 10 PPM. The generator consists of an evaporating chamber, condensing chamber, and other accessories. It also includes various fittings such as a distillate pump, salinity indicator, and vacuum gauge.'

In [21]:
from pprint import pprint
# Pretty-print the answer so the long string is wrapped across lines.
pprint(final_response)

('The main features of the FW generator include a low-pressure evaporating '
 'type with a shell and tube design, a capacity of 25 tons/day, and a salinity '
 'of distillate water of max 10 PPM. The generator consists of an evaporating '
 'chamber, condensing chamber, and other accessories. It also includes various '
 'fittings such as a distillate pump, salinity indicator, and vacuum gauge.')
