### Installation

In [None]:

# %pip install langchain langchain-community python_dotenv
# %pip install langchain-openai

# %pip install pandas numpy
# %pip install streamlit

# %pip install "unstructured[all-docs]<=0.16.10"
# %pip install langchain_postgres

# %pip install redis>=4.1.0

### Import Libraries

In [53]:
from unstructured.partition.pdf import partition_pdf
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough,RunnableLambda

from langchain_postgres.vectorstores import PGVector
from database import COLLECTION_NAME, CONNECTION_STRING
from langchain_community.storage import RedisStore
from langchain.schema.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from pathlib import Path
from IPython.display import display, HTML, Markdown
from base64 import b64decode
import os, hashlib, shutil, uuid, json, time
import torch, redis, streamlit as st
import logging
# Initialize Redis client
client = redis.Redis(host="localhost", port=6379, db=0)

In [54]:
from dotenv import load_dotenv
load_dotenv()

True

In [55]:
FILE_PATH = Path("data/hbspapers_48__1.pdf")

### Data Loading

Partition tables and text into chunks

In [56]:

def data_loading():

    raw_pdf_elements = partition_pdf(
        filename=FILE_PATH,

        infer_table_structure=True,
        strategy = "hi_res",

        extract_image_block_types = ["Image"],
        extract_image_block_to_payload  = True,

        chunking_strategy="by_title",
        mode='elements',
        max_characters=10000,
        new_after_n_chars=5000,
        combine_text_under_n_chars=2000,
        image_output_dir_path="data/",
    )
    return raw_pdf_elements

In [57]:
pdf_elements = data_loading()

In [58]:
pdf_elements

[<unstructured.documents.elements.CompositeElement at 0x752b5c50b110>,
 <unstructured.documents.elements.CompositeElement at 0x752b5c50aa80>,
 <unstructured.documents.elements.CompositeElement at 0x752b5c50bf50>,
 <unstructured.documents.elements.CompositeElement at 0x752b5c50af90>,
 <unstructured.documents.elements.CompositeElement at 0x752b5c50a3f0>,
 <unstructured.documents.elements.CompositeElement at 0x752b5c50a7e0>,
 <unstructured.documents.elements.CompositeElement at 0x752b94dddd00>,
 <unstructured.documents.elements.Table at 0x752b5c5083e0>,
 <unstructured.documents.elements.CompositeElement at 0x752b94e3a780>,
 <unstructured.documents.elements.Table at 0x752cb038c0e0>,
 <unstructured.documents.elements.CompositeElement at 0x752b94e69a90>,
 <unstructured.documents.elements.Table at 0x752b9292bce0>,
 <unstructured.documents.elements.CompositeElement at 0x752b50cf08f0>,
 <unstructured.documents.elements.Table at 0x752b94dcb200>,
 <unstructured.documents.elements.CompositeElement

In [59]:
tables = [element.metadata.text_as_html for element in pdf_elements if 'Table' in str(type(element))]
text = [element.text for element in pdf_elements if 'CompositeElement' in str(type(element))]

In [60]:
tables

['<table><tr><td/><td>Beef</td><td>Veal</td><td>Lamb</td><td>Mutton</td><td>Adult Australian RDI</td></tr><tr><td>Moisture (g)</td><td>73.1</td><td>74.8</td><td>72.9</td><td>73.2</td><td/></tr><tr><td>Protein (g)</td><td>23.2</td><td>24.8</td><td>21.9</td><td>21.5</td><td>46-64</td></tr><tr><td>Fat (g)</td><td>2.8</td><td>1.5</td><td>4.7</td><td>4.0</td><td>-</td></tr><tr><td>Energy (kJ)</td><td>498</td><td>477</td><td>546</td><td>514</td><td>6.5-15.8MJ</td></tr><tr><td>Cholesterol (mg)</td><td>50</td><td>51</td><td>66</td><td>66</td><td>-</td></tr><tr><td>Thiamin (mg)</td><td>0.04</td><td>0.06</td><td>0.12</td><td>0.16</td><td>1.1-1.2</td></tr><tr><td>Riboflavin (mg)</td><td>0.18</td><td>0.20</td><td>0.23</td><td>0.25</td><td>1.1-1.6</td></tr><tr><td>Niacin (mg)</td><td>5.0</td><td>16.0</td><td>5.2</td><td>8.0</td><td>14-16</td></tr><tr><td>Vitamin B6 (mg)</td><td>0.52</td><td>0.8</td><td>0.10</td><td>0.8</td><td>1.3-1.7</td></tr><tr><td>Vitamin B12 (ug)</td><td>2.5</td><td>1.6</td><t

In [61]:
pdf_elements[0].metadata.to_dict()

{'filetype': 'application/pdf',
 'languages': ['eng'],
 'last_modified': '2025-04-07T16:53:41',
 'page_number': 1,
 'orig_elements': 'eJzNWnlz4ziu/yqq/LG1U4lsHRQpddXUrq/E9x3bybypLoqibNmyJOuIj5n97gtJdnfSSc8kfuW819VxAhiQSPwIEAD52x9X3OVr7sVfHevqi3AlcWRzRccipwYVkWUw0bAkS7SYwhEhVJNl5epGuFrzmFo0pqDzxxXz/dByPBrzKKNduveT+OuCO/NFDBxFkSTQObK3jhUvgCuTjBv4jhener/9BmIFfCMoBinov98IJ1pV9IKa0hrSCvprOpcHxlW0j2K+TufRd3bcHQWU8av/wBe24/J4H/Dsq37nKhuNN0/oPBvyb1fcm1/9nnGj+Ovatxzb4ZlBFEnRRAmJEhnL+IumfkFyqh2A5lcvWZs8TKeSviPmu3SyV/ee88TDyIn3gm8LU991fW8O/1O10xjGTuxmI/vR/grWLEXSqUiYrYnIkKhoAkNUdKZolGuaZPCL2h9YKjra90QbSoFk9lalAnqDzuT/39h/yCNOQ7YQep7rePx9ZlcVi9u6Jao6kkSkG6ZoyhIWdZMZtkKpoSLj0mZHYEbjmdk1Wc3NrEtGgbymc/m/NLvFY85ix/e+MjBs9DUIfRPEwKtgwsolcAGLrzLNP66imIZgX8/iO2Ag/AyiPg3AQ4R/CiXACbxF+CXVTUI3/XIRx0H0pVgM/ULibwvcSgo0KS7MKMi0rv7z+1/jf0tZ4ubOV+fUjRcC9SyhzBf0yfGTkLrCiDncYzwSROE0kuNAfnm+Wro0DGkM3HH65DdWjcRtoui2KnIMH0hD6arRiKgjgzJZN2Wq0YutGgNLBYAQYZKvmhNt6AUtpWVNIgX5LUamcd660ZChoU/252d4HoG7ETrcchg4dwZtjvKHkNM08H

In [62]:
text

['University of Wollongong\n\nResearch Online\n\nFaculty of Health and Behavioural Sciences - Papers (Archive)\n\nFaculty of Science, Medicine and Health\n\nSeptember 2007\n\nNutritional composition of red meat\n\nP. G. Williams University of Wollongong, peterw@uow.edu.au\n\nFollow this and additional works at: https://ro.uow.edu.au/hbspapers\n\nPart of the Arts and Humanities Commons, Life Sciences Commons, Medicine and Health Sciences Commons, and the Social and Behavioral Sciences Commons\n\nRecommended Citation\n\nWilliams, P. G.: Nutritional composition of red meat 2007. https://ro.uow.edu.au/hbspapers/48\n\nResearch Online is the open access institutional repository for the University of Wollongong. For further information contact the UOW Library: research-pubs@uow.edu.au\n\nNutritional composition of red meat\n\nAbstract\n\nLean red meats are: • An excellent source of high biological value protein, vitamin B12, niacin, vitamin B6, iron, zinc and phosphorus • A source of long-cha

In [63]:
display(HTML(tables[0]))

0,1,2,3,4,5
,Beef,Veal,Lamb,Mutton,Adult Australian RDI
Moisture (g),73.1,74.8,72.9,73.2,
Protein (g),23.2,24.8,21.9,21.5,46-64
Fat (g),2.8,1.5,4.7,4.0,-
Energy (kJ),498,477,546,514,6.5-15.8MJ
Cholesterol (mg),50,51,66,66,-
Thiamin (mg),0.04,0.06,0.12,0.16,1.1-1.2
Riboflavin (mg),0.18,0.20,0.23,0.25,1.1-1.6
Niacin (mg),5.0,16.0,5.2,8.0,14-16
Vitamin B6 (mg),0.52,0.8,0.10,0.8,1.3-1.7


### Summarize the Data

In [65]:
# Summarize extracted text and tables using LLM
def summarize_text_and_tables(text, tables):
    prompt_text = """You are an assistant tasked with summarizing text and tables. \

                    You are to give a concise summary of the table or text and do nothing else.
                    Table or text chunk: {element} """
    prompt = ChatPromptTemplate.from_template(prompt_text)
    model = ChatOpenAI(temperature=0.6, model="gpt-4o-mini")
    summarize_chain = {"element": RunnablePassthrough()}| prompt | model | StrOutputParser()
    return {
        "text": summarize_chain.batch(text, {"max_concurrency": 5}),
        "table": summarize_chain.batch(tables, {"max_concurrency": 5})
    }

In [66]:
data_summary = summarize_text_and_tables(text, tables)

In [67]:
data_summary

{'text': ["The document discusses the nutritional composition of red meat, highlighting that lean red meats are excellent sources of high-quality protein, vitamins (B12, B6, niacin), minerals (iron, zinc, phosphorus), and long-chain omega-3 fatty acids. It also notes that red meats are generally low in fat and sodium and contain various bioactive substances such as antioxidants. The article is authored by Assoc Prof Peter Williams and was published in 2007 in the journal Nutrition & Dietetics. The full text is accessible through the University of Wollongong's Research Online repository.",
  'Lean red meats are rich in high-quality protein, vitamins (B12, niacin, B6), minerals (iron, zinc, phosphorus), and omega-3 fats, while being low in fat and sodium. They also contain antioxidants and bioactive compounds. The FSANZ defines meat as the carcass of various animals, excluding eggs and foetuses, and includes offal but not bone. In Australia, "red meat" refers specifically to cattle, shee

In [68]:
text_summary = data_summary['text']

In [69]:
tables_summary = data_summary['table']

In [72]:
text_summary

["The document discusses the nutritional composition of red meat, highlighting that lean red meats are excellent sources of high-quality protein, vitamins (B12, B6, niacin), minerals (iron, zinc, phosphorus), and long-chain omega-3 fatty acids. It also notes that red meats are generally low in fat and sodium and contain various bioactive substances such as antioxidants. The article is authored by Assoc Prof Peter Williams and was published in 2007 in the journal Nutrition & Dietetics. The full text is accessible through the University of Wollongong's Research Online repository.",
 'Lean red meats are rich in high-quality protein, vitamins (B12, niacin, B6), minerals (iron, zinc, phosphorus), and omega-3 fats, while being low in fat and sodium. They also contain antioxidants and bioactive compounds. The FSANZ defines meat as the carcass of various animals, excluding eggs and foetuses, and includes offal but not bone. In Australia, "red meat" refers specifically to cattle, sheep, and goa

In [73]:
tables_summary

['The table presents nutritional information for beef, veal, lamb, and mutton, comparing their contents of moisture, protein, fat, energy, cholesterol, vitamins, and minerals. Key points include:\n\n- **Moisture Content**: Ranges from 72.9g (lamb) to 74.8g (veal).\n- **Protein**: Highest in veal (24.8g), lowest in mutton (21.5g).\n- **Fat**: Lamb has the highest fat content (4.7g), while veal has the lowest (1.5g).\n- **Energy**: Lamb provides the most energy (546 kJ), while veal provides the least (477 kJ).\n- **Cholesterol**: Similar across meats, ranging from 50mg (beef) to 66mg (lamb and mutton).\n- **Vitamins**: Niacin is notably higher in veal (16.0mg) compared to the others, while vitamin B12 is highest in mutton (2.8µg).\n- **Minerals**: Iron content is highest in mutton (3.3mg), while potassium is similar across all meats, around 362-365mg.\n\nThe table also includes recommended daily intakes (RDI) for adults for various nutrients.',
 'The table presents the lean and fat compo

### Initialize Retriever

In [70]:
def initialize_retriever():

    store = RedisStore(client=client)
    id_key = "doc_id"
    vectorstore = PGVector(
            embeddings=OpenAIEmbeddings(),
            collection_name=COLLECTION_NAME,
            connection=CONNECTION_STRING,
            use_jsonb=True,
            )
    retrieval_loader = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key="doc_id")
    return retrieval_loader

In [71]:
load_retriever = initialize_retriever()

IntegrityError: (psycopg.errors.NotNullViolation) null value in column "name" of relation "langchain_pg_collection" violates not-null constraint
DETAIL:  Failing row contains (e4795516-05a2-4e2d-84a0-9f3ef8cd9142, null, null).
[SQL: INSERT INTO langchain_pg_collection (uuid, name, cmetadata) VALUES (%(uuid)s::UUID, %(name)s::VARCHAR, %(cmetadata)s::JSON)]
[parameters: {'uuid': UUID('e4795516-05a2-4e2d-84a0-9f3ef8cd9142'), 'name': None, 'cmetadata': Json(None)}]
(Background on this error at: https://sqlalche.me/e/20/gkpj)

### Add Summary to vectorstore & Raw data to RedisStore

In [74]:
# Store text, tables, and their summaries in the retriever

def store_docs_in_retriever(text, text_summary, table, table_summary, retriever):
    """Store text and table documents along with their summaries in the retriever."""

    def add_documents_to_retriever(documents, summaries, retriever, id_key = "doc_id"):
        """Helper function to add documents and their summaries to the retriever."""
        if not summaries:
            return None, []

        doc_ids = [str(uuid.uuid4()) for _ in documents]
        summary_docs = [
            Document(page_content=summary, metadata={id_key: doc_ids[i]})
            for i, summary in enumerate(summaries)
        ]

        retriever.vectorstore.add_documents(summary_docs, ids=doc_ids)
        retriever.docstore.mset(list(zip(doc_ids, documents)))

# Add text, table, and image summaries to the retriever
    add_documents_to_retriever(text, text_summary, retriever)
    add_documents_to_retriever(table, table_summary, retriever)
    return retriever

In [75]:
retriever  = store_docs_in_retriever(text, text_summary, tables,  tables_summary, load_retriever)


NameError: name 'load_retriever' is not defined

In [22]:
query = "What is the comparison of the composition of red meat and vegetarian protein sources"


In [23]:
docs = retriever.invoke(query)

In [24]:
docs

[b'1. Folate values from US data [39]; all other values from NUTTAB 2006 [38]\n\n10\n\nTable 6. Percentage of male adult recommended dietary intake (RDI) or adequate intake (AI)\n\nprovided by 100g of lean red meat and some vegetarian protein sources',
 b'2) Nutrient composition of red meat\n\nRed meat contains high biological value protein and important micronutrients that are needed for good health throughout life. It also contains a range of fats, including essential omega-3 polyunsaturated fats. Recent analyses have shown that there has been a significant trend to leaner cuts of meat over the past two decades [3]. While the nutritional composition will vary somewhat according to breed, feeding regimen, season and meat cut, in general lean red meat has a low fat content, is moderate in cholesterol and rich in protein and many essential vitamins and minerals.\n\n1\n\n2.1 Nutrient composition of beef, veal, lamb and mutton\n\nTable 1 presents the typical nutrient composition of sample

### RAG Pipeline

#### Parse the retriever output

In [None]:
def parse_retriver_output(data):
    parsed_elements = []
    for element in data:
        # Decode bytes to string if necessary
        if isinstance(element, bytes):
            element = element.decode("utf-8")

        parsed_elements.append(element)

    return parsed_elements


#### Chat with the LLM using retrieved context

In [None]:
def chat_with_llm():


    prompt_text = """
                You are an AI Assistant tasked with understanding detailed
                information from text and tables. You are to answer the question based on the
                context provided to you. You must not go beyond the context given to you.

                Context:
                {context}

                Question:
                {question}
                """

    prompt = ChatPromptTemplate.from_template(prompt_text)
    model = ChatOpenAI(temperature=0.6, model="gpt-4o-mini")

    rag_chain = {
       "context": retriever | RunnableLambda(parse_retriver_output), "question": RunnablePassthrough(),
        } | RunnablePassthrough().assign(
        response=(
        prompt
        | model
        | StrOutputParser()
        )
        )

    return rag_chain




In [27]:
rag_chain = chat_with_llm()

In [28]:
response = rag_chain.invoke("What is the nutrient composition of beef, veal, lamb and mutton")

In [29]:
response

{'context': ['<table><tr><td/><td>Beef</td><td>Veal</td><td>Lamb</td><td>Mutton</td><td>Adult Australian RDI</td></tr><tr><td>Moisture (g)</td><td>73.1</td><td>74.8</td><td>72.9</td><td>73.2</td><td/></tr><tr><td>Protein (g)</td><td>23.2</td><td>24.8</td><td>21.9</td><td>21.5</td><td>46-64</td></tr><tr><td>Fat (g)</td><td>2.8</td><td>1.5</td><td>4.7</td><td>4.0</td><td>-</td></tr><tr><td>Energy (kJ)</td><td>498</td><td>477</td><td>546</td><td>514</td><td>6.5-15.8MJ</td></tr><tr><td>Cholesterol (mg)</td><td>50</td><td>51</td><td>66</td><td>66</td><td>-</td></tr><tr><td>Thiamin (mg)</td><td>0.04</td><td>0.06</td><td>0.12</td><td>0.16</td><td>1.1-1.2</td></tr><tr><td>Riboflavin (mg)</td><td>0.18</td><td>0.20</td><td>0.23</td><td>0.25</td><td>1.1-1.6</td></tr><tr><td>Niacin (mg)</td><td>5.0</td><td>16.0</td><td>5.2</td><td>8.0</td><td>14-16</td></tr><tr><td>Vitamin B6 (mg)</td><td>0.52</td><td>0.8</td><td>0.10</td><td>0.8</td><td>1.3-1.7</td></tr><tr><td>Vitamin B12 (ug)</td><td>2.5</td><t

In [31]:
print(response['response'])

The nutrient composition per 100g of lean red meat for beef, veal, lamb, and mutton is as follows:

| Nutrient              | Beef  | Veal  | Lamb  | Mutton |
|----------------------|-------|-------|-------|--------|
| Moisture (g)         | 73.1  | 74.8  | 72.9  | 73.2   |
| Protein (g)          | 23.2  | 24.8  | 21.9  | 21.5   |
| Fat (g)              | 2.8   | 1.5   | 4.7   | 4.0    |
| Energy (kJ)          | 498   | 477   | 546   | 514    |
| Cholesterol (mg)     | 50    | 51    | 66    | 66     |
| Thiamin (mg)         | 0.04  | 0.06  | 0.12  | 0.16   |
| Riboflavin (mg)      | 0.18  | 0.20  | 0.23  | 0.25   |
| Niacin (mg)          | 5.0   | 16.0  | 5.2   | 8.0    |
| Vitamin B6 (mg)      | 0.52  | 0.80  | 0.10  | 0.80   |
| Vitamin B12 (µg)     | 2.5   | 1.6   | 0.96  | 2.8    |
| Pantothenic acid (mg) | 0.35  | 1.50  | 0.74  | 1.33   |
| Vitamin A (µg)       | <5    | <5    | 8.6   | 7.8    |
| Beta-carotene (µg)   | 10    | <5    | <5    | <5     |
| Alpha-tocopherol (mg) | 0.

In [32]:
response = rag_chain.invoke("What is the nutrient composition of organ meats")

In [34]:
print(response['response'])

The nutrient composition of organ meats (specifically liver, kidney, heart, brain, and tripe from beef and lamb) includes the following key points:

1. **Vitamin B12**: All organ meats (except tripe) are extremely rich in vitamin B12, providing more than 100% of the Recommended Daily Intake (RDI) in 100g.

2. **Liver**: 
   - Rich source of protein, iron, zinc, riboflavin, niacin, vitamin A, and folate.

3. **Kidney**: 
   - Rich in protein, thiamin, riboflavin, iron, and a source of folate.

4. **Heart**: 
   - A good source of iron and zinc, but not as rich as liver and kidney.

5. **Brain and Tripe**: 
   - Not particularly good sources of vitamins or minerals.

6. **Cholesterol**: 
   - All organ meats are high in cholesterol, especially brains.

7. **Sodium**: 
   - Generally low in sodium across all organ meats.

   - Liver is such a rich source of retinol (vitamin A) that large amounts are not recommended during pregnancy. 

These points indicate that organ meats are nutrient-de

In [95]:
response = rag_chain.invoke("What is Meat?")