In [1]:
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_experimental.text_splitter import SemanticChunker

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import TextLoader
from pypdf import PdfReader
import ollama
import gradio as gr
import os
import json

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
def triplextract(text, entity_types, predicates):
    input_format = """Perform Named Entity Recognition (NER) and extract knowledge graph triplets from the text. 
    NER identifies named entities of given entity types, and triple extraction 
    identifies relationships between entities using specified predicates.
    
        **Entity Types:**
        {entity_types}

        **Predicates:**
        {predicates}

        **Text:**
        {text}
        """

    message = input_format.format(
                entity_types = json.dumps({"entity_types": entity_types}),
                predicates = json.dumps({"predicates": predicates}),
                text = text)

    # Pass the message as a single string
    prompt = message
    output = ollama.generate(model='sciphi/triplex', prompt=prompt)
    return output

In [15]:
entity_types = ["PDF", "Document"]
predicates = ["is part of", "views"]

In [16]:
reader = PdfReader("C:\\Users\\PV862NU\\Downloads\\test.pdf")
text = ""
for page in reader.pages:
    text += page.extract_text() + "\n"

print(text)

Find out what your documents can look like by viewing the below design samples. A 
sample PDF can give you a clearer picture of what you can create. More PDF examples 
can be found in the  Prince samples repository  and on the  CSS For Publishing  web site.  
Dictionary    
Dictionaries often use a multi -column layout to save space, with running headers 
indicating keyword entries on that page. Notice how letters are rotated and shown on 
tabs on the side of right pages. The fonts used in this sample PDF are  Satyr  and  Faunus, 
made by  Monokrom . Archive.org  has a  scanned copy  of the printed edition of this Old 
Icelandic dictionary from 1910.  
HTML  PDF  
Invoices   
 
Prince is often used to generate PDF invoices from HTML pages. Customers first see the 
invoice on their screen, and then receive a printable PDF version for their records. Here 
you will find two examples of invoices, one colorful, the other more conservati ve. 



In [17]:
prediction = triplextract(text, entity_types, predicates)
print(prediction)

{'model': 'sciphi/triplex', 'created_at': '2024-09-03T07:07:35.6375936Z', 'response': '  ```json\n{\n    "entities_and_triples": [\n        "[1], PDF:sample PDF",\n        "[2], Document:design samples",\n        "[3], ORGANIZATION:Prince samples repository",\n        "[4], ORGANIZATION:CSS For Publishing web site",\n        "[5], DOCUMENT:Dictionary",\n        "[6], DOCUMENT:Dictionaries",\n        "[7], DOCUMENT:multi-column layout",\n        "[8], PDF:Satyr",\n        "[9], DOCUMENT:Old Icelandic dictionary",\n        "[10], ORGANIZATION:Archive.org",\n        "[11], DOCUMENT:Faunus",\n        "[12], ORGANIZATION:Monokrom",\n        "[13], DOCUMENT:HTML PDF",\n        "[14], DOCUMENT:Invoices",\n        "[15], DOCUMENT:PDF invoices",\n        "[16], DOCUMENT:invoice",\n        "[17], DOCUMENT:printable PDF version"\n    ]\n}\n```', 'done': True, 'done_reason': 'stop', 'context': [529, 29989, 326, 29918, 2962, 29989, 29958, 1792, 13, 13, 4706, 3579, 6691, 28025, 29901, 1068, 13, 4706

In [18]:
response_string = prediction['response'].strip('```json\n').strip()
response_string = response_string.lstrip('\n')
response_string = response_string.strip('```')
response_string = response_string.replace('```', '')
response_string = response_string.replace("json", "")
response_json = json.loads(response_string)
entities_and_triples = response_json['entities_and_triples']
print(entities_and_triples)

['[1], PDF:sample PDF', '[2], Document:design samples', '[3], ORGANIZATION:Prince samples repository', '[4], ORGANIZATION:CSS For Publishing web site', '[5], DOCUMENT:Dictionary', '[6], DOCUMENT:Dictionaries', '[7], DOCUMENT:multi-column layout', '[8], PDF:Satyr', '[9], DOCUMENT:Old Icelandic dictionary', '[10], ORGANIZATION:Archive.org', '[11], DOCUMENT:Faunus', '[12], ORGANIZATION:Monokrom', '[13], DOCUMENT:HTML PDF', '[14], DOCUMENT:Invoices', '[15], DOCUMENT:PDF invoices', '[16], DOCUMENT:invoice', '[17], DOCUMENT:printable PDF version']


In [19]:
with open('output.txt', 'w') as f:
    f.write(text)

In [20]:
loader = TextLoader("./output.txt")
docs =loader.load()


In [21]:
text_splitter = SemanticChunker(HuggingFaceEmbeddings())
documents = text_splitter.split_documents(docs)

# Instantiate the embedding model
embedder = HuggingFaceEmbeddings()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [22]:
vector = FAISS.from_documents(documents, embedder)
retriever = vector.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [28]:
llm = Ollama(model="llama3.1:8b")

# Define the prompt
prompt = """
1. Use the following pieces of context to answer the question at the end.
2. If you don't know the answer, just say that "I don't know" but don't make up an answer on your own.\n

3. Keep the answer crisp and limited to 3,4 sentences.

Context: {context}

Question: {question}

Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate.from_template(prompt) 

In [29]:
llm_chain = LLMChain(
                  llm=llm, 
                  prompt=QA_CHAIN_PROMPT, 
                  callbacks=None, 
                  verbose=True)

document_prompt = PromptTemplate(
    input_variables=["page_content", "source"],
    template="Context:\ncontent:{page_content}\nsource:{source}",
)


In [30]:
combine_documents_chain = StuffDocumentsChain(
                  llm_chain=llm_chain,
                  document_variable_name="context",
                  document_prompt=document_prompt,
                  callbacks=None)
              
qa = RetrievalQA(
                  combine_documents_chain=combine_documents_chain,
                  verbose=True,
                  retriever=retriever,
                  return_source_documents=True)

In [31]:
def respond(question,history):
    return qa(question)["result"]

In [33]:
gr.ChatInterface(
    respond,
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(placeholder="Ask me question related to PDF", container=False, scale=7),
    title="PDF's Chatbot",
    examples=["What type of documents we have", "What is invoice"],
    cache_examples=True,
    retry_btn=None,

).launch(share = True)

Caching examples at: 'C:\Users\PV862NU\OneDrive - EY\projects\triplex-pipeline\gradio_cached_examples\44'
Caching example 1/2


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
1. Use the following pieces of context to answer the question at the end.
2. If you don't know the answer, just say that "I don't know" but don't make up an answer on your own.


3. Keep the answer crisp and limited to 3,4 sentences.

Context: Context:
content:Find out what your documents can look like by viewing the below design samples. A 
sample PDF can give you a clearer picture of what you can create.
source:./output.txt

Context:
content:More PDF examples 
can be found in the  Prince samples repository  and on the  CSS For Publishing  web site. Dictionary    
Dictionaries often use a multi -column layout to save space, with running headers 
indicating keyword entries on that page. Notice how letters are rotated and shown on 
tabs 





[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
1. Use the following pieces of context to answer the question at the end.
2. If you don't know the answer, just say that "I don't know" but don't make up an answer on your own.


3. Keep the answer crisp and limited to 3,4 sentences.

Context: Context:
content:More PDF examples 
can be found in the  Prince samples repository  and on the  CSS For Publishing  web site. Dictionary    
Dictionaries often use a multi -column layout to save space, with running headers 
indicating keyword entries on that page. Notice how letters are rotated and shown on 
tabs on the side of right pages. The fonts used in this sample PDF are  Satyr  and  Faunus, 
made by  Monokrom . Archive.org  has a  scanned copy  of the printed edition of this Old 
Icelandic dictionary from 1910. HTML  PDF  
Invoices   
 
Prince is often used to generate PDF invoices from HTML pages. Customers firs