<a href="https://colab.research.google.com/github/kavlata/LLM-experiments/blob/main/Fewshot_template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install -q langchain transformers sentence_transformers
!pip install chromadb
!pip install unstructured
!pip install accelerate
!pip install -i https://test.pypi.org/simple/ bitsandbytes
!pip install pdfminer.six

In [3]:
!pip install pytesseract

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.10


In [None]:
# Use this to load your documents. I have used Contracts from the CUAD Dataset - https://www.atticusprojectai.org/cuad
# Since we want to extract important details from a particular contract, we are doing single document loading and indexing
from langchain.text_splitter import CharacterTextSplitter, TextSplitter #, NLTKTextSplitter
from langchain.document_loaders import DirectoryLoader
loader = DirectoryLoader('/content/sample_data/', glob="**/*.pdf")
documents = loader.load()
documents

In [119]:
len(documents)

1

In [120]:
# The document is split into multiple chunks using the splitter below. I have kept overlap so that text continuity is preserved 
# between the chunks.
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(        
    separator = "\n",
    chunk_size = 800,
    chunk_overlap  = 200, #striding over the text
    length_function = len,
)
texts = text_splitter.split_documents(documents)
len(texts)



41

In [121]:
# Code for indexing a document using Chroma using HuggingFaceEmbeddings.
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from chromadb.config import Settings
embeddings = HuggingFaceEmbeddings()
collection_name="long-docs"
persist_directory="/content/chromadb/"

client_settings = Settings(chroma_db_impl="duckdb+parquet",persist_directory=persist_directory, anonymized_telemetry=False)
vectorstore = Chroma(collection_name=collection_name,embedding_function=embeddings,client_settings=client_settings,persist_directory=persist_directory)

vectorstore.add_documents(documents=texts, embedding=embeddings)
vectorstore.persist()



In [29]:
# This part is for loading the LLM Model from HuggingFace. This downloads the code in memory.
# For running model on client data, downloading the model on your local server would be highly recommended instead of 
# using Hugging Face hub
from langchain.llms import HuggingFacePipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM

model_id = 'google/flan-t5-large'# go for a smaller model if you dont have the VRAM
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

pipe = pipeline("text2text-generation",model=model, tokenizer=tokenizer, max_length=150,
                model_kwargs={"temperature":0}) #text2text-generation , text-generation
local_llm = HuggingFacePipeline(pipeline=pipe)

In [128]:
# If you want to provide few shot examples to your prompt use the below
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.prompt import PromptTemplate

examples = [
            # { 
            #       "question": "Which are the parties in this agreement ?",
            #     "summaries" : "THIS ENDORSEMENT AGREEMENT (the Agreement) is dated as of this \
            #      ____day of ____________, 2012, but made effective as of February 20, 2012 (Effective Date) between\
            #       Healthcare Distribution Specialists LLC (HDS), a Delaware corporation, and Paul Silas (Celebrity),\
            #        an individual."
            # },
#             {
#                     "question": "What is the contract date?",
#                    "summaries":
#                     """ THIS AGREEMENT is made and entered into this 1st day of June, 2017 by and between Prudential Bank (hereinafter referred to as the
# "Employer"), located in Philadelphia, Pennsylvania and Jeffrey Hanuscin, (hereinafter referred to as the "Employee"), residing at 2406 Sanibel Circle,
# Palmyra, NJ 08065 """},
            {
                  "question": "Which law governs this contract?",
                  "summaries" : """ s Agreement shall be governed by and construed in accordance with the
                   laws of the State """
            },
            {
                "question" : "What is the address of the employer?",
                "summaries" : """ THIS ENDORSEMENT AGREEMENT (herein the “Agreement”) is effective on this 21st day of February 2011, by and between Golfers 
Incorporated, a Delaware Corporation, having a mailing address of 1021 N. Sepulveda Blvd., Suite G, Manhattan Beach, CA 90266 (hereinafter 
referred to as “Company”) and Andy North, having a mailing address of 1624 S. High Point Road, Madison, WI 53719 (hereinafter referred to as 
“North”).
"""
            }
]
example_template = """
User: {question}
AI: {summaries}
"""
example_prompt = PromptTemplate(
    input_variables=["question", "summaries"],
    template=example_template
)
prefix = """Given the following extracted parts of a long document and a question, create a final answer with references  
If you don't know the answer, just say that you don't know.  Here are some
examples: 
"""
suffix = """
QUESTION: {question}
Context : {summaries}
Answer :
=========
"""
few_shot_prompt_template = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    suffix = suffix,
    input_variables=["question","summaries"],
    example_separator="\n\n"
)

In [143]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
chain = load_qa_with_sources_chain(llm=local_llm, chain_type="stuff",prompt=few_shot_prompt_template)
query = "Where are the parties located ?  "
docs = vectorstore.similarity_search(query,k=1)
result = chain({"input_documents": docs, "question": query}, 
               return_only_outputs=True)
print(result)

{'output_text': 'Manhattan Beach, CA 90266'}


In [144]:
# The below is Zero-Shot prompt
from langchain.prompts.prompt import PromptTemplate
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
template = """Given the following extracted parts of a long document and a question, 
create a final answer . 
If you don't know the answer, just say that you don't know. 

QUESTION: {question}
=========
{summaries}
=========
ANSWER:"""

# create a prompt template
PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])
# query 
chain1 = load_qa_with_sources_chain(llm=local_llm, chain_type="stuff",prompt=PROMPT)
query1 = "Where are the parties located ?  "
docs1 = vectorstore.similarity_search(query,k=1)
result1 = chain1({"input_documents": docs1, "question": query1}, 
               return_only_outputs=True)
print(result1)

{'output_text': 'Company and North'}
