In [1]:
# Install the notebook's dependencies into the kernel's environment (quiet mode).
# torch/torchvision/torchaudio are taken from the PyTorch `cu117` wheel index.
%pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 --upgrade
# NOTE(review): the packages on the next two lines are unpinned, so a fresh run
# may resolve different versions than the ones this notebook was developed with.
%pip install -q langchain einops accelerate transformers bitsandbytes scipy
%pip install -q xformers sentencepiece sentence-transformers pypdf
# llama-index and llama_hub are pinned to exact versions; faiss-gpu is not.
%pip install -q llama-index==0.7.21 llama_hub==0.0.19 faiss-gpu

^C
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
from getpass import getpass

# SECURITY: the EdenAI token must never be stored in the notebook itself.
# It is taken from the environment when already set; otherwise it is asked for
# interactively (getpass does not echo the input, so it stays out of the
# saved outputs as well).
# The token that used to be hard-coded in this cell has been exposed and
# should be revoked and replaced.
if not os.environ.get("EDENAI_API_KEY"):
    os.environ["EDENAI_API_KEY"] = getpass("EdenAI API key: ")

In [5]:
from langchain.llms import EdenAI

# Language model shared by the summarisation chains defined in this notebook
# (they read the module-level name `llm`). Temperature is set to 0.
llm = EdenAI(
    provider="openai",
    model="text-davinci-003",
    temperature=0,
)

In [8]:
import re
from glob import glob
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader

In [26]:
import math

def clean_document_nemerics(text):
    """Normalise raw text before it is split into chunks.

    Every stand-alone single word character and every run of digits and/or
    non-word characters (punctuation, whitespace, ...) is replaced by one
    space; the result is then lower-cased.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string. Adjacent replacements are not merged,
        so the result may contain consecutive spaces.
    """
    noise_pattern = re.compile(r'(?:\b|(?<=\s))\w{1}\b|[\d\W]+')
    without_noise = noise_pattern.sub(' ', text)
    return without_noise.lower()

def load_documents(directory):
    """Load every PDF found directly inside ``directory``.

    Args:
        directory: Path of the folder holding the ``*.pdf`` files, with or
            without a trailing path separator.

    Returns:
        A flat list holding everything ``PyPDFLoader(path).load()`` returned
        for each PDF, with the files visited in sorted path order.
    """
    import os  # local import keeps this function independent of other cells

    # os.path.join instead of string concatenation, so "docs" and "docs/"
    # behave the same. Plain concatenation turned "docs" into the pattern
    # "docs*.pdf", which matches sibling files instead of the folder contents.
    pattern = os.path.join(directory, "*.pdf")

    documents = []
    # glob() gives no ordering guarantee; sorting makes repeated runs
    # process the files (and therefore build the text) in the same order.
    for item_path in sorted(glob(pattern)):
        loader = PyPDFLoader(item_path)
        documents.extend(loader.load())
    return documents

def text_to_chunks(full_string):
    """Clean ``full_string``, split it into chunks and pick reference chunks.

    The text is cleaned with ``clean_document_nemerics`` and split with
    ``RecursiveCharacterTextSplitter`` (chunk_size=512, chunk_overlap=128).
    Every 5th chunk (indices 0, 5, 10, ...) is selected as a reference.
    Three progress lines are printed along the way.

    Args:
        full_string: The raw text to chunk.

    Returns:
        A tuple ``(chunks, references, k)``: the list of all chunks, the list
        of reference chunks, and the int ``index_of_reference + 1``. Callers
        in this notebook pass ``k`` on as the ``k`` of the similarity search.
        When the splitter yields no chunks the result is ``([], [], 1)``.
    """
    reference_stride = 5  # one reference chunk per this many chunks

    full_string = clean_document_nemerics(full_string)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=128)
    chunks = text_splitter.split_text(full_string)

    number_of_chunks = len(chunks)
    print("No of docs: ", number_of_chunks)
    number_of_references = math.ceil(number_of_chunks / reference_stride)
    print("No of refenrence documents needed:", number_of_references)

    if number_of_chunks:
        # NOTE(review): this is a float floor division. Mathematically it is
        # always 5, but rounding in the inner division can make it 4.0 (a
        # recorded run printed 4.0 for 74 chunks), so k ends up as 5 or 6.
        # Kept as is because the intended k is not clear - confirm and simplify.
        index_of_reference = number_of_chunks // (number_of_chunks / reference_stride)
    else:
        # No chunks at all: the expression above would divide by zero.
        index_of_reference = 0.0
    print("Document index:", index_of_reference)

    # Chunks at indices 0, stride, 2*stride, ... serve as references.
    references = chunks[::reference_stride]

    return chunks, references, int(index_of_reference + 1)

In [12]:
import tiktoken

# Tokenizer ("p50k_base" encoding) that calc_tokens uses to measure text length in tokens.
# NOTE(review): presumably chosen to match the text-davinci-003 model configured for `llm` - confirm.
encoding = tiktoken.get_encoding("p50k_base")

def get_similar_chunks(db, reference, k):
    """Fetch the texts that ``db`` considers similar to ``reference``.

    Args:
        db: Object exposing ``similarity_search(query, k=...)`` whose results
            carry a ``page_content`` attribute.
        reference: The query text.
        k: Passed through as the ``k`` keyword of the search.

    Returns:
        List with the ``page_content`` of every search result, in result order.
    """
    matches = db.similarity_search(reference, k=k)
    texts = []
    for match in matches:
        texts.append(match.page_content)
    return texts
    
def calc_tokens(chunks):
    """Return how many tokens the module-level ``encoding`` produces for ``chunks``.

    Args:
        chunks: The text to measure (passed as-is to ``encoding.encode``).

    Returns:
        The length of the encoded token sequence.
    """
    return len(encoding.encode(chunks))

In [119]:
from langchain import PromptTemplate, LLMChain

def generate_summaries(text_chunks_list):
  """Summarise each text in ``text_chunks_list`` with the module-level ``llm``.

  Args:
    text_chunks_list: List of strings; each one is substituted into the
      prompt's ``text`` placeholder.

  Returns:
    List with the ``'text'`` field of every result returned by
    ``LLMChain.apply``.
  """
  template = """
  As an expert summarizer, produce a concise yet comprehensive summary of the given text,
  whether it's an article, blog post, conversation, or passage without adding your interpretations.
  Your summary should exhibit great detail, depth, and complexity while ensuring clarity and conciseness.
  Employ the following content to create the summary:
  {text}
  """

  summary_prompt = PromptTemplate(input_variables=["text"], template=template)
  chain = LLMChain(llm=llm, prompt=summary_prompt)

  # One input mapping per text, all sent through a single apply() call.
  chain_inputs = []
  for chunk_text in text_chunks_list:
    chain_inputs.append({'text': chunk_text})

  results = chain.apply(chain_inputs)
  return [result['text'] for result in results]


def generate_final_summary(text_chunk):
  """Produce the final summary of ``text_chunk`` with the module-level ``llm``.

  Args:
    text_chunk: The text substituted into the prompt's ``text`` placeholder.

  Returns:
    Whatever calling the ``LLMChain`` returns; elsewhere in this notebook the
    generated summary is read from its ``'text'`` key.
  """
  template = """
  I want you to act as a text summarizer to help me create a brief understandable summary of the text I provide,
  whether it's an article, blog post, conversation, or passage.
  The summary should be 15-20 sentences in length, expressing the points and concepts written in the original text without adding your interpretations.
  Employ the following content to create the summary:
  ```{text}```
  """

  final_prompt = PromptTemplate(input_variables=["text"], template=template)
  chain = LLMChain(llm=llm, prompt=final_prompt)
  return chain({'text': text_chunk})

In [73]:
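# Pipeline overview: given a directory path, the PDF documents are loaded and converted into text.

#---the text is then split into chunks, and every 5th chunk is picked as a reference
#---the chunks are stored in a FAISS vector DB
#---for each reference, a similarity search retrieves its most similar chunks
#---each group of similar chunks is summarized
#---if the joined summaries still exceed the token limit,
#--------recursively repeat the same steps on the summaries
#---else
#--------generate the final summary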

In [118]:
def recurive_summarization(chunks_as_text, max_rounds=50):
    """Condense ``chunks_as_text`` round by round until it fits one final prompt.

    While the text is over the token budget, one round does the following:
    split the text with ``text_to_chunks``, index the chunks in FAISS, fetch
    the ``k`` similar chunks for every reference chunk, summarise each group
    with ``generate_summaries`` and join the summaries into the next round's
    text. Once the text is within budget it is passed to
    ``generate_final_summary``.

    The function reads the module-level ``embeddings`` object, which must be
    defined before the first call. The token count is printed every round.

    Args:
        chunks_as_text: The text to summarise.
        max_rounds: Upper bound on the number of condensation rounds. It stops
            a run whose summaries never shrink below the budget from issuing
            LLM calls indefinitely.

    Returns:
        The value returned by ``generate_final_summary``.

    Raises:
        RuntimeError: If the text is still over budget after ``max_rounds``
            condensation rounds.
    """
    token_limit = 4096      # presumably the model's context window - confirm
    reserved_tokens = 1000  # presumably headroom for prompt and completion - confirm

    text = chunks_as_text
    rounds_done = 0
    while True:
        tokens = calc_tokens(text)
        print("Current tokens: ", tokens)

        # Within budget: produce the final summary and stop.
        if tokens + reserved_tokens < token_limit:
            return generate_final_summary(text)

        if rounds_done >= max_rounds:
            raise RuntimeError(
                f"Text still has {tokens} tokens after {rounds_done} "
                "summarization rounds; giving up."
            )

        # Text divided into chunks, references, and k
        full_text_turned_into_chunks, references, k = text_to_chunks(text)

        # Storing the chunks into FAISS using embeddings
        db = FAISS.from_texts(full_text_turned_into_chunks, embeddings)

        # Retrieving the similar chunks for every reference
        similar_chunk_k5 = []
        for reference in references:
            similar_k_texts = get_similar_chunks(db, reference, k)
            similar_chunk_k5.append(". ".join(similar_k_texts))

        # The joined group summaries become the input of the next round.
        summaries = generate_summaries(similar_chunk_k5)
        text = ".".join(summaries)
        rounds_done += 1

In [103]:
# Load every PDF under ./KG/docs/ and keep the text content of each loaded document.
documents = load_documents("./KG/docs/")
documents_text = [document.page_content for document in documents]

In [105]:
# Sanity check: number of document objects returned by load_documents.
len(documents)

17

In [121]:
# Summarise all loaded text, joined with ". " into a single string.
# NOTE: recurive_summarization reads the module-level `embeddings`, which is
# created in a later cell of this notebook; that cell has to be executed first.
summary = recurive_summarization(". ".join(documents_text))

Current tokens:  20608
No of docs:  167
No of refenrence documents needed: 34
Document index: 5.0
Current tokens:  8512
No of docs:  106
No of refenrence documents needed: 22
Document index: 5.0
Current tokens:  6164
No of docs:  74
No of refenrence documents needed: 15
Document index: 4.0
Current tokens:  3798
No of docs:  49
No of refenrence documents needed: 10
Document index: 4.0
Current tokens:  2302


  summary = recurive_summarization(". ".join(documents_text))


In [123]:
# The generated summary is stored under the 'text' key of the chain result.
print(summary['text'])



Neats is an extraction-based multi-document summarization system that uses techniques such as term frequency, sentence position, stigma words, term clustering, buddy system of paired sentences, and explicit time annotation to improve topic coverage and readability. It was evaluated on the large-scale summarization evaluation DUC and was among the best performers, performing better on longer summaries and words based on weighted retention than on shorter ones. The Document Understanding Conference (DUC) and Text Summarization Challenge (TSC) are large-scale summarization projects in the US and Japan, respectively. They compile standard training and test collections and provide common and large-scale evaluations in single and multiple document summarization. Common metrics such as pseudo precision and weighted retention are used to measure the effectiveness of automatic summarization systems. Lead sentences are good summary sentence candidates and one needs to cover all documents in a 

In [29]:
# Step-by-step version of one round: clean the joined text, split it into chunks,
# pick the reference chunks, and get k (passed later as the k of the similarity search).
chunks, references, k = text_to_chunks(". ".join(documents_text))

No of docs:  23
No of refenrence documents needed: 5
Document index: 5.0


In [31]:
# Embedding model ("all-MiniLM-L6-v2") used to index the chunks.
# NOTE: recurive_summarization also reads this module-level `embeddings`, so
# this assignment must have been executed before that function is called.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Store the chunks in a FAISS index built with the embeddings above.
db = FAISS.from_texts(chunks, embeddings)
#db.save_local('faissdb/', 'index')

In [39]:
# For every reference chunk, fetch its similar chunks from the index (k results requested)
# and join them with ". " into one text per reference.
similar_chunk_k5 = []
for reference in references:
    similar_k_texts = get_similar_chunks(db, reference, k)
    similar_chunk_k5.append(". ".join(similar_k_texts))

In [None]:
# Run the recursive summarisation on the retrieved groups, joined with ".. ".
recurive_summarization(".. ".join(similar_chunk_k5))