## Multi-document summarization using Llama2 (Clustering + Graph RAG) 🦙

In [None]:
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 --upgrade
!pip install -q langchain einops accelerate transformers scipy
!pip install -q xformers sentencepiece sentence-transformers pypdf
!pip install -q llama-index==0.7.21 llama_hub==0.0.19 openai
!pip install -i https://test.pypi.org/simple/ bitsandbytes
!pip install glob2

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, GenerationConfig
import huggingface_hub
from langchain import HuggingFacePipeline

huggingface_hub.notebook_login()

# Downloading the model and text-generation pipeline

In [None]:
def load_model(device_type, model_id):
    """Load a causal language model and wrap it as a LangChain LLM.

    Args:
        device_type: Target device name, e.g. "cuda" or "cpu". 8-bit
            quantized fp16 loading is used only for CUDA devices; any other
            device loads the plain float32 weights.
        model_id: Hugging Face Hub model id to download (cached in ./model/).

    Returns:
        A ``(tokenizer, local_llm)`` tuple, where ``local_llm`` is a
        ``HuggingFacePipeline`` wrapping a "text-generation" pipeline.
    """
    print(f"Loading Model: {model_id}, on: {device_type}")

    tokenizer = AutoTokenizer.from_pretrained(
        model_id, cache_dir="./model/"
        )

    # 8-bit (bitsandbytes) loading needs a CUDA GPU, so it is only requested
    # when the caller asked for one. Without this the "cpu" fallback selected
    # by the notebook could never load the model.
    on_gpu = str(device_type).startswith("cuda")

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        cache_dir="./model/",
        torch_dtype=torch.float16 if on_gpu else torch.float32,
        # NOTE(review): trust_remote_code executes Python shipped in the model
        # repository; only use it with repositories you trust.
        trust_remote_code=True,
        load_in_8bit=on_gpu,
        )

    generation_config = GenerationConfig.from_pretrained(model_id)

    # Create a pipeline for text generation
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=4096,
        top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
    )

    local_llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature': 0})
    print("Local LLM Loaded")

    return tokenizer, local_llm

In [None]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): SHOW_SOURCES is not referenced anywhere else in the visible notebook.
SHOW_SOURCES = True

In [None]:
# Load the tokenizer and the LangChain-wrapped LLM once; later cells use both as notebook globals.
tokenizer, LLM = load_model(device, "krthk/llama-2-7b-chat-finetuned") # Hugging Face model id

# Dealing with the documents (text extraction + clustering + adjusting chunk tokens)

In [None]:
import os
from glob import glob
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

In [None]:
def load_documents_and_chunks(directory):
  """Load every PDF in ``directory`` and return its pages as text chunks.

  Args:
    directory: Folder containing the ``*.pdf`` files. A trailing path
      separator is optional.

  Returns:
    A ``(text_chunks, raw_text)`` tuple: ``text_chunks`` is a list holding the
    ``page_content`` of each document returned by ``PyPDFLoader.load()``, and
    ``raw_text`` is the concatenation of those strings in the same order.
  """
  # os.path.join works whether or not `directory` ends with a separator
  # (plain string concatenation silently matched nothing without one).
  # Sorting makes the chunk order reproducible, since glob order is unspecified.
  pdf_paths = sorted(glob(os.path.join(directory, "*.pdf")))

  documents = []
  for item_path in pdf_paths:
    documents.extend(PyPDFLoader(item_path).load())

  # Each loaded document is used as one chunk as-is; no further splitting is applied.
  text_chunks = [doc.page_content for doc in documents]
  raw_text = "".join(text_chunks)

  return text_chunks, raw_text

def calc_tokens(text):
  """Return how many tokens the notebook-level ``tokenizer`` produces for ``text``."""
  pieces = tokenizer.tokenize(text)
  return len(pieces)

In [None]:
# Load the PDFs: `chunks` feeds the clustering step below.
# NOTE(review): `raw_text` is not used again in the visible cells.
chunks, raw_text = load_documents_and_chunks("/content/documents/")

In [None]:
# Print the token count of every chunk (one number per line) to inspect chunk sizes.
for i in chunks:
  print(calc_tokens(i))

In [None]:
len(chunks)

In [None]:
chunks[10]

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Sentence-embedding model; cluster_sentences reads it through the global name `model`.
model = SentenceTransformer('all-MiniLM-L6-v2') # Embedding model

def cluster_sentences(sentences, distance_threshold=1.3):
    """Group texts by embedding similarity using agglomerative (Ward) clustering.

    Args:
        sentences: Iterable of strings to cluster.
        distance_threshold: Linkage distance threshold passed to
            ``AgglomerativeClustering``; the number of clusters is not fixed
            (``n_clusters=None``).

    Returns:
        A list of clusters, each a list of the input strings that received the
        same cluster label. Clusters are ordered by first appearance and keep
        the input order inside each cluster. An empty input yields ``[]``; a
        single input yields one single-element cluster.
    """
    sentences = list(sentences)

    # AgglomerativeClustering.fit requires at least two samples, so 0 or 1
    # texts are handled here instead of raising from scikit-learn.
    if len(sentences) < 2:
        return [sentences] if sentences else []

    # `model` is the notebook-level SentenceTransformer instance.
    sentence_embeddings = model.encode(sentences)

    clustering_model = AgglomerativeClustering(distance_threshold=distance_threshold, n_clusters=None, linkage='ward')
    clustering_model.fit(sentence_embeddings)

    # labels_[i] is the cluster label assigned to sentences[i].
    clustered_sentences = {}
    for sentence, cluster_id in zip(sentences, clustering_model.labels_):
        clustered_sentences.setdefault(cluster_id, []).append(sentence)

    return list(clustered_sentences.values())

In [None]:
# Group the chunks into clusters of similar content (default distance threshold).
clusters = cluster_sentences(chunks)

In [None]:
# Print how many chunks each cluster holds.
# cluster_sentences returns a list of lists, so the "String" branch is not expected to run.
for i in clusters:
  if isinstance(i, list):
    print("List length:", len(i))
  else:
    print("String")


In a cluster of `n` chunks, each containing a certain number of tokens, the combined token size of the cluster is `tokens[0] + tokens[1] + ... + tokens[n-1]`. Occasionally this total exceeds the input token limit of the Large Language Model (LLM), so the cluster cannot be processed in a single call. To address this, oversized clusters are redistributed into smaller ones, with a token limit imposed on each cluster. Because the chunks within a cluster contain similar information, it does not matter which chunks end up together after a split: each resulting cluster still holds similar content.

✅ The result is a set of clusters of similar information, each of which can fit into the LLM.


In [None]:
import math
import numpy as np

def _pack_chunks(chunks, limit, token_counter):
  """Greedily pack consecutive chunks into parts whose joined text stays within ``limit`` tokens."""
  parts = []
  current = []
  for chunk in chunks:
    candidate = current + [chunk]
    # Close the running part only if it already holds something; a single
    # chunk that is over the limit on its own still gets its own part.
    if current and token_counter("\n".join(candidate)) > limit:
      parts.append(current)
      current = [chunk]
    else:
      current = candidate
  if current:
    parts.append(current)
  return parts


def redistribution(listoflistofchunks, limit=2048, token_counter=None):
  """Split clusters whose joined text exceeds a token limit.

  Args:
    listoflistofchunks: List of clusters, each a list of chunk strings.
    limit: Maximum number of tokens allowed for the newline-joined text of
      one cluster. Defaults to 2048.
    token_counter: Callable mapping a string to its token count. Defaults to
      the notebook-level ``calc_tokens``.

  Returns:
    A list of clusters (lists of ``str``). Clusters within the limit are kept
    whole; larger ones are split into consecutive, non-empty parts that each
    fit the limit. The only exception is a single chunk that is itself over
    the limit, which becomes a part on its own.

  Raises:
    ValueError: If ``limit`` is not positive.
  """
  if limit <= 0:
    raise ValueError("limit must be a positive number of tokens")
  if token_counter is None:
    token_counter = calc_tokens

  redistributed = []
  for index, chunks in enumerate(listoflistofchunks):
    tokens = token_counter("\n".join(chunks))

    print(index, tokens)

    if tokens <= limit:
      redistributed.append(list(chunks))
      continue

    # Splitting by equal chunk counts can leave parts over the limit and can
    # produce empty parts, so pack by measured token count instead.
    parts = _pack_chunks(chunks, limit, token_counter)
    print(f"chunk {index} of tokens {tokens} is splitted into {len(parts)} parts")
    redistributed.extend(parts)
  return redistributed

In [None]:
# Split any cluster that is too large for the LLM into smaller clusters.
limit_clusters = redistribution(clusters)

In [None]:
# Print the chunk count and the joined token count of every redistributed cluster.
# redistribution returns a list of lists, so the "String" branch is not expected to run.
for i in limit_clusters:
  if isinstance(i, list):
    print("List length:", len(i), "tokens:", calc_tokens("\n".join(i)))
  else:
    print("String")

In [None]:
# Join the chunks of each cluster into one newline-separated string, so that
# every cluster can be passed to a LangChain prompt template as a single text.

tosendlangchain = ["\n".join(i) for i in limit_clusters]

# Print the token count of each joined cluster text.
for i in tosendlangchain:
  print(calc_tokens(i))

# Utilizing LangChain for creating the LLM chain and prompt templates

In [None]:
from langchain import PromptTemplate, LLMChain

def cluster_summaries(text_chunks_list):
  """Summarize each text in ``text_chunks_list`` independently with the shared LLM.

  Returns the list of summary strings, in the same order as the inputs.
  """
  template = """As an expert summarizer, produce a concise yet comprehensive summary of the given text,
  whether it's an article, blog post, conversation, or passage without adding your interpretations.
  Your summary should exhibit great detail, depth, and complexity while ensuring clarity and conciseness.
  Employ the following content to create the summary:
  {text}"""
  summarizer = LLMChain(
      prompt=PromptTemplate(template=template, input_variables=["text"]),
      llm=LLM,
  )

  # One input mapping per cluster text, keyed by the template variable name.
  payload = []
  for chunk_text in text_chunks_list:
    payload.append({'text': chunk_text})

  results = summarizer.apply(payload)

  summaries = []
  for result in results:
    summaries.append(result['text'])
  return summaries


def generate_final_summary(text_chunk):
  """Run the final summarization prompt on ``text_chunk`` and return the LLM's summary."""
  template = """I want you to act as a text summarizer to help me create a brief understandable summary of the text I provide,
  whether it's an article, blog post, conversation, or passage.
  The summary should be in good length, expressing the points and concepts written in the original text without adding your interpretations.
  Breifly summarize the following text for me:
  {text}"""

  final_prompt = PromptTemplate(template=template, input_variables=["text"])
  return LLMChain(prompt=final_prompt, llm=LLM).run(text_chunk)

In [None]:
def recursive_summary(listofclusters, token_limit=4096, prompt_overhead=90):
  """Reduce a list of texts to one final summary, map-reduce style.

  While the newline-joined texts (plus ``prompt_overhead``) do not fit within
  ``token_limit``, every text is summarized on its own with
  ``cluster_summaries`` and the check is repeated on the summaries. Once the
  joined text fits, it is passed to ``generate_final_summary``.

  Args:
    listofclusters: List of strings to summarize.
    token_limit: Token budget for the final prompt. Defaults to 4096.
    prompt_overhead: Tokens reserved on top of the joined text. Defaults to 90.

  Returns:
    The final summary string, or ``""`` when ``listofclusters`` is empty.

  Raises:
    RuntimeError: If a summarization round does not reduce the token count,
      which would otherwise repeat LLM calls until the recursion limit.
  """
  current = list(listofclusters)
  if not current:
    # Nothing to summarize; avoid prompting the LLM with an empty text.
    return ""

  previous_tokens = None
  while True:
    joined = "\n".join(current)
    tokens = calc_tokens(joined)
    print("current tokens:", tokens)

    if tokens + prompt_overhead <= token_limit:
      # The joined text fits within the budget, so produce the final summary.
      return generate_final_summary(joined)

    if previous_tokens is not None and tokens >= previous_tokens:
      raise RuntimeError(
          f"summaries stopped shrinking ({previous_tokens} -> {tokens} tokens); "
          f"cannot fit within {token_limit} tokens"
      )
    previous_tokens = tokens

    # Otherwise summarize each text separately and try again on the summaries.
    current = cluster_summaries(current)
    print(current, "\n\n\n")

In [None]:
# Produce the final multi-document summary from the per-cluster texts.
summary = recursive_summary(tosendlangchain)

In [None]:
summary