In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 --upgrade
!pip install langchain einops accelerate transformers bitsandbytes scipy
!pip install xformers sentencepiece
!pip install llama-index llama_hub
!pip install sentence-transformers pypdf openai glob2

#Loading Model and tokenizer

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, GenerationConfig
import huggingface_hub
from langchain import HuggingFacePipeline


# Loading model and tokenizer from huggingface
def load_model(device_type, model_id):
    print(f"Loading Model: {model_id}, on: {device_type}")

    tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./model/")

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        cache_dir="./model/",
        torch_dtype=torch.float16,
        trust_remote_code=True,
        load_in_8bit=True,
    )

    generation_config = GenerationConfig.from_pretrained(model_id)

    # Create a pipeline for text generation
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=4096,
        top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
    )

    local_llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={"temperature": 0})
    print("Local LLM Loaded")

    return tokenizer, local_llm

In [3]:
tokenizer, LLM = load_model("cuda", "krthk/llama-2-7b-hs-tuned")  # finetuned

Loading Model: krthk/llama-2-7b-hs-tuned, on: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

Local LLM Loaded


In [1]:
import glob
import re
import pandas as pd
import string
from langchain.document_loaders import PyPDFLoader


def clean_document(text):  # remove unnecessary characters reducing amibuity
    text = re.sub("\s+", " ", text)
    text = re.sub(r"https?://\S+", "", text)
    text = re.sub(r"\.{2,}", ".", text)
    text = "".join([i for i in text if i not in string.punctuation or i == "."])
    text = re.sub(r"\d+", "", text)
    return " ".join([word for word in text.split() if len(word) > 1])


def load_documents_and_chunks(
    directory,
):  # Given a directory retrive the documents in it
    documents = []
    dataframe = {"Document": [], "Content": []}
    full_string = ""

    for item_path in glob.glob(directory + "*.pdf"):
        loader = PyPDFLoader(item_path)
        documents.extend(loader.load())

    for i, doc in enumerate(documents):
        cleaned_string = clean_document(doc.page_content)
        doc.page_content = cleaned_string
        full_string += cleaned_string
        dataframe["Document"].append(f"Document_{i}")
        dataframe["Content"].append(cleaned_string)

    dataframe = pd.DataFrame(dataframe)

    # Documents, Documents in a dataframe, full concatinated string of all documents
    return documents, dataframe, full_string

In [2]:
documents, dataframe, full_String = load_documents_and_chunks("/content/documents/")

In [None]:
print(dataframe)

       Document                                            Content
0    Document_0  HYBRID LONG DOCUMENT SUMMARIZATION USING CFFAR...
1    Document_1  HYBRID LONG DOCUMENT SUMMARIZATION USING CFFAR...
2    Document_2  HYBRID LONG DOCUMENT SUMMARIZATION USING CFFAR...
3    Document_3  HYBRID LONG DOCUMENT SUMMARIZATION USING CFFAR...
4    Document_4  HYBRID LONG DOCUMENT SUMMARIZATION USING CFFAR...
5    Document_5  HYBRID LONG DOCUMENT SUMMARIZATION USING CFFAR...
6    Document_6  HYBRID LONG DOCUMENT SUMMARIZATION USING CFFAR...
7    Document_7  HYBRID LONG DOCUMENT SUMMARIZATION USING CFFAR...
8    Document_8  HYBRID LONG DOCUMENT SUMMARIZATION USING CFFAR...
9    Document_9  HYBRID LONG DOCUMENT SUMMARIZATION USING CFFAR...
10  Document_10  HYBRID LONG DOCUMENT SUMMARIZATION USING CFFAR...
11  Document_11  HYBRID LONG DOCUMENT SUMMARIZATION USING CFFAR...
12  Document_12  HYBRID LONG DOCUMENT SUMMARIZATION USING CFFAR...
13  Document_13  HYBRID LONG DOCUMENT SUMMARIZATION USING CFFA

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Maximal Marginal Relevance to remove redundancy in the documents
def mmr(documents, dataframe, full_string, top_n=5, lambda_param=0.5):
    vectorizer = CountVectorizer().fit_transform(dataframe["Content"])
    similarity_matrix = cosine_similarity(vectorizer, vectorizer)

    def mmr_score(selected, candidate, lambda_param):
        sim_selected = cosine_similarity(
            vectorizer[selected, :], vectorizer[candidate, :]
        )
        sim_candidate = cosine_similarity(
            vectorizer[candidate, :], vectorizer[candidate, :]
        )

        return lambda_param * sim_candidate - (1 - lambda_param) * sim_selected.mean()

    selected_docs = []
    redundant_docs = []
    remaining_docs = list(range(len(documents)))

    while len(selected_docs) < top_n:
        mmr_scores = []
        for candidate in remaining_docs:
            if not selected_docs:
                mmr_scores.append((candidate, 0.0))
            else:
                score = mmr_score(selected_docs, candidate, lambda_param)
                mmr_scores.append((candidate, score))

        mmr_scores.sort(key=lambda x: x[1], reverse=True)
        selected_doc_index = mmr_scores[0][0]

        selected_docs.append(selected_doc_index)
        remaining_docs.remove(selected_doc_index)

    cleaned_documents = []

    for i, doc_index in enumerate(selected_docs):
        selected_content = dataframe.loc[doc_index, "Content"]
        redundant_content = ""

        for redundant_index in redundant_docs:
            redundant_content += dataframe.loc[redundant_index, "Content"]

        cleaned_content = selected_content.replace(redundant_content, "")
        cleaned_documents.append(cleaned_content)

    return cleaned_documents

In [None]:
cleaned_documents = mmr(documents, dataframe, full_String, top_n=len(documents) / 1.5)

len(cleaned_documents)

32

In [None]:
cleaned_documents[0]

'HYBRID LONG DOCUMENT SUMMARIZATION USING CFFAR AND CHATGPT RACTICAL STUDY Guang Lu Sylvia B. Larcher Institute of Communication and Marketing Lucerne University of Applied Sciences and Arts Zentralstrasse Lucerne Switzerland guang.luhslu.ch sylvia.bendelhslu.chTu Tran getAbstract AG Alpenquai Lucerne Switzerland tu.trangetAbstract.com June ABSTRACT Text summarization is an important downstream natural language processing NLP task that chal lenges both the understanding and generation capabilities of language models. Thanks to large language models LLMs and techniques for finetuning models in machine learning significant progress has been made in automatically summarizing short texts such as news articles often leading to very satisfactory machinegenerated results. In contrast summarizing long documents still remains major challenge. This is partly due to the complex nature of contextual information in long texts but also due to the lack of opensource benchmarking datasets and the corr

In [6]:
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

embedding_model = SentenceTransformer(
    "all-MiniLM-L12-v2"
)  # Model to create vector embeddings

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2048,
    chunk_overlap=128,  # similar token len in overlap of text between chunks
    is_separator_regex=False,
)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
chunks = text_splitter.split_text(
    " ".join(cleaned_documents)
)  # Create chunks of size 2046

len(chunks)

57

In [None]:
from sklearn.cluster import AgglomerativeClustering


# AgglomerativeClustering for vector clustering
def cluster_sentences(sentences, embedding_model, distance_threshold=1.5):
    sentence_embeddings = embedding_model.encode(sentences)

    clustering_model = AgglomerativeClustering(
        distance_threshold=distance_threshold, n_clusters=None, linkage="ward"
    )
    clustering_model.fit(sentence_embeddings)

    clustered_sentences = {}
    for sentence_id, cluster_id in enumerate(clustering_model.labels_):
        if cluster_id not in clustered_sentences:
            clustered_sentences[cluster_id] = []
        clustered_sentences[cluster_id].append(sentences[sentence_id])

    return [cluster for cluster in clustered_sentences.values()]

In [None]:
clusters = cluster_sentences(
    chunks, embedding_model
)  # Cluster sentences based on similarity score

for i in clusters:
    print(len(i), "chunks")

7 chunks
16 chunks
9 chunks
4 chunks
7 chunks
10 chunks
4 chunks


In [None]:
from transformers import AutoTokenizer
import numpy as np
import math


def calc_tokens(text, tokenizer):  # Calculates number of tokens using llama2 tokenizer
    return len(tokenizer.tokenize(text))


def redistribution(
    listoflistofchunks, tokenizer
):  # Limit the clusters maximum of 3000 tokens
    limit = 3072
    redistributed = []

    for index, chunks in enumerate(listoflistofchunks):
        tokens = calc_tokens("\n".join(chunks), tokenizer)

        if tokens < limit:  # Token size of cluster is lower than the limit
            print(f"chunk {index} of tokens {tokens} is fine")
            redistributed.extend([chunks])
        else:  # If limit exceeded divide into suitable parts creating new cluster
            print(
                f"chunk {len(chunks)} of tokens {tokens} is splitted into {math.ceil(tokens/limit)} parts"
            )
            partitionlist = np.array_split(chunks, math.ceil(tokens / limit))
            redistributed.extend([list(x) for x in partitionlist])
    return redistributed

In [None]:
limitclusters = redistribution(clusters, tokenizer)

print("------------------------------------------")

for i in limitclusters:
    print("chunk:", len(i), "- tokens:", calc_tokens("\.".join(i), tokenizer))

chunk 7 of tokens 3428 is splitted into 2 parts
chunk 16 of tokens 8378 is splitted into 3 parts
chunk 9 of tokens 4134 is splitted into 2 parts
chunk 3 of tokens 1755 is fine
chunk 7 of tokens 3157 is splitted into 2 parts
chunk 10 of tokens 4567 is splitted into 2 parts
chunk 6 of tokens 1863 is fine
------------------------------------------
chunk: 4 - tokens: 2122
chunk: 3 - tokens: 1304
chunk: 6 - tokens: 3350
chunk: 5 - tokens: 2607
chunk: 5 - tokens: 2418
chunk: 5 - tokens: 2333
chunk: 4 - tokens: 1800
chunk: 4 - tokens: 1755
chunk: 4 - tokens: 1755
chunk: 3 - tokens: 1401
chunk: 5 - tokens: 2294
chunk: 5 - tokens: 2272
chunk: 4 - tokens: 1863


In [None]:
from summa.summarizer import summarize

key_information_extracted = []
for cluster in limitclusters:
    key_information_extracted.append(summarize(" ".join(cluster), ratio=0.5))

print(
    "length:",
    len(" ".join(key_information_extracted)),
    "tokens:",
    calc_tokens(" ".join(key_information_extracted), tokenizer),
)

length: 75652 tokens: 18028


In [None]:
from langchain import PromptTemplate, LLMChain


def cluster_summaries(
    text_chunks_list,
):  # Takes in chunks for recursive summary (Map Reduce)
    template = """Summarize as a abstract summary: \
    {text}"""
    prompt = PromptTemplate(template=template, input_variables=["text"])
    llm_chain = LLMChain(prompt=prompt, llm=LLM)

    ip = [{"text": i} for i in text_chunks_list]
    summary = llm_chain.apply(ip)

    return [i["text"] for i in summary]


def generate_final_summary(
    text_chunk,
):  # Takes the final text and generates the final summary
    template = """Summarize as a abstract summary: \
    {text}"""

    prompt = PromptTemplate(template=template, input_variables=["text"])
    llm_chain = LLMChain(prompt=prompt, llm=LLM)

    summary = llm_chain.run(text_chunk)

    return summary

In [None]:
def recursive_summary(chunks):
    tokens = calc_tokens(" ".join(chunks))
    if tokens < 4000:
        return generate_final_summary(" ".join(chunks))
    else:
        listofsummaries = cluster_summaries(chunks)
        return recursive_summary(listofsummaries)

In [None]:
final_summary = recursive_summary(key_information_extracted)

final_summary

This comprehensive study delves into the intricate task of summarizing lengthy documents, leveraging a hybrid approach combining ChatGPT and CFFAR (Coarse-to-Fine Facet-Aware Ranking), an extractive summarization model. While ChatGPT displays promise, persisting challenges in text coherence, faithfulness, and style prompt the integration of CFFAR to augment summarization quality. The study underlines the preliminary nature of ChatGPT for long document summarization, suggesting it as an inspirational tool for human editors.
The authors anticipate that their work will serve as a guide for Natural Language Processing (NLP) researchers, aligning ChatGPT's capabilities with practitioners' needs. Stressing the necessity for ongoing research, the study emphasizes the need to test the hybrid summarization pipeline and construct specialized evaluation frameworks for long document summarization. The provided text delves into an array of topics related to multidocument summarization (MDS) dataset

In [None]:
import torch
from glob import glob
from pathlib import Path
from llama_index.prompts.prompts import SimpleInputPrompt
from llama_index import set_global_service_context
from llama_index import ServiceContext
from llama_index import VectorStoreIndex
from llama_index import download_loader
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.embeddings import LangchainEmbedding
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_index.llms import HuggingFaceLLM

system_prompt = """<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.
If you don't know the answer to a question, please don't share false information. <</SYS>>
"""  # Llama2's official system prompt


def model_tokenizer_embedder(model_name, auth_token):
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, cache_dir="./model/", use_auth_token=auth_token
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        cache_dir="./model/",
        use_auth_token=auth_token,
        torch_dtype=torch.float16,
        load_in_8bit=True,
    )

    embedding_llm = LangchainEmbedding(
        HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    )

    return tokenizer, model, embedding_llm


tokenizer, model, embedding_llm = model_tokenizer_embedder(
    "krthk/llama-2-7b-hs-tuned", "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"
)

query_wrapper_prompt = SimpleInputPrompt("{query_str} [/INST]")

llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    model=model,
    tokenizer=tokenizer,
)

service_context = ServiceContext.from_defaults(
    chunk_size=1024, llm=llm, embed_model=embedding_llm
)

set_global_service_context(service_context)

index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

In [8]:
PyMuPDFReader = download_loader("PyMuPDFReader")
loader = PyMuPDFReader()


def load_documents(directory):
    documents = []
    for item_path in glob(directory + "*.pdf"):
        documents.extend(loader.load(file_path=item_path, metadata=True))
    return documents

In [9]:
documents = load_documents("/content/documents/")

In [10]:
index = VectorStoreIndex.from_documents(documents)

query_engine = index.as_query_engine()

In [11]:
response = query_engine.query("What is C2F_FAR?")

print(response)

 As a responsible and ethical AI language model, I must first emphasize that C2F-FAR is not a widely recognized or established term in the field of natural language processing (NLP). However, based on the context and the information provided, I can provide some insights.

C2F-FAR seems to be a variant of the popular open-source library, C2F (Convolutional and Recurrent Feedforward Networks), which is designed for extractive summarization tasks. The "FAR" part of the name could potentially stand for "Far-Reaching Accuracy," implying that the model is capable of producing high-quality summaries that cover a wide scope of information.

Without access to the original source code or detailed documentation, it's difficult to provide a comprehensive analysis of C2F-FAR's performance or features. However, based on the information provided in the context, here are some general insights:

1. C2F-FAR appears to be an extension of the C2F library that incorporates farming techniques to improve the