In [None]:
# Default system prompt.
# The literal carries Llama-2 chat markup ("<s>[INST] <<SYS>> ... <</SYS>>"); a later
# cell passes it to HuggingFaceLLM as `system_prompt`, and that cell's query wrapper
# appends the closing " [/INST]" after the user query.
# NOTE(review): the string starts with a newline and a space before "<s>", and ends
# with a newline after "<</SYS>>" -- confirm this whitespace (and the literal "<s>",
# which the tokenizer may also add on its own) matches the format the model expects.

system_prompt ="""
 <s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.
If you don't know the answer to a question, please don't share false information. <</SYS>>
"""

In [None]:
import torch
from pathlib import Path
from llama_index.prompts.prompts import SimpleInputPrompt
from llama_index import set_global_service_context
from llama_index import ServiceContext
from llama_index import VectorStoreIndex
from llama_index import download_loader
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.embeddings import LangchainEmbedding
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_index.llms import HuggingFaceLLM



In [None]:
#Load LLM

def model_tokenizer_embedder(model_name, auth_token, embedding_model_name="Word2Vec"):
    """Load the tokenizer, the causal language model and the embedding model.

    Args:
        model_name: Hugging Face Hub id of the causal LM (and its tokenizer).
        auth_token: Hugging Face access token, or None for public models.
        embedding_model_name: model id handed to ``HuggingFaceEmbeddings``.
            Defaults to the value the notebook originally hard-coded.

    Returns:
        A ``(tokenizer, model, embedding_llm)`` tuple.
    """
    # Every statement must sit at the same indentation level; the mixed 2/4-space
    # indentation this body used to have is an IndentationError.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, cache_dir="./model/", use_auth_token=auth_token
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        cache_dir="./model/",
        use_auth_token=auth_token,
        torch_dtype=torch.float16,
        load_in_8bit=True,
    )

    # NOTE(review): "Word2Vec" does not look like a sentence-transformers model id --
    # confirm which embedding model is intended and pass it explicitly.
    embedding_llm = LangchainEmbedding(
        HuggingFaceEmbeddings(model_name=embedding_model_name)
    )

    return tokenizer, model, embedding_llm


import os

# Never commit access tokens to a notebook: read the token from the environment
# instead (None is fine for public models). Any token that was previously pasted
# into this cell must be treated as leaked and revoked in the Hugging Face settings.
hf_auth_token = os.environ.get("HF_TOKEN")

tokenizer, model, embedding_llm = model_tokenizer_embedder(
    "krthk/llama-2-7b-hs-tuned", hf_auth_token
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT model loaded successfully!


In [None]:
#Load PDFs

from google.colab import files
import PyPDF2
import io

def extract_text_from_pdf(file):
    """Return the text of all pages of a PDF, each page followed by a newline.

    Args:
        file: path or binary file-like object accepted by ``PyPDF2.PdfReader``.
    """
    reader = PyPDF2.PdfReader(file)
    page_texts = [page.extract_text() + "\n" for page in reader.pages]
    return "".join(page_texts)


# Colab upload widget: maps each uploaded file name to its raw bytes.
uploaded = files.upload()


# Keep only PDFs and turn each one into a plain-text string.
extracted_texts = []
for filename, payload in uploaded.items():
    if not filename.lower().endswith('.pdf'):
        continue
    text = extract_text_from_pdf(io.BytesIO(payload))
    extracted_texts.append(text)

# From here on `uploaded` is the list of extracted document texts.
uploaded = extracted_texts

In [None]:
# RAG
from llama_index import Document

# The system prompt opens the [INST] block; the wrapper closes it after the query.
query_wrapper_prompt = SimpleInputPrompt("{query_str} [/INST]")

llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    model=model,
    tokenizer=tokenizer,
)

service_context = ServiceContext.from_defaults(
    chunk_size=1024, llm=llm, embed_model=embedding_llm
)

set_global_service_context(service_context)

# `uploaded` holds plain strings, but VectorStoreIndex.from_documents expects
# llama_index Document objects, so wrap each extracted text first.
rag_documents = [Document(text=text) for text in uploaded]

index = VectorStoreIndex.from_documents(rag_documents)
query_engine = index.as_query_engine()

In [None]:
from llama_index import Document

# VectorStoreIndex.from_documents expects Document objects, not the raw strings
# stored in `uploaded`, so wrap each extracted text before indexing.
documents = [Document(text=text) for text in uploaded]
index = VectorStoreIndex.from_documents(documents)

query_engine = index.as_query_engine()

In [None]:
# Ask the query engine a question about the indexed PDFs and print its answer.
analyst_question = "What are the analysts recommendation on PERSISTENT SYSTEMS?"
response = query_engine.query(analyst_question)
print(response)

Persistent Systems has TTM P/E ratio 44.24 as compared to the sector P/E of 31.96. There are 29 analysts who have initiated coverage on Persistent Systems. There are 4 analysts who have given it a strong buy rating & 9 analysts have given it a buy rating. 5 analysts have given the stock a sell rating.


In [None]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import gensim.downloader as api

# Tokenizer data required by nltk's sent_tokenize / word_tokenize.
nltk.download('punkt')

# Pre-trained word vectors fetched through gensim's downloader.
WORD2VEC_MODEL_NAME = 'word2vec-google-news-300'
word2vec_model = api.load(WORD2VEC_MODEL_NAME)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def chunk_document(document, chunk_size=5):
    """Split a document into consecutive, non-overlapping chunks of sentences.

    The earlier implementation produced every overlapping 5-word window of the
    document, so each word appeared in up to ``chunk_size`` chunks and the text
    reassembled from those chunks repeated itself. Grouping whole sentences keeps
    each piece of text exactly once and keeps sentence boundaries intact for the
    later sentence-level steps.

    Args:
        document: the raw document text.
        chunk_size: number of sentences per chunk (the last chunk may be shorter).

    Returns:
        A list of chunk strings; empty if the document contains no sentences.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Original casing is kept: the vectorizers applied later lowercase on their own.
    sentences = sent_tokenize(document)
    return [
        ' '.join(sentences[start:start + chunk_size])
        for start in range(0, len(sentences), chunk_size)
    ]

def vectorize_chunk(chunk, model):
    """Embed a text chunk as the mean of its in-vocabulary word vectors.

    Args:
        chunk: text to embed; it is lowercased and word-tokenized first.
        model: word-vector model exposing ``key_to_index``, ``vector_size`` and
            item lookup by word.

    Returns:
        The mean word vector, or a zero vector of length ``model.vector_size``
        when none of the tokens is in the model's vocabulary.
    """
    vocabulary = model.key_to_index
    tokens = word_tokenize(chunk.lower())
    known_vectors = [model[token] for token in tokens if token in vocabulary]

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)


In [None]:
# clustering similar parts of the documents

documents = uploaded

chunks = [chunk for doc in documents for chunk in chunk_document(doc)]

# vectorize_chunk needs the word-vector model as its second argument.
chunk_vectors = np.array([vectorize_chunk(chunk, word2vec_model) for chunk in chunks])

# Never ask for more clusters than there are chunks.
n_clusters = min(5, len(chunks))

if len(chunks) < 2:
    # AgglomerativeClustering rejects inputs with fewer than two samples, so a
    # lone chunk simply forms its own cluster.
    cluster_labels = np.zeros(len(chunks), dtype=int)
else:
    clustering = AgglomerativeClustering(n_clusters=n_clusters)
    cluster_labels = clustering.fit_predict(chunk_vectors)

# One concatenated text per cluster, chunks kept in document order.
clustered_docs = [
    ' '.join(chunk for chunk, label in zip(chunks, cluster_labels) if label == j)
    for j in range(n_clusters)
]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# using mmr to eliminate redundant text

def mmr(clustered_docs, word2vec_model, top_n=5):
    """Pick up to ``top_n`` sentences by maximal marginal relevance.

    Each candidate is scored as its cosine similarity to the mean sentence vector
    minus its highest cosine similarity to any sentence already selected; the
    best-scoring candidate is taken on every round.

    Args:
        clustered_docs: list of text strings; each is split into sentences on '. '.
        word2vec_model: word-vector model exposing ``key_to_index``,
            ``vector_size`` and item lookup by word.
        top_n: maximum number of sentences to return.

    Returns:
        The selected sentences in selection order. Fewer than ``top_n`` are
        returned when the input has fewer distinct sentences.
    """
    all_sentences = [sentence for doc in clustered_docs for sentence in doc.split('. ')]
    if not all_sentences:
        return []

    def vectorize(sentence):
        # Mean of the in-vocabulary word vectors; zero vector if none are known.
        vectors = [word2vec_model[word] for word in word_tokenize(sentence.lower())
                   if word in word2vec_model.key_to_index]
        return np.mean(vectors, axis=0) if vectors else np.zeros(word2vec_model.vector_size)

    # Embed every sentence once instead of re-embedding inside the selection loop.
    sentence_vectors = np.array([vectorize(sentence) for sentence in all_sentences])
    doc_vector = np.mean(sentence_vectors, axis=0)
    sim_to_doc = cosine_similarity(sentence_vectors, [doc_vector])[:, 0]

    selected_sentences = []
    selected_texts = set()
    # Highest similarity of each sentence to anything selected so far
    # (None until the first selection, where the penalty is 0).
    max_sim_to_selected = None

    for _ in range(min(top_n, len(all_sentences))):
        # Sentences whose text was already chosen (including duplicates) are skipped.
        candidates = [i for i, sentence in enumerate(all_sentences)
                      if sentence not in selected_texts]
        if not candidates:
            # Only duplicates of selected sentences remain; nothing more to pick.
            break

        scores = sim_to_doc[candidates]
        if max_sim_to_selected is not None:
            scores = scores - max_sim_to_selected[candidates]

        best = candidates[int(np.argmax(scores))]
        selected_sentences.append(all_sentences[best])
        selected_texts.add(all_sentences[best])

        sims_to_best = cosine_similarity(sentence_vectors, sentence_vectors[best:best + 1])[:, 0]
        if max_sim_to_selected is None:
            max_sim_to_selected = sims_to_best
        else:
            max_sim_to_selected = np.maximum(max_sim_to_selected, sims_to_best)

    return selected_sentences

In [None]:
# Run MMR on each cluster separately so every cluster contributes its own sentences.
mmr_results = list(
    map(lambda cluster_text: mmr([cluster_text], word2vec_model), clustered_docs)
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np

In [None]:
# using LSA to identify key information from the documents

mmr_sentences = [sentence for sublist in mmr_results for sentence in sublist]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(mmr_sentences)

# TruncatedSVD cannot extract more components than the TF-IDF matrix has features,
# and there is no point in asking for more topics than there are sentences.
n_lsa_components = max(1, min(5, min(X.shape)))
# Fixed seed so the randomized SVD gives the same result on every run.
lsa = TruncatedSVD(n_components=n_lsa_components, random_state=0)
X_lsa = lsa.fit_transform(X)

# For each latent topic take the sentence with the highest weight; the same sentence
# can win several topics, so drop repeats while keeping the topic order.
top_sentence_per_topic = np.argmax(X_lsa, axis=0)
important_sentences_indices = list(dict.fromkeys(int(i) for i in top_sentence_per_topic))
important_information = [mmr_sentences[index] for index in important_sentences_indices]

In [None]:
def recursive_summary(chunks, max_tokens=4000, summarize=None):
    """Summarize ``chunks``, recursing when the text is too long for one call.

    Length is measured in whitespace-separated words. Text under ``max_tokens``
    words is summarized directly. Longer text is cut into pieces that each fit,
    every piece is summarized, and the partial summaries are summarized again.

    Args:
        chunks: list of text strings to summarize.
        max_tokens: word budget for a single summarization call.
        summarize: callable mapping a text to its summary; defaults to the
            notebook's ``generate_final_summary``.

    Returns:
        The final summary returned by ``summarize``.

    Raises:
        ValueError: if ``max_tokens`` is smaller than 2.
    """
    if max_tokens < 2:
        raise ValueError("max_tokens must be at least 2")
    if summarize is None:
        summarize = generate_final_summary

    text = " ".join(chunks)
    # Count words of the joined text (len() of the string itself has no .split()).
    words = text.split()
    if len(words) < max_tokens:
        return summarize(text)

    # Each piece must stay strictly under the budget.
    piece_size = max_tokens - 1
    partial_summaries = [
        summarize(" ".join(words[start:start + piece_size]))
        for start in range(0, len(words), piece_size)
    ]

    merged_words = " ".join(partial_summaries).split()
    if len(merged_words) >= len(words):
        # The summaries did not shrink the text; truncate instead of recursing
        # forever on input that never gets shorter.
        return summarize(" ".join(merged_words[:piece_size]))

    return recursive_summary(partial_summaries, max_tokens, summarize)

In [None]:

def generate_final_summary(text_chunk, summarizer=None):
    """Ask an LLM for an abstract summary of ``text_chunk``.

    Args:
        text_chunk: the text to summarize.
        summarizer: object with a ``complete(prompt)`` method whose result
            converts to the generated text via ``str()``. Defaults to the
            notebook-level ``llm`` (the HuggingFaceLLM built in the RAG cell).

    Returns:
        The generated summary as a string.
    """
    template = """Summarize as a abstract summary: \
    {text}"""

    if summarizer is None:
        # Use the model that is already loaded rather than a chain built around
        # the placeholder string "LLama2", which is not a usable LLM object.
        summarizer = llm

    prompt = template.format(text=text_chunk)
    summary = str(summarizer.complete(prompt))

    return summary

In [None]:
# Summarize the key sentences picked by the LSA step; the bare name on the last
# line makes the notebook display the result.
final_summary = recursive_summary(chunks=important_information)
final_summary



The analyst reviewed technology services provider Persistent Systems, shipping company Great Eastern Shipping, tech giants Apple and Microsoft, and retail chain Walmart.

Persistent Systems is well-positioned to benefit from strong demand for digital transformation services and migration of enterprise workloads to the cloud, according to the analyst. Key growth drivers highlighted include talent shortage leading to increased outsourcing of software product development, partnerships with industry leaders, and strategic acquisitions to expand digital capabilities. Given Persistent's leadership, partnerships, and double-digit revenue growth, the analyst rated it a Buy.

The analyst pointed out Great Eastern Shipping’s earnings have declined due to lower freight rates and overcapacity in the shipping industry. While valuations look attractive for Great Eastern, the analyst maintained a Hold rating owing to the weak industry outlook, noting upside may be limited until freight rates recover 