In [51]:
import PyPDF2
import re
import torch
import ollama
from openai import OpenAI
import spacy
from sentence_transformers import SentenceTransformer, util




In [52]:
# Upper bound, in characters, on a chunk built by the regex-based prepareChunks.
CHUNK_SIZE = 450
# ANSI escape sequences used to colour console output (bright cyan, bright green, reset).
CYAN = '\033[96m'
NEON_GREEN = '\033[92m'
RESET_COLOR = '\033[0m'

In [38]:
def getDataFromPdf(pdf_path):
    """Extract the text of every page of a PDF as one string.

    NOTE: a later cell defines another ``getDataFromPdf`` that additionally
    filters header/footer lines; whichever definition was executed last is
    the one in effect.

    Args:
        pdf_path: Path handed to ``PyPDF2.PdfReader``.

    Returns:
        str: The text of each page that yields any text, each followed by a
        single space, concatenated in page order ('' if no page has text).
    """
    pdf_reader = PyPDF2.PdfReader(pdf_path)
    page_texts = []
    for page in pdf_reader.pages:
        # extract_text() does the actual extraction work, so call it once per
        # page and reuse the result instead of calling it twice.
        page_text = page.extract_text()
        if page_text:
            page_texts.append(page_text + " ")
    return "".join(page_texts)

In [86]:
def getDataFromPdf(pdf_path):
    """Extract PDF text with boilerplate header/footer lines removed.

    This redefines the earlier plain-extraction ``getDataFromPdf``.

    Each page's text is split on newlines. A line is dropped when, after
    stripping surrounding whitespace, it *starts with* one of the unwanted
    patterns (``re.match`` semantics, i.e. anchored at the line start). The
    surviving lines of a page are joined with spaces, and the pages are
    joined with spaces as well. Pages that yield no text are skipped.

    Args:
        pdf_path: Path handed to ``PyPDF2.PdfReader``.

    Returns:
        str: The cleaned text of all pages that produced any text.
    """
    # Compiled once per call rather than rebuilt for every page. The
    # alternatives are tried at the start of each stripped line.
    unwanted_line = re.compile("|".join(f"(?:{pattern})" for pattern in (
        # Print header such as "1/8/25, 10:05 AM sec.gov/Archives/...".
        # "[AP]M" accepts only AM/PM; "[APM]{2}" also accepted e.g. "MM" or "PA".
        r'\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2} [AP]M',
        r'EX-\d+\.\d+',                      # Exhibit reference, e.g. "EX-10.1"
        r"^Page\s*\d+",                      # Matches "Page X"
        r'sec\.gov/Archives/\S+',            # sec.gov archive URL at line start
        r"^(Document Title|Confidential)",   # Common header/footer texts
        r"^SEC\s*Filing\s*Details",          # SEC-specific header
        r"^Table of Contents$",              # Table of Contents header
        r"^(http)",                          # Lines that begin with a link/url
    )))

    pdf_reader = PyPDF2.PdfReader(pdf_path)
    text_list = []

    for page in pdf_reader.pages:
        page_text = page.extract_text()
        if not page_text:
            continue

        # Keep only the lines that do not start with a boilerplate pattern.
        filtered_lines = [
            line for line in page_text.split("\n")
            if not unwanted_line.match(line.strip())
        ]

        # Replace newlines with spaces for cleaner text output
        text_list.append(" ".join(filtered_lines))

    return " ".join(text_list)  # Join all pages with a space

In [40]:
def prepareChunks(text):
    """Split text into sentence-aligned chunks below CHUNK_SIZE and save them.

    NOTE: a later cell defines a spaCy/embedding-based ``prepareChunks``;
    whichever definition was executed last is the one in effect.

    Whitespace runs are collapsed to single spaces, the text is split after
    '.', '!' or '?' followed by spaces, and sentences are packed greedily: a
    sentence joins the current chunk (separated by one space) while the
    joined length stays below ``CHUNK_SIZE`` characters. A single sentence
    that is itself too long becomes a chunk of its own.

    Side effects:
        Overwrites ``vault.txt`` (UTF-8) with every chunk followed by a blank
        line, and prints a confirmation message.

    Args:
        text: Raw document text.

    Returns:
        list[str]: The chunks in document order, without surrounding
        whitespace. Empty input yields an empty list.
    """
    text = re.sub(r'\s+', ' ', text).strip()

    # Split on spaces that follow sentence-ending punctuation.
    sentences = re.split(r'(?<=[.!?]) +', text)

    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # Join with an explicit space; previously the separator was stripped
        # away and consecutive sentences were fused ("One.Two.").
        candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
        if len(candidate) < CHUNK_SIZE:
            current_chunk = candidate
        else:
            # Chunk is full: store it (never store an empty one, which used to
            # happen when the very first sentence was already too long).
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:  # Don't forget the last chunk!
        chunks.append(current_chunk)

    with open("vault.txt", "w", encoding="utf-8") as vault_file:
        for chunk in chunks:
            # Two newlines so chunks are separated by a blank line.
            vault_file.write(chunk + "\n\n")
    print("PDF content stored to vault.txt with each chunk on a separate line.")
    return chunks

In [79]:
# Module-level models used by the prepareChunks defined in this cell; they are
# loaded when the cell runs, not on every call.
nlp = spacy.load("en_core_web_sm")  # spaCy pipeline used for sentence segmentation
embedder = SentenceTransformer("all-MiniLM-L6-v2")  # Efficient semantic embeddings

def prepareChunks(text, chunk_size=1000, output_file="vault.txt", similarity_threshold=0.7):
    """Split text into size-bounded, semantically coherent chunks and save them.

    This redefines the earlier regex-based ``prepareChunks``.

    Sentences come from the module-level spaCy pipeline ``nlp`` and are
    embedded with the module-level ``embedder``. Sentences are packed in
    order; a new chunk is started when adding the next sentence would make
    the space-joined chunk longer than ``chunk_size`` characters, or when the
    cosine similarity between the sentence and the mean embedding of the
    current chunk falls below ``similarity_threshold``. A sentence that alone
    exceeds ``chunk_size`` becomes a chunk of its own.

    Side effects:
        Overwrites ``output_file`` (UTF-8) with the chunks separated by blank
        lines and prints a confirmation message.

    Args:
        text: Raw document text.
        chunk_size: Maximum length in characters of a multi-sentence chunk.
        output_file: Path of the file the chunks are written to.
        similarity_threshold: Minimum cosine similarity for a sentence to be
            appended to the current chunk.

    Returns:
        list[str]: The chunks in document order. Empty input yields an empty list.
    """
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Use spaCy for sentence segmentation; ignore segments that are empty
    # once stripped.
    sentences = [s for s in (sent.text.strip() for sent in nlp(text).sents) if s]

    # Encode all sentences in one batch instead of one model call per sentence.
    sentence_embeddings = (
        embedder.encode(sentences, convert_to_tensor=True) if sentences else []
    )

    chunks = []
    current_chunk = []
    current_length = 0    # len(" ".join(current_chunk))
    embedding_sum = None  # sum of the embeddings of the sentences in current_chunk

    for sentence, sentence_embedding in zip(sentences, sentence_embeddings):
        if current_chunk:
            joined_length = current_length + 1 + len(sentence)  # +1 for the joining space
            too_long = joined_length > chunk_size
            # Compare against the true mean of the chunk's sentence embeddings
            # (the previous "(old + new) / 2" update over-weighted the latest
            # sentence). Skipped when the size limit already forces a split.
            off_topic = not too_long and util.pytorch_cos_sim(
                embedding_sum / len(current_chunk), sentence_embedding
            ).item() < similarity_threshold
            if too_long or off_topic:
                chunks.append(" ".join(current_chunk))
                current_chunk = []

        if current_chunk:
            current_chunk.append(sentence)
            current_length = joined_length
            embedding_sum = embedding_sum + sentence_embedding
        else:
            # Start a fresh chunk. Reaching here with nothing buffered never
            # flushes, so no empty chunk is emitted for an oversized first sentence.
            current_chunk = [sentence]
            current_length = len(sentence)
            embedding_sum = sentence_embedding

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    # Save chunks to a file
    with open(output_file, "w", encoding="utf-8") as vault_file:
        vault_file.write("\n\n".join(chunks))

    print(f"PDF content stored in {output_file} with semantic chunking.")

    return chunks

In [80]:
def prepareEmbeddings(chunks):
    """Request an embedding for every chunk via ``ollama.embeddings``.

    Uses the 'nomic-embed-text' model. A chunk whose request raises, or whose
    response carries no embedding, is reported on stdout and skipped (best
    effort).

    NOTE(review): skipped chunks are simply omitted, so the returned list can
    be shorter than ``chunks`` and its positions then no longer correspond to
    positions in ``chunks``. get_relevant_context looks content up by
    embedding position, so any skip can make it return the wrong text.
    Fixing that needs a change to what this function returns.

    Args:
        chunks: Iterable of text chunks.

    Returns:
        list: The embeddings of the successfully embedded chunks, in input order.

    Raises:
        SystemExit: With code 1 when not a single embedding was generated.
    """
    print(NEON_GREEN + "Generating embeddings for the vault content..." + RESET_COLOR)
    vault_embeddings = []
    for chunk in chunks:
        try:
            response = ollama.embeddings(model='nomic-embed-text', prompt=chunk)
            embedding = response.get("embedding")
            if embedding:  # Ensure embedding is neither None nor empty
                vault_embeddings.append(embedding)
            else:
                print(f"Skipping invalid embedding for content: {chunk.strip()}")
        except Exception as e:
            # Deliberate best effort: report the failure and carry on.
            print(f"Failed to generate embedding for content: {chunk.strip()}. Error: {e}")

    if not vault_embeddings:
        print("No valid embeddings generated. Exiting...")
        # The bare `exit` name is an interactive convenience (installed by the
        # `site` module, replaced by IPython in notebooks) and is not
        # guaranteed to stop execution; raise SystemExit explicitly instead.
        raise SystemExit(1)
    return vault_embeddings

In [81]:
def convertEmbeddingsToTensor(vault_embeddings):
    """Convert a list of embedding vectors into a single torch tensor.

    If the vectors do not all have the same length, only those with the most
    common length are kept (ties go to the length seen first) and a warning
    is printed. An empty input produces an empty tensor instead of raising.

    NOTE(review): dropping vectors shifts the positions of the remaining
    ones relative to the chunk list they were computed from.

    Args:
        vault_embeddings: Sequence of equal-length numeric sequences.

    Returns:
        torch.Tensor: One row per retained embedding; a tensor with zero
        elements when ``vault_embeddings`` is empty.
    """
    from collections import Counter  # local import: only needed by this function

    if len(vault_embeddings) > 0:
        # Ensure all embeddings have the same size. Use the majority size as
        # reference so that one malformed leading vector cannot evict all the
        # well-formed ones.
        size_counts = Counter(len(e) for e in vault_embeddings)
        if len(size_counts) > 1:
            print("Embedding size mismatch detected. Skipping invalid embeddings...")
            embedding_size = size_counts.most_common(1)[0][0]
            vault_embeddings = [e for e in vault_embeddings if len(e) == embedding_size]

    # Convert to tensor
    vault_embeddings_tensor = torch.tensor(vault_embeddings)
    print("Embeddings for each line in the vault:")
    print(vault_embeddings_tensor)
    return vault_embeddings_tensor

In [82]:
def get_relevant_context(rewritten_input, vault_embeddings, vault_content, top_k=5):
    """Return the vault entries whose embeddings are closest to the query.

    The query is embedded with ``ollama.embeddings`` (model
    'nomic-embed-text') and compared to every row of ``vault_embeddings`` by
    cosine similarity. The entries of ``vault_content`` at the positions of
    the highest-scoring rows are returned, best match first.

    Args:
        rewritten_input: Query text.
        vault_embeddings: Tensor with one embedding per row.
        vault_content: Sequence of strings indexed by embedding row position.
        top_k: Maximum number of entries to return.

    Returns:
        list[str]: Up to ``top_k`` stripped entries. An empty list is
        returned (after printing the reason) when the query is blank, the
        tensor has no elements, or the query embedding cannot be obtained.
    """
    # Nothing to look for.
    if len(rewritten_input.strip()) == 0:
        print("Rewritten input is empty. Skipping context retrieval.")
        return []

    # Nothing to look in.
    if vault_embeddings.nelement() == 0:
        print("Vault embeddings are empty. Skipping context retrieval.")
        return []

    try:
        embedding_response = ollama.embeddings(model='nomic-embed-text', prompt=rewritten_input)
        query_embedding = embedding_response["embedding"]
    except Exception as e:
        print(f"Failed to generate input embedding. Error: {e}")
        return []

    if not query_embedding:
        print("Input embedding is invalid. Skipping context retrieval.")
        return []

    # Score the query (as a one-row batch) against every vault row.
    query_tensor = torch.tensor(query_embedding).unsqueeze(0)
    similarities = torch.cosine_similarity(query_tensor, vault_embeddings)

    # Never ask for more results than there are rows.
    k = min(top_k, len(similarities))
    best_positions = torch.topk(similarities, k=k).indices.tolist()

    matches = []
    for position in best_positions:
        matches.append(vault_content[position].strip())
    return matches

In [83]:
def getRelevantEmbeddings(query, embeddings, content):
    """Answer a query with retrieved context and print the model's reply.

    Retrieves context through get_relevant_context, prints it (or a notice
    that none was found), appends it to the query, and sends a system + user
    message pair to ``client.chat.completions.create`` using model
    'gemma:2b'. The reply text is printed; nothing is returned.

    Relies on the module-level ``client``, ``CYAN`` and ``RESET_COLOR``.

    Args:
        query: The user's question.
        embeddings: Embedding tensor passed through to get_relevant_context.
        content: Chunk texts passed through to get_relevant_context.
    """
    retrieved = get_relevant_context(query, embeddings, content)

    if retrieved:
        joined_context = "\n".join(retrieved)
        print("Context Pulled from Documents: \n\n" + CYAN + joined_context + RESET_COLOR)
        user_prompt = query + "\n\nRelevant Context:\n" + joined_context
    else:
        print(CYAN + "No relevant context found." + RESET_COLOR)
        user_prompt = query

    system_prompt = "You are a helpful assistant that is an expert at extracting the most useful information from a given text. Also bring in extra relevant information to the user query from outside the given context."

    # Single-turn conversation: system instructions followed by the user prompt.
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    completion = client.chat.completions.create(
        model='gemma:2b',
        messages=chat_messages,
        max_tokens=3000,
    )
    print(completion.choices[0].message.content)

In [84]:
# OpenAI-compatible client pointed at a server on localhost:11434; used by
# getRelevantEmbeddings for chat completions.
# NOTE(review): presumably a local Ollama instance, for which api_key is only
# a placeholder — confirm. Never put a real secret here.
client = OpenAI(
    base_url='http://localhost:11434/v1',
    api_key='gemma:2b'
)

In [87]:
# Extract the text of the example PDF (path is relative to the working directory).
text_data = getDataFromPdf("./Example_2.pdf")

In [88]:
# Split the extracted text into chunks; prepareChunks also writes them to the vault file.
chunks = prepareChunks(text_data)

PDF content stored in vault.txt with semantic chunking.


In [74]:
# Embed every chunk.
# NOTE(review): this cell's execution count (74) is lower than the chunking
# cell's (88), so these embeddings may predate the current `chunks`; re-run
# this cell after re-chunking so positions line up.
embeddings = prepareEmbeddings(chunks)

[92mGenerating embeddings for the vault content...[0m


In [75]:
# Stack the per-chunk embeddings into one tensor for the similarity search.
tensor_embedding = convertEmbeddingsToTensor(embeddings)

Embeddings for each line in the vault:
tensor([[ 0.8608,  0.9354, -3.0653,  ..., -1.1735, -0.7083, -0.1666],
        [ 0.6117,  0.0962, -3.4848,  ..., -0.6500, -1.4436, -0.8836],
        [ 0.8844,  1.1359, -3.4204,  ..., -1.3753, -0.8387, -0.8021],
        ...,
        [ 1.1135,  1.4939, -3.5688,  ..., -1.6604, -0.9151, -0.4049],
        [ 0.7071,  1.2066, -3.8221,  ..., -1.3988, -0.9617,  0.1832],
        [ 1.2377,  0.7432, -3.4734,  ..., -1.9130, -1.0240,  0.1240]])


In [76]:
def get(ques):
    """Run a question against the notebook-level ``tensor_embedding`` and ``chunks``.

    Thin convenience wrapper around getRelevantEmbeddings, which prints the
    retrieved context and the model's reply. Returns None.
    """
    getRelevantEmbeddings(query=ques, embeddings=tensor_embedding, content=chunks)

In [77]:
# NOTE(review): "admistrative" and "aggrement" are misspelled in the query text.
# The string is passed verbatim to the embedding model and the chat model, so
# correcting it may change (likely improve) retrieval; it is left untouched here
# because the recorded output below was produced with this exact text.
ques='extract the Borrower, admistrative agent and underwriter(Bookrunner, lead arranger, left lead, manager) and the aggrement date'
get(ques)

Context Pulled from Documents: 

[96mThe Borrower and its Subsidiaries taken as a whole are Solvent as of the Effective Date.
The Administrative Agent shall have received, at least three Business Days prior to the Effective Date, all documentation and other information regarding the Borrower requested in connection with applicable “know your customer” and anti-money laundering rules and regulations, including the Patriot Act, to the extent requested in writing of the Borrower at least 10 Business Days prior to the Effective Date and (ii) to the extent the Borrower qualifies as a “legal entity customer” under the Beneficial Ownership Regulation, at least three Business Days prior to the Effective Date, any Lender that has requested, in a written notice to the Borrower at least 10 Business Days prior to the Effective Date, a Beneficial Ownership Certification in relation to the Borrower shall have received such Beneficial Ownership Certification (provided that, upon the execution and de

In [68]:
import re

def clean_text(text):
    """Strip SEC-filing boilerplate from extracted text and normalize whitespace.

    Removes, in this order: exhibit reference headers, print timestamps and
    sec.gov archive URLs. Every removed span is replaced by a space so that
    the words on either side of it are not fused together; all whitespace
    runs are then collapsed to single spaces and the result is stripped.

    Args:
        text: Raw extracted text.

    Returns:
        str: The cleaned, single-spaced text.
    """
    boilerplate_patterns = (
        # Document references (e.g., "EX-10.1 2 hrmy-20230726xex10d1.htm EX-10.1")
        r'EX-\d+\.\d+ \d+ \S+ EX-\d+\.\d+',
        # Timestamps (e.g., "1/8/25, 10:05 AM"). "[AP]M" accepts only AM/PM,
        # whereas "[APM]{2}" also matched pairs such as "MM".
        r'\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2} [AP]M',
        # SEC URLs
        r'sec\.gov/Archives/\S+',
    )
    for pattern in boilerplate_patterns:
        # Substitute a space, not '', so neighbouring words stay separated;
        # surplus spaces are collapsed below.
        text = re.sub(pattern, ' ', text)

    # Normalize whitespace (remove extra spaces and newlines)
    return re.sub(r'\s+', ' ', text).strip()

# Example usage
# Sample input containing an exhibit reference, a print timestamp fused onto the
# preceding word ("Arranger1/8/25, ...") and a sec.gov archive URL.
raw_text = """EX-10.1 2 hrmy-20230726xex10d1.htm EX-10.1 Exhibit 10.1 EXECUTION VERSION CREDIT AGREEMENT dated as of July 26, 2023 among HARMONY BIOSCIENCES HOLDINGS, INC., as Borrower The Lenders Party Hereto JPMORGAN CHASE BANK, N.A. as Administrative Agent and JPMORGAN CHASE BANK, N.A., as Bookrunner and Lead Arranger1/8/25, 10:05 AM sec.gov/Archives/edgar/data/1802665/000155837023012348/hrmy-20230726xex10d1.htm#_Toc256000083 iTABLE OF CONTENTS Page Article I Definitions 1 Section 1.01Defined"""

# Clean the sample and print the result for visual inspection.
cleaned_text = clean_text(raw_text)
print(cleaned_text)

Exhibit 10.1 EXECUTION VERSION CREDIT AGREEMENT dated as of July 26, 2023 among HARMONY BIOSCIENCES HOLDINGS, INC., as Borrower The Lenders Party Hereto JPMORGAN CHASE BANK, N.A. as Administrative Agent and JPMORGAN CHASE BANK, N.A., as Bookrunner and Lead Arranger iTABLE OF CONTENTS Page Article I Definitions 1 Section 1.01Defined
