In [1]:
import os
import re
from typing import Any, Dict, List

import chromadb
import nltk
import pymupdf
import PyPDF2
from chromadb.config import Settings, DEFAULT_TENANT, DEFAULT_DATABASE
from chromadb.errors import InvalidCollectionException
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the embedding model once at notebook level.
# NOTE(review): create_embeddings() and semantic_chunking() below construct
# their own SentenceTransformer instead of reusing this instance — confirm
# whether this global is still needed.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [3]:
# Fetch the NLTK 'punkt_tab' resource. Presumably this is the tokenizer data
# that sent_tokenize (used by the chunking functions) relies on — verify.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/devanshk/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
import os
from typing import List, Dict

def process_papers_txt(folder_path: str) -> List[Dict[str, str]]:
    """Load every cleaned ``.txt`` file found directly inside a folder.

    The files are expected to be the ones provided by
    generate_clean_text_files.ipynb.

    Args:
        folder_path: Directory that contains the cleaned ``.txt`` files.

    Returns:
        One dictionary per file with the keys ``"id"`` (the file name) and
        ``"text"`` (the file content decoded as UTF-8), ordered by file name.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
    """
    papers = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # result (and every chunk id derived from it) reproducible across runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        # Skip directories that merely happen to be named "*.txt".
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            papers.append({"id": filename, "text": file.read()})
    return papers



In [None]:
def create_embeddings(papers: List[Dict[str, str]], model_name: str = "all-MiniLM-L6-v2") -> List[Dict[str, Any]]:
    """Create an embedding for every paper (or paper chunk).

    Args:
        papers: Dictionaries that each provide the keys ``"id"`` and ``"text"``.
        model_name: Name of the SentenceTransformer model to load. The default
            matches the default used by ``semantic_chunking``.

    Returns:
        One dictionary per input, in the same order, with the keys ``"id"``,
        ``"text"`` and ``"embedding"`` (the model output for that text,
        converted with ``tolist()``). An empty input yields an empty list
        without loading the model.
    """
    if not papers:
        return []

    # Honour the requested model instead of a hard-coded one.
    model = SentenceTransformer(model_name)

    # Encode all texts in a single call so the model can batch them itself,
    # rather than invoking it once per text.
    texts = [paper["text"] for paper in papers]
    embeddings = model.encode(texts)

    return [
        {
            "id": paper["id"],
            "text": paper["text"],
            "embedding": embedding.tolist(),
        }
        for paper, embedding in zip(papers, embeddings)
    ]

In [9]:
def store_in_chroma(papers: List[Dict[str, Any]], collection_name: str = "research_papers") -> chromadb.Collection:
    """Store the embedded papers in a persistent Chroma collection.

    Args:
        papers: Dictionaries that each provide the keys ``"id"``, ``"text"``
            and ``"embedding"``.
        collection_name: Name of the collection to add to; it is created when
            it does not exist yet.

    Returns:
        The collection the papers were added to. For an empty ``papers`` list
        the collection is still opened/created and returned, but nothing is
        added.
    """
    client = chromadb.PersistentClient(
        settings=Settings(),
        tenant=DEFAULT_TENANT,
        database=DEFAULT_DATABASE,
    )

    # A single call replaces the get/except/create dance, so this no longer
    # depends on which exception class a missing collection raises.
    collection = client.get_or_create_collection(name=collection_name)

    # Nothing to insert: skip add() rather than passing it empty lists.
    if not papers:
        return collection

    collection.add(
        ids=[paper["id"] for paper in papers],
        embeddings=[paper["embedding"] for paper in papers],
        documents=[paper["text"] for paper in papers],
    )

    return collection

In [10]:
def semantic_chunking(text: str, model_name: str = "all-MiniLM-L6-v2", similarity_threshold: float = 0.8) -> List[str]:
    """Chunk the text by merging consecutive, semantically similar sentences.

    The text is split into sentences. Each sentence is compared with the chunk
    built so far: when the similarity of their embeddings is above
    ``similarity_threshold`` the sentence is appended to the chunk, otherwise
    the chunk is closed and the sentence starts a new one.

    Args:
        text: The text to chunk.
        model_name: Name of the SentenceTransformer model used for embeddings.
        similarity_threshold: Similarity a sentence must exceed to be merged
            into the current chunk.

    Returns:
        The chunks in document order. Text without any sentence yields an
        empty list (and the model is not loaded).
    """
    sentences = sent_tokenize(text)
    # Guard against empty input: indexing sentences[0] would raise IndexError.
    if not sentences:
        return []

    model = SentenceTransformer(model_name)
    chunks = []
    current_chunk = sentences[0]
    current_chunk_embedding = model.encode(current_chunk)

    for sentence in sentences[1:]:
        sentence_embedding = model.encode(sentence)
        # float() turns the similarity result (expected to hold a single value
        # for two single embeddings) into a plain scalar for the comparison.
        similarity = float(model.similarity(current_chunk_embedding, sentence_embedding))
        if similarity > similarity_threshold:
            current_chunk += " " + sentence
            # The chunk text changed, so its embedding must be refreshed.
            current_chunk_embedding = model.encode(current_chunk)
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
            # The new chunk is exactly this sentence: reuse its embedding
            # instead of encoding the same text a second time.
            current_chunk_embedding = sentence_embedding

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

In [11]:
def chunk_by_fixed_size(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """Chunk the text into fixed-size character windows with overlap.

    Each chunk starts ``chunk_size - overlap`` characters after the previous
    one, so consecutive chunks share ``overlap`` characters. The final chunk
    may be shorter than ``chunk_size``.

    Args:
        text: The text to chunk.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order; an empty list for empty text.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a non-positive step would never advance through the
            text, and a negative overlap would silently skip characters.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]


In [12]:
def chunk_by_sentence(text: str, max_chunk_size: int = 1000) -> List[str]:
    """Chunk the text by sentences, ensuring chunks don't exceed max_chunk_size.

    Sentences are joined with a single space until adding the next one would
    push the chunk past ``max_chunk_size`` characters (the joining space is
    counted). Sentences are never split, so a single sentence longer than
    ``max_chunk_size`` still becomes a chunk of its own that exceeds the limit.

    Args:
        text: The text to chunk.
        max_chunk_size: Maximum chunk length in characters.

    Returns:
        The chunks in document order; an empty list when the text contains no
        sentences.
    """
    sentences = sent_tokenize(text)
    chunks = []
    current_sentences = []
    current_length = 0

    for sentence in sentences:
        if current_sentences:
            # +1 accounts for the space that joins the sentence to the chunk.
            candidate_length = current_length + 1 + len(sentence)
        else:
            candidate_length = len(sentence)

        if current_sentences and candidate_length > max_chunk_size:
            chunks.append(" ".join(current_sentences).strip())
            current_sentences = [sentence]
            current_length = len(sentence)
        else:
            current_sentences.append(sentence)
            current_length = candidate_length

    if current_sentences:
        chunks.append(" ".join(current_sentences).strip())

    return chunks

In [13]:
def query_chroma(query: str, collection: chromadb.Collection, top_k: int = 5) -> List[Dict[str, any]]:
    """Run a text query against a Chroma collection and flatten the hits.

    Args:
        query: The query text.
        collection: The collection to search.
        top_k: Number of results to request.

    Returns:
        One dictionary per hit, in the order given by the query response,
        with the keys 'id', 'text' and 'distance'.
    """
    response = collection.query(query_texts=[query], n_results=top_k)

    # The response holds one result list per query text; a single query was
    # sent, so only index 0 is read.
    hit_count = len(response['ids'][0])
    return [
        {
            'id': response['ids'][0][position],
            'text': response['documents'][0][position],
            'distance': response['distances'][0][position],
        }
        for position in range(hit_count)
    ]

In [None]:
def main(folder_path: str, chunking_method: str = "sentence", max_batch_size: int = 5000):
    """Run the RAG ingestion pipeline: load, chunk, embed, and store papers.

    Args:
        folder_path: Folder containing the cleaned ``.txt`` files.
        chunking_method: ``"fixed"``, ``"sentence"`` or ``"semantic"``.
        max_batch_size: Maximum number of embedded chunks handed to
            ``store_in_chroma`` per call, to avoid exceeding the maximum
            batch size.

    Returns:
        The Chroma collection returned by the last ``store_in_chroma`` call,
        or ``None`` when there was nothing to store. The collection can be
        passed to ``query_chroma`` to test retrieval, e.g.
        ``query_chroma("your question", collection)``.

    Raises:
        ValueError: If ``chunking_method`` is not one of the supported values
            or ``max_batch_size`` is not positive.
    """
    # Validate up front so a typo is reported even when the folder is empty,
    # and before any expensive work is done.
    if chunking_method not in ("fixed", "sentence", "semantic"):
        raise ValueError("Invalid chunking method. Choose 'fixed' or 'sentence' or 'semantic'.")
    if max_batch_size <= 0:
        raise ValueError("max_batch_size must be a positive integer")

    chunkers = {
        "fixed": chunk_by_fixed_size,
        "sentence": chunk_by_sentence,
        "semantic": semantic_chunking,
    }
    chunk_text = chunkers[chunking_method]

    print("Processing papers...")
    papers = process_papers_txt(folder_path)

    print("Chunking papers...")
    chunked_papers = [
        {"id": f"{paper['id']}_chunk_{i}", "text": chunk}
        for paper in papers
        for i, chunk in enumerate(chunk_text(paper["text"]))
    ]

    print("Creating embeddings...")
    embedded_papers = create_embeddings(chunked_papers)

    # Splitting embedded_papers into smaller batches to avoid exceeding the maximum batch size
    print("Storing in Chroma database...")
    collection = None
    for start in range(0, len(embedded_papers), max_batch_size):
        batch = embedded_papers[start:start + max_batch_size]
        collection = store_in_chroma(batch)

    print("Pipeline completed successfully!")
    return collection

if __name__ == "__main__":
    folder_path = "cleaned_text"  # Folder containing the .txt files produced by generate_clean_text_files.ipynb
    chunking_method = "sentence"  # Chunking strategy passed to main(): "fixed", "sentence" or "semantic"
    main(folder_path, chunking_method)


Top 5 most relevant chunks:

1. Document: Eng_haptic_devices978_3_031_04536_3_pdf.txt_chunk_1
Relevance Score: 0.4841
Text: it is focused on publishing new advances and developments
in all aspects of haptics. haptics is a multidisciplinary eld with researchers from
psychology, physiology, neurology, engineering, and comput...

2. Document: Eng_haptic_devices_978_3_031_04536_3_pdf.txt_chunk_1
Relevance Score: 0.4841
Text: it is focused on publishing new advances and developments
in all aspects of haptics. haptics is a multidisciplinary eld with researchers from
psychology, physiology, neurology, engineering, and comput...

3. Document: 978_3_030_24564_1_pdf.txt_chunk_130
Relevance Score: 0.4490
Text: primarily technologies being developed for digital touch communication involve 
some form of haptics. haptics investigates humanmachine communication 
through the sense of touch in interactions where ...

4. Document: Electromechanical_Actuators_for_Haptic_Feedback_with_Fingertip_Contact_2