<a href="https://colab.research.google.com/github/jessiechd/RAG_Model/blob/main/0212_semantic_chunking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U sentence-transformers rouge  --q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Semantic Chunking: Testing with documents
- variations in minimum sentences per chunk and overlaps
- coherence score for evaluation

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import rouge
from collections import defaultdict

In [None]:
class TextChunker2:
    def __init__(self, model_name='sentence-transformers/all-mpnet-base-v1'):
        self.model = SentenceTransformer(model_name)

    def process_file(self, file_path, dynamic_window=True, min_chunk_size=3, overlap=1, num_clusters=5):
        # Step 1: Load and encode text
        sentences = self._load_text(file_path)
        contextualized = self._add_dynamic_context(sentences) if dynamic_window else self._add_fixed_context(sentences)
        embeddings = self.model.encode(contextualized)

        # Step 2: Compute cosine distances between consecutive embeddings
        distances = self._calculate_distances(embeddings)

        # Step 3: Identify breakpoints based on semantic gaps
        breakpoints = self._identify_breakpoints(distances, num_clusters)

        # Step 4: Create initial chunks with overlap
        initial_chunks = self._create_chunks(sentences, breakpoints, overlap)

        # Step 5: Merge small chunks for better coherence
        if min_chunk_size > 1:  # Ensure chunks are not too small
            chunk_embeddings = self.model.encode(initial_chunks)
            final_chunks = self._merge_small_chunks(initial_chunks, chunk_embeddings, min_chunk_size)
        else:
            final_chunks = initial_chunks  # Skip merging if min_chunk_size is 1

        return final_chunks


    def _load_text(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        return sent_tokenize(text)

    def _add_fixed_context(self, sentences, window_size=1):
        return [' '.join(sentences[max(0, i-window_size): min(len(sentences), i+window_size+1)]) for i in range(len(sentences))]

    def _add_dynamic_context(self, sentences):
        contextualized = []
        embeddings = self.model.encode(sentences)
        for i in range(len(sentences)):
            similarities = cosine_similarity([embeddings[i]], embeddings)[0]
            closest_indices = np.argsort(-similarities)[:3]  # Select 2 most relevant neighbors
            context = ' '.join(sentences[j] for j in sorted(closest_indices))
            contextualized.append(context)
        return contextualized

    def _identify_breakpoints(self, distances, threshold_percentile=90):
        """Find breakpoints where semantic distance is high."""
        threshold = np.percentile(distances, threshold_percentile)  # Dynamic threshold
        return [i for i, dist in enumerate(distances) if dist > threshold]

    # def _identify_breakpoints(self, distances, num_clusters=3):
    #     distances = np.array(distances).reshape(-1, 1)  # Reshape for clustering
    #     kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    #     kmeans.fit(distances)
    #     labels = kmeans.labels_

    #     # Find cluster with highest distance values
    #     breakpoint_cluster = np.argmax(kmeans.cluster_centers_)
    #     return [i for i, label in enumerate(labels) if label == breakpoint_cluster]

    def _create_chunks(self, sentences, breakpoints, overlap=1):
        chunks = []
        start_idx = 0

        for breakpoint in breakpoints:
            end_idx = breakpoint + 1
            chunk = ' '.join(sentences[max(0, start_idx - overlap):end_idx])
            chunks.append(chunk)
            start_idx = end_idx

        final_chunk = ' '.join(sentences[max(0, start_idx - overlap):])
        chunks.append(final_chunk)

        return chunks

    def _merge_small_chunks(self, chunks, embeddings, min_size):
        final_chunks, merged_embeddings = [chunks[0]], [embeddings[0]]
        for i in range(1, len(chunks) - 1):
            if len(chunks[i].split('. ')) < min_size:
                prev_sim = cosine_similarity([embeddings[i]], [merged_embeddings[-1]])[0][0]
                next_sim = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
                if prev_sim > next_sim:
                    final_chunks[-1] += ' ' + chunks[i]
                    merged_embeddings[-1] = (merged_embeddings[-1] + embeddings[i]) / 2
                else:
                    chunks[i + 1] = chunks[i] + ' ' + chunks[i + 1]
                    embeddings[i + 1] = (embeddings[i] + embeddings[i + 1]) / 2
            else:
                final_chunks.append(chunks[i])
                merged_embeddings.append(embeddings[i])
        final_chunks.append(chunks[-1])
        return final_chunks

    def _calculate_distances(self, embeddings):
      """Calculate cosine distances between consecutive embeddings."""
      distances = []
      for i in range(len(embeddings) - 1):
          similarity = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
          distance = 1 - similarity  # Distance = 1 - similarity
          distances.append(distance)
      return distances

    def evaluate_coherence(self, chunks):
        coherence_scores = []
        embeddings = self.model.encode(chunks)
        for i in range(len(embeddings) - 1):
            score = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
            coherence_scores.append(score)
        return np.mean(coherence_scores)

    def evaluate_rouge(self, original_text, chunks):
        rouge_evaluator = rouge.Rouge()
        scores = [rouge_evaluator.get_scores(chunk, original_text)[0]['rouge-1']['f'] for chunk in chunks]
        return np.mean(scores)

    def evaluate_qa_performance(self, chunks, test_questions):
        chunk_embeddings = self.model.encode(chunks)

        def retrieve_best_chunk(query):
            from sentence_transformers import CrossEncoder
            reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

            query_embedding = self.model.encode([query])
            similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
            top_n = np.argsort(similarities)[-3:]
            rerank_scores = reranker.predict([[query, chunks[i]] for i in top_n])
            return chunks[top_n[np.argmax(rerank_scores)]]

        correct = 0
        for question, expected_answer in test_questions:
            retrieved_chunk = retrieve_best_chunk(question)
            if expected_answer in retrieved_chunk:
                correct += 1

        return correct / len(test_questions)


In [None]:
import nltk
nltk.download('punkt_tab')

chunker2 = TextChunker2()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Testing with Document #1: "17.pdf"

In [None]:
file_path = "/content/17_qwen1.md"

with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

test_questions = [
    ("Where does the family medicine training take place?", "Africa")
]

In [None]:
def chunker_test(mcs, ovl):
    chunks2 = chunker2.process_file(
        file_path,
        dynamic_window=True,
        min_chunk_size=mcs,
        overlap=ovl,
        num_clusters=3)

    print(f"Using min_chunk_size={mcs}, overlap={ovl}")

    # Print results
    print(f"Successfully split text into {len(chunks2)} chunks")

    # for i in range(len(chunks2)):
    #     print(f"Chunk {i+1}: {len(chunks2[i].split('. '))} sentences")

    # Compute evaluation metrics
    coherence_score = chunker2.evaluate_coherence(chunks2)
    # rouge_score = chunker2.evaluate_rouge(text, chunks2)  # Fixed variable
    qa_accuracy = chunker2.evaluate_qa_performance(chunks2, test_questions)  # Removed retrieval_system

    # Print evaluation results
    print(f"Coherence Score: {coherence_score:.4f}")
    # print(f"ROUGE Score: {rouge_score:.4f}")
    print(f"QA Accuracy: {qa_accuracy * 100:.2f}%")

In [None]:
mcs = 3
ovl = 1
chunker_test(mcs, ovl)

Using min_chunk_size=3, overlap=1
Successfully split text into 102 chunks
Coherence Score: 0.5870
ROUGE Score: 0.0593
QA Accuracy: 100.00%


In [None]:
mcs = 5
ovl = 2
chunker_test(mcs, ovl)

Using min_chunk_size=5, overlap=2
Successfully split text into 80 chunks
Coherence Score: 0.6504
ROUGE Score: 0.0835
QA Accuracy: 100.00%


In [None]:
mcs = 10
ovl = 3
chunker_test(mcs, ovl)

Using min_chunk_size=10, overlap=3
Successfully split text into 58 chunks
Coherence Score: 0.7273
ROUGE Score: 0.1122
QA Accuracy: 100.00%


In [None]:
mcs = 15
ovl = 5
chunker_test(mcs, ovl)

Using min_chunk_size=15, overlap=5
Successfully split text into 52 chunks
Coherence Score: 0.7788
ROUGE Score: 0.1448
QA Accuracy: 100.00%


In [None]:
mcs = 20
ovl = 6
chunker_test(mcs, ovl)

Using min_chunk_size=20, overlap=6
Successfully split text into 51 chunks
Coherence Score: 0.7969
ROUGE Score: 0.1585
QA Accuracy: 100.00%


# Testing with Document #2: "PDF1.pdf"

In [None]:
file_path = "/content/PDF1_qwen1.md"

with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

test_questions = [
    ("Which industry sector does the study focus on?", "semiconductor"),
    ("WWhat machine learning subfield is sentiment analysis a part of?", "NLP")
]

In [None]:
mcs = 3
ovl = 1
chunker_test(mcs, ovl)

Using min_chunk_size=3, overlap=1
Successfully split text into 202 chunks
Coherence Score: 0.5717
ROUGE Score: 0.0371
QA Accuracy: 50.00%


In [None]:
mcs = 5
ovl = 2
chunker_test(mcs, ovl)

Using min_chunk_size=5, overlap=2
Successfully split text into 174 chunks
Coherence Score: 0.6755
ROUGE Score: 0.0497
QA Accuracy: 100.00%


In [None]:
mcs = 10
ovl = 3
chunker_test(mcs, ovl)

Using min_chunk_size=10, overlap=3
Successfully split text into 120 chunks
Coherence Score: 0.6795
ROUGE Score: 0.0679
QA Accuracy: 100.00%


In [None]:
mcs = 15
ovl = 5
chunker_test(mcs, ovl)

Using min_chunk_size=15, overlap=5
Successfully split text into 109 chunks
Coherence Score: 0.7519
QA Accuracy: 50.00%


In [None]:
mcs = 20
ovl = 6
chunker_test(mcs, ovl)

Using min_chunk_size=20, overlap=6
Successfully split text into 105 chunks
Coherence Score: 0.7688
QA Accuracy: 100.00%


# Testing with Document #3: "2024_11_05 - Ferrari Q3 2024 Results Press Release.pdf"

In [None]:
file_path = "/content/2024_11_05 - Ferrari Q3 2024 Results Press Release_qwen1.md"

with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

test_questions = [
    ("Where is Ferrari's factory located?", "Maranello"),
    ("What is the brand this document mentioned?", "Ferrari")
]

In [None]:
mcs = 3
ovl = 1
chunker_test(mcs, ovl)

Using min_chunk_size=3, overlap=1
Successfully split text into 53 chunks
Coherence Score: 0.5453
QA Accuracy: 50.00%


In [None]:
mcs = 5
ovl = 2
chunker_test(mcs, ovl)

Using min_chunk_size=5, overlap=2
Successfully split text into 42 chunks
Coherence Score: 0.5961
QA Accuracy: 50.00%


In [None]:
mcs = 10
ovl = 3
chunker_test(mcs, ovl)

Using min_chunk_size=10, overlap=3
Successfully split text into 32 chunks
Coherence Score: 0.6668
QA Accuracy: 50.00%


In [None]:
mcs = 15
ovl = 5
chunker_test(mcs, ovl)

Using min_chunk_size=15, overlap=5
Successfully split text into 24 chunks
Coherence Score: 0.6642
QA Accuracy: 50.00%


In [None]:
mcs = 20
ovl = 6
chunker_test(mcs, ovl)

Using min_chunk_size=20, overlap=6
Successfully split text into 26 chunks
Coherence Score: 0.7087
QA Accuracy: 100.00%


# Testing with Document #4: "ai-in-america-oai-economic-blueprint-20250113.pdf"

In [None]:
file_path = "/content/ai-in-america-oai-economic-blueprint-20250113_qwen1.md"

with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

test_questions = [
    ("What is OpenAI's mission?", "benefits everyone"),
    ("How many people are currently using OpenAI's tools?", "300 million")
]

In [None]:
mcs = 3
ovl = 1
chunker_test(mcs, ovl)

Using min_chunk_size=3, overlap=1
Successfully split text into 74 chunks
Coherence Score: 0.5756
QA Accuracy: 0.00%


In [None]:
mcs = 5
ovl = 2
chunker_test(mcs, ovl)

Using min_chunk_size=5, overlap=2
Successfully split text into 68 chunks
Coherence Score: 0.6318
QA Accuracy: 0.00%


In [None]:
mcs = 10
ovl = 3
chunker_test(mcs, ovl)

Using min_chunk_size=10, overlap=3
Successfully split text into 42 chunks
Coherence Score: 0.6412
QA Accuracy: 0.00%


In [None]:
mcs = 15
ovl = 5
chunker_test(mcs, ovl)

Using min_chunk_size=15, overlap=5
Successfully split text into 46 chunks
Coherence Score: 0.7270
QA Accuracy: 0.00%


In [None]:
mcs = 20
ovl = 6
chunker_test(mcs, ovl)

Using min_chunk_size=20, overlap=6
Successfully split text into 39 chunks
Coherence Score: 0.7016
QA Accuracy: 0.00%


# Testing with Document #5: "creatingsystem.pdf"

In [None]:
file_path = "/content/creatingsystem_qwen1.md"

with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

test_questions = [
    ("What is a water system operations and maintenance manual?", "comprehensive 'how-to' guidance document"),
    ("Why is the manual necessary?", "detailed resource")
]

In [None]:
mcs = 3
ovl = 1
chunker_test(mcs, ovl)

Using min_chunk_size=3, overlap=1
Successfully split text into 6 chunks
Coherence Score: 0.6066
QA Accuracy: 100.00%


In [None]:
mcs = 5
ovl = 2
chunker_test(mcs, ovl)

Using min_chunk_size=5, overlap=2
Successfully split text into 5 chunks
Coherence Score: 0.6121
QA Accuracy: 100.00%


In [None]:
mcs = 10
ovl = 3
chunker_test(mcs, ovl)

Using min_chunk_size=10, overlap=3
Successfully split text into 3 chunks
Coherence Score: 0.3867
QA Accuracy: 50.00%


In [None]:
mcs = 15
ovl = 5
chunker_test(mcs, ovl)

Using min_chunk_size=15, overlap=5
Successfully split text into 4 chunks
Coherence Score: 0.5249
QA Accuracy: 100.00%


In [None]:
mcs = 20
ovl = 6
chunker_test(mcs, ovl)

Using min_chunk_size=20, overlap=6
Successfully split text into 3 chunks
Coherence Score: 0.3822
QA Accuracy: 50.00%


# Testing with Document #6: "2014-monarch-plus-service-manual.pdf"

In [None]:
file_path = "/content/2014-monarch-plus-service-manual.md"

with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

test_questions = [
    ("What is the company name of these products?", "Monarch"),
    ("What are the tools needed for service?", "Safety glasses")
]

In [None]:
mcs = 3
ovl = 1
chunker_test(mcs, ovl)

Using min_chunk_size=3, overlap=1
Successfully split text into 171 chunks


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Coherence Score: 0.5494
QA Accuracy: 50.00%


In [None]:
mcs = 5
ovl = 2
chunker_test(mcs, ovl)

Using min_chunk_size=5, overlap=2
Successfully split text into 137 chunks
Coherence Score: 0.5909
QA Accuracy: 50.00%


In [None]:
mcs = 10
ovl = 3
chunker_test(mcs, ovl)

Using min_chunk_size=10, overlap=3
Successfully split text into 92 chunks
Coherence Score: 0.5980
QA Accuracy: 50.00%


In [None]:
mcs = 15
ovl = 5
chunker_test(mcs, ovl)

Using min_chunk_size=15, overlap=5
Successfully split text into 83 chunks
Coherence Score: 0.7029
QA Accuracy: 50.00%


In [None]:
mcs = 20
ovl = 6
chunker_test(mcs, ovl)

Using min_chunk_size=20, overlap=6
Successfully split text into 70 chunks
Coherence Score: 0.7372
QA Accuracy: 0.00%


# Testing with Document #7: "231161_OperationsMaintenanceManual.pdf"

In [None]:
file_path = "/content/231161_OperationsMaintenanceManual.md"

with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

test_questions = [
    ("What is the first step in the backup procedures?", "Describe procedures"),
    ("Explain the processing overview", "Provide information")
]

In [None]:
mcs = 3
ovl = 1
chunker_test(mcs, ovl)

Using min_chunk_size=3, overlap=1
Successfully split text into 68 chunks
Coherence Score: 0.6021
QA Accuracy: 100.00%


In [None]:
mcs = 5
ovl = 2
chunker_test(mcs, ovl)

Using min_chunk_size=5, overlap=2
Successfully split text into 59 chunks
Coherence Score: 0.6603
QA Accuracy: 100.00%


In [None]:
mcs = 10
ovl = 3
chunker_test(mcs, ovl)

Using min_chunk_size=10, overlap=3
Successfully split text into 38 chunks
Coherence Score: 0.6648
QA Accuracy: 100.00%


In [None]:
mcs = 15
ovl = 5
chunker_test(mcs, ovl)

Using min_chunk_size=15, overlap=5
Successfully split text into 36 chunks
Coherence Score: 0.6850
QA Accuracy: 100.00%


In [None]:
mcs = 20
ovl = 6
chunker_test(mcs, ovl)

Using min_chunk_size=20, overlap=6
Successfully split text into 35 chunks
Coherence Score: 0.6981
QA Accuracy: 50.00%


# Testing with Document #8: "SUPO-744_REV_A.pdf"

In [None]:
file_path = "/content/SUPO-744_REV_A.md"

with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

test_questions = [
    ("What is the purpose of this Maintenance Manual ", "provide qualified service personnel with information"),
    ("What does 'CAUTION' indicate?", "potentially hazardous situation")
]

In [None]:
mcs = 3
ovl = 1
chunker_test(mcs, ovl)

Using min_chunk_size=3, overlap=1
Successfully split text into 92 chunks
Coherence Score: 0.5655
QA Accuracy: 100.00%


In [None]:
mcs = 5
ovl = 2
chunker_test(mcs, ovl)

Using min_chunk_size=5, overlap=2
Successfully split text into 83 chunks
Coherence Score: 0.6009
QA Accuracy: 100.00%


In [None]:
mcs = 10
ovl = 3
chunker_test(mcs, ovl)

Using min_chunk_size=10, overlap=3
Successfully split text into 56 chunks
Coherence Score: 0.5846
QA Accuracy: 100.00%


In [None]:
mcs = 15
ovl = 5
chunker_test(mcs, ovl)

Using min_chunk_size=15, overlap=5
Successfully split text into 53 chunks
Coherence Score: 0.6607
QA Accuracy: 100.00%


In [None]:
mcs = 20
ovl = 6
chunker_test(mcs, ovl)

Using min_chunk_size=20, overlap=6
Successfully split text into 54 chunks
Coherence Score: 0.6794
QA Accuracy: 100.00%


# Testing with Document #9: "VVS005s_030s_AHU_EN.pdf"

In [None]:
file_path = "/content/VVS005s_030s_AHU_EN.md"

with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

test_questions = [
    ("What is the first step in air handling units delivery?", "inspect all components"),
    ("Recommended pressure control operating position for electric heaters?", "horizontal")
]

In [None]:
mcs = 3
ovl = 1
chunker_test(mcs, ovl)

Using min_chunk_size=3, overlap=1
Successfully split text into 232 chunks
Coherence Score: 0.5637
QA Accuracy: 0.00%


In [None]:
mcs = 5
ovl = 2
chunker_test(mcs, ovl)

Using min_chunk_size=5, overlap=2
Successfully split text into 204 chunks
Coherence Score: 0.6485
QA Accuracy: 50.00%


In [None]:
mcs = 10
ovl = 3
chunker_test(mcs, ovl)

Using min_chunk_size=10, overlap=3
Successfully split text into 126 chunks
Coherence Score: 0.6467
QA Accuracy: 50.00%


In [None]:
mcs = 15
ovl = 5
chunker_test(mcs, ovl)

Using min_chunk_size=15, overlap=5
Successfully split text into 132 chunks
Coherence Score: 0.7087
QA Accuracy: 0.00%


In [None]:
mcs = 20
ovl = 6
chunker_test(mcs, ovl)

Using min_chunk_size=20, overlap=6
Successfully split text into 117 chunks
Coherence Score: 0.7191
QA Accuracy: 0.00%
