# Sentence similarity

In [2]:
!pip install python-docx
!pip install docx
from docx import Document
from io import BytesIO
import re
import os
from pathlib import Path

# from google.colab import files
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:

from docx import Document
!pip install nltk
import nltk
nltk.download('punkt_tab')

import docx
from nltk.tokenize import sent_tokenize
import re
from typing import List, Dict, Tuple
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


[nltk_data] Downloading package punkt_tab to /home/javad/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:


class DocumentChunker:
    def __init__(self,
                 min_chunk_size: int = 100,
                 max_chunk_size: int = 500,
                 overlap: int = 50,
                 min_sentence_similarity: float = 0.3):
        """
        Initialize the document chunker with configurable parameters.

        Args:
            min_chunk_size: Minimum characters per chunk
            max_chunk_size: Maximum characters per chunk
            overlap: Number of characters to overlap between chunks
            min_sentence_similarity: Minimum cosine similarity threshold for sentences
        """
        self.min_chunk_size = min_chunk_size
        self.max_chunk_size = max_chunk_size
        self.overlap = overlap
        self.min_sentence_similarity = min_sentence_similarity
        self.vectorizer = TfidfVectorizer()

    def extract_text_from_docx(self, file_path: str) -> str:
        """Extract text content from a Word document."""
        doc = docx.Document(file_path)
        full_text = []
        for paragraph in doc.paragraphs:
            if paragraph.text.strip():
                full_text.append(paragraph.text.strip())
        return "\n\n".join(full_text)

    def preprocess_text(self, text: str) -> str:
        """Clean and normalize text."""
        text = re.sub(r'\n{3,}', '\n\n', text)
        text = re.sub(r'[^\w\s.,!?;:-]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def calculate_sentence_similarities(self, sentences: List[str]) -> np.ndarray:
        """
        Calculate pairwise cosine similarities between sentences.

        Args:
            sentences: List of sentences to compare

        Returns:
            Similarity matrix for sentences
        """
        if not sentences:
            return np.array([])

        # Create TF-IDF vectors for sentences
        tfidf_matrix = self.vectorizer.fit_transform(sentences)

        # Calculate cosine similarity
        return cosine_similarity(tfidf_matrix)

    def find_similar_sentences(self,
                             sentences: List[str],
                             similarity_matrix: np.ndarray) -> List[List[int]]:
        """
        Group sentences based on similarity threshold.

        Args:
            sentences: List of sentences
            similarity_matrix: Pairwise similarity matrix

        Returns:
            List of groups of similar sentence indices
        """
        sentence_groups = []
        used_indices = set()

        for i in range(len(sentences)):
            if i in used_indices:
                continue

            # Start new group with current sentence
            current_group = [i]
            used_indices.add(i)

            # Find similar sentences
            for j in range(i + 1, len(sentences)):
                if j in used_indices:
                    continue

                # Check similarity with all sentences in current group
                similarities = [similarity_matrix[j][k] for k in current_group]
                if min(similarities) >= self.min_sentence_similarity:
                    current_group.append(j)
                    used_indices.add(j)

            sentence_groups.append(current_group)

        return sentence_groups

    def create_semantic_chunks(self, text: str) -> List[Dict]:
        """
        Split text into semantically coherent chunks based on sentence similarity.

        Args:
            text: Preprocessed text to chunk

        Returns:
            List of chunk dictionaries with text and metadata
        """
        # Split into sentences
        sentences = sent_tokenize(text)
        if not sentences:
            return []

        # Calculate sentence similarities
        similarity_matrix = self.calculate_sentence_similarities(sentences)

        # Group similar sentences
        sentence_groups = self.find_similar_sentences(sentences, similarity_matrix)

        chunks = []
        current_chunk = []
        current_length = 0

        for group in sentence_groups:
            group_sentences = [sentences[i] for i in group]
            group_text = ' '.join(group_sentences)
            group_length = len(group_text)

            # Check if adding this group would exceed max chunk size
            if current_length + group_length > self.max_chunk_size and current_chunk:
                # Create chunk from accumulated sentences
                chunk_text = ' '.join(current_chunk)
                chunks.append({
                    'text': chunk_text,
                    'length': len(chunk_text),
                    'sentences': len(current_chunk),
                    'avg_group_similarity': np.mean([
                        similarity_matrix[i][j]
                        for i in range(len(current_chunk))
                        for j in range(i + 1, len(current_chunk))
                    ]) if len(current_chunk) > 1 else 1.0
                })

                # Start new chunk
                current_chunk = group_sentences
                current_length = group_length
            else:
                current_chunk.extend(group_sentences)
                current_length += group_length

        # Add final chunk if it meets minimum size
        if current_length >= self.min_chunk_size:
            final_text = ' '.join(current_chunk)
            chunks.append({
                'text': final_text,
                'length': len(final_text),
                'sentences': len(current_chunk),
                'avg_group_similarity': np.mean([
                    similarity_matrix[i][j]
                    for i in range(len(current_chunk))
                    for j in range(i + 1, len(current_chunk))
                ]) if len(current_chunk) > 1 else 1.0
            })

        return chunks

    def process_document(self, file_path: str) -> List[Dict]:
        """Process a Word document end-to-end."""
        raw_text = self.extract_text_from_docx(file_path)
        processed_text = self.preprocess_text(raw_text)
        return self.create_semantic_chunks(processed_text)

    def write_chunks_to_file(self, chunks: List[Dict], output_path: str = "/content/mychunk.txt"):
        """Write chunks to a text file with metadata."""
        with open(output_path, 'w', encoding='utf-8') as f:
            for i, chunk in enumerate(chunks, 1):
                f.write(f"{'='*80}\n")
                f.write(f"CHUNK {i}\n")
                f.write(f"Length: {chunk['length']} characters\n")
                f.write(f"Sentences: {chunk['sentences']}\n")
                f.write(f"Average group similarity: {chunk['avg_group_similarity']:.3f}\n")
                f.write(f"{'-'*40}\n")
                f.write(f"{chunk['text']}\n\n")

# Example usage


In [7]:
if __name__ == "__main__":
    chunker = DocumentChunker(
        min_chunk_size=200,
        max_chunk_size=500,
        overlap=100,
        min_sentence_similarity=0.3  # Adjust this threshold as needed
    )

    # Process a document
    chunks = chunker.process_document("/content/TheEvolutionofPrivacy.docx")
    chunker.write_chunks_to_file(chunks)

PackageNotFoundError: Package not found at '/content/TheEvolutionofPrivacy.docx'

Reference sentence: Knowledge graph embedding methods for entity alignment: experimental review Paper Review Eleventh Hour Enthusiast · 10 min read · Oct 12, 2024 Introduction Knowledge graphs (KGs) have become essential in various domains, powering applications such as question answering, recommendations, and semantic search.

Similarity ranking:

Similarity: 0.4524
Sentence: Knowledge graph embedding methods for entity alignment: experimental review.

Similarity: 0.1838
Sentence: Methods Used for Entity Alignment The paper evaluates several methods for entity alignment using knowledge graph embeddings.

Similarity: 0.1445
Sentence: MTransE is one of the foundational supervised methods for entity alignment across knowledge graphs.

Similarity: 0.1197
Sentence: The study’s experimental evaluation highlights the strengths and trade-offs of various embedding-based entity alignment methods, revealing statistically significant rankings and correlations with dataset characteristics.

Simila