In [243]:
from itertools import tee
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable

from xxhash import xxh64_intdigest


import numpy as np

@dataclass
class Document:
    id: int
    text: str

def ngrams(sequence: list, n: int):
    """
    Generate n-grams from a sequence of items
    Example:
        ngrams("Hi how are you?".split(), 3) -> [('Hi', 'how', 'are'), ('how', 'are', 'you')]
    """
    if len(sequence) < n:
        return []
    
    return [tuple(sequence[i:i+n]) for i in range(len(sequence) - n + 1)]

def get_hash(text: str, seed: int) -> int:
    """
    Get the hash of a text using a seed
    """
    return xxh64_intdigest(text, seed)

def get_signatures(shingles: np.ndarray) -> np.ndarray:
    """
    Get signatures (minhash of n-grams) from a string of text

    Args:
        shingles: numpy array of shingles: dtype = uint64, shape = (k, n_grams)

    Returns:
        numpy array of signatures: dtype = uint64, shape = (k, n_grams)
    """
    return np.min(shingles, axis=1)

def get_shingles(text: str, n_grams: int, k: int) -> np.ndarray:
    """
    Get kxn shingles (hashed n-grams) from a string of text

    Args:
        text: input text
        n_grams: n-grams size to use
        k: number of hash functions to use

    Returns:
        numpy array of shingles: dtype = uint64, shape = (k, n_grams)
    """
    ngrams_hashes = np.fromiter(
        [
            xxh64_intdigest(" ".join(x), seed)
            for seed in range(k)
            for x in ngrams(
                text.split(),
                n_grams,
            )
        ],
        dtype=np.uint64,
    ).reshape((k, -1))
    return ngrams_hashes



def dedup(data: list[Document], n_grams: int, k: int, jaccard_threshold: float = 0.8):
    # First stage: create a signatures (k minhashes of n-grams) for each document
    signatures: list[tuple[int, np.ndarray]] = []
    for doc in data:
        shingles = get_shingles(doc.text, n_grams, k)
        if shingles.size != 0:
            signatures.append((doc.id, get_signatures(shingles)))
    

    # Second stage: compute the jaccard similarity between all signatures
    # When duplicates are found, always keep only the one with the smallest index
    to_remove_ids = []
    for i in range(len(signatures)):
        for j in range(i+1, len(signatures)):
            jaccard_similarity = np.sum(signatures[i][1] == signatures[j][1]) / len(signatures[i][1])
            if jaccard_similarity > jaccard_threshold:
                to_remove_ids.append(signatures[j][0])
    
    # Due to transitivity of the jaccard similarity, we could have marked same document
    # multiple times. We thus remove such duplicates here.
    to_remove_ids = list(set(to_remove_ids))
    
    # Last stage, remove the documents with the ids in to_remove_ids
    # First we sort both the data and to_remove_indices by the id
    data = sorted(data, key=lambda x: x.id)
    to_remove_ids = sorted(to_remove_ids)

    to_remove_ids_idx = 0
    current_to_remove_id = to_remove_ids[to_remove_ids_idx] if to_remove_ids_idx < len(to_remove_ids) else -1

    # Then we iterate through the data and yield the ones that are not in to_remove_indices
    kept_docs = []
    for doc in data:
        if current_to_remove_id == doc.id:
            to_remove_ids_idx += 1
            current_to_remove_id = to_remove_ids[to_remove_ids_idx] if to_remove_ids_idx < len(to_remove_ids) else -1
            continue
        kept_docs.append(doc)

    return kept_docs


In [244]:
def load_document(path: Path) -> str:
    with open(path, "r") as f:
        return f.read()

documents = [
    Document(i, load_document(path)) for i, path in enumerate(Path("documents").glob("*.txt"))
]

print(f"Loaded {len(documents)} documents")

Loaded 27 documents


In [247]:
for doc in documents:
    print(f"Document {doc.id}: {doc.text}")
    print("-------")


Document 0: Hello beautiful world today. 
-------
Document 1: Hello beautiful world today. 
-------
Document 2: The field of Diverse Intelligence aims to identify, formalize, and understand commonalities in behavioral competencies
-------
Document 3: Machine learning algorithms have revolutionized weather prediction accuracy, but challenges remain in extreme event forecasting. This paper presents a hybrid approach combining traditional numerical weather models with deep learning networks, achieving a 40% improvement in tropical cyclone path prediction. Our method processes real-time satellite data and historical storm patterns to generate high-resolution forecasts up to 10 days in advance, significantly outperforming existing systems. 
-------
Document 4: The field of Diverse Intelligence aims to identify, formalize, and understand commonalities in behavioral competencies
-------
Document 5: Neuralss networks have transformed natural language process, yet their energy requirements rema

In [250]:
deduplicated_documents = dedup(documents, 5, 1000, 0.7)
print(f"Kept: {len(deduplicated_documents)}/{len(documents)} documents")


Kept: 24/27 documents


In [251]:
print("Kept documents:")
for doc in deduplicated_documents:
    print(f"Document {doc.id}: {doc.text}")
    print("-------")


Kept documents:
Document 0: Hello beautiful world today. 
-------
Document 1: Hello beautiful world today. 
-------
Document 2: The field of Diverse Intelligence aims to identify, formalize, and understand commonalities in behavioral competencies
-------
Document 3: Machine learning algorithms have revolutionized weather prediction accuracy, but challenges remain in extreme event forecasting. This paper presents a hybrid approach combining traditional numerical weather models with deep learning networks, achieving a 40% improvement in tropical cyclone path prediction. Our method processes real-time satellite data and historical storm patterns to generate high-resolution forecasts up to 10 days in advance, significantly outperforming existing systems. 
-------
Document 5: Neuralss networks have transformed natural language process, yet their energy requirements remain problematic. This research introduces a novel method for reducing the computational demands of large language models whi

In [252]:
removed_documents = [doc for doc in documents if doc.id not in [doc.id for doc in deduplicated_documents]]
print(f"Removed: {len(removed_documents)}/{len(documents)} documents")

print("Removed documents:")
for doc in removed_documents:
    print(f"Document {doc.id}: {doc.text}")
    print("-------")

Removed: 3/27 documents
Removed documents:
Document 4: The field of Diverse Intelligence aims to identify, formalize, and understand commonalities in behavioral competencies
-------
Document 6: Neural networks have transformed natural language processing, yet their energy requirements remain problematic. This research introduces a novel method for reducing the computational demands of large language models while preserving performance. Through the implementation of sparse attention patterns and progressive network pruning, we achieve a 70% decrease in energy usage with only a 2% accuracy trade-off on standard benchmarks. These results demonstrate that environmental sustainability in AI systems is achievable without significant performance degradation. 
-------
Document 12: Historic space mission successfully lands humans on Mars.

After a seven-month journey, the international Mars mission has achieved humanity's first crewed landing on the Red Planet.

"This marks the beginning of a n