In [1]:
from datasets import load_dataset, Dataset, DatasetDict
from tqdm import tqdm
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from datasketch import MinHash, MinHashLSH
import string
import os

In [2]:
dataset = load_dataset('miracl/miracl-corpus', 'ru')

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['docid', 'title', 'text'],
        num_rows: 9543918
    })
})

In [5]:
def filter_short_texts(example):
    return len(example['text']) >= 50 and len(example['text']) <= 100000

filtered_data = dataset['train'].filter(filter_short_texts, desc="Filtering texts")
filtered_data

Dataset({
    features: ['docid', 'title', 'text'],
    num_rows: 8835109
})

In [None]:
def preprocess_text(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator).lower()

def generate_ngrams(text, n=13):
    tokens = text.split()
    ngrams = set([" ".join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)])
    return ngrams

def create_minhash(ngrams):
    m = MinHash(num_perm=128)
    for gram in ngrams:
        m.update(gram.encode('utf8'))
    return m

def deduplicate_documents(dataset, similarity_threshold=0.8, ngram_size=13):
    lsh = MinHashLSH(threshold=similarity_threshold, num_perm=128)
    unique_docs = []
    added_docs = set()

    for i, doc in enumerate(dataset['text']):
        preprocessed_text = preprocess_text(doc)
        ngrams = generate_ngrams(preprocessed_text, n=ngram_size)
        minhash = create_minhash(ngrams)
        
        if len(lsh.query(minhash)) == 0:
            lsh.insert(f"doc_{i}", minhash)
            unique_docs.append(doc)
            added_docs.add(f"doc_{i}")
    
    return unique_docs

filtered_dataset = deduplicate_documents(
    dataset=filtered_data,  
    similarity_threshold=0.8,  
    ngram_size=13           
)

filtered_dataset

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

def split_text_into_chunks(text, chunk_size=1024):
    words = text.split()  
    chunks = [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

def calculate_perplexity(text, max_length=1024):

    chunks = split_text_into_chunks(text, chunk_size=max_length // 2)  
    total_loss = 0.0
    num_batches = 0

    for chunk in chunks:
        tokens = tokenizer.encode(chunk, return_tensors="pt")
        if tokens.size(1) > max_length:  
            tokens = tokens[:, :max_length]

        with torch.no_grad():
            outputs = model(tokens, labels=tokens)
            total_loss += outputs.loss.item()
            num_batches += 1

    avg_loss = total_loss / num_batches if num_batches > 0 else 1e9 
    perplexity = torch.exp(torch.tensor(avg_loss)).item()
    return perplexity

def filter_by_perplexity(data, perplexity_threshold=100, max_length=1024):

    progress_bar = tqdm(data, desc="Calculating perplexity", unit="text")

    def is_below_threshold(example):
        perplexity = calculate_perplexity(example['text'], max_length=max_length)
        return perplexity < perplexity_threshold

    filtered_data = data.filter(
        lambda example, idx: is_below_threshold(example),
        with_indices=True,
        desc="Filtering texts by perplexity"
    )

    progress_bar.close()
    return filtered_data

filtered_dataset = filter_by_perplexity(
    data=filtered_dataset,  
    perplexity_threshold=100,  
    max_length=1024  
)

In [24]:
len(filtered_dataset)

7580594

In [25]:
NewDataset = Dataset.from_dict({
    "_id": list(range(len(filtered_dataset))),  
    "text": filtered_dataset                    
})
NewDataset

Dataset({
    features: ['_id', 'text'],
    num_rows: 7580594
})

In [28]:
NewDataset = NewDataset.train_test_split(test_size=0.1)
NewDataset

DatasetDict({
    train: Dataset({
        features: ['_id', 'text'],
        num_rows: 6822534
    })
    test: Dataset({
        features: ['_id', 'text'],
        num_rows: 758060
    })
})

In [None]:
def deduplicate_train_by_test(train_dataset, test_dataset, similarity_threshold=0.8, ngram_size=13):

    lsh = MinHashLSH(threshold=similarity_threshold, num_perm=128)

    for i, doc in enumerate(test_dataset['text']):
        preprocessed_text = preprocess_text(doc)
        ngrams = generate_ngrams(preprocessed_text, n=ngram_size)
        minhash = create_minhash(ngrams)
        lsh.insert(f"test_doc_{i}", minhash)

    filtered_train = []
    for i, doc in enumerate(train_dataset['text']):
        preprocessed_text = preprocess_text(doc)
        ngrams = generate_ngrams(preprocessed_text, n=ngram_size)
        minhash = create_minhash(ngrams)

        if len(lsh.query(minhash)) == 0:
            filtered_train.append(train_dataset[i])  

    return filtered_train

train_filtered = deduplicate_train_by_test(
    train_dataset=NewDataset['train'],  
    test_dataset=NewDataset['test'],    
    similarity_threshold=0.8,  
    ngram_size=13
)

Calculating perplexity:   0%|                                                                                                                                                                                                | 0/7580594 [12:05:55<?, ?text/s]


In [46]:
train_part = {'_id': [item['_id'] for item in train_filtered], 
             'text': [item['text'] for item in train_filtered]}

In [47]:
train_part = Dataset.from_dict(train_part)

In [51]:
NewDataset['train'] = train_part
NewDataset

DatasetDict({
    train: Dataset({
        features: ['_id', 'text'],
        num_rows: 6822534
    })
    test: Dataset({
        features: ['_id', 'text'],
        num_rows: 758060
    })
})

In [53]:
NewDataset.push_to_hub("kngrg/ru-miracl-cleaned")

Uploading the dataset shards:   0%|          | 0/10 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/683 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/683 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/683 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/683 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/683 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/683 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/683 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/683 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/683 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/683 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/380 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/380 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/kngrg/ru-miracl-cleaned/commit/16122a52b444856a5b5383cb67c934106f74bf13', commit_message='Upload dataset', commit_description='', oid='16122a52b444856a5b5383cb67c934106f74bf13', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/kngrg/ru-miracl-cleaned', endpoint='https://huggingface.co', repo_type='dataset', repo_id='kngrg/ru-miracl-cleaned'), pr_revision=None, pr_num=None)

In [56]:
output_dir = "output_json"
os.makedirs(output_dir, exist_ok=True)

for split in NewDataset:
    json_path = os.path.join(output_dir, f"{split}.json")
    NewDataset[split].to_json(json_path)
    print(f"Saved {split} split to {json_path}")

Creating json from Arrow format:   0%|          | 0/6823 [00:00<?, ?ba/s]

Saved train split to output_json/train.json


Creating json from Arrow format:   0%|          | 0/759 [00:00<?, ?ba/s]

Saved test split to output_json/test.json
