In [2]:
# Log in to the Hugging Face Hub through the interactive notebook widget.
# NOTE(review): presumably required for the `push_to_hub` cells further down — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import load_dataset

# Load the raw dataset from the Hub; later cells index the result by the
# 'train', 'validation' and 'test' splits and read its 'text' column.
ds = load_dataset("alexandrainst/wiki40b-da")

In [None]:
def clean_paragraphs(text):
    """Return only the paragraph text of a wiki40b-formatted article.

    The raw text marks structure with inline tokens: `_START_PARAGRAPH_`
    precedes a paragraph body, `_START_SECTION_` precedes a section heading
    and `_NEWLINE_` is a line break inside a paragraph.

    Args:
        text: Raw article string containing the structure tokens.

    Returns:
        All paragraph bodies joined by single spaces, with structure tokens,
        section headings and redundant whitespace removed. An empty string is
        returned when `text` contains no `_START_PARAGRAPH_` token.
    """
    # Everything before the first paragraph marker (title, first heading) is dropped.
    chunks = text.split('_START_PARAGRAPH_')[1:]

    bodies = []
    for chunk in chunks:
        # A chunk runs up to the next paragraph marker, so it can end with
        # "_START_SECTION_ <heading>" belonging to the following section.
        # Cut it off so the heading is not glued onto this paragraph.
        body = chunk.split('_START_SECTION_', 1)[0]
        # Turn in-paragraph line breaks into spaces, then collapse every
        # whitespace run (including real newlines) into a single space.
        body = ' '.join(body.replace('_NEWLINE_', ' ').split())
        if body:  # skip empty paragraphs so the join adds no double spaces
            bodies.append(body)

    return ' '.join(bodies)

# Apply the transformation to all splits of the dataset
def clean_dataset(ds):
    """Return `ds` with `clean_paragraphs` applied to every example's 'text' field.

    `ds` must provide a Hugging Face style `map` method (e.g. a DatasetDict);
    whatever that `map` call returns is returned unchanged.
    """
    def _clean_example(example):
        # map runs with batched=False, so this receives one example at a time
        # and example['text'] is passed to clean_paragraphs as a single value.
        example['text'] = clean_paragraphs(example['text'])
        return example

    return ds.map(_clean_example, batched=False)

# Clean the text in all splits (train/validation/test). The result is bound to a
# new name, `ds_clean`, so the raw `ds` stays available.
ds_clean = clean_dataset(ds)

In [None]:
# Preview the first five cleaned validation texts. Slice the rows first and then
# pick the column, so only five rows are read instead of the whole 'text' column.
ds_clean['validation'][:5]['text']

In [None]:
# Drop the wikidata_id column and version_id column from all splits
def drop_columns(ds, columns=('wikidata_id', 'version_id')):
    """Remove metadata columns from a Hugging Face Dataset or DatasetDict.

    Uses `remove_columns` rather than popping keys inside `map`, so the rows
    are not re-processed one by one and the result does not depend on how
    `map` merges a mutated example back into the dataset.

    Args:
        ds: A Dataset, or a dict-like DatasetDict mapping split name -> Dataset.
        columns: Names of the columns to drop. Names that a split does not
            contain are ignored.

    Returns:
        An object of the same kind as `ds` without the requested columns.
    """
    def _drop(split):
        # remove_columns raises on unknown names, so only request columns that
        # exist; this also keeps the function safe to run twice.
        present = [name for name in columns if name in split.column_names]
        return split.remove_columns(present) if present else split

    if isinstance(ds, dict):
        # DatasetDict is a dict of splits; handle each split on its own because
        # the set of columns present may differ between splits.
        return type(ds)({split_name: _drop(split) for split_name, split in ds.items()})
    return _drop(ds)

# Drop the ID columns from all splits (train/validation/test). `ds_clean` is
# rebound to the result, so from here on it refers to the column-reduced data.
ds_clean = drop_columns(ds_clean)

In [None]:
# Save the cleaned dataset to a local directory. This assumes the cleaning and
# column-dropping cells above have already been run so `ds_clean` is final.
ds_clean.save_to_disk("data/wiki40b-da")

In [None]:
# Push the in-memory `ds_clean` to the Hub repo as the "default" configuration
# (set_default=True marks it as the repo's default config).
ds_clean.push_to_hub(repo_id="jealk/wiki40b-da-clean", config_name="default", set_default=True, commit_message="Prev commit was original data: Now: Removed text formatting and article titles, dropped ID columns")

### SimCSE: split paragraphs into sentences

In [None]:
import re
from datasets import DatasetDict

def split_into_sentences(text_batch):
    """
    Split every text in `text_batch` into sentences and return them as one flat list.

    A sentence boundary is the position right after '.', '!' or '?' when that
    punctuation is followed by whitespace and an uppercase letter (A-Z or the
    Danish Æ, Ø, Å). Each returned sentence is stripped of surrounding
    whitespace and empty pieces are discarded, so every list element can
    become its own dataset row.
    """
    # Zero-width pattern: the lookbehind keeps the punctuation with the sentence
    # it ends, the lookahead leaves the whitespace on the following piece.
    boundary = re.compile(r'(?<=[.!?])(?=\s+[A-ZÆØÅ])')

    stripped_pieces = (
        piece.strip()
        for text in text_batch
        for piece in boundary.split(text.strip())
    )

    # Drop pieces that were empty or whitespace-only.
    return [sentence for sentence in stripped_pieces if sentence]

def create_sentences_dataset(ds, batch_size=1000, num_proc=4):
    """
    Split the 'text' column of a Hugging Face dataset into sentences and return
    a new dataset in which every sentence is its own row.

    Args:
        ds: A Dataset with a 'text' column, or a dict-like DatasetDict of such
            datasets (each split is then processed separately).
        batch_size: Number of input rows handed to the splitter per batch.
        num_proc: Number of worker processes used by `map`.

    Returns:
        A dataset of the same kind as `ds` whose only column is 'text', holding
        one sentence per row.
    """
    if isinstance(ds, dict):
        # DatasetDict: recurse per split so `column_names` below is always the
        # plain list of a single split.
        return type(ds)({
            split_name: create_sentences_dataset(split, batch_size=batch_size, num_proc=num_proc)
            for split_name, split in ds.items()
        })

    def _explode(batch):
        # In batched mode batch["text"] is a list of texts; the returned list
        # is usually longer because each text yields several sentences.
        return {"text": split_into_sentences(batch["text"])}

    ds_sentences = ds.map(
        _explode,
        batched=True,
        batch_size=batch_size,
        num_proc=num_proc,
        # The output batch has a different row count than the input batch, so
        # every input column must be removed; otherwise leftover columns would
        # be merged back with mismatching lengths and the map would fail.
        remove_columns=ds.column_names,
    )

    # The batched map above already yields one row per sentence. flatten_indices()
    # does not create rows; per the datasets docs it only materialises an indices
    # mapping if one exists. It is kept as a cheap safeguard.
    ds_sentences = ds_sentences.flatten_indices()

    return ds_sentences

# Build the sentence-level DatasetDict from the cleaned, paragraph-level splits
# held in `ds_clean`.
ds_sentences = DatasetDict({
    split_name: create_sentences_dataset(ds_clean[split_name])
    for split_name in ("train", "validation", "test")
})

In [None]:
# Total number of sentences across all splits before the length filter is applied.
sentences_before_filtering = sum(len(split_ds) for split_ds in ds_sentences.values())

def filter_short_and_long_sentences(example, min_words=5, max_words=100):
    """
    Decide whether a sentence is kept, based on its whitespace-separated word count.

    Args:
        example: Mapping with a 'text' entry holding the sentence.
        min_words: Smallest word count that is still kept (inclusive).
        max_words: Largest word count that is still kept (inclusive).

    Returns:
        True if min_words <= word count <= max_words, otherwise False. With the
        defaults, sentences shorter than 5 or longer than 100 words are rejected.
    """
    # str.split() without arguments splits on any whitespace run and ignores
    # leading/trailing whitespace, so extra spaces do not inflate the count.
    word_count = len(example['text'].split())
    return min_words <= word_count <= max_words

# Keep only the sentences accepted by the length filter, in every split.
ds_sentences_filtered = ds_sentences.filter(filter_short_and_long_sentences)

# Total number of sentences across all splits that survived the filter.
sentences_after_filtering = sum(len(split_ds) for split_ds in ds_sentences_filtered.values())

# Report how many sentences the filter discarded.
print(f"{sentences_before_filtering - sentences_after_filtering} sentences was removed, of the total {sentences_before_filtering} sentences.")

In [None]:
# Save the length-filtered, sentence-level dataset to a local directory.
ds_sentences_filtered.save_to_disk("data/wiki40b-da-sentences")

In [None]:
# Push the in-memory, length-filtered sentence dataset to the same Hub repo as a
# separate "sentences" configuration (set_default=False, so it is not the default config).
ds_sentences_filtered.push_to_hub(repo_id="jealk/wiki40b-da-clean", config_name="sentences", set_default=False, commit_message="Filtered sentences to be between 5 and 100 words long")