In [2]:
from huggingface_hub import notebook_login
from datasets import load_dataset, concatenate_datasets, DatasetDict

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Create an empty list to hold datasets
all_datasets = []

# Load and sample the Danish dataset
da_dataset = load_dataset("alexandrainst/scandi-wiki", 'da', split='train')
da_sampled = da_dataset.shuffle().select(range(200000))
all_datasets.append(da_sampled)

# List of other language codes
other_langs = ['fo', 'is', 'nb', 'nn', 'sv']

# Sample 50,000 instances from each of the other languages
for lang in other_langs:
    lang_dataset = load_dataset("alexandrainst/scandi-wiki", lang, split='train')
    if len(lang_dataset) < 50000:
        lang_sampled = lang_dataset
    else:
        lang_sampled = lang_dataset.shuffle().select(range(50000))
    all_datasets.append(lang_sampled)

# Combine all datasets into one
combined_dataset = concatenate_datasets(all_datasets)
combined_dataset = combined_dataset.shuffle()
combined_dataset.save_to_disk("data/combined_scandi_wiki")

Saving the dataset (0/1 shards):   0%|          | 0/412582 [00:00<?, ? examples/s]

In [4]:
# Push the directory with the multiple configurations to the Hub
combined_dataset.push_to_hub(repo_id="jealk/scandi-wiki-combined", config_name="default", set_default=True, commit_message="Increased number of da samples to 200k")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/413 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/720 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jealk/scandi-wiki-combined/commit/08cc77f0f49d2fb47b4c477455d34c9697efe50b', commit_message='Increased number of da samples to 200k', commit_description='', oid='08cc77f0f49d2fb47b4c477455d34c9697efe50b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/jealk/scandi-wiki-combined', endpoint='https://huggingface.co', repo_type='dataset', repo_id='jealk/scandi-wiki-combined'), pr_revision=None, pr_num=None)

### SIMCSE, Split to sentences

In [3]:
#Drop the wikipedia_id column and version_id column from all splits
def drop_columns(ds):
    """Function to drop columns from a Hugging Face DatasetDict."""
    # Define the transformation to apply to each instance
    def apply_drop_fn(batch):
        batch.pop('id', None)
        batch.pop('url', None)
        batch.pop('title', None)
        return batch    
    
    # Use `map` function to apply `apply_drop_fn` to the entire dataset (train, test, validation)
    ds = ds.map(apply_drop_fn, batched=False)

    return ds
    
# Load and sample the Danish dataset
ds = load_dataset("jealk/scandi-wiki-combined")
ds = drop_columns(ds)

Downloading readme:   0%|          | 0.00/720 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/366M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/362582 [00:00<?, ? examples/s]

Map:   0%|          | 0/362582 [00:00<?, ? examples/s]

In [4]:
import re
from datasets import DatasetDict

def split_into_sentences(text_batch):
    """
    Splits each text entry in the input batch of texts into sentences using punctuation followed by capital letters
    (., !, and ?) to recognize sentence boundaries. 
    Ensures that each split sentence becomes a new row in the dataset.
    """
    # Regex to split sentences on .,!,? only when followed by whitespace and a capital letter (or special Danish capital letters)
    split_regex = r'(?<=[.!?])(?=\s+[A-ZÆØÅ])'

    # List to store all sentences across the batch
    all_sentences = []

    # Process each text in the batch
    for text in text_batch:
        # Perform the smart splitting using the regex
        sentences = re.split(split_regex, text.strip())

        # Filter out empty results and remove extra whitespace from sentences, then extend the result list
        all_sentences.extend([sentence.strip() for sentence in sentences if sentence.strip()])

    # Return the flattened list of all sentences for this batch
    return all_sentences

def create_sentences_dataset(ds):
    """
    Given a Hugging Face dataset, it splits paragraphs into sentences and returns a new dataset
    where each sentence is a new row.
    """
    # Apply the map function to split into sentences
    ds_sentences = ds.map(
        lambda batch: {"text": split_into_sentences(batch["text"])},
        batched=True,  # Process on batches
        batch_size=1000,  # Adjust based on memory/capability, 1000 is a good value for efficiency
        num_proc=4,  # Use multiple processes for performance (adjust based on machine)
    )

    # Flatten the dataset’s structure (make sure every sentence is an individual row)
    ds_sentences = ds_sentences.flatten_indices()

    return ds_sentences

# Assuming 'ds' is the original dataset containing paragraphs
ds_sentences = DatasetDict({
    "train": create_sentences_dataset(ds['train']),
})

Map (num_proc=4):   0%|          | 0/362582 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/4164833 [00:00<?, ? examples/s]

In [5]:
sentences_before_filtering = sum(len(ds_sentences[split]) for split in ds_sentences.keys())

def filter_short_and_long_sentences(example):
    """
    Filters out sentences that are shorter than 5 words or longer than 100 words.
    """
    word_count = len(example['text'].split())  # Calculate word count
    return 5 <= word_count <= 100  # Only keep sentences with 5 <= word_count <= 100

ds_sentences_filtered = ds_sentences.filter(filter_short_and_long_sentences)

sentences_after_filtering = sum(len(ds_sentences_filtered[split]) for split in ds_sentences_filtered.keys())

print(f"{sentences_before_filtering - sentences_after_filtering} sentences was removed, of the total {sentences_before_filtering} sentences.")

Filter:   0%|          | 0/4164833 [00:00<?, ? examples/s]

297647 sentences was removed, of the total 4164833 sentences.


In [27]:
# Shuffle sentences
ds_sentences_filtered = ds_sentences_filtered['train'].shuffle().select(range(1000000))

In [28]:
# Save the new sentence-level dataset to disk
ds_sentences_filtered.save_to_disk("data/scandi-wiki-combined-sentences")

Saving the dataset (0/1 shards):   0%|          | 0/1000000 [00:00<?, ? examples/s]

In [29]:
# Push the directory with the multiple configurations to the Hub
ds_sentences_filtered.push_to_hub(repo_id="jealk/scandi-wiki-combined", config_name="sentences", set_default=False, commit_message="Filtered sentences to be between 5 and 100 words long, 1M samples")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1000 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jealk/scandi-wiki-combined/commit/a6f2e5dbfb3737248d629385c137a6056c160a9e', commit_message='Filtered sentences to be between 5 and 100 words long, 1M samples', commit_description='', oid='a6f2e5dbfb3737248d629385c137a6056c160a9e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/jealk/scandi-wiki-combined', endpoint='https://huggingface.co', repo_type='dataset', repo_id='jealk/scandi-wiki-combined'), pr_revision=None, pr_num=None)

### Local txt file
Needed for default LLM2Vec data loader

In [1]:
from huggingface_hub import notebook_login
from datasets import load_dataset, concatenate_datasets, DatasetDict

scandi_dataset = load_dataset("jealk/scandi-wiki-combined", 'sentences')

Downloading data:   0%|          | 0.00/101M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

In [3]:
output_file = "data/wiki1m_for_simcse_scandi.txt"
train_texts = scandi_dataset['train']['text']

# Write the dataset contents into the txt file, one sentence per line
with open(output_file, 'w', encoding='utf-8') as f:
    for text in train_texts:
        f.write(text + '\n')  # Write each text line and ensure Danish characters are preserved