In [None]:
# Source datasets for the combined Danish supervised set. Every entry names a
# Hugging Face dataset and says which of its columns feed the shared schema:
#   query / pos  -> source columns holding the anchor text and its positive
#   label        -> source column holding a score, or "" when there is none
#   config_name  -> optional Hub configuration to load (only set for "news")
#   loss         -> loss identifier carried along with the entry
dk_datasets = [
    # Query / passage pairs
    dict(
        dataset_name="wiki_queries_gemma",
        dataset_hf_path="DDSC/da-wikipedia-queries-gemma-processed",
        query="anchor",
        pos="positive",
        loss="multiple_negatives_ranking",
        label="",
    ),
    dict(
        dataset_name="hestenet",
        dataset_hf_path="kardosdrur/hestenet-qa",
        query="question",
        pos="answer",
        loss="multiple_negatives_ranking",
        label="",
    ),
    dict(
        dataset_name="folketinget",
        dataset_hf_path="kardosdrur/folketinget-discussions",
        query="comment",
        pos="response",
        loss="multiple_negatives_ranking",
        label="",
    ),
    dict(
        dataset_name="wiki_qa",
        dataset_hf_path="kardosdrur/dawiki_qa_zephyr",
        query="question",
        pos="answer",
        loss="multiple_negatives_ranking",
        label="",
    ),
    # Title / content pairs, loaded from a named Hub configuration
    dict(
        dataset_name="news",
        dataset_hf_path="jealk/danews_title_content_512",
        config_name="256-token",
        query="title",
        pos="content",
        loss="multiple_negatives_ranking",
        label="",
    ),
    # Parallel sentences (the only entry with a label column)
    dict(
        dataset_name="opensubtitles_da_no",
        dataset_hf_path="kardosdrur/opensubtitles-no-da",
        query="no",
        pos="da",
        loss="multiple_negatives_ranking",
        label="overlap",
    ),
    dict(
        dataset_name="europarl",
        dataset_hf_path="kardosdrur/europarl-scandinavian",
        query="sv",
        pos="da",
        loss="multiple_negatives_ranking",
        label="",
    ),
]

In [19]:
from datasets import load_dataset, concatenate_datasets, DatasetDict
from datasets import Dataset as HFDataset

# Row caps applied to each split of the named datasets before conversion;
# datasets that are not listed here are used in full.
_SAMPLE_CAPS = {"europarl": 10000, "opensubtitles_da_no": 10000, "news": 30000}


def _split_train_test(dataset_dict, seed):
    """Return a ``(train, test)`` pair for one loaded dataset.

    The published ``train`` and ``test`` splits are used when both exist.
    Otherwise 5% of the rows (at least one) are held out as the test set,
    taken from ``train`` or, when there is no ``train``, from the first
    available split.
    """
    if 'train' in dataset_dict and 'test' in dataset_dict:
        return dataset_dict['train'], dataset_dict['test']

    source_name = 'train' if 'train' in dataset_dict else next(iter(dataset_dict))
    source = dataset_dict[source_name]
    test_size = max(1, int(0.05 * len(source)))
    # Seeded so the hold-out is the same on every run, and indexed by split
    # name so the result does not depend on the ordering of ``.values()``.
    parts = source.train_test_split(test_size=test_size, seed=seed)
    return parts['train'], parts['test']


def _to_common_schema(dataset, entry):
    """Cap one split if required and project it onto the shared columns.

    The result has exactly the columns ``query``, ``pos``, ``dataset_name``
    and ``label``.
    """
    cap = _SAMPLE_CAPS.get(entry['dataset_name'])
    if cap is not None:
        dataset = dataset.select(range(min(cap, len(dataset))))

    label_column = entry.get('label')

    def to_row(example):
        raw_label = example[label_column] if label_column else None
        if raw_label is None or raw_label == '':
            # No label column configured, or the value is missing.
            label = 1.0
        else:
            # Test for "missing" explicitly instead of by truthiness: a
            # genuine score of 0 must stay 0.0 rather than turn into 1.0.
            label = max(0.0, min(1.0, float(raw_label)))
        return {
            'query': example[entry['query']],
            'pos': example[entry['pos']],
            'dataset_name': entry['dataset_name'],
            'label': label,
        }

    return dataset.map(to_row, remove_columns=dataset.column_names)


def create_combined_dataset(json_list, seed=42):
    """Load every configured dataset and merge them into one train/test set.

    Args:
        json_list: Dataset descriptions. Each entry needs ``dataset_name``,
            ``dataset_hf_path``, ``query`` and ``pos``; ``config_name`` and
            ``label`` are optional.
        seed: Seed for the random hold-out created when a dataset does not
            ship both a ``train`` and a ``test`` split.

    Returns:
        A ``DatasetDict`` with ``train`` and ``test`` splits whose columns are
        ``query``, ``pos``, ``dataset_name`` and ``label``.

    Raises:
        ValueError: If ``json_list`` is empty.
        Exception: If loading or splitting a dataset raises ``ValueError``;
            the original error is attached as the cause.
    """
    if not json_list:
        # Fail before any download instead of deep inside concatenation.
        raise ValueError("json_list must contain at least one dataset entry")

    combined_train_data = []
    combined_test_data = []

    for entry in json_list:
        try:
            if 'config_name' in entry:
                dataset_dict = load_dataset(entry['dataset_hf_path'], name=entry['config_name'])
            else:
                dataset_dict = load_dataset(entry['dataset_hf_path'])
            train_dataset, test_dataset = _split_train_test(dataset_dict, seed)
        except ValueError as err:
            # Chain the cause so the underlying loader message is not lost.
            raise Exception(f"Error loading dataset: {entry['dataset_hf_path']}") from err

        combined_train_data.append(_to_common_schema(train_dataset, entry))
        combined_test_data.append(_to_common_schema(test_dataset, entry))

    # Concatenate all train and test datasets into one
    return DatasetDict({
        "train": concatenate_datasets(combined_train_data),
        "test": concatenate_datasets(combined_test_data),
    })

In [20]:
# Build the combined train/test DatasetDict from every source listed in dk_datasets.
combined_dataset = create_combined_dataset(dk_datasets)

Map:   0%|          | 0/28766 [00:00<?, ? examples/s]

Map:   0%|          | 0/1514 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [21]:
# Publish the combined DatasetDict to the Hub repo "jealk/supervised-da" as its
# default configuration (config_name="default", set_default=True).
combined_dataset.push_to_hub(repo_id="jealk/supervised-da", config_name="default", set_default=True, commit_message="Filtered da-news to only include 256 token context, and truncated at 30.000 samples")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/94 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/56 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/498 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jealk/supervised-da/commit/7a832ba37cc3cb21c18d5eb8b61d9500e6f8f213', commit_message='Filtered da-news to only include 256 token context, and truncated at 30.000 samples', commit_description='', oid='7a832ba37cc3cb21c18d5eb8b61d9500e6f8f213', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/jealk/supervised-da', endpoint='https://huggingface.co', repo_type='dataset', repo_id='jealk/supervised-da'), pr_revision=None, pr_num=None)

### Local cache for the llama2vec default training script

In [None]:
from datasets import load_dataset
# Reload the published dataset from the Hub. This rebinds `combined_dataset`,
# replacing any in-memory result built earlier in the notebook.
combined_dataset = load_dataset("jealk/supervised-da")

README.md:   0%|          | 0.00/489 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/51.3M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/38.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/93200 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/55124 [00:00<?, ? examples/s]

### Saving each dataset as a separate JSONL

In [None]:
from datasets import Dataset
import os
import json

def save_combined_dataset_as_jsonl(combined_dataset, save_directory):
    """Write one ``<dataset_name>.jsonl`` file per source dataset.

    Args:
        combined_dataset: Iterable of rows with the keys ``query``, ``pos``
            and ``dataset_name``.
        save_directory: Output directory; created if it does not exist.

    Every output line is a JSON object with the keys ``query``, ``positive``
    and ``negative``. ``negative`` is always an empty-string placeholder.
    """
    os.makedirs(save_directory, exist_ok=True)

    # Bucket the rows by their source dataset, keeping first-seen order.
    records_by_name = {}
    for row in combined_dataset:
        bucket = records_by_name.setdefault(row['dataset_name'], [])
        bucket.append({
            "query": row['query'],
            "positive": row['pos'],
            "negative": "",  # placeholder, no negatives are available here
        })

    # One file per bucket, one JSON document per line.
    for name, records in records_by_name.items():
        target_path = os.path.join(save_directory, f"{name}.jsonl")
        with open(target_path, 'w', encoding='utf-8') as handle:
            handle.writelines(json.dumps(record) + "\n" for record in records)

    print(f"Datasets saved in {save_directory} as JSONL files.")


# Write the train split to cache/dk-data, one JSONL file per source dataset.
save_directory = "cache/dk-data"
save_combined_dataset_as_jsonl(combined_dataset['train'], save_directory)

Datasets saved in cache/dk-data as JSONL files.


### Saving the dataset as a combined JSONL
**NOTE**: The train split is shuffled before it is written to the file.

In [23]:
from datasets import Dataset
import json
import os

# Shuffle the train split so the source datasets are interleaved in the export.
# The seed is fixed so a fresh run of the notebook writes the rows in the same
# order; an unseeded shuffle produced a different file on every run.
combined_dataset['train'] = combined_dataset['train'].shuffle(seed=42)

# Define the save path
save_directory = "cache/dk-data"
os.makedirs(save_directory, exist_ok=True)
jsonl_filepath = os.path.join(save_directory, "supervised_dk_combined.jsonl")

# Save the shuffled samples as a single JSONL file, one JSON object per line
with open(jsonl_filepath, 'w', encoding='utf-8') as f:
    for sample in combined_dataset['train']:
        json_sample = {
            "query": sample['query'],
            "positive": sample['pos'],
            "negative": ""  # Placeholder: customize based on your logic for negative samples
        }
        f.write(json.dumps(json_sample) + "\n")

print(f"Combined dataset saved in {jsonl_filepath}")

Combined dataset saved in cache/dk-data/supervised_dk_combined.jsonl


### Filtering the news dataset

In [None]:
from datasets import load_dataset
import tiktoken
# Tokenizer used below to measure the length of 'content' in tokens.
encoding = tiktoken.encoding_for_model('gpt-4o')

# Load the dataset
dataset = load_dataset("kardosdrur/danews_title_content")

# Define a function to filter based on the length of 'content'
def filter_by_content_length(example):
    """Return True when the example's 'content' has fewer than 5000 characters."""
    return len(example['content']) < 5000

# First pass on the train split: character-length filter (presumably a cheap
# pre-filter so that very long articles are never tokenized).
filtered_dataset = dataset['train'].filter(filter_by_content_length)
# Second pass: keep only examples whose content encodes to fewer than 512
# tokens (content of exactly 512 tokens is dropped as well).
# NOTE(review): the result is pushed under the config name "256-token" with the
# commit message "max 256 tokens", which does not match this 512 threshold.
# Confirm which limit is intended.
filtered_dataset = filtered_dataset.filter(lambda x: len(encoding.encode(x['content'])) < 512)

In [None]:

# Publish the filtered news split to the Hub repo "jealk/danews_title_content_512"
# as the non-default configuration "256-token".
# NOTE(review): the config name and commit message say 256 tokens. Confirm that
# this matches the token threshold used when `filtered_dataset` was built.
filtered_dataset.push_to_hub(repo_id="jealk/danews_title_content_512", config_name="256-token", set_default=False, commit_message="Version filtered down to max 256 tokens")