In [1]:
import json
from transformers import TrainingArguments
from llm2vec_da.data_utils import load_raw_datasets
from llm2vec_da.arguments import DataTrainingArguments, ModelArguments

In [3]:
# Location of the JSON config consumed by this notebook, relative to the
# working directory the kernel was started in.
CONFIG_PATH = 'llm2vec-da/configs/mntp/MetaLlama3-swe-dk.json'

# Pass the encoding explicitly: without it, open() falls back to the platform's
# locale encoding, so the same file could decode differently across machines.
with open(CONFIG_PATH, encoding='utf-8') as f:
    config = json.load(f)

print(f'Loading data from: {config["dataset_name"]}')
# Only the dataset name is forwarded from the config; no other field is passed,
# so the remaining data/model arguments keep whatever defaults their classes define.
data_args = DataTrainingArguments(dataset_name = config['dataset_name'])
model_args = ModelArguments()
# "/tmp" is passed as the first positional argument (the output directory per the
# Hugging Face TrainingArguments signature); no training is launched in the
# visible cells of this notebook.
training_args = TrainingArguments("/tmp")

Loading data from: jealk/wiki40b-da-clean


In [4]:
# Load the raw dataset using the data/model argument objects built in the
# previous cell (data_args carries the dataset name read from the config).
raw_datasets = load_raw_datasets(data_args, model_args)
# Bare trailing expression so the notebook renders the result; the recorded
# output is a DatasetDict with train/validation/test splits, each exposing a
# single 'text' feature.
raw_datasets

Downloading readme:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/129M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.23M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/109486 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6173 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6219 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 109486
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 6173
    })
    test: Dataset({
        features: ['text'],
        num_rows: 6219
    })
})

In [5]:
from transformers import AutoTokenizer

# Checkpoint identifier taken from the config's "model_name_or_path" entry;
# the tokenizer is loaded from this same identifier.
tokenizer_checkpoint = config["model_name_or_path"]

print(f'Loading tokenizer from: {config["model_name_or_path"]}')
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

Loading tokenizer from: AI-Sweden-Models/Llama-3-8B-instruct


In [6]:
from llm2vec_da.preprocessing import group_texts, tokenize_datasets

# Tokenize the raw splits with the tokenizer loaded above. The recorded
# progress bars show one pass per split (109486 / 6173 / 6219 examples).
# `group_texts` is imported here but first used in the following cell.
tokenized_datasets = tokenize_datasets(
    raw_datasets,
    tokenizer,
    data_args,
    training_args,
)

Running tokenizer on every text in dataset:   0%|          | 0/109486 [00:00<?, ? examples/s]

Running tokenizer on every text in dataset:   0%|          | 0/6173 [00:00<?, ? examples/s]

Running tokenizer on every text in dataset:   0%|          | 0/6219 [00:00<?, ? examples/s]

In [7]:
from functools import partial

# Pre-bind max_seq_length=512 so that `map` only has to supply the batch.
group_into_blocks = partial(group_texts, max_seq_length=512)

# batched=True passes a batch of examples per call, which lets the mapped
# function return a different number of rows than it received (the saved
# splits in the next cell have more rows than the tokenized ones).
grouped_datasets = tokenized_datasets.map(group_into_blocks, batched=True)

Map:   0%|          | 0/109486 [00:00<?, ? examples/s]

Map:   0%|          | 0/6173 [00:00<?, ? examples/s]

Map:   0%|          | 0/6219 [00:00<?, ? examples/s]

In [8]:
# Persist the grouped dataset to the local directory "data/mntp_wiki_dk_512"
# (relative to the kernel's working directory). The recorded output shows
# three save passes of 127761, 7192 and 7435 examples.
grouped_datasets.save_to_disk("data/mntp_wiki_dk_512")

Saving the dataset (0/1 shards):   0%|          | 0/127761 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7192 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7435 [00:00<?, ? examples/s]