In [7]:
import json
from transformers import TrainingArguments
from llm2vec_da.data_utils import load_raw_datasets
from llm2vec_da.arguments import DataTrainingArguments, ModelArguments

In [None]:
# Read the experiment configuration from the JSON file in the working directory.
# The encoding is pinned explicitly: JSON is UTF-8 by specification, while
# open() without an encoding falls back to the platform's locale encoding,
# which can mis-decode non-ASCII content on some systems.
with open('llama-swe-it-mntp.json', encoding='utf-8') as f:
    config = json.load(f)

print(f'Loading data from: {config["dataset_name"]}')
# Only the dataset name is taken from the config here; every other field of
# DataTrainingArguments and ModelArguments keeps its default value.
data_args = DataTrainingArguments(dataset_name = config['dataset_name'])
model_args = ModelArguments()
# TrainingArguments is built with "/tmp" as its only (positional) argument,
# i.e. the output directory; all other training settings stay at defaults.
training_args = TrainingArguments("/tmp")

Loading data from: jealk/wiki40b-da-clean


In [10]:
# Load the dataset splits selected by data_args / model_args. The bare name on
# the last line makes the notebook render the resulting object as cell output.
raw_datasets = load_raw_datasets(
    data_args,
    model_args,
)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 109486
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 6173
    })
    test: Dataset({
        features: ['text'],
        num_rows: 6219
    })
})

In [12]:
from transformers import AutoTokenizer

# Print the checkpoint id first so the cell output records which tokenizer
# was requested, then load it from the same config entry.
print(f'Loading tokenizer from: {config["model_name_or_path"]}')
tokenizer = AutoTokenizer.from_pretrained(
    config['model_name_or_path'],
)

Loading tokenizer from: AI-Sweden-Models/Llama-3-8B-instruct


In [None]:
from llm2vec_da.preprocessing import group_texts, tokenize_datasets

# Run the project's tokenization helper over the raw splits. group_texts is
# imported here as well because the grouping cell further down uses it.
tokenized_datasets = tokenize_datasets(
    raw_datasets,
    tokenizer,
    data_args,
    training_args,
)

Running tokenizer on every text in dataset:   0%|          | 0/109486 [00:00<?, ? examples/s]

Running tokenizer on every text in dataset:   0%|          | 0/6173 [00:00<?, ? examples/s]

Running tokenizer on every text in dataset:   0%|          | 0/6219 [00:00<?, ? examples/s]

In [15]:
from functools import partial

# Bind max_seq_length=1024 once, then apply the grouping function to each
# split in batched mode.
# NOTE(review): group_texts presumably concatenates and re-chunks token
# sequences — confirm in llm2vec_da.preprocessing.
group_texts_1024 = partial(group_texts, max_seq_length=1024)
grouped_datasets = tokenized_datasets.map(group_texts_1024, batched=True)

Map:   0%|          | 0/109486 [00:00<?, ? examples/s]

Map:   0%|          | 0/6173 [00:00<?, ? examples/s]

Map:   0%|          | 0/6219 [00:00<?, ? examples/s]

In [None]:
# Persist the grouped splits to a directory relative to the notebook's
# working directory; the "1024" suffix mirrors the max_seq_length used above.
grouped_output_path = "data/mntp_wiki_dk_1024"
grouped_datasets.save_to_disk(grouped_output_path)

Saving the dataset (0/1 shards):   0%|          | 0/63851 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3595 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3717 [00:00<?, ? examples/s]