In [1]:
import json
import os

# Handle Lightning AI Studio: when the kernel's working directory is somewhere
# under /teamspace, switch to the repository checkout so that the relative
# paths used later in this notebook (configs/..., data/...) resolve against it.
# NOTE(review): the llm2vec_da imports below presumably also rely on this
# working directory — confirm before moving them above this block.
if '/teamspace' in os.getcwd():
    os.chdir('/teamspace/studios/this_studio/llm2vec-da')
    print(os.getcwd())  # echo the new working directory as a sanity check

from transformers import TrainingArguments
from llm2vec_da.data_utils import load_raw_datasets
from llm2vec_da.arguments import MNTPDataTrainingArguments, ModelArguments

In [2]:
# Read the MNTP experiment configuration. JSON is UTF-8 by specification, so
# the encoding is passed explicitly instead of relying on the platform's
# locale default (which differs between machines).
with open('configs/mntp/MetaLlama3-swe-scandi-long.json', encoding='utf-8') as f:
    config = json.load(f)

print(f'Loading data from: {config["dataset_name"]}')
# Only `dataset_name` is taken from the config here; every other data/model
# option keeps the default of its arguments class.
data_args = MNTPDataTrainingArguments(dataset_name=config['dataset_name'])
model_args = ModelArguments()
# The positional argument is the output directory.
# NOTE(review): "/tmp" looks like a throwaway location since this notebook
# only prepares data — confirm nothing downstream writes there.
training_args = TrainingArguments("/tmp")

Loading data from: jealk/scandi-wiki-combined


In [3]:
# Load the dataset named in data_args. Per the recorded output below, the
# result is a DatasetDict with `train` and `validation` splits whose row
# counts (391953 + 20629) add up to the single downloaded train split (412582).
# NOTE(review): the validation split is presumably carved out inside
# load_raw_datasets — confirm the split ratio and seed there.
raw_datasets = load_raw_datasets(data_args, model_args)
raw_datasets  # bare last expression so the notebook displays the splits

README.md:   0%|          | 0.00/720 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/426M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/412582 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 391953
    })
    validation: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 20629
    })
})

In [4]:
from transformers import AutoTokenizer

# Resolve the checkpoint id once so the log line and the actual load cannot drift apart.
model_id = config["model_name_or_path"]
print(f'Loading tokenizer from: {model_id}')
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading tokenizer from: AI-Sweden-Models/Llama-3-8B-instruct


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [5]:
from llm2vec_da.preprocessing import tokenize_datasets, group_texts

# Tokenize both splits of raw_datasets with the tokenizer loaded earlier in
# this notebook (the recorded output shows one pass per split).
# NOTE(review): how data_args / training_args influence tokenization is
# defined in llm2vec_da.preprocessing and is not visible here — confirm.
tokenized_datasets = tokenize_datasets(raw_datasets, tokenizer, data_args, training_args)

Running tokenizer on every text in dataset:   0%|          | 0/391953 [00:00<?, ? examples/s]

Running tokenizer on every text in dataset:   0%|          | 0/20629 [00:00<?, ? examples/s]

In [6]:
from functools import partial

# Bind the sequence length once, then apply the grouping batch-wise to each split.
group_into_blocks = partial(group_texts, max_seq_length=512)
grouped_datasets = tokenized_datasets.map(group_into_blocks, batched=True)

Map:   0%|          | 0/391953 [00:00<?, ? examples/s]

Map:   0%|          | 0/20629 [00:00<?, ? examples/s]

In [7]:
# Persist the grouped splits to disk. The path is relative, so it resolves
# against the current working directory.
# NOTE(review): the "512" suffix presumably mirrors the max_seq_length used
# when grouping — keep the two in sync if that value changes.
grouped_datasets.save_to_disk("data/mntp_wiki_scandi_512")

Saving the dataset (0/3 shards):   0%|          | 0/390019 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20696 [00:00<?, ? examples/s]