In [10]:
import json
import os

# Lightning AI Studio: when the working directory is somewhere under
# '/teamspace', switch into the project checkout so that relative paths
# are resolved from the repository root, then echo the new location.
working_dir = os.getcwd()
if '/teamspace' in working_dir:
    os.chdir('/teamspace/studios/this_studio/llm2vec-da')
    print(os.getcwd())

from transformers import TrainingArguments
from llm2vec_da.data_utils import load_raw_datasets
from llm2vec_da.arguments import MNTPDataTrainingArguments, ModelArguments

In [12]:
# Read the MNTP run configuration from the JSON file. Only the dataset name
# is forwarded to the data arguments; model arguments use their defaults.
config_path = 'configs/mntp/Llama32-1B-scandi.json'
with open(config_path) as config_file:
    config = json.load(config_file)

dataset_name = config["dataset_name"]
print(f'Loading data from: {dataset_name}')

data_args = MNTPDataTrainingArguments(dataset_name=dataset_name)
model_args = ModelArguments()
# "/tmp" is the first positional argument of TrainingArguments (its output
# directory); no training is launched in the cells shown here.
training_args = TrainingArguments("/tmp")

Loading data from: jealk/scandi-wiki-combined


In [None]:
from dotenv import load_dotenv

# Populate the process environment from a local .env file; that file should
# define HF_HUB_TOKEN (presumably needed for Hugging Face Hub access — confirm
# against load_raw_datasets).
load_dotenv()

# Load the raw dataset described by the data/model arguments and display it
# as the cell's output.
raw_datasets = load_raw_datasets(data_args, model_args)
raw_datasets

Loading from hub


README.md:   0%|          | 0.00/720 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/426M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/412582 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 391953
    })
    validation: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 20629
    })
})

In [15]:
from transformers import AutoTokenizer

# The tokenizer is loaded from the model identifier named in the run config.
model_id = config['model_name_or_path']
print(f'Loading tokenizer from: {model_id}')
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading tokenizer from: meta-llama/Llama-3.2-1B


In [16]:
from llm2vec_da.preprocessing import tokenize_datasets, group_texts

# Tokenize every split of the raw dataset with the tokenizer loaded above.
tokenized_datasets = tokenize_datasets(
    raw_datasets,
    tokenizer,
    data_args,
    training_args,
)

Running tokenizer on every text in dataset:   0%|          | 0/391953 [00:00<?, ? examples/s]

Running tokenizer on every text in dataset:   0%|          | 0/20629 [00:00<?, ? examples/s]

In [17]:
from functools import partial

# Bind the sequence length once, then apply the grouping function batch-wise
# to every split. NOTE(review): group_texts presumably concatenates and
# re-chunks token sequences into blocks of max_seq_length — confirm in
# llm2vec_da.preprocessing.
group_into_blocks = partial(group_texts, max_seq_length=512)
grouped_datasets = tokenized_datasets.map(group_into_blocks, batched=True)

Map:   0%|          | 0/391953 [00:00<?, ? examples/s]

Map:   0%|          | 0/20629 [00:00<?, ? examples/s]

In [18]:
# Write the grouped dataset to a local directory (path is relative to the
# current working directory).
local_dataset_dir = "data/llama-32-1B-tokenized_mntp_wiki_scandi_512"
grouped_datasets.save_to_disk(local_dataset_dir)

Saving the dataset (0/3 shards):   0%|          | 0/390019 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20696 [00:00<?, ? examples/s]

In [19]:
# Upload the grouped dataset to the Hugging Face Hub. The call is left as the
# cell's last expression so its return value is displayed.
hub_repo_id = "jealk/tokenized-llama32-wiki-scandi-512"
grouped_datasets.push_to_hub(hub_repo_id)

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/131 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/131 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/131 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jealk/tokenized-llama32-wiki-scandi-512/commit/3d65df40099e8ee33d8ed25915fca962857a11ce', commit_message='Upload dataset', commit_description='', oid='3d65df40099e8ee33d8ed25915fca962857a11ce', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/jealk/tokenized-llama32-wiki-scandi-512', endpoint='https://huggingface.co', repo_type='dataset', repo_id='jealk/tokenized-llama32-wiki-scandi-512'), pr_revision=None, pr_num=None)