In [None]:
# Run the installs below in the virtual environment from a terminal:
# pip install tqdm
# pip install bittensor
# pip install datasets
# pip install omegaconf
# pip install hydra-core --upgrade
# pip install accelerate

In [1]:
import os
from tqdm.auto import tqdm

import datasets
from datasets import Dataset, DatasetDict, load_dataset

from omegaconf import OmegaConf
from omegaconf.dictconfig import DictConfig

from accelerate import Accelerator

import transformers
from transformers import (
    AutoTokenizer,
)

import bittensor

has no attribute 'buffer'


In [3]:
def check_cfg_and_load_defaults(cfg: DictConfig) -> DictConfig:
    """Fill unset batching options in ``cfg`` from the bittensor subtensor.

    A subtensor handle is created for ``cfg.bittensor.network``; each option
    listed below that is currently ``None`` is overwritten, in place, with the
    matching subtensor attribute:

    * ``cfg.dataset.block_size``        <- ``validator_sequence_length``
    * ``cfg.training.train_batch_size`` <- ``validator_batch_size``
    * ``cfg.training.eval_batch_size``  <- ``validator_batch_size``

    Args:
        cfg: Run configuration; mutated in place.

    Returns:
        The same ``cfg`` object, for call chaining.
    """
    chain_defaults = bittensor.subtensor(network=cfg.bittensor.network)

    # (config section, option name, subtensor attribute supplying the default)
    fallbacks = (
        ("dataset", "block_size", "validator_sequence_length"),
        ("training", "train_batch_size", "validator_batch_size"),
        ("training", "eval_batch_size", "validator_batch_size"),
    )
    for section_name, option, source_attr in fallbacks:
        section = getattr(cfg, section_name)
        if getattr(section, option) is None:
            setattr(section, option, getattr(chain_defaults, source_attr))

    return cfg

In [4]:
def load_raw_datasets(cfg: DictConfig) -> DatasetDict:
    """Load the raw (untokenized) text data selected by ``cfg.dataset.name``.

    Three sources are supported, checked in this order:

    1. ``"bittensor"`` - pull ``cfg.dataset.num_batches`` batches through
       ``bittensor.dataset`` and collect them into a single ``text`` column.
    2. An existing local path - load the file with the ``datasets`` builder
       named after its extension (``.txt`` maps to the ``text`` builder and
       honours ``cfg.dataset.keep_linebreaks``).
    3. Anything else - forwarded to ``load_dataset`` together with
       ``cfg.dataset.config_name``.

    Args:
        cfg: Run configuration; reads ``cfg.dataset.*`` and
            ``cfg.training.train_batch_size``.

    Returns:
        For sources 1 and 2 a single ``Dataset``; for source 3 whatever
        ``load_dataset`` returns. NOTE(review): the ``DatasetDict`` annotation
        therefore only matches source 3.
    """
    if cfg.dataset.name == "bittensor":

        dataset = bittensor.dataset(
            no_tokenizer=True,
            batch_size=cfg.training.train_batch_size,
            block_size=cfg.dataset.block_size,
        )
        try:
            dataloader = dataset.dataloader(cfg.dataset.num_batches)
            texts = []
            for batch in tqdm(dataloader, desc="Loading data from bittensor IPFS"):
                texts.extend(batch)
            return Dataset.from_dict({"text": texts})
        finally:
            # Always close, even when a batch fails to load, to avoid
            # leaving the threadqueue running.
            dataset.close()

    if os.path.exists(cfg.dataset.name):
        # The single file is registered under a split called "text", which is
        # unwrapped again below so callers get a plain Dataset.
        data_files = {"text": cfg.dataset.name}
        dataset_args = {}

        extension = os.path.splitext(cfg.dataset.name)[-1].lstrip(".")

        if extension == "txt":
            extension = "text"
            dataset_args["keep_linebreaks"] = cfg.dataset.keep_linebreaks
        raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args)
        raw_datasets = raw_datasets["text"]
    else:
        raw_datasets = load_dataset(cfg.dataset.name, cfg.dataset.config_name)

    return raw_datasets

In [5]:
def load_tokenizer(cfg: DictConfig):
    """Instantiate the tokenizer described by ``cfg``.

    The checkpoint is ``cfg.tokenizer.name`` when it is set, otherwise
    ``cfg.model.name``; ``cfg.tokenizer.use_fast`` is forwarded as
    ``use_fast``. When the loaded tokenizer has no pad token but does have an
    EOS token, the EOS token is reused as the pad token.

    Args:
        cfg: Run configuration.

    Returns:
        The tokenizer returned by ``AutoTokenizer.from_pretrained``.
    """
    checkpoint = cfg.tokenizer.name
    if checkpoint is None:
        # No dedicated tokenizer configured: use the model's own.
        checkpoint = cfg.model.name

    tokenizer = AutoTokenizer.from_pretrained(
        checkpoint, use_fast=cfg.tokenizer.use_fast
    )

    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            tokenizer.pad_token = tokenizer.eos_token

    return tokenizer

In [6]:
def create_accelerator(cfg: DictConfig) -> Accelerator:
    """Build the ``Accelerator`` and set library log verbosity per process.

    With ``cfg.tracking.enabled`` the accelerator is created with
    ``log_with=cfg.tracking.report_to`` and ``logging_dir=cfg.output_dir``;
    otherwise it is created with no arguments. The local main process then
    gets ``datasets`` at warning level and ``transformers`` at info level;
    every other process has both libraries reduced to error level.

    Args:
        cfg: Run configuration.

    Returns:
        The newly created ``Accelerator``.
    """
    if cfg.tracking.enabled:
        accelerator = Accelerator(
            log_with=cfg.tracking.report_to, logging_dir=cfg.output_dir
        )
    else:
        accelerator = Accelerator()

    datasets_logging = datasets.utils.logging
    transformers_logging = transformers.utils.logging

    if not accelerator.is_local_main_process:
        # Keep secondary processes quiet.
        datasets_logging.set_verbosity_error()
        transformers_logging.set_verbosity_error()
        return accelerator

    datasets_logging.set_verbosity_warning()
    transformers_logging.set_verbosity_info()
    return accelerator

In [7]:
def preprocess(cfg: DictConfig, tokenizer, raw_datasets, accelerator=None):
    """Tokenize ``raw_datasets`` and optionally pack the tokens into blocks.

    Every text is tokenized with truncation to ``cfg.dataset.block_size``.
    What happens next depends on ``cfg.dataset.concatenate_raw``:

    * ``True``  - no padding; the raw text column is dropped and all token
      sequences of a batch are concatenated and re-split into chunks of
      ``block_size`` tokens.
    * otherwise - each example is padded to ``block_size`` and the raw text
      column is kept alongside the token columns.

    In both cases ``labels`` is a copy of ``input_ids``.

    Args:
        cfg: Run configuration; reads ``cfg.dataset.{block_size,
            concatenate_raw, overwrite_cache}`` and
            ``cfg.tokenizer.preprocessing_num_workers``.
        tokenizer: Callable tokenizer accepting ``padding``, ``truncation``
            and ``max_length`` keyword arguments.
        raw_datasets: Object exposing ``column_names`` and ``map`` (a
            ``Dataset`` or ``DatasetDict``).
        accelerator: Optional accelerator whose ``main_process_first()``
            context wraps the mapping. When omitted, a module-level
            ``accelerator`` is used if one exists; otherwise the mapping runs
            without any cross-process ordering.

    Returns:
        The tokenized (and, if requested, packed) dataset.
    """
    # Function-scope imports keep this cell runnable on its own; the original
    # referenced ``chain`` without importing it anywhere.
    from contextlib import nullcontext
    from itertools import chain

    block_size = cfg.dataset.block_size
    concatenate = cfg.dataset.concatenate_raw is True

    # ``column_names`` is a list for a Dataset and a {split: [columns]} dict
    # for a DatasetDict; reduce both to one list of column names.
    column_names = raw_datasets.column_names
    if isinstance(column_names, dict):
        if "train" in column_names:
            split_columns = column_names["train"]
        else:
            split_columns = next(iter(column_names.values()))
    else:
        split_columns = column_names
    text_column_name = "text" if "text" in split_columns else split_columns[0]

    # Packing needs the unpadded token streams; otherwise pad to a fixed length.
    pad = False if concatenate else "max_length"

    if accelerator is None:
        # Backward compatibility: earlier callers relied on a notebook-global.
        accelerator = globals().get("accelerator")
    ordering = (
        accelerator.main_process_first() if accelerator is not None else nullcontext()
    )

    def group_texts(examples):
        # Concatenate all token sequences of the batch, column by column.
        concatenated_examples = {
            name: list(chain(*examples[name])) for name in examples.keys()
        }
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # Drop the ragged tail once at least one full block is available;
        # a batch shorter than one block is kept as a single short chunk.
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        # Split into chunks of block_size.
        result = {
            name: [
                tokens[start : start + block_size]
                for start in range(0, total_length, block_size)
            ]
            for name, tokens in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    def tokenize_fn(examples):
        result = tokenizer(
            examples[text_column_name],
            padding=pad,
            truncation=True,
            max_length=block_size,
        )
        result["labels"] = result["input_ids"].copy()
        return result

    with ordering:

        tokenized_datasets = raw_datasets.map(
            tokenize_fn,
            batched=True,
            # When packing, the raw strings must not reach group_texts: they
            # would be chained character by character and set the chunk count.
            remove_columns=[text_column_name] if concatenate else None,
            num_proc=cfg.tokenizer.preprocessing_num_workers,
            load_from_cache_file=not cfg.dataset.overwrite_cache,
            desc="Running tokenizer on dataset",
        )

        if concatenate:
            tokenized_datasets = tokenized_datasets.map(
                group_texts,
                batched=True,
                num_proc=cfg.tokenizer.preprocessing_num_workers,
                load_from_cache_file=not cfg.dataset.overwrite_cache,
                desc=f"Grouping texts in chunks of {cfg.dataset.block_size}",
            )

    return tokenized_datasets

In [11]:
from datasets import concatenate_datasets

def write_tokenized_datasets(cfg: DictConfig, accelerator, tokenizer, loops=50, shard_rows=100000) -> None:
    """Repeatedly load, tokenize and save data as numbered on-disk shards.

    Each iteration loads a fresh raw dataset, tokenizes it and appends it to
    the rows accumulated so far. Once the accumulated dataset holds more than
    ``shard_rows`` rows it is saved to
    ``<cfg.dataset.data_dir>/<cfg.dataset.file_name>_<index>`` and
    accumulation starts over. Rows still pending when the loop finishes are
    written as one final, smaller shard instead of being discarded.

    Args:
        cfg: Run configuration; reads ``cfg.dataset.file_name`` and
            ``cfg.dataset.data_dir`` and is forwarded to the loaders.
        accelerator: Accepted for interface compatibility; not used directly
            by this function.
        tokenizer: Tokenizer forwarded to ``preprocess``.
        loops: Number of load/tokenize iterations to run.
        shard_rows: A shard is written as soon as the accumulated row count
            exceeds this value.
    """
    shard_index = 0
    pending = None  # tokenized rows not yet written to disk

    def save_shard(dataset, index):
        file_name = cfg.dataset.file_name + "_" + str(index)
        output_file = os.path.join(cfg.dataset.data_dir, file_name)
        dataset.save_to_disk(output_file)

    for _ in range(loops):
        raw_datasets = load_raw_datasets(cfg)
        tokenized_dataset_batch = preprocess(cfg, tokenizer, raw_datasets)

        if pending is None:
            pending = tokenized_dataset_batch
        else:
            pending = concatenate_datasets([pending, tokenized_dataset_batch])

        if pending.shape[0] > shard_rows:
            save_shard(pending, shard_index)
            shard_index += 1
            pending = None

    # Without this flush, rows gathered after the last full shard were lost.
    if pending is not None and pending.shape[0] > 0:
        save_shard(pending, shard_index)

        

In [1]:
# Load the run configuration and fill any unset batching options.
cfg = check_cfg_and_load_defaults(OmegaConf.load('conf/config.yaml'))

# `accelerator` stays a module-level name: preprocess() reads it as a global.
accelerator = create_accelerator(cfg)
accelerator.wait_for_everyone()

tokenizer = load_tokenizer(cfg)

# Tokenize the data and write it to disk in shards.
write_tokenized_datasets(cfg, accelerator, tokenizer)