How to train bert from scratch using line_by_line corpus #1794
Unanswered
yiqiang-zhao
asked this question in
Community | Q&A
Replies: 1 comment 3 replies
-
Hi Yiqiang, thanks for your question. Could you share the error you ran into, i.e. the output that followed the command?
-
Hi, I want to train a BERT model from scratch; here is my code. I followed this example and used the bert_base_tp1d.py config. I rewrote
build_data
to load a line-by-line corpus. My environment is a Docker container based on the image hpcaitech/colossalai:0.1.8, on a machine with 8 V100-16G GPUs. I used the training command below:
colossalai run --nproc_per_node=8 cai_bert_mlm_trainer_clean.py --config=bert_base_tp1d.py --train /tf/wiki_zh_aa_09.sent --tokenizer /tf/bert-new-chinese --checkpoint_dir /tf/colossalai-bert-mlm --dataset_fmt text --line_by_line --from_torch
But I got the following errors:
So far I can't figure out the problem, so I need a favor. Please give me some advice or point me to a similar working example, thanks.
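For anyone debugging a similar build_data rewrite, here is a minimal sketch of what "line-by-line" loading usually means: one training example per non-blank line of the corpus file. The class name `LineByLineDataset` and the `tokenize` callable are hypothetical, not ColossalAI or bert_base_tp1d.py APIs; in a real setup `tokenize` would wrap the BERT tokenizer (e.g. the one loaded from /tf/bert-new-chinese).

```python
class LineByLineDataset:
    """Reads non-empty lines from a text file and tokenizes each one.

    Hypothetical sketch, not a ColossalAI API: `tokenize` is any callable
    mapping a string to a list of tokens or token ids.
    """

    def __init__(self, path, tokenize, max_length=128):
        self.tokenize = tokenize
        self.max_length = max_length
        with open(path, encoding="utf-8") as f:
            # Keep only non-blank lines: one training example per line.
            self.lines = [ln.strip() for ln in f if ln.strip()]

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        # Truncate each tokenized line to at most max_length tokens.
        return self.tokenize(self.lines[idx])[: self.max_length]
```

A loader like this can be wrapped in a DataLoader and plugged into the trainer in place of the original build_data output; the key point is that blank lines are dropped and each remaining line becomes an independent example.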