# 实验

In [1]:
import os
import re
import json
import random
import numpy as np
import pandas as pd
import dataclasses
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from typing import List, Dict

  from .autonotebook import tqdm as notebook_tqdm


环境变量

In [2]:
# Set in this process's environment so that commands launched later with
# os.system inherit it.
# NOTE(review): the value "1" is presumably what the Megatron-LM training
# script expects — confirm against the Megatron-LM version in use.
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"

In [3]:
@dataclasses.dataclass
class LMConfig:
    """Size hyper-parameters of the language model to train."""

    # Number of layers (see the "num_layers" column of the model table).
    num_layers: int
    # Hidden size (see the "hidden_size" column of the model table).
    hidden_size: int
    # Number of attention heads (see the "num_head" column of the model table).
    num_attention_head: int
    # Sequence length; defaults to 1024.
    seq_length: int = 1024


@dataclasses.dataclass
class Config:
    """Full description of one training experiment, consumed by train_model."""

    # Number of processes torchrun starts on the node (--nproc_per_node).
    gpus_per_node: int

    # Training settings
    # Number of training iterations (--train-iters).
    steps: int
    # Used for both --micro-batch-size and --global-batch-size.
    batch_size: int
    # When true, the --fp16 flag is appended to the command.
    fp16: bool

    # Model settings
    lm: LMConfig

    # Parallelism settings; every degree defaults to 1.
    # NOTE(review): data_parallel_size is not read by train_model in this
    # notebook — confirm whether it is meant to be informational only.
    data_parallel_size: int = 1
    # Passed as --tensor-model-parallel-size.
    tensor_parallel_size: int = 1
    # Passed as --pipeline-model-parallel-size.
    pipeline_parallel_size: int = 1

In [4]:
@dataclasses.dataclass
class Metrics:
    """Measurements collected for one training run.

    NOTE(review): the code that fills these fields is not in this notebook
    (only a commented-out ``parse_log("train.log")`` call refers to it), so
    the units of the time and memory values must be confirmed against it.
    """

    # Presumably the combined forward + backward time — TODO confirm.
    forward_backward_time: float
    forward_time: float
    backward_time: float
    optimizer_time: float
    communication_time: float
    memory_size: float
    

绘图函数

In [5]:

def plot(data: pd.DataFrame):
    """Draw one horizontal bar chart per column of ``data``, side by side.

    Each row of ``data`` becomes one bar (labelled with its index value) in
    every subplot, and each subplot's x-axis is labelled with the column name.

    Args:
        data: Frame with one row per bar and one column per subplot.

    Raises:
        ValueError: If ``data`` has no columns, since there is nothing to draw.
    """
    ncols = len(data.columns)
    if ncols == 0:
        raise ValueError("plot() needs a DataFrame with at least one column")

    labels = list(data.index)
    # squeeze=False makes subplots() always return a 2-D array of Axes. With
    # the default, a single-column frame yields a bare Axes object and
    # indexing it fails.
    _, axs = plt.subplots(ncols=ncols, figsize=(16, 1), squeeze=False)

    for ax, metric in zip(axs[0], data.columns):
        ax.barh(labels, list(data[metric].values))
        ax.set_xlabel(metric)

    plt.show()

# 测试plot函数
# data = {scheme: {"compute": random.randint(1, 10), "optimizer": random.randint(1, 10), "memory": random.randint(10,100), "communication": random.randint(100, 1000)} for scheme in ["base", "fp16"]}
# data = pd.DataFrame(data).T
# plot(data)

In [6]:
def train_model(cfg: Config):
    """Launch a single-node ``pretrain_gpt.py`` run described by ``cfg``.

    Builds a ``torchrun`` shell command from the configuration, prints it and
    executes it with ``os.system``. Stdout and stderr of the run are
    redirected to ``train.log`` in the current working directory, replacing
    any previous log.

    Args:
        cfg: Run configuration. ``cfg.batch_size`` is used for both the micro
            and the global batch size, and ``cfg.lm.seq_length`` sets both the
            sequence length and the maximum number of position embeddings.
            ``cfg.data_parallel_size`` is not passed on the command line.

    Returns:
        None. A non-zero exit status is reported with a printed message
        rather than an exception, so the notebook keeps running.
    """
    # NOTE(review): micro and global batch size are set to the same value;
    # confirm this is intended when more than one data-parallel rank is used.
    cmd = f"""torchrun --nproc_per_node {cfg.gpus_per_node} --nnodes 1 --node_rank 0 pretrain_gpt.py \
    --distributed-backend nccl \
    --tensor-model-parallel-size {cfg.tensor_parallel_size} \
    --pipeline-model-parallel-size {cfg.pipeline_parallel_size} \
    --num-layers {cfg.lm.num_layers} \
    --hidden-size {cfg.lm.hidden_size} \
    --num-attention-heads {cfg.lm.num_attention_head} \
    --seq-length {cfg.lm.seq_length} \
    --max-position-embeddings {cfg.lm.seq_length} \
    --micro-batch-size {cfg.batch_size} \
    --global-batch-size {cfg.batch_size} \
    --train-iters {cfg.steps} \
    --lr 0.00015 \
    --lr-decay-iters 320000 \
    --lr-decay-style cosine \
    --min-lr 1.0e-5 \
    --weight-decay 1e-2 \
    --lr-warmup-fraction .01 \
    --clip-grad 1.0 \
    --log-interval 10 \
    --timing-log-level 2 \
    --data-path /data/datasets/gpt2/BookCorpusDataset_text_document \
    --vocab-file /data/datasets/gpt2/gpt2-vocab.json \
    --merge-file /data/datasets/gpt2/gpt2-merges.txt \
    --data-impl mmap \
    --split 949,50,1"""

    if cfg.fp16:
        cmd += " --fp16 "

    # Send stdout and stderr of the run to train.log.
    cmd += " > train.log 2>&1"

    print(cmd)
    status = os.system(cmd)
    if status != 0:
        # The run's own output is in the log file, so point the reader there
        # instead of failing silently.
        print(f"training command exited with status {status}; see train.log")

模型参数量

|model|params|num_layers|hidden_size|num_head|
|---|----|---|----|---|
|gpt2-xl|1558M|48|1600|25|
|dollm-6B|6064M|48|3200|40|
|dollm-13B|12848M|40|5120|40|

In [7]:
# Sizes taken from the "gpt2-xl" row of the model table.
gpt2_xl_model = LMConfig(
    num_layers=48,
    hidden_size=1600,
    num_attention_head=25,
)

In [8]:

# One process, fp16, 50 training iterations of the GPT-2 XL configuration.
cfg = Config(
    gpus_per_node=1,
    steps=50,
    batch_size=1,
    fp16=True,
    lm=gpt2_xl_model,
)
# Show the fully expanded configuration (defaults included) before launching.
print("train configuration: \n" + json.dumps(dataclasses.asdict(cfg), indent=4))

train_model(cfg)

train configuration: 
{
    "gpus_per_node": 1,
    "steps": 50,
    "batch_size": 1,
    "fp16": true,
    "lm": {
        "num_layers": 48,
        "hidden_size": 3200,
        "num_attention_head": 25,
        "seq_length": 1024
    },
    "data_parallel_size": 1,
    "tensor_parallel_size": 1,
    "pipeline_parallel_size": 1
}
using world size: 1, data-parallel-size: 1, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 
using torch.float32 for parameters ...
------------------------ arguments ------------------------
  accumulate_allreduce_grads_in_fp32 .............. False
  adam_beta1 ...................................... 0.9
  adam_beta2 ...................................... 0.999
  adam_eps ........................................ 1e-08
  adlr_autoresume ................................. False
  adlr_autoresume_interval ........................ 1000
  apply_query_key_layer_scaling ................... True
  apply_residual_connection_post_layernorm ........ False


make: Nothing to be done for 'default'.
make: Leaving directory '/workspace/Megatron-LM/megatron/data'
>>> done with dataset index builder. Compilation time: 0.157 seconds
> compiling and loading fused kernels ...
ninja: no work to do.
ninja: no work to do.
ninja: no work to do.


Detected CUDA files, patching ldflags
Emitting ninja build file /workspace/Megatron-LM/megatron/fused_kernels/build/build.ninja...
Building extension module scaled_upper_triang_masked_softmax_cuda...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module scaled_upper_triang_masked_softmax_cuda...
Detected CUDA files, patching ldflags
Emitting ninja build file /workspace/Megatron-LM/megatron/fused_kernels/build/build.ninja...
Building extension module scaled_masked_softmax_cuda...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module scaled_masked_softmax_cuda...
Detected CUDA files, patching ldflags
Emitting ninja build file /workspace/Megatron-LM/megatron/fused_kernels/build/build.ninja...
Building extension module scaled_softmax_cuda...
Allowing ninja to set a default number of workers... (overridable by setting the e

ninja: no work to do.
ninja: no work to do.
>>> done with compiling and loading fused kernels. Compilation time: 0.703 seconds
time to initialize megatron (seconds): 1.929
[after megatron is initialized] datetime: 2023-05-17 09:48:17 
building GPT model ...


Traceback (most recent call last):
  File "pretrain_gpt.py", line 118, in <module>
    pretrain(train_valid_test_datasets_provider, model_provider,
  File "/workspace/Megatron-LM/megatron/training.py", line 111, in pretrain
    model, optimizer, opt_param_scheduler = setup_model_and_optimizer(
  File "/workspace/Megatron-LM/megatron/training.py", line 371, in setup_model_and_optimizer
    model = get_model(model_provider_func, model_type)
  File "/workspace/Megatron-LM/megatron/training.py", line 251, in get_model
    model = model_provider_func(
  File "pretrain_gpt.py", line 26, in model_provider
    model = GPTModel(
  File "/workspace/Megatron-LM/megatron/model/gpt_model.py", line 61, in __init__
    self.language_model, self._language_model_key = get_language_model(
  File "/workspace/Megatron-LM/megatron/model/language_model.py", line 67, in get_language_model
    language_model = TransformerLanguageModel(
  File "/workspace/Megatron-LM/megatron/model/language_model.py", line 347

In [9]:
# train_model(cfg)
# ret, metrics = parse_log("train.log")
# if ret:
#     print(f"metrics: \n{json.dumps(dataclasses.asdict(metrics), indent=4)}")
# else:
#     print("train failed!")