# 实验

In [2]:
import os
import re
import json
import random
import numpy as np
import pandas as pd
import dataclasses
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from typing import List, Dict

环境变量

In [3]:
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"

In [4]:
@dataclasses.dataclass
class LMConfig:
    num_layers: int
    hidden_size: int
    num_attention_head: int
    seq_length: int = 1024


@dataclasses.dataclass
class Config:
    gpus_per_node: int

    # train configuration
    steps: int
    batch_size: int
    fp16: bool
    
    # model configuration
    lm: LMConfig
    
    # parallel configuration
    data_parallel_size: int = 1
    tensor_parallel_size: int = 1
    pipeline_parallel_size: int = 1

In [5]:
@dataclasses.dataclass
class Metrics:
    forward_backward_time: float
    forward_time: float
    backward_time: float
    optimizer_time: float
    communication_time: float
    memory_size: float
    

绘图函数

In [6]:

def plot(data: pd.DataFrame):
    ncols = len(data.columns)
    x = list(data.index)
    _, axs = plt.subplots(ncols=ncols, figsize=(16, 1))

    for i, metric in enumerate(data.columns):
        y = list(data[metric].values)
        axs[i].barh(x, y)
        axs[i].set_xlabel(metric)

    plt.show()

# 测试plot函数
# data = {scheme: {"compute": random.randint(1, 10), "optimizer": random.randint(1, 10), "memory": random.randint(10,100), "communication": random.randint(100, 1000)} for scheme in ["base", "fp16"]}
# data = pd.DataFrame(data).T
# plot(data)

In [None]:
def train_model(cfg):
    pass

模型参数量

|model|params|num_layers|hidden_size|num_head|
|---|----|---|----|---|
|gpt2-xl|1558M|48|3200|25|
|dollm-6B|6064M|48|3200|40|
|dollm-13B|12848M|40|5120|40|

In [8]:
gpt2_xl_model = LMConfig(num_layers=48, hidden_size=3200, num_attention_head=25)

In [9]:

cfg = Config(gpus_per_node=1, steps=50, batch_size=1, fp16=True, lm=gpt2_xl_model)
print(f"train configuration: \n{json.dumps(dataclasses.asdict(cfg), indent=4)}")

# train_model(cfg)

train configuration: 
{
    "gpus_per_node": 1,
    "steps": 50,
    "batch_size": 1,
    "fp16": true,
    "lm": {
        "num_layers": 48,
        "hidden_size": 3200,
        "num_attention_head": 25,
        "seq_length": 1024
    },
    "data_parallel_size": 1,
    "tensor_parallel_size": 1,
    "pipeline_parallel_size": 1
}


In [10]:
# train_model(cfg)
# ret, metrics = parse_log("train.log")
# if ret:
#     print(f"metrics: \n{json.dumps(dataclasses.asdict(metrics), indent=4)}")
# else:
#     print("train failed!")