In [49]:
import os.path
from typing import Optional

import numpy as np
import pandas as pd


In [50]:
DATA_DIR = "./ali-cluster/cluster-trace-gpu-v2020/data"

def read_csv_with_header(
    file_path: str,
    header: Optional[list[str]] = None
) -> pd.DataFrame:
    """读取 CSV 文件并处理表头

    Args:
        file_path: CSV 文件路径
        header: 可选的表头列表。如果为 None, 则从对应的 .header 文件读取

    Returns:
        pd.DataFrame: 读取并设置好表头的数据框

    Raises:
        FileNotFoundError: 当 CSV 文件或对应的 header 文件不存在时
        pd.errors.EmptyDataError: 当 CSV 文件为空时
    """
    df = pd.read_csv(file_path, header=None)
    df.columns = (pd.read_csv("{}.header".format(file_path.split('.csv')[0])).columns
                 if header is None else header)
    return df

In [51]:
def get_model_configs():
    """获取预定义的模型配置"""
    return [
        {
            'model_name': 'ResNet50',
            'task_type': 'Image Classification',
            'batch_sizes': [16, 32, 64]
        },
        {
            'model_name': 'VGG19',
            'task_type': 'Image Classification',
            'batch_sizes': [16, 32]
        },
        {
            'model_name': 'InceptionV3',
            'task_type': 'Image Classification',
            'batch_sizes': [16, 32]
        },
        {
            'model_name': 'DenseNet161',
            'task_type': 'Image Classification',
            'batch_sizes': [8, 16]
        },
        {
            'model_name': 'DCGAN',
            'task_type': 'Image Generation',
            'batch_sizes': [64, 128, 256]
        },
        {
            'model_name': 'LSTM',
            'task_type': 'Language Modeling',
            'batch_sizes': [32, 64, 128]
        },
        {
            'model_name': 'Transformer',
            'task_type': 'Language Modeling',
            'batch_sizes': [16, 32, 64]
        }
    ]

In [52]:
def preprocess_training_data(task_df) -> pd.DataFrame:
    """数据预处理：筛选和清理训练任务数据

    对原始任务数据进行预处理，包括：
    1. 合并任务和作业数据
    2. 清理无效时间戳
    3. 筛选有效训练任务
    4. 标准化时间
    5. 统一GPU类型

    Args:
        task_df: 任务数据表
        job_df: 作业数据表

    Returns:
        DataFrame: 处理后的训练任务数据，包含以下主要字段：
            - job_name: 作业名称
            - gpu_type: GPU类型(V100或T4)
            - runtime: 运行时长
            - norm_job_submit_time: 标准化后的作业提交时间
    """
    # 常量定义
    MIN_RUNTIME_SECONDS = 1000.0  # 最小运行时间（秒）
    VALID_TASK_TYPES = ['tensorflow', 'PyTorchWorker', 'worker']


    # 处理无效的时间戳
    task_df.loc[task_df.start_time == 0, ['start_time', 'end_time']] = np.nan
    task_df['runtime'] = task_df.end_time - task_df.start_time

    # 筛选有效的训练任务
    valid_tasks = task_df[
        (task_df['status'] == 'Terminated') &             # 已完成的任务
        (task_df['gpu_type'] != 'MISC') &                 # 排除杂项GPU类型
        (task_df['plan_gpu'] == 100) &                    # 完整GPU使用
        (task_df['runtime'] >= MIN_RUNTIME_SECONDS) &     # 运行时间足够长
        (task_df['inst_num'] <= 8) &                     # 实例数量小于8
        (task_df['task_name'].isin(VALID_TASK_TYPES))     # 有效的任务类型
    ]

    # 按提交时间排序并标准化
    valid_tasks = valid_tasks.sort_values(['start_time'])

    # 去重并统一GPU类型名称
    valid_tasks.loc[valid_tasks.gpu_type == 'V100M32', 'gpu_type'] = 'V100'

    return valid_tasks

In [53]:
import random


def sample_tasks(task_df: pd.DataFrame, jobs_count: int) -> pd.DataFrame:
    """
    从任务数据中随机采样指定数量的任务

    Args:
        task_df: 任务数据表
        jobs_count: 采样任务数量

    Returns:
        pd.DataFrame: 采样后的任务数据表
    """
    return task_df.sample(n=jobs_count)


def gen_task_runtimes(task_df: pd.DataFrame) -> pd.DataFrame:
    """生成不同 GPU 类型的运行时间"""
    t4_performance = 8.1
    p100_performance = 9.3
    v100_performance = 15.7

    runtimes = {
        'T4': (1, 1),
        'P100': (t4_performance / p100_performance, t4_performance / p100_performance),
        'V100': (t4_performance / v100_performance, t4_performance / v100_performance),
    }
    gpu_types = runtimes.keys()

    def gen_runtime(from_gpu, to_gpu, origin_runtime):
        if from_gpu == to_gpu:
            return origin_runtime
        if from_gpu not in gpu_types:
            print("not in gpu_types:", from_gpu)
        to_rand = random.uniform(*runtimes[to_gpu])
        from_rand = random.uniform(*runtimes[from_gpu])
        return int(origin_runtime * to_rand / from_rand)

    for gpu_type in gpu_types:
        task_df[f'runtime_{gpu_type}'] = task_df.apply(
            lambda row, gpu_type=gpu_type: gen_runtime(row['gpu_type'], gpu_type, row['runtime']),
            axis=1
        )

    # 获取模型配置
    model_configs = get_model_configs()

    # 为每个任务随机分配一个模型
    def assign_model(row):
        model_config = random.choice(model_configs)
        return model_config['model_name']

    # 应用模型分配
    task_df['task_name'] = task_df.apply(assign_model, axis=1)
    print(task_df)

    return task_df

def to_csv(df, name):
    df = df.reset_index(drop=True)
    df.to_csv(name)

In [54]:
def generate_task_config(jobs_count: int):
    """生成任务配置"""
    task_df = read_csv_with_header(os.path.join(DATA_DIR, "pai_task_table.csv"))
    valid_task_df = preprocess_training_data(task_df)
    sample_task_df = sample_tasks(valid_task_df, jobs_count)
    task_wrap_runtimes_df = gen_task_runtimes(sample_task_df)

    # 统计信息
    to_csv(task_wrap_runtimes_df, f"case_{jobs_count}_tasks.csv")
    print(f"task_file:{f'case_{jobs_count}_tasks.csv'} generated")

def main():
    jobs_count_list = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]

    for jobs_count in jobs_count_list:
        generate_task_config(jobs_count)

if __name__ == '__main__':
    main()

                         job_name    task_name  inst_num      status  \
616875   2a3bece465539b9469e47860        DCGAN       1.0  Terminated   
991488   e86dd27b138a00467c0d3a67         LSTM       1.0  Terminated   
213538   e5ac0dd15a0fdd36c308319b        VGG19       4.0  Terminated   
910409   5e687a7bf9de78d40447cd36        DCGAN       1.0  Terminated   
683715   7ba104a8c3d949e3712972a3         LSTM       1.0  Terminated   
...                           ...          ...       ...         ...   
1064764  aafb4f84c07d92769a7f1296         LSTM       1.0  Terminated   
1072376  fa935a988b327aff5f0a9917        VGG19       1.0  Terminated   
1029012  092a9dbb8166a616c785161c  InceptionV3       1.0  Terminated   
1183094  d8d806796e18291ba8253889  Transformer       1.0  Terminated   
97566    a44973a94910a51357009e0d         LSTM       1.0  Terminated   

         start_time   end_time  plan_cpu   plan_mem  plan_gpu gpu_type  \
616875    6236951.0  6238230.0    1800.0  58.593750     100.0