In [16]:
import os.path

import pandas as pd
import numpy as np
import random

In [17]:
DATA_DIR = "./"

def get_df(file, header=None):
    """Load a headerless CSV file and attach column names to it.

    Parameters
    ----------
    file : str
        Path of the CSV file. It is read with ``header=None``, i.e. every
        line is treated as data.
    header : list-like, optional
        Column names to assign. When ``None``, the names are the columns of
        the sibling ``<name>.header`` file, where ``<name>`` is the file's
        basename truncated at its first ``.csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents with the chosen column names.
    """
    df = pd.read_csv(file, header=None)
    if header is None:
        # Truncate only the basename at ".csv": splitting the whole path would
        # cut it inside a directory name that happens to contain ".csv".
        directory, basename = os.path.split(file)
        stem = os.path.join(directory, basename.split('.csv')[0])
        header = pd.read_csv("{}.header".format(stem)).columns
    df.columns = header
    return df

In [18]:
def get_df_one_inst(dft, dfj):
    """Join the task and job tables and keep single-instance GPU tasks.

    Parameters
    ----------
    dft : pandas.DataFrame
        Task table. The columns read here are ``job_name``, ``task_name``,
        ``status``, ``gpu_type``, ``plan_gpu``, ``inst_num``, ``start_time``
        and ``end_time``.
    dfj : pandas.DataFrame
        Job table. It is merged on ``job_name``; its overlapping columns get
        the ``_j`` suffix, and ``start_time_j`` is the one used below.

    Returns
    -------
    pandas.DataFrame
        One row per ``job_name``, sorted by ``start_time_j``, with two added
        columns: ``runtime`` (``end_time - start_time``) and
        ``norm_job_submit_time`` (``start_time_j`` minus the smallest
        ``start_time_j`` among the kept rows). ``gpu_type`` value
        ``'V100M32'`` is rewritten to ``'V100'``. The frame is empty, but
        still has both added columns, when no row passes the filter.
    """
    # Drop tasks that are too short; they may be inference tasks.
    min_run_time = 1000.
    dfa = dft.merge(dfj, on=['job_name'], suffixes=('', '_j'))

    # A start_time of 0 is treated as missing. The mask is computed once so
    # that end_time is blanked for the same rows; re-testing start_time == 0
    # after it was overwritten with NaN would match nothing.
    zero_start = dfa['start_time'] == 0
    for col in ('start_time', 'end_time'):
        dfa[col] = dfa[col].mask(zero_start)
    dfa['runtime'] = dfa['end_time'] - dfa['start_time']

    keep = (
        (dfa['status'] == 'Terminated') &
        (dfa['gpu_type'] != 'MISC') &
        (dfa['plan_gpu'] == 100.0) &
        (dfa['inst_num'] == 1.0) &
        (dfa['runtime'] >= min_run_time) &
        (dfa['task_name'].isin(['tensorflow', 'PyTorchWorker', 'worker'])))
    # Copy so the column assignments below act on an independent frame
    # rather than on a slice of ``dfa``.
    df_target = dfa[keep].sort_values(['start_time_j']).copy()

    # After the ascending sort the first row holds the minimum; ``min()``
    # yields the same value and also works when no row survived the filter.
    first_submit = df_target['start_time_j'].min()
    df_target['norm_job_submit_time'] = df_target['start_time_j'] - first_submit
    df_target = df_target.drop_duplicates(['job_name'], keep='first')

    print(df_target[df_target['gpu_type'] == 'V100M32'].shape)
    print("T4 shape", df_target[df_target['gpu_type'] == 'T4'].shape)
    df_target.loc[df_target['gpu_type'] == 'V100M32', 'gpu_type'] = 'V100'
    return df_target

In [20]:
def gen_ddl_and_gpu_runtimes(df_one_inst, jobs_count=5000, ddl_ratio=10,
                             ddl_range=(1.2, 3.0), submit_together=True):
    """Synthesize per-GPU runtimes and deadlines for a job trace and save them.

    The first ``jobs_count`` rows of ``df_one_inst`` are kept. For every GPU
    type found in the input a runtime column is generated, a deadline column
    ``ddl`` is added, and the result is written through ``to_csv`` to
    ``case_{jobs_count}_all_{ddl_ratio}_ddl.csv``.

    Parameters
    ----------
    df_one_inst : pandas.DataFrame
        Must provide the columns ``job_name``, ``gpu_type``, ``runtime`` and
        ``norm_job_submit_time``. The set of ``gpu_type`` values must equal
        the keys of the ``runtimes`` table below.
    jobs_count : int, default 5000
        Number of leading rows to keep.
    ddl_ratio : float, default 10
        Percentage (0-100) of jobs that receive a finite deadline.
    ddl_range : tuple of float, default (1.2, 3.0)
        Bounds of the uniform factor applied to a job's runtime to obtain
        its deadline.
    submit_together : bool, default True
        When true, rows are shuffled, ``ddl`` is made relative to the job's
        submit time and ``norm_job_submit_time`` is set to 0 for every job.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If the GPU types in ``df_one_inst`` differ from the keys of
        ``runtimes``.
    """
    # (low, high) bounds of the factor drawn uniformly for each GPU type. A
    # runtime is converted by multiplying with the target type's draw and
    # dividing by the source type's draw.
    runtimes = {
        'T4': (1, 1),
        'P100': (1.4, 2.),
        'V100': (2.4, 2.66),
    }
    # Name of the output column generated for each trace GPU type.
    fix_gpu = {
        'T4': 'A100',
        'P100': 'GTX2080Ti',
        'V100': 'V100'
    }

    # GPU types are collected from the full input, before truncation.
    gpu_types = [str(t) for t in df_one_inst['gpu_type'].unique()]
    assert set(runtimes.keys()) == set(gpu_types)

    jobs = df_one_inst[['job_name', 'gpu_type', 'runtime', 'norm_job_submit_time']]
    # Copy the slice so the new columns are added to an independent frame.
    jobs = jobs[:jobs_count].copy()

    print(jobs.shape)
    print("print", jobs.iloc[0:100, :])

    def gen_runtime(from_gpu, to_gpu, origin_runtime):
        # The job's own GPU type keeps its measured runtime unchanged.
        if from_gpu == to_gpu:
            return origin_runtime
        if from_gpu not in gpu_types:
            print("not in gpu_types:", from_gpu)
        to_rand = random.uniform(*runtimes[to_gpu])
        from_rand = random.uniform(*runtimes[from_gpu])
        return int(origin_runtime * to_rand / from_rand)

    def gen_ddl(norm_submit_time, runtime):
        # random.random() is uniform on [0, 1), so the comparison succeeds
        # with probability ddl_ratio / 100. randint(0, 100) has 101 possible
        # outcomes and would skew the ratio.
        if random.random() * 100 < ddl_ratio:
            return int(norm_submit_time + runtime * random.uniform(*ddl_range))
        return np.inf

    # Plain iteration instead of DataFrame.apply(axis=1): same row order and
    # the same sequence of random draws, and it also works on an empty frame.
    for gpu_type in gpu_types:
        jobs[fix_gpu[gpu_type]] = [
            gen_runtime(src_gpu, gpu_type, runtime)
            for src_gpu, runtime in zip(jobs['gpu_type'], jobs['runtime'])
        ]
    jobs['ddl'] = [
        gen_ddl(submit_time, runtime)
        for submit_time, runtime in zip(jobs['norm_job_submit_time'], jobs['runtime'])
    ]

    if submit_together:
        jobs = jobs.iloc[np.random.permutation(len(jobs))].copy()
        # Deadlines become relative to submission; np.inf stays np.inf.
        jobs['ddl'] = jobs['ddl'] - jobs['norm_job_submit_time']
        jobs['norm_job_submit_time'] = 0

    fix_gpu_types = [fix_gpu[t] for t in gpu_types]
    df_output = jobs[['job_name', 'norm_job_submit_time', 'ddl', *fix_gpu_types]]
    print("output: ", df_output.shape)
    to_csv(df_output, f"case_{jobs_count}_all_{ddl_ratio}_ddl.csv")

In [19]:
def to_csv(df, name):
    """Write ``df`` to the CSV file ``name`` with rows renumbered from 0.

    The existing index is discarded and replaced by a fresh 0..n-1 index,
    which is written as the first (unnamed) CSV column. ``df`` itself is not
    modified.
    """
    df.reset_index(drop=True).to_csv(name)

In [21]:
def main():
    """Load the task and job tables from DATA_DIR and generate the output CSV."""
    task_table = get_df(os.path.join(DATA_DIR, "pai_task_table.csv"))
    job_table = get_df(os.path.join(DATA_DIR, "pai_job_table.csv"))
    gen_ddl_and_gpu_runtimes(get_df_one_inst(task_table, job_table))

# Run the whole pipeline when this code executes as the top-level module.
if __name__ == '__main__':
    main()

(3603, 17)
T4 shape (10988, 17)
(5000, 4)
print                         job_name gpu_type   runtime  norm_job_submit_time
273324  34b3d819023ea21e28afd50f       T4  176371.0                   0.0
200005  af3b0f5e810838d6c33840e1       T4  109920.0               65724.0
88185   8b3581e39780f46e28e48c4a     P100  240371.0               66246.0
296351  e3789c5e546bbbe82007127f     P100  232288.0               66313.0
388408  8dc5b00629f823cf8d30cade     P100  157663.0               66385.0
...                          ...      ...       ...                   ...
342854  dbbc7b184fb67cf1ff4254b5       T4    1190.0              184504.0
426200  ea3bd3de4cb78d2bd1a06619     P100    1188.0              185251.0
413030  5487ca4ba4dbb22cda463042     P100    1157.0              185313.0
277214  493650beeaaa64eb0504c911     P100    1476.0              185338.0
453596  4f0246b30625281fa3d0b537     P100    2081.0              185372.0

[100 rows x 4 columns]
output:  (5000, 6)


In [22]:

# Load both tables again at module level and keep the filtered frame in
# `df_one_inst` so the following cells can use it directly.
dft = get_df(os.path.join(DATA_DIR, "pai_task_table.csv"))
dfj = get_df(os.path.join(DATA_DIR, "pai_job_table.csv"))
df_one_inst = get_df_one_inst(dft, dfj)

(3603, 17)
T4 shape (10988, 17)


In [23]:
# Generate the CSV from the module-level frame. The random draws are not
# seeded, so each run writes different values to the same output file.
gen_ddl_and_gpu_runtimes(df_one_inst)

(5000, 4)
print                         job_name gpu_type   runtime  norm_job_submit_time
273324  34b3d819023ea21e28afd50f       T4  176371.0                   0.0
200005  af3b0f5e810838d6c33840e1       T4  109920.0               65724.0
88185   8b3581e39780f46e28e48c4a     P100  240371.0               66246.0
296351  e3789c5e546bbbe82007127f     P100  232288.0               66313.0
388408  8dc5b00629f823cf8d30cade     P100  157663.0               66385.0
...                          ...      ...       ...                   ...
342854  dbbc7b184fb67cf1ff4254b5       T4    1190.0              184504.0
426200  ea3bd3de4cb78d2bd1a06619     P100    1188.0              185251.0
413030  5487ca4ba4dbb22cda463042     P100    1157.0              185313.0
277214  493650beeaaa64eb0504c911     P100    1476.0              185338.0
453596  4f0246b30625281fa3d0b537     P100    2081.0              185372.0

[100 rows x 4 columns]
output:  (5000, 6)


In [24]:
# Display the filtered single-instance frame built above.
df_one_inst

Unnamed: 0,job_name,task_name,inst_num,status,start_time,end_time,plan_cpu,plan_mem,plan_gpu,gpu_type,inst_id,user,status_j,start_time_j,end_time_j,runtime,norm_job_submit_time
273324,34b3d819023ea21e28afd50f,tensorflow,1.0,Terminated,925461.0,1101832.0,600.0,29.296875,100.0,T4,daa91577009c8ebd1fdc63d81113bd4b954bbf338ca476...,74238accb90b,Terminated,925461.0,1101832.0,176371.0,0.0
200005,af3b0f5e810838d6c33840e1,tensorflow,1.0,Terminated,991185.0,1101105.0,900.0,29.296875,100.0,T4,71ac5edf97e8af81c324ff263c60512034138f41368e16...,74238accb90b,Terminated,991185.0,1101105.0,109920.0,65724.0
88185,8b3581e39780f46e28e48c4a,PyTorchWorker,1.0,Terminated,991707.0,1232078.0,1800.0,58.593750,100.0,P100,a4928bb64b9d7c16e8a3da7d3b9ca5d0fceac598c4c550...,74238accb90b,Terminated,991707.0,1232078.0,240371.0,66246.0
296351,e3789c5e546bbbe82007127f,PyTorchWorker,1.0,Terminated,991774.0,1224062.0,1800.0,58.593750,100.0,P100,7b607f2c0057cf3a7a81900877f03efb6d4e44b89256f2...,74238accb90b,Terminated,991774.0,1224062.0,232288.0,66313.0
388408,8dc5b00629f823cf8d30cade,PyTorchWorker,1.0,Terminated,991846.0,1149509.0,1800.0,58.593750,100.0,P100,573723e604c957929beacad58c4aa97e203068615badce...,74238accb90b,Terminated,991846.0,1149509.0,157663.0,66385.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1135239,745c52759b59bcedf31a6867,PyTorchWorker,1.0,Terminated,6441218.0,6442534.0,1800.0,58.593750,100.0,P100,a1c0afd51f7d107ede8ae6b0e676aff6ee2a31a2509e85...,31b0dbc2b5af,Terminated,6441218.0,6442534.0,1316.0,5515757.0
993864,e951ea0d52491e65ba788fd4,PyTorchWorker,1.0,Terminated,6441268.0,6442627.0,1800.0,58.593750,100.0,P100,14ab0a84efd5e653fdc0bc81722c872c7e4b49faa805d4...,31b0dbc2b5af,Terminated,6441268.0,6442627.0,1359.0,5515807.0
1078654,7cd6f580e3d5e7154c406985,PyTorchWorker,1.0,Terminated,6441285.0,6442675.0,1800.0,58.593750,100.0,P100,06dfd1c92c35010d570e65dcaf4029ce4f59c4d0858ea4...,31b0dbc2b5af,Terminated,6441285.0,6442675.0,1390.0,5515824.0
1080936,c8732322364f5133c0c0a6a9,PyTorchWorker,1.0,Terminated,6441341.0,6442551.0,1800.0,58.593750,100.0,P100,4271e8be81b22ab2071a8ce28ba8ac3b8129c7fe1257d1...,31b0dbc2b5af,Terminated,6441341.0,6442551.0,1210.0,5515880.0
