In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

import lightgbm as lgb

In [2]:
# 常量定义
NFOLDS = 5  # 交叉验证的折数
SEQ_LEN = 5  # 序列长度
WINDOW_SIZE = 2 * SEQ_LEN  # 窗口长度
MODEL_N = 10  # 10个模型分别预测 CPU_USAGE_6...LAUNCHING_JOB_NUMS_10

__author__ = 'siliconx'
__version__ = '1.0.0'

pd.options.display.max_columns = None  # 展示所有列

In [3]:
# 初始数据
RAW_TRAIN = './data/train.csv'
RAW_TEST = './data/evaluation_public.csv'
SAMPLE_SUBMIT = './data/submit_example.csv'

# 1. 加载数据

In [4]:
# 加载原始数据
train_df = pd.read_csv(RAW_TRAIN)
test_df = pd.read_csv(RAW_TEST)
sample_df = pd.read_csv(SAMPLE_SUBMIT)

train_df = train_df.sort_values(by=['QUEUE_ID', 'DOTTING_TIME']).reset_index(drop=True)
test_df = test_df.sort_values(by=['ID', 'DOTTING_TIME']).reset_index(drop=True)

In [5]:
display(train_df, test_df)

Unnamed: 0,QUEUE_ID,CU,STATUS,QUEUE_TYPE,PLATFORM,CPU_USAGE,MEM_USAGE,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS,DOTTING_TIME,RESOURCE_TYPE,DISK_USAGE
0,2,16,available,sql,x86_64,3,54,0,0,0,0,0,1590683100000,vm,20.0
1,2,16,available,sql,x86_64,2,54,0,0,0,0,0,1590683400000,vm,20.0
2,2,16,available,sql,x86_64,7,54,0,0,0,0,0,1590683700000,vm,20.0
3,2,16,available,sql,x86_64,4,54,0,0,0,0,0,1590684000000,vm,20.0
4,2,16,available,sql,x86_64,5,54,0,0,0,0,0,1590684120000,vm,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501725,87139,16,available,general,x86_64,3,36,0,0,0,0,0,1599867000000,vm,26.0
501726,87139,16,available,general,x86_64,2,36,0,0,0,0,0,1599867420000,vm,26.0
501727,87139,16,available,general,x86_64,3,36,0,0,0,0,0,1599867840000,vm,26.0
501728,87139,16,available,general,x86_64,2,36,0,0,0,0,0,1599868260000,vm,26.0


Unnamed: 0,ID,QUEUE_ID,CU,STATUS,QUEUE_TYPE,PLATFORM,CPU_USAGE,MEM_USAGE,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS,DOTTING_TIME,RESOURCE_TYPE,DISK_USAGE
0,1,297,16,available,sql,x86_64,60,69,0,5,5,0,0,1662213420000,vm,9
1,1,297,16,available,sql,x86_64,58,69,0,9,4,0,0,1662213720000,vm,9
2,1,297,16,available,sql,x86_64,80,67,0,9,1,0,0,1662214020000,vm,9
3,1,297,16,available,sql,x86_64,100,65,0,7,2,0,1,1662214320000,vm,9
4,1,297,16,available,sql,x86_64,98,67,0,10,3,0,1,1662214620000,vm,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14975,2996,287,16,available,sql,x86_64,1,20,0,0,0,0,0,1598227680000,vm,29
14976,2996,287,16,available,sql,x86_64,5,20,0,0,0,0,0,1598227980000,vm,29
14977,2996,287,16,available,sql,x86_64,7,21,0,0,0,0,0,1598228280000,vm,29
14978,2996,287,16,available,sql,x86_64,2,20,0,0,0,0,0,1598228580000,vm,29


# 2. 预处理

In [6]:
def digitalization(fields):
    """将非数值型域转换为数值型."""
    # 组合训练集和测试集，只用来构建编码器，不用来训练模型
    df = pd.concat([train_df[fields], test_df[fields]], ignore_index=True)

    for f in fields:
        # 构建编码器
        le = LabelEncoder()
        le.fit(df[f])

        # 设置新值
        train_df[f] = le.transform(train_df[f])
        test_df[f] = le.transform(test_df[f])
        print('%s:' % f, le.classes_)

In [7]:
def pre_processing():
    """预处理."""
    print('Preprocessing...')

    # 缺失值填充
    # 经检验，为NaN的都是vm（通过QUEUE_ID查找）
    train_df['RESOURCE_TYPE'].fillna('vm', inplace=True)

    # 观察数据，填充0比较合理（NaN集中在数据前面，可能是由服务器尚未开始运行导致的）
    train_df['DISK_USAGE'].fillna(0, inplace=True)

    # 需要转换的列
    fields = ['STATUS', 'QUEUE_TYPE', 'PLATFORM', 'RESOURCE_TYPE']

    # 数值化
    digitalization(fields)

    # 重命名，原来的名字太长了
    for df in [train_df, test_df]:
        df.rename(columns={
            'LAUNCHING_JOB_NUMS': 'LJOB',
            'RUNNING_JOB_NUMS': 'RJOB',
            'SUCCEED_JOB_NUMS': 'SJOB',
            'CANCELLED_JOB_NUMS': 'CJOB',
            'FAILED_JOB_NUMS': 'FJOB'
        }, inplace=True)

In [8]:
%%time
pre_processing()

Preprocessing...
STATUS: ['assigned' 'assigning' 'available' 'suspended']
QUEUE_TYPE: ['general' 'spark' 'sql']
PLATFORM: ['aarch64' 'x86_64']
RESOURCE_TYPE: ['container' 'vm']
Wall time: 364 ms


# 3. 特征工程

### 3.1 时间特征

-- 把DOTTINGTIME转换为一天之内的时间

In [9]:
for df in [train_df, test_df]:
    t = pd.to_datetime(df['DOTTING_TIME'], unit='ms')

    # 转成小时
    df['DOTTING_TIME'] = t.dt.hour + t.dt.minute / 60

### 3.2 行统计特征

In [10]:
%%time
used_features = ['CPU_USAGE', 'MEM_USAGE', 'DISK_USAGE', 'LJOB', 'RJOB']

# 分组，只用训练集数据做统计
group_data = train_df.groupby(by=['QUEUE_ID'])[used_features]

# 聚合函数
methods = {
    'AVG': 'mean',
    'MEDIAN': 'median',
    'MIN': 'min',
    'MAX': 'max',
    'STD': 'std',
}

for m in methods:
    agg_data = group_data.agg(methods[m])
    agg_data.fillna(method='ffill', inplace=True)
    agg_data.fillna(0, inplace=True)
    agg_data = agg_data.rename(lambda x: 'QUEUE_%s_%s' % (x, m), axis=1)
    agg_data = agg_data.reset_index()

    for df in [train_df, test_df]:
        merged_data = df[['QUEUE_ID']].merge(agg_data, how='left', on=['QUEUE_ID'])
        merged_data.drop(columns=['QUEUE_ID'], inplace=True)

        # 插入新的列
        for c in merged_data.columns:
            df[c] = 0

        # 赋值
        df.loc[:, list(merged_data.columns)] = merged_data.values

Wall time: 2.45 s


###    3.3 滑动窗口构造数据集

In [11]:
# 需要滑动的数值特征
num_features = ['CPU_USAGE', 'MEM_USAGE', 'DISK_USAGE',
                'LJOB', 'RJOB', 'SJOB', 'CJOB', 'FJOB']

# 需要预测的值
y_features = ['CPU_USAGE', 'LJOB']

In [12]:
%%time
# 生成测试集时间窗数据
for i in range(SEQ_LEN):
    for sf in num_features:
        new_f = '%s_%d' % (sf, i+1)
        test_df[new_f] = test_df[sf].shift(-i)

# 删除原来的列
test_df.drop(columns=num_features, inplace=True)

# 只取每个ID的第一条数据
test_df = test_df.groupby(by='ID', as_index=False).first()

Wall time: 42.9 ms


In [13]:
%%time
# 生成训练集时间窗数据
temp = pd.DataFrame()
qids = sorted(train_df['QUEUE_ID'].unique())

for qid in tqdm(qids):  # 按QUEUE_ID进行处理
    queue = train_df[train_df['QUEUE_ID'] == qid].copy(deep=True)

    # 生成时间窗数据
    for i in range(SEQ_LEN):
        for sf in num_features:
            new_f = '%s_%d' % (sf, i+1)
            queue[new_f] = queue[sf].shift(-i)

    # 处理需要预测的值
    for i in range(SEQ_LEN):
        for y in y_features:
            new_y = '%s_%d' % (y, i+SEQ_LEN+1)
            queue[new_y] = queue[y].shift(-i-SEQ_LEN)

    # 删除原来的列
    queue.drop(columns=num_features, inplace=True)

    # 对于每个QUEUE_ID，丢弃最后10条有NAN值的数据
    queue = queue.head(queue.shape[0]-WINDOW_SIZE)
    temp = temp.append(queue)

# 重设索引
train_df = temp.reset_index(drop=True)

100%|██████████████████████████████████████████████████████████████████████████████████| 43/43 [00:12<00:00,  3.56it/s]


Wall time: 12.4 s


### 3.4 列统计特征

In [14]:
cpu_usages = []
mem_usages = []
disk_usages = []
ljobs = []
rjobs = []

for i in range(SEQ_LEN):
    postfix = '_%d' % (i + 1)
    cpu_usages.append('CPU_USAGE'+postfix)
    mem_usages.append('MEM_USAGE'+postfix)
    disk_usages.append('DISK_USAGE'+postfix)
    ljobs.append('LJOB'+postfix)
    rjobs.append('RJOB'+postfix)

In [15]:
%%time
for df in [train_df, test_df]:
    # zheng.heng baseline给的特征
    df['USED_CPU'] = df['CU'] * df['CPU_USAGE_5'] / 100
    df['USED_MEM'] = 4 * df['CU'] * df['MEM_USAGE_5'] / 100
    df['TO_RUN_JOBS'] = df['LJOB_5'] - df['RJOB_5']
    df.loc[df['TO_RUN_JOBS'] < 0, 'TO_RUN_JOBS'] = 0

    # zheng.heng baseline中的新的列特征
    pairs = [
        ('CPU', 'CPU_USAGE', cpu_usages),
        ('MEM', 'MEM_USAGE', mem_usages),
        ('DISK', 'DISK_USAGE', disk_usages),
        ('LJOB', 'LJOB', ljobs),
        ('RJOB', 'RJOB', rjobs),
    ]

    for short_name, f, usages in pairs:
        df[short_name+'_AVG'] = df[usages].mean(axis=1)
        df[short_name+'_STD'] = df[usages].std(axis=1)
        df[short_name+'_DIFF'] = df['%s_5' % f] - df['%s_1' % f]

Wall time: 2.26 s


### 3.5 特征过滤

In [16]:
# 去掉无用列
useless = [
    'QUEUE_ID', 'PLATFORM', 'RESOURCE_TYPE', 'STATUS',
]

train_df.drop(columns=useless, inplace=True)
test_df.drop(columns=['ID']+useless, inplace=True)

display(train_df, test_df)

Unnamed: 0,CU,QUEUE_TYPE,DOTTING_TIME,QUEUE_CPU_USAGE_AVG,QUEUE_MEM_USAGE_AVG,QUEUE_DISK_USAGE_AVG,QUEUE_LJOB_AVG,QUEUE_RJOB_AVG,QUEUE_CPU_USAGE_MEDIAN,QUEUE_MEM_USAGE_MEDIAN,QUEUE_DISK_USAGE_MEDIAN,QUEUE_LJOB_MEDIAN,QUEUE_RJOB_MEDIAN,QUEUE_CPU_USAGE_MIN,QUEUE_MEM_USAGE_MIN,QUEUE_DISK_USAGE_MIN,QUEUE_LJOB_MIN,QUEUE_RJOB_MIN,QUEUE_CPU_USAGE_MAX,QUEUE_MEM_USAGE_MAX,QUEUE_DISK_USAGE_MAX,QUEUE_LJOB_MAX,QUEUE_RJOB_MAX,QUEUE_CPU_USAGE_STD,QUEUE_MEM_USAGE_STD,QUEUE_DISK_USAGE_STD,QUEUE_LJOB_STD,QUEUE_RJOB_STD,CPU_USAGE_1,MEM_USAGE_1,DISK_USAGE_1,LJOB_1,RJOB_1,SJOB_1,CJOB_1,FJOB_1,CPU_USAGE_2,MEM_USAGE_2,DISK_USAGE_2,LJOB_2,RJOB_2,SJOB_2,CJOB_2,FJOB_2,CPU_USAGE_3,MEM_USAGE_3,DISK_USAGE_3,LJOB_3,RJOB_3,SJOB_3,CJOB_3,FJOB_3,CPU_USAGE_4,MEM_USAGE_4,DISK_USAGE_4,LJOB_4,RJOB_4,SJOB_4,CJOB_4,FJOB_4,CPU_USAGE_5,MEM_USAGE_5,DISK_USAGE_5,LJOB_5,RJOB_5,SJOB_5,CJOB_5,FJOB_5,CPU_USAGE_6,LJOB_6,CPU_USAGE_7,LJOB_7,CPU_USAGE_8,LJOB_8,CPU_USAGE_9,LJOB_9,CPU_USAGE_10,LJOB_10,USED_CPU,USED_MEM,TO_RUN_JOBS,CPU_AVG,CPU_STD,CPU_DIFF,MEM_AVG,MEM_STD,MEM_DIFF,DISK_AVG,DISK_STD,DISK_DIFF,LJOB_AVG,LJOB_STD,LJOB_DIFF,RJOB_AVG,RJOB_STD,RJOB_DIFF
0,16,2,16.416667,4.020410,38.748793,25.938769,0.000208,0.010127,3.0,39.0,26.0,0.0,0.0,1.0,16.0,20.0,0.0,0.0,62.0,81.0,26.0,1.0,1.0,2.802314,6.971659,0.435446,0.014412,0.100126,3,54,20.0,0,0,0,0,0,2.0,54.0,20.0,0.0,0.0,0.0,0.0,0.0,7.0,54.0,20.0,0.0,0.0,0.0,0.0,0.0,4.0,54.0,20.0,0.0,0.0,0.0,0.0,0.0,5.0,54.0,20.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,2.0,0.0,5.0,0.0,6.0,0.0,0.80,34.56,0.0,4.2,1.923538,2.0,54.0,0.000000,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16,2,16.500000,4.020410,38.748793,25.938769,0.000208,0.010127,3.0,39.0,26.0,0.0,0.0,1.0,16.0,20.0,0.0,0.0,62.0,81.0,26.0,1.0,1.0,2.802314,6.971659,0.435446,0.014412,0.100126,2,54,20.0,0,0,0,0,0,7.0,54.0,20.0,0.0,0.0,0.0,0.0,0.0,4.0,54.0,20.0,0.0,0.0,0.0,0.0,0.0,5.0,54.0,20.0,0.0,0.0,0.0,0.0,0.0,3.0,55.0,20.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,5.0,0.0,6.0,0.0,2.0,0.0,0.48,35.20,0.0,4.2,1.923538,1.0,54.2,0.447214,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,16,2,16.583333,4.020410,38.748793,25.938769,0.000208,0.010127,3.0,39.0,26.0,0.0,0.0,1.0,16.0,20.0,0.0,0.0,62.0,81.0,26.0,1.0,1.0,2.802314,6.971659,0.435446,0.014412,0.100126,7,54,20.0,0,0,0,0,0,4.0,54.0,20.0,0.0,0.0,0.0,0.0,0.0,5.0,54.0,20.0,0.0,0.0,0.0,0.0,0.0,3.0,55.0,20.0,0.0,0.0,0.0,0.0,0.0,2.0,54.0,20.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,6.0,0.0,2.0,0.0,3.0,0.0,0.32,34.56,0.0,4.2,1.923538,-5.0,54.2,0.447214,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16,2,16.666667,4.020410,38.748793,25.938769,0.000208,0.010127,3.0,39.0,26.0,0.0,0.0,1.0,16.0,20.0,0.0,0.0,62.0,81.0,26.0,1.0,1.0,2.802314,6.971659,0.435446,0.014412,0.100126,4,54,20.0,0,0,0,0,0,5.0,54.0,20.0,0.0,0.0,0.0,0.0,0.0,3.0,55.0,20.0,0.0,0.0,0.0,0.0,0.0,2.0,54.0,20.0,0.0,0.0,0.0,0.0,0.0,2.0,54.0,20.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,6.0,0.0,2.0,0.0,3.0,0.0,10.0,1.0,0.32,34.56,0.0,3.2,1.303840,-2.0,54.2,0.447214,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16,2,16.700000,4.020410,38.748793,25.938769,0.000208,0.010127,3.0,39.0,26.0,0.0,0.0,1.0,16.0,20.0,0.0,0.0,62.0,81.0,26.0,1.0,1.0,2.802314,6.971659,0.435446,0.014412,0.100126,5,54,20.0,0,0,0,0,0,3.0,55.0,20.0,0.0,0.0,0.0,0.0,0.0,2.0,54.0,20.0,0.0,0.0,0.0,0.0,0.0,2.0,54.0,20.0,0.0,0.0,0.0,0.0,0.0,5.0,54.0,20.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,2.0,0.0,3.0,0.0,10.0,1.0,6.0,1.0,0.80,34.56,0.0,3.4,1.516575,0.0,54.2,0.447214,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501295,16,0,22.350000,4.923465,29.892767,25.989066,0.001262,0.134567,3.0,27.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,83.0,66.0,26.0,1.0,49.0,9.798605,6.869162,0.533172,0.035504,1.841984,3,36,26.0,0,0,0,0,0,2.0,36.0,26.0,0.0,0.0,0.0,0.0,0.0,3.0,36.0,26.0,0.0,0.0,0.0,0.0,0.0,3.0,36.0,26.0,0.0,0.0,0.0,0.0,0.0,2.0,36.0,26.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,1.0,0.0,3.0,0.0,3.0,0.0,0.32,23.04,0.0,2.6,0.547723,-1.0,36.0,0.000000,0.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
501296,16,0,22.450000,4.923465,29.892767,25.989066,0.001262,0.134567,3.0,27.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,83.0,66.0,26.0,1.0,49.0,9.798605,6.869162,0.533172,0.035504,1.841984,2,36,26.0,0,0,0,0,0,3.0,36.0,26.0,0.0,0.0,0.0,0.0,0.0,3.0,36.0,26.0,0.0,0.0,0.0,0.0,0.0,2.0,36.0,26.0,0.0,0.0,0.0,0.0,0.0,3.0,36.0,26.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,0.48,23.04,0.0,2.6,0.547723,1.0,36.0,0.000000,0.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
501297,16,0,22.566667,4.923465,29.892767,25.989066,0.001262,0.134567,3.0,27.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,83.0,66.0,26.0,1.0,49.0,9.798605,6.869162,0.533172,0.035504,1.841984,3,36,26.0,0,0,0,0,0,3.0,36.0,26.0,0.0,0.0,0.0,0.0,0.0,2.0,36.0,26.0,0.0,0.0,0.0,0.0,0.0,3.0,36.0,26.0,0.0,0.0,0.0,0.0,0.0,3.0,36.0,26.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,2.0,0.0,0.48,23.04,0.0,2.8,0.447214,0.0,36.0,0.000000,0.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
501298,16,0,22.700000,4.923465,29.892767,25.989066,0.001262,0.134567,3.0,27.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,83.0,66.0,26.0,1.0,49.0,9.798605,6.869162,0.533172,0.035504,1.841984,3,36,26.0,0,0,0,0,0,2.0,36.0,26.0,0.0,0.0,0.0,0.0,0.0,3.0,36.0,26.0,0.0,0.0,0.0,0.0,0.0,3.0,36.0,26.0,0.0,0.0,0.0,0.0,0.0,1.0,36.0,26.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,2.0,0.0,3.0,0.0,0.16,23.04,0.0,2.4,0.894427,-2.0,36.0,0.000000,0.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,CU,QUEUE_TYPE,DOTTING_TIME,QUEUE_CPU_USAGE_AVG,QUEUE_MEM_USAGE_AVG,QUEUE_DISK_USAGE_AVG,QUEUE_LJOB_AVG,QUEUE_RJOB_AVG,QUEUE_CPU_USAGE_MEDIAN,QUEUE_MEM_USAGE_MEDIAN,QUEUE_DISK_USAGE_MEDIAN,QUEUE_LJOB_MEDIAN,QUEUE_RJOB_MEDIAN,QUEUE_CPU_USAGE_MIN,QUEUE_MEM_USAGE_MIN,QUEUE_DISK_USAGE_MIN,QUEUE_LJOB_MIN,QUEUE_RJOB_MIN,QUEUE_CPU_USAGE_MAX,QUEUE_MEM_USAGE_MAX,QUEUE_DISK_USAGE_MAX,QUEUE_LJOB_MAX,QUEUE_RJOB_MAX,QUEUE_CPU_USAGE_STD,QUEUE_MEM_USAGE_STD,QUEUE_DISK_USAGE_STD,QUEUE_LJOB_STD,QUEUE_RJOB_STD,CPU_USAGE_1,MEM_USAGE_1,DISK_USAGE_1,LJOB_1,RJOB_1,SJOB_1,CJOB_1,FJOB_1,CPU_USAGE_2,MEM_USAGE_2,DISK_USAGE_2,LJOB_2,RJOB_2,SJOB_2,CJOB_2,FJOB_2,CPU_USAGE_3,MEM_USAGE_3,DISK_USAGE_3,LJOB_3,RJOB_3,SJOB_3,CJOB_3,FJOB_3,CPU_USAGE_4,MEM_USAGE_4,DISK_USAGE_4,LJOB_4,RJOB_4,SJOB_4,CJOB_4,FJOB_4,CPU_USAGE_5,MEM_USAGE_5,DISK_USAGE_5,LJOB_5,RJOB_5,SJOB_5,CJOB_5,FJOB_5,USED_CPU,USED_MEM,TO_RUN_JOBS,CPU_AVG,CPU_STD,CPU_DIFF,MEM_AVG,MEM_STD,MEM_DIFF,DISK_AVG,DISK_STD,DISK_DIFF,LJOB_AVG,LJOB_STD,LJOB_DIFF,RJOB_AVG,RJOB_STD,RJOB_DIFF
0,16,2,13.950000,22.605079,57.987538,7.341295,0.563350,3.372734,4.0,60.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,97.0,9.0,79.0,49.0,33.306008,11.125508,2.508640,3.648205,7.763952,60,69,9,0,5,5,0,0,58.0,69.0,9.0,0.0,9.0,4.0,0.0,0.0,80.0,67.0,9.0,0.0,9.0,1.0,0.0,0.0,100.0,65.0,9.0,0.0,7.0,2.0,0.0,1.0,98.0,67.0,9.0,0.0,10.0,3.0,0.0,1.0,15.68,42.88,0.0,79.2,20.029978,38.0,67.4,1.673320,-2.0,9.0,0.0,0.0,0.0,0.0,0.0,8.0,2.000000,5.0
1,64,0,13.766667,12.290532,30.926566,19.998607,0.431896,1.634362,7.0,30.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.0,99.0,20.0,147.0,49.0,14.911072,12.356762,0.166939,4.136347,5.374643,56,91,20,0,0,0,0,0,48.0,78.0,20.0,0.0,1.0,1.0,0.0,0.0,23.0,35.0,20.0,0.0,0.0,0.0,0.0,0.0,68.0,61.0,20.0,0.0,0.0,0.0,0.0,0.0,38.0,74.0,20.0,0.0,0.0,0.0,0.0,0.0,24.32,189.44,0.0,46.6,17.169741,-18.0,67.8,21.229696,-17.0,20.0,0.0,0.0,0.0,0.0,0.0,0.2,0.447214,0.0
2,16,2,12.816667,22.605079,57.987538,7.341295,0.563350,3.372734,4.0,60.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,97.0,9.0,79.0,49.0,33.306008,11.125508,2.508640,3.648205,7.763952,2,41,9,0,6,1,0,2,5.0,42.0,9.0,0.0,6.0,2.0,0.0,1.0,3.0,43.0,9.0,0.0,6.0,3.0,0.0,0.0,80.0,68.0,9.0,0.0,8.0,2.0,0.0,0.0,90.0,82.0,9.0,0.0,8.0,2.0,0.0,0.0,14.40,52.48,0.0,36.0,44.883182,88.0,55.2,18.753666,41.0,9.0,0.0,0.0,0.0,0.0,0.0,6.8,1.095445,2.0
3,16,2,4.966667,22.605079,57.987538,7.341295,0.563350,3.372734,4.0,60.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,97.0,9.0,79.0,49.0,33.306008,11.125508,2.508640,3.648205,7.763952,3,34,9,0,1,6,0,2,37.0,46.0,9.0,0.0,9.0,1.0,0.0,0.0,90.0,71.0,9.0,0.0,9.0,1.0,0.0,0.0,64.0,72.0,9.0,0.0,6.0,2.0,0.0,1.0,68.0,68.0,9.0,0.0,9.0,2.0,0.0,0.0,10.88,43.52,0.0,52.4,33.426038,65.0,58.2,17.210462,34.0,9.0,0.0,0.0,0.0,0.0,0.0,6.8,3.492850,8.0
4,1024,2,16.583333,9.850405,21.046488,26.000000,0.011706,0.590500,0.0,7.0,26.0,0.0,0.0,0.0,2.0,26.0,0.0,0.0,95.0,93.0,26.0,5.0,7.0,26.593691,23.321886,0.000000,0.219191,1.505973,3,9,26,0,0,12,0,0,0.0,10.0,26.0,0.0,0.0,0.0,0.0,0.0,1.0,10.0,26.0,0.0,0.0,3.0,0.0,0.0,0.0,10.0,26.0,0.0,0.0,0.0,0.0,0.0,1.0,10.0,26.0,0.0,0.0,0.0,0.0,0.0,10.24,409.60,0.0,1.0,1.224745,-2.0,9.8,0.447214,1.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2991,16,2,0.283333,22.605079,57.987538,7.341295,0.563350,3.372734,4.0,60.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,97.0,9.0,79.0,49.0,33.306008,11.125508,2.508640,3.648205,7.763952,54,85,9,0,3,5,0,0,63.0,82.0,9.0,0.0,3.0,4.0,0.0,0.0,3.0,71.0,9.0,0.0,1.0,5.0,0.0,0.0,3.0,68.0,9.0,0.0,2.0,5.0,0.0,0.0,1.0,60.0,9.0,0.0,2.0,4.0,0.0,0.0,0.16,38.40,0.0,24.8,30.938649,-53.0,73.2,10.281051,-25.0,9.0,0.0,0.0,0.0,0.0,0.0,2.2,0.836660,-1.0
2992,16,2,5.933333,22.605079,57.987538,7.341295,0.563350,3.372734,4.0,60.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,97.0,9.0,79.0,49.0,33.306008,11.125508,2.508640,3.648205,7.763952,3,40,9,0,4,4,0,0,2.0,40.0,9.0,0.0,7.0,8.0,0.0,0.0,85.0,64.0,9.0,0.0,8.0,2.0,0.0,0.0,63.0,69.0,9.0,0.0,5.0,6.0,0.0,0.0,28.0,67.0,9.0,0.0,9.0,2.0,0.0,0.0,4.48,42.88,0.0,36.2,36.874110,25.0,56.0,14.713939,27.0,9.0,0.0,0.0,0.0,0.0,0.0,6.6,2.073644,5.0
2993,960,2,21.083333,4.310220,26.010503,26.000113,0.058724,0.443704,0.0,10.0,26.0,0.0,0.0,0.0,2.0,26.0,0.0,0.0,96.0,99.0,27.0,5.0,11.0,16.290600,29.986308,0.010627,0.466819,1.053445,0,55,29,0,1,3,0,0,0.0,55.0,29.0,0.0,1.0,7.0,0.0,0.0,0.0,55.0,29.0,0.0,1.0,0.0,0.0,0.0,0.0,55.0,29.0,0.0,1.0,3.0,0.0,0.0,0.0,55.0,29.0,0.0,1.0,3.0,0.0,0.0,0.00,2112.00,0.0,0.0,0.000000,0.0,55.0,0.000000,0.0,29.0,0.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.0
2994,16,0,6.766667,3.916342,65.613491,20.000000,0.075920,0.001766,3.0,67.0,20.0,0.0,0.0,1.0,46.0,20.0,0.0,0.0,25.0,79.0,20.0,8.0,1.0,2.059388,6.234262,0.000000,0.522400,0.041983,2,76,20,0,0,0,0,0,2.0,76.0,20.0,0.0,0.0,0.0,0.0,0.0,2.0,76.0,20.0,0.0,0.0,0.0,0.0,0.0,9.0,76.0,20.0,0.0,0.0,0.0,0.0,0.0,5.0,76.0,20.0,0.0,0.0,0.0,0.0,0.0,0.80,48.64,0.0,4.0,3.082207,3.0,76.0,0.000000,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


# 4. 训练&预测

In [17]:
Y_features = [
    'CPU_USAGE_6', 'LJOB_6',
    'CPU_USAGE_7', 'LJOB_7',
    'CPU_USAGE_8', 'LJOB_8',
    'CPU_USAGE_9', 'LJOB_9',
    'CPU_USAGE_10', 'LJOB_10'
]

Y_train = train_df[Y_features]
train_df.drop(columns=Y_features, inplace=True)

In [18]:
lgb_param = {
    'num_leaves': 64,
    'max_depth': 6,
    'learning_rate': 0.01,
    'n_estimators': 300,
    'subsample': 0.9,
    'feature_fraction': 0.8,
    'reg_alpha': 0.4,
    'reg_lambda': 0.8,
    'seed': 5616  # ！！！记得修改！！！
}

In [19]:
# 总迭代次数
N = MODEL_N * NFOLDS

# 进度条
pbar = tqdm(total=N, position=0, leave=True)

# 交叉验证
kfold = KFold(n_splits=NFOLDS, shuffle=True, random_state=5616)
kf = kfold.split(train_df)

# out-of-folds
oof = np.zeros((train_df.shape[0], MODEL_N))

for train_idx, validate_idx in kf:
    # 切割训练集&验证集
    X_train, y_train = train_df.iloc[train_idx, :], Y_train.iloc[train_idx, :]
    X_valid, y_valid = train_df.iloc[validate_idx, :], Y_train.iloc[validate_idx]

    for i in range(MODEL_N):
        y = y_train.iloc[:, i]

        reg = lgb.LGBMRegressor(n_jobs=-1, **lgb_param)
        bst = reg.fit(X_train, y)

        # 验证集
        valid_pred = bst.predict(X_valid)
        valid_pred[valid_pred < 0] = 0
        valid_pred[valid_pred > 100] = 100
        valid_pred = valid_pred.astype(np.int)
        oof[validate_idx, i] = valid_pred
        
        # 测试集
        test_pred = bst.predict(test_df)
        test_pred[test_pred < 0] = 0
        test_pred[test_pred > 100] = 100
        sample_df.iloc[:, i+1] += test_pred / NFOLDS

        # 更新进度条
        pbar.update(1)

# 关闭进度条
pbar.close()

# 转为整型
sample_df = sample_df.astype(np.int)

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [06:56<00:00,  8.33s/it]


In [20]:
def evaluate(Y_true, Y_preds):
    """赛题给的评估函数."""
    # shape: (n, 10)
    if not isinstance(Y_true, np.ndarray):
        Y_true = Y_true.to_numpy()

    if not isinstance(Y_preds, np.ndarray):
        Y_preds = Y_preds.to_numpy()

    dist = 0  # DIST_k
    for i in range(MODEL_N//2):
        cpu_true, job_true = Y_true[:, i*2], Y_true[:, i*2+1]  # shape: (n,)
        cpu_preds, job_preds = Y_preds[:, i*2], Y_preds[:, i*2+1]  # shape: (n,)
        max_job = np.max((job_true, job_preds), axis=0)

        # 防止分母为0（当分母为0是，分子也为0，所以可以把分母0设为1）
        max_job[max_job == 0] = 1.0
        dist += 0.9 * np.abs((cpu_preds - cpu_true) / 100) + 0.1 * np.abs((job_true - job_true) / max_job)

    score = 1 - dist.mean()
    return score

In [21]:
# 计算验证集分数
oof_score = evaluate(Y_train, oof)
print('oof score = %.6f' % oof_score)  # 0.909830

oof score = 0.901009


In [22]:
sample_df

Unnamed: 0,ID,CPU_USAGE_1,LAUNCHING_JOB_NUMS_1,CPU_USAGE_2,LAUNCHING_JOB_NUMS_2,CPU_USAGE_3,LAUNCHING_JOB_NUMS_3,CPU_USAGE_4,LAUNCHING_JOB_NUMS_4,CPU_USAGE_5,LAUNCHING_JOB_NUMS_5
0,1,73,0,76,0,45,0,36,0,58,0
1,2,34,0,32,0,29,0,32,0,28,0
2,3,49,0,51,0,16,0,8,0,6,0
3,4,27,0,8,0,9,0,6,0,5,0
4,5,1,0,2,0,4,0,4,0,5,0
...,...,...,...,...,...,...,...,...,...,...,...
2991,2992,5,0,8,0,16,0,45,0,59,0
2992,2993,28,0,9,0,8,0,6,0,4,0
2993,2994,0,0,1,0,2,0,1,0,2,0
2994,2995,4,0,4,0,4,0,4,0,4,0


In [23]:
sample_df.to_csv('baseline_new.csv', index=False)