In [1]:
import pandas as pd
import os
import gc
import lightgbm as lgb
import lightgbm
import math
import numpy as np
from datetime import datetime
from tqdm import tqdm
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss, mean_absolute_error

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw training log, ordered chronologically within each queue.
train = (
    pd.read_csv('train.csv')
    .sort_values(by=['QUEUE_ID', 'DOTTING_TIME'])
    .reset_index(drop=True)
)

# Load the evaluation set, ordered chronologically within each sample ID.
test = (
    pd.read_csv('evaluation_public.csv')
    .sort_values(by=['ID', 'DOTTING_TIME'])
    .reset_index(drop=True)
)

# Example submission file.
sub_sample = pd.read_csv('submit_example.csv')

## 数据分析

In [3]:
# train['CPU_USAGE'].hist(bins=70)

In [4]:
# Show which STATUS values occur in the evaluation set and how often.
test['STATUS'].value_counts()

available    14980
Name: STATUS, dtype: int64

In [5]:
# Show which STATUS values occur in the training set and how often.
train['STATUS'].value_counts()

available    501639
assigning        85
assigned          4
suspended         2
Name: STATUS, dtype: int64

## 数据预处理

In [6]:
# These columns hold a single value in test, so they are dropped outright.
# Since rows with other attribute values may behave differently, train keeps
# only the rows whose attribute values also exist in test.
constant_cols = ['STATUS', 'PLATFORM', 'RESOURCE_TYPE']

keep_rows = (
    (train.STATUS == 'available')
    & (train.PLATFORM == 'x86_64')
    & (train.RESOURCE_TYPE == 'vm')
)
train = train[keep_rows].reset_index(drop=True)

train = train.drop(columns=constant_cols)
test = test.drop(columns=constant_cols)

In [7]:
def make_ID(df_, window=10):
    """Expand every queue into overlapping sliding-window samples.

    For each ``QUEUE_ID`` the queue's rows (in their current order) are
    emitted ``window`` times: the copy at offset ``i`` drops the first ``i``
    rows and numbers the remaining rows ``0, 1, 2, ...`` in a new ``ID``
    column. After stacking, the rows that share a ``(QUEUE_ID, ID)`` pair are
    therefore up to ``window`` consecutive rows of that queue starting at
    position ``ID``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Must contain a ``QUEUE_ID`` column. It is not modified.
    window : int, default 10
        Number of consecutive rows per sample (number of offsets emitted).

    Returns
    -------
    pandas.DataFrame
        The stacked copies with the extra ``ID`` column; original index labels
        are kept (and hence repeated). An empty frame is returned when there
        is nothing to stack.

    Notes
    -----
    Rows are sliced by position, so the result does not depend on the index
    being contiguous within each queue. The pieces are concatenated once at
    the end instead of growing a frame with ``DataFrame.append`` in the loop
    (quadratic, and removed in pandas 2.0).
    """
    pieces = []

    for QUEUE_ID in tqdm(df_['QUEUE_ID'].unique()):
        queue_rows = df_[df_.QUEUE_ID == QUEUE_ID]
        n_rows = queue_rows.shape[0]

        # Offsets beyond the queue length would only yield empty frames.
        for offset in range(min(window, n_rows)):
            # ``assign`` returns a new frame, so the caller's data is untouched.
            pieces.append(
                queue_rows.iloc[offset:].assign(ID=np.arange(n_rows - offset))
            )

    if not pieces:
        return pd.DataFrame()

    return pd.concat(pieces)

# Replace the raw per-queue log with the sliding-window samples built by make_ID.
train = make_ID(train)

100%|██████████████████████████████████████████████████████████████████████████████████| 40/40 [02:10<00:00,  3.26s/it]


In [8]:
# Order the expanded training rows by queue, window number and time.
train = train.sort_values(['QUEUE_ID', 'ID', 'DOTTING_TIME'])

# Make the window ID unique across queues: "<QUEUE_ID>_<window number>".
train['ID'] = train['QUEUE_ID'].astype(str) + '_' + train['ID'].astype(str)

# Within each ID, number the rows chronologically (ties broken by row order),
# then discard the timestamp, which is not used afterwards.
for frame in (train, test):
    frame['rank'] = frame.groupby(['ID'])['DOTTING_TIME'].rank(method='first')
    del frame['DOTTING_TIME']

In [9]:
# Drop the window IDs that have fewer than 10 samples.
# The counts are read straight from the value_counts() Series (index = ID,
# values = row count). Going through reset_index() depends on column names
# that changed in pandas 2.0 ('index'/'ID' became 'ID'/'count').
id_counts = train['ID'].value_counts()
drop_ID = id_counts[id_counts < 10].index.tolist()
print(train.shape)
train = train[~train['ID'].isin(drop_ID)]
print(train.shape)

(4478190, 13)
(4476390, 13)


In [10]:
# Split every ID into the rows used for feature extraction (rank <= 5) and the
# rows that form the final modelling set (rank >= 5). Rank 5 belongs to both.
train_feat = train.loc[train['rank'].le(5)]
train_df = train.loc[train['rank'].ge(5)].reset_index(drop=True)

test_feat = test.loc[test['rank'].le(5)]
test_df = test.loc[test['rank'].ge(5)].reset_index(drop=True)

In [11]:
def make_label(data):
    """Attach the next five values of the targets as label columns.

    For ``CPU_USAGE`` and ``LAUNCHING_JOB_NUMS`` the columns ``<col>_1`` ...
    ``<col>_5`` are added, holding the value found 1..5 rows later, and only
    the rows with ``rank == 5`` are returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``rank``, ``CPU_USAGE`` and ``LAUNCHING_JOB_NUMS``;
        rows are expected to be ordered chronologically within each window.
        When an ``ID`` column is present the look-ahead is done per ``ID``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The ``rank == 5`` rows of ``data`` plus the ten label columns
        (original index labels are kept).

    Notes
    -----
    Shifting per ``ID`` keeps a window with fewer than five following rows
    from picking up the values of the next window; such labels are NaN
    instead. Without an ``ID`` column the shift is applied to the whole frame.
    """
    labeled = data.copy()
    has_id = 'ID' in data.columns

    for col in ('CPU_USAGE', 'LAUNCHING_JOB_NUMS'):
        # Read from the untouched input so the new columns never feed back in.
        source = data.groupby('ID', sort=False)[col] if has_id else data[col]
        for step in range(1, 6):
            labeled['{}_{}'.format(col, step)] = source.shift(-step)

    return labeled[labeled['rank'] == 5]

# Attach the labels to the final training set and renumber its rows.
train_df = make_label(train_df)
train_df = train_df.reset_index(drop=True)

## 特征提取

In [12]:
# Numeric monitoring columns used for the shift and aggregate features.
num_cols = [
    'CPU_USAGE',
    'MEM_USAGE',
    'LAUNCHING_JOB_NUMS',
    'RUNNING_JOB_NUMS',
    'SUCCEED_JOB_NUMS',
    'CANCELLED_JOB_NUMS',
    'FAILED_JOB_NUMS',
    'DISK_USAGE',
]

In [13]:
# Features are built directly from train_feat and test_feat.
## Historical shifts
def _lag_snapshot(feat, lag):
    """Return the `num_cols` values observed `lag` steps before rank 5, one row per ID.

    The value columns are renamed to '<col>_shift<lag>'.
    """
    snapshot = feat.loc[feat['rank'] == 5 - lag, ['ID'] + num_cols]
    snapshot.columns = ['ID'] + ['{}_shift{}'.format(col, lag) for col in num_cols]
    return snapshot.reset_index(drop=True)


# Values 1, 2, 3 and 4 steps back, for the training set ...
for lag in range(1, 5):
    train_df = train_df.merge(_lag_snapshot(train_feat, lag), on='ID', how='left')

# ... and for the evaluation set.
for lag in range(1, 5):
    test_df = test_df.merge(_lag_snapshot(test_feat, lag), on='ID', how='left')

In [14]:
## Sliding-window statistics
def _recent_window_stats(feat_df, n_recent):
    """Per-ID mean/median/std/ptp of the targets over the last `n_recent` steps.

    Rows with ``rank > 5 - n_recent`` are kept, i.e. the most recent
    `n_recent` rows of each ID in a frame whose ranks run up to 5. Column
    names follow the notebook's existing scheme:
    'last<n_recent><column>_<statistic>'.
    """
    recent = feat_df[feat_df['rank'] > 5 - n_recent]
    agg_func = {col: ['mean', 'median', 'std', np.ptp]
                for col in ['CPU_USAGE', 'LAUNCHING_JOB_NUMS']}

    agg_df = recent.groupby(['ID']).agg(agg_func)
    agg_df.columns = ['last' + str(n_recent) + '_'.join(col).strip()
                      for col in agg_df.columns.values]
    return agg_df.reset_index(drop=False)


# Statistics over the most recent 2 / 3 / 4 steps.
# Each feature frame is merged into its own target frame explicitly. Picking the
# target by comparing row counts sends the features to the wrong frame whenever
# train and test happen to have the same number of IDs, or when the counts
# differ for any other reason.
for i in range(2, 5):
    train_df = train_df.merge(_recent_window_stats(train_feat, i), on='ID', how='left')
    test_df = test_df.merge(_recent_window_stats(test_feat, i), on='ID', how='left')

In [15]:
## Aggregate statistics
def _history_stats(feat_df):
    """Per-ID mean/std/max/min/median/ptp of every `num_cols` column.

    All rows of `feat_df` are used; the resulting columns are named
    '<column>_<statistic>'.
    """
    agg_func = {col: ['mean', 'std', 'max', 'min', 'median', np.ptp] for col in num_cols}

    agg_df = feat_df.groupby(['ID']).agg(agg_func)
    agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]
    return agg_df.reset_index(drop=False)


# Merge each feature frame into its own target frame explicitly, rather than
# guessing the target from matching row counts (which mis-routes the features
# whenever train and test contain the same number of IDs).
train_df = train_df.merge(_history_stats(train_feat), on='ID', how='left')
test_df = test_df.merge(_history_stats(test_feat), on='ID', how='left')

## 模型训练

In [16]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Train a model with 5-fold cross-validation.

    Parameters
    ----------
    clf : module or class
        Entry point of the library selected by `clf_name`: an object exposing
        ``Dataset``/``train`` for ``"lgb"``, ``DMatrix``/``train`` for
        ``"xgb"``, or an estimator class called as ``clf(iterations=..., **params)``
        for ``"cat"``.
    train_x : pandas.DataFrame
        Training features (rows are selected by position).
    train_y : array-like
        Training targets, aligned with `train_x` by position; the index of a
        Series is ignored.
    test_x : pandas.DataFrame
        Features to predict.
    clf_name : {"lgb", "xgb", "cat"}
        Which training branch to run.

    Returns
    -------
    oof : numpy.ndarray
        Out-of-fold prediction for every training row.
    pred : numpy.ndarray
        Test prediction averaged over the folds.

    Raises
    ------
    ValueError
        If `clf_name` is not one of the supported names.
    """
    if clf_name not in ("lgb", "xgb", "cat"):
        # Fail early instead of hitting an undefined `val_pred` after the first fold.
        raise ValueError("unsupported clf_name: {!r}".format(clf_name))

    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # KFold yields positions, so index the targets positionally; label-based
    # `train_y[train_index]` only works for a Series with a default RangeIndex.
    y_values = np.asarray(train_y)

    oof = np.zeros(train_x.shape[0])
    pred = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = y_values[train_index], y_values[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            # NOTE(review): 'nthread' and 'n_jobs' are both aliases of LightGBM's
            # num_threads, so only one of the two values can take effect.
            params = {
                'boosting_type': 'gbdt',
                'objective': 'regression',
                'metric': 'mse',
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'learning_rate': 0.05,
                'seed': 2020,
                'nthread': 28,
                'n_jobs': 24,
                'silent': True,
                'verbose': -1,
            }

            # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
            # arguments of older LightGBM releases; newer ones expect callbacks.
            model = clf.train(params, train_matrix, 50000,
                              valid_sets=[train_matrix, valid_matrix],
                              categorical_feature=[],
                              verbose_eval=1000, early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        elif clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)

            params = {'booster': 'gbtree',
                      'eval_metric': 'mae',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.05,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      'n_jobs': 24,
                      'silent': True,
                      }

            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

            # NOTE(review): `ntree_limit` / `best_ntree_limit` belong to older
            # XGBoost releases -- confirm against the installed version.
            model = clf.train(params, train_matrix, num_boost_round=50000,
                              evals=watchlist, verbose_eval=500, early_stopping_rounds=200)
            val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

        else:  # clf_name == "cat"
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}

            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)

            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)

        oof[valid_index] = val_pred
        pred += test_pred / kf.n_splits
        cv_scores.append(mean_absolute_error(val_y, val_pred))

        print(cv_scores)

    # Report the mean of all out-of-fold predictions, not just the last fold's.
    print("oof true mean {}, oof pred mean {}, test mean {}".format(y_values.mean(), oof.mean(), pred.mean()))
    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))

    return oof, pred

In [17]:
cate_feat = ['QUEUE_ID','CU','QUEUE_TYPE']

for col in cate_feat:
    lbl = LabelEncoder()
    # Fit ONE encoder on the union of train and test values and apply it to both.
    # Re-fitting on test assigns codes from test's own sorted value set, so the
    # same category can receive different integers in train and in test.
    lbl.fit(pd.concat([train_df[col], test_df[col]]).astype(str))
    train_df[col] = lbl.transform(train_df[col].astype(str))
    test_df[col] = lbl.transform(test_df[col].astype(str))

# Every test column except the identifiers is a model input.
features = [f for f in test_df.columns if f not in ['ID','rank','DOTTING_TIME']]

x_train = train_df[features]
x_test = test_df[features]

In [18]:
# Branch of cv_model to run.
m_type = 'lgb'
# Full label list (steps 1-5 for both targets), currently disabled:
# for label in ['CPU_USAGE_1','LAUNCHING_JOB_NUMS_1','CPU_USAGE_2','LAUNCHING_JOB_NUMS_2','CPU_USAGE_3','LAUNCHING_JOB_NUMS_3',
#               'CPU_USAGE_4','LAUNCHING_JOB_NUMS_4','CPU_USAGE_5','LAUNCHING_JOB_NUMS_5']:
# Only the one-step-ahead labels are trained here; one model per label.
for label in ['CPU_USAGE_1','LAUNCHING_JOB_NUMS_1']:
    print('############## {} ##############'.format(label))
    
    y_train = train_df[label]
    
    # lgb_train: out-of-fold predictions; lgb_test: fold-averaged test predictions.
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, m_type)
    
    # Store the test predictions under the label's name for the submission step.
    test_df[label] = lgb_test

############## CPU_USAGE_1 ##############
************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[1000]	training's l2: 15.1363	valid_1's l2: 21.0785
[2000]	training's l2: 12.2311	valid_1's l2: 20.6737
[3000]	training's l2: 10.5624	valid_1's l2: 20.5359
[4000]	training's l2: 9.4265	valid_1's l2: 20.4734
Early stopping, best iteration is:
[4104]	training's l2: 9.3236	valid_1's l2: 20.4664
[2.0371287108527634]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[1000]	training's l2: 15.0198	valid_1's l2: 21.6763
[2000]	training's l2: 12.2688	valid_1's l2: 21.1869
[3000]	training's l2: 10.606	valid_1's l2: 21.0257
[4000]	training's l2: 9.39866	valid_1's l2: 20.9478
Early stopping, best iteration is:
[4124]	training's l2: 9.27918	valid_1's l2: 20.9375
[2.0371287108527634, 2.0430338874019514]
*********************************

## 结果提交

In [19]:
# Note: the submission requires non-negative integer predictions, and ID must be an integer too.

# Work on an explicit copy so the column assignments below do not write into a
# slice of test_df (chained assignment).
sub = test_df[['ID','CPU_USAGE_1','LAUNCHING_JOB_NUMS_1']].copy()
sub['ID'] = sub['ID'].astype(int)

for col in [c for c in sub.columns if c != 'ID']:
    # Round down, clamp negatives to 0, then cast -- vectorised, same values as
    # the element-wise floor / clamp / cast sequence.
    sub[col] = np.floor(sub[col]).clip(lower=0).astype(int)

# Mean of the raw (unrounded) predictions, as a sanity check.
print(test_df['CPU_USAGE_1'].mean())
print(test_df['LAUNCHING_JOB_NUMS_1'].mean())

21.25373336669068
1.0376422231871056


In [20]:
# Mean of the last observed step (rank 5) in test, for comparison with the predictions.
last_observed = test[test['rank'] == 5]
for target in ['CPU_USAGE', 'LAUNCHING_JOB_NUMS']:
    print(last_observed[target].mean())

20.960280373831775
2.252002670226969


## xgb