In [1]:
import warnings
warnings.simplefilter('ignore')

import gc

import numpy as np
import pandas as pd
pd.set_option('max_columns', 100)
pd.set_option('max_rows', 100)
from tqdm.notebook import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

In [2]:
train = pd.read_csv('raw_data/train.csv')
train = train.sort_values(by=['QUEUE_ID', 'DOTTING_TIME']).reset_index(drop=True)

test = pd.read_csv('raw_data/evaluation_public.csv')
test = test.sort_values(by=['ID', 'DOTTING_TIME']).reset_index(drop=True)

sub_sample = pd.read_csv('raw_data/submit_example.csv')

In [3]:
train.head(10)

Unnamed: 0,QUEUE_ID,CU,STATUS,QUEUE_TYPE,PLATFORM,CPU_USAGE,MEM_USAGE,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS,DOTTING_TIME,RESOURCE_TYPE,DISK_USAGE
0,2,16,available,sql,x86_64,3,54,0,0,0,0,0,1590683100000,vm,20.0
1,2,16,available,sql,x86_64,2,54,0,0,0,0,0,1590683400000,vm,20.0
2,2,16,available,sql,x86_64,7,54,0,0,0,0,0,1590683700000,vm,20.0
3,2,16,available,sql,x86_64,4,54,0,0,0,0,0,1590684000000,vm,20.0
4,2,16,available,sql,x86_64,5,54,0,0,0,0,0,1590684120000,vm,20.0
5,2,16,available,sql,x86_64,3,55,0,0,0,0,0,1590684420000,vm,20.0
6,2,16,available,sql,x86_64,2,54,0,0,0,0,0,1590684720000,vm,20.0
7,2,16,available,sql,x86_64,2,54,0,0,0,0,0,1590685020000,vm,20.0
8,2,16,available,sql,x86_64,5,54,0,0,0,0,0,1590685320000,vm,20.0
9,2,16,available,sql,x86_64,6,54,0,0,0,0,0,1590685620000,vm,20.0


In [4]:
test.head(10)

Unnamed: 0,ID,QUEUE_ID,CU,STATUS,QUEUE_TYPE,PLATFORM,CPU_USAGE,MEM_USAGE,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS,DOTTING_TIME,RESOURCE_TYPE,DISK_USAGE
0,1,297,16,available,sql,x86_64,60,69,0,5,5,0,0,1662213420000,vm,9
1,1,297,16,available,sql,x86_64,58,69,0,9,4,0,0,1662213720000,vm,9
2,1,297,16,available,sql,x86_64,80,67,0,9,1,0,0,1662214020000,vm,9
3,1,297,16,available,sql,x86_64,100,65,0,7,2,0,1,1662214320000,vm,9
4,1,297,16,available,sql,x86_64,98,67,0,10,3,0,1,1662214620000,vm,9
5,2,85153,64,available,general,x86_64,56,91,0,0,0,0,0,1613655960000,vm,20
6,2,85153,64,available,general,x86_64,48,78,0,1,1,0,0,1613656260000,vm,20
7,2,85153,64,available,general,x86_64,23,35,0,0,0,0,0,1613656560000,vm,20
8,2,85153,64,available,general,x86_64,68,61,0,0,0,0,0,1613656860000,vm,20
9,2,85153,64,available,general,x86_64,38,74,0,0,0,0,0,1613657160000,vm,20


In [5]:
sub_sample.head()

Unnamed: 0,ID,CPU_USAGE_1,LAUNCHING_JOB_NUMS_1,CPU_USAGE_2,LAUNCHING_JOB_NUMS_2,CPU_USAGE_3,LAUNCHING_JOB_NUMS_3,CPU_USAGE_4,LAUNCHING_JOB_NUMS_4,CPU_USAGE_5,LAUNCHING_JOB_NUMS_5
0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,0


In [6]:
train.shape, test.shape, sub_sample.shape

((501730, 15), (14980, 16), (2996, 11))

In [7]:
# 这些 columns 在 test 只有单一值, 所以直接去掉

del train['STATUS']
del train['PLATFORM']
del train['RESOURCE_TYPE']

del test['STATUS']
del test['PLATFORM']
del test['RESOURCE_TYPE']

In [8]:
# 时间排序好后也没什么用了

del train['DOTTING_TIME']
del test['DOTTING_TIME']

In [9]:
# Label Encoding

le = LabelEncoder()
train['QUEUE_TYPE'] = le.fit_transform(train['QUEUE_TYPE'].astype(str))
test['QUEUE_TYPE'] = le.transform(test['QUEUE_TYPE'].astype(str))

In [10]:
# 1 CU = 1 CPU 4G MEM

train['used_cpu'] = train['CU'] * train['CPU_USAGE'] / 100
train['used_mem'] = train['CU'] * 4 * train['MEM_USAGE'] / 100

test['used_cpu'] = test['CU'] * test['CPU_USAGE'] / 100
test['used_mem'] = test['CU'] * 4 * test['MEM_USAGE'] / 100

In [11]:
train['to_run_jobs'] = train['LAUNCHING_JOB_NUMS'] - train['RUNNING_JOB_NUMS']
test['to_run_jobs'] = test['LAUNCHING_JOB_NUMS'] - test['RUNNING_JOB_NUMS']

In [12]:
# diffs

train['used_cpu_diff1'] = train.groupby(['QUEUE_ID'])['used_cpu'].diff(1).fillna(0)
train['used_mem_diff1'] = train.groupby(['QUEUE_ID'])['used_mem'].diff(1).fillna(0)
train['used_disk_diff1'] = train.groupby(['QUEUE_ID'])['DISK_USAGE'].diff(1).fillna(0)
train['to_run_jobs_diff1'] = train.groupby(['QUEUE_ID'])['to_run_jobs'].diff(1).fillna(0)
train['launching_diff1'] = train.groupby(['QUEUE_ID'])['LAUNCHING_JOB_NUMS'].diff(1).fillna(0)
train['running_diff1'] = train.groupby(['QUEUE_ID'])['RUNNING_JOB_NUMS'].diff(1).fillna(0)
train['succeed_diff1'] = train.groupby(['QUEUE_ID'])['SUCCEED_JOB_NUMS'].diff(1).fillna(0)
train['cancelled_diff1'] = train.groupby(['QUEUE_ID'])['CANCELLED_JOB_NUMS'].diff(1).fillna(0)
train['failed_diff1'] = train.groupby(['QUEUE_ID'])['FAILED_JOB_NUMS'].diff(1).fillna(0)

train['used_cpu_diff-1'] = train.groupby(['QUEUE_ID'])['used_cpu'].diff(-1).fillna(0)
train['used_mem_diff-1'] = train.groupby(['QUEUE_ID'])['used_mem'].diff(-1).fillna(0)
train['used_disk_diff-1'] = train.groupby(['QUEUE_ID'])['DISK_USAGE'].diff(-1).fillna(0)
train['to_run_jobs_diff-1'] = train.groupby(['QUEUE_ID'])['to_run_jobs'].diff(-1).fillna(0)
train['launching_diff-1'] = train.groupby(['QUEUE_ID'])['LAUNCHING_JOB_NUMS'].diff(-1).fillna(0)
train['running_diff-1'] = train.groupby(['QUEUE_ID'])['RUNNING_JOB_NUMS'].diff(-1).fillna(0)
train['succeed_diff-1'] = train.groupby(['QUEUE_ID'])['SUCCEED_JOB_NUMS'].diff(-1).fillna(0)
train['cancelled_diff-1'] = train.groupby(['QUEUE_ID'])['CANCELLED_JOB_NUMS'].diff(-1).fillna(0)
train['failed_diff-1'] = train.groupby(['QUEUE_ID'])['FAILED_JOB_NUMS'].diff(-1).fillna(0)


test['used_cpu_diff1'] = test.groupby(['QUEUE_ID'])['used_cpu'].diff(1).fillna(0)
test['used_mem_diff1'] = test.groupby(['QUEUE_ID'])['used_mem'].diff(1).fillna(0)
test['used_disk_diff1'] = test.groupby(['QUEUE_ID'])['DISK_USAGE'].diff(1).fillna(0)
test['to_run_jobs_diff1'] = test.groupby(['QUEUE_ID'])['to_run_jobs'].diff(1).fillna(0)
test['launching_diff1'] = test.groupby(['QUEUE_ID'])['LAUNCHING_JOB_NUMS'].diff(1).fillna(0)
test['running_diff1'] = test.groupby(['QUEUE_ID'])['RUNNING_JOB_NUMS'].diff(1).fillna(0)
test['succeed_diff1'] = test.groupby(['QUEUE_ID'])['SUCCEED_JOB_NUMS'].diff(1).fillna(0)
test['cancelled_diff1'] = test.groupby(['QUEUE_ID'])['CANCELLED_JOB_NUMS'].diff(1).fillna(0)
test['failed_diff1'] = test.groupby(['QUEUE_ID'])['FAILED_JOB_NUMS'].diff(1).fillna(0)

test['used_cpu_diff-1'] = test.groupby(['QUEUE_ID'])['used_cpu'].diff(-1).fillna(0)
test['used_mem_diff-1'] = test.groupby(['QUEUE_ID'])['used_mem'].diff(-1).fillna(0)
test['used_disk_diff-1'] = test.groupby(['QUEUE_ID'])['DISK_USAGE'].diff(-1).fillna(0)
test['to_run_jobs_diff-1'] = test.groupby(['QUEUE_ID'])['to_run_jobs'].diff(-1).fillna(0)
test['launching_diff-1'] = test.groupby(['QUEUE_ID'])['LAUNCHING_JOB_NUMS'].diff(-1).fillna(0)
test['running_diff-1'] = test.groupby(['QUEUE_ID'])['RUNNING_JOB_NUMS'].diff(-1).fillna(0)
test['succeed_diff-1'] = test.groupby(['QUEUE_ID'])['SUCCEED_JOB_NUMS'].diff(-1).fillna(0)
test['cancelled_diff-1'] = test.groupby(['QUEUE_ID'])['CANCELLED_JOB_NUMS'].diff(-1).fillna(0)
test['failed_diff-1'] = test.groupby(['QUEUE_ID'])['FAILED_JOB_NUMS'].diff(-1).fillna(0)

In [13]:
train.head()

Unnamed: 0,QUEUE_ID,CU,QUEUE_TYPE,CPU_USAGE,MEM_USAGE,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS,DISK_USAGE,used_cpu,used_mem,to_run_jobs,used_cpu_diff1,used_mem_diff1,used_disk_diff1,to_run_jobs_diff1,launching_diff1,running_diff1,succeed_diff1,cancelled_diff1,failed_diff1,used_cpu_diff-1,used_mem_diff-1,used_disk_diff-1,to_run_jobs_diff-1,launching_diff-1,running_diff-1,succeed_diff-1,cancelled_diff-1,failed_diff-1
0,2,16,2,3,54,0,0,0,0,0,20.0,0.48,34.56,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,16,2,2,54,0,0,0,0,0,20.0,0.32,34.56,0,-0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,16,2,7,54,0,0,0,0,0,20.0,1.12,34.56,0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,16,2,4,54,0,0,0,0,0,20.0,0.64,34.56,0,-0.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,16,2,5,54,0,0,0,0,0,20.0,0.8,34.56,0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32,-0.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# 加个 id 后面方便处理
train['myid'] = train.index
test['myid'] = test.index

In [15]:
# 生成 target 列

df_train = pd.DataFrame()

for id_ in tqdm(train.QUEUE_ID.unique()):
    tmp = train[train.QUEUE_ID == id_]
    tmp['CPU_USAGE_next25mins'] = tmp['CPU_USAGE'].shift(-5)
    tmp['LAUNCHING_JOB_NUMS_next25mins'] = tmp['LAUNCHING_JOB_NUMS'].shift(-5)
    df_train = df_train.append(tmp)

HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))




In [16]:
df_train = df_train[df_train.CPU_USAGE_next25mins.notna()]
# df_train['CPU_USAGE_next25mins'] /= 100

print(df_train.shape)
df_train.head()

(501515, 35)


Unnamed: 0,QUEUE_ID,CU,QUEUE_TYPE,CPU_USAGE,MEM_USAGE,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS,DISK_USAGE,used_cpu,used_mem,to_run_jobs,used_cpu_diff1,used_mem_diff1,used_disk_diff1,to_run_jobs_diff1,launching_diff1,running_diff1,succeed_diff1,cancelled_diff1,failed_diff1,used_cpu_diff-1,used_mem_diff-1,used_disk_diff-1,to_run_jobs_diff-1,launching_diff-1,running_diff-1,succeed_diff-1,cancelled_diff-1,failed_diff-1,myid,CPU_USAGE_next25mins,LAUNCHING_JOB_NUMS_next25mins
0,2,16,2,3,54,0,0,0,0,0,20.0,0.48,34.56,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3.0,0.0
1,2,16,2,2,54,0,0,0,0,0,20.0,0.32,34.56,0,-0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2.0,0.0
2,2,16,2,7,54,0,0,0,0,0,20.0,1.12,34.56,0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2.0,0.0
3,2,16,2,4,54,0,0,0,0,0,20.0,0.64,34.56,0,-0.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,5.0,0.0
4,2,16,2,5,54,0,0,0,0,0,20.0,0.8,34.56,0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32,-0.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,6.0,0.0


In [17]:
def run_lgb_qid(df_train, df_test, target, qid):
    
    feature_names = list(
        filter(lambda x: x not in ['CPU_USAGE_next25mins', 
                                   'LAUNCHING_JOB_NUMS_next25mins', 
                                   'QUEUE_ID', 
                                   'myid',
                                   'CU',
                                   'QUEUE_TYPE'], df_train.columns))

#     feature_names = ['CPU_USAGE', 'MEM_USAGE', 'LAUNCHING_JOB_NUMS',
#                      'RUNNING_JOB_NUMS', 'SUCCEED_JOB_NUMS', 'CANCELLED_JOB_NUMS',
#                      'FAILED_JOB_NUMS', 'DISK_USAGE', 
#                      'used_cpu', 'used_mem', 
#                      'used_cpu_diff1', 'used_mem_diff1',
#                      'used_disk_diff1', 'to_run_jobs_diff1',
#                      'launching_diff1', 'running_diff1',
#                      'succeed_diff1', 'cancelled_diff1',
#                      'failed_diff1',
#                      'used_cpu_diff-1', 'used_mem_diff-1',
#                      'used_disk_diff-1', 'to_run_jobs_diff-1',
#                      'launching_diff-1', 'running_diff-1',
#                      'succeed_diff-1', 'cancelled_diff-1',
#                      'failed_diff-1']
    
    # 提取 QUEUE_ID 对应的数据集
    df_train = df_train[df_train.QUEUE_ID == qid]
    df_test = df_test[df_test.QUEUE_ID == qid]
    
    print(f"QUEUE_ID:{qid}, target:{target}, train:{len(df_train)}, test:{len(df_test)}")
    
    model = lgb.LGBMRegressor(num_leaves=32,
                              max_depth=6,
                              learning_rate=0.08,
                              n_estimators=10000,
                              subsample=0.9,
                              feature_fraction=0.8,
                              reg_alpha=0.5,
                              reg_lambda=0.8,
                              random_state=2020)
    oof = []
    prediction = df_test[['ID', 'QUEUE_ID', 'myid']]
    prediction[target] = 0
    
    kfold = KFold(n_splits=5, random_state=2020)
    for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train, df_train[target])):
        
        X_train = df_train.iloc[trn_idx][feature_names]
        Y_train = df_train.iloc[trn_idx][target]
        X_val = df_train.iloc[val_idx][feature_names]
        Y_val = df_train.iloc[val_idx][target]
        
        lgb_model = model.fit(X_train, 
                              Y_train,
                              eval_names=['train', 'valid'],
                              eval_set=[(X_train, Y_train), (X_val, Y_val)],
                              verbose=0,
                              eval_metric='mse',
                              early_stopping_rounds=20)
        
        pred_val = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration_)
        df_oof = df_train.iloc[val_idx][[target, 'myid', 'QUEUE_ID']].copy()
        df_oof['pred'] = pred_val
        oof.append(df_oof)
        
        pred_test = lgb_model.predict(df_test[feature_names], num_iteration=lgb_model.best_iteration_)
        prediction[target] += pred_test / kfold.n_splits
        
        del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
        gc.collect()
        
    df_oof = pd.concat(oof)
    score = mean_squared_error(df_oof[target], df_oof['pred'])
    print('MSE:', score)

    return prediction, score

In [18]:
oofs1 = list()
oofs2 = list()
predictions1 = list()
predictions2 = list()
scores1 = list()
scores2 = list()

for qid in tqdm(test.QUEUE_ID.unique()):
    prediction1, score1 = run_lgb_qid(df_train, test, target='CPU_USAGE_next25mins', qid=qid)
    predictions1.append(prediction1)
    scores1.append(score1)
    prediction2, score2 = run_lgb_qid(df_train, test, target='LAUNCHING_JOB_NUMS_next25mins', qid=qid)
    predictions2.append(prediction2)
    scores2.append(score2)

HBox(children=(FloatProgress(value=0.0, max=23.0), HTML(value='')))

QUEUE_ID:297, target:CPU_USAGE_next25mins, train:21179, test:5710
MSE: 190.33013812383643
QUEUE_ID:297, target:LAUNCHING_JOB_NUMS_next25mins, train:21179, test:5710
MSE: 6.439134989782293
QUEUE_ID:85153, target:CPU_USAGE_next25mins, train:14348, test:1950
MSE: 206.25600765498814
QUEUE_ID:85153, target:LAUNCHING_JOB_NUMS_next25mins, train:14348, test:1950
MSE: 14.774694229376149
QUEUE_ID:291, target:CPU_USAGE_next25mins, train:8879, test:285
MSE: 253.01064139163603
QUEUE_ID:291, target:LAUNCHING_JOB_NUMS_next25mins, train:8879, test:285
MSE: 0.04822364450600933
QUEUE_ID:21487, target:CPU_USAGE_next25mins, train:28926, test:2235
MSE: 78.21323592695981
QUEUE_ID:21487, target:LAUNCHING_JOB_NUMS_next25mins, train:28926, test:2235
MSE: 12.264486203235629
QUEUE_ID:85265, target:CPU_USAGE_next25mins, train:13511, test:95
MSE: 1.0642757707905275
QUEUE_ID:85265, target:LAUNCHING_JOB_NUMS_next25mins, train:13511, test:95
MSE: 30.074528990648865
QUEUE_ID:4, target:CPU_USAGE_next25mins, train:19252

In [19]:
print(np.mean(scores1), np.mean(scores2))

64.87561716755006 5.6581716617230935


In [19]:
predictions1 = pd.concat(predictions1)
predictions2 = pd.concat(predictions2)

predictions1 = predictions1.sort_values(by='myid').reset_index(drop=True)
predictions2 = predictions2.sort_values(by='myid').reset_index(drop=True)

In [20]:
prediction = predictions1.copy()
prediction = pd.merge(prediction, predictions2[['myid', 'LAUNCHING_JOB_NUMS_next25mins']], on='myid')

prediction

Unnamed: 0,ID,QUEUE_ID,myid,CPU_USAGE_next25mins,LAUNCHING_JOB_NUMS_next25mins
0,1,297,0,11.343928,0.021107
1,1,297,1,12.731280,0.024004
2,1,297,2,19.294139,0.025891
3,1,297,3,95.529173,0.702751
4,1,297,4,83.432941,0.081439
...,...,...,...,...,...
14975,2996,287,14975,2.840110,0.029591
14976,2996,287,14976,5.747585,0.023989
14977,2996,287,14977,5.009150,0.026029
14978,2996,287,14978,4.434103,0.023251


In [21]:
prediction.CPU_USAGE_next25mins.describe()

count    14980.000000
mean        16.036306
std         15.114320
min         -9.143809
25%          4.448658
50%         11.680019
75%         21.347691
max         99.670620
Name: CPU_USAGE_next25mins, dtype: float64

In [22]:
prediction.LAUNCHING_JOB_NUMS_next25mins.describe()

count    14980.000000
mean         0.545399
std          3.285629
min         -0.290879
25%          0.013331
50%          0.021885
75%          0.223851
max         52.382402
Name: LAUNCHING_JOB_NUMS_next25mins, dtype: float64

In [23]:
sub_sample.head()

Unnamed: 0,ID,CPU_USAGE_1,LAUNCHING_JOB_NUMS_1,CPU_USAGE_2,LAUNCHING_JOB_NUMS_2,CPU_USAGE_3,LAUNCHING_JOB_NUMS_3,CPU_USAGE_4,LAUNCHING_JOB_NUMS_4,CPU_USAGE_5,LAUNCHING_JOB_NUMS_5
0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,0


In [24]:
# 注意: 提交要求预测结果需为非负整数

prediction['CPU_USAGE_next25mins'] = prediction['CPU_USAGE_next25mins'].apply(np.floor)
prediction['CPU_USAGE_next25mins'] = prediction['CPU_USAGE_next25mins'].apply(lambda x: 0 if x<0 else x)
prediction['CPU_USAGE_next25mins'] = prediction['CPU_USAGE_next25mins'].astype(int)
prediction['LAUNCHING_JOB_NUMS_next25mins'] = prediction['LAUNCHING_JOB_NUMS_next25mins'].apply(np.floor)
prediction['LAUNCHING_JOB_NUMS_next25mins'] = prediction['LAUNCHING_JOB_NUMS_next25mins'].apply(lambda x: 0 if x<0 else x)
prediction['LAUNCHING_JOB_NUMS_next25mins'] = prediction['LAUNCHING_JOB_NUMS_next25mins'].astype(int)

prediction

Unnamed: 0,ID,QUEUE_ID,myid,CPU_USAGE_next25mins,LAUNCHING_JOB_NUMS_next25mins
0,1,297,0,11,0
1,1,297,1,12,0
2,1,297,2,19,0
3,1,297,3,95,0
4,1,297,4,83,0
...,...,...,...,...,...
14975,2996,287,14975,2,0
14976,2996,287,14976,5,0
14977,2996,287,14977,5,0
14978,2996,287,14978,4,0


In [25]:
preds = []

for id_ in tqdm(prediction.ID.unique()):
    items = [id_]
    tmp = prediction[prediction.ID == id_].sort_values(by='myid').reset_index(drop=True)
    for i, row in tmp.iterrows():
        items.append(row['CPU_USAGE_next25mins'])
        items.append(row['LAUNCHING_JOB_NUMS_next25mins'])
    preds.append(items)

HBox(children=(FloatProgress(value=0.0, max=2996.0), HTML(value='')))




In [26]:
sub = pd.DataFrame(preds)
sub.columns = sub_sample.columns

sub.head(10)

Unnamed: 0,ID,CPU_USAGE_1,LAUNCHING_JOB_NUMS_1,CPU_USAGE_2,LAUNCHING_JOB_NUMS_2,CPU_USAGE_3,LAUNCHING_JOB_NUMS_3,CPU_USAGE_4,LAUNCHING_JOB_NUMS_4,CPU_USAGE_5,LAUNCHING_JOB_NUMS_5
0,1,11,0,12,0,19,0,95,0,83,0
1,2,18,0,18,0,30,0,38,0,21,0
2,3,17,0,34,0,10,0,13,0,12,0
3,4,13,0,20,0,6,0,11,0,16,0
4,5,5,0,11,0,10,0,12,0,21,0
5,6,8,0,12,0,8,0,10,0,13,0
6,7,10,0,8,0,8,0,12,0,20,0
7,8,0,25,1,31,0,32,0,32,0,14
8,9,3,0,3,0,3,0,3,0,3,0
9,10,14,0,14,0,12,0,8,0,10,0


In [27]:
sub.shape, sub_sample.shape

((2996, 11), (2996, 11))

In [28]:
sub.to_csv('baseline_202010151337.csv', index=False)