In [1]:
import pandas as pd
import numpy as np
import os
import catboost as cat
import warnings 
warnings.filterwarnings("ignore") 
import random
from datetime import timedelta

In [2]:
# Folder holding the raw CSV files; etl_kernel and getLabel join it with the file names below.
PARENT_FOLDER = '../data'


# File names of the training split.
kernel_train_path = 'memory_sample_kernel_log_round1_a_train.csv'
failure_train_path = 'memory_sample_failure_tag_round1_a_train.csv'
# NOTE(review): "adress" is misspelled; the name is kept because code outside this view may use it.
adress_train_path = 'memory_sample_address_log_round1_a_train.csv'


# File names of the test split.
kernel_test_path = 'memory_sample_kernel_log_round1_b_test.csv'
failure_test_path = 'memory_sample_failure_tag_round1_b_test.csv'
adress_test_path = 'memory_sample_address_log_round1_b_test.csv'

In [3]:
def label(x):
    """Turn a time-to-failure in seconds into a binary label.

    Returns 1 when ``x`` is at most seven days (604800 seconds), else 0.
    Negative values therefore also give 1, and NaN gives 0 because the
    comparison evaluates to False.
    """
    seven_days_in_seconds = 60 * 60 * 24 * 7
    return 1 if x <= seven_days_in_seconds else 0

In [4]:
# Presumably the names of the kernel-log event columns (several of them reappear in `feats`).
# The list itself is not referenced by any other cell visible in this notebook.
# NOTE(review): '_hwerr_cd' is the only entry without a numeric prefix — confirm it matches the CSV header.
kernel_var = ['1_hwerr_f', '1_hwerr_e', '2_hwerr_c', '2_sel', '3_hwerr_n', '2_hwerr_s', '3_hwerr_m', '1_hwerr_st',
       '1_hw_mem_c', '3_hwerr_p', '2_hwerr_ce', '3_hwerr_as', '1_ke', '2_hwerr_p', '3_hwerr_kp', '1_hwerr_fl', '3_hwerr_r', '_hwerr_cd',
       '3_sup_mce_note', '3_cmci_sub', '3_cmci_det', '3_hwerr_pi', '3_hwerr_o', '3_hwerr_mce_l']

In [5]:
def etl_kernel(path, agg_time, time):
    """Load a kernel-log CSV and sum it into fixed-width time buckets.

    Parameters
    ----------
    path : str
        File name, resolved relative to ``PARENT_FOLDER``.
    agg_time : str
        Frequency string handed to ``Series.dt.floor`` (e.g. ``'2min'``).
    time : str
        Suffix of the row-count column, which is named ``'count' + time``.

    Returns
    -------
    pandas.DataFrame
        One row per (serial_number, manufacturer, vendor, floored
        collect_time) with the remaining columns summed.
    """
    kernel_log = pd.read_csv(os.path.join(PARENT_FOLDER, path))
    kernel_log['collect_time'] = pd.to_datetime(kernel_log['collect_time']).dt.floor(agg_time)
    # Every raw row contributes 1, so the summed column is the row count per bucket.
    kernel_log['count' + time] = 1

    # Files that carry a 'tag' column also carry 'failure_time'; both are dropped before summing.
    if 'tag' in kernel_log.columns:
        kernel_log = kernel_log.drop(columns=['tag', 'failure_time'])

    bucket_keys = ['serial_number', 'manufacturer', 'vendor', 'collect_time']
    return kernel_log.groupby(bucket_keys, as_index=False).agg('sum')

In [6]:
def getLabel(label_path, kernel_df):
    """Attach failure times and a binary label to aggregated log rows.

    Parameters
    ----------
    label_path : str
        Failure-tag CSV file name, resolved relative to ``PARENT_FOLDER``.
    kernel_df : pandas.DataFrame
        Rows with serial_number, manufacturer, vendor and a datetime
        ``collect_time`` column.

    Returns
    -------
    pandas.DataFrame
        ``kernel_df`` left-joined with ``failure_time`` plus two new columns:
        ``diff_seconds`` (whole seconds from collect_time to failure_time) and
        ``label`` (the result of ``label(diff_seconds)``). Rows without a
        failure record get NaN ``diff_seconds`` and therefore label 0.
    """
    tags = pd.read_csv(os.path.join(PARENT_FOLDER, label_path))
    tags['failure_time'] = pd.to_datetime(tags['failure_time'])

    server_keys = ['serial_number', 'manufacturer', 'vendor']
    labelled = kernel_df.merge(tags[server_keys + ['failure_time']], how='left', on=server_keys)

    # days * 86400 + seconds gives the whole-second difference (sub-second part ignored).
    time_to_failure = labelled['failure_time'] - labelled['collect_time']
    labelled['diff_seconds'] = time_to_failure.dt.days * 24 * 60 * 60 + time_to_failure.dt.seconds
    labelled['label'] = labelled['diff_seconds'].map(label)
    return labelled

In [7]:
## Scoring function, taken from score_func_round2 on the PAKDD2021 forum (link in the readme)
def score_func_round2(sub_df, cur_failure_tag, verbose=False):
    '''
    Score a set of failure predictions with the competition's F1-style metric.

    Note: cur_failure_tag must cover the same time window as the submissions.
    Each submission needs a collect_time, the minute at which the prediction
    was made, as a pd.Timestamp, e.g.
    {"serial_number": server_1, "pti": 14, 'collect_time': Timestamp('2019-08-01 05:18:00')}
    {"serial_number": server_123, "pti": 1200, 'collect_time': Timestamp('2019-08-02 00:08:00')}

    Parameters
    ----------
    sub_df : pandas.DataFrame
        Columns serial_number, pti (predicted minutes until failure) and
        collect_time.
    cur_failure_tag : pandas.DataFrame
        Columns serial_number and failure_time (datetime).
    verbose : bool
        When True, print the intermediate counts.

    Returns
    -------
    float or int
        The F1 value; 0 when sub_df is empty or when precision + recall is 0.
        Precision is taken as 0 when no submission survives filtering, and
        recall as 0 when cur_failure_tag is empty (both used to raise
        ZeroDivisionError).
    '''
    if sub_df.empty:
        print("[Warning] sub num 0")
        return 0
    # Remove invalid submissions: those made after the failure (ati < 0).
    # ati = actual minutes from the prediction to the failure; NaN when the server never failed.
    sub_df = sub_df.join(cur_failure_tag.set_index('serial_number')['failure_time'], how='left', on='serial_number')
    sub_df['ati'] = (sub_df['failure_time']-sub_df['collect_time'])/pd.Timedelta('1min')
    sub_df = sub_df[(sub_df['ati']>=0)|(sub_df['ati'].isna())]

    # Keep only the first submission of each 7-day period per server.
    sub_df = sub_df.sort_values(by=['serial_number', 'collect_time'])
    pre_ser = -1
    # Sentinel "previous time"; a server's first submission is kept only if it is
    # at least 7 days after this date.
    init_pre_time = pd.to_datetime('2018-12-01')
    window_time = pd.Timedelta('7D')
    pre_time = init_pre_time
    judge = []
    for sn, cur_time in sub_df[['serial_number', 'collect_time']].values:
        if pre_ser != sn:
            pre_time = init_pre_time
        if (cur_time-pre_time) < window_time:
            judge.append(False)
        else:
            judge.append(True)
            pre_time = cur_time
        pre_ser = sn
    # Explicit bool dtype so an empty list still yields a valid boolean mask.
    keep = np.array(judge, dtype=bool)
    sub_df = sub_df[keep].reset_index(drop=True)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    # Score
    n_pp = len(sub_df)            # number of retained predictions
    n_pr = len(cur_failure_tag)   # number of real failures

    n_tpr = 0
    n_tpp = 0
    for sn, pti, ati in sub_df[['serial_number', 'pti', 'ati']].values:
        if pd.notna(ati):
            if 0 <= pti < 7*24*60:  # upper bound marked "to be confirmed" in the original
                if pti <= ati:
                    # pti == ati == 0 would divide by zero; the prediction matches
                    # the actual interval exactly, so the ratio is taken as 1.
                    ratio = pti/ati if ati != 0 else 1.0
                    n_tpp += sigmoid(ratio)
            if ati < 7*24*60:
                n_tpr += 1

    # Guard both denominators: every submission may have been filtered out,
    # and the failure table may be empty.
    precision = n_tpp/n_pp if n_pp else 0
    recall = n_tpr/n_pr if n_pr else 0
    if (precision+recall) == 0:
        f1 = 0
    else:
        f1 = 2*(precision*recall)/(precision+recall)
    if verbose:
        print(f'n_tpp: {n_tpp}, n_pp: {n_pp}, precision: {precision}, n_tpr: {n_tpr}, n_pr: {n_pr}, recall: {recall}, f1: {f1}')
    return f1

In [8]:
# Failure tags of the training split, with failure_time parsed to datetime.
# The path is built from PARENT_FOLDER and failure_train_path so the data location
# is configured in a single place instead of being hard-coded again here.
failure_tag = pd.read_csv(os.path.join(PARENT_FOLDER, failure_train_path))
failure_tag['failure_time'] = pd.to_datetime(failure_tag['failure_time'])

In [9]:
# Failure tags of the test split, with failure_time parsed to datetime.
# The path is built from PARENT_FOLDER and failure_test_path so the data location
# is configured in a single place instead of being hard-coded again here.
failure_data = pd.read_csv(os.path.join(PARENT_FOLDER, failure_test_path))
failure_data['failure_time'] = pd.to_datetime(failure_data['failure_time'])

## 提取kernel表特征

In [15]:
# Aggregate the kernel logs into 2-minute buckets. The first '2min' is the floor
# frequency, the second is the suffix that names the row-count column 'count2min'.
kernel_train_2min = etl_kernel(kernel_train_path,'2min','2min')
kernel_test_2min = etl_kernel(kernel_test_path,'2min','2min')

## 提取mce表特征

In [11]:
# Raw MCE logs for the training (a) and test (b) splits.
# Paths are built from PARENT_FOLDER, like the other raw tables, instead of
# repeating the hard-coded '../data' prefix.
mce_a = pd.read_csv(os.path.join(PARENT_FOLDER, "memory_sample_mce_log_round1_a_train.csv"))
mce_b = pd.read_csv(os.path.join(PARENT_FOLDER, "memory_sample_mce_log_round1_b_test.csv"))

In [12]:
def get_mce_data(df_mce, agg_time, time):
    """Build per-bucket MCE features from a raw MCE log.

    Parameters
    ----------
    df_mce : pandas.DataFrame
        Raw log with serial_number, manufacturer, vendor, collect_time,
        mca_id and transaction columns. It is copied, not modified.
    agg_time : str
        Frequency string handed to ``Series.dt.floor`` (e.g. ``'2min'``).
    time : str
        Suffix appended to every generated column name.

    Returns
    -------
    pandas.DataFrame
        One row per (serial_number, manufacturer, vendor, floored
        collect_time) with all other columns summed: the ``mca_<id><time>``
        and ``trans_<n><time>`` indicator columns become per-bucket counts and
        ``count_mce_<time>`` is the number of raw rows in the bucket.
    """
    mce = df_mce.copy()

    # One 0.0/1.0 indicator column per tracked mca_id value.
    for mca_code in ("Z", "AP", "G", "F", "BB", "E", "CC", "AF", "AE"):
        mce['mca_' + mca_code + time] = mce['mca_id'].eq(mca_code).astype("float")
    # Same for the transaction codes 0..3.
    for transaction_code in range(4):
        mce['trans_' + str(transaction_code) + time] = mce['transaction'].eq(transaction_code).astype("float")

    mce['collect_time'] = pd.to_datetime(mce['collect_time']).dt.floor(agg_time)
    mce['count_mce_' + time] = 1

    bucket_keys = ['serial_number', 'manufacturer', 'vendor', 'collect_time']
    return mce.groupby(bucket_keys, as_index=False).agg('sum')

In [13]:
# MCE features in 2-minute buckets; the second '2min' is the suffix used in the
# generated column names (e.g. 'mca_Z2min', 'count_mce_2min').
mce_train_2min = get_mce_data(mce_a,'2min','2min')
mce_test_2min = get_mce_data(mce_b,'2min','2min')

In [16]:
# Left-join the MCE features onto the kernel buckets: every kernel bucket is kept,
# and buckets with no matching MCE row get NaN in the MCE columns.
train = pd.merge(kernel_train_2min, mce_train_2min,how='left',on=['serial_number','manufacturer','vendor','collect_time'])
test = pd.merge(kernel_test_2min, mce_test_2min,how='left',on=['serial_number','manufacturer','vendor','collect_time'])

In [17]:
# Per-server summary of the 2-minute row counts: gather each server's counts into
# a list, then take their mean, median and sum and attach them to every train row.
# NOTE(review): the statistics span every bucket of a server in the file and are
# merged on serial_number only, so each row also reflects counts from later timestamps.
# NOTE(review): this cell reassigns `train`; running it twice makes the merge add suffixed duplicate columns.
kernel_train_2min_sta = kernel_train_2min[['serial_number','count2min']].groupby(['serial_number'],as_index=False).agg(list)
kernel_train_2min_sta['mean_2min'] = kernel_train_2min_sta['count2min'].apply(lambda x: np.mean(x))
kernel_train_2min_sta['median_2min'] = kernel_train_2min_sta['count2min'].apply(lambda x: np.median(x))
kernel_train_2min_sta['sum_2min'] = kernel_train_2min_sta['count2min'].apply(lambda x: sum(x))
train = pd.merge(train,kernel_train_2min_sta[['serial_number','mean_2min','median_2min','sum_2min']],how='left',on=['serial_number'])

In [18]:
# Same per-server summary (mean, median, sum of the 2-minute row counts) for the test split.
# NOTE(review): the statistics span every bucket of a server in the file and are
# merged on serial_number only, so each row also reflects counts from later timestamps.
# NOTE(review): this cell reassigns `test`; running it twice makes the merge add suffixed duplicate columns.
kernel_test_2min_sta = kernel_test_2min[['serial_number','count2min']].groupby(['serial_number'],as_index=False).agg(list)
kernel_test_2min_sta['mean_2min'] = kernel_test_2min_sta['count2min'].apply(lambda x: np.mean(x))
kernel_test_2min_sta['median_2min'] = kernel_test_2min_sta['count2min'].apply(lambda x: np.median(x))
kernel_test_2min_sta['sum_2min'] = kernel_test_2min_sta['count2min'].apply(lambda x: sum(x))
test = pd.merge(test,kernel_test_2min_sta[['serial_number','mean_2min','median_2min','sum_2min']],how='left',on=['serial_number'])

## 读取address特征文件

In [20]:
# Address-log statistics, read from the current working directory (not PARENT_FOLDER).
# NOTE(review): presumably written by a separate feature-extraction step — confirm where these files come from.
address_sta_train = pd.read_csv("address_sta_train.csv")
address_sta_test = pd.read_csv("address_sta_test.csv")

In [22]:
# Parse collect_time to datetime so it can serve as a merge key against the
# datetime collect_time column of train/test.
address_sta_train['collect_time'] = pd.to_datetime(address_sta_train['collect_time'])
address_sta_test['collect_time'] = pd.to_datetime(address_sta_test['collect_time'])

In [23]:
# Left-join the row/column address statistics on (serial_number, collect_time).
# NOTE(review): this only matches if the address file uses the same 2-minute buckets — confirm.
train_new = pd.merge(train,address_sta_train[['serial_number', 'collect_time',
       'row_num', 'row_max', 'row_repeat', 'col_num', 'col_max', 'col_repeat',
       ]],how='left',on = ['serial_number','collect_time']) 

In [31]:
# Left-join the row/column address statistics on (serial_number, collect_time).
# NOTE(review): this only matches if the address file uses the same 2-minute buckets — confirm.
test_new = pd.merge(test,address_sta_test[['serial_number', 'collect_time',
       'row_num', 'row_max', 'row_repeat', 'col_num', 'col_max', 'col_repeat',
       ]],how='left',on = ['serial_number','collect_time']) 

In [25]:
# Add failure_time, diff_seconds and label to the training rows.
# NOTE(review): reassigns train_new, so re-running this cell merges failure_time a second time.
train_new = getLabel(failure_train_path, train_new)

In [26]:
# Model input columns: the two server attributes, a subset of the kernel-log columns,
# the MCE features, the per-server count statistics and the address statistics.
# 'mca_F2min' and 'mca_AE2min' are produced by get_mce_data but are not listed here.
feats = ['manufacturer', 'vendor', '1_hwerr_f','1_hwerr_e','2_hwerr_c', '3_hwerr_as','1_ke','3_hwerr_kp','3_sup_mce_note','2_hwerr_p', 'count2min',
       'mca_Z2min', 'mca_AP2min', 'mca_G2min', 'mca_BB2min',
       'mca_E2min', 'mca_CC2min', 'mca_AF2min',  'trans_02min',
       'trans_12min', 'trans_22min', 'trans_32min', 'count_mce_2min',
       'mean_2min', 'median_2min', 'sum_2min',
       'row_num', 'row_max', 'row_repeat', 'col_num', 'col_max', 'col_repeat',
       ]

In [27]:
# Display the number of model features.
len(feats)

32

In [32]:
# Label the test rows with the same helper used for the training rows instead of
# repeating its merge / diff_seconds / label logic inline. getLabel reads the tags
# from PARENT_FOLDER/failure_test_path, the same file that failure_data was loaded from.
# NOTE(review): reassigns test_new, so re-running this cell merges failure_time a second time.
test_new = getLabel(failure_test_path, test_new)

In [33]:
## Combine the training data (January–May, per the original note) with the early-July rows of the test split.
## NOTE(review): the filter keeps 1–21 July (22 July is exclusive), i.e. 21 days rather than the 20 days the original note mentions.
test_7 = test_new[(test_new['collect_time']>=pd.to_datetime('20190701'))&(test_new['collect_time']<pd.to_datetime('20190722'))]
train_concat = pd.concat([train_new,test_7])

## catboost建模

In [34]:
# Classifier for the binary `label` column (fitted in a later cell).
clf_cat = cat.CatBoostClassifier(learning_rate=0.01,iterations=1000,depth=6
                                 ,colsample_bylevel = 0.8
                                )

# Regressor for the minutes-to-failure target; same hyper-parameters as the classifier, with MAPE loss.
clf_r_cat = cat.CatBoostRegressor(learning_rate=0.01,iterations=1000,depth=6,loss_function ='MAPE'
                                  ,colsample_bylevel = 0.8
                                 )

In [35]:
# Down-sample the negatives to 10x the number of positives (fixed random_state for repeatability).
sample_0 = train_concat[train_concat['label']==0].sample(len(train_concat[train_concat['label']==1])*10,random_state=2)
# DataFrame.append was removed in pandas 2.0; pd.concat yields the same frame
# (sampled negatives first, then all positives, original index labels kept).
sample = pd.concat([sample_0, train_concat[train_concat['label']==1]])

In [36]:
# Evaluation window: the 10 days starting 22 July 2019 (end is exclusive).
start = pd.to_datetime('20190722')
end = start + timedelta(days=10)
# .copy() makes test_ an independent frame: later cells assign new columns to it,
# which would otherwise be a chained assignment on a slice of test_new.
test_ = test_new[(test_new['collect_time']>=start)&(test_new['collect_time']<end)].copy()

In [37]:
# Columns passed to CatBoost as categorical features when fitting the classifier.
cat_features = ['manufacturer','vendor']

In [38]:
# Cast vendor to integers with a vectorised astype instead of a per-element lambda.
sample['vendor'] = sample['vendor'].astype('int64')
# test_ was sliced out of test_new; assign() rebinds it to a new frame rather than
# writing into that slice (the resulting warning was being hidden by the global filter).
test_ = test_.assign(vendor=test_['vendor'].astype('int64'))

In [39]:
# Stage 1: classifier. Score every row of the evaluation window and keep those
# whose predicted probability of label 1 is at least 0.8.
clf_cat.fit(sample[feats],sample['label'],verbose=False,cat_features = cat_features)
y_pred = clf_cat.predict_proba(test_[feats])[:,1]
# assign() rebinds test_ to a new frame instead of writing into a slice of test_new.
test_ = test_.assign(cat=y_pred)
# Explicit copies below: each of these filtered frames gets new columns assigned,
# which on a bare slice is a chained assignment (silenced by the global warnings filter).
test_p = test_[test_['cat']>=0.8].copy()

# Stage 2: regressor, fitted on the positive rows only. Target is minutes to failure plus 2.
# NOTE(review): cat_features is not passed here, so manufacturer/vendor are treated
# as numeric by the regressor, unlike the classifier — confirm this is intended.
fail_7 = sample[sample['label']==1].copy()
fail_7['diff_min'] = np.floor(fail_7['diff_seconds']/60)+2
clf_r_cat.fit(fail_7[feats],fail_7['diff_min'],verbose=False)
# y_pred is reused: from here on it holds the regressor output for test_p.
y_pred = clf_r_cat.predict(test_p[feats])
test_p['pti'] = np.ceil(y_pred)
res = test_p[['serial_number','collect_time','pti']]

# Failures inside the evaluation window, scored against the predictions.
fail = failure_data[(failure_data['failure_time']>=start)&(failure_data['failure_time']<end)].copy()
fail['failure_time'] = pd.to_datetime(fail['failure_time'])
score_func_round2(res, fail, verbose=False)

0.32394350408672246

## 对回归结果做一些规则性处理，提高测评分数

In [40]:
# Keep the rounded-up regressor output in its own column before 'pti' is overwritten.
# Relies on y_pred still holding the regressor predictions from the previous cell.
test_p['reg'] = np.ceil(y_pred)
def deal_pti(x):
    """Map a raw regressor output to one of four fixed pti values.

    ``x > 40`` gives 4000, ``x > 30`` gives 200, ``x > 20`` gives 30 and
    everything else (including NaN) gives 10.
    """
    # Checked from the highest threshold down; the first bound that x exceeds wins.
    for lower_bound, fixed_pti in ((40, 4000), (30, 200), (20, 30)):
        if x > lower_bound:
            return fixed_pti
    return 10
        
# Replace pti with the bucketed value from deal_pti and score again against the same failure window.
test_p['pti'] = test_p['reg'].apply(lambda x: deal_pti(x))       
res = test_p[['serial_number','collect_time','pti']]
score_func_round2(res, fail, verbose=False)

0.353634142418905

In [41]:
# Persist the fitted classifier to p1.pickle in the current working directory.
# Only unpickle this file from a trusted source: loading a pickle can execute arbitrary code.
import pickle
with open('p1.pickle','wb')as f:
    pickle.dump(clf_cat,f)

In [42]:
# Persist the fitted regressor to p2.pickle in the current working directory.
# Only unpickle this file from a trusted source: loading a pickle can execute arbitrary code.
import pickle
with open('p2.pickle','wb')as f:
    pickle.dump(clf_r_cat,f)