In [1]:
import warnings
warnings.simplefilter('ignore')

import os
import gc
import glob
import math
import random
from multiprocessing import Pool

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from tqdm.auto import tqdm

from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score
import lightgbm as lgb

In [2]:
DATA_PATH = './data'
FEATURE_PATH = './feature'
WORKER_NUM = 64
NEGATIVE_SAMPLE_RATIO = 0.05

In [3]:
# 内存故障维修记录单 (2024/01/01 - 2024/05/31) 
# 记录的是该内存在这五个月内第一次故障的时间点 
# 如果没出现在记录单内的内存，说明在这五个月内没有发生故障

ticket = pd.read_csv(f'{DATA_PATH}/ticket.csv')
display(ticket)

# 内存 - 第一次故障时间 映射表
ticket_alarm_time_map = ticket[['sn_name', 'alarm_time']].set_index('sn_name').to_dict()['alarm_time']

Unnamed: 0,sn_name,alarm_time,sn_type
0,sn_4191,1704077850,A
1,sn_10692,1704140121,A
2,sn_7219,1704148156,A
3,sn_31281,1704151660,A
4,sn_8854,1704159849,A
...,...,...,...
831,sn_10276,1717118644,A
832,sn_70978,1717127819,B
833,sn_41325,1717149399,A
834,sn_57720,1717169595,A


In [4]:
# 所有内存 sn
all_sn_names = [x.split('/')[-1].replace('.feather', '') for x in glob.glob(f'{DATA_PATH}/type_[AB]/sn_*.feather')]

# 在这五个月内发生过故障的内存 sn
positive_sn_names = ticket['sn_name'].values.tolist()

# 在这五个月内没发生过故障的内存 sn
negative_sn_names = list(set(all_sn_names) - set(positive_sn_names))

assert len(all_sn_names) == len(positive_sn_names) + len(negative_sn_names)

In [5]:
# 这里用回归来做, 毕竟从官方定义来说, 第一次出现故障是最重要的, 需要有个体现
# 线性指标变换: 第一次出现故障为 1, 往后的正样本进行标签衰减, 最小值截断为 0.5

def get_label_by_index(idx):
    label = 1 - (idx / 250) * 0.5
    return max(label, 0.5)

In [6]:
def get_positive_data(sn_name):
    # 获取文件路径
    if os.path.exists(f'{FEATURE_PATH}/type_A/{sn_name}.feather'):
        filepath = f'{FEATURE_PATH}/type_A/{sn_name}.feather'
    else:
        filepath = f'{FEATURE_PATH}/type_B/{sn_name}.feather'
    # 读取文件内容
    df = pd.read_feather(filepath)
    # 添加 sn
    df['sn_name'] = sn_name
    # 从映射表获取该 sn 第一次发生故障的时间点 (每个 sn 都只有一次)
    alarm_time = ticket_alarm_time_map[sn_name]
    # 训练集 timestamp < 1717171200 (2024/06/01 00:00:00) & alarm_time 之后的不要了
    df = df[(df['LogTime'] < 1717171200)&(df['LogTime'] <= alarm_time - 15*60)].sort_values('LogTime').reset_index(drop=True) 
    labels = []
    idx = 0   # 记录离第一次故障信息的相对位置
    for ts in df['LogTime'].values:
        # 落在范围内的才能是正样本 Tl(15minutes)+Tp(7days)
        if ts + 15*60 <= alarm_time <= ts + 15*60 + 7*24*60*60:
            labels.append(get_label_by_index(idx))
            idx += 1
        else:
            labels.append(0)
    df['label'] = labels
    return df

with Pool(WORKER_NUM) as pool:
    res = list(
        tqdm(
            pool.imap(get_positive_data, positive_sn_names),
            total=len(positive_sn_names),
            desc="Generating positive data",
        )
    )

df_positive = pd.concat(res).reset_index(drop=True)
df_positive['label'].value_counts(dropna=False)

Generating positive data:   0%|          | 0/836 [00:00<?, ?it/s]

label
0.000    267598
1.000       796
0.998       685
0.996       631
0.994       601
          ...  
0.672        93
0.670        90
0.668        85
0.666        76
0.664         3
Name: count, Length: 170, dtype: int64

In [7]:
def get_negative_data(sn_name):
    # 获取文件路径
    if os.path.exists(f'{FEATURE_PATH}/type_A/{sn_name}.feather'):
        filepath = f'{FEATURE_PATH}/type_A/{sn_name}.feather'
    else:
        filepath = f'{FEATURE_PATH}/type_B/{sn_name}.feather'
    # 读取文件内容
    df = pd.read_feather(filepath)
    # 添加 sn
    df['sn_name'] = sn_name
    # 负样本只用训练集的最后两个月 训练集 (2024/04/01 00:00:00) 1711900800 < timestamp < 1717171200 (2024/06/01 00:00:00)
    df = df[df['LogTime'] < 1717171200].sort_values('LogTime').reset_index(drop=True)
    df['label'] = 0
    return df

with Pool(WORKER_NUM) as pool:
    res = list(
        tqdm(
            pool.imap(get_negative_data, negative_sn_names),
            total=len(negative_sn_names),
            desc="Generating negative data",
        )
    )

df_negative = pd.concat(res).reset_index(drop=True)

Generating negative data:   0%|          | 0/61388 [00:00<?, ?it/s]

In [8]:
df_data = pd.concat([df_positive, df_negative]).sort_values('sn_name').reset_index(drop=True)
feature_names = [c for c in df_data.columns if c not in ['LogTime', 'sn_name', 'label', 'time_index']]
df_data

Unnamed: 0,time_index,LogTime,window_logs_count,window_read_error_logs_count,window_scrub_error_logs_count,window_burst_count,window_dq_count,window_max_burst_interval,window_max_dq_interval,window_deviceID_nunique,window_ChannelId_nunique,window_BankId_nunique,window_DimmId_nunique,window_ColumnId_nunique,window_RowId_nunique,window_MciAddr_nunique,window_RetryRdErrLogParity_nunique,window_RetryRdErrLog_nunique,window_locale_nunique,window_CellId_nunique,fault_mode_others,fault_mode_device,fault_mode_bank,fault_mode_row,fault_mode_column,fault_mode_cell,window_LogTime_diff_mean,window_LogTime_diff_max,window_LogTime_diff_std,window_burst_count_diff_mean,window_burst_count_diff_max,window_burst_count_diff_std,window_dq_count_diff_mean,window_dq_count_diff_max,window_dq_count_diff_std,sn_name,label
0,474111,1706803167,32,32,0,32,32,0,0,1,1,1,1,1,2,2,1,1,2,2,0,0,1,0,1,1,115.032258,475,102.826658,0.0,0,0.0,0.0,0,0.0,sn_1,0.0
1,473936,1706173164,66,66,0,66,66,0,0,1,1,1,1,1,2,2,1,1,2,2,0,0,1,0,1,1,54.738462,122,40.546754,0.0,0,0.0,0.0,0,0.0,sn_1,0.0
2,473935,1706169563,61,61,0,61,61,0,0,1,1,1,1,1,2,2,1,1,2,2,0,0,1,0,1,1,57.950000,124,40.216674,0.0,0,0.0,0.0,0,0.0,sn_1,0.0
3,473934,1706165967,57,57,0,57,57,0,0,1,1,1,1,1,2,2,1,1,2,2,0,0,1,0,1,1,62.196429,125,40.350438,0.0,0,0.0,0.0,0,0.0,sn_1,0.0
4,473933,1706162367,62,62,0,62,62,0,0,1,1,1,1,1,2,2,1,1,2,2,0,0,1,0,1,1,57.098361,126,36.367300,0.0,0,0.0,0.0,0,0.0,sn_1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14522353,474588,1708518588,1,0,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,-1.000000,-1,-1.000000,-1.0,-1,-1.0,-1.0,-1,-1.0,sn_9999,0.0
14522354,474622,1708642625,31,31,0,31,31,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,45.366667,234,48.903635,0.0,0,0.0,0.0,0,0.0,sn_9999,0.0
14522355,474623,1708645606,40,40,0,40,40,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,68.384615,337,81.961722,0.0,0,0.0,0.0,0,0.0,sn_9999,0.0
14522356,474476,1708116907,7,7,0,7,7,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,307.833333,1194,412.202384,0.0,0,0.0,0.0,0,0.0,sn_9999,0.0


In [9]:
kf = GroupKFold(n_splits=5)
models = []
oof_pred = np.zeros(len(df_data))
for i, (train_index, valid_index) in enumerate(kf.split(df_data, groups=df_data['sn_name'])):
    print(f'Fold {i} ...')
    x_valid = df_data.loc[valid_index, feature_names].copy()
    y_valid = df_data.loc[valid_index, 'label']
    # 训练集负采样
    train_df = df_data.loc[train_index, :].copy()
    pos_df = train_df[train_df['label'] != 0].reset_index(drop=True)
    neg_df = train_df[train_df['label'] == 0].reset_index(drop=True)
    neg_df = neg_df.sample(frac=NEGATIVE_SAMPLE_RATIO, random_state=42).reset_index(drop=True)
    train_df = pd.concat([pos_df, neg_df]).reset_index(drop=True)
    x_train = train_df.loc[:, feature_names].copy()
    y_train = train_df.loc[:, 'label']
    
    model = lgb.LGBMRegressor(
        max_depth=8, 
        num_leaves=64,
        min_child_samples=64,
        n_estimators=500, 
        learning_rate=0.1, 
        verbose=-1
    )
    model.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)]
    )
    oof_pred[valid_index] = model.predict(x_valid)
    models.append(model)
    del model; gc.collect()

Fold 0 ...
Fold 1 ...
Fold 2 ...
Fold 3 ...
Fold 4 ...


In [10]:
df_data['pred'] = oof_pred
df_data['pred'].describe()

count    1.452236e+07
mean     3.420888e-02
std      6.158538e-02
min     -4.086797e-01
25%      1.291183e-02
50%      1.985801e-02
75%      3.130795e-02
max      1.272218e+00
Name: pred, dtype: float64

In [11]:
# sn wise 指标 (不是很高效, 需改进)
def calc_score(df_data, ticket, threshold=0.5):
    result_df = df_data[['sn_name', 'LogTime', 'pred']].merge(ticket[['sn_name', 'alarm_time']], 
                                                              on='sn_name', how='left')
    # 正样本
    pos = result_df[result_df['alarm_time'].notna()].reset_index(drop=True)
    sn_list = pos['sn_name'].unique()
    res = []
    for sn in sn_list:
        tmp = pos[pos['sn_name'] == sn].reset_index(drop=True)
        di = {}
        di['sn_name'] = sn
        flag = 0
        for _, row in tmp.iterrows():
            if (row['LogTime'] + 15*60 <= row['alarm_time'] <= row['LogTime'] + 15*60 + 7*24*60*60) and \
               (row['pred'] >= threshold):
                flag = 1
                break
        di['pred'] = flag
        res.append(di)
    pos = pd.DataFrame(res)
    pos['label'] = 1
    # 负样本
    neg = result_df[result_df['alarm_time'].isna()].reset_index(drop=True)
    neg = neg.groupby('sn_name')['pred'].max().to_frame().reset_index()
    neg['pred'] = neg['pred'].apply(lambda x: 0 if x < threshold else 1)
    neg['label'] = 0
    # 合并算分
    metric_df = pd.concat([pos, neg]).reset_index(drop=True)
    score = f1_score(metric_df['label'], metric_df['pred'])

    return score, metric_df


for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]:
    score, _ = calc_score(df_data, ticket, threshold)
    print(f'threshold: {threshold:.3f} score: {score:.6f}')

threshold: 0.100 score: 0.086236
threshold: 0.200 score: 0.111879
threshold: 0.300 score: 0.159628
threshold: 0.400 score: 0.194745
threshold: 0.500 score: 0.209229
threshold: 0.600 score: 0.203080
threshold: 0.700 score: 0.191868


In [12]:
feature_importances = np.mean([m.feature_importances_ for m in models], axis=0)
importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": feature_importances
}).sort_values(by="Importance", ascending=False)

display(importance_df)

Unnamed: 0,Feature,Importance
25,window_LogTime_diff_max,2348.8
26,window_LogTime_diff_std,2195.2
24,window_LogTime_diff_mean,1896.8
29,window_burst_count_diff_std,1887.2
3,window_burst_count,1795.6
14,window_RetryRdErrLogParity_nunique,1676.0
13,window_MciAddr_nunique,1620.0
4,window_dq_count,1510.2
6,window_max_dq_interval,1276.4
5,window_max_burst_interval,1276.0


In [13]:
# 测试集
def get_test_data(sn_name):
    # 获取文件路径
    if os.path.exists(f'{FEATURE_PATH}/type_A/{sn_name}.feather'):
        filepath = f'{FEATURE_PATH}/type_A/{sn_name}.feather'
    else:
        filepath = f'{FEATURE_PATH}/type_B/{sn_name}.feather'
    # 读取文件内容
    df = pd.read_feather(filepath)
    # 添加 sn
    df['sn_name'] = sn_name
    df = df[df['LogTime'] >= 1717171200].sort_values('LogTime').reset_index(drop=True)
    return df

# 已经故障过的正样本就不用做 infer 了，定义是第一次故障才会记录
with Pool(WORKER_NUM) as pool:
    res = list(
        tqdm(
            pool.imap(get_test_data, negative_sn_names),
            total=len(negative_sn_names),
            desc="Generating test data",
        )
    )

df_test = pd.concat(res).reset_index(drop=True)
df_test

Generating test data:   0%|          | 0/61388 [00:00<?, ?it/s]

Unnamed: 0,time_index,LogTime,window_logs_count,window_read_error_logs_count,window_scrub_error_logs_count,window_burst_count,window_dq_count,window_max_burst_interval,window_max_dq_interval,window_deviceID_nunique,window_ChannelId_nunique,window_BankId_nunique,window_DimmId_nunique,window_ColumnId_nunique,window_RowId_nunique,window_MciAddr_nunique,window_RetryRdErrLogParity_nunique,window_RetryRdErrLog_nunique,window_locale_nunique,window_CellId_nunique,fault_mode_others,fault_mode_device,fault_mode_bank,fault_mode_row,fault_mode_column,fault_mode_cell,window_LogTime_diff_mean,window_LogTime_diff_max,window_LogTime_diff_std,window_burst_count_diff_mean,window_burst_count_diff_max,window_burst_count_diff_std,window_dq_count_diff_mean,window_dq_count_diff_max,window_dq_count_diff_std,sn_name
0,477102,1717569631,1,1,0,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,-1.0,-1,-1.0,-1.0,-1,-1.000000,-1.0,-1,-1.0,sn_14374
1,477218,1717984864,1,1,0,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,-1.0,-1,-1.0,-1.0,-1,-1.000000,-1.0,-1,-1.0,sn_14374
2,477291,1718249567,1,1,0,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,-1.0,-1,-1.0,-1.0,-1,-1.000000,-1.0,-1,-1.0,sn_14374
3,477295,1718262765,1,1,0,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,-1.0,-1,-1.0,-1.0,-1,-1.000000,-1.0,-1,-1.0,sn_14374
4,477296,1718268758,2,2,0,2,2,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,118.0,118,0.0,0.0,0,0.000000,0.0,0,0.0,sn_14374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7234844,478406,1722264133,8,0,8,10,8,2,0,1,1,1,1,4,1,4,3,1,4,4,0,0,1,1,0,1,0.0,0,0.0,0.0,1,0.755929,0.0,0,0.0,sn_41811
7234845,478418,1722306412,4,0,4,7,4,2,0,1,1,1,1,4,1,4,3,1,4,4,0,0,1,1,0,1,0.0,0,0.0,0.0,1,0.816497,0.0,0,0.0,sn_41811
7234846,478430,1722348691,6,0,6,8,6,2,0,1,1,1,1,3,1,3,3,1,3,3,0,0,1,1,0,1,0.0,0,0.0,0.0,1,0.632456,0.0,0,0.0,sn_41811
7234847,478441,1722390970,4,0,4,8,4,6,0,1,1,1,1,4,1,4,4,1,4,4,0,0,1,1,0,1,0.0,0,0.0,0.0,3,2.160247,0.0,0,0.0,sn_41811


In [14]:
%%time

pred_test = np.zeros(len(df_test))
for model in tqdm(models):
    pred_test += model.predict(df_test[feature_names]) / kf.n_splits

  0%|          | 0/5 [00:00<?, ?it/s]

CPU times: user 35min 32s, sys: 7.11 s, total: 35min 39s
Wall time: 1min 14s


In [17]:
# 生成特征时忘了 ...
df_sn_type = pd.concat([
    pd.DataFrame({
        'sn_name': [i.replace('feature/type_A/', '').replace('.feather', '') for i in glob.glob('feature/type_A/*.feather')],
        'serial_number_type': ['A'] * len(glob.glob('feature/type_A/*.feather'))
    }),
    pd.DataFrame({
        'sn_name': [i.replace('feature/type_B/', '').replace('.feather', '') for i in glob.glob('feature/type_B/*.feather')],
        'serial_number_type': ['B'] * len(glob.glob('feature/type_B/*.feather'))
    })
]).reset_index(drop=True)

df_sub = df_test[['sn_name', 'LogTime']].copy()
df_sub = df_sub.merge(df_sn_type, on='sn_name', how='left')
df_sub['pred'] = pred_test

df_sub = df_sub[df_sub['pred'] >= 0.5].reset_index(drop=True)
df_sub.drop('pred', axis=1, inplace=True)
df_sub.columns = ['sn_name','prediction_timestamp','serial_number_type']
print(df_sub['sn_name'].nunique())
display(df_sub)
df_sub.to_csv('submission.csv', index=False)

1051


Unnamed: 0,sn_name,prediction_timestamp,serial_number_type
0,sn_55617,1717332860,A
1,sn_55617,1717417253,A
2,sn_55617,1717670433,A
3,sn_55617,1717923614,A
4,sn_55617,1718008007,A
...,...,...,...
13912,sn_65191,1720832376,A
13913,sn_65191,1720835975,A
13914,sn_65191,1720839575,A
13915,sn_65191,1720840734,A
