In [1]:
import torch
import numpy as np
import pandas as pd
import random
import os
# Seed Python's and NumPy's global RNGs before the remaining imports run.
# NOTE(review): torch is imported above but its RNG is not seeded in this cell.
random.seed(0)
np.random.seed(0)#the seed should be set as early as possible in main, just in case
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] =str(0)#remove the randomness of the hash algorithm
import transformers as _
from transformers1 import BertTokenizer
from Config import TOKENIZERS
from tqdm import tqdm

from NEZHA.configuration_nezha import NeZhaConfig
from NEZHA.modeling_nezha import NeZhaForMaskedLM


2022-04-10 23:36:57.748508: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# Length / batching constants.
# NOTE(review): neither `maxlen` nor `batch_size` is referenced in the visible cells;
# get_bert_feature passes max_length=64 as a literal instead.
maxlen=64
batch_size=64
# Path of the vocabulary file used to build the tokenizer.
vocab_file_dir = './nezha_model/vocab.txt'
tokenizer = BertTokenizer.from_pretrained(vocab_file_dir)

def paddingList(ls:list,val,returnTensor=False,device='cuda'):
    """Right-pad every inner list of ``ls`` with ``val`` up to the longest inner length.

    The input is never modified: a new outer list holding new inner lists is built.

    Args:
        ls: list of lists to pad. An empty outer list is accepted and yields an
            empty result.
        val: filler value appended to the shorter inner lists.
        returnTensor: when true, return a ``torch.Tensor`` instead of a list.
        device: device the tensor is created on when ``returnTensor`` is true.
            Defaults to ``'cuda'``, which is what earlier callers relied on.

    Returns:
        The padded list of lists, or the equivalent tensor on ``device``.
    """
    # default=0 keeps an empty outer list from raising ValueError in max().
    maxLen = max((len(row) for row in ls), default=0)
    padded = [row + [val] * (maxLen - len(row)) for row in ls]
    return torch.tensor(padded, device=device) if returnTensor else padded

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


In [3]:
# NeZha configuration sized to the tokenizer's vocabulary.
# NOTE(review): `config` is not passed to `from_pretrained` below and is not referenced
# again in the visible cells — confirm whether it is still needed.
config = NeZhaConfig(
    vocab_size=len(tokenizer),
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    max_position_embeddings=512,
)

# Load the masked-LM checkpoint from ./nezha_model and move it to the CUDA device.
# NOTE(review): the model is never explicitly switched to eval mode in the visible
# cells — confirm that from_pretrained already does so.
model = NeZhaForMaskedLM.from_pretrained("./nezha_model").to(torch.device('cuda'))

In [4]:
# Rebind `tokenizer` to the class stored under 'NEZHA' in TOKENIZERS, replacing the
# BertTokenizer instance created in an earlier cell (same vocabulary file).
tokenizer = TOKENIZERS['NEZHA'].from_pretrained(vocab_file_dir)



In [5]:
# Read both training-label files and stack them into a single frame; rows that are
# exact duplicates are dropped and the index is renumbered from 0.
label1 = pd.read_csv('../../data/preliminary_train_label_dataset.csv')
label2 = pd.read_csv('../../data/preliminary_train_label_dataset_s.csv')
label_df = (
    pd.concat([label1, label2])
    .reset_index(drop=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [6]:
# Rows to predict for; later cells read its 'sn' and 'fault_time' columns.
submit_df = pd.read_csv('../../data/preliminary_submit_dataset_b.csv')

In [7]:
# Log lines; later cells read its 'sn', 'time' and 'msg' columns.
log_df = pd.read_csv('../../new_src/new_log.csv')

In [8]:
# (frame, timestamp column) pairs that need parsing and an integer epoch companion.
_time_columns = (
    (log_df, 'time'),
    (label_df, 'fault_time'),
    (submit_df, 'fault_time'),
)

# First pass: parse the raw timestamp columns into datetimes.
for frame, col in _time_columns:
    frame[col] = pd.to_datetime(frame[col])

# Second pass: add '<col>_ts', the datetime's int64 value floor-divided by 10**9
# (whole seconds, assuming nanosecond-resolution datetimes — TODO confirm).
for frame, col in _time_columns:
    frame[col + '_ts'] = frame[col].values.astype(np.int64) // 10 ** 9

In [9]:
# def get_bert_feature(text):
#     input_ids, input_masks, segment_ids = [], [], []
#     tkRes = tokenizer(text, max_length=64, truncation='longest_first',
#                            return_attention_mask=False)
#     input_id = tkRes['input_ids']
#     segment_id = tkRes['token_type_ids']
#     input_ids.append(input_id)
#     segment_ids.append(segment_id)

#     input_ids = paddingList(input_ids, 0, returnTensor=True)
#     segment_ids = paddingList(segment_ids, 0, returnTensor=True)
#     input_masks = (input_ids != 0)

#     feature = model(input_ids, input_masks, segment_ids)[0].cpu().detach().numpy()
#     return np.mean(feature, axis=1)

In [10]:
def get_bert_feature(df):
    """Return one feature vector for a frame of log messages.

    Each message in ``df['msg']`` is stripped, tokenized (truncated to 64 tokens)
    and passed through the module-level ``model`` on its own, i.e. as a batch of
    one. The model's first output is averaged over axis 1, and the per-message
    vectors are then averaged over all messages.

    Args:
        df: DataFrame with a string column ``'msg'``. It must contain at least
            one row; callers in this notebook check that before calling.

    Returns:
        1-D ``numpy.ndarray``: the mean of the per-message vectors.
    """
    features = []
    # Inference only: without no_grad() every forward pass would build an autograd
    # graph that is thrown away immediately.
    with torch.no_grad():
        for text in df['msg']:
            tkRes = tokenizer(text.strip(), max_length=64, truncation='longest_first',
                              return_attention_mask=False)
            # Wrap in a list so the tensors carry a leading batch axis of size 1.
            input_ids = paddingList([tkRes['input_ids']], 0, returnTensor=True)
            segment_ids = paddingList([tkRes['token_type_ids']], 0, returnTensor=True)
            # Treat id 0 as padding when building the mask.
            input_masks = (input_ids != 0)

            output = model(input_ids, input_masks, segment_ids)[0]
            # Average over axis 1 (presumably the token axis — TODO confirm).
            features.append(output.cpu().detach().numpy().mean(axis=1))
    features = np.array(features)
    # Drop the size-1 batch axis: (n_messages, 1, dim) -> (n_messages, dim).
    features = features.reshape(features.shape[0], features.shape[2])
    return np.mean(features, axis=0)

In [14]:
# Build one feature vector per labelled fault from the log lines that share its `sn`
# and were logged at or before the fault time.
tr_data = []
for idx, row in tqdm(label_df.iterrows()):
    sn, fault_time = row['sn'], row['fault_time']
    ts, label = row['fault_time_ts'], row['label']

    # Matching log lines ordered by time; only the 20 latest are kept.
    keep = (log_df['sn'] == sn) & (log_df['time_ts'] <= ts)
    df = log_df[keep].sort_values(by='time_ts').reset_index(drop=True).tail(20).copy()

    # Faults without any matching log line get an all-zero placeholder vector.
    tr_data.append(get_bert_feature(df) if df.shape[0] > 0 else np.zeros(465,))

36it [00:10,  3.34it/s]


KeyboardInterrupt: 

In [15]:
# Stack the per-fault feature vectors collected in `tr_data` into one array.
# NOTE(review): the saved output of the loop that fills `tr_data` ends in a
# KeyboardInterrupt, so `tr_data` may have been incomplete when this cell ran.
train_feature = np.array(tr_data)

In [None]:
# Build one feature vector per submission row from the log lines that share its `sn`
# and were logged at or before the fault time.
te_data = []
for idx, row in tqdm(submit_df.iterrows()):
    sn = row['sn']
    fault_time = row['fault_time']
    ts = row['fault_time_ts']

    # Matching log lines ordered by time; only the 20 latest are kept.
    df = log_df[log_df['sn'] == sn].copy()
    df = df[df['time_ts'] <= ts].copy()
    df = df.sort_values(by='time_ts').reset_index(drop=True)
    df = df.tail(20).copy()

    if df.shape[0] > 0:
        te_data.append(get_bert_feature(df))
    else:
        # The placeholder must be as wide as the training placeholder and the 465
        # 'bert_%d' column names assigned to test_df; the previous width of 461
        # made the stacked array ragged.
        te_data.append(np.zeros(465,))

In [None]:
# Stack the per-row feature vectors collected in `te_data` into one array.
test_feature = np.array(te_data)

In [None]:
# Collect label, sn and fault_time of every training row, in label_df row order.
label_data, sn_data, fault_time_data = [], [], []
for idx, row in tqdm(label_df.iterrows()):
    label, sn, fault_time = row['label'], row['sn'], row['fault_time']
    label_data += [label]
    sn_data += [sn]
    fault_time_data += [fault_time]

In [None]:
# Wrap the training features in a frame and name the columns bert_0 ... bert_464.
# The hard-coded 465 must equal the width of train_feature, otherwise the column
# assignment fails.
train_df = pd.DataFrame(train_feature)
train_df.columns = ['bert_%d'%i for i in range(465)]

In [None]:
# Attach the identifying columns and the target, relying on the lists being in the
# same row order as train_feature.
train_df['sn'] = sn_data
train_df['fault_time'] = fault_time_data
train_df['label'] = label_data

In [None]:
# Rebuild sn_data / fault_time_data, this time from the submission rows (the lists
# previously held the training values).
sn_data, fault_time_data = [], []
for idx, row in tqdm(submit_df.iterrows()):
    sn, fault_time = row['sn'], row['fault_time']
    sn_data += [sn]
    fault_time_data += [fault_time]

In [None]:
# Wrap the test features in a frame and name the columns bert_0 ... bert_464.
# The hard-coded 465 must equal the width of test_feature, otherwise the column
# assignment fails.
test_df = pd.DataFrame(test_feature)
test_df.columns = ['bert_%d'%i for i in range(465)]

In [None]:
# Attach the identifying columns; sn_data / fault_time_data must hold the submission
# values (not the training ones) when this cell runs.
test_df['sn'] = sn_data
test_df['fault_time'] = fault_time_data

In [None]:
# Persist the feature frames to the working directory, without the row index.
train_df.to_csv('train5.csv', index=False)
test_df.to_csv('test5.csv', index=False)

In [23]:
def macro_f1(y_true, y_pred) -> float:
    """Compute the score: a weighted sum of per-class F1 over classes 0..3.

    Class ``i`` contributes ``weights[i] * F1_i`` with weights 3/7, 2/7, 1/7, 1/7.
    Precision, recall and F1 are each taken as 0 when their denominator is 0.

    :param y_true: sequence of ground-truth class ids
    :param y_pred: sequence of predicted class ids, aligned with ``y_true``
    :return: the weighted F1 score
    """
    weights = [3 / 7, 2 / 7, 1 / 7, 1 / 7]
    overall_df = pd.DataFrame([y_true, y_pred]).T
    overall_df.columns = ['label_gt', 'label_pr']

    macro_F1 = 0.
    for i, weight in enumerate(weights):
        is_gt = overall_df['label_gt'] == i
        is_pr = overall_df['label_pr'] == i
        TP = int((is_gt & is_pr).sum())
        FP = int((~is_gt & is_pr).sum())
        FN = int((is_gt & ~is_pr).sum())
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        # Guard on recall's own denominator; guarding on TP + FP divided by zero
        # whenever a class was predicted but never occurred in y_true.
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        F1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        macro_F1 += weight * F1
    return macro_F1

In [24]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

In [26]:
# Training targets as an array so they can be indexed with the fold index arrays.
train_label = np.array(label_data)

In [27]:
# Stratified K-fold LightGBM training. Each fold's model fills the out-of-fold
# rows of `val_preds` and adds its share to the fold-averaged test prediction.
n_splits = 5   # single source of truth for the fold count and the averaging below
n_classes = 4  # number of target classes; also the width of the probability arrays

preds = np.zeros((test_feature.shape[0], n_classes))
val_preds = np.zeros((train_feature.shape[0], n_classes))

kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
for train_index, test_index in kf.split(train_feature, train_label):
    xtrain, ytrain = train_feature[train_index], train_label[train_index]
    xtest, ytest = train_feature[test_index], train_label[test_index]

    dtrain = lgb.Dataset(xtrain, label=ytrain)
    dvalid = lgb.Dataset(xtest, label=ytest)
    param = {
        'objective': 'multiclass',
        'num_class': n_classes,
        'metric': 'multi_logloss',
        'early_stopping_rounds': 20,
        'learning_rate': 0.03,
        'random_state': 42
    }
    # num_boost_round is only an upper bound; early stopping is expected to end
    # training long before it is reached.
    gbm = lgb.train(
        param, dtrain, valid_sets=[dtrain, dvalid], num_boost_round=100000, verbose_eval=10
    )

    val_preds[test_index] = gbm.predict(xtest)
    # Dividing by the fold count makes `preds` the mean over all fold models.
    preds += gbm.predict(test_feature) / n_splits

# Out-of-fold score over the whole training set.
print(macro_f1(train_label, np.argmax(val_preds, axis=1)))



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 117503
[LightGBM] [Info] Number of data points in the train set: 13283, number of used features: 461
[LightGBM] [Info] Start training from score -2.420971
[LightGBM] [Info] Start training from score -1.589536
[LightGBM] [Info] Start training from score -0.577466
[LightGBM] [Info] Start training from score -1.925345
Training until validation scores don't improve for 20 rounds
[10]	training's multi_logloss: 0.824577	valid_1's multi_logloss: 0.850534
[20]	training's multi_logloss: 0.658913	valid_1's multi_logloss: 0.70363
[30]	training's multi_logloss: 0.558378	valid_1's multi_logloss: 0.617978
[40]	training's multi_logloss: 0.492121	valid_1's multi_logloss: 0.566407
[50]	training's multi_logloss: 0.445785	valid_1's multi_logloss: 0.533293
[60]	training's multi_logloss: 0.411038	valid_1's multi_logloss: 0.510574
[70]	training's multi_logloss: 0.384668	valid_1's multi_logloss: 0.495853
[80]	training's mu

[10]	training's multi_logloss: 0.826315	valid_1's multi_logloss: 0.845512
[20]	training's multi_logloss: 0.661358	valid_1's multi_logloss: 0.695768
[30]	training's multi_logloss: 0.561927	valid_1's multi_logloss: 0.609819
[40]	training's multi_logloss: 0.496067	valid_1's multi_logloss: 0.555868
[50]	training's multi_logloss: 0.450284	valid_1's multi_logloss: 0.52148
[60]	training's multi_logloss: 0.416412	valid_1's multi_logloss: 0.498558
[70]	training's multi_logloss: 0.39055	valid_1's multi_logloss: 0.483038
[80]	training's multi_logloss: 0.369342	valid_1's multi_logloss: 0.472462
[90]	training's multi_logloss: 0.35188	valid_1's multi_logloss: 0.464482
[100]	training's multi_logloss: 0.337195	valid_1's multi_logloss: 0.45957
[110]	training's multi_logloss: 0.324496	valid_1's multi_logloss: 0.456352
[120]	training's multi_logloss: 0.313371	valid_1's multi_logloss: 0.454789
[130]	training's multi_logloss: 0.303343	valid_1's multi_logloss: 0.453951
[140]	training's multi_logloss: 0.2939

In [23]:
# Use the arg-max class of the fold-averaged probabilities as the predicted label.
submit_df['label'] = np.argmax(preds, axis=1)
# index=0 is falsy, so the row index is not written to the CSV.
# NOTE(review): the output name ends in '_a' while the submission input file read
# earlier ends in '_b' — confirm the intended file name.
submit_df[['sn', 'fault_time', 'label']].to_csv('./preliminary_pred_a.csv', index=0)

In [24]:
0.5786237446986044

0.5786237446986044