## 1. 引入包

In [2]:
import os
import nltk
# nltk.download('punkt')
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Fetch the 'punkt' tokenizer data used by word_tokenize; the captured output
# below shows the download is skipped when the package is already up to date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jinca\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 2. 读取数据

In [4]:
# Load the SEL log table, order it by server serial number and then by
# timestamp, and renumber the index so it is contiguous after sorting.
sel_data = (
    pd.read_csv('./pre_contest/dataset/preliminary_sel_logbdataset.csv')
    .sort_values(by=['sn', 'time'])
    .reset_index(drop=True)
)

In [5]:
# Number of log rows recorded for each server.
sel_data.groupby('sn').size()

sn
SERVER_10001      9
SERVER_10003    117
SERVER_10008      9
SERVER_10009      4
SERVER_10012      4
               ... 
SERVER_999        4
SERVER_9991       3
SERVER_9993       2
SERVER_9998       2
SERVER_9999      16
Length: 13705, dtype: int64

## 3. 分词

In [6]:
# Take the last ten log messages of every server and join them into one document.
# The table is grouped once instead of being re-filtered with a boolean mask for
# each server, which made this cell quadratic in the number of servers.
# sort=False keeps servers in first-appearance order, i.e. the same order that
# drop_duplicates(keep='first') produced before.
# NOTE(review): groupby drops rows whose 'sn' is missing; the captured output
# shows 13705 groups and 13705 documents, so no such rows are expected here.
tail_msg_by_sn = (
    sel_data.groupby('sn', sort=False)['msg']
    .apply(lambda msgs: '.'.join(msgs.tail(10).to_list()))
)
sn_list = tail_msg_by_sn.index.to_list()
tail_msg_list = tail_msg_by_sn.to_list()
# Lower-case each document before tokenizing it.
tokenized_sent = [word_tokenize(doc.lower()) for doc in tail_msg_list]

In [8]:
# Sanity check: there should be exactly one token list per server.
len(tokenized_sent), len(sn_list)

(13705, 13705)

## 4. 训练embedding模型（Doc2Vec）

In [12]:
# Wrap every tokenized document with a single integer tag equal to its position
# in tokenized_sent, then fit a 10-dimensional Doc2Vec model on the corpus.
tagged_data = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(tokenized_sent)
]
model = Doc2Vec(
    documents=tagged_data,
    vector_size=10,
    window=2,
    min_count=1,
    epochs=10,
)

In [14]:
import pickle

# Serialize the trained Doc2Vec model to disk. The context manager closes (and
# therefore flushes) the file even if pickling raises; previously the handle was
# opened and never closed.
with open('./pre_contest/Doc2Vec_models/modelv1_presellog.p', 'wb') as model_file:
    pickle.dump(model, model_file)

## 5. 构建树模型的训练集

In [16]:
# Load the fault labels ordered by server and fault time, with a fresh index.
label = (
    pd.read_csv('./pre_contest/dataset/preliminary_train_label_dataset.csv')
    .sort_values(by=['sn', 'fault_time'])
    .reset_index(drop=True)
)

# Split the log table by server once, so each label row only filters that
# server's logs instead of scanning the whole table (the per-row boolean mask
# over sel_data made this loop quadratic).
logs_by_sn = {sn: sn_logs for sn, sn_logs in sel_data.groupby('sn')}

train_data = []
for sn, fault_time in zip(label['sn'], label['fault_time']):
    sn_logs = logs_by_sn.get(sn)
    if sn_logs is None:
        # No log rows at all for this server: same empty document as before.
        recent_msgs = []
    else:
        # Last ten messages logged at or before the fault time.
        # NOTE(review): assumes 'time' and 'fault_time' are directly comparable
        # (same type/format) - confirm against the raw CSVs.
        recent_msgs = sn_logs.loc[sn_logs['time'] <= fault_time, 'msg'].tail(10).to_list()
    # Same '.' separator and lower-casing as the Doc2Vec training corpus.
    tokens = word_tokenize('.'.join(recent_msgs).lower())
    train_data.append(model.infer_vector(tokens))

train_feature = np.array(train_data)
train_label = label['label'].values

In [22]:
# Persist the inferred feature matrix and the matching label vector as .npy files.
for npy_path, array in (
    ('./pre_contest/Doc2Vec_models/train_dat.v1.npy', train_feature),
    ('./pre_contest/Doc2Vec_models/train_labels.v1.npy', train_label),
):
    np.save(npy_path, array)

## 6. 训练随机森林

In [None]:
# performance evaluation
def macro_f1(target_df: pd.DataFrame,  submit_df: pd.DataFrame)  -> float:
    """
    Compute the score as a weighted sum of per-class F1 values.

    The ground truth is left-joined with the predictions on
    ``['sn', 'fault_time']``; ground-truth rows without a matching prediction
    get the placeholder prediction ``-1`` and therefore count as false
    negatives. Precision, recall and F1 are printed for every class.

    :param target_df: ground truth with columns [sn, fault_time, label]
    :param submit_df: predictions with columns [sn, fault_time, label]
    :return: sum over labels 0..3 of weight * F1, with weights 3/7, 2/7, 1/7, 1/7
    """

    weights = [3 / 7, 2 / 7, 1 / 7, 1 / 7]
    overall_df = target_df.merge(submit_df, how='left', on=['sn', 'fault_time'], suffixes=('_gt', '_pr'))
    # fillna returns a new object; the previous bare `overall_df.fillna(-1)` call
    # discarded it and was a no-op. Only the prediction column can gain NaN from
    # the left join, so fill just that column.
    overall_df['label_pr'] = overall_df['label_pr'].fillna(-1)

    label_gt = overall_df['label_gt']
    label_pr = overall_df['label_pr']

    macro_F1 = 0.
    for i, weight in enumerate(weights):
        is_gt = label_gt == i
        is_pr = label_pr == i
        TP = int((is_gt & is_pr).sum())
        FP = int((~is_gt & is_pr).sum())
        FN = int((is_gt & ~is_pr).sum())
        # Guard every ratio against an empty denominator.
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        F1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        macro_F1 += weight * F1

        print('Task %d:\n Prcesion %.2f, Recall %.2f, F1 %.2f' % (i+1, precision, recall, F1))

    return macro_F1

In [37]:
# validation dataset
import random

# Hold out roughly 20% of the samples for validation. A dedicated, seeded
# generator makes the split reproducible across kernel restarts without
# touching the global `random` state; the split was previously unseeded.
split_rng = random.Random(42)
val_mask = [split_rng.random() < 0.2 for _ in range(len(train_label))]
train_mask = [not is_val for is_val in val_mask]

val_feature = train_feature[val_mask]
val_label = train_label[val_mask]

train_feat = train_feature[train_mask]
train_lab = train_label[train_mask]

In [38]:
# Random forest with out-of-bag scoring enabled, fitted on the training split only.
rf = RandomForestClassifier(oob_score=True)
rf.fit(X=train_feat, y=train_lab)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [39]:
# Predict a label for every labelled sample (training and validation rows alike).
train_label_pred = rf.predict(train_feature)
# .copy() yields an independent frame. Assigning a new column onto the bare
# column-selection of `label` raises pandas' SettingWithCopyWarning and leaves it
# ambiguous whether a view or a copy is being written to.
submit_df = label[['sn', 'fault_time']].copy()
submit_df['label'] = train_label_pred

In [40]:
# Score the predictions on the held-out validation rows only.
macro_f1(label.loc[val_mask], submit_df.loc[val_mask])

Task 1:
 Prcesion 0.46, Recall 0.07, F1 0.13
Task 2:
 Prcesion 0.53, Recall 0.53, F1 0.53
Task 3:
 Prcesion 0.78, Recall 0.93, F1 0.85
Task 4:
 Prcesion 0.56, Recall 0.44, F1 0.50


0.3990924560173966

In [41]:
# Score the predictions on the rows selected by train_mask, for comparison with
# the validation score.
macro_f1(label.loc[train_mask], submit_df.loc[train_mask])

Task 1:
 Prcesion 1.00, Recall 1.00, F1 1.00
Task 2:
 Prcesion 1.00, Recall 1.00, F1 1.00
Task 3:
 Prcesion 1.00, Recall 1.00, F1 1.00
Task 4:
 Prcesion 1.00, Recall 0.99, F1 0.99


0.9979683096583338

## 7. 构建测试集

In [25]:
# Load the submission template ordered by server and fault time, with a fresh index.
submit = (
    pd.read_csv('./pre_contest/dataset/preliminary_submit_dataset_a.csv')
    .sort_values(by=['sn', 'fault_time'])
    .reset_index(drop=True)
)

# Split the log table by server once, so each submission row only filters that
# server's logs instead of scanning the whole table.
logs_by_sn = {sn: sn_logs for sn, sn_logs in sel_data.groupby('sn')}

test_data = []
for sn, fault_time in zip(submit['sn'], submit['fault_time']):
    sn_logs = logs_by_sn.get(sn)
    if sn_logs is None:
        # No log rows at all for this server: infer from an empty document.
        recent_msgs = []
    else:
        # Last ten messages logged at or before the fault time.
        # NOTE(review): assumes 'time' and 'fault_time' are directly comparable
        # (same type/format) - confirm against the raw CSVs.
        recent_msgs = sn_logs.loc[sn_logs['time'] <= fault_time, 'msg'].tail(10).to_list()
    # Join with '.' (no trailing space), exactly like the Doc2Vec corpus and the
    # training features. The previous '. ' separator tokenized message boundaries
    # differently, so test vectors were inferred from a different token
    # distribution than the one the classifier was trained on.
    tokens = word_tokenize('.'.join(recent_msgs).lower())
    test_data.append(model.infer_vector(tokens))

test_feature = np.array(test_data)

## 8. 预测并保存结果

In [26]:
# Classify the test features, attach the predictions as the 'label' column and
# write the submission file without the index column.
test_label = rf.predict(test_feature)
submit = submit.assign(label=test_label)
submit.to_csv('./pre_contest/output/preliminary_pred_df.v1.csv', index=False)