# 基于V0P1进行调整，主要调整log和label对应关系，使用XGB

## 1. 引入包

In [29]:
import os
import nltk
# nltk.download('punkt')
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import random
import pickle
import multiprocessing
import re

In [30]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jinca\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 2. 读取数据

In [3]:
# 更改工作目录为当前项目根目录
import sys
import os
os.chdir(os.path.dirname(os.path.dirname(sys.path[0])))
print(os.getcwd())

C:\workfile\python\Log-diagnosis


In [4]:
import pandas as pd
# 读取sel日志数据
sel_log = pd.read_csv('./pre_contest/dataset/preliminary_sel_log_dataset.csv')

# 读取训练标签数据：有重复数据！
train_label1=pd.read_csv('./pre_contest/dataset/preliminary_train_label_dataset.csv')
train_label2=pd.read_csv('./pre_contest/dataset/preliminary_train_label_dataset_s.csv')
train_label=pd.concat([train_label1,train_label2],axis=0).drop_duplicates()

# 读取日志语料数据
additional_sel_log=pd.read_csv('./pre_contest/dataset/additional_sel_log_dataset.csv')

# 所有去重的日志语料
all_log_list=list(set(list(additional_sel_log['msg'].drop_duplicates())+list(sel_log['msg'].drop_duplicates())))

In [5]:
# 关联sel_log和train_label：两张表的sn均可以匹配到，但是多对多
# 一般是当天的sel_log全打出之后，当天晚些时间会报错，可以先只考虑一天只有一个报错信息的数据，一天多个报错的数据不纳入训练集和验证集
# 总标签数据16604   sn和fault_day不同的15521   sn在fault_day唯一的标签数据14516   大概2000条标签数据不会用到
# 按照sn+day分组
train_label['day']=train_label['fault_time'].apply(lambda x:x[0:10])
temp=train_label.groupby(['sn','day']).size()
use_temp=temp[temp.values==1]
sn_list=[use_temp.index[i][0] for i in range(len(use_temp))]
day_list=[use_temp.index[i][1] for i in range(len(use_temp))]
use_temp_df=pd.DataFrame({'sn':sn_list,'day':day_list})
use_train_label=pd.merge(train_label,use_temp_df,how='inner',on=['sn','day'])
sel_log['day']=sel_log['time'].apply(lambda x:x[0:10])
use_log_label_df=pd.merge(sel_log,use_train_label,how='inner',on=['sn','day'])
columns_order=['sn','day','time','msg','server_model','fault_time','label']
use_log_label_df=use_log_label_df[columns_order]
# 按照sn+day分组后按time排序
# use_log_label_group_df=use_log_label_df.groupby(['sn','day']).apply(lambda x:x.sort_values('time',ascending=True))

## 3. 分词

In [6]:
log_str_list=[]
label_list=[]
for log_df in use_log_label_df.groupby(['sn','day']):
    log_str=''
    for info in log_df[1]['msg'].drop_duplicates():
        log_str=log_str+info.lower()+'.'
    log_label=log_df[1].iloc[0]['label']
    log_str_list.append(log_str)
    label_list.append(log_label)
word_label_df=pd.DataFrame({'log':log_str_list,'label':label_list})
word_label_df.to_csv('./pre_contest/data_analysis/word_label_df.txt',sep='\t')

In [7]:
word_label_df.groupby('label').size()

label
0    1460
1    3016
2    7731
3    2214
dtype: int64

## 4. 训练embbeding模型（Doc2Vec）

In [31]:
# 训练Doc2Vec模型
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(all_log_list)]
model = Doc2Vec(tagged_data, dm=1,vector_size = 200, window = 2, min_count = 1, epochs = 100,workers=12)

## 5. 构建树模型的训练集

In [16]:
vector_list=[]
for log in word_label_df['log']:
    vector_list.append(model.infer_vector(word_tokenize(log)))
feature=np.array(vector_list)
label=np.array(label_list)

AttributeError: 'Word2Vec' object has no attribute 'infer_vector'

In [176]:
# file = open('./pre_contest/doc2vec_model/modelv0p2.model','wb')
# pickle.dump(model, file)
# file = open('./pre_contest/doc2vec_model/modelv0p2.feature','wb')
# pickle.dump(feature, file)
# file = open('./pre_contest/doc2vec_model/modelv0p2.label','wb')
# pickle.dump(label_list, file)

## 6. 训练XGB

In [177]:
# 指标评估
def macro_f1(label,prediction)  -> float:

    """
    计算得分
    :param target_df: [sn,fault_time,label]
    :param submit_df: [sn,fault_time,label]
    :return:
    """

    weights =  [3  /  7,  2  /  7,  1  /  7,  1  /  7]
    macro_F1 =  0.
    for i in  range(len(weights)):
        TP =  np.sum((label==i) & (prediction==i))
        FP =  np.sum((label!= i) & (prediction == i))
        FN =  np.sum((label == i) & (prediction!= i))
        precision = TP /  (TP + FP)  if  (TP + FP)  >  0  else  0
        recall = TP /  (TP + FN)  if  (TP + FN)  >  0  else  0
        F1 =  2  * precision * recall /  (precision + recall)  if  (precision + recall)  >  0  else  0
        macro_F1 += weights[i]  * F1
        
        print('Task %d:\n Prcesion %.2f, Recall %.2f, F1 %.2f' % (i+1, precision, recall, F1))
        
    return macro_F1

In [178]:
# validation dataset
random.seed(0)
val_mask = [random.random() < 0.2 for _ in range(len(feature))]
train_mask = [not xx for xx in val_mask]
val_feature = feature[val_mask]
val_label = label[val_mask]
train_feature = feature[train_mask]
train_label = label[train_mask]

In [179]:
train_data=xgb.DMatrix(train_feature,label=train_label)
train_feature=xgb.DMatrix(train_feature)
val_feature=xgb.DMatrix(val_feature)

In [180]:
# xgb模型参数
params = {
    'booster':'gbtree',
    'objective':'multi:softmax',   # 多分类问题
    'num_class':4,  # 类别数，与multi softmax并用
    'gamma':0.1,    # 用于控制是否后剪枝的参数，越大越保守，一般0.1 0.2的样子
    'max_depth':6,  # 构建树的深度，越大越容易过拟合
    'lambda':2,  # 控制模型复杂度的权重值的L2 正则化项参数，参数越大，模型越不容易过拟合
    'subsample':1, # 随机采样训练样本
    'colsample_bytree':1,# 这个参数默认为1，是每个叶子里面h的和至少是多少
    # 对于正负样本不均衡时的0-1分类而言，假设h在0.01附近，min_child_weight为1
    #意味着叶子节点中最少需要包含100个样本。这个参数非常影响结果，
    # 控制叶子节点中二阶导的和的最小值，该参数值越小，越容易过拟合
    'silent':0,  # 设置成1 则没有运行信息输入，最好是设置成0
    'eta':0.3,  # 如同学习率
    'seed':1000,
    'nthread':12,  #CPU线程数
    #'eval_metric':'auc'
}

In [181]:
xgb_model=xgb.train(params,train_data,num_boost_round=500)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [182]:
train_pred=xgb_model.predict(train_feature)
val_pred=xgb_model.predict(val_feature)

In [183]:
macro_f1(train_label,train_pred)

Task 1:
 Prcesion 1.00, Recall 1.00, F1 1.00
Task 2:
 Prcesion 1.00, Recall 1.00, F1 1.00
Task 3:
 Prcesion 1.00, Recall 1.00, F1 1.00
Task 4:
 Prcesion 1.00, Recall 1.00, F1 1.00


0.9999999999999998

In [184]:
macro_f1(val_label,val_pred)

Task 1:
 Prcesion 0.35, Recall 0.11, F1 0.16
Task 2:
 Prcesion 0.49, Recall 0.47, F1 0.48
Task 3:
 Prcesion 0.69, Recall 0.86, F1 0.77
Task 4:
 Prcesion 0.63, Recall 0.45, F1 0.53


0.3918364597278289

## 7. 构建测试集

In [25]:
submit = pd.read_csv('./pre_contest/dataset/preliminary_submit_dataset_a.csv')
submit.sort_values(by=['sn', 'fault_time'], inplace=True)
submit.reset_index(drop=True, inplace=True)
test_data = []
for i, row in submit.iterrows():
    test_data.append(model.infer_vector(word_tokenize('. '.join(sel_data[(sel_data['sn']==row['sn'])&(sel_data['time']<=row['fault_time'])].tail(10)['msg']).lower())))
test_feature = np.array(test_data)

## 8. 预测并保存结果

In [26]:
test_label = rf.predict(test_feature)
submit['label'] = test_label
submit.to_csv('./pre_contest/output/preliminary_pred_df.v1.csv', index=0)