# 背景：优化word2vec模型

1、采用最终的日志和标签的匹配方式

2、计算标签的embedding向量时，将权重由均值调整为加入时间间隔因子计算权重

## 导包、设置根目录

In [57]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import random
import pickle
import multiprocessing
import re
import pickle

from collections import Counter
from itertools import chain
from datetime import datetime 

# 更改工作目录为当前项目根目录
import sys
import os
os.chdir(os.path.dirname(os.path.dirname(sys.path[0])))
print(os.getcwd())

/Users/jincan02/Projects/Log-diagnosis


In [88]:
# 处理后数据的主键为 sn + fault_time + label !!!

# sn分组后，按照最近邻+时间间隔划分日志数据
def divideLogByNearestTime(log_label_df: pd.DataFrame):
    log_correspond_label_df = pd.DataFrame(columns=['sn', 'fault_time', 'msg', 'time', 'server_model', 'label'])
    no_label_log_list = []
    # 不设置截断时间，效果最好
    cutoff = 10 * 3600

    for sn, log in log_label_df.groupby('sn'):
        if len(log[log['label'] != '']) == 0:
            no_label_log_list.append(log)
        elif len(log[log['label'] != '']) == 1:
            # 没有日志的标签，直接将标签的可用信息作为日志
            if len(log) == 1:
                msg_df = log
                msg_df['fault_time'] = log['time'].iloc[0]
                log_correspond_label_df = pd.concat([log_correspond_label_df, msg_df])
            else:
                msg_df = log[log['label'] == '']
                msg_df['label'] = log[log['label'] != '']['label'].iloc[0]
                msg_df['fault_time'] = log[log['label'] != '']['time'].iloc[0]
                log_correspond_label_df = pd.concat([log_correspond_label_df, msg_df])
        else:
            label_df = log[log['label'] != '']
            msg_df = log[log['label'] == '']
            for msg_item in msg_df.iterrows():
                previous_delta_time = 1000 * 24 * 3600
                for label_item in label_df.iterrows():
                    now_delta_time = abs(datetime.strptime(label_item[1]['time'],'%Y-%m-%d %H:%M:%S'
                        ) - datetime.strptime(msg_item[1]['time'],'%Y-%m-%d %H:%M:%S'))
                    if now_delta_time.days * 24 * 3600 + now_delta_time.seconds < previous_delta_time:
                        previous_delta_time = now_delta_time.days * 24 * 3600 + now_delta_time.seconds
                        if previous_delta_time < cutoff:
                            msg_item[1]['fault_time'] = label_item[1]['time']
                            msg_item[1]['label'] = label_item[1]['label']
            log_correspond_label_df = pd.concat([log_correspond_label_df, msg_df])
            # 没有日志的标签，直接将标签的可用信息作为日志
            for label_item in label_df.iterrows():
                if len(msg_df[(msg_df['fault_time'] == label_item[1]['time']) & (
                    msg_df['label'] == label_item[1]['label'])]) == 0:
                    label_item[1]['fault_time'] = label_item[1]['time']
            log_correspond_label_df = pd.concat([log_correspond_label_df, label_df])
    if len(log_correspond_label_df[log_correspond_label_df['label'] == '']) > 0:
        no_label_log_list.append(log_correspond_label_df[log_correspond_label_df['label'] == ''])
    log_correspond_label_df = log_correspond_label_df[log_correspond_label_df['fault_time'] != '']
    return log_correspond_label_df, no_label_log_list



# sn分组后，本次报错和上次报错之间的日志匹配到本次报错
def divideLogByFaultTime(log_label_df: pd.DataFrame):
    log_correspond_label_df = pd.DataFrame(columns=['sn', 'fault_time', 'msg', 'time', 'server_model', 'label'])
    no_label_log_list = []
    log_label_df =  log_label_df.reset_index(drop = True)
    
    for sn, log in log_label_df.groupby('sn'):
        if len(log[log['label'] != '']) == 0:
            no_label_log_list.append(log)
        elif len(log[log['label'] != '']) == 1:
            msg_df = log[log['label'] == '']
            msg_df['label'] = log[log['label'] != '']['label'].iloc[0]
            msg_df['fault_time'] = log[log['label'] != '']['time'].iloc[0]
            log_correspond_label_df = pd.concat([log_correspond_label_df, msg_df])
        else:
            # 使用index的顺序取数时，要注意index必须按所需的顺序排列
            cutoff_index = [-1] + log.loc[log['label'] != ''].index.tolist() + [log.index.tolist()[-1]+1]
            for kth in range(len(cutoff_index)-1):
                temp_log = log.loc[(log.index <= cutoff_index[kth+1]) & (log.index > cutoff_index[kth])]
                if len(temp_log) > 0:
                    if len(temp_log[temp_log['label'] != '']) == 0:
                        no_label_log_list.append(temp_log)
                    # 只有标签，没有日志的数据，把标签的部分数据直接作为日志
                    elif len(temp_log) == 1:
                        msg_df = temp_log
                        msg_df['fault_time'] = temp_log[temp_log['label'] != '']['time'].iloc[0]
                        log_correspond_label_df = pd.concat([log_correspond_label_df, msg_df])
                    else:
                        msg_df = temp_log[temp_log['label'] == '']
                        msg_df['label'] = temp_log[temp_log['label'] != '']['label'].iloc[0]
                        msg_df['fault_time'] = temp_log[temp_log['label'] != '']['time'].iloc[0]
                        log_correspond_label_df = pd.concat([log_correspond_label_df, msg_df])
    return log_correspond_label_df, no_label_log_list



# 计算统计特征
def calculateStatisticFeature(log_correspond_label_df: pd.DataFrame):
    use_log_label_df = log_correspond_label_df

    use_log_label_df['msg_hour'] = use_log_label_df['time'].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S").hour)
    use_log_label_df['msg_minute'] = use_log_label_df['time'].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S").minute)
    use_log_label_df['fault_hour'] = use_log_label_df['fault_time'].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S").hour)
    use_log_label_df['fault_minute'] = use_log_label_df['fault_time'].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S").minute)

    # 0414新增
    # 不去重msg_log
    all_msg_log_list = []
    
    # 0408新增
    # 最近一次日志时间距报错时间间隔，单位秒
    nearest_msg_fault_time_delta_list = []
    # 日志不去重时长度1,2,3,4日志数量统计
    all_msg_1_cnt_list=[]
    all_msg_2_cnt_list=[]
    all_msg_3_cnt_list=[]
    all_msg_4_cnt_list=[]
    
    fault_minute_list = []
    msg_1_cnt_list=[]
    msg_2_cnt_list=[]
    msg_3_cnt_list=[]
    msg_4_cnt_list=[]
    msg_hour_max_list=[]
    msg_hour_min_list=[]
    msg_hour_avg_list=[]
    msg_hour_median_list=[]
    msg_hour_mode_list=[]
    msg_minute_max_list=[]
    msg_minute_min_list=[]
    msg_minute_avg_list=[]
    msg_minute_median_list=[]
    msg_minute_mode_list=[]

    sn_list=[]
    day_list=[]
    server_model_list=[]
    msg_log_list=[]
    msg_cnt_list=[]
    fault_hour_list=[]
    label_list=[]
    fault_time_list=[]
    for msg_log_df in use_log_label_df.groupby(['sn','fault_time','label']):
        all_msg_log_str = ''
        msg_log_str = ''
        all_msg_1_cnt = 0
        all_msg_2_cnt = 0
        all_msg_3_cnt = 0
        all_msg_4_cnt = 0
        msg_1_cnt = 0
        msg_2_cnt = 0
        msg_3_cnt = 0
        msg_4_cnt = 0
        for info in msg_log_df[1]['msg']:
            if info == info:
                all_msg_log_str = all_msg_log_str + info.lower() + '.'
                if len(info.split('|')) == 1:
                    all_msg_1_cnt += 1
                elif len(info.split('|')) == 2:
                    all_msg_2_cnt += 1
                elif len(info.split('|')) == 3:
                    all_msg_3_cnt += 1
                else:
                    all_msg_4_cnt += 1
        for info in msg_log_df[1]['msg'].drop_duplicates():
            if info == info:
                msg_log_str=msg_log_str+info.lower()+'.'
                if len(info.split('|')) == 1:
                    msg_1_cnt += 1
                elif len(info.split('|')) == 2:
                    msg_2_cnt += 1
                elif len(info.split('|')) == 3:
                    msg_3_cnt += 1
                else:
                    msg_4_cnt += 1
        nearest_msg_fault_time_delta = abs(datetime.strptime(msg_log_df[1].iloc[-1]['time'],'%Y-%m-%d %H:%M:%S'
                        ) - datetime.strptime(msg_log_df[0][1],'%Y-%m-%d %H:%M:%S'))
        nearest_msg_fault_time_delta = nearest_msg_fault_time_delta.days * 24 * 3600 + nearest_msg_fault_time_delta.seconds
        sm=int(msg_log_df[1].iloc[0]['server_model'][2:])

        sn_list.append(msg_log_df[0][0])
        fault_time_list.append(msg_log_df[0][1])
        label_list.append(msg_log_df[0][2])

        nearest_msg_fault_time_delta_list.append(nearest_msg_fault_time_delta)
        server_model_list.append(sm)
        all_msg_log_list.append(all_msg_log_str)
        msg_log_list.append(msg_log_str)
        msg_cnt_list.append(len(msg_log_df[1]))

        fault_hour_list.append(msg_log_df[1].iloc[0]['fault_hour'])
        fault_minute_list.append(msg_log_df[1].iloc[0]['fault_minute'])

        all_msg_1_cnt_list.append(all_msg_1_cnt)
        all_msg_2_cnt_list.append(all_msg_2_cnt)
        all_msg_3_cnt_list.append(all_msg_3_cnt)
        all_msg_4_cnt_list.append(all_msg_4_cnt)    

        msg_1_cnt_list.append(msg_1_cnt)
        msg_2_cnt_list.append(msg_2_cnt)
        msg_3_cnt_list.append(msg_3_cnt)
        msg_4_cnt_list.append(msg_4_cnt)

        msg_hour_max_list.append(msg_log_df[1]['msg_hour'].max())
        msg_hour_min_list.append(msg_log_df[1]['msg_hour'].min())
        msg_hour_avg_list.append(msg_log_df[1]['msg_hour'].mean())
        msg_hour_median_list.append(msg_log_df[1]['msg_hour'].median())
        msg_hour_mode_list.append(msg_log_df[1]['msg_hour'].mode()[0])

        msg_minute_max_list.append(msg_log_df[1]['msg_minute'].max())
        msg_minute_min_list.append(msg_log_df[1]['msg_minute'].min())
        msg_minute_avg_list.append(msg_log_df[1]['msg_minute'].mean())
        msg_minute_median_list.append(msg_log_df[1]['msg_minute'].median())
        msg_minute_mode_list.append(msg_log_df[1]['msg_minute'].mode()[0])

    msg_log_label_df=pd.DataFrame(
        {
        'sn': sn_list,
        'fault_time': fault_time_list,
        'server_model': server_model_list,
        'msg_cnt': msg_cnt_list,
        'fault_hour': fault_hour_list,
        'fault_minute': fault_minute_list,
        'nearest_msg_fault_time_delta': nearest_msg_fault_time_delta_list,
        'all_msg_1_cnt': all_msg_1_cnt_list,
        'all_msg_2_cnt': all_msg_2_cnt_list,
        'all_msg_3_cnt': all_msg_3_cnt_list,
        'all_msg_4_cnt': all_msg_4_cnt_list,
        'msg_1_cnt': msg_1_cnt_list,
        'msg_2_cnt': msg_2_cnt_list,
        'msg_3_cnt': msg_3_cnt_list,
        'msg_4_cnt': msg_4_cnt_list,
        'msg_hour_max': msg_hour_max_list,
        'msg_hour_min': msg_hour_min_list,
        'msg_hour_avg': msg_hour_avg_list,
        'msg_hour_median': msg_hour_median_list,
        'msg_hour_mode': msg_hour_mode_list,
        'msg_minute_max': msg_minute_max_list,
        'msg_minute_min': msg_minute_min_list,
        'msg_minute_avg': msg_minute_avg_list,
        'msg_minute_median': msg_minute_median_list,
        'msg_minute_mode': msg_minute_mode_list,
        'msg_log': msg_log_list,
        'all_msg_log': all_msg_log_list,
        'label': label_list
        }
    )
    return msg_log_label_df


# 使用去除标点符号之后的词训练word2vec模型和embedding向量
def TrainDropPuncWord2vecFeature(all_msg_log_list: list, FaultTime_log_correspond_label_df: pd.DataFrame) -> list:
    # 对日志进行分词
    raw_word_list = [word_tokenize(ith) for ith in all_msg_log_list]
    word_list=[]
    for i in range(len(raw_word_list)):
        xth=[]
        for word in raw_word_list[i]:
            word_drop=re.sub(r'[^\w]','',str(word)).lower()
            if word_drop:
                xth.append(word_drop)
        word_list.append(xth)
    # 使用去除空格和标签符号后的词，训练word2vec模型
    word2vec_model = Word2Vec(word_list,vector_size=100, alpha=0.03, window=5, min_count=1,max_vocab_size=None, sample=1e-3, seed=0, workers=12, min_alpha=0.0001,sg=1, hs=0, negative=5, cbow_mean=1, hashfxn=hash, epochs=50, null_word=0,trim_rule=None, sorted_vocab=1)
    
    # 训练词向量
    word2vec_vector_list=[]
    sn_list = []
    fault_time_list = []
    label_list = []

    # 使用去除空格和标点符号的分词方式
    for key, msg_log in FaultTime_log_correspond_label_df.groupby(['sn','fault_time','label']):
        sn_list.append(key[0])
        fault_time_list.append(key[1])
        label_list.append(key[2])
        embedding_vector_weighted_sum=[]
        last_msg_delta_time = datetime.strptime(msg_log.iloc[-1]['fault_time'],'%Y-%m-%d %H:%M:%S'
                            ) - datetime.strptime(msg_log.iloc[-1]['time'],'%Y-%m-%d %H:%M:%S')
        last_msg_delta_seconds = last_msg_delta_time.days * 24 * 60 * 60 + last_msg_delta_time.seconds
        if last_msg_delta_seconds == 0:
            last_msg_delta_seconds = 1
        for i, item in msg_log.iterrows():
            # 计算权重
            delta_time = datetime.strptime(item['fault_time'],'%Y-%m-%d %H:%M:%S'
                            ) - datetime.strptime(item['time'],'%Y-%m-%d %H:%M:%S')
            delta_seconds = delta_time.days * 24 * 60 * 60 + delta_time.seconds
            if delta_seconds == 0:
                delta_seconds = 1
            weight = last_msg_delta_seconds / delta_seconds
            # 对本条日志进行分词，去除空格和标点符号
            item_raw_word_list = word_tokenize(item['msg'])
            for word in item_raw_word_list:
                word_drop=re.sub(r'[^\w]','',str(word)).lower()
                if word_drop:
                    vector=word2vec_model.wv[word_drop].reshape(1,-1)[0]
                    if len(embedding_vector_weighted_sum)>0:
                        embedding_vector_weighted_sum=list(np.array(embedding_vector_weighted_sum)+weight * np.array(vector))
                    else:
                        embedding_vector_weighted_sum=list(np.array(vector))
        if len(embedding_vector_weighted_sum) == 0:
            embedding_vector_weighted_sum = np.array([0]*100)
        word2vec_vector_list.append(embedding_vector_weighted_sum)
    
    return word2vec_vector_list



# 使用原始分词后的词训练word2vec模型和embedding向量
def TrainAllWord2vecFeature(all_msg_log_list: list, FaultTime_log_correspond_label_df: pd.DataFrame) -> list:
    # 对日志进行分词
    raw_word_list = [word_tokenize(ith) for ith in all_msg_log_list]
    word2vec_model = Word2Vec(raw_word_list,vector_size=100, alpha=0.03, window=5, min_count=1,max_vocab_size=None, sample=1e-3, seed=0, workers=12, min_alpha=0.0001,sg=1, hs=0, negative=5, cbow_mean=1, hashfxn=hash, epochs=50, null_word=0,trim_rule=None, sorted_vocab=1)
    
    # 训练词向量
    word2vec_vector_list=[]
    sn_list = []
    fault_time_list = []
    label_list = []

    for key, msg_log in FaultTime_log_correspond_label_df.groupby(['sn','fault_time','label']):
        sn_list.append(key[0])
        fault_time_list.append(key[1])
        label_list.append(key[2])
        embedding_vector_weighted_sum=[]
        last_msg_delta_time = datetime.strptime(msg_log.iloc[-1]['fault_time'],'%Y-%m-%d %H:%M:%S'
                            ) - datetime.strptime(msg_log.iloc[-1]['time'],'%Y-%m-%d %H:%M:%S')
        last_msg_delta_seconds = last_msg_delta_time.days * 24 * 60 * 60 + last_msg_delta_time.seconds
        if last_msg_delta_seconds == 0:
            last_msg_delta_seconds = 1
        for i, item in msg_log.iterrows():
            # 计算权重
            delta_time = datetime.strptime(item['fault_time'],'%Y-%m-%d %H:%M:%S'
                            ) - datetime.strptime(item['time'],'%Y-%m-%d %H:%M:%S')
            delta_seconds = delta_time.days * 24 * 60 * 60 + delta_time.seconds
            if delta_seconds == 0:
                delta_seconds = 1
            weight = last_msg_delta_seconds / delta_seconds
            # 对本条日志进行分词
            item_raw_word_list = word_tokenize(item['msg'])
            for word in item_raw_word_list:
                vector=word2vec_model.wv[word].reshape(1,-1)[0]
                if len(embedding_vector_weighted_sum)>0:
                    embedding_vector_weighted_sum=list(np.array(embedding_vector_weighted_sum)+weight * np.array(vector))
                else:
                    embedding_vector_weighted_sum=list(np.array(vector))
        if len(embedding_vector_weighted_sum) == 0:
            embedding_vector_weighted_sum = np.array([0]*100)
        word2vec_vector_list.append(embedding_vector_weighted_sum)
    
    return word2vec_vector_list

## 采用按照fault_time划分日志的方式匹配标签和日志

In [13]:
# 读取sel日志数据
sel_log_df = pd.read_csv('./pre_contest/dataset_B/preliminary_sel_log_dataset.csv').drop_duplicates()
# 读取额外的日志数据
additional_sel_log_df=pd.read_csv('./pre_contest/dataset_B/additional_sel_log_dataset.csv').drop_duplicates()
# 读取训练标签数据：有重复数据！
train_label1 = pd.read_csv('./pre_contest/dataset_B/preliminary_train_label_dataset.csv')
train_label2 = pd.read_csv('./pre_contest/dataset_B/preliminary_train_label_dataset_s.csv')
train_label_df = pd.concat([train_label1,train_label2],axis=0).drop_duplicates()


# 合并日志和标签
sel_log_df['label'] = ''
train_label_df['time'] = train_label_df['fault_time']
train_label_df['msg'] = ''
train_label_df['server_model'] = train_label_df['sn'].map(dict(zip(sel_log_df['sn'],sel_log_df['server_model'])))
train_label_df = train_label_df[['sn', 'time', 'msg', 'server_model', 'label']]

log_label_df = pd.concat([sel_log_df,train_label_df], axis = 0).sort_values(by = 'time')
log_label_df['fault_time'] = ''
log_label_df = log_label_df[['sn', 'fault_time', 'msg', 'time', 'server_model', 'label']]

In [4]:
FaultTime_log_correspond_label_df, FaultTime_no_label_log_list = divideLogByFaultTime(log_label_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  msg_df['label'] = log[log['label'] != '']['label'].iloc[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  msg_df['fault_time'] = log[log['label'] != '']['time'].iloc[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  msg_df['label'] = temp_log[temp_log['label'] != '']['label'].iloc[0]
A value is try

## 训练word2vec模型

In [31]:
# 对所有日志进行去重
all_msg_log_list = list(set(list(additional_sel_log_df['msg'].drop_duplicates()) + list(sel_log_df['msg'].drop_duplicates())))
# 对日志进行分词
raw_word_list = [word_tokenize(ith) for ith in all_msg_log_list]
word_list=[]
for i in range(len(raw_word_list)):
    xth=[]
    for word in raw_word_list[i]:
        word_drop=re.sub(r'[^\w]','',str(word)).lower()
        if word_drop:
            xth.append(word_drop)
    word_list.append(xth)

In [35]:
len(all_msg_log_list)

9421

In [36]:
# 训练Word2Vec模型
# 使用未经处理的分词
# word2vec_model = Word2Vec(raw_word_list,vector_size=100, alpha=0.03, window=5, min_count=1,max_vocab_size=None, sample=1e-3, seed=0, workers=12, min_alpha=0.0001,sg=1, hs=0, negative=5, cbow_mean=1, hashfxn=hash, epochs=50, null_word=0,trim_rule=None, sorted_vocab=1)
# 使用去除空格和标签符号后的分词
word2vec_model = Word2Vec(word_list,vector_size=100, alpha=0.03, window=5, min_count=1,max_vocab_size=None, sample=1e-3, seed=0, workers=12, min_alpha=0.0001,sg=1, hs=0, negative=5, cbow_mean=1, hashfxn=hash, epochs=50, null_word=0,trim_rule=None, sorted_vocab=1)

## 对日志进行embedding，加权求和得到特征向量

In [89]:
word2vec_vector_list = TrainAllWord2vecFeature(all_msg_log_list, FaultTime_log_correspond_label_df)

In [90]:
feature=np.array(word2vec_vector_list)

## 训练xgb模型

In [92]:
# xgb模型参数
xgb_params = {
    'booster':'gbtree',
    'objective':'multi:softmax',   # 多分类问题
    'num_class':4,  # 类别数，与multi softmax并用
    'gamma':0.1,    # 用于控制是否后剪枝的参数，越大越保守，一般0.1 0.2的样子
    'max_depth':6,  # 构建树的深度，越大越容易过拟合
    'lambda':2,  # 控制模型复杂度的权重值的L2 正则化项参数，参数越大，模型越不容易过拟合
    'subsample':1, # 随机采样训练样本
    'colsample_bytree':1,# 这个参数默认为1，是每个叶子里面h的和至少是多少
    # 对于正负样本不均衡时的0-1分类而言，假设h在0.01附近，min_child_weight为1
    #意味着叶子节点中最少需要包含100个样本。这个参数非常影响结果，
    # 控制叶子节点中二阶导的和的最小值，该参数值越小，越容易过拟合
    'silent':0,  # 设置成1 则没有运行信息输入，最好是设置成0
    'eta':0.3,  # 如同学习率
    'seed':1000,
    'nthread':16,  #CPU线程数
    #'eval_metric':'auc'
}

# 指标评估
def macro_f1(label,prediction)  -> float:

    """
    计算得分
    :param target_df: [sn,fault_time,label]
    :param submit_df: [sn,fault_time,label]
    :return:
    """

    weights =  [3  /  7,  2  /  7,  1  /  7,  1  /  7]
    macro_F1 =  0.
    for i in  range(len(weights)):
        TP =  np.sum((label==i) & (prediction==i))
        FP =  np.sum((label!= i) & (prediction == i))
        FN =  np.sum((label == i) & (prediction!= i))
        precision = TP /  (TP + FP)  if  (TP + FP)  >  0  else  0
        recall = TP /  (TP + FN)  if  (TP + FN)  >  0  else  0
        F1 =  2  * precision * recall /  (precision + recall)  if  (precision + recall)  >  0  else  0
        macro_F1 += weights[i]  * F1
        
        print('Task %d:\n Prcesion %.2f, Recall %.2f, F1 %.2f' % (i+1, precision, recall, F1))
        
    return macro_F1

In [93]:
random.seed(0)
label=np.array(label_list)
val_mask = [random.random() < 0.3 for _ in range(len(feature))]
train_mask = [not xx for xx in val_mask]
val_feature = feature[val_mask]
val_label = label[val_mask]
train_feature = feature[train_mask]
train_label = label[train_mask]
train_data=xgb.DMatrix(train_feature,label=train_label)
train_feature=xgb.DMatrix(train_feature)
val_feature=xgb.DMatrix(val_feature)

In [94]:
xgb_model=xgb.train(xgb_params,train_data,num_boost_round=500)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [95]:
train_pred=xgb_model.predict(train_feature)
val_pred=xgb_model.predict(val_feature)

In [96]:
macro_f1(train_label,train_pred)

Task 1:
 Prcesion 0.99, Recall 0.93, F1 0.96
Task 2:
 Prcesion 0.97, Recall 0.98, F1 0.97
Task 3:
 Prcesion 0.98, Recall 1.00, F1 0.99
Task 4:
 Prcesion 0.99, Recall 0.96, F1 0.97


0.9716585704102343

In [97]:
macro_f1(val_label,val_pred)
# 使用去除标签符号的单词训练
# 0.586243722832391

# 使用分词后的原始单词训练
# 0.5756302442328277

Task 1:
 Prcesion 0.46, Recall 0.23, F1 0.31
Task 2:
 Prcesion 0.65, Recall 0.71, F1 0.68
Task 3:
 Prcesion 0.90, Recall 0.95, F1 0.92
Task 4:
 Prcesion 0.84, Recall 0.81, F1 0.83


0.5756302442328277