## 1. 引入包

In [1]:
import os
import nltk
# nltk.download('punkt')
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.ensemble import RandomForestClassifier

## 2. 读取数据

In [2]:
# Load the SEL log and order it by server serial number, then by timestamp,
# so that "the last N rows of a server" means its N most recent entries.
sel_data = (
    pd.read_csv('./preliminary_sel_log_dataset.csv')
    .sort_values(by=['sn', 'time'])
    .reset_index(drop=True)
)

## 3. 分词

In [3]:
# Build one document per server from its last ten log messages, then tokenize.
#
# The frame is grouped once instead of being re-filtered with a boolean mask
# for every serial number, which scanned the whole log once per server.
# sort=False keeps the groups in order of first appearance, i.e. the same
# order drop_duplicates(keep='first') produced.
# NOTE(review): groupby skips rows whose 'sn' is missing; such rows previously
# yielded one empty document — confirm the log never has a missing 'sn'.
tail_msgs = sel_data.groupby('sn', sort=False)['msg'].apply(
    lambda msgs: '.'.join(msgs.tail(10))
)
sn_list = tail_msgs.index.to_list()
tail_msg_list = tail_msgs.to_list()
# Lower-case before tokenizing so the vocabulary is case-insensitive.
tokenized_sent = [word_tokenize(s.lower()) for s in tail_msg_list]

## 4. 训练embedding模型（Doc2Vec）

In [4]:
# Tag every per-server document with its position in tokenized_sent and train
# a small Doc2Vec model (10-dimensional vectors) on the tagged corpus.
tagged_data = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(tokenized_sent)
]
model = Doc2Vec(
    documents=tagged_data,
    vector_size=10,
    window=2,
    min_count=1,
    epochs=10,
)

## 5. 构建树模型的训练集

In [None]:
# Load the training labels, ordered by server and fault time.
label = (
    pd.read_csv('./preliminary_train_label_dataset.csv')
    .sort_values(by=['sn', 'fault_time'])
    .reset_index(drop=True)
)

# One inferred Doc2Vec vector per labelled fault, built from the last ten
# messages of the same server whose time is not later than the fault time.
train_data = []
for _, fault in label.iterrows():
    same_server = sel_data['sn'] == fault['sn']
    not_after_fault = sel_data['time'] <= fault['fault_time']
    recent_msgs = sel_data[same_server & not_after_fault].tail(10)['msg']
    tokens = word_tokenize('.'.join(recent_msgs).lower())
    train_data.append(model.infer_vector(tokens))

train_feature = np.array(train_data)
train_label = label['label'].values

## 6. 训练随机森林

In [None]:
# Fit a random forest on the Doc2Vec features. oob_score=True additionally
# computes the out-of-bag score during fitting (exposed as rf.oob_score_).
# fit() returns the estimator itself, so construction and fitting are chained.
rf = RandomForestClassifier(oob_score=True).fit(train_feature, train_label)

## 7. 构建测试集

In [None]:
# Load the rows to predict, ordered by server and fault time.
submit = pd.read_csv('./preliminary_submit_dataset.csv')
submit.sort_values(by=['sn', 'fault_time'], inplace=True)
submit.reset_index(drop=True, inplace=True)

# One inferred Doc2Vec vector per submission row, built from the last ten
# messages of the same server whose time is not later than the fault time
# (sel_data was sorted by 'sn' and 'time' when it was loaded).
test_data = []
for _, row in submit.iterrows():
    history = sel_data[
        (sel_data['sn'] == row['sn']) & (sel_data['time'] <= row['fault_time'])
    ]
    # Join with '.' (no trailing space): this is the separator used for the
    # Doc2Vec corpus and for the training features. Joining with '. ' here
    # made word_tokenize split the text differently, so test vectors were
    # inferred from a different token distribution than the one the model
    # and the classifier were fitted on.
    text = '.'.join(history.tail(10)['msg']).lower()
    test_data.append(model.infer_vector(word_tokenize(text)))
test_feature = np.array(test_data)

## 8. 预测并保存结果

In [None]:
# Predict a label for every submission row and write the result to disk
# without the DataFrame index column.
test_label = rf.predict(test_feature)
submit = submit.assign(label=test_label)
submit.to_csv('./preliminary_pred_df.csv', index=False)