In [1]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
# Read both label files, stack them, and keep one copy of each distinct row
# under a fresh 0..n-1 index.
label1 = pd.read_csv('../data/preliminary_train_label_dataset.csv')
label2 = pd.read_csv('../data/preliminary_train_label_dataset_s.csv')
label_df = (
    pd.concat([label1, label2])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [3]:
# Rows to build test features for; later cells read its 'sn' and 'fault_time' columns.
submit_df = pd.read_csv('../data/preliminary_submit_dataset_b.csv')

In [4]:
# Log table; later cells read its 'sn', 'time' and 'msg_lower' columns.
log_df = pd.read_csv('./log_template.csv')

In [5]:
# Parse the timestamp columns into datetimes.
log_df['time'] = pd.to_datetime(log_df['time'])
label_df['fault_time'] = pd.to_datetime(label_df['fault_time'])
submit_df['fault_time'] = pd.to_datetime(submit_df['fault_time'])

# Integer epoch seconds, used below for window arithmetic.
# Cast through datetime64[s] instead of dividing the raw int64 view by 10 ** 9:
# the division is only correct when the column is stored at nanosecond
# resolution, whereas the cast gives seconds whatever the stored unit is.
log_df['time_ts'] = log_df["time"].values.astype('datetime64[s]').astype(np.int64)
label_df['fault_time_ts'] = label_df["fault_time"].values.astype('datetime64[s]').astype(np.int64)
submit_df['fault_time_ts'] = submit_df["fault_time"].values.astype('datetime64[s]').astype(np.int64)

In [6]:
# Round each log time up to a 2-hour boundary; the next cell groups on (sn, time_gap).
log_df['time_gap'] = log_df['time'].dt.ceil('2h')

In [7]:
# One document per (sn, 2-hour bucket): the bucket's messages in time order,
# joined by newlines.
# astype(str) matches how make_dataset builds its documents and stops a
# non-string message (e.g. NaN) from making str.join raise TypeError.
sentences_list = [
    "\n".join(group.sort_values(by='time')['msg_lower'].values.astype(str))
    for _, group in log_df.groupby(['sn', 'time_gap'])
]

In [8]:
# TF-IDF over 1- to 3-grams (min_df=5, at most 50,000 features), fitted on the
# per-bucket documents.
tfv = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=5,
    max_features=50000,
)
tfv.fit(sentences_list)

TfidfVectorizer(max_features=50000, min_df=5, ngram_range=(1, 3))

In [9]:
# Reduce the TF-IDF matrix of the training documents to 16 SVD components.
X_tfidf = tfv.transform(sentences_list)
# Pin random_state: TruncatedSVD's default solver is randomized, so without a
# seed the fitted components (and every feature derived from them) change
# from one run to the next.
svd = TruncatedSVD(n_components=16, random_state=42)
svd.fit(X_tfidf)

TruncatedSVD(n_components=16)

In [10]:
def get_tfidf_svd(sentence, n_components=16):
    """Embed documents with the notebook-level `tfv` and `svd` models.

    Args:
        sentence: Collection of documents handed straight to `tfv.transform`
            (the caller in this notebook passes a one-element list of str).
        n_components: Accepted but never read by the body; the output width
            is whatever the fitted `svd` produces.

    Returns:
        The result of `svd.transform` on the TF-IDF matrix; the caller in
        this notebook indexes it with `[0]` to get one document's vector.
    """
    return svd.transform(tfv.transform(sentence))

In [11]:
def make_dataset(dataset, data_type='train', window_seconds=60 * 60 * 2):
    """Build one TF-IDF/SVD feature record per row of `dataset`.

    For each row, the log messages of the same `sn` whose `time_ts` lies in
    [fault_time_ts - window_seconds, fault_time_ts] are joined in time order
    with newlines and embedded with `get_tfidf_svd`.

    Args:
        dataset: DataFrame with `sn`, `fault_time` and `fault_time_ts`
            columns, plus `label` when `data_type == 'train'`.
        data_type: `'train'` copies the row's `label` into the record; any
            other value leaves it out.
        window_seconds: Look-back window before the fault, in seconds
            (defaults to two hours).

    Returns:
        List of dicts with keys `sn`, `fault_time`, `tfv_0` .. `tfv_{k-1}`
        (k = length of the embedding vector) and, for training data, `label`.
    """
    # Split the log table by serial number once instead of scanning every log
    # row for every dataset row. Rows keep their original relative order
    # inside each group, so the per-row frames match a boolean-mask filter.
    logs_by_sn = {key: sn_log for key, sn_log in log_df.groupby('sn', sort=False)}
    # Stand-in for serial numbers that have no log rows at all.
    no_logs = log_df.iloc[0:0]

    ret = []
    for idx in tqdm(range(dataset.shape[0])):
        row = dataset.iloc[idx]
        sn = row['sn']
        fault_time_ts = row['fault_time_ts']

        sn_log = logs_by_sn.get(sn, no_logs)
        # Keep only logs at or before the fault, ordered by time.
        sub_log = sn_log[sn_log['time_ts'] <= fault_time_ts].sort_values(by='time')
        # ...and of those, only the ones inside the look-back window.
        recent = sub_log[sub_log['time_ts'] - fault_time_ts >= -window_seconds]

        data = {
            'sn': sn,
            'fault_time': row['fault_time'],
        }
        vec = get_tfidf_svd(['\n'.join(recent['msg_lower'].values.astype(str))])[0]
        # Follow the vector's own length rather than a hard-coded 16, so the
        # record stays in step with however many components `svd` was fitted with.
        for i, value in enumerate(vec):
            data['tfv_%d' % i] = value
        if data_type == 'train':
            data['label'] = row['label']
        ret.append(data)
    return ret

In [12]:
# Feature records for the labelled faults (label included), then as a table.
train = make_dataset(dataset=label_df, data_type='train')
df_train = pd.DataFrame(data=train)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 16604/16604 [09:41<00:00, 28.56it/s]


In [13]:
# Feature records for the submission rows (no label), then as a table.
test = make_dataset(dataset=submit_df, data_type='test')
df_test = pd.DataFrame(data=test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 3030/3030 [01:42<00:00, 29.69it/s]


In [14]:
# Write both feature tables to CSV without the index column.
df_train.to_csv(path_or_buf='train5.csv', index=False)
df_test.to_csv(path_or_buf='test5.csv', index=False)