In [1]:
import gc
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold, StratifiedGroupKFold
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier, CatBoostRegressor
import lightgbm as lgb
from tqdm import tqdm

import optuna
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Columns of feature files 1 and 2 that are read as strings rather than with an
# inferred dtype. The same mapping is used for the train and the test file.
part1_dtypes = dict.fromkeys(['server_model', 'last_msg_id', 'last_template_id'], 'str')
part2_dtypes = dict.fromkeys(['tmp_appearance_1', 'tmp_appearance_2', 'tmp_appearance_3',
                              'msg_appearance_1', 'msg_appearance_2', 'msg_appearance_3',
                              'max_continuous_msg'], 'str')

df_train1 = pd.read_csv('train1.csv', dtype=part1_dtypes)
df_test1 = pd.read_csv('test1.csv', dtype=part1_dtypes)

df_train2 = pd.read_csv('train2.csv', dtype=part2_dtypes)
df_test2 = pd.read_csv('test2.csv', dtype=part2_dtypes)

df_train3 = pd.read_csv('train3.csv')
df_test3 = pd.read_csv('test3.csv')

df_train4 = pd.read_csv('train4.csv')
df_test4 = pd.read_csv('test4.csv')

df_train5 = pd.read_csv('train5.csv')
df_test5 = pd.read_csv('test5.csv')

df_train6 = pd.read_csv('train_helper.csv')
df_test6 = pd.read_csv('test_helper.csv')

# Feature set 7 is currently disabled:
# df_train7 = pd.read_csv('train7.csv')
# df_test7 = pd.read_csv('test7.csv')

# Inner-join every feature table onto the first one. Train tables also carry the
# label, so it is part of the join key there.
train_keys = ['sn', 'fault_time', 'label']
test_keys = ['sn', 'fault_time']

df_train = df_train1
for train_part in (df_train2, df_train3, df_train4, df_train5, df_train6):
    df_train = df_train.merge(train_part, on=train_keys)

df_test = df_test1
for test_part in (df_test2, df_test3, df_test4, df_test5, df_test6):
    df_test = df_test.merge(test_part, on=test_keys)

In [3]:
# (rows, columns) of the merged train and test frames.
(df_train.shape, df_test.shape)

((16604, 124), (3030, 123))

In [4]:
# Number of leading columns in the BERT CSVs that are NOT appended as features.
# NOTE(review): presumably these are key/meta columns - confirm against the BERT export.
BERT_SKIP_COLS = 9

bert_train = pd.read_csv('../bert/train.csv')
bert_test = pd.read_csv('../bert/test_b.csv')

# The BERT columns are glued on side by side, which aligns on the row index and
# not on (sn, fault_time). A differing index would silently produce NaN-padded
# or misaligned rows, so fail loudly instead.
assert df_train.index.equals(bert_train.index), \
    f'train rows do not line up with BERT train rows: {len(df_train)} vs {len(bert_train)}'
assert df_test.index.equals(bert_test.index), \
    f'test rows do not line up with BERT test rows: {len(df_test)} vs {len(bert_test)}'

df_train = pd.concat([df_train, bert_train.iloc[:, BERT_SKIP_COLS:]], axis=1)
df_test = pd.concat([df_test, bert_test.iloc[:, BERT_SKIP_COLS:]], axis=1)

# Key-based alternative (disabled):
# df_train = df_train.merge(bert_train, on=['sn', 'fault_time', 'label'])
# df_test = df_test.merge(bert_test, on=['sn', 'fault_time'])

In [5]:
# (rows, columns) of train and test after the BERT columns were appended.
(df_train.shape, df_test.shape)

((16604, 508), (3030, 507))

In [6]:
# String-typed columns whose missing values are replaced by the literal 'NULL'.
null_filled_columns = ['last_msg_id', 'last_template_id',
                       'tmp_appearance_1', 'tmp_appearance_2', 'tmp_appearance_3',
                       'msg_appearance_1', 'msg_appearance_2', 'msg_appearance_3',
                       'max_continuous_msg']
for frame in (df_train, df_test):
    frame[null_filled_columns] = frame[null_filled_columns].fillna('NULL')

In [7]:
def cat_model_train(x_train, y_train, x_val, y_val):
    """
    Fit a class-balanced multiclass CatBoost classifier with early stopping.

    :param x_train: training features; must contain the categorical columns listed below
    :param y_train: training labels (must offer ``.nunique()``, e.g. a pandas Series)
    :param x_val: validation features, used as the eval set
    :param y_val: validation labels
    :return: the fitted CatBoostClassifier
    """
    categorical_columns = ['server_model', 'last_msg_id', 'last_template_id',
                           'tmp_appearance_1', 'tmp_appearance_2', 'tmp_appearance_3',
                           'msg_appearance_1', 'msg_appearance_2', 'msg_appearance_3',
                           'max_continuous_msg']

    n_classes = y_train.nunique()
    label_values = np.unique(y_train)
    # 'balanced' weights are computed from the label frequencies in y_train.
    balanced = compute_class_weight(class_weight='balanced', classes=label_values, y=y_train)

    classifier = CatBoostClassifier(
        task_type='CPU',
        bootstrap_type='Bernoulli',
        learning_rate=0.03,
        eval_metric='MultiClass',
        loss_function='MultiClass',
        classes_count=n_classes,
        iterations=10000,
        random_state=42,
        depth=8,
        leaf_estimation_iterations=8,
        reg_lambda=5,
        subsample=0.8,
        class_weights=dict(zip(label_values, balanced)),
        early_stopping_rounds=100,
        cat_features=categorical_columns,
    )
    # Progress is logged every 100 iterations.
    classifier.fit(x_train, y_train, eval_set=(x_val, y_val), verbose=100)
    return classifier

In [8]:
# Two-stage cross-validation.
#   stage 1: 3-way model over {labels 0 and 1 merged -> 0, label 2 -> 1, label 3 -> 2}
#   stage 2: binary model, trained only on rows with label <= 1, separating 0 from 1
# Folds are grouped by 'sn' so that one 'sn' never appears in both train and validation.
FOLDS = 10
target = 'label'
use_features = [col for col in df_train.columns if col not in ['sn', 'fault_time', target]]

# Out-of-fold predictions in the original 0..3 label space.
oof_pred = np.zeros((len(df_train),))

# Test-set class probabilities, averaged over folds (stage 1: 3 classes, stage 2: 2 classes).
y_pred1 = np.zeros((len(df_test), 3))
y_pred2 = np.zeros((len(df_test), 2))

folds = GroupKFold(n_splits=FOLDS)
for fold, (tr_ind, val_ind) in enumerate(folds.split(df_train, df_train[target], df_train['sn'])):
    df_train_sub = df_train.iloc[tr_ind].copy()
    df_valid_sub = df_train.iloc[val_ind].copy()

    # Stage 1: labels {0, 1} -> 0, 2 -> 1, 3 -> 2.
    x_train1, x_val1 = df_train_sub[use_features], df_valid_sub[use_features]
    y_train1 = (df_train_sub[target] - 1).clip(lower=0)
    y_val1 = (df_valid_sub[target] - 1).clip(lower=0)
    model1 = cat_model_train(x_train1, y_train1, x_val1, y_val1)

    # Stage 2: only the rows whose true label is 0 or 1.
    train_low = df_train_sub[df_train_sub[target] <= 1]
    valid_low = df_valid_sub[df_valid_sub[target] <= 1]
    x_train2, x_val2 = train_low[use_features], valid_low[use_features]
    y_train2, y_val2 = train_low[target], valid_low[target]
    model2 = cat_model_train(x_train2, y_train2, x_val2, y_val2)

    y_pred1 += model1.predict_proba(df_test[use_features]) / folds.n_splits
    y_pred2 += model2.predict_proba(df_test[use_features]) / folds.n_splits

    # Combine both stages on the validation fold. Stage-1 class k > 0 maps back to
    # label k + 1; stage-1 class 0 is resolved by stage 2, whose argmax is the label
    # itself. Stage 2 is called once on all affected rows instead of once per row.
    stage1_cls = model1.predict_proba(x_val1).argmax(axis=1)
    val_pred = stage1_cls + 1
    needs_stage2 = stage1_cls == 0
    if needs_stage2.any():
        val_pred[needs_stage2] = model2.predict_proba(x_val1.loc[needs_stage2]).argmax(axis=1)

    score = f1_score(df_valid_sub[target], val_pred, average='macro')
    print(f'F1 score: {score}')

    oof_pred[val_ind] = val_pred

0:	learn: 1.0479539	test: 1.0485107	best: 1.0485107 (0)	total: 302ms	remaining: 50m 15s
100:	learn: 0.2347191	test: 0.2806276	best: 0.2806276 (100)	total: 25.7s	remaining: 42m 2s
200:	learn: 0.1767513	test: 0.2539307	best: 0.2539307 (200)	total: 50.8s	remaining: 41m 14s
300:	learn: 0.1443985	test: 0.2420481	best: 0.2420188 (293)	total: 1m 15s	remaining: 40m 44s
400:	learn: 0.1221294	test: 0.2364505	best: 0.2362845 (399)	total: 1m 40s	remaining: 40m 6s
500:	learn: 0.1064826	test: 0.2331059	best: 0.2331059 (500)	total: 2m 5s	remaining: 39m 34s
600:	learn: 0.0950628	test: 0.2305399	best: 0.2305399 (600)	total: 2m 29s	remaining: 39m 3s
700:	learn: 0.0850522	test: 0.2293919	best: 0.2293899 (699)	total: 2m 54s	remaining: 38m 35s
800:	learn: 0.0759342	test: 0.2293046	best: 0.2290562 (744)	total: 3m 19s	remaining: 38m 11s
900:	learn: 0.0692346	test: 0.2281656	best: 0.2281525 (887)	total: 3m 44s	remaining: 37m 44s
1000:	learn: 0.0634907	test: 0.2274431	best: 0.2271995 (994)	total: 4m 8s	remaini

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2047518293
bestIteration = 828

Shrink model to first 829 iterations.
0:	learn: 0.6880499	test: 0.6903182	best: 0.6903182 (0)	total: 208ms	remaining: 34m 43s
100:	learn: 0.4990982	test: 0.6217801	best: 0.6212837 (71)	total: 28.3s	remaining: 46m 10s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6212837375
bestIteration = 71

Shrink model to first 72 iterations.
F1 score: 0.7625589640305165
0:	learn: 1.0509393	test: 1.0506798	best: 1.0506798 (0)	total: 251ms	remaining: 41m 46s
100:	learn: 0.2376207	test: 0.2616295	best: 0.2616295 (100)	total: 25.5s	remaining: 41m 38s
200:	learn: 0.1797480	test: 0.2296123	best: 0.2296123 (200)	total: 50.4s	remaining: 40m 58s
300:	learn: 0.1477616	test: 0.2177186	best: 0.2177186 (300)	total: 1m 15s	remaining: 40m 27s
400:	learn: 0.1255110	test: 0.2092933	best: 0.2092933 (400)	total: 1m 40s	remaining: 40m 3s
500:	learn: 0.1087514	test: 0.2047540	best: 0.2047540 (500)

In [9]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the out-of-fold predictions.
oof_report = classification_report(df_train[target], oof_pred)
print(oof_report)

              precision    recall  f1-score   support

           0       0.48      0.50      0.49      1476
           1       0.73      0.72      0.73      3387
           2       0.96      0.94      0.95      9320
           3       0.83      0.91      0.87      2421

    accuracy                           0.85     16604
   macro avg       0.75      0.77      0.76     16604
weighted avg       0.85      0.85      0.85     16604



In [10]:
def macro_f1(y_true, y_pred) -> float:
    """
    Compute the score: a class-weighted sum of per-class F1 values.

    Labels are compared position by position, so both inputs must be in the
    same row order.

    :param y_true: ground-truth labels (1-D list / array / Series)
    :param y_pred: predicted labels, same length as ``y_true``
    :return: sum over classes 0..3 of weight * F1, with weights 3/7, 2/7, 1/7, 1/7
    :raises ValueError: if the two inputs do not have the same shape
    """
    weights = [3 / 7, 2 / 7, 1 / 7, 1 / 7]
    label_gt = np.asarray(y_true)
    label_pr = np.asarray(y_pred)
    if label_gt.shape != label_pr.shape:
        raise ValueError(
            f'y_true and y_pred must have the same shape, got {label_gt.shape} and {label_pr.shape}'
        )

    macro_F1 = 0.
    for cls, weight in enumerate(weights):
        gt_is_cls = label_gt == cls
        pr_is_cls = label_pr == cls
        TP = int(np.sum(gt_is_cls & pr_is_cls))
        FP = int(np.sum(~gt_is_cls & pr_is_cls))
        FN = int(np.sum(gt_is_cls & ~pr_is_cls))
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        # Guard on the recall denominator itself: a class that is predicted but
        # never present in the ground truth has TP + FN == 0.
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        F1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        macro_F1 += weight * F1
    return macro_F1

# Class-weighted F1 of the out-of-fold predictions against the training labels.
macro_f1(y_true=df_train[target], y_pred=oof_pred)

0.6764523238103658

In [11]:
# Final test labels from the fold-averaged probabilities: when stage 1 picks its
# class 0, the label is the stage-2 argmax; otherwise it is the stage-1 argmax + 1.
y_pred = [
    np.argmax(y_pred2[row]) if np.argmax(stage1_proba) == 0 else np.argmax(stage1_proba) + 1
    for row, stage1_proba in enumerate(y_pred1)
]

In [12]:
submit_df = pd.read_csv('../data/preliminary_submit_dataset_b.csv')

# Keep the key columns of the submission template and attach the predictions by
# row position.
# NOTE(review): this relies on the template rows being in the same order as df_test - confirm.
sub = submit_df.loc[:, ['sn', 'fault_time']].assign(label=y_pred)
display(sub.head())

# Share of each predicted label in the submission.
sub['label'].value_counts() / len(sub)

Unnamed: 0,sn,fault_time,label
0,0015fe530ad4,2020-05-01 23:48:17,2
1,00380f1435b0,2020-07-28 07:51:13,3
2,0045a71d0221,2020-07-02 06:33:54,1
3,004d5a7954e7,2020-08-24 08:27:55,2
4,004d5a7954e7,2020-08-24 09:42:45,1


2    0.527723
3    0.202640
1    0.180198
0    0.089439
Name: label, dtype: float64

In [13]:
label1 = pd.read_csv('../data/preliminary_train_label_dataset.csv')
label2 = pd.read_csv('../data/preliminary_train_label_dataset_s.csv')

# Stack both label files, drop exact duplicate rows and renumber the index.
label_df = (
    pd.concat([label1, label2], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Share of each label in the training data, for comparison with the submission.
label_df['label'].value_counts() / len(label_df)

2    0.561311
1    0.203987
3    0.145808
0    0.088894
Name: label, dtype: float64

In [14]:
# Write the submission file without the index column.
sub.to_csv(path_or_buf='baseline3_gkf_sn_new.csv', index=False)