In [1]:
from pytorch_tabnet.tab_model import TabNetClassifier

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold

In [2]:
# Read the four pre-computed feature tables; each comes as a train/test pair.
df_train1 = pd.read_csv('train1.csv')
df_train2 = pd.read_csv('train2.csv')
df_train3 = pd.read_csv('train3.csv')
df_train4 = pd.read_csv('train4.csv')

df_test1 = pd.read_csv('test1.csv')
df_test2 = pd.read_csv('test2.csv')
df_test3 = pd.read_csv('test3.csv')
df_test4 = pd.read_csv('test4.csv')

# A fifth pair (train6.csv / test6.csv) used to be merged in the same way;
# it is currently disabled.

# Join the training tables on the sample key plus the label
# (default inner join: only keys present in every table survive).
df_train = (
    df_train1
    .merge(df_train2, on=['sn', 'fault_time', 'label'])
    .merge(df_train3, on=['sn', 'fault_time', 'label'])
    .merge(df_train4, on=['sn', 'fault_time', 'label'])
)

# The test tables carry no label, so they are joined on the sample key only.
df_test = (
    df_test1
    .merge(df_test2, on=['sn', 'fault_time'])
    .merge(df_test3, on=['sn', 'fault_time'])
    .merge(df_test4, on=['sn', 'fault_time'])
)

In [3]:
# Load the train/test tables stored under ../bert.
# NOTE(review): presumably these hold BERT-derived text features, one row per
# sample in the same order as the merged frames — confirm against the
# notebook that produced them.
bert_train = pd.read_csv('../bert/train.csv')
bert_test = pd.read_csv('../bert/test.csv')

In [4]:
# Append the BERT columns (everything from column position 9 onward) to the
# merged feature frames, side by side.
# NOTE(review): the first nine columns are skipped — presumably keys/label or
# columns already present in the merged frames; confirm the offset.
# NOTE(review): concat(axis=1) aligns rows on the index, not on sn/fault_time.
# This is only correct if both frames have the same row order and length;
# otherwise NaN-padded rows appear silently.
# NOTE(review): the cell reassigns df_train/df_test, so running it twice
# appends the BERT columns twice.
df_train = pd.concat([df_train, bert_train.iloc[:, 9:]], axis=1)
df_test = pd.concat([df_test, bert_test.iloc[:, 9:]], axis=1)

In [5]:
# Sanity check: (rows, columns) of the combined train and test frames.
df_train.shape, df_test.shape

((16604, 1610), (3011, 1609))

In [6]:
def macro_f1(y_true, y_pred, weights=None) -> float:
    """
    Compute the class-weighted macro F1 score.

    Class ``i`` is the integer label ``i``; its F1 is multiplied by
    ``weights[i]`` and the weighted values are summed.

    :param y_true: ground-truth labels (list, NumPy array or pandas Series);
        compared positionally, any pandas index is ignored
    :param y_pred: predicted labels, same length as ``y_true``
    :param weights: per-class weights, one entry per class; defaults to
        ``[3/7, 2/7, 1/7, 1/7]`` for the four classes 0..3
    :return: the weighted sum of per-class F1 scores (0 for empty input)
    :raises ValueError: if ``y_true`` and ``y_pred`` differ in length
    """
    if weights is None:
        weights = [3 / 7, 2 / 7, 1 / 7, 1 / 7]

    # Work on flat arrays so Series with arbitrary indexes are compared
    # element by element rather than aligned on their index.
    gt = np.asarray(y_true).ravel()
    pr = np.asarray(y_pred).ravel()
    if gt.shape != pr.shape:
        raise ValueError(
            f'y_true and y_pred must have the same length, got {gt.size} and {pr.size}'
        )

    macro_F1 = 0.
    for i, weight in enumerate(weights):
        is_gt = gt == i
        is_pr = pr == i
        TP = int(np.sum(is_gt & is_pr))
        FP = int(np.sum(~is_gt & is_pr))
        FN = int(np.sum(is_gt & ~is_pr))
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        # Guard on the recall denominator itself (TP + FN): a class that is
        # predicted but never present in y_true must yield 0, not a
        # division by zero.
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        F1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        macro_F1 += weight * F1
    return macro_F1

In [7]:
# Cross-validation / modelling configuration.
TARGET = 'label'  # name of the target column
FOLDS = 10        # number of cross-validation splits
NUM_CLASSES = df_train[TARGET].nunique()  # distinct labels seen in the training data

# Every column except the sample keys and the target is a model input.
use_features = [
    col
    for col in df_train.columns
    if col not in ('sn', 'fault_time', TARGET)
]

In [8]:
# df_train.fillna(df_train[use_features].mean()).dropna().shape

In [9]:
# Augmentation object handed to TabNetClassifier.fit(augmentations=aug) in run_ctb.
# NOTE(review): p=0.2 is presumably the probability of augmenting a sample
# within a batch — confirm against the pytorch-tabnet documentation.
# NOTE(review): this import lives outside the notebook's top import cell.
from pytorch_tabnet.augmentations import ClassificationSMOTE
aug = ClassificationSMOTE(p=0.2)

In [25]:
from sklearn.metrics import f1_score, accuracy_score
import gc

In [30]:
# Upper bound on training epochs per fold (early stopping may end sooner).
max_epochs = 50


def run_ctb(df_train, df_test, use_features):
    """
    Train one TabNetClassifier per GroupKFold split and collect predictions.

    Rows are grouped by the ``sn`` column so that all rows sharing an ``sn``
    fall in the same fold. Missing feature values in both frames are
    replaced by the per-column mean of ``df_train``.

    :param df_train: training frame containing ``use_features``, the
        ``TARGET`` column and an ``sn`` column
    :param df_test: test frame containing ``use_features``
    :param use_features: list of feature column names fed to the model
    :return: ``(oof_pred, y_pred)`` where ``oof_pred`` has shape
        ``(len(df_train), NUM_CLASSES)`` with each row's out-of-fold class
        probabilities, and ``y_pred`` has shape
        ``(len(df_test), NUM_CLASSES)`` with the test probabilities averaged
        over all folds
    """
    target = TARGET
    oof_pred = np.zeros((len(df_train), NUM_CLASSES))
    y_pred = np.zeros((len(df_test), NUM_CLASSES))

    # Imputation does not depend on the fold, so do it once up front instead
    # of recomputing the column means several times per fold.
    # NOTE(review): the means are taken over all training rows, including
    # each fold's validation rows — a mild leak into the validation score.
    # Kept as-is to preserve the existing results.
    feature_means = df_train[use_features].mean()
    x_all = df_train[use_features].fillna(feature_means)
    x_test = df_test[use_features].fillna(feature_means).values

    folds = GroupKFold(n_splits=FOLDS)
    for fold, (tr_ind, val_ind) in enumerate(folds.split(df_train, df_train[target], df_train['sn'])):
        print(f'Fold {fold + 1}')
        x_train, x_val = x_all.iloc[tr_ind], x_all.iloc[val_ind]
        y_train, y_val = df_train[target].iloc[tr_ind], df_train[target].iloc[val_ind]

        clf = TabNetClassifier(
            n_d=128, n_a=128, n_steps=10,
            gamma=1.5, n_independent=2, n_shared=2,
            cat_emb_dim=1,
            lambda_sparse=1e-4, momentum=0.3, clip_value=2.,
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=1e-3),
            scheduler_params={"gamma": 0.95,
                              "step_size": 20},
            scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15
        )

        # The training fold is also passed as an eval set so its logloss is
        # logged next to the validation logloss each epoch.
        clf.fit(
            X_train=x_train.values, y_train=y_train,
            eval_set=[(x_train.values, y_train), (x_val.values, y_val)],
            eval_name=['train', 'valid'],
            eval_metric=['logloss'],
            max_epochs=max_epochs, patience=10,
            batch_size=64, virtual_batch_size=64,
            augmentations=aug
        )

        oof_pred[val_ind] = clf.predict_proba(x_val.values)
        # Each fold contributes an equal share to the averaged test prediction.
        y_pred += clf.predict_proba(x_test) / folds.n_splits

        score = f1_score(y_val, oof_pred[val_ind].argmax(axis=1), average='macro')
        print(f'F1 score: {score}')

        # Release the fold's data and model before the next fold starts.
        del x_train, x_val, y_train, y_val, clf
        gc.collect()

    # Hand the predictions back to the caller instead of discarding them.
    return oof_pred, y_pred

In [31]:
# Run the GroupKFold TabNet training on the combined feature frames.
run_ctb(df_train, df_test, use_features)

Fold 1




epoch 0  | loss: 3.80785 | train_logloss: 1.734   | valid_logloss: 1.69046 |  0:00:29s
epoch 1  | loss: 2.98868 | train_logloss: 2.69588 | valid_logloss: 2.55277 |  0:00:59s
epoch 2  | loss: 2.85191 | train_logloss: 2.14613 | valid_logloss: 2.15315 |  0:01:29s


KeyboardInterrupt: 