In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

# sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC,  LinearSVC
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.decomposition import PCA
import lightgbm as lgb
from sklearn.metrics import cohen_kappa_score, confusion_matrix, classification_report, precision_score
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.datasets import make_classification
import xgboost as xgb
rus = RandomUnderSampler(random_state=2021)

In [2]:
def calculate_precision(predict, real):
    cnt = 0
    print(type(predict), type(real))
    real = list(real)
    for i in range(len(predict)):
        if predict[i] == 1 and real[i] == 1:
            cnt += 1
    print(cnt, np.sum(predict))
    return cnt / np.sum(predict)

In [3]:
def train_plt(score):
    plt.barh(y=range(1, len(score)), width=score)
    plt.ylabel('epoch',fontsize=14)
    plt.xlabel('precision',fontsize=14)
    return np.mean(score), np.std(score)

In [4]:
def svm_classification(train, target, svm, k=5):
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=2021)
    y_predicts = []
    ys = []
    for i, (train_index, test_index) in enumerate(folds.split(train, target)):
        train_y, test_y = target[train_index], target[test_index]
        train_X, test_X = train.iloc[train_index, :], train.iloc[test_index, :]
        svm.fit(train_X, train_y)
        y_predict = svm.predict(test_X)
        y_predicts.append(y_predict)
        ys.append(test_y)
        print('epoch {} precision_score {}'.format(i, precision_score(test_y, y_predict, average='macro')))
    return y_predicts, ys
    
def svm_classification_2(X, svm, feat, k=5):
    y_predicts = []
    ys = []
    folds = KFold(n_splits=k, shuffle=True, random_state=2021)
    no = X['no'].drop_duplicates()
    for i, (idx_1, idx_2) in enumerate(folds.split(no)):
        trn_data = X[X['no'].isin(no.iloc[idx_1])]
        tst_data = X[X['no'].isin(no.iloc[idx_2])]
        train_X, train_y = trn_data[feat], trn_data['label']
        test_X, test_y = tst_data[feat], tst_data['label']
        svm.fit(train_X, train_y)
        y_predict = svm.predict(test_X)
        y_predicts.append(y_predict)
        ys.append(test_y)
        print('epoch {} precision_score {}'.format(i, precision_score(test_y, y_predict, average='macro')))
    return y_predicts, ys


In [5]:


def precision(y_true, y_predict):
    a = list(y_true > 0.5)
    b = list(y_predict.get_label())
    # print(a, b)
    score = precision_score(a, b)
    return 'precision', score, True


def lgb_classification(train, target, test, k=5):
    oof_preds = np.zeros((train.shape[0], ))
    oof_probs = np.zeros((train.shape[0], ))
    feature_importance_df = pd.DataFrame()
    offline_score = []
    output_preds = []
    aa = []
    xx = []
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=2021)
    for i, (train_index, test_index) in enumerate(folds.split(train, target)):
        train_y, test_y = target[train_index], target[test_index]
        train_X, test_X = train.iloc[train_index, :], train.iloc[test_index, :]
        dtrain = lgb.Dataset(train_X, label=train_y)
        dval = lgb.Dataset(test_X, label=test_y)

        parameters = {
            'learning_rate': 0.05,
            'boosting_type': 'gbdt',
            'objective': 'binary',
            # 'num_class': 2,
            'metric': 'None',
            'num_leaves': 63,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'min_data_in_leaf': 20,
            'verbose': -1,
            'nthread': 12,
            'random_state': 2021,

        }
        # 'metric': {'binary_logloss', 'auc'}
        lgb_model = lgb.train(
            parameters,
            dtrain,
            num_boost_round=5000,
            valid_sets=[dval],
            early_stopping_rounds=100,
            verbose_eval=100,
            feval=precision,
        )
        a = lgb_model.predict(
            test_X, num_iteration=lgb_model.best_iteration) > 0.5
        aa.append(a)
        xx.append(test_y)
        oof_preds[test_index] = lgb_model.predict(
            test_X, num_iteration=lgb_model.best_iteration) > 0.6
        output_preds.append(lgb_model.predict(
            test, num_iteration=lgb_model.best_iteration))
        offline_score.append(lgb_model.best_score['valid'])

        # feature importance
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = train.columns
        fold_importance_df["importance"] = lgb_model.feature_importance(
            importance_type='gain')
        fold_importance_df["fold"] = i + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
    print('feature importance:')
    a = feature_importance_df.groupby(
        ['feature'])['importance'].mean().sort_values(ascending=False)
    print(a.head(15))
    print('confusion matrix:')
    b = confusion_matrix(target, oof_preds)
    print(b)
    print('classfication report:')
    c = classification_report(target, oof_preds)
    print(c)
    # return oof_probs, np.mean(offline_score)
    return a, b, c, output_preds, aa, xx


def lgb_classification_2(X, test, feat, k=5):
    feature_importance_df = pd.DataFrame()
    offline_score = []
    output_preds = []
    aa = []
    xx = []
    folds = KFold(n_splits=k, shuffle=True, random_state=2021)
    no = X['no'].drop_duplicates()
    for i, (idx_1, idx_2) in enumerate(folds.split(no)):
        trn_data = X[X['no'].isin(no.iloc[idx_1])]
        tst_data = X[X['no'].isin(no.iloc[idx_2])]
        train_X, train_y = trn_data[feat], trn_data['label']
        test_X, test_y = tst_data[feat], tst_data['label']
        dtrain = lgb.Dataset(train_X, label=train_y)
        dval = lgb.Dataset(test_X, label=test_y)

        parameters = {
            'learning_rate': 0.05,
            'boosting_type': 'gbdt',
            'objective': 'binary',
            # 'num_class': 2,
            'metric': 'None',
            'num_leaves': 63,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'min_data_in_leaf': 20,
            'verbose': -1,
            'nthread': 12,
            'random_state': 2021,

        }
        # 'metric': {'binary_logloss', 'auc'}
        lgb_model = lgb.train(
            parameters,
            dtrain,
            num_boost_round=5000,
            valid_sets=[dval],
            early_stopping_rounds=100,
            verbose_eval=100,
            feval=precision,
        )
        a = lgb_model.predict(
            test_X, num_iteration=lgb_model.best_iteration) > 0.5
        aa.append(a)
        xx.append(test_y)
        output_preds.append(lgb_model.predict(
            test, num_iteration=lgb_model.best_iteration))
        offline_score.append(lgb_model.best_score['valid'])

        # feature importance
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feat
        fold_importance_df["importance"] = lgb_model.feature_importance(
            importance_type='gain')
        fold_importance_df["fold"] = i + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
    print('feature importance:')
    a = feature_importance_df.groupby(
        ['feature'])['importance'].mean().sort_values(ascending=False)
    print(a.head(15))
    # return oof_probs, np.mean(offline_score)
    return a, output_preds, aa, xx


In [6]:

def xgb_precision(y_true, y_predict):
    a = list(y_true > 0.5)
    b = list(y_predict.get_label())
    score = precision_score(a, b)
    return 'precision', score


def xgb_classification(X, test, feat, k=5):
    # feature_importance_df = pd.DataFrame()
    # offline_score = []
    output_preds = []
    # aa = []
    # xx = []
    folds = KFold(n_splits=k, shuffle=True, random_state=2021)
    no = X['no'].drop_duplicates()
    for i, (idx_1, idx_2) in enumerate(folds.split(no)):
        trn_data = X[X['no'].isin(no.iloc[idx_1])]
        tst_data = X[X['no'].isin(no.iloc[idx_2])]
        train_X, train_y = trn_data[feat], trn_data['label']
        test_X, test_y = tst_data[feat], tst_data['label']

        params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'gamma': 0.1,
            'max_depth': 8,
            # 'alpha': 0,
            # 'lambda': 0,
            'subsample': 0.7,
            'colsample_bytree': 0.5,
            'min_child_weight': 3,
            'silent': 0,
            'eta': 0.01,
            'nthread': -1,
            'seed': 2021,
        }

        dtrain = xgb.DMatrix(train_X, label=train_y)
        dtest = xgb.DMatrix(test_X, label=test_y)
        evals = [(dtrain, 'train'), (dtest, 'valid')]

        xgb_model = xgb.train(params, dtrain, num_boost_round=2000,
                              evals=evals, early_stopping_rounds=100, 
                              verbose_eval=100, feval=xgb_precision)
        oof_pred = xgb_model.predict(xgb.DMatrix(test), ntree_limit = xgb_model.best_ntree_limit)
        output_preds.append(oof_pred)
        y_predict = xgb_model.predict(xgb.DMatrix(test_X), ntree_limit = xgb_model.best_ntree_limit) > 0.5
        print('epoch {} precision : {}'.format(i, precision_score(test_y, y_predict)))
    return output_preds


In [None]:
df = pd.read_csv('res/final_13.csv')

In [146]:
def read_data_to_train(df, ile='../res/final_13.csv'):
    # df = pd.read_csv(file)
    y = df['label']
    no = df['no']
    no_cols = ['Unnamed: 0', 'label', 'no']
    # df = df[[i for i in df.columns if i not in no_cols]]
    print(df.shape)
    pca = PCA(n_components=0.9)
    X = pca.fit_transform(df[[i for i in df.columns if i not in no_cols]])
    X = pd.DataFrame(data=X, columns=list(range(X.shape[1])))
    X['no'] = no
    X['label'] = y
    print(X.shape)
    p_no = df[df['label'] == 1]['no'].drop_duplicates()

    ## 将所有正例作为验证集合
    all_p = X[X['no'].isin(p_no)]
    part_np = X[~X['no'].isin(p_no)].sample(n=all_p.shape[0], replace=False, random_state=2021)
    all_val = pd.concat([all_p, part_np])

    val_no = p_no.sample(n=p_no.shape[0]//10, replace=False, random_state=2021)
    print(val_no.shape, p_no.shape)
    p_no = p_no[~ p_no.isin(val_no)]
    # print(p_no)
    val_X = X[X['no'].isin(val_no)]
    trn_X = X[~ X['no'].isin(val_no)]
    feat = [i for i in trn_X.columns if i not in ['label', 'no']]
    trn_X['id'] = list(range(trn_X.shape[0]))
    X_resampled, y_resampled = rus.fit_resample(trn_X, trn_X['label'])
    trn_no = X_resampled['id'].drop_duplicates()
    print(trn_no.shape, p_no.shape)
    # a = trn_X[~trn_X['no'].isin(trn_no)]
    a = trn_X[~trn_X['id'].isin(trn_no)]
    print(sorted(Counter(y_resampled).items()),
          trn_X.shape, trn_no.shape, a.shape, val_X.shape)
    val_X_np = a.sample(n=val_X.shape[0], replace=False, random_state=2021)
    print(val_X_np[val_X_np['label'] == 1]['no'].drop_duplicates().shape)
    val_X = pd.concat([val_X, val_X_np])
    return val_X, X_resampled, y_resampled, feat, all_val


In [147]:
val_X, X_resampled, y_resampled, feat, all_val = read_data_to_train(df, 'res/final_13.csv')

(26329, 420)
(26329, 17)
(31,) (316,)
(10104,) (285,)
[(0, 5052), (1, 5052)] (25759, 18) (10104,) (15655, 18) (570, 17)
(0,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trn_X['id'] = list(range(trn_X.shape[0]))


In [148]:
all_val.shape

(11244, 17)

In [131]:
svm_clf = Pipeline(
    (
        ('scaler', StandardScaler()),
        ('svm_clf', SVC(kernel='rbf', C=10, gamma=0.1))
    )
)
y_predict, y_true = svm_classification_2(X_resampled, svm_clf, feat)
a = svm_clf.predict(val_X[feat])
print(calculate_precision(a, val_X['label']))

epoch 0 precision_score 0.5152961586811209
epoch 1 precision_score 0.5249839846252402
epoch 2 precision_score 0.5080113442631597
epoch 3 precision_score 0.5129904839433401
epoch 4 precision_score 0.5172058340496607
<class 'numpy.ndarray'> <class 'pandas.core.series.Series'>
396 752
0.526595744680851


In [155]:
a = svm_clf.predict(all_val[feat])
print(calculate_precision(a, all_val['label']))

<class 'numpy.ndarray'> <class 'pandas.core.series.Series'>
4566 8019
0.5693976805087916


In [150]:
a, d,e, f = lgb_classification_2(X_resampled, all_val[feat], feat)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's precision: 0.520926
Early stopping, best iteration is:
[63]	valid_0's precision: 0.532502
Training until validation scores don't improve for 100 rounds
[100]	valid_0's precision: 0.460152
Early stopping, best iteration is:
[1]	valid_0's precision: 0.521822
Training until validation scores don't improve for 100 rounds
[100]	valid_0's precision: 0.448615
[200]	valid_0's precision: 0.461126
[300]	valid_0's precision: 0.456658
Early stopping, best iteration is:
[242]	valid_0's precision: 0.468275
Training until validation scores don't improve for 100 rounds
[100]	valid_0's precision: 0.506944
Early stopping, best iteration is:
[1]	valid_0's precision: 0.956019
Training until validation scores don't improve for 100 rounds
[100]	valid_0's precision: 0.488789
Early stopping, best iteration is:
[1]	valid_0's precision: 0.676009
feature importance:
feature
0     2195.021768
11    2186.973496
6     1618.218329
1     16

In [152]:
x = np.mean(d, axis=0)
x = x > 0.5
print(calculate_precision(x, all_val['label']))

<class 'numpy.ndarray'> <class 'pandas.core.series.Series'>
4655 5306
0.8773087071240105


In [132]:
a = xgb_classification(X_resampled, val_X[feat], feat)

[0]	train-auc:0.635295	valid-auc:0.507017	train-precision:0.454823	valid-precision:0.342832
Multiple eval metrics have been passed: 'valid-precision' will be used for early stopping.

Will train until valid-precision hasn't improved in 100 rounds.


[08:57:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 168 extra nodes, 0 pruned nodes, max_depth=8
[08:57:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 100 extra nodes, 0 pruned nodes, max_depth=8
[08:57:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 160 extra nodes, 0 pruned nodes, max_depth=8
[08:57:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=8
[08:57:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 172 extra nodes, 0 pruned nodes, max_depth=8
[08:57:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 160 extra nodes, 0 pruned nodes, max_depth=8
[08:57:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 172 extra nodes, 0 pruned nodes, max_depth=8
[08:57:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 176 extra nodes, 0 pruned nodes, max_depth=8
[08:57:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 160 extra nodes, 2 pruned no

[100]	train-auc:0.881511	valid-auc:0.603274	train-precision:0.788241	valid-precision:0.560107
Stopping. Best iteration:
[0]	train-auc:0.635295	valid-auc:0.507017	train-precision:0.454823	valid-precision:0.342832



[08:57:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 162 extra nodes, 0 pruned nodes, max_depth=8
[08:57:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 174 extra nodes, 0 pruned nodes, max_depth=8
[08:57:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 102 extra nodes, 0 pruned nodes, max_depth=8
[08:57:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 164 extra nodes, 2 pruned nodes, max_depth=8
[08:57:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 174 extra nodes, 0 pruned nodes, max_depth=8
[08:57:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 160 extra nodes, 0 pruned nodes, max_depth=8
[08:57:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 180 extra nodes, 2 pruned nodes, max_depth=8


epoch 0 precision : 0.5223880597014925
[0]	train-auc:0.686462	valid-auc:0.510381	train-precision:0.64057	valid-precision:0.490512
Multiple eval metrics have been passed: 'valid-precision' will be used for early stopping.

Will train until valid-precision hasn't improved in 100 rounds.


[08:57:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 230 extra nodes, 2 pruned nodes, max_depth=8
[08:57:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 172 extra nodes, 0 pruned nodes, max_depth=8
[08:57:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=8
[08:57:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 134 extra nodes, 0 pruned nodes, max_depth=8
[08:57:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 192 extra nodes, 0 pruned nodes, max_depth=8
[08:57:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 194 extra nodes, 2 pruned nodes, max_depth=8
[08:57:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 180 extra nodes, 0 pruned nodes, max_depth=8
[08:57:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 156 extra nodes, 2 pruned nodes, max_depth=8
[08:57:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 168 extra nodes, 0 pruned no

[100]	train-auc:0.876027	valid-auc:0.53732	train-precision:0.777389	valid-precision:0.495256


[08:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 180 extra nodes, 4 pruned nodes, max_depth=8
[08:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 168 extra nodes, 0 pruned nodes, max_depth=8
[08:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 4 pruned nodes, max_depth=8
[08:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 164 extra nodes, 2 pruned nodes, max_depth=8
[08:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 190 extra nodes, 2 pruned nodes, max_depth=8
[08:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 114 extra nodes, 0 pruned nodes, max_depth=8
[08:57:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 176 extra nodes, 2 pruned nodes, max_depth=8
[08:57:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 146 extra nodes, 0 pruned nodes, max_depth=8


Stopping. Best iteration:
[3]	train-auc:0.756851	valid-auc:0.511828	train-precision:0.674587	valid-precision:0.472486

epoch 1 precision : 0.5102459016393442
[0]	train-auc:0.658734	valid-auc:0.557978	train-precision:0.441648	valid-precision:0.300268
Multiple eval metrics have been passed: 'valid-precision' will be used for early stopping.

Will train until valid-precision hasn't improved in 100 rounds.


[08:57:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 160 extra nodes, 0 pruned nodes, max_depth=8
[08:57:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 168 extra nodes, 0 pruned nodes, max_depth=8
[08:57:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 150 extra nodes, 0 pruned nodes, max_depth=8
[08:57:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 0 pruned nodes, max_depth=8
[08:57:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 150 extra nodes, 0 pruned nodes, max_depth=8
[08:57:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 136 extra nodes, 0 pruned nodes, max_depth=8
[08:57:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 140 extra nodes, 2 pruned nodes, max_depth=8
[08:57:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 194 extra nodes, 2 pruned nodes, max_depth=8
[08:57:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 118 extra nodes, 0 pruned no

[100]	train-auc:0.863325	valid-auc:0.567938	train-precision:0.708365	valid-precision:0.436997
Stopping. Best iteration:
[0]	train-auc:0.658734	valid-auc:0.557978	train-precision:0.441648	valid-precision:0.300268



[08:57:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 146 extra nodes, 0 pruned nodes, max_depth=8
[08:57:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 160 extra nodes, 0 pruned nodes, max_depth=8
[08:57:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 108 extra nodes, 0 pruned nodes, max_depth=8
[08:57:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 148 extra nodes, 2 pruned nodes, max_depth=8
[08:57:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 150 extra nodes, 0 pruned nodes, max_depth=8
[08:57:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 192 extra nodes, 2 pruned nodes, max_depth=8
[08:57:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=8


epoch 2 precision : 0.56
[0]	train-auc:0.661652	valid-auc:0.529505	train-precision:0.645415	valid-precision:0.537037
Multiple eval metrics have been passed: 'valid-precision' will be used for early stopping.

Will train until valid-precision hasn't improved in 100 rounds.


[08:57:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 224 extra nodes, 0 pruned nodes, max_depth=8
[08:57:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 156 extra nodes, 2 pruned nodes, max_depth=8
[08:57:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 216 extra nodes, 0 pruned nodes, max_depth=8
[08:57:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 156 extra nodes, 2 pruned nodes, max_depth=8
[08:57:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 182 extra nodes, 0 pruned nodes, max_depth=8
[08:57:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 192 extra nodes, 2 pruned nodes, max_depth=8
[08:57:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 224 extra nodes, 6 pruned nodes, max_depth=8
[08:57:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 180 extra nodes, 0 pruned nodes, max_depth=8
[08:57:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 154 extra nodes, 2 pruned no

[100]	train-auc:0.874581	valid-auc:0.559145	train-precision:0.844078	valid-precision:0.568287
Stopping. Best iteration:
[1]	train-auc:0.694175	valid-auc:0.53489	train-precision:0.66595	valid-precision:0.528935

epoch 3 precision : 0.4790356394129979
[0]	train-auc:0.638257	valid-auc:0.519514	train-precision:0.794231	valid-precision:0.727578
Multiple eval metrics have been passed: 'valid-precision' will be used for early stopping.

Will train until valid-precision hasn't improved in 100 rounds.


[08:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 176 extra nodes, 0 pruned nodes, max_depth=8
[08:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 160 extra nodes, 2 pruned nodes, max_depth=8
[08:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 164 extra nodes, 0 pruned nodes, max_depth=8
[08:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 144 extra nodes, 0 pruned nodes, max_depth=8
[08:57:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 158 extra nodes, 0 pruned nodes, max_depth=8
[08:57:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 186 extra nodes, 4 pruned nodes, max_depth=8
[08:57:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 180 extra nodes, 0 pruned nodes, max_depth=8
[08:57:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 186 extra nodes, 0 pruned nodes, max_depth=8
[08:57:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 192 extra nodes, 0 pruned no

[100]	train-auc:0.879237	valid-auc:0.530407	train-precision:0.818029	valid-precision:0.507848
Stopping. Best iteration:
[3]	train-auc:0.757584	valid-auc:0.532524	train-precision:0.677163	valid-precision:0.492152

epoch 4 precision : 0.5219976218787158


[08:57:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 144 extra nodes, 0 pruned nodes, max_depth=8
[08:57:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 186 extra nodes, 2 pruned nodes, max_depth=8
[08:57:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 86 extra nodes, 2 pruned nodes, max_depth=8
[08:57:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 148 extra nodes, 0 pruned nodes, max_depth=8
[08:57:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 148 extra nodes, 0 pruned nodes, max_depth=8
[08:57:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 164 extra nodes, 0 pruned nodes, max_depth=8


In [196]:
t = x
P = all_val[all_val['label'] == 1]
TP = all_val[(all_val['label'] == 1) & (all_val['label'] == t)]
FP = all_val[(all_val['label'] == 0) & (all_val['label'] != t)]
TP = TP[feat + ['no', 'label']]
FP = FP[feat + ['no', 'label']]

N = all_val[all_val['label'] == 0]
FN = all_val[(all_val['label'] == 1) & (all_val['label'] != t)]
TN = all_val[(all_val['label'] == 0) & (all_val['label'] == t)]
TN = TN[feat + ['no', 'label']]
FN = FN[feat + ['no', 'label']]

In [197]:
all_val.shape, TP.shape, FP.shape, TN.shape, FN.shape 

((11244, 17), (4655, 17), (651, 17), (4971, 17), (967, 17))

In [199]:
tp_no, fp_no = TP['no'].drop_duplicates(), FP['no'].drop_duplicates()
tn_no, fn_no = TN['no'].drop_duplicates(), FN['no'].drop_duplicates()

In [200]:
tp_no.shape[0], fp_no.shape[0], tn_no.shape[0], fn_no.shape[0]

(303, 261, 562, 159)

In [193]:
tp_ls = []
for i in tp_no.index:
    tp_ls.append(tp_no[i])
np_ls = []
for i in fp_no.index:
    np_ls.append(fp_no[i])

In [194]:
[i for i in tp_ls if i in np_ls]

[]

In [203]:
np_ls = sorted(np_ls, key=lambda x: int(x.split('_')[1]))

In [204]:
with open('np_no.txt', 'w') as f:
    for i in np_ls:
        f.writelines(i+'\n')

In [None]:
# 对比分错的no有多少是真正对的
