In [1]:
import math
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import warnings
from datetime import datetime

warnings.filterwarnings('ignore')

# load data
annual_report_info = pd.read_csv('./train/annual_report_info.csv')
base_info = pd.read_csv('./train/base_info.csv')
change_info = pd.read_csv('./train/change_info.csv')
entprise_info = pd.read_csv('./train/entprise_info.csv')
news_info = pd.read_csv('./train/news_info.csv')
other_info = pd.read_csv('./train/other_info.csv')
tax_info = pd.read_csv('./train/tax_info.csv')
entprise_evaluate = pd.read_csv('./entprise_evaluate.csv')


data = pd.concat([entprise_info, entprise_evaluate.rename(columns={'score': 'label'})], ignore_index=True)

In [2]:
# extract features

def kfold_mean(df_train, df_test, target, target_mean_list):
    folds = StratifiedKFold(n_splits=5)

    mean_of_target = df_train[target].mean()

    for fold_, (trn_idx, val_idx) in tqdm(
            enumerate(folds.split(df_train, y=df_train['label']))):
        tr_x = df_train.iloc[trn_idx, :]
        vl_x = df_train.iloc[val_idx, :]

        for col in target_mean_list:
            df_train.loc[vl_x.index, f'{col}_target_enc'] = vl_x[col].map(
                tr_x.groupby(col)[target].mean())

    for col in target_mean_list:
        df_train[f'{col}_target_enc'].fillna(mean_of_target, inplace=True)

        df_test[f'{col}_target_enc'] = df_test[col].map(
            df_train.groupby(col)[f'{col}_target_enc'].mean())

        df_test[f'{col}_target_enc'].fillna(mean_of_target, inplace=True)
    return pd.concat([df_train, df_test], ignore_index=True)

In [3]:
def extract_base_info_info(data):
    data['district_FLAG1'] = (data['orgid'].fillna('').apply(lambda x: str(x)[:6]) ==
                              data['oplocdistrict'].fillna('').apply(lambda x: str(x)[:6])).astype(int)
    data['district_FLAG2'] = (data['orgid'].fillna('').apply(lambda x: str(x)[:6]) ==
                              data['jobid'].fillna('').apply(lambda x: str(x)[:6])).astype(int)
    data['district_FLAG3'] = (data['oplocdistrict'].fillna('').apply(lambda x: str(x)[:6]) ==
                              data['jobid'].fillna('').apply(lambda x: str(x)[:6])).astype(int)

    data['person_SUM'] = data[['empnum', 'parnum', 'exenum']].sum(1)
    data['person_NULL_SUM'] = data[['empnum', 'parnum', 'exenum']].isnull().astype(int).sum(1)

    data['empnum'] = data['empnum'].fillna(-1)
    data['compform'] = data['compform'].fillna(-1)
    data['parnum'] = data['parnum'].fillna(-1)
    data['exenum'] = data['exenum'].fillna(-1)
    data['opform'] = data['opform'].fillna('-1')
    data['venind'] = data['venind'].fillna(-1)
    data['reccap'] = data['reccap'].fillna(-1)
    data['regcap'] = data['regcap'].fillna(-1)
    data['venind_isna'] = (data['venind'] < 0).astype(np.int8)
    data['reccap_isna'] = (data['reccap'] < 0).astype(np.int8)
    data['regcap_isna'] = (data['regcap'] < 0).astype(np.int8)
    data['enttypeitem'] = data['enttypeitem'].fillna(-1)
    data['enttypeminu'] = data['enttypeminu'].fillna(-1)

    # 未缺失数目
    data['not_na_count'] = 0
    for i in range(len(data)):
        if not math.isnan(data['congro'][i]):
            data['not_na_count'][i] += 1
        if not math.isnan(data['protype'][i]):
            data['not_na_count'][i] += 1
        if not math.isnan(data['forreccap'][i]):
            data['not_na_count'][i] += 1
        if not math.isnan(data['forregcap'][i]):
            data['not_na_count'][i] += 1

    drop = ['ptbusscope', 'midpreindcode',
            'protype', 'forreccap',
            'forregcap', 'congro']
    for f in drop:
        del data[f]

    data['opto'] = data['opto'].fillna('3099-12-31')
    data['opto_isna'] = (data['opto'].apply(lambda x: int(x.split('-')[0]) == 3099)).astype(np.int8)
    data['year_to'] = data['opto'].apply(lambda x: int(x.split('-')[0]))
    data['year_from'] = data['opfrom'].apply(lambda x: int(x.split('-')[0]))
    data['year_range'] = data['year_to'] - data['year_from']
    del data['opfrom']
    del data['opto']
    data['cap_diff'] = data['reccap'] - data['regcap']

    data['dom_len'] = data['dom'].apply(lambda x: len(x) / 16)
    data['oploc_len'] = data['oploc'].apply(lambda x: len(x) / 16)
    data['oploc'] = data['oploc'].apply(lambda x: x[:16])
    data['len_diff'] = data.apply(lambda x: x['oploc_len'] - x['dom_len'], axis=1)
    data['opscope_legal'] = data['opscope'].apply(lambda x: 1 if '依法须经批准的项目' in x else 0)
    del data['dom'], data['opscope']

    lb = LabelEncoder()
    data['industryphy'] = lb.fit_transform(data['industryphy'])
    data['opform'] = lb.fit_transform(data['opform'])
    data['oploc'] = lb.fit_transform(data['oploc'])
    data['industryco'] = lb.fit_transform(data['industryco'])

    # count encode
    for col in ['oplocdistrict', 'industryphy', 'enttype', 'state', 'orgid',
                'adbusign', 'townsign', 'oploc', 'regtype', 'enttypegb',
                'enttypeitem', 'enttypeminu', 'jobid', 'industryco']:
        data[col + '_COUNT'] = data[col].map(data[col].value_counts())
        col_idx = data[col].value_counts()
        for idx in col_idx[col_idx < 10].index:
            data[col] = data[col].replace(idx, -1)

    data = kfold_mean(data[~data['label'].isna()], data[data['label'].isna()], 'label',
                      ['oplocdistrict', 'industryphy', 'enttype', 'state', 'orgid',
                       'adbusign', 'townsign', 'oploc', 'regtype', 'enttypegb',
                       'enttypeitem', 'enttypeminu', 'jobid', 'industryco'])

    return data

In [4]:
def extract_annual_report_info(annual_report_):
    annual_report = annual_report_.copy().sort_values(by=['id', 'ANCHEYEAR'])
    result = annual_report.groupby('id').agg(
        {
            'ANCHEYEAR': ['nunique', lambda x: x.diff().max(), lambda x: x.diff().mean()],
            'EMPNUM': ['mean', 'std', lambda x: x.diff().mean(), lambda x: x.diff().std()],
            'COLGRANUM': ['mean', 'std'],
            'RETSOLNUM': ['mean', 'std'],
            'DISPERNUM': ['mean', 'std'],
            'UNENUM': ['mean', 'std'],
            'COLEMPLNUM': ['mean', 'std'],
            'RETEMPLNUM': ['mean', 'std'],
            'DISEMPLNUM': ['mean', 'std'],
            'UNEEMPLNUM': ['mean', 'std']
        }
    ).reset_index()
    result.columns = ['id',
                      'year_nunique',
                      'year_diff_max',
                      'year_diff_mean',
                      'EMPNUM_mean',
                      'EMPNUM_std',
                      'EMPNUM_diff_mean',
                      'EMPNUM_diff_std',
                      'COLGRANUM_mean',
                      'COLGRANUM_std',
                      'RETSOLNUM_mean',
                      'RETSOLNUM_std',
                      'DISPERNUM_mean',
                      'DISPERNUM_std',
                      'UNENUM_mean',
                      'UNENUM_std',
                      'COLEMPLNUM_mean',
                      'COLEMPLNUM_std',
                      'RETEMPLNUM_mean',
                      'RETEMPLNUM_std',
                      'DISEMPLNUM_mean',
                      'DISEMPLNUM_std',
                      'UNEEMPLNUM_mean',
                      'UNEEMPLNUM_std']

    for f in ['STATE', 'BUSSTNAME', 'PUBSTATE', 'EMPNUMSIGN', 'WEBSITSIGN', 'FORINVESTSIGN', 'STOCKTRANSIGN']:
        result[[f'{f}_count_vec_{i}' for i in range(annual_report[f].nunique(dropna=False))]] = pd.DataFrame(
            CountVectorizer(vocabulary=annual_report[f].unique().astype(str)).fit_transform(
                annual_report.groupby('id')[f].apply(lambda x: ' '.join(list(x.astype(str))))).toarray(),
            index=result.index)
    return result

In [5]:
def extract_change_info(change):
    result = change.groupby('id').agg(
        {
            'bgxmdm': ['nunique', lambda x: x.nunique() / len(x)],
            'bgq': ['nunique', lambda x: x.nunique() / len(x)],
            'bgh': ['nunique', lambda x: x.nunique() / len(x)]
        }
    ).reset_index()
    result.columns = ['id', 'bgxmdm_nunique', 'bgxmdm_nunique_rto', 'bgq_nunique', 'bgq_nunique_rto', 'bgh_nunique',
                      'bgh_nunique_rto']
    return result


def extract_other_info(others):
    others['other_SUM'] = others[['legal_judgment_num', 'brand_num', 'patent_num']].sum(1)
    others['other_NULL_SUM'] = others[['legal_judgment_num', 'brand_num', 'patent_num']].isnull().astype(
        int).sum(1)
    result = others.groupby('id').agg(
        {
            'other_SUM': ['mean'],
            'other_NULL_SUM': ['mean']
        }
    ).reset_index()
    result.columns = ['id',
                      'other_SUM',
                      'other_NULL_SUM']
    return result


In [6]:
def extract_news_info(news):
    news_info['public_date'] = news_info['public_date'].apply(lambda x: x if '-' in str(x) else np.nan)
    news_info['public_date'] = pd.to_datetime(news_info['public_date'])
    news_info['public_date'] = (datetime.now() - news_info['public_date']).dt.days

    result = news.groupby('id').agg({'public_date': ['count']}).reset_index()
    result.columns = ['id', 'public_date_COUNT']
    for f in ['positive_negtive']:
        result[[f'{f}_count_vec_{i}' for i in range(news[f].nunique(dropna=False))]] = pd.DataFrame(
            CountVectorizer(vocabulary=news[f].unique().astype(str)).fit_transform(
                news.groupby('id')[f].apply(lambda x: ' '.join(list(x.astype(str))))).toarray(),
            index=result.index)
    return result

In [7]:
def extract_tax_info(tax):
    tax['income'] = tax['TAX_AMOUNT'] / tax['TAX_RATE']
    result = tax.groupby('id').agg(
        {
            'TAX_CATEGORIES': ['count'],
            'TAX_ITEMS': ['count'],
            'TAXATION_BASIS': ['mean', 'std', 'count'],
            'TAX_RATE': ['mean', 'std'],
            'TAX_AMOUNT': ['mean', 'std', 'max', 'min'],
        }
    ).reset_index()
    result.columns = ['id',
                      'TAX_CATEGORIES_count',
                      'TAX_ITEMS_count',
                      'TAXATION_BASIS_mean',
                      'TAXATION_BASIS_std',
                      'TAXATION_BASIS_count',
                      'TAX_RATE_mean',
                      'TAX_RATE_std',
                      'TAX_AMOUNT_mean',
                      'TAX_AMOUNT_std',
                      'TAX_AMOUNT_max',
                      'TAX_AMOUNT_min',
                      ]

    for f in ['TAX_CATEGORIES']:
        result[[f'{f}_count_vec_{i}' for i in range(tax[f].nunique(dropna=False))]] = pd.DataFrame(
            CountVectorizer(vocabulary=tax[f].unique().astype(str)).fit_transform(
                tax.groupby('id')[f].apply(lambda x: ' '.join(list(x.astype(str))))).toarray(),
            index=result.index)

    # tax_items_tfidf
    tax_items_tfidf = TfidfVectorizer(vocabulary=tax['TAX_ITEMS'].unique().astype(str), min_df=20).fit_transform(
        tax.groupby('id').apply(lambda x: ' '.join(list(x))).tolist())

    lda = LatentDirichletAllocation(n_jobs=-1, n_components=8)
    result[[
        f'lda_tax_items{i + 1}' for i in range(lda.n_components)
    ]] = pd.DataFrame(lda.fit_transform(
        tax_items_tfidf),
        index=result.index)

    nmf = NMF(n_components=8)
    result[[
        f'nmf_tax_items{i + 1}' for i in range(nmf.n_components)
    ]] = pd.DataFrame(nmf.fit_transform(
        tax_items_tfidf),
        index=result.index)

    svd = TruncatedSVD(n_components=8)
    result[[
        f'svd_tax_items{i + 1}' for i in range(svd.n_components)
    ]] = pd.DataFrame(svd.fit_transform(
        tax_items_tfidf),
        index=result.index)

    return result

In [8]:
data = pd.merge(left=data, right=base_info, how='left', on='id')
data = extract_base_info_info(data)
data = data.merge(pd.read_csv('tfidf_decomposition.csv'), how='left', on='id')


def combine_info(data, info_data, extract_func):
    info = extract_func(info_data)
    data = data.merge(info, on='id', how='left')
    return data


for info_data, extract_func in tqdm([
    (annual_report_info, extract_annual_report_info),
    (change_info, extract_change_info),
    (tax_info, extract_tax_info),
    # (other_info, extract_other_info),
    (news_info, extract_news_info),
], desc='extract features'):
    data = combine_info(data, info_data, extract_func)


def filter_col_by_nan(df, ratio=0.05):
    cols = []
    for col in df.columns:
        if df[col].isna().mean() >= (1 - ratio):
            cols.append(col)
    return cols

5it [00:00, 43.85it/s]
extract features: 100%|██████████| 4/4 [00:14<00:00,  3.67s/it]


In [9]:
drop_columns = ['id', 'label']
cat_features = ['oplocdistrict', 'industryphy', 'enttype', 'state',
                'opto_isna', 'industryco', 'oploc', 'orgid', 'adbusign',
                'townsign', 'regtype', 'enttypegb', 'compform', 'jobid',
                'venind', 'enttypeitem', 'enttypeminu', 'opform',
                'opscope_legal', 'regcap_isna', 'reccap_isna', 'venind_isna']
data[cat_features] = data[cat_features].astype('category')
num_features = [f for f in data.columns if f not in drop_columns + cat_features]
data = data.drop(filter_col_by_nan(data[num_features], 0.01), axis=1)
data[num_features] = data[num_features].fillna(-1)
features = num_features
train, X_submit = data[~data['label'].isna()], data[data['label'].isna()]

train['sample_weight'] = 1


# train & predict
def my_metric(y_true, y_pred, sample_weight=None):
    f1 = f1_score(y_true, np.where(y_pred >= 0.5, 1, 0), sample_weight=sample_weight)
    p = precision_score(y_true, np.where(y_pred >= 0.5, 1, 0), sample_weight=sample_weight)
    r = recall_score(y_true, np.where(y_pred >= 0.5, 1, 0), sample_weight=sample_weight)
    return 0.5 * p + 0.3 * r + 0.2 * f1

In [10]:
def kfold_xgb(train, X_submit, target, param, seed=2023):
    kfout = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    kfin = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    y_pred = target - target
    y_submit = 0

    for fold, (train_index, test_index) in tqdm(enumerate(kfout.split(train, target))):
        X, X_test = train.iloc[train_index], train.iloc[test_index]
        y, y_test = target.iloc[train_index], target.iloc[test_index]
        y_test_hat = 0
        for train_index, val_index in kfin.split(X, y):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            model = xgb.XGBClassifier(n_estimators=2023,
                                      random_state=seed,
                                      n_jobs=-1,
                                      tree_method='hist',
                                      **param
                                      )
            model.fit(X_train[features], y_train, sample_weight=X_train['sample_weight'],
                      eval_set=[(X_train[features], y_train), (X_val[features], y_val)],
                      eval_metric=['auc'], sample_weight_eval_set=[X_train['sample_weight'], X_val['sample_weight']],
                      early_stopping_rounds=25, verbose=2)
            y_test_hat += model.predict_proba(X_test[features])[:, 1] / kfin.n_splits
            y_submit += model.predict_proba(X_submit[features])[:, 1] / (kfin.n_splits * kfout.n_splits)

        fold_f1 = my_metric(y_test, np.where(y_test_hat >= 0.5, 1, 0), sample_weight=X_test['sample_weight'])
        print(f'{fold + 1}fold f1: {fold_f1}')
        y_pred.loc[y_test.index] = y_test_hat

    f1 = my_metric(target, np.where(y_pred >= 0.5, 1, 0), sample_weight=train['sample_weight'])
    auc = roc_auc_score(target, y_pred, sample_weight=train['sample_weight'])
    print(f'all f1: {f1}, auc: {auc}')
    return y_pred, y_submit

In [11]:
y_pred_all_para = []
y_submit_all_para = []
params = [
    {'max_depth': 12,
     'learning_rate': np.random.uniform(0.004, 0.009),
     'colsample_bytree': np.random.uniform(0.7, 0.85),
     'min_child_weight': 0,
     'reg_alpha': np.random.uniform(0.66, 0.88),
     'reg_lambda': np.random.uniform(1.2, 1.5),
     'subsample': 0.992},
    {'max_depth': 18,
     'learning_rate': np.random.uniform(0.025, 0.065),
     'colsample_bytree': np.random.uniform(0.85, 1),
     'min_child_weight': 0,
     'reg_alpha': np.random.uniform(0.15, 0.55),
     'reg_lambda': np.random.uniform(0.6, 0.9),
     'subsample': 0.996}
]
for i, param in tqdm(enumerate(params)):
    y_pred = 0
    y_submit = 0

    y_pred_seed, y_submit_seed = kfold_xgb(train, X_submit, train['label'], param)
    y_pred += y_pred_seed
    y_submit += y_submit_seed

    f1 = my_metric(train['label'], np.where(y_pred >= 0.5, 1, 0), sample_weight=train['sample_weight'])
    auc = roc_auc_score(train['label'], y_pred, sample_weight=train['sample_weight'])
    print(f'round {i + 1},  all seed f1: {f1}, all seed auc: {auc}')

    y_pred_all_para.append(y_pred)
    y_submit_all_para.append(y_submit)

0it [00:00, ?it/s]
0it [00:00, ?it/s][A

[0]	validation_0-auc:0.98678	validation_1-auc:0.99214
[2]	validation_0-auc:0.99669	validation_1-auc:0.99277
[4]	validation_0-auc:0.99682	validation_1-auc:0.99333
[6]	validation_0-auc:0.99683	validation_1-auc:0.99333
[8]	validation_0-auc:0.99686	validation_1-auc:0.99344
[10]	validation_0-auc:0.99684	validation_1-auc:0.99349
[12]	validation_0-auc:0.99681	validation_1-auc:0.99359
[14]	validation_0-auc:0.99685	validation_1-auc:0.99351
[16]	validation_0-auc:0.99683	validation_1-auc:0.99357
[18]	validation_0-auc:0.99694	validation_1-auc:0.99363
[20]	validation_0-auc:0.99693	validation_1-auc:0.99367
[22]	validation_0-auc:0.99692	validation_1-auc:0.99379
[24]	validation_0-auc:0.99693	validation_1-auc:0.99387
[26]	validation_0-auc:0.99694	validation_1-auc:0.99387
[28]	validation_0-auc:0.99694	validation_1-auc:0.99383
[30]	validation_0-auc:0.99703	validation_1-auc:0.99400
[32]	validation_0-auc:0.99702	validation_1-auc:0.99391
[34]	validation_0-auc:0.99720	validation_1-auc:0.99390
[36]	validation


1it [01:08, 68.03s/it][A

1fold f1: 0.8425922323184983
[0]	validation_0-auc:0.98954	validation_1-auc:0.98899
[2]	validation_0-auc:0.99645	validation_1-auc:0.99347
[4]	validation_0-auc:0.99628	validation_1-auc:0.99405
[6]	validation_0-auc:0.99632	validation_1-auc:0.99403
[8]	validation_0-auc:0.99647	validation_1-auc:0.99415
[10]	validation_0-auc:0.99647	validation_1-auc:0.99410
[12]	validation_0-auc:0.99656	validation_1-auc:0.99412
[14]	validation_0-auc:0.99665	validation_1-auc:0.99411
[16]	validation_0-auc:0.99663	validation_1-auc:0.99403
[18]	validation_0-auc:0.99662	validation_1-auc:0.99401
[20]	validation_0-auc:0.99664	validation_1-auc:0.99400
[22]	validation_0-auc:0.99680	validation_1-auc:0.99405
[24]	validation_0-auc:0.99682	validation_1-auc:0.99412
[26]	validation_0-auc:0.99686	validation_1-auc:0.99416
[28]	validation_0-auc:0.99693	validation_1-auc:0.99405
[30]	validation_0-auc:0.99693	validation_1-auc:0.99407
[32]	validation_0-auc:0.99692	validation_1-auc:0.99403
[34]	validation_0-auc:0.99711	validation_


2it [01:36, 44.86s/it][A

2fold f1: 0.8266328339846577
[0]	validation_0-auc:0.98988	validation_1-auc:0.98600
[2]	validation_0-auc:0.99790	validation_1-auc:0.99269
[4]	validation_0-auc:0.99793	validation_1-auc:0.99323
[6]	validation_0-auc:0.99793	validation_1-auc:0.99322
[8]	validation_0-auc:0.99797	validation_1-auc:0.99338
[10]	validation_0-auc:0.99792	validation_1-auc:0.99333
[12]	validation_0-auc:0.99795	validation_1-auc:0.99332
[14]	validation_0-auc:0.99806	validation_1-auc:0.99328
[16]	validation_0-auc:0.99810	validation_1-auc:0.99337
[18]	validation_0-auc:0.99829	validation_1-auc:0.99400
[20]	validation_0-auc:0.99826	validation_1-auc:0.99398
[22]	validation_0-auc:0.99828	validation_1-auc:0.99402
[24]	validation_0-auc:0.99825	validation_1-auc:0.99399
[26]	validation_0-auc:0.99832	validation_1-auc:0.99390
[28]	validation_0-auc:0.99838	validation_1-auc:0.99383
[30]	validation_0-auc:0.99839	validation_1-auc:0.99382
[32]	validation_0-auc:0.99840	validation_1-auc:0.99382
[34]	validation_0-auc:0.99860	validation_


3it [02:11, 40.09s/it][A

3fold f1: 0.8611589453827924
[0]	validation_0-auc:0.98861	validation_1-auc:0.98273
[2]	validation_0-auc:0.99577	validation_1-auc:0.99152
[4]	validation_0-auc:0.99569	validation_1-auc:0.99172
[6]	validation_0-auc:0.99559	validation_1-auc:0.99235
[8]	validation_0-auc:0.99618	validation_1-auc:0.99204
[10]	validation_0-auc:0.99630	validation_1-auc:0.99230
[12]	validation_0-auc:0.99624	validation_1-auc:0.99290
[14]	validation_0-auc:0.99626	validation_1-auc:0.99283
[16]	validation_0-auc:0.99637	validation_1-auc:0.99294
[18]	validation_0-auc:0.99636	validation_1-auc:0.99286
[20]	validation_0-auc:0.99645	validation_1-auc:0.99278
[22]	validation_0-auc:0.99652	validation_1-auc:0.99291
[24]	validation_0-auc:0.99658	validation_1-auc:0.99273
[26]	validation_0-auc:0.99670	validation_1-auc:0.99296
[28]	validation_0-auc:0.99669	validation_1-auc:0.99291
[30]	validation_0-auc:0.99675	validation_1-auc:0.99301
[32]	validation_0-auc:0.99684	validation_1-auc:0.99300
[34]	validation_0-auc:0.99702	validation_


4it [03:12, 48.56s/it][A

4fold f1: 0.8518504321973577
[0]	validation_0-auc:0.98989	validation_1-auc:0.98363
[2]	validation_0-auc:0.99684	validation_1-auc:0.98994
[4]	validation_0-auc:0.99661	validation_1-auc:0.98996
[6]	validation_0-auc:0.99683	validation_1-auc:0.99048
[8]	validation_0-auc:0.99678	validation_1-auc:0.99047
[10]	validation_0-auc:0.99674	validation_1-auc:0.99055
[12]	validation_0-auc:0.99671	validation_1-auc:0.99048
[14]	validation_0-auc:0.99670	validation_1-auc:0.99052
[16]	validation_0-auc:0.99669	validation_1-auc:0.99045
[18]	validation_0-auc:0.99669	validation_1-auc:0.99061
[20]	validation_0-auc:0.99668	validation_1-auc:0.99061
[22]	validation_0-auc:0.99677	validation_1-auc:0.99097
[24]	validation_0-auc:0.99680	validation_1-auc:0.99089
[26]	validation_0-auc:0.99706	validation_1-auc:0.99120
[28]	validation_0-auc:0.99722	validation_1-auc:0.99136
[30]	validation_0-auc:0.99738	validation_1-auc:0.99139
[32]	validation_0-auc:0.99736	validation_1-auc:0.99139
[34]	validation_0-auc:0.99743	validation_


5it [03:49, 45.90s/it][A
1it [03:49, 229.58s/it]

5fold f1: 0.8367531009107304
all f1: 0.8429647996620473, auc: 0.9818195821442909
round 1,  all seed f1: 0.8429647996620473, all seed auc: 0.9818195821442909



0it [00:00, ?it/s][A

[0]	validation_0-auc:0.98992	validation_1-auc:0.99184
[2]	validation_0-auc:0.99099	validation_1-auc:0.99265
[4]	validation_0-auc:0.99103	validation_1-auc:0.99308
[6]	validation_0-auc:0.99185	validation_1-auc:0.99299
[8]	validation_0-auc:0.99212	validation_1-auc:0.99332
[10]	validation_0-auc:0.99290	validation_1-auc:0.99331
[12]	validation_0-auc:0.99307	validation_1-auc:0.99357
[14]	validation_0-auc:0.99312	validation_1-auc:0.99357
[16]	validation_0-auc:0.99317	validation_1-auc:0.99349
[18]	validation_0-auc:0.99321	validation_1-auc:0.99358
[20]	validation_0-auc:0.99478	validation_1-auc:0.99392
[22]	validation_0-auc:0.99482	validation_1-auc:0.99405
[24]	validation_0-auc:0.99561	validation_1-auc:0.99403
[26]	validation_0-auc:0.99565	validation_1-auc:0.99418
[28]	validation_0-auc:0.99645	validation_1-auc:0.99423
[30]	validation_0-auc:0.99885	validation_1-auc:0.99434
[32]	validation_0-auc:0.99893	validation_1-auc:0.99442
[34]	validation_0-auc:0.99898	validation_1-auc:0.99447
[36]	validation


1it [00:47, 47.94s/it][A

1fold f1: 0.8501013931017924
[0]	validation_0-auc:0.99261	validation_1-auc:0.98979
[2]	validation_0-auc:0.99280	validation_1-auc:0.99311
[4]	validation_0-auc:0.99339	validation_1-auc:0.99316
[6]	validation_0-auc:0.99350	validation_1-auc:0.99327
[8]	validation_0-auc:0.99618	validation_1-auc:0.99341
[10]	validation_0-auc:0.99632	validation_1-auc:0.99319
[12]	validation_0-auc:0.99639	validation_1-auc:0.99315
[14]	validation_0-auc:0.99725	validation_1-auc:0.99311
[16]	validation_0-auc:0.99730	validation_1-auc:0.99330
[18]	validation_0-auc:0.99734	validation_1-auc:0.99328
[20]	validation_0-auc:0.99738	validation_1-auc:0.99330
[22]	validation_0-auc:0.99741	validation_1-auc:0.99347
[24]	validation_0-auc:0.99743	validation_1-auc:0.99345
[26]	validation_0-auc:0.99744	validation_1-auc:0.99352
[28]	validation_0-auc:0.99745	validation_1-auc:0.99360
[30]	validation_0-auc:0.99822	validation_1-auc:0.99362
[32]	validation_0-auc:0.99900	validation_1-auc:0.99366
[34]	validation_0-auc:0.99903	validation_


2it [01:38, 49.59s/it][A

2fold f1: 0.8031216931216931
[0]	validation_0-auc:0.99139	validation_1-auc:0.98604
[2]	validation_0-auc:0.99142	validation_1-auc:0.98634
[4]	validation_0-auc:0.99314	validation_1-auc:0.98950
[6]	validation_0-auc:0.99316	validation_1-auc:0.98962
[8]	validation_0-auc:0.99471	validation_1-auc:0.98997
[10]	validation_0-auc:0.99701	validation_1-auc:0.98989
[12]	validation_0-auc:0.99704	validation_1-auc:0.98990
[14]	validation_0-auc:0.99719	validation_1-auc:0.98979
[16]	validation_0-auc:0.99726	validation_1-auc:0.99310
[18]	validation_0-auc:0.99728	validation_1-auc:0.99324
[20]	validation_0-auc:0.99733	validation_1-auc:0.99341
[22]	validation_0-auc:0.99966	validation_1-auc:0.99341
[24]	validation_0-auc:0.99974	validation_1-auc:0.99346
[26]	validation_0-auc:0.99975	validation_1-auc:0.99352
[28]	validation_0-auc:0.99981	validation_1-auc:0.99352
[30]	validation_0-auc:0.99986	validation_1-auc:0.99360
[32]	validation_0-auc:0.99989	validation_1-auc:0.99357
[34]	validation_0-auc:0.99991	validation_


3it [02:31, 50.85s/it][A

3fold f1: 0.874140181321642
[0]	validation_0-auc:0.99333	validation_1-auc:0.98158
[2]	validation_0-auc:0.99750	validation_1-auc:0.98193
[4]	validation_0-auc:0.99759	validation_1-auc:0.98205
[6]	validation_0-auc:0.99777	validation_1-auc:0.98228
[8]	validation_0-auc:0.99781	validation_1-auc:0.98290
[10]	validation_0-auc:0.99783	validation_1-auc:0.98282
[12]	validation_0-auc:0.99790	validation_1-auc:0.98275
[14]	validation_0-auc:0.99795	validation_1-auc:0.98273
[16]	validation_0-auc:0.99800	validation_1-auc:0.98290
[18]	validation_0-auc:0.99803	validation_1-auc:0.98305
[20]	validation_0-auc:0.99805	validation_1-auc:0.98323
[22]	validation_0-auc:0.99816	validation_1-auc:0.98322
[24]	validation_0-auc:0.99823	validation_1-auc:0.98334
[26]	validation_0-auc:0.99902	validation_1-auc:0.98340
[28]	validation_0-auc:0.99905	validation_1-auc:0.98358
[30]	validation_0-auc:0.99906	validation_1-auc:0.98353
[32]	validation_0-auc:0.99907	validation_1-auc:0.98356
[34]	validation_0-auc:0.99907	validation_1


4it [03:35, 56.26s/it][A

4fold f1: 0.8468398268398267
[0]	validation_0-auc:0.99180	validation_1-auc:0.98257
[2]	validation_0-auc:0.99512	validation_1-auc:0.98566
[4]	validation_0-auc:0.99548	validation_1-auc:0.98712
[6]	validation_0-auc:0.99611	validation_1-auc:0.98777
[8]	validation_0-auc:0.99635	validation_1-auc:0.98794
[10]	validation_0-auc:0.99653	validation_1-auc:0.98801
[12]	validation_0-auc:0.99658	validation_1-auc:0.98819
[14]	validation_0-auc:0.99660	validation_1-auc:0.98809
[16]	validation_0-auc:0.99662	validation_1-auc:0.98814
[18]	validation_0-auc:0.99664	validation_1-auc:0.98818
[20]	validation_0-auc:0.99666	validation_1-auc:0.98833
[22]	validation_0-auc:0.99667	validation_1-auc:0.98830
[24]	validation_0-auc:0.99744	validation_1-auc:0.98839
[26]	validation_0-auc:0.99745	validation_1-auc:0.98850
[28]	validation_0-auc:0.99746	validation_1-auc:0.98852
[30]	validation_0-auc:0.99747	validation_1-auc:0.98850
[32]	validation_0-auc:0.99748	validation_1-auc:0.98863
[34]	validation_0-auc:0.99900	validation_


5it [04:29, 53.99s/it][A
2it [08:19, 249.80s/it]

5fold f1: 0.8386185243328101
all f1: 0.8419558431325924, auc: 0.9916691776422732
round 2,  all seed f1: 0.8419558431325924, all seed auc: 0.9916691776422732





In [12]:
y_pred_all_para_mean = np.column_stack(y_pred_all_para).mean(axis=1)
y_submit_all_para_mean = np.column_stack(y_submit_all_para).mean(axis=1)
f1 = my_metric(train['label'], np.where(y_pred_all_para_mean >= 0.5, 1, 0), sample_weight=train['sample_weight'])
auc = roc_auc_score(train['label'], y_pred_all_para_mean, sample_weight=train['sample_weight'])
print(f'all para mean f1: {f1}, all para mean auc: {auc}')

all para mean f1: 0.8449814618621958, all para mean auc: 0.9895034611816387


In [13]:
def find_best_t(y_pred, y_true):
    best_score = my_metric(y_true, np.where(y_pred >= 0.5, 1, 0))
    t = 0.5
    for i in tqdm(range(450, 551)):
        current_score = my_metric(y_true, np.where(y_pred >= i / 1000, 1, 0))
        if current_score > best_score:
            best_score = current_score
            t = i / 1000
            print(f'best score: {best_score}, best t: {t}')
    return t


t = find_best_t(y_pred_all_para_mean, train['label'])

  8%|▊         | 8/101 [00:00<00:01, 77.39it/s]

best score: 0.8451833275618137, best t: 0.458
best score: 0.8456652599477683, best t: 0.459


 16%|█▌        | 16/101 [00:00<00:01, 76.18it/s]

best score: 0.8461108331157343, best t: 0.466
best score: 0.8465967689641821, best t: 0.467
best score: 0.8470835675813039, best t: 0.469
best score: 0.8475712313607365, best t: 0.47


 24%|██▍       | 24/101 [00:00<00:01, 72.88it/s]

best score: 0.8480475692434963, best t: 0.472
best score: 0.8490295950367738, best t: 0.473


100%|██████████| 101/101 [00:01<00:00, 74.61it/s]


In [15]:
submit = pd.read_csv('./entprise_evaluate.csv')
submit['score'] = submit['id'].map(pd.Series(np.where(y_submit_all_para_mean >= t, 1, 0), index=X_submit['id']))
submit.to_csv('submission.csv', index=False)