In [3]:
"""
作者：librauee
微信公众号：老肥码码码
日期：2020.12.15
线上得分：0.873423
截至日期排名：2
"""
import pandas as pd
from tqdm import tqdm
import warnings
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score

warnings.filterwarnings('ignore')

In [None]:
# Load the data and build a few simple engineered features
train = pd.read_csv('train.csv')
test = pd.read_csv('test_noLabel.csv')
y = train['Label']

# Stack train and test so imputation and derived columns are computed in one pass.
# Test rows end up with a missing 'Label', which a later cell uses to split them apart.
data = pd.concat([train, test], axis=0).reset_index(drop=True)

# Mean-impute every raw numeric column; the mean is taken over train + test rows.
mean_imputed_cols = [
    'assets_1', 'assets_2',
    'expenditure_1', 'expenditure_2', 'expenditure_3',
    'investment', 'capital', 'income',
    'stock', 'tax',
]
for col in mean_imputed_cols:
    column_mean = data[col].mean()
    data[col] = data[col].fillna(column_mean)

# Totals (created in this order so the resulting column order stays fixed)
data['assets_sum'] = data['assets_2'].add(data['assets_1'])
data['expenditure_sum'] = (
    data['expenditure_1'].add(data['expenditure_2']).add(data['expenditure_3'])
)

# Differences and ratios between related quantities.
# A zero denominator is not guarded: the ratio columns then hold inf/NaN.
data['investment-capital'] = data['investment'].sub(data['capital'])
data['investment/capital'] = data['investment'].div(data['capital'])
data['income-expenditure'] = data['income'].sub(data['expenditure_sum'])

data['stock/tax'] = data['stock'].div(data['tax'])

In [None]:
# Oversampling: replicate positive rows until both classes have the same count
from imblearn.over_sampling import RandomOverSampler


# Rows with a label are training rows; rows with a missing label are test rows.
is_labelled = data['Label'].notna()
test = data[~is_labelled]

# Build fresh frames rather than dropping in place on a slice of `data`
# (the in-place drop raised a SettingWithCopyWarning that was merely silenced).
train = data[is_labelled].drop(['ID', 'Label'], axis=1)
X_test = test.drop(['ID', 'Label'], axis=1)

# Read the labels from `data` instead of from the `y` that this cell overwrites
# below, so the cell can be re-run without a length mismatch.
# Assumes integer 0/1 labels (later cells threshold at 0.5 into 1/0) - confirm.
y_raw = data.loc[is_labelled, 'Label'].astype(int)

# `fit_resample` is the supported API; the `fit_sample` alias was removed in
# imbalanced-learn 0.8.
ros = RandomOverSampler(random_state=0)
X_train, y = ros.fit_resample(train, y_raw)

# NOTE(review): the resampled set is what the training cell later splits into
# CV folds, so copies of the same positive row can land in both the training
# and the validation part of a fold. Out-of-fold metrics are therefore
# optimistic; resampling inside each fold would avoid this.

features = X_train.columns

# Mark 'code' as categorical so LightGBM handles it natively.
cat_cols = ['code']
X_train[cat_cols] = X_train[cat_cols].astype('category')
X_test[cat_cols] = X_test[cat_cols].astype('category')

In [14]:
# Average LightGBM probabilities over several cross-validation seeds
n_splits = 5        # folds per seed; also used to scale the averaged test prediction
num_round = 10000   # upper bound on boosting rounds; early stopping normally ends sooner
seeds = [2019, 2020, 2021]

# The boosting-round limit is passed to lgb.train() only; keeping a second copy
# in `params` ('num_iterations') made LightGBM warn and ignore the argument.
params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'verbose': -1,
}

# Accumulator for the test-set probability, averaged over every (seed, fold) model.
predictions_lgb = np.zeros(len(X_test))

# NOTE(review): `verbose_eval` and `early_stopping_rounds` were removed from
# lgb.train() in LightGBM 4.0; on newer versions pass
# callbacks=[lgb.early_stopping(200), lgb.log_evaluation(500)] instead.
# NOTE(review): X_train was oversampled before this split, so the validation
# folds can contain copies of training rows and the scores printed below are
# optimistic compared with the leaderboard.
for seed in seeds:
    # The seed only changes how rows are assigned to folds.
    KF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_lgb = np.zeros(len(X_train))  # out-of-fold predictions for this seed
    for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train, y)):
        print("fold n°{}".format(fold_))
        print('trn_idx:', trn_idx)
        print('val_idx:', val_idx)
        trn_data = lgb.Dataset(X_train.iloc[trn_idx][features], label=y.iloc[trn_idx])
        val_data = lgb.Dataset(X_train.iloc[val_idx][features], label=y.iloc[val_idx])
        clf = lgb.train(
            params,
            trn_data,
            num_round,
            valid_sets=[trn_data, val_data],
            verbose_eval=500,
            early_stopping_rounds=200,
            categorical_feature=cat_cols,
        )

        # Score the held-out fold and add this model's share of the test prediction.
        oof_lgb[val_idx] = clf.predict(X_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
        predictions_lgb += clf.predict(X_test[features], num_iteration=clf.best_iteration) / n_splits / len(seeds)
    print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
    print("F1: {}".format(f1_score(y, (oof_lgb >= 0.5).astype(int), average='macro')))

fold n°0
trn_idx: [    1     2     3 ... 35775 35776 35777]
val_idx: [    0     6     9 ... 35729 35753 35755]
Training until validation scores don't improve for 200 rounds
[500]	training's binary_error: 0.0339599	valid_1's binary_error: 0.0480715
[1000]	training's binary_error: 0.019775	valid_1's binary_error: 0.039128
[1500]	training's binary_error: 0.0171197	valid_1's binary_error: 0.0354947
Early stopping, best iteration is:
[1651]	training's binary_error: 0.0167004	valid_1's binary_error: 0.0343767
fold n°1
trn_idx: [    0     1     2 ... 35774 35775 35777]
val_idx: [    7    23    25 ... 35762 35764 35776]
Training until validation scores don't improve for 200 rounds
[500]	training's binary_error: 0.0317588	valid_1's binary_error: 0.0487703
[1000]	training's binary_error: 0.0204039	valid_1's binary_error: 0.0406652
[1500]	training's binary_error: 0.0176787	valid_1's binary_error: 0.0361934
[2000]	training's binary_error: 0.0170498	valid_1's binary_error: 0.034796
Early stopping, 

AUC score: 0.9852610407954787
F1: 0.9652440466603518


In [16]:
# Write the submission file
# The averaged probabilities are thresholded at 0.5 and assigned positionally.
# NOTE(review): presumably submit_example.csv lists rows in the same order as
# the test file - confirm.
submit = pd.read_csv('submit_example.csv')
hard_labels = np.where(predictions_lgb >= 0.5, 1, 0)
submit['Label'] = hard_labels
submit.to_csv('submit_seed.csv', index=False)