# 导入第三方包

In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier as cat
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss, mean_squared_log_error, precision_recall_curve, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm
from datetime import datetime
import sys
import os
import gc
import argparse
import warnings
warnings.filterwarnings('ignore')

from sklearn.manifold import TSNE # 导入tsne包
from sklearn.decomposition import PCA, KernelPCA # PCA
from sklearn.manifold import Isomap # Isomap
from sklearn.impute import KNNImputer

# 数据读取与基本处理

## 读取数据

In [None]:
# Training-set tables: read the four CSV files in one pass.
train_paths = (
    '../input/赛题B_预赛数据/训练集/acct_train.csv',
    '../input/赛题B_预赛数据/训练集/bhv_train.csv',
    '../input/赛题B_预赛数据/训练集/cust_train.csv',
    '../input/赛题B_预赛数据/训练集/train_label.csv',
)
acct_train, bhv_train, cust_train, train_label = map(pd.read_csv, train_paths)

In [None]:
# Test-set tables: read the three CSV files in one pass (there is no label file).
test_paths = (
    '../input/赛题B_预赛数据/测试集/acct_test.csv',
    '../input/赛题B_预赛数据/测试集/bhv_test.csv',
    '../input/赛题B_预赛数据/测试集/cust_test.csv',
)
acct_test, bhv_test, cust_test = map(pd.read_csv, test_paths)

## 数据拼接

In [None]:
# train_data: start from the account table and left-join the remaining
# training tables (behaviour, customer, label) on "id", one after another.
train_data = acct_train
for right_table in (bhv_train, cust_train, train_label):
    train_data = train_data.merge(right_table, on="id", how="left")

In [None]:
# test_data: start from the account table and left-join the remaining
# test tables (behaviour, customer) on "id".
test_data = acct_test
for right_table in (bhv_test, cust_test):
    test_data = test_data.merge(right_table, on="id", how="left")

In [None]:
# Stack train and test rows into a single frame with a fresh 0..n-1 index,
# so that feature engineering is applied to both at once.
data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
data.head()

## 数据处理

In [None]:
# Label-encode the categorical features. Missing values are replaced by the
# string 'nan' first so that they get their own integer code.
# NOTE(review): if any of these columns is numeric, fillna('nan') produces
# mixed str/number values — confirm the dtypes of b2/b3/b28.
cat_f = ['b2', 'b3', 'b28']
for col in tqdm(cat_f):
    filled = data[col].fillna('nan')
    data[col] = LabelEncoder().fit_transform(filled)

# 特征工程

## 重要特征交叉

In [None]:
# Pairwise crosses (+, -, *, /) between the selected important features.
topn = ["b26", "b22", "b18", "b25", "b10", "b11", "b23", "b21",
        "b20", "b13", "b27", "b24", "b17", "b14", "b12", "b15",
        "b16", "b29", "b8", "b19"]

# Inserting 4 columns per pair one at a time (760 inserts in total) fragments
# the DataFrame's internal blocks; collect the new columns first and attach
# them with a single concat. Column names and column order are unchanged.
cross_features = {}
for i, left in enumerate(topn):
    for right in topn[i + 1:]:
        cross_features[f"{left}+{right}"] = data[left] + data[right]
        cross_features[f"{left}-{right}"] = data[left] - data[right]
        cross_features[f"{left}*{right}"] = data[left] * data[right]
        # The small epsilon keeps the ratio finite when the denominator is 0.
        cross_features[f"{left}/{right}"] = data[left] / (data[right] + 1e-5)
data = pd.concat([data, pd.DataFrame(cross_features, index=data.index)], axis=1)

## 计算额度使用率、账单金额、月消费金额的变异系数

### 近3个月内

In [None]:
# Coefficient of variation (population std / mean) over the last three months,
# used as a stability measure for credit-limit utilisation ("ed"), bill
# amount ("zd") and monthly consumption ("xf").
#
# For each group the third monthly value is reconstructed as
#   mid = 3 * <first column> - <second column> - <third column>
# b20/b21 and b26/b27 are the 3-month max/min bill and consumption amounts;
# presumably b13/b19/b25 are the matching 3-month means and b14/b15 the
# max/min utilisation — confirm against the data dictionary.
#
# The statistic is computed column-wise (vectorised) instead of with a
# row-wise Python lambda. ddof=0 and the default skipna=True match what
# np.std / np.mean do when handed a pandas Series. A zero mean still yields
# inf/NaN, exactly as before.
for prefix, mean_col, max_col, min_col in (
    ("ed", "b13", "b14", "b15"),
    ("zd", "b19", "b20", "b21"),
    ("xf", "b25", "b26", "b27"),
):
    data[f"{prefix}_mid"] = data[mean_col] * 3 - data[max_col] - data[min_col]
    monthly = data[[max_col, f"{prefix}_mid", min_col]]
    data[f"{prefix}_byxs"] = monthly.std(axis=1, ddof=0) / monthly.mean(axis=1)


### 3个月间（近半年的前3个月与近3个月）

In [None]:
# Coefficient of variation between the two halves of the last six months:
# the earlier three months versus the most recent three months.
#   half_year_<p>_sum     = 6 * <half-year column>
#   3_month_<p>_sum       = 3 * <3-month column>
#   half_year-3_month_<p> = their difference (the earlier three months)
# "ed" = limit utilisation, "zd" = bill amount, "xf" = consumption amount.
#
# Computed column-wise instead of with a row-wise lambda; ddof=0 and the
# default skipna=True reproduce np.std / np.mean on a pandas Series.
for prefix, half_year_col, three_month_col in (
    ("ed", "b10", "b13"),
    ("zd", "b16", "b19"),
    ("xf", "b22", "b25"),
):
    half_year_sum = f"half_year_{prefix}_sum"
    three_month_sum = f"3_month_{prefix}_sum"
    earlier_sum = f"half_year-3_month_{prefix}"

    data[half_year_sum] = data[half_year_col] * 6  # last 6 months
    data[three_month_sum] = data[three_month_col] * 3  # last 3 months
    data[earlier_sum] = data[half_year_sum] - data[three_month_sum]

    halves = data[[earlier_sum, three_month_sum]]
    data[f"{prefix}_byxs_3_month"] = halves.std(axis=1, ddof=0) / halves.mean(axis=1)

## 计算额度使用率、账单金额、月消费金额的偏度与峰度

In [None]:
# Skewness and kurtosis of the three monthly values (last three months) for
# limit utilisation ("ed"), bill amount ("zd") and consumption ("xf").
from scipy.stats import kurtosis, skew

# scipy reduces along axis=1 in native code, which replaces the per-row Python
# lambda. The default nan_policy ('propagate') is unchanged, so a row that
# contains NaN still yields NaN.
for prefix, max_col, min_col in (
    ("ed", "b14", "b15"),
    ("zd", "b20", "b21"),
    ("xf", "b26", "b27"),
):
    monthly = data[[max_col, f"{prefix}_mid", min_col]].to_numpy(dtype=float)
    data[f"{prefix}_sk"] = skew(monthly, axis=1)
    data[f"{prefix}_ku"] = kurtosis(monthly, axis=1)

In [None]:
# Remove the intermediate helper columns that would otherwise be redundant
# features: the reconstructed "mid" values and the 6-/3-month sums.
redundant_cols = [
    "ed_mid", "zd_mid", "xf_mid",
    "half_year_ed_sum", "3_month_ed_sum",
    "half_year_zd_sum", "3_month_zd_sum",
    "half_year_xf_sum", "3_month_xf_sum",
]
data = data.drop(columns=redundant_cols)

## 特征分箱+异常值处理

In [None]:
# Age (b1): bin edges were chosen by inspecting the observed values.
def get_b1_seg(x):
    """Map an age to its bin index.

    Bins: [21, 22] -> 0, (22, 24] -> 1, (24, 25] -> 2, (25, 26] -> 3,
    (26, 27] -> 4, (27, 28] -> 5. Any other value (including NaN)
    returns None.
    """
    if not 21 <= x <= 28:
        return None
    upper_edges = (22, 24, 25, 26, 27, 28)
    for bin_index, upper in enumerate(upper_edges):
        if x <= upper:
            return bin_index
# Bin the age feature element-wise.
data["b1_seg"] = data["b1"].apply(get_b1_seg)

In [None]:
# b26: maximum consumption amount in the last 3 months.
def get_b26_seg(x):
    """Map a b26 value to its bin index.

    Bins: x <= 0 -> 0 (treated as anomalous), (0, 8000] -> 1,
    (8000, 40000] -> 2, x > 40000 -> 3. NaN returns None.

    The top bin has no upper bound. The previous `x <= max(column)` check
    was always true for values taken from the column and recomputed the
    column maximum on every call.
    """
    if x <= 0:
        return 0
    if x <= 8000:
        return 1
    if x <= 40000:
        return 2
    if x > 40000:
        return 3
    return None  # NaN: every comparison above is False
# Replace the raw b26 column by its binned version.
data["b26_seg"] = data["b26"].apply(get_b26_seg)
data = data.drop(columns=["b26"])

In [None]:
# b10: average credit-limit utilisation over the last half year.
def get_b10_seg(x):
    """Map a b10 value to its bin index.

    Bins: [0, 0.12] -> 0, (0.12, 0.375] -> 1, (0.375, 1] -> 2, x > 1 -> 3.
    Negative values and NaN return None.

    The top bin has no upper bound. The previous `x <= max(column)` check
    was always true for values taken from the column and recomputed the
    column maximum on every call.
    """
    if not x >= 0:  # negative or NaN
        return None
    if x <= 0.12:
        return 0
    if x <= 0.375:
        return 1
    if x <= 1:
        return 2
    return 3
# Bin b10 element-wise; the raw column is kept.
data["b10_seg"] = data["b10"].apply(get_b10_seg)

In [None]:
# b11: maximum credit-limit utilisation over the last half year.
def get_b11_seg(x):
    """Map a b11 value to its bin index.

    Bins: [0, 0.2] -> 0, (0.2, 0.8] -> 1, (0.8, 1.5] -> 2, x > 1.5 -> 3.
    Negative values and NaN return None.

    The top bin has no upper bound. The previous `x <= max(column)` check
    was always true for values taken from the column and recomputed the
    column maximum on every call.
    """
    if not x >= 0:  # negative or NaN
        return None
    if x <= 0.2:
        return 0
    if x <= 0.8:
        return 1
    if x <= 1.5:
        return 2
    return 3
# Bin b11 element-wise; the raw column is kept.
data["b11_seg"] = data["b11"].apply(get_b11_seg)

In [None]:
# b23: maximum monthly consumption amount over the last half year.
def get_b23_seg(x):
    """Map a b23 value to its bin index.

    Bins: [0, 2000] -> 0, (2000, 10000] -> 1, (10000, 20000] -> 2,
    x > 20000 -> 3. Negative values and NaN return None.

    The top bin has no upper bound. The previous `x <= max(column)` check
    was always true for values taken from the column and recomputed the
    column maximum on every call.
    """
    if not x >= 0:  # negative or NaN
        return None
    if x <= 2000:
        return 0
    if x <= 10000:
        return 1
    if x <= 20000:
        return 2
    return 3
# Bin b23 element-wise; the raw column is kept.
data["b23_seg"] = data["b23"].apply(get_b23_seg)

In [None]:
# b21: minimum bill amount in the last three months.
def get_b21_seg(x):
    """Map a b21 value to its bin index.

    Bins: x <= 0 -> 0 (treated as anomalous), (0, 10000] -> 1,
    x > 10000 -> 2. NaN returns None.

    The top bin has no upper bound. The previous `x <= max(column)` check
    was always true for values taken from the column and recomputed the
    column maximum on every call.
    """
    if x <= 0:
        return 0
    if x <= 10000:
        return 1
    if x > 10000:
        return 2
    return None  # NaN: every comparison above is False
# Replace the raw b21 column by its binned version.
data["b21_seg"] = data["b21"].apply(get_b21_seg)
data = data.drop(columns=["b21"])

In [None]:
# b20: maximum bill amount in the last three months.
def get_b20_seg(x):
    """Map a b20 value to its bin index.

    Bins: x <= 0 -> 0 (treated as anomalous), (0, 10000] -> 1,
    (10000, 20000] -> 2, x > 20000 -> 3. NaN returns None.

    The top bin has no upper bound. The previous `x <= max(column)` check
    was always true for values taken from the column and recomputed the
    column maximum on every call.
    """
    if x <= 0:
        return 0
    if x <= 10000:
        return 1
    if x <= 20000:
        return 2
    if x > 20000:
        return 3
    return None  # NaN: every comparison above is False
# Replace the raw b20 column by its binned version.
data["b20_seg"] = data["b20"].apply(get_b20_seg)
data = data.drop(columns=["b20"])

In [None]:
# b27: minimum consumption amount in the last three months.
def get_b27_seg(x):
    """Map a b27 value to its bin index.

    Bins: [0, 5000] -> 0, (5000, 10000] -> 1, x > 10000 -> 2.
    Negative values and NaN return None.

    The top bin has no upper bound. The previous `x <= max(column)` check
    was always true for values taken from the column and recomputed the
    column maximum on every call.
    """
    if not x >= 0:  # negative or NaN
        return None
    if x <= 5000:
        return 0
    if x <= 10000:
        return 1
    return 2
# Replace the raw b27 column by its binned version.
data["b27_seg"] = data["b27"].apply(get_b27_seg)
data = data.drop(columns=["b27"])

In [None]:
# 异常值特征：b20 b21 b26 b27

## 时间特征处理

In [None]:
# Time differences between the three time columns:
#   '6-5': activation time (b6) minus card-issue time (b5)
#   '7-6': first-transaction time (b7) minus activation time (b6)
#   '7-5': first-transaction time (b7) minus card-issue time (b5)
for new_col, later, earlier in (
    ('6-5', 'b6', 'b5'),
    ('7-6', 'b7', 'b6'),
    ('7-5', 'b7', 'b5'),
):
    data[new_col] = data[later] - data[earlier]

# 特征筛选

## 删除只有单一取值的特征

In [None]:
# Columns excluded from the feature set: the identifier, the target, and
# every column with fewer than two distinct non-null values.
drop_cols = ["id", "label"] + [
    column for column in data.columns if data[column].nunique() < 2
]

## 删除缺失率过高的特征

In [None]:
# Remove features whose missing ratio is too high.
def dropNaN(df, p, col):
    """Return `df` without `col` when the column's missing ratio is >= `p`.

    Args:
        df: input DataFrame (not modified in place).
        p: missing-ratio threshold in [0, 1].
        col: name of the column to check.

    Returns:
        A new DataFrame without `col` if its NaN ratio reaches `p`,
        otherwise `df` itself.
    """
    column = df[col]
    missing_ratio = column.isna().sum() / len(column)
    return df.drop(columns=[col]) if missing_ratio >= p else df
# Check every column against a 0.95 threshold: a column whose missing ratio
# reaches 95% is removed. `data.columns` is evaluated once, before any drop.
for column_name in data.columns:
    data = dropNaN(data, p=0.95, col=column_name)

# 划分训练集和测试集

In [None]:
# Split back into train and test by label availability: rows with a missing
# label are the test set, all others the training set.
has_label = data["label"].notnull()
df_test = data[~has_label].copy().reset_index(drop=True)
df_train = data[has_label].copy().reset_index(drop=True)

In [None]:
# Model inputs: every remaining column that is not listed in drop_cols.
excluded = set(drop_cols)
features_name = [name for name in df_train.columns if name not in excluded]
x_train = df_train.loc[:, features_name].reset_index(drop=True)
x_test = df_test.loc[:, features_name].reset_index(drop=True)
y = df_train["label"].reset_index(drop=True)
print(len(features_name))

# 模型训练

In [None]:
def cv_model(clf, train_x, train_y, test_x, clf_name, params):
    """Train one model per fold of a 5-fold stratified CV and predict.

    Args:
        clf: for ``clf_name == "lgb"`` an object exposing ``Dataset`` and
            ``train`` (this script passes the ``lightgbm`` module); for
            ``clf_name == "cat"`` a callable that builds the classifier
            (this script passes ``CatBoostClassifier``).
        train_x: training features (DataFrame; rows are selected by ``iloc``).
        train_y: training labels, any 1-D array-like aligned with ``train_x``.
        test_x: test features.
        clf_name: ``"lgb"`` or ``"cat"``.
        params: LightGBM parameter dict; ignored for ``"cat"``.

    Returns:
        ``(oof, predict)``: out-of-fold probabilities for the training rows
        and the fold-averaged probabilities for the test rows.

    Raises:
        ValueError: if ``clf_name`` is neither ``"lgb"`` nor ``"cat"``.
    """
    # Fail early instead of hitting an unbound `val_pred` after the first fold.
    if clf_name not in ("lgb", "cat"):
        raise ValueError(f"unsupported clf_name: {clf_name!r}")

    folds = 5
    seed = 2023
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

    oof = np.zeros(train_x.shape[0])  # out-of-fold predictions, one per training row
    predict = np.zeros(test_x.shape[0])  # running average of the test predictions

    # Select labels by position. `train_y[train_index]` on a Series is a
    # label-based lookup and is only correct when the index is 0..n-1.
    labels = np.asarray(train_y)

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)  # this fold's training set
            valid_matrix = clf.Dataset(val_x, label=val_y)  # this fold's validation set

            # Log every 100 rounds and stop after 500 rounds without
            # improvement on the validation set. `verbose_eval` and
            # `early_stopping_rounds` were removed from `train()` in
            # LightGBM 4.0, so prefer the equivalent callbacks when present
            # and fall back to the legacy keywords otherwise.
            train_kwargs = dict(valid_sets=[train_matrix, valid_matrix],
                                categorical_feature=[])
            if hasattr(clf, "early_stopping") and hasattr(clf, "log_evaluation"):
                train_kwargs["callbacks"] = [clf.early_stopping(500),
                                             clf.log_evaluation(100)]
            else:
                train_kwargs.update(verbose_eval=100, early_stopping_rounds=500)
            model = clf.train(params, train_matrix, 100000, **train_kwargs)

            # Predict with the best iteration found by early stopping.
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        else:  # clf_name == "cat"
            model = clf(
                n_estimators=10000,
                random_seed=1024,
                eval_metric='AUC',
                learning_rate=0.05,
                max_depth=5,
                early_stopping_rounds=500,
                metric_period=500,
            )

            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      use_best_model=True,
                      verbose=1)

            val_pred = model.predict_proba(val_x)[:, 1]
            test_pred = model.predict_proba(test_x)[:, 1]

        # Store this fold's validation predictions at their original positions.
        oof[valid_index] = val_pred

        predict += test_pred / folds

        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)
    return oof, predict

In [None]:
# LightGBM training, parameter set 1
params1 = dict(
    learning_rate=0.01,        # learning rate
    boosting_type='gbdt',      # base learner
    objective='binary',        # binary classification objective
    metric='auc',              # evaluation metric
    num_leaves=63,             # number of leaves per tree
    feature_fraction=0.8,      # share of features sampled per tree
    bagging_fraction=0.8,      # share of rows sampled
    bagging_freq=5,            # bagging frequency
    seed=2022,                 # random seed
    bagging_seed=1,            # bagging seed
    feature_fraction_seed=7,
    min_data_in_leaf=20,       # minimum number of samples in a leaf
    verbose=-1,
    n_jobs=8,
)
# Time the full 5-fold run.
start_time = datetime.now()
lgb_oof1, lgb_pred1 = cv_model(lgb, x_train, y, x_test, clf_name='lgb', params=params1)
end_time = datetime.now()
print(end_time - start_time)

In [None]:
# LightGBM training, parameter set 2
params2 = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    min_child_weight=5,
    num_leaves=2 ** 5,
    lambda_l2=10,              # L2 regularisation
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=4,
    learning_rate=0.01,
    seed=2020,
    n_jobs=8,
    verbose=-1,
)
# Time the full 5-fold run.
start_time = datetime.now()
lgb_oof2, lgb_pred2 = cv_model(lgb, x_train, y, x_test, clf_name='lgb', params=params2)
end_time = datetime.now()
print(end_time - start_time)

In [None]:
# CatBoost training. The last argument (`params`) is not used by the "cat"
# branch of cv_model; its hyper-parameters are fixed inside that function.
start_time = datetime.now()
cat_oof, cat_pred = cv_model(cat, x_train, y, x_test, clf_name='cat', params=5)
end_time = datetime.now()
print(end_time - start_time)

# 模型加权融合

In [None]:
# Out-of-fold AUC of each individual model on the full training set.
train_labels = df_train['label']
lgb_auc1 = roc_auc_score(train_labels, lgb_oof1)
print(lgb_auc1)
lgb_auc2 = roc_auc_score(train_labels, lgb_oof2)
print(lgb_auc2)
cat_auc = roc_auc_score(train_labels, cat_oof)
print(cat_auc)

In [None]:
# Blend weights: each model's out-of-fold AUC divided by the sum of the three.
auc_total = lgb_auc1 + lgb_auc2 + cat_auc
lgb_w1 = lgb_auc1 / auc_total
print(lgb_w1)
lgb_w2 = lgb_auc2 / auc_total
print(lgb_w2)
cat_w = cat_auc / auc_total
print(cat_w)

In [None]:
# AUC-weighted average of the three models, for the out-of-fold and the test
# predictions alike.
weighted_oof = (lgb_w1 * lgb_oof1, lgb_w2 * lgb_oof2, cat_w * cat_oof)
weighted_pred = (lgb_w1 * lgb_pred1, lgb_w2 * lgb_pred2, cat_w * cat_pred)
oof = weighted_oof[0] + weighted_oof[1] + weighted_oof[2]
pred = weighted_pred[0] + weighted_pred[1] + weighted_pred[2]

# 搜索最佳分类阈值

In [None]:
# Grid-search the decision threshold that maximises macro-F1 on the blended
# out-of-fold predictions (0.05 to 0.8 in steps of 0.001).
scores = []
thresholds = []
best_score = 0
best_threshold = 0

y_true = y.values.reshape((-1))
oof_flat = oof.reshape((-1))
for threshold in np.arange(0.05,0.8,0.001):
    preds = (oof_flat > threshold).astype('int')
    f = f1_score(y_true, preds, average='macro')
    scores.append(f)
    thresholds.append(threshold)

    # Strict '>' keeps the first (lowest) threshold among ties.
    if f > best_score:
        best_score, best_threshold = f, threshold
    print(f'{threshold:.03f}, {f}')
print("==============================")
print(f'{best_threshold:.03f}, {best_score}')

# 模型线下得分

In [None]:
# Offline score: 0.3 * AUC of the blended out-of-fold predictions plus
# 0.7 * best macro-F1 found by the threshold search.
from sklearn.metrics import roc_auc_score
auc_weight, f1_weight = 0.3, 0.7
auc_p = roc_auc_score(df_train['label'], oof)
score = auc_weight*auc_p + f1_weight*best_score
print(score)

# 生成提交文件

In [None]:
# Binarise the blended test probabilities with the threshold found by the
# search, using the same strict '>' rule the search evaluated. This replaces
# a hard-coded 0.195 that would go stale whenever the data, seeds or models
# change.
label_pre = [1 if x > best_threshold else 0 for x in pred]

In [None]:
# Assemble the submission table: id, predicted probability, predicted label.
# NOTE(review): no file is written in the visible source — confirm whether a
# to_csv call is expected here.
submit = pd.DataFrame({
    'id': df_test['id'],
    'pred_prob': pred,
    'pred_label': label_pre,
})

# 可解释性分析

In [None]:
# Install the shap package from inside the notebook. The leading '!' is an
# IPython shell escape and is not valid in a plain .py script.
!pip install shap

In [None]:
# SHAP summary plot of feature contributions on the test set.
import shap
import matplotlib.pyplot as plt
shap.initjs()
# NOTE(review): `lgb_model2` is not assigned anywhere in the visible source.
# cv_model returns only (oof, predict) and discards the fitted boosters, so
# this line raises NameError unless the model is defined elsewhere.
explainer = shap.TreeExplainer(lgb_model2)
shap_values = explainer.shap_values(x_test)

fig, ax = plt.subplots()
# NOTE(review): indexing with [1] assumes shap_values is a per-class list
# (positive class at index 1) — confirm for the installed shap version.
shap.summary_plot(shap_values[1], x_test, show=False, plot_size=(8, 10))
plt.show()