In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
import matplotlib
import matplotlib.pyplot as plt
import lightgbm as lgb
import operator
import time

# %%

# 1. Load the raw files
# Labels live in a separate file and are joined onto the training rows by `id`.
train = pd.read_csv("new_data/train.csv")
train_target = pd.read_csv('new_data/train_target.csv')
train = train.merge(train_target, on='id')
test = pd.read_csv("new_data/test.csv")
print(train.shape)
print(test.shape)
# 2. Stack train and test so feature engineering is applied to both at once.
# Test rows get the sentinel target -1 so they can be separated again later.
test['target'] = -1
# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it the
# row labels of `train` and `test` overlap, and any later label-aligned
# assignment or reindex on `data` would fail or misalign on duplicate labels.
data = pd.concat([train, test], sort=False, axis=0, ignore_index=True)
print(train.shape)
print(test.shape)
print(data.shape)

# %%

# Simple data description: the same per-column summary is produced for the
# train frame and the test frame, so the computation lives in one helper.
STATS_COLUMNS = ['Feature', 'Unique_values', 'Percentage of missing values',
                 'Percentage of values in the biggest category', 'type']


def _column_stats(frame):
    """Summarise every column of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    list of tuple
        One ``(name, n_unique, pct_missing, pct_most_common_value, dtype)``
        tuple per column, in column order. Percentages are on a 0-100 scale.
        NaN counts as a value when finding the most common one
        (``dropna=False``). For a frame with zero rows both percentages are
        NaN instead of raising ``IndexError``.
    """
    n_rows = frame.shape[0]
    rows = []
    for col in frame.columns:
        column = frame[col]
        if n_rows:
            missing_pct = column.isnull().sum() * 100 / n_rows
            # value_counts sorts descending, so the first entry is the share
            # of the most frequent value.
            top_pct = column.value_counts(normalize=True, dropna=False).values[0] * 100
        else:
            missing_pct = top_pct = np.nan
        rows.append((col, column.nunique(), missing_pct, top_pct, column.dtype))
    return rows


stats = _column_stats(train)
stats_df = pd.DataFrame(stats, columns=STATS_COLUMNS)
# 30 highest-cardinality columns of the training frame.
stats_df.sort_values('Unique_values', ascending=False)[:30]

# %%

stats = _column_stats(test)
stats_df = pd.DataFrame(stats, columns=STATS_COLUMNS)
# 30 highest-cardinality columns of the test frame.
stats_df.sort_values('Unique_values', ascending=False)[:30]

# %%

# Feature engineering
# Column roles were decided from the unique-value counts (original author's note).

# Columns never passed to the model: the row id, the label, and six raw columns.
# The two certValid* columns are excluded in raw form; only their difference
# (certPeriod, derived below) is kept as a feature.
no_feas = [
    'id', 'target',
    'certId', 'bankCard', 'dist', 'residentAddr', 'certValidStop', 'certValidBegin',
]

# Columns treated as numeric. certValidStop / certValidBegin also appear in
# no_feas; listing them here only affects how categorical_features is derived.
numerical_features = ['certValidStop', 'certValidBegin', 'lmt', 'age', 'certPeriod']

# Derived feature: certificate validity stop minus validity begin.
data['certPeriod'] = data['certValidStop'].sub(data['certValidBegin'])

# Everything that is neither numeric nor excluded is treated as categorical.
_non_categorical = set(numerical_features) | set(no_feas)
categorical_features = [fea for fea in data.columns if fea not in _non_categorical]

# --- Disabled experiments, kept for reference ---------------------------------
# Ratio / product interactions:
# data['certValidStop_certValidBegin_ratio'] = data['certValidStop'] / data['certValidBegin']
# data['lmt_age_ratio'] = data['lmt'] / data['age']
# data['certPeriod_age_ratio'] = data['certPeriod'] / data['age']
# data['lmt_age_mul'] = data['lmt'] * data['age']
# data['certPeriod_age_mul'] = data['certPeriod'] * data['age']
#
# Rank transform of the numeric columns:
# cols = [col for col in (set(numerical_features))]
# for col in cols:
#     data[col + '_Rank'] = data[col].rank()
#
# Group statistics of numeric columns per high-cardinality key:
# from tqdm import tqdm
# for cate in tqdm(['certId', 'bankCard', 'dist', 'residentAddr']):
#     for fea in numerical_features:
#         grouped_df = data.groupby(cate).agg({fea: ['mean','skew',pd.DataFrame.kurt]})
#         grouped_df.columns = [cate+'_' + '_'.join(col).strip() for col in grouped_df.columns.values]
#         grouped_df = grouped_df.reset_index()
#         data = pd.merge(data, grouped_df, on=cate, how='left')
# %%

# Final model inputs: every column of `data` (including certPeriod) not in no_feas,
# in the frame's column order.
_excluded = set(no_feas)
features = [fea for fea in data.columns if fea not in _excluded]

# %%

# Undo the train/test stacking: test rows are the ones carrying the sentinel
# target value -1 that was assigned before concatenation.
_is_test = data['target'] == -1
train = data.loc[~_is_test, :]  # train set
test = data.loc[_is_test, :]  # test set

# Plain numpy inputs for the model; columns follow the order of `features`.
X = train[features].values
test_data = test[features].values
y = train['target'].values.astype(int)

print("X shape:", X.shape)
print("y shape:", y.shape)
print("test shape", test_data.shape)

print(len(features))

(132029, 105)
(23561, 104)
(132029, 105)
(23561, 105)
(155590, 105)
X shape: (132029, 98)
y shape: (132029,)
test shape (23561, 98)
98


In [4]:
import numpy as np
import time
import pandas as pd
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score,roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

print("start：********************************")
start = time.time()

# Stratified N-fold cross-validation with a fixed seed so folds are reproducible.
N = 5
skf = StratifiedKFold(n_splits=N, shuffle=True, random_state=2019)

# Per-fold validation AUCs and per-fold predictions on the held-out test matrix.
auc_cv, pred_cv = [], []
for k, (train_in, test_in) in enumerate(skf.split(X, y)):
    # Fold-specific training / validation slices.
    X_train, y_train = X[train_in], y[train_in]
    X_test, y_test = X[test_in], y[test_in]

    # A fresh model is fitted for every fold.
    # NOTE(review): `n_estimators` is the keyword accepted by the interpret
    # release this was run with; confirm the name against the installed version.
    ebm = ExplainableBoostingClassifier(n_estimators=50, random_state=2019)
    ebm.fit(X_train, y_train)  # Works on dataframes and numpy arrays

    print('................Start predict .........................')
    # Score the validation fold using the positive-class probability.
    y_pred = ebm.predict_proba(X_test)[:, 1]
    tmp_auc = roc_auc_score(y_test, y_pred)
    auc_cv.append(tmp_auc)
    print("valid auc:", tmp_auc)

    # Predict the test matrix with this fold's model; folds are averaged later.
    pred = ebm.predict_proba(test_data)[:, 1]
    pred_cv.append(pred)

# Mean score across the K folds.
print('the cv information:')
print(auc_cv)
print('cv mean score', np.mean(auc_cv))

end = time.time()
# Elapsed wall-clock time, reported in minutes.
print("......................run with time: ", (end - start) / 60.0)
print("over:*********************************")

# 10. Blend the 5 fold predictions by simple averaging and save the result file.
import os  # needed only here, to make sure the output directory exists

mean_auc = np.mean(auc_cv)
print("mean auc:", mean_auc)
filepath = 'result/inter_' + str(mean_auc) + '.csv'  # offline mean CV score goes in the file name

# Stack the per-fold prediction vectors into one (n_folds, n_test_rows) array.
res = np.array(pred_cv)
print("总的结果：", res.shape)
# Final prediction is the mean over folds.
r = res.mean(axis=0)
print('result shape:', r.shape)

# Use pd.DataFrame: this cell imports pandas as pd but never imports DataFrame.
result = pd.DataFrame()
result['id'] = test['id']
result['target'] = r  # ndarray: assigned positionally, same row order as `test`

# to_csv does not create missing directories. Create the target folder first so
# a missing `result/` cannot discard the predictions after the long CV run.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
result.to_csv(filepath, index=False, sep=",")

start：********************************
................Start predict .........................
valid auc: 0.696286852127362
................Start predict .........................
valid auc: 0.7219555867093919
................Start predict .........................
valid auc: 0.6936038074438595
................Start predict .........................
valid auc: 0.7051756297525494
................Start predict .........................
valid auc: 0.7425175868216376
the cv information:
[0.696286852127362, 0.7219555867093919, 0.6936038074438595, 0.7051756297525494, 0.7425175868216376]
cv mean score 0.7119078925709601
......................run with time:  22.011788368225098
over:*********************************
mean auc: 0.7119078925709601
总的结果： (5, 23561)
result shape: (23561,)
