In [130]:
import pandas as pd
import numpy as np
import gc
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from tensorflow.keras import utils
from tensorflow.keras import backend
import warnings
warnings.filterwarnings('ignore')

In [9]:
# 加载训练数据
datas = pd.read_csv('./data/cat-in-the-dat/train.csv')

In [10]:
X = datas.drop(columns='id')
y = X.pop('target')

In [11]:
# 标签编码
for f in features:
    lbe = LabelEncoder()
    X[f] = lbe.fit_transform(X[f])

In [140]:
# 训练集和测试集划分
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.7,shuffle=True,stratify=y,random_state=2)

In [141]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [142]:
# 特征名列表
features = X.columns.tolist()

#### layers.Embedding
- 嵌入层，将正整数编码数据嵌入到固定的高维空间。只能作为模型第一层  
1. input_dim：特征不重复取值的个数(或不同文本个数)。
2. output_dim：输出向量空间大小。
3. input_length：输入特征取值数，默认None

In [143]:
# 定义模型结构
def create_model(data,features):
    inputs = []
    xs = []
    # 按特征列输入
    for f in features:
        value_num = data[f].nunique() # 唯一值个数
        out_dim = int(min(np.ceil(value_num / 2), 40)) # 设置向量空间大小
        # 按特征列输入
        input_x = keras.Input(shape=(1,))
        # 嵌入层，将特征映射到高维空间
        x = layers.Embedding(input_dim=value_num, output_dim=out_dim, name=f)(input_x)
        # 随机删除整个1维特征映射比例为0.1。dropout删除单个神经元
        x = layers.SpatialDropout1D(rate=0.1)(x)
        # 调整维度
        x = layers.Reshape(target_shape=(out_dim,))(x)
        inputs.append(input_x)
        xs.append(x)
        
    p = layers.Concatenate()(xs) # 列表连接层
    p = layers.BatchNormalization()(p) # 批量归一化层
    
    p = layers.Dense(200,activation='relu')(p)
    p = layers.Dropout(rate=0.1)(p) # 随机暂时性删除10%的神经元
    p = layers.BatchNormalization()(p) # 批量归一化层
    
    p = layers.Dense(200,activation='relu')(p)
    p = layers.Dropout(rate=0.1)(p) # 随机暂时性删除10%的神经元
    p = layers.BatchNormalization()(p) # 批量归一化层
    
    outputs = layers.Dense(2,activation='sigmoid')(p)
    model = keras.Model(inputs=inputs,outputs=outputs)
    return model

- tf.py_func将tensor转换为numpy计算，再将输出的numpy转换为tensor
- tf.py_func参数依次为：自定义函数，函数所需参数，函数输出转换为tensor后的类型

In [144]:
# 定义auc度量函数
def auc(y_true, y_pred):
    def defined_auc(y_true, y_pred):
        return roc_auc_score(y_true, y_pred)
    return tf.py_func(defined_auc, (y_true, y_pred), tf.double)

In [145]:
sk = StratifiedKFold(n_splits=2,random_state=6)

- utils.to_categorical：类别向量转换为类别矩阵

In [146]:
oof_preds = np.zeros((len(X_train))) # 存放验证集预测结果
auc_score = 0 # 验证集auc值
# K折交叉验证
for tra_index, val_index in sk.split(X_train, y_train):
    tra_x, val_x = X_train.iloc[tra_index,:], X_train.iloc[val_index,:]
    tra_x = tra_x.reset_index(drop=True)
    val_x = val_x.reset_index(drop=True)
    tra_y, val_y = y_train.iloc[tra_index].values, y_train.iloc[val_index].values
    # 定义模型
    model = create_model(data=X, features=features)
    # 配置优化算法、损失函数、度量标准
    model.compile(optimizer=keras.optimizers.Adam(0.01), loss=keras.losses.binary_crossentropy, metrics = [auc])
    # 将训练集和验证集转换为模型接受的格式
    tra_x = [tra_x.values[:, k] for k in range(tra_x.values.shape[1])]
    val_x = [val_x.values[:, k] for k in range(val_x.values.shape[1])]
    # 设置earlystop,验证集auc值在5轮迭代内增长小于0.001停止。mode='max'指停止增长，'min'指停止下降。
    elystop= keras.callbacks.EarlyStopping(monitor='val_auc', min_delta=0.001, patience=5,
                                 verbose=1, mode='max')
    # 动态降低学习率。facotr:此处学习率降为原来的1/2，patience：此处3轮epochs没有提升就降低学习率，min_lr：学习率不能低于1e-6
    redlr = keras.callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.5,
                                      patience=2, min_lr=1e-6, mode='max', verbose=1)
    # 模型训练
    model.fit(tra_x, utils.to_categorical(tra_y), epochs=8, batch_size=100,callbacks=[elystop,redlr], validation_data=(val_x,utils.to_categorical(val_y)))
    val_pred = model.predict(val_x)[:,1] # 验证集预测
    oof_preds[val_index]=val_pred # 存放验证集预测结果
    auc_score += roc_auc_score(y_true=val_y,y_score=val_pred)
print(f'The mean auc is {auc_score/sk.n_splits} \nThe auc on val is {roc_auc_score(y_train,oof_preds)}')
backend.clear_session()

Train on 104999 samples, validate on 105001 samples
Epoch 1/8

InvalidArgumentError: ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.
Traceback (most recent call last):

  File "D:\Anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 158, in __call__
    ret = func(*args)

  File "<ipython-input-144-d17f97f84e21>", line 4, in defined_auc
    return roc_auc_score(y_true, y_pred)

  File "D:\Anaconda3\lib\site-packages\sklearn\metrics\ranking.py", line 355, in roc_auc_score
    sample_weight=sample_weight)

  File "D:\Anaconda3\lib\site-packages\sklearn\metrics\base.py", line 119, in _average_binary_score
    sample_weight=score_weight)

  File "D:\Anaconda3\lib\site-packages\sklearn\metrics\ranking.py", line 323, in _binary_roc_auc_score
    raise ValueError("Only one class present in y_true. ROC AUC score "

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.


	 [[Node: metrics_3/auc/PyFunc = PyFunc[Tin=[DT_FLOAT, DT_FLOAT], Tout=[DT_DOUBLE], token="pyfunc_13", _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_dense_11_target_0_23, dense_11/Sigmoid)]]