https://deepctr-doc.readthedocs.io/en/latest/Quick-Start.html

In [None]:
import os

# Directory holding the DeepCTR example data, located under the user's home.
# expanduser("~") replaces os.environ["HOME"] so the lookup does not raise
# KeyError on platforms where HOME is unset (e.g. Windows).
DIR_DATA = os.path.join(
    os.path.expanduser("~"),
    "workspace/third_party/shenweichen/DeepCTR/examples",
)

In [None]:
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names

In [None]:
# Load the Criteo sample file from the DeepCTR examples directory.
sample_path = os.path.join(DIR_DATA, './criteo_sample.txt')
data = pd.read_csv(sample_path)

In [None]:
# Column names used below: sparse features C1..C26, dense features I1..I13,
# and the label column as the prediction target.
sparse_features = ['C{}'.format(idx) for idx in range(1, 27)]
dense_features = ['I{}'.format(idx) for idx in range(1, 14)]
target = ['label']

# Fill missing values: 0 for dense columns, the string '-1' for sparse ones.
data[dense_features] = data[dense_features].fillna(0)
data[sparse_features] = data[sparse_features].fillna('-1')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Summary statistics of the label column.
data['label'].describe()

In [None]:
# Encode every sparse column in place, using a fresh LabelEncoder per column.
# Optional alternative: HashEncoder()
for feat in sparse_features:
    encoder = LabelEncoder().fit(data[feat])
    data[feat] = encoder.transform(data[feat])

In [None]:
# Rescale the dense columns in place with a MinMaxScaler targeting (0, 1).
mms = MinMaxScaler(feature_range=(0, 1))
mms.fit(data[dense_features])
data[dense_features] = mms.transform(data[dense_features])

In [None]:
# Continuous (dense) features: one DenseFeat of dimension 1 per column.
dense_feat_columns = [
    DenseFeat(name=feat, dimension=1)  # dtype='float32'
    for feat in dense_features
]

# Sparse features: one SparseFeat per column with a 4-dimensional embedding.
# vocabulary_size is the largest value in the column plus one.
sparse_feat_columns = [
    SparseFeat(name=feat, vocabulary_size=data[feat].max() + 1, embedding_dim=4)
    for feat in sparse_features  # plain iteration: the enumerate index was never used
]

# All feature columns (dense first, then sparse).
fixlen_feature_columns = dense_feat_columns + sparse_feat_columns

In [None]:
# The linear part receives only the dense columns; the DNN part receives
# every feature column.
linear_feature_columns = dense_feat_columns
dnn_feature_columns = fixlen_feature_columns

In [None]:
# Names of all features used by the linear and DNN parts.
all_feature_columns = linear_feature_columns + dnn_feature_columns
feature_names = get_feature_names(all_feature_columns)

In [None]:
# Split into train / test sets, holding out 20% of the rows for testing.
# A fixed random_state keeps the split identical across runs, so the test
# metrics printed further down are comparable between runs.
train, test = train_test_split(data, test_size=0.2, random_state=2020)

In [None]:
def _to_model_input(frame):
    """Map each name in ``feature_names`` to that column's values in *frame*."""
    return {name: frame[name].values for name in feature_names}


# {feature name: feature values}
train_model_input = _to_model_input(train)
test_model_input = _to_model_input(test)

In [None]:
# Build the DeepFM model for a binary task.
model = DeepFM(
    linear_feature_columns=linear_feature_columns,  # features for the linear part
    dnn_feature_columns=dnn_feature_columns,  # features for the DNN part
    task='binary',  # "binary" for binary logloss, "regression" for regression loss
)

In [None]:
# Inspect the class of the object returned by DeepFM(...).
type(model)

In [None]:
# Inspect the type of the label values that are passed to model.fit as `y`.
type(train[target].values)

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and
# cross-entropy plus binary accuracy as the reported metrics.
tracked_metrics = ['binary_crossentropy', 'binary_accuracy']
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=tracked_metrics)

In [None]:
# Train the model and keep the returned history object.
train_labels = train[target].values  # numpy.ndarray
history = model.fit(
    x=train_model_input,
    y=train_labels,
    batch_size=256,
    epochs=80,
    verbose=2,
    # The validation part is not used for training; only its loss and
    # metrics are recorded.
    validation_split=0.2,
)

In [None]:
# Put the recorded training history into a DataFrame.
history_df = pd.DataFrame(data=history.history)

In [None]:
# Plot the training and validation loss columns of the history.
history_df[['loss', 'val_loss']].plot(title="Cross-entropy")

In [None]:
# Plot the training and validation binary-accuracy columns of the history.
history_df[['binary_accuracy', 'val_binary_accuracy']].plot(title="Binary Accuracy")

In [None]:
# Run the model on the held-out test inputs.
pred_ans = model.predict(x=test_model_input, batch_size=256)

In [None]:
# Log loss of the predictions against the test labels, rounded to 4 decimals.
test_logloss = log_loss(test[target].values, pred_ans)
print("test LogLoss", round(test_logloss, 4))

In [None]:
# ROC AUC of the predictions against the test labels, rounded to 4 decimals.
test_auc = roc_auc_score(test[target].values, pred_ans)
print("test AUC", round(test_auc, 4))

In [None]:
# Turn predictions into 0/1 values: 1 where the prediction exceeds the threshold.
threshold = 0.5
pred = (pred_ans > threshold).astype(int)

In [None]:
# Accuracy of the thresholded predictions on the test labels, rounded to 4 decimals.
test_accuracy = accuracy_score(test[target].values, pred)
print("test Accuracy", round(test_accuracy, 4))

In [None]:
# Show the training and validation binary-accuracy columns of the history.
history_df.loc[:, ['binary_accuracy', 'val_binary_accuracy']]