# CTR预估模型-FM实现

本文为CTR预估模型实现系列之FM模型。本文主要以讲解模型代码实现为主，数据仅供训练使用。
- 数据集：Criteo数据集前60w条数据
    - 前50w为训练数据
    - 后10w为验证数据
- package：tensorflow 2.0.0
- 算法：Factorization Machine

In [48]:
import numpy as np
import pandas as pd
from tensorflow.keras.layers import *
import tensorflow.keras.backend as K
import matplotlib.pyplot as plt
%matplotlib inline

# 加载数据

In [2]:
# Column names of the Criteo sample: the click label, the numeric features
# I1..I13 and the categorical features C1..C26.
cols = (
    ['label']
    + ['I%d' % i for i in range(1, 14)]
    + ['C%d' % i for i in range(1, 27)]
)

In [10]:
data = pd.read_csv('/data/recsys/mgf/dataset/criteo_sampled_data.csv', header=0)

In [15]:
data.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,4.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,245.0,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,0,,893,,,4392.0,,0.0,0.0,0.0,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,0,3.0,-1,,0.0,2.0,0.0,3.0,0.0,0.0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,


In [16]:
data.columns

Index(['label', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10',
       'I11', 'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8',
       'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18',
       'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26'],
      dtype='object')

# 数据预处理

数据预处理：
- dense features：
    - 缺失值填充为0后做log变换
- sparse features：
    - 缺失值填充为"-1"后做label encoding（embedding在模型内部完成）

## 定义特征组

In [17]:
# Group the columns by the first character of their name: "I*" columns are
# the dense (numeric) features, "C*" columns the sparse (categorical) ones.
# Anything else (e.g. "label") belongs to neither group.
dense_feats, sparse_feats = [], []
for col in data.columns:
    prefix = col[0]
    if prefix == "I":
        dense_feats.append(col)
    elif prefix == "C":
        sparse_feats.append(col)

## 处理dense特征

In [18]:
def process_dense_feats(data, feats):
    """Log-transform the dense (numeric) columns ``feats`` of ``data``.

    Missing values are filled with 0 first. Every value ``x > -1`` is then
    replaced by ``log(x + 1)``; values ``<= -1``, for which the logarithm is
    undefined, are mapped to the sentinel ``-1``.

    Args:
        data: DataFrame containing at least the columns in ``feats``.
        feats: list of numeric column names to transform.

    Returns:
        A new float DataFrame holding only ``feats``. ``data`` itself is not
        modified.
    """
    # Selecting columns and fillna both return new objects, so no explicit
    # copy of the whole input frame is needed.
    d = data[feats].fillna(0.0)  # provisional imputation with 0
    for f in feats:
        values = d[f].values.astype(float)
        # Start from the sentinel and overwrite only the entries where the
        # log is defined; this avoids a Python-level call per element and
        # never evaluates log on non-positive arguments.
        transformed = np.full(values.shape, -1.0)
        np.log(values + 1.0, out=transformed, where=values > -1)
        d[f] = transformed

    return d

In [19]:
data_dense = process_dense_feats(data, dense_feats)

In [20]:
data_dense

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13
0,0.693147,0.693147,1.791759,0.000000,7.232010,1.609438,2.772589,1.098612,5.204007,0.693147,1.098612,0.000000,1.098612
1,1.098612,0.000000,3.806662,0.693147,4.634729,2.197225,1.098612,1.098612,1.609438,0.693147,0.693147,0.000000,1.609438
2,1.098612,0.000000,0.693147,2.708050,6.643790,4.499810,1.609438,1.098612,5.505332,0.693147,1.386294,1.386294,3.828641
3,0.000000,6.795706,0.000000,0.000000,8.387768,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,1.386294,-1.000000,0.000000,0.000000,1.098612,0.000000,1.386294,0.000000,0.000000,0.693147,0.693147,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,0.000000,0.000000,4.465908,1.791759,7.864420,4.605170,1.386294,3.135494,3.135494,0.000000,0.693147,0.000000,1.791759
599996,0.693147,0.693147,1.098612,1.098612,7.071573,2.833213,0.693147,2.833213,2.833213,0.693147,0.693147,0.000000,1.098612
599997,0.000000,1.791759,0.693147,1.098612,8.349484,4.779123,1.945910,2.397895,3.713572,0.000000,0.693147,0.000000,1.098612
599998,0.000000,1.791759,4.290459,2.197225,7.864804,3.713572,0.693147,2.708050,2.708050,0.000000,0.000000,0.000000,2.197225


## 处理sparse特征

In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
def process_sparse_feats(data, feats):
    """Integer-encode the categorical columns ``feats`` of ``data``.

    Missing values are first replaced by the string "-1" so that they form a
    category of their own. Each column is then encoded independently with a
    freshly fitted ``LabelEncoder``.

    Args:
        data: DataFrame containing at least the columns in ``feats``.
        feats: list of categorical column names to encode.

    Returns:
        A new DataFrame holding only ``feats``, each replaced by its integer
        codes. ``data`` itself is not modified.
    """
    encoded = data.copy()[feats].fillna("-1")  # default fill value is "-1"
    for col in feats:
        encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded

In [23]:
data_sparse = process_sparse_feats(data, sparse_feats)

## 处理后的全量数据

In [24]:
data_sparse

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,470,261,203952,41641,38,6,8961,63,2,16515,...,9,3439,213,3,4954,0,3,24768,52,14364
1,470,498,90258,22218,38,13,5957,19,2,4195,...,0,2465,213,1,60664,0,3,8432,52,10835
2,170,24,2223,65253,38,6,8067,19,2,5767,...,6,738,0,0,143786,9,3,7344,0,0
3,470,93,137623,15635,38,13,1935,19,2,23623,...,1,1648,0,0,67107,0,3,18107,0,0
4,612,368,162265,83638,38,2,7067,19,2,8071,...,1,556,0,0,21257,0,2,22439,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,23,66,172718,61471,38,0,577,196,2,11967,...,9,812,213,2,60254,0,2,30065,1,6169
599996,23,120,160098,68483,38,6,7567,7,2,12556,...,8,1194,0,0,39464,0,5,10799,0,0
599997,470,40,170810,7850,95,13,6953,19,2,24655,...,9,2695,0,0,24566,9,3,22603,0,0
599998,673,83,114264,83007,38,13,5212,19,2,14104,...,9,1929,27,3,125582,0,3,7906,1,6920


In [25]:
total_data = pd.concat([data_dense, data_sparse], axis=1)

In [26]:
total_data['label'] = data['label']

# 模型训练

本部分实现的是Factorization Machine算法，该算法公式为：

$$\hat{y}=w_0+\sum_{i=1}^n w_i x_i + \sum_{i=1}^n \sum_{j=i+1}^n \langle v_i, v_j \rangle x_i x_j$$

我们在实现中没有单独实现偏置项$w_0$（dense特征部分的`Dense(1)`层默认自带一个偏置，可以起到类似作用），因此整个模型可以分为两部分：
- 一阶线性部分
- 二阶交叉部分

## 一阶线性部分

### dense特征

In [27]:
import tensorflow as tf

In [28]:
# One scalar Keras input per dense feature, named after its column.
dense_inputs = [Input([1], name=feat_name) for feat_name in dense_feats]

In [29]:
dense_inputs

[<tf.Tensor 'I1:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I2:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I3:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I4:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I5:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I6:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I7:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I8:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I9:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I10:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I11:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I12:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I13:0' shape=(None, 1) dtype=float32>]

In [30]:
# First-order term of the dense features: join the per-feature inputs along
# axis 1 and pass them through a single-unit Dense layer (a weighted sum).
# NOTE(review): Dense is created with its default arguments, which presumably
# include a bias term — confirm this is intended, since the accompanying text
# says the bias w_0 is ignored.
concat_dense_inputs = Concatenate(axis=1)(dense_inputs)
fst_order_dense_layer = Dense(1)(concat_dense_inputs)

### sparse特征

In [31]:
# One scalar Keras input per sparse feature, named after its column.
sparse_inputs = [Input([1], name=feat_name) for feat_name in sparse_feats]

In [32]:
# First-order weights of the sparse features: a 1-dimensional, L2-regularised
# embedding per categorical column, i.e. one learnable scalar per category.
sparse_1d_embed = []
for sparse_input in sparse_inputs:
    # The tensor name has the form "<feature>:0"; keep the part before ':'.
    feat_name = sparse_input.name.partition(':')[0]
    n_categories = data[feat_name].nunique()
    flatten = Flatten()
    # NOTE(review): the extra row (+1) presumably leaves room for the
    # missing-value category introduced during preprocessing — confirm.
    weight_table = Embedding(
        n_categories + 1, 1,
        embeddings_regularizer=tf.keras.regularizers.l2(0.5),
    )
    sparse_1d_embed.append(flatten(weight_table(sparse_input)))

In [33]:
fst_order_sparse_layer = Add()(sparse_1d_embed)

In [34]:
fst_order_sparse_layer

<tf.Tensor 'add/Identity:0' shape=(None, 1) dtype=float32>

## 二阶交叉部分

In [35]:
k = 8

In [36]:
# Only the sparse features take part in the second-order interactions: each
# categorical column gets a k-dimensional, L2-regularised latent vector per
# category.
sparse_kd_embed = []
for feat_name, sparse_input in zip(sparse_feats, sparse_inputs):
    n_categories = data[feat_name].nunique()
    latent_table = Embedding(
        n_categories + 1, k,
        embeddings_regularizer=tf.keras.regularizers.l2(0.7),
    )
    sparse_kd_embed.append(latent_table(sparse_input))

In [37]:
sparse_kd_embed

[<tf.Tensor 'embedding_26/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_27/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_28/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_29/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_30/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_31/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_32/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_33/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_34/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_35/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_36/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_37/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_38/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_39/Identity:0' shape=(None, 

FM的二阶交叉项如果直接按定义计算，时间复杂度为$O(kn^2)$（$n$为特征数，$k$为隐向量维度）；但可以通过公式化简降到$O(kn)$，详细推导这里不再展开。

其本质的化简思路为：$ab=\frac{1}{2}[(a+b)^2-(a^2+b^2)]$

因此我们的内积项可以进行化简：

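$$\sum_{i=1}^n\sum_{j=i+1}^n \langle v_i, v_j \rangle x_i x_j=\frac{1}{2} \sum_{f=1}^k\left[\left(\sum_{i=1}^n v_{i,f}\,x_i\right)^2 - \sum_{i=1}^n v_{i,f}^2\,x_i^2\right]$$

由于这里只对sparse特征做交叉，每个特征取到的类别对应$x_i=1$，所以代码中直接对各特征的embedding向量求和、平方即可。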

In [38]:
# 1. Stack the embeddings of all sparse features along axis 1, giving an
#    (n, k) matrix per sample, where n is the number of sparse features and
#    k the embedding size.
concat_sparse_kd_embed = Concatenate(axis=1)(sparse_kd_embed)  # ?, n, k

In [39]:
# 2. Sum first, then square: add up the n embedding vectors and square the
#    result element-wise (one value per latent dimension).
sum_kd_embed = Lambda(lambda x: K.sum(x, axis=1))(concat_sparse_kd_embed)  # ?, k
square_sum_kd_embed = Multiply()([sum_kd_embed, sum_kd_embed])  # ?, k

In [40]:
# 3. Square first, then sum: square every embedding entry element-wise and
#    add the squares up over the n features.
square_kd_embed = Multiply()([concat_sparse_kd_embed, concat_sparse_kd_embed]) # ?, n, k
sum_square_kd_embed = Lambda(lambda x: K.sum(x, axis=1))(square_kd_embed)  # ?, k

In [41]:
# 4. Subtract the two terms and halve the difference, then sum over the k
#    latent dimensions (keeping a trailing axis of size 1) to obtain the
#    second-order score per sample.
sub = Subtract()([square_sum_kd_embed, sum_square_kd_embed])
sub = Lambda(lambda x: x*0.5)(sub)
snd_order_sparse_layer = Lambda(lambda x: K.sum(x, axis=1, keepdims=True))(sub)

## 输出层

In [42]:
# Linear (first-order) part: dense and sparse first-order terms added together.
linear_part = Add()([fst_order_dense_layer, fst_order_sparse_layer])
# Interaction (second-order) part: the pairwise term over the sparse features.
interaction_part = snd_order_sparse_layer

In [43]:
# The FM score is the sum of both parts; a sigmoid turns it into the
# predicted click probability.
logits = Add()([linear_part, interaction_part])
output_layer = Activation("sigmoid")(logits)

## 编译模型

In [52]:
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model

In [53]:
model = Model(dense_inputs+sparse_inputs, output_layer)

In [54]:
plot_model(model, "fm_model.png", show_shapes=True)

Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.


In [55]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
C1 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C2 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C3 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C4 (InputLayer)                 [(None, 1)]          0                                            
____________________________________________________________________________________________

In [56]:
import tensorflow as tf

In [57]:
# Train with Adam on the binary cross-entropy loss; additionally report the
# binary cross-entropy and the AUC as metrics.
# NOTE(review): the embeddings carry L2 regularizers, so the reported loss
# presumably includes those penalties while the metric does not — confirm.
model.compile(optimizer="adam", 
              loss="binary_crossentropy", 
              metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')])

## 训练

In [58]:
from tensorflow.keras.callbacks import TensorBoard

In [59]:
# TensorBoard logging callback passed to model.fit.
# The TF1-era options `write_grads` and `embeddings_layer_names` are not
# passed: the TensorFlow 2 callback ignores them (logging a warning) and
# later Keras versions do not accept them at all.
# NOTE(review): with histogram_freq=0, `write_images` may have no visible
# effect, since weight images appear to be logged together with the
# histograms — confirm.
tbCallBack = TensorBoard(log_dir='./logs',  # directory the event files are written to
                 histogram_freq=0,  # epochs between histogram computations; 0 disables them
                 write_graph=True,  # also store the network graph
                 write_images=True,  # also visualize the model weights as images
                 embeddings_freq=0,  # epochs between embedding snapshots; 0 disables them
                 embeddings_metadata=None)



In [60]:
# Label-based split (``.loc`` slices include both end labels): rows labelled
# 0..499999 are used for training, everything from label 500000 onwards for
# validation.
# NOTE(review): this equals "the first 500k rows" only while the index is the
# default 0..N-1 range — confirm.
train_data, valid_data = total_data.loc[:499999], total_data.loc[500000:]

In [61]:
# One value array per feature column of the training split, kept in the
# order of the feature-name lists.
train_dense_x = [train_data[name].values for name in dense_feats]
train_sparse_x = [train_data[name].values for name in sparse_feats]

In [62]:
train_label = [train_data['label'].values]

In [63]:
# One value array per feature column of the validation split, kept in the
# order of the feature-name lists.
val_dense_x = [valid_data[name].values for name in dense_feats]
val_sparse_x = [valid_data[name].values for name in sparse_feats]

In [64]:
val_label = [valid_data['label'].values]

In [65]:
# Fit for 5 epochs with mini-batches of 256 samples, passing the validation
# split and the TensorBoard callback. Inputs are passed dense features
# first, then sparse features.
model.fit(
    x=train_dense_x + train_sparse_x,
    y=train_label,
    batch_size=256,
    epochs=5,
    callbacks=[tbCallBack],
    validation_data=(val_dense_x + val_sparse_x, val_label),
)

Train on 500000 samples, validate on 100000 samples
Epoch 1/5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f7d440d0ef0>

In [68]:
!tensorboard --host 0.0.0.0 --logdir ./logs 

2020-04-10 14:38:00.489712: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64:/usr/local/cuda-10.1/extras/CUPTI/lib64/:/usr/local/cuda-10.1/extras/Sanitizer/
2020-04-10 14:38:00.489865: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64:/usr/local/cuda-10.1/extras/CUPTI/lib64/:/usr/local/cuda-10.1/extras/Sanitizer/
2020-04-10 14:38:00.489883: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:30] Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
TensorBoard 2.1.1 at http://0.0