## 설치
python 3.5~3.7  
$ pip install deepctr[gpu]   

## DeepCTR의 4가지 단계

- DeepFM: A Factorization-Machine based Neural Network for CTR Prediction 논문을 예시로 들 예정입니다  

- 참고 : https://www.ijcai.org/Proceedings/2017/0239.pdf

- 다룰 샘플 데이터는 Criteo Display Ads 데이터이며, 광고 클릭률 예측을 하는 것이 목적입니다.  
    - 13개의 정수 feature, 26개의 범주형 feature들을 가지고 있습니다.(26 categorical features where each category has a high cardinality.)  
    - Task는 이진분류입니다

### Step 1 : import model & sample data fillna

In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score
from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat, DenseFeat, get_feature_names

In [2]:
data = pd.read_csv('./examples/criteo_sample.txt')  # load the Criteo sample file into a DataFrame (read_csv defaults to comma-separated)

In [3]:
data.head()  # preview the first five rows; missing values are still NaN at this point

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,,3,260.0,,17668.0,,,33.0,,...,e5ba7672,87c6f83c,,,0429f84b,,3a171ecb,c0d61a5c,,
1,0,,-1,19.0,35.0,30251.0,247.0,1.0,35.0,160.0,...,d4bb7bd8,6fc84bfb,,,5155d8a3,,be7c41b4,ded4aac9,,
2,0,0.0,0,2.0,12.0,2013.0,164.0,6.0,35.0,523.0,...,e5ba7672,675c9258,,,2e01979f,,bcdee96c,6d5d1302,,
3,0,,13,1.0,4.0,16836.0,200.0,5.0,4.0,29.0,...,e5ba7672,52e44668,,,e587c466,,32c7478e,3b183c5c,,
4,0,0.0,0,104.0,27.0,1990.0,142.0,4.0,32.0,37.0,...,e5ba7672,25c88e42,21ddcdc9,b1252a9d,0e8585d2,,32c7478e,0d4a6d1a,001f3601,92c878de


In [4]:
data.shape  # (number of rows, number of columns)

(200, 40)

In [5]:
data['label'].value_counts()  # class balance of the binary click label

0    151
1     49
Name: label, dtype: int64

In [6]:
sparse_features = ['C{}'.format(n) for n in range(1, 27)]  # categorical column names C1..C26
dense_features = ['I{}'.format(n) for n in range(1, 14)]  # integer column names I1..I13
print(sparse_features, dense_features)

['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26'] ['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11', 'I12', 'I13']


In [7]:
target = ['label']  # name of the label column, kept as a list
data[dense_features] = data[dense_features].fillna(0)  # missing numeric values become 0
data[sparse_features] = data[sparse_features].fillna('-1')  # missing categories become the literal string '-1' (the original author notes they are unsure why this value was chosen)

data.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,0.0,3,260.0,0.0,17668.0,0.0,0.0,33.0,0.0,...,e5ba7672,87c6f83c,-1,-1,0429f84b,-1,3a171ecb,c0d61a5c,-1,-1
1,0,0.0,-1,19.0,35.0,30251.0,247.0,1.0,35.0,160.0,...,d4bb7bd8,6fc84bfb,-1,-1,5155d8a3,-1,be7c41b4,ded4aac9,-1,-1
2,0,0.0,0,2.0,12.0,2013.0,164.0,6.0,35.0,523.0,...,e5ba7672,675c9258,-1,-1,2e01979f,-1,bcdee96c,6d5d1302,-1,-1
3,0,0.0,13,1.0,4.0,16836.0,200.0,5.0,4.0,29.0,...,e5ba7672,52e44668,-1,-1,e587c466,-1,32c7478e,3b183c5c,-1,-1
4,0,0.0,0,104.0,27.0,1990.0,142.0,4.0,32.0,37.0,...,e5ba7672,25c88e42,21ddcdc9,b1252a9d,0e8585d2,-1,32c7478e,0d4a6d1a,001f3601,92c878de


### Step 2 : 간단한 전처리

일반적으로 sparse한 feature는 임베딩에 넣기 전에 정수 인덱스로 인코딩하며, 그 방법은 두 가지가 있습니다. (label encoding, hash encoding)  

- label encoding : 정수값으로 매핑을 시킴. 값의 범위는 [0 ~ len(#unique)-1]
- hash encoding : 고정 범위 값으로 매핑 (예를 들면, 0 ~ 9999)

In [8]:
for feat in sparse_features:  # NOTE: `feat` stays bound to the last column ('C26') after the loop and is reused by later cells
    lbe = LabelEncoder().fit(data[feat])  # one fresh encoder per categorical column
    data[feat] = lbe.transform(data[feat])  # replace each category with its integer code

In [9]:
data[feat].head()  # `feat` is the leftover loop variable, so this previews the label-encoded 'C26' column

0     0
1     0
2     0
3     0
4    47
Name: C26, dtype: int32

In [10]:
data[feat].tail()  # last five rows of the same label-encoded 'C26' column

195    48
196     0
197    11
198    49
199     0
Name: C26, dtype: int32

In [11]:
mms = MinMaxScaler(feature_range=(0, 1)).fit(data[dense_features])  # NOTE(review): fitted on all rows, i.e. before the train/test split, so test rows influence the min/max
data[dense_features] = mms.transform(data[dense_features])  # rescale every dense column into [0, 1]

In [12]:
data[dense_features].head()  # preview the dense columns after min-max scaling

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13
0,0.0,0.001332,0.092362,0.0,0.034825,0.0,0.0,0.673469,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.00675,0.402299,0.059628,0.117284,0.003322,0.714286,0.154739,0.0,0.03125,0.0,0.343137
2,0.0,0.000333,0.00071,0.137931,0.003968,0.077873,0.019934,0.714286,0.505803,0.0,0.09375,0.0,0.176471
3,0.0,0.004664,0.000355,0.045977,0.033185,0.094967,0.016611,0.081633,0.028046,0.0,0.0625,0.0,0.039216
4,0.0,0.000333,0.036945,0.310345,0.003922,0.067426,0.013289,0.653061,0.035783,0.0,0.03125,0.0,0.264706


### Step 3 : 파생변수 생성
sparse feature에 대해서 보통 dense하게 임베딩 시키는 방법을 많이 적용합니다.  
임베딩된 sparse feature를 dense numerical feature와 합쳐서(concatenate) FC(fully connected) Layer에 input으로 넣습니다.  
여기서는 Label Encoding을 실시 합니다.


In [13]:
sparse_feature_columns = [
    # nunique() is a valid vocabulary size here only because the columns were
    # label-encoded into contiguous codes 0..n-1 over the whole DataFrame.
    SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
    for feat in sparse_features
]
dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]  # DenseFeat takes a name and a dimension

In [14]:
DenseFeat(feat, 1)  # demo of the DenseFeat fields only: `feat` is the leftover loop variable ('C26'), which is a categorical column, not a real dense feature

DenseFeat(name='C26', dimension=1, dtype='float32')

In [15]:
sparse_feature_columns[:6]  # first six SparseFeat definitions

[SparseFeat(name='C1', vocabulary_size=27, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C1', group_name='default_group'),
 SparseFeat(name='C2', vocabulary_size=92, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C2', group_name='default_group'),
 SparseFeat(name='C3', vocabulary_size=172, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C3', group_name='default_group'),
 SparseFeat(name='C4', vocabulary_size=157, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C4', group_name='default_group'),
 SparseFeat(name='C5', vocabulary_size=12, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C5', group_name='default_group'),
 SparseFeat(name='C6', vocabulary_size=7, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C6', group_name='default_group')]

In [16]:
dense_feature_columns[:6]  # first six DenseFeat definitions

[DenseFeat(name='I1', dimension=1, dtype='float32'),
 DenseFeat(name='I2', dimension=1, dtype='float32'),
 DenseFeat(name='I3', dimension=1, dtype='float32'),
 DenseFeat(name='I4', dimension=1, dtype='float32'),
 DenseFeat(name='I5', dimension=1, dtype='float32'),
 DenseFeat(name='I6', dimension=1, dtype='float32')]

In [17]:
linear_feature_columns = sparse_feature_columns + dense_feature_columns  # columns fed to the linear part
dnn_feature_columns = list(linear_feature_columns)  # the deep part uses the same columns (separate list object)

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)  # the recorded result lists each feature name once even though both lists are passed

In [18]:
feature_names  # 26 sparse names followed by 13 dense names

['C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21',
 'C22',
 'C23',
 'C24',
 'C25',
 'C26',
 'I1',
 'I2',
 'I3',
 'I4',
 'I5',
 'I6',
 'I7',
 'I8',
 'I9',
 'I10',
 'I11',
 'I12',
 'I13']

### Step 4 : 훈련 샘플 데이터 생성하고 모델 훈련 시키기!
데이터들을 나누고, 인풋을 하기위한 작업을 실시.  
deepctr에 있는 인풋 함수들이 매우 다양하므로 잘 활용해보는 것이 중요함

In [19]:
train, test = train_test_split(data, test_size=0.2, random_state=2020)  # fixed seed so the same 160/40 row split is produced on every run

train_model_input = {name: train[name].values for name in feature_names}  # model input: feature name -> array of that column's values
test_model_input = {name: test[name].values for name in feature_names}

In [20]:
dict(list(train_model_input.items())[0:2]) # show only the first two entries of the input dictionary

{'C1': array([ 0,  4, 14,  0, 11, 10,  0, 21, 11,  0, 16,  0, 11, 18,  0,  0,  0,
         5, 21, 11,  6,  9, 11,  0, 11,  6, 11,  0, 16,  0, 11, 22, 16,  0,
        19,  0, 11,  3,  0, 11,  0,  0,  0,  0,  0,  0, 16,  2, 10,  0,  0,
         0,  7,  9,  0, 16,  6,  1, 21, 11,  0,  9, 14, 13,  0, 11,  0, 26,
        11,  7,  9, 23,  0,  9,  0,  0, 11,  0,  0, 16, 16, 11, 11,  0,  9,
         0,  0, 21,  0, 16,  0, 21, 11,  0, 19, 11,  9,  0, 25,  0,  0, 11,
        11, 11, 11,  0,  0,  0,  0, 15,  9,  0,  0, 11, 12, 16, 11, 16,  0,
         0,  0,  9, 11,  0, 18,  6,  0,  0,  0, 11,  0,  0, 21, 11,  8,  0,
        10, 16,  0,  6,  0,  0, 11, 20,  0, 11, 21, 10,  0,  0, 11, 11, 11,
         0, 11, 18,  0, 10,  0, 25]),
 'C2': array([30, 38, 18, 72,  5, 30, 57, 18, 44, 18, 32, 31, 18, 20,  4, 11, 18,
        80, 45,  5, 74, 63, 84, 88, 42, 71, 18, 18,  0, 52, 23, 78, 39,  5,
        19, 28, 84, 18,  5, 16, 85, 35, 19,  6, 87, 50, 75, 65, 55,  5, 18,
        19, 31, 18, 27, 82, 39, 13, 64

In [21]:
dict(list(train_model_input.items())[30:31])  # entry 30 is a dense feature ('I5' in the recorded output)

{'I5': array([3.64060686e-03, 3.94218393e-06, 2.66649321e-02, 3.94218393e-06,
        8.50920401e-03, 2.33810929e-02, 1.26662370e-02, 0.00000000e+00,
        1.97109197e-06, 2.44468623e-01, 1.47358835e-02, 2.75558657e-03,
        5.79303929e-03, 1.25266837e-01, 3.48252528e-02, 5.34560141e-03,
        3.96780813e-03, 2.16820116e-05, 2.06314196e-02, 6.03225101e-01,
        3.31734778e-03, 4.63334733e-01, 1.33245817e-03, 3.63153984e-02,
        0.00000000e+00, 6.20893969e-03, 0.00000000e+00, 2.29927878e-02,
        1.48247798e-01, 2.90676932e-02, 7.03679832e-04, 1.56938342e-01,
        2.03929175e-02, 8.28331687e-02, 5.64717848e-03, 2.12700534e-02,
        1.24178794e-04, 5.89356498e-04, 9.85545983e-06, 8.51610284e-02,
        1.57687357e-05, 8.87188494e-03, 2.39349697e-02, 1.37976438e-05,
        3.50854370e-04, 2.16820116e-05, 1.71879219e-03, 7.39356596e-03,
        1.81340461e-04, 0.00000000e+00, 1.97109197e-05, 4.33640232e-03,
        4.84888623e-04, 5.91327590e-04, 4.13929313e-05, 1.

In [22]:
train.head()  # first rows of the shuffled training split, already encoded and scaled

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
132,1,0.0,0.050966,0.001066,0.034483,0.003641,0.045584,0.039867,0.122449,0.010638,...,8,61,5,3,102,0,8,35,1,24
31,1,0.027027,0.000333,0.000355,0.0,4e-06,0.0,0.013289,0.0,0.0,...,8,70,5,2,133,0,0,77,5,58
51,0,0.0,0.039973,0.001421,0.045977,0.026665,0.0,0.0,0.142857,0.033849,...,0,32,0,0,158,0,1,110,0,0
112,1,1.0,0.037975,1.0,0.057471,4e-06,0.001425,0.086379,1.0,0.075435,...,8,65,5,1,125,1,0,61,9,62
94,0,0.0,0.000333,0.00071,0.0,0.008509,0.0,0.026578,0.0,0.0,...,8,40,10,3,115,0,0,21,16,25


이제 DeepFM을 활용할 시간, parameter들은 아래와 같이 다양하다.  
linear한 부분과 deep한 부분을 잘 나누고, task 목적에 맞게 적용하면 되는 것 같다.  
- param linear_feature_columns: An iterable containing all the features used by linear part of the model.
- param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
- param fm_group: list, group_name of features that will be used to do feature interactions.
- param dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of DNN
- param l2_reg_linear: float. L2 regularizer strength applied to linear part
- param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
- param l2_reg_dnn: float. L2 regularizer strength applied to DNN
- param init_std: float,to use as the initialize std of embedding vector  
- param seed: integer ,to use as random seed.  
- param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.  
- param dnn_activation: Activation function to use in DNN  
- param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in DNN  
- param task: str, ``"binary"`` for  binary logloss or  ``"regression"`` for regression loss  
return: A Keras model instance.

In [23]:
model = DeepFM(
    linear_feature_columns,
    dnn_feature_columns,
    task='binary',  # binary log-loss task (click / no click)
)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['binary_crossentropy'],  # log the same cross-entropy as a metric
)

이제 model.fit으로 모델을 학습시킵니다. 데이터가 작아서 AUC가 들쭉날쭉합니다.

In [24]:
%%time
history = model.fit(train_model_input, train[target].values,  # inputs: dict of per-feature arrays; labels: the 'label' column values
                    batch_size=256, epochs=10, verbose=2, validation_split=0.2, )  # 20% of the training rows are held out for validation (128 train / 32 validation in the recorded run)

Train on 128 samples, validate on 32 samples
Epoch 1/10
128/128 - 9s - loss: 0.7348 - binary_crossentropy: 0.7348 - val_loss: 0.6742 - val_binary_crossentropy: 0.6742
Epoch 2/10
128/128 - 0s - loss: 0.6862 - binary_crossentropy: 0.6861 - val_loss: 0.6576 - val_binary_crossentropy: 0.6576
Epoch 3/10
128/128 - 0s - loss: 0.6412 - binary_crossentropy: 0.6412 - val_loss: 0.6433 - val_binary_crossentropy: 0.6432
Epoch 4/10
128/128 - 0s - loss: 0.5996 - binary_crossentropy: 0.5996 - val_loss: 0.6312 - val_binary_crossentropy: 0.6312
Epoch 5/10
128/128 - 0s - loss: 0.5612 - binary_crossentropy: 0.5612 - val_loss: 0.6215 - val_binary_crossentropy: 0.6214
Epoch 6/10
128/128 - 0s - loss: 0.5259 - binary_crossentropy: 0.5258 - val_loss: 0.6138 - val_binary_crossentropy: 0.6138
Epoch 7/10
128/128 - 0s - loss: 0.4934 - binary_crossentropy: 0.4934 - val_loss: 0.6083 - val_binary_crossentropy: 0.6082
Epoch 8/10
128/128 - 0s - loss: 0.4637 - binary_crossentropy: 0.4637 - val_loss: 0.6048 - val_binary_

In [25]:
pred_ans = model.predict(test_model_input, batch_size=256)  # score the held-out test rows

In [26]:
pred_ans # predicted click probabilities, one per test row

array([[0.27317607],
       [0.32546216],
       [0.48085976],
       [0.29952896],
       [0.29341197],
       [0.35952592],
       [0.41922015],
       [0.36183697],
       [0.25876385],
       [0.28970796],
       [0.30900788],
       [0.33773547],
       [0.3930998 ],
       [0.3503009 ],
       [0.3608576 ],
       [0.34273624],
       [0.31554228],
       [0.28798962],
       [0.47089738],
       [0.29938358],
       [0.32682222],
       [0.3708282 ],
       [0.2527224 ],
       [0.3172428 ],
       [0.36748862],
       [0.3398018 ],
       [0.36816552],
       [0.33345556],
       [0.26156232],
       [0.32996756],
       [0.28315267],
       [0.33249533],
       [0.27098158],
       [0.34877563],
       [0.30287418],
       [0.3301702 ],
       [0.5383497 ],
       [0.3515877 ],
       [0.3783651 ],
       [0.39717102]], dtype=float32)

In [27]:
test_labels = test[target].values  # ground-truth labels of the held-out rows
test_logloss = round(log_loss(test_labels, pred_ans), 4)
test_auc = round(roc_auc_score(test_labels, pred_ans), 4)
print("test LogLoss", test_logloss)
print("test AUC", test_auc)

test LogLoss 0.5765
test AUC 0.5133
