## 설치
python 3.5~3.7  
$ pip install deepctr-torch

## DeepCTR의 4가지 단계
- DeepFM: A Factorization-Machine based Neural Network for CTR Prediction 논문을 예시로 들 예정입니다

- 참고 : https://www.ijcai.org/Proceedings/2017/0239.pdf

- 다룰 샘플 데이터는 Criteo Display Ads 데이터이며, 광고 클릭률 예측을 하는 것이 목적입니다.

    - 13개의 정수 feature, 26개의 범주형 feature들을 가지고 있습니다.(26 categorical features where each category has a high cardinality.)
    - Task는 이진분류입니다

### Step 1 : import model & sample data fillna

In [24]:
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score
from deepctr_torch.models import DeepFM
from deepctr_torch.inputs import  SparseFeat, DenseFeat,get_feature_names
import torch

In [2]:
# Load the Criteo sample file from the examples directory into a DataFrame.
data = pd.read_csv('./examples/criteo_sample.txt')

In [3]:
# Preview the first rows of the raw data (missing values are still present).
data.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,,3,260.0,,17668.0,,,33.0,,...,e5ba7672,87c6f83c,,,0429f84b,,3a171ecb,c0d61a5c,,
1,0,,-1,19.0,35.0,30251.0,247.0,1.0,35.0,160.0,...,d4bb7bd8,6fc84bfb,,,5155d8a3,,be7c41b4,ded4aac9,,
2,0,0.0,0,2.0,12.0,2013.0,164.0,6.0,35.0,523.0,...,e5ba7672,675c9258,,,2e01979f,,bcdee96c,6d5d1302,,
3,0,,13,1.0,4.0,16836.0,200.0,5.0,4.0,29.0,...,e5ba7672,52e44668,,,e587c466,,32c7478e,3b183c5c,,
4,0,0.0,0,104.0,27.0,1990.0,142.0,4.0,32.0,37.0,...,e5ba7672,25c88e42,21ddcdc9,b1252a9d,0e8585d2,,32c7478e,0d4a6d1a,001f3601,92c878de


In [4]:
# (rows, columns) of the loaded sample.
data.shape

(200, 40)

In [5]:
# Count how many rows fall into each class of the binary click label.
data['label'].value_counts()

0    151
1     49
Name: label, dtype: int64

In [6]:
# Column names: 26 categorical columns C1..C26 and 13 numeric columns I1..I13.
sparse_features = ['C{}'.format(idx) for idx in range(1, 27)]
dense_features = ['I{}'.format(idx) for idx in range(1, 14)]
print(sparse_features, dense_features)

['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26'] ['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11', 'I12', 'I13']


In [7]:
# Impute missing values before encoding/scaling: numeric columns get 0 and
# categorical columns get the placeholder string '-1'.
# (Original author's note: the rationale for these fill values is unknown.)
data[dense_features] = data[dense_features].fillna(0)
data[sparse_features] = data[sparse_features].fillna('-1')
target = ['label']

data.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,0.0,3,260.0,0.0,17668.0,0.0,0.0,33.0,0.0,...,e5ba7672,87c6f83c,-1,-1,0429f84b,-1,3a171ecb,c0d61a5c,-1,-1
1,0,0.0,-1,19.0,35.0,30251.0,247.0,1.0,35.0,160.0,...,d4bb7bd8,6fc84bfb,-1,-1,5155d8a3,-1,be7c41b4,ded4aac9,-1,-1
2,0,0.0,0,2.0,12.0,2013.0,164.0,6.0,35.0,523.0,...,e5ba7672,675c9258,-1,-1,2e01979f,-1,bcdee96c,6d5d1302,-1,-1
3,0,0.0,13,1.0,4.0,16836.0,200.0,5.0,4.0,29.0,...,e5ba7672,52e44668,-1,-1,e587c466,-1,32c7478e,3b183c5c,-1,-1
4,0,0.0,0,104.0,27.0,1990.0,142.0,4.0,32.0,37.0,...,e5ba7672,25c88e42,21ddcdc9,b1252a9d,0e8585d2,-1,32c7478e,0d4a6d1a,001f3601,92c878de


### Step 2 : 간단한 전처리
일반적으로 sparse한 범주형 feature를 임베딩에 넣기 위해 인코딩하는 방법은 두 가지가 있습니다. (label encoding, hash encoding)

- label encoding : 정수값으로 매핑을 시킴. 값의 범위는 [0 ~ len(#unique)-1]
- hash encoding : 고정 범위 값으로 매핑 (예를 들면, 0 ~ 9999)

In [8]:
# Label-encode each categorical column in place, using a fresh encoder per
# column.
# NOTE: `feat` keeps its last value after the loop and is reused by the
# preview cells that follow.
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

In [9]:
# `feat` is the last column visited by the encoding loop, so this previews it.
data[feat].head()

0     0
1     0
2     0
3     0
4    47
Name: C26, dtype: int32

In [10]:
# Last rows of the same encoded column.
data[feat].tail()

195    48
196     0
197    11
198    49
199     0
Name: C26, dtype: int32

In [11]:
# Rescale every numeric column to the [0, 1] range: learn the per-column
# min/max first, then apply the scaling in place.
mms = MinMaxScaler(feature_range=(0, 1)).fit(data[dense_features])
data[dense_features] = mms.transform(data[dense_features])

In [12]:
# Check the numeric columns after scaling.
data[dense_features].head()

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13
0,0.0,0.001332,0.092362,0.0,0.034825,0.0,0.0,0.673469,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.00675,0.402299,0.059628,0.117284,0.003322,0.714286,0.154739,0.0,0.03125,0.0,0.343137
2,0.0,0.000333,0.00071,0.137931,0.003968,0.077873,0.019934,0.714286,0.505803,0.0,0.09375,0.0,0.176471
3,0.0,0.004664,0.000355,0.045977,0.033185,0.094967,0.016611,0.081633,0.028046,0.0,0.0625,0.0,0.039216
4,0.0,0.000333,0.036945,0.310345,0.003922,0.067426,0.013289,0.653061,0.035783,0.0,0.03125,0.0,0.264706


### Step 3 : 파생변수 생성
sparse feature에 대해서 보통 dense하게 임베딩 시키는 방법을 많이 적용합니다.
dense numerical feature와 합친(concatenate) 뒤 FC(fully connected) Layer에 input으로 넣습니다.
여기서는 Label Encoding을 실시 합니다.

In [13]:
# One SparseFeat per categorical column, sized by that column's number of
# distinct values; one DenseFeat of dimension 1 per numeric column.
sparse_feature_columns = [
    SparseFeat(col_name, data[col_name].nunique()) for col_name in sparse_features
]
dense_feature_columns = [
    DenseFeat(col_name, 1) for col_name in dense_features
]

In [15]:
# Inspect the first six sparse feature definitions.
sparse_feature_columns[:6]

[SparseFeat(name='C1', dimension=27, use_hash=False, dtype='int32', embedding_name='C1', embedding=True),
 SparseFeat(name='C2', dimension=92, use_hash=False, dtype='int32', embedding_name='C2', embedding=True),
 SparseFeat(name='C3', dimension=172, use_hash=False, dtype='int32', embedding_name='C3', embedding=True),
 SparseFeat(name='C4', dimension=157, use_hash=False, dtype='int32', embedding_name='C4', embedding=True),
 SparseFeat(name='C5', dimension=12, use_hash=False, dtype='int32', embedding_name='C5', embedding=True),
 SparseFeat(name='C6', dimension=7, use_hash=False, dtype='int32', embedding_name='C6', embedding=True)]

In [16]:
# Inspect the first six dense feature definitions.
dense_feature_columns[:6]

[DenseFeat(name='I1', dimension=1, dtype='float32'),
 DenseFeat(name='I2', dimension=1, dtype='float32'),
 DenseFeat(name='I3', dimension=1, dtype='float32'),
 DenseFeat(name='I4', dimension=1, dtype='float32'),
 DenseFeat(name='I5', dimension=1, dtype='float32'),
 DenseFeat(name='I6', dimension=1, dtype='float32')]

In [17]:
# The linear part and the deep part of the model are given the same features.
linear_feature_columns = sparse_feature_columns + dense_feature_columns
dnn_feature_columns = sparse_feature_columns + dense_feature_columns

# Names of the model inputs, C1 ~ I13.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

### Step 4 : 훈련 샘플 데이터 생성하고 모델 훈련 시키기
데이터들을 나누고, 인풋을 하기위한 작업을 실시.  
deepctr에 있는 인풋 함수들이 매우 다양하므로 잘 활용해보는 것이 중요함

In [18]:
# Hold out 20% of the rows for testing. A fixed random_state puts the same
# rows in train/test on every run, so the split itself is reproducible.
train, test = train_test_split(data, test_size=0.2, random_state=2020)

# The model input is a dict mapping each feature name to that column's values.
train_model_input = {name: train[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}

In [19]:
dict(list(train_model_input.items())[0:2]) # view only part of the dictionary

{'C1': array([ 0,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  6,  0,  9, 16, 11, 14,
         0,  3,  0, 11,  0,  0,  0, 14, 17,  0,  0,  0,  0,  0,  9, 16,  0,
        10, 21, 11, 11, 18,  0,  1,  0,  0, 16, 21, 16,  0,  5, 25, 11,  0,
        10,  0, 11, 23,  0, 11, 16, 21,  0, 11, 16, 11, 11, 16,  0,  4, 21,
         0,  6,  0, 25, 11, 18, 11,  1, 11, 16, 11,  2, 16,  0, 11,  0,  0,
         7,  0,  0, 16, 13,  6,  0,  0,  0, 11,  0, 11,  3, 26, 11,  9,  0,
         0, 11,  0,  9,  0, 21, 10, 11, 11,  0,  0, 10,  0,  0, 16,  0, 11,
        16,  0,  0,  9,  0, 11,  0,  9,  0, 11,  0,  0,  9, 11,  6, 18, 11,
         0,  0, 21,  0,  0, 11,  0,  0,  0, 16,  0, 19,  9,  0,  0,  0, 12,
         0,  0, 21, 10, 16,  0,  0]),
 'C2': array([30, 31, 56, 18,  9,  5, 89, 28, 46, 18, 84, 72, 84, 60,  5, 44, 40,
        13,  5, 18, 18, 84, 44, 81, 18, 67, 19,  0, 77, 18, 18, 73, 33, 23,
        26, 45, 46, 85, 20, 18, 13, 33, 35,  7, 89, 30, 46, 80, 18, 16, 35,
        12, 19, 42, 69, 18, 10, 82, 68

In [20]:
# Peek at a single entry further along in the input dict.
dict(list(train_model_input.items())[30:31])

{'I5': array([2.32588852e-04, 4.84888623e-04, 6.24639044e-03, 2.83916087e-02,
        1.08015840e-03, 7.54928223e-04, 1.04073656e-03, 2.12700534e-02,
        0.00000000e+00, 2.29927878e-02, 4.00131669e-04, 1.13889694e-02,
        7.22799424e-03, 7.77083296e-02, 2.44021185e-03, 1.97109197e-06,
        3.29803108e-02, 6.72536579e-03, 7.35611521e-03, 3.53811008e-03,
        0.00000000e+00, 1.71169626e-02, 1.81340461e-04, 2.75164438e-03,
        2.66649321e-02, 7.29304027e-05, 2.99014651e-03, 5.98423521e-03,
        1.09060518e-02, 1.97109197e-05, 3.00788634e-03, 2.46386496e-04,
        1.11287852e-02, 1.97109197e-06, 2.97634887e-04, 2.06314196e-02,
        3.52963438e-02, 3.51642807e-03, 1.25266837e-01, 0.00000000e+00,
        6.66031975e-03, 5.30223739e-04, 5.91327590e-05, 9.23259477e-03,
        5.51905750e-05, 4.72512137e-01, 6.20893969e-03, 2.16820116e-05,
        6.53614096e-03, 8.51610284e-02, 3.94218393e-06, 2.84763656e-02,
        2.39349697e-02, 0.00000000e+00, 3.09067220e-03, 5.

In [21]:
# Preview the training partition; columns are now encoded/scaled.
train.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
98,0,0.027027,0.000666,0.004973,0.011494,0.000233,0.000475,0.013289,0.020408,0.030948,...,8,61,5,2,75,0,0,35,1,24
92,0,0.027027,0.001666,0.026643,0.241379,0.000485,0.032764,0.003322,0.673469,0.031915,...,8,125,5,1,19,0,1,31,8,27
34,0,0.0,0.0,0.000355,0.264368,0.006246,0.069801,0.20598,0.0,0.72824,...,6,11,3,3,12,0,0,84,12,70
26,0,0.0,0.000666,0.00071,0.183908,0.028392,0.037512,0.006645,0.326531,0.099613,...,0,93,0,0,121,5,0,2,0,0
131,0,0.324324,0.000666,0.000355,0.172414,0.00108,0.011396,0.039867,0.367347,0.019342,...,8,89,0,0,57,0,0,70,0,0


In [22]:
# (rows, columns) of the training partition.
train.shape

(160, 40)

이제 DeepFM을 활용할 시간, parameter들은 아래와 같이 다양하다.  
linear한 부분과 deep한 부분을 잘 나누고, task 목적에 맞게 적용하면 되는 것 같다.  

- param linear_feature_columns: An iterable containing all the features used by linear part of the model.
- param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
- param fm_group: list, group_name of features that will be used to do feature interactions.
- param dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of DNN
- param l2_reg_linear: float. L2 regularizer strength applied to linear part
- param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
- param l2_reg_dnn: float. L2 regularizer strength applied to DNN
- param init_std: float,to use as the initialize std of embedding vector
- param seed: integer ,to use as random seed.
- param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
- param dnn_activation: Activation function to use in DNN
- param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in DNN
- param task: str, "binary" for binary logloss or "regression" for regression loss  
- return: A PyTorch model instance.

In [25]:
# Pick the compute device: the first CUDA GPU when requested and available,
# otherwise fall back to the CPU.
use_cuda = True
device = 'cuda:0' if (use_cuda and torch.cuda.is_available()) else 'cpu'
if device != 'cpu':
    print('cuda ready...')

In [26]:
# Build a DeepFM binary classifier on the selected device, then configure it
# with the Adam optimizer and binary cross-entropy as both loss and metric.
model = DeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
    device=device,
)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['binary_crossentropy'],
)

In [27]:
%%time
# Train for 10 epochs; 20% of the training rows are held out for validation.
# NOTE: the `%%time` cell magic must remain the first line of this cell.
history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

cpu
Train on 128 samples, validate on 32 samples, 1 steps per epoch
Epoch 1/10
0s - loss:  0.6983 - binary_crossentropy:  0.6983 - val_binary_crossentropy:  0.6906
Epoch 2/10
0s - loss:  0.6859 - binary_crossentropy:  0.6859 - val_binary_crossentropy:  0.6818
Epoch 3/10
0s - loss:  0.6735 - binary_crossentropy:  0.6735 - val_binary_crossentropy:  0.6736
Epoch 4/10
0s - loss:  0.6614 - binary_crossentropy:  0.6614 - val_binary_crossentropy:  0.6656
Epoch 5/10
0s - loss:  0.6491 - binary_crossentropy:  0.6491 - val_binary_crossentropy:  0.6578
Epoch 6/10
0s - loss:  0.6368 - binary_crossentropy:  0.6368 - val_binary_crossentropy:  0.6503
Epoch 7/10
0s - loss:  0.6245 - binary_crossentropy:  0.6245 - val_binary_crossentropy:  0.6433
Epoch 8/10
0s - loss:  0.6122 - binary_crossentropy:  0.6122 - val_binary_crossentropy:  0.6370
Epoch 9/10
0s - loss:  0.6001 - binary_crossentropy:  0.6001 - val_binary_crossentropy:  0.6310
Epoch 10/10
0s - loss:  0.5878 - binary_crossentropy:  0.5878 - val_

In [28]:
# Score the held-out test rows with the trained model.
pred_ans = model.predict(test_model_input, batch_size=256)
pred_ans # the predicted click probabilities

array([[0.43578967],
       [0.42702374],
       [0.4441861 ],
       [0.42078272],
       [0.4245204 ],
       [0.43748072],
       [0.4313904 ],
       [0.43499702],
       [0.44296852],
       [0.4303176 ],
       [0.41810435],
       [0.41593993],
       [0.43440115],
       [0.4366478 ],
       [0.42590103],
       [0.41957003],
       [0.43522128],
       [0.41668954],
       [0.42480537],
       [0.42948022],
       [0.40719205],
       [0.45852298],
       [0.44160125],
       [0.43341914],
       [0.4286052 ],
       [0.42457548],
       [0.4162119 ],
       [0.42717403],
       [0.4299931 ],
       [0.44425237],
       [0.43412456],
       [0.43167877],
       [0.42615494],
       [0.44083756],
       [0.42414126],
       [0.42910254],
       [0.43697315],
       [0.44370252],
       [0.41360542],
       [0.4171199 ]], dtype=float32)

In [29]:
# Evaluate the predictions against the true test labels, rounded to 4 places.
y_true = test[target].values

test_logloss = round(log_loss(y_true, pred_ans), 4)
print("test LogLoss", test_logloss)

test_auc = round(roc_auc_score(y_true, pred_ans), 4)
print("test AUC", test_auc)

test LogLoss 0.5989
test AUC 0.4114
