In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr_torch.models import DeepFM
from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat,get_feature_names

In [2]:
def split(x, vocab=None):
    """Encode a '|'-separated string of keys as a list of integer ids.

    Keys that are not yet in the vocabulary are assigned the next free id,
    so the vocabulary grows as a side effect of each call.

    Parameters
    ----------
    x : str
        Raw field value, e.g. ``'Comedy|Drama'``.
    vocab : dict, optional
        Mapping from key to integer id, updated in place. When omitted, the
        notebook-level ``key2index`` dictionary is used; it must be defined
        before the first call.

    Returns
    -------
    list of int
        One id per key, in the order the keys appear in ``x``.
    """
    if vocab is None:
        vocab = key2index
    ids = []
    for key in x.split('|'):
        if key not in vocab:
            # Ids start at 1: 0 is reserved as the padding value for sequence input.
            vocab[key] = len(vocab) + 1
        ids.append(vocab[key])
    return ids

In [3]:
# Columns that are label-encoded and passed to the model as sparse features.
sparse_features = [
    "movie_id",
    "user_id",
    "gender",
    "age",
    "occupation",
    "zip",
]
# Column used as the prediction target.
target = ['rating']

# Load the MovieLens sample file.
data = pd.read_csv("./examples/movielens_sample.txt")

In [4]:
# Preview the first rows of the raw sample to check the parsed columns.
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,3299,235,4,968035345,Ed Wood (1994),Comedy|Drama,F,25,4,19119
1,3630,3256,3,966536874,Patriot Games (1992),Action|Thriller,M,18,4,77005
2,517,105,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,F,25,14,55408
3,785,2115,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,M,18,19,29307
4,5848,909,5,957782527,"Apartment, The (1960)",Comedy|Drama,M,50,20,20009


In [5]:
# (rows, columns) of the loaded sample.
data.shape

(200, 10)

## 1. Label-encode the sparse features and process the sequence feature

In [6]:
# Overwrite each sparse column in `data` with its integer codes.
# A fresh encoder is fitted per column; `feat` and `lbe` remain bound to the
# last column / encoder after the loop finishes.
for feat in sparse_features:
    lbe = LabelEncoder() # Encode labels with value between 0 and n_classes-1.
    data[feat] = lbe.fit_transform(data[feat])

In [7]:
# Preview the last encoded sparse column. Index it explicitly instead of
# relying on the loop variable leaked from the label-encoding cell.
data[sparse_features[-1]].head()

0     35
1    118
2     99
3     55
4     41
Name: zip, dtype: int32

In [8]:
# Preprocess the sequence feature: turn each 'a|b|c' genres string into a
# list of integer ids, building the genre vocabulary along the way.

key2index = {}
genres_list = [split(genres) for genres in data['genres'].values]

In [9]:
genres_list[:6] # encoded ids, one per genre, for the first six rows

[[1, 2], [3, 4], [2, 5], [3, 6], [1, 2], [1]]

In [10]:
# Number of genre ids in each row.
genres_length = np.array([len(ids) for ids in genres_list])
genres_length[:6]

array([2, 2, 2, 2, 2, 1])

In [11]:
# Longest genre list in the sample.
genres_length.max()

5

In [12]:
# Notice: padding='post' appends the padding value 0 AFTER each sequence,
# so every row is right-padded to the length of the longest genre list.
# The target length is taken from the data instead of being hard-coded.
genres_list = pad_sequences(
    genres_list, maxlen=int(genres_length.max()), padding='post')

In [13]:
# First six padded rows; the trailing zeros are padding.
genres_list[0:6]

array([[1, 2, 0, 0, 0],
       [3, 4, 0, 0, 0],
       [2, 5, 0, 0, 0],
       [3, 6, 0, 0, 0],
       [1, 2, 0, 0, 0],
       [1, 0, 0, 0, 0]])

## 2. Count the unique values of each sparse field and generate the feature config for the sequence feature

In [14]:
# One SparseFeat per sparse column, sized by that column's number of
# distinct values.
fixlen_feature_columns = []
for feat in sparse_features:
    fixlen_feature_columns.append(SparseFeat(feat, data[feat].nunique()))
fixlen_feature_columns

[SparseFeat(name='movie_id', dimension=187, use_hash=False, dtype='int32', embedding_name='movie_id', embedding=True),
 SparseFeat(name='user_id', dimension=193, use_hash=False, dtype='int32', embedding_name='user_id', embedding=True),
 SparseFeat(name='gender', dimension=2, use_hash=False, dtype='int32', embedding_name='gender', embedding=True),
 SparseFeat(name='age', dimension=7, use_hash=False, dtype='int32', embedding_name='age', embedding=True),
 SparseFeat(name='occupation', dimension=20, use_hash=False, dtype='int32', embedding_name='occupation', embedding=True),
 SparseFeat(name='zip', dimension=188, use_hash=False, dtype='int32', embedding_name='zip', embedding=True)]

In [15]:
# Notice: value 0 is reserved for padding in the sequence input, hence the
# vocabulary size of len(key2index) + 1.
# maxlen is read from the padded array so it always matches the width of the
# 'genres' model input instead of repeating a hard-coded constant.
varlen_feature_columns = [
    VarLenSparseFeat('genres', len(key2index) + 1, genres_list.shape[1], 'mean')
]
varlen_feature_columns

[VarLenSparseFeat(name='genres', dimension=18, maxlen=5, combiner='mean', use_hash=False, dtype='float32', embedding_name='genres', embedding=True)]

In [16]:
# The linear part and the DNN part are given the same feature columns
# (fixed-length sparse features followed by the variable-length one).
linear_feature_columns = [*fixlen_feature_columns, *varlen_feature_columns]
dnn_feature_columns = list(linear_feature_columns)
feature_names = get_feature_names(
    linear_feature_columns + dnn_feature_columns
)
feature_names

['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip', 'genres']

## 3. Generate input data for the model

In [17]:
# Map each feature name to its column, then swap the raw 'genres' strings
# for the padded id matrix.
model_input = dict(
    {name: data[name] for name in feature_names},
    genres=genres_list,
)

In [18]:
# Show only the first (name, values) entry of the model input.
{name: values for name, values in list(model_input.items())[:1]}

{'movie_id': 0       12
 1      169
 2        6
 3      112
 4       45
 5      146
 6       43
 7      156
 8       30
 9      174
 10      82
 11     173
 12      91
 13     108
 14     132
 15      40
 16     109
 17      31
 18     180
 19     183
 20     129
 21      67
 22     137
 23      87
 24     127
 25       8
 26     104
 27     100
 28     140
 29      25
       ... 
 170     53
 171     90
 172      5
 173    173
 174     41
 175     59
 176    123
 177    159
 178     48
 179    115
 180    138
 181     63
 182     16
 183    179
 184      3
 185     97
 186    128
 187    186
 188    175
 189    105
 190    169
 191     32
 192     68
 193     18
 194     85
 195    176
 196     89
 197    125
 198     15
 199     86
 Name: movie_id, Length: 200, dtype: int64}

## 4. Define the model, compile, and train

In [19]:
# Build a DeepFM model for a regression task; the linear and DNN parts are
# given their respective feature-column lists.
# The trailing expression displays the resulting module structure.
model = DeepFM(linear_feature_columns,dnn_feature_columns,task='regression')
model

DeepFM(
  (embedding_dict): ModuleDict(
    (age): Embedding(7, 8)
    (gender): Embedding(2, 8)
    (movie_id): Embedding(187, 8)
    (occupation): Embedding(20, 8)
    (user_id): Embedding(193, 8)
    (zip): Embedding(188, 8)
    (genres): EmbeddingBag(18, 8, mode=mean)
  )
  (linear_model): Linear(
    (embedding_dict): ModuleDict(
      (age): Embedding(7, 1)
      (gender): Embedding(2, 1)
      (movie_id): Embedding(187, 1)
      (occupation): Embedding(20, 1)
      (user_id): Embedding(193, 1)
      (zip): Embedding(188, 1)
    )
  )
  (out): PredictionLayer()
  (fm): FM()
  (dnn): DNN(
    (dropout): Dropout(p=0)
    (linears): ModuleList(
      (0): Linear(in_features=56, out_features=256, bias=True)
      (1): Linear(in_features=256, out_features=128, bias=True)
    )
  )
  (dnn_linear): Linear(in_features=128, out_features=1, bias=False)
)

In [20]:
# Configure training: "adam" optimizer, "mse" loss, and "mse" reported as a metric.
model.compile("adam", "mse", metrics=['mse'], )

In [21]:
# Train for 10 epochs. validation_split=0.2 holds out 40 of the 200 rows for
# validation; batch_size=256 exceeds the 160 training rows, so each epoch is
# a single step (see the training log below).
history = model.fit(model_input, data[target].values,
                    batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

cpu
Train on 160 samples, validate on 40 samples, 1 steps per epoch
Epoch 1/10
0s - loss:  14.0472 - mse:  14.0472 - val_mse:  13.0265
Epoch 2/10
0s - loss:  13.7911 - mse:  13.7911 - val_mse:  12.7940
Epoch 3/10
0s - loss:  13.5330 - mse:  13.5330 - val_mse:  12.5695
Epoch 4/10
0s - loss:  13.2817 - mse:  13.2817 - val_mse:  12.3483
Epoch 5/10
0s - loss:  13.0318 - mse:  13.0318 - val_mse:  12.1321
Epoch 6/10
0s - loss:  12.7856 - mse:  12.7856 - val_mse:  11.9213
Epoch 7/10
0s - loss:  12.5435 - mse:  12.5435 - val_mse:  11.7112
Epoch 8/10
0s - loss:  12.3012 - mse:  12.3012 - val_mse:  11.4986
Epoch 9/10
0s - loss:  12.0546 - mse:  12.0546 - val_mse:  11.2756
Epoch 10/10
0s - loss:  11.7953 - mse:  11.7953 - val_mse:  11.0419
