https://deepctr-doc.readthedocs.io/en/latest/Examples.html#regression-movielens

In [1]:
import os

# Directory that holds the DeepCTR example data files used by this notebook.
# os.path.expanduser("~") is used instead of os.environ["HOME"]: it resolves to
# $HOME on POSIX when that variable is set, but does not raise KeyError on
# platforms where HOME is not defined (e.g. Windows).
DIR_DATA = os.path.join(os.path.expanduser("~"), "workspace/third_party/shenweichen/DeepCTR/examples")

In [2]:
import numpy as np
import pandas as pd
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr.models import DeepFM

In [3]:
# Vocabulary shared by every call to `split`: maps each key seen so far to its
# integer id. It is created here so the function works on a fresh kernel
# instead of depending on a mapping defined in some other cell.
key2index = {}


def split(x):
    """Encode a '|'-separated string as a list of integer ids.

    Each distinct key is assigned the next free id the first time it is seen
    and the assignment is remembered in the module-level ``key2index`` dict,
    so repeated keys (within or across calls) map to the same id.

    Args:
        x: String of keys joined by '|', e.g. ``"Comedy|Drama"``.

    Returns:
        List of integer ids, one per key, in the order the keys appear in ``x``.
    """
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            # Id 0 is reserved as the padding value for sequence inputs, so
            # ids for real keys start at 1.
            key2index[key] = len(key2index) + 1
    return [key2index[key] for key in key_ans]


In [4]:
# Load the MovieLens sample file from DIR_DATA into a DataFrame.
data = pd.read_csv(os.path.join(DIR_DATA, "movielens_sample.txt"))

# Columns handled as categorical (sparse) features.
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
# Label column(s) the model is trained to predict.
target = ['rating']

In [5]:
# Cast the sparse feature columns to str: the feature columns declared for them
# use dtype='string' together with use_hash=True, so the raw values are fed to
# the model as strings.
data[sparse_features] = data[sparse_features].astype(str)

In [6]:
# 1. Use hashing encoding on the fly for sparse features, and process sequence features.

# Break every '|'-separated genres string into its list of genre tokens.
genres_list = [genres.split('|') for genres in data['genres'].values]
# Number of genres per row; the largest count is the length sequences are padded to.
genres_length = np.array([len(tokens) for tokens in genres_list])
max_len = max(genres_length)

In [7]:
# Note: padding='post' appends the fill value 0 after the real genres until each
# row has max_len entries; the final astype(str) turns every entry, including
# the padding, into a string.
genres_list = pad_sequences(
    genres_list,
    maxlen=max_len,
    padding='post',
    dtype=object,
    value=0,
).astype(str)

In [8]:
# 2. Set the hashing space for each sparse field and generate the feature config
#    for the sequence feature.
# Each field's hash space is five times its number of distinct values.
fixlen_feature_columns = [
    SparseFeat(
        col,
        vocabulary_size=data[col].nunique() * 5,
        embedding_dim=4,
        use_hash=True,
        dtype='string',
    )
    for col in sparse_features
]

In [9]:
# Note: value 0 is used as padding for the sequence input feature.
varlen_feature_columns = [
    VarLenSparseFeat(
        SparseFeat(
            'genres',
            vocabulary_size=100,
            embedding_dim=4,
            use_hash=True,
            dtype="string",
        ),
        maxlen=max_len,
        combiner='mean',
    )
]

In [10]:
# The linear part and the DNN part of the model receive the same columns:
# all fixed-length features followed by the variable-length ones.
linear_feature_columns = [*fixlen_feature_columns, *varlen_feature_columns]
dnn_feature_columns = list(linear_feature_columns)

In [11]:
# Names of the model inputs, derived from the linear and DNN feature columns;
# they are used as the keys of the model input dict.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [12]:
# 3. Generate input data for the model.
# Start from the raw DataFrame column for every input name ...
model_input = {col: data[col] for col in feature_names}
# ... then substitute the padded genre sequences for the 'genres' entry.
model_input.update({'genres': genres_list})

In [13]:
# 4. Define the model, then compile and train it.
# DeepFM is built from the linear and DNN feature columns and configured for
# regression.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')

2022-09-24 21:05:41.503649: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
# Adam optimizer with mean squared error as both the loss and the reported metric.
model.compile(optimizer="adam", loss="mse", metrics=['mse'])

In [15]:
# Train for 10 epochs, holding out 20% of the samples for validation;
# verbose=2 prints one summary line per epoch.
history = model.fit(
    x=model_input,
    y=data[target].values,
    batch_size=256,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

Epoch 1/10
1/1 - 2s - loss: 14.2998 - mse: 14.2998 - val_loss: 13.3755 - val_mse: 13.3755
Epoch 2/10
1/1 - 0s - loss: 14.1553 - mse: 14.1553 - val_loss: 13.2324 - val_mse: 13.2324
Epoch 3/10
1/1 - 0s - loss: 13.9901 - mse: 13.9901 - val_loss: 13.0753 - val_mse: 13.0753
Epoch 4/10
1/1 - 0s - loss: 13.8095 - mse: 13.8095 - val_loss: 12.9031 - val_mse: 12.9031
Epoch 5/10
1/1 - 0s - loss: 13.6126 - mse: 13.6126 - val_loss: 12.7140 - val_mse: 12.7140
Epoch 6/10
1/1 - 0s - loss: 13.3969 - mse: 13.3969 - val_loss: 12.5055 - val_mse: 12.5055
Epoch 7/10
1/1 - 0s - loss: 13.1596 - mse: 13.1596 - val_loss: 12.2746 - val_mse: 12.2746
Epoch 8/10
1/1 - 0s - loss: 12.8978 - mse: 12.8978 - val_loss: 12.0183 - val_mse: 12.0183
Epoch 9/10
1/1 - 0s - loss: 12.6082 - mse: 12.6082 - val_loss: 11.7345 - val_mse: 11.7345
Epoch 10/10
1/1 - 0s - loss: 12.2878 - mse: 12.2878 - val_loss: 11.4211 - val_mse: 11.4211
