In [0]:
#!pip install deepctr
#!pip install deepctr[gpu]
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat,get_feature_names

In [3]:
# Load the MovieLens sample ratings file.
DATA_PATH = "/content/drive/My Drive/Colab Notebooks/movielens_sample.txt"
data = pd.read_csv(DATA_PATH)
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ['rating']

# Label-encode each categorical column in place (overwrites the raw values in `data`).
for column in sparse_features:
    data[column] = LabelEncoder().fit_transform(data[column])

# One SparseFeat per column, sized by the number of distinct values in that column.
fixlen_feature_columns = [
    SparseFeat(column, data[column].nunique())
    for column in sparse_features
]
print(fixlen_feature_columns)

# The linear part and the DNN part are given the very same list of feature columns.
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
print("feature_names is:", feature_names)

[SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='movie_id', group_name='default_group'), SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='user_id', group_name='default_group'), SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='gender', group_name='default_group'), SparseFeat(name='age', vocabulary_size=7, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='age', group_name='default_group'), SparseFeat(name='occupation', vocabulary_size=20, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='occupation', group_name='default_group'), SparseFeat(name='zip', vocabulary_size=188, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='zip', group_name='default_group')]
feature_names is: ['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip']


In [4]:
# Split the encoded data into a training set and a test set (test_size=0.2).
# A fixed random_state makes the shuffle repeatable, so the same rows land in
# each split on every "Restart & Run All". It only pins the split; it does not
# seed the model training done later.
SPLIT_SEED = 2020
train, test = train_test_split(data, test_size=0.2, random_state=SPLIT_SEED)

# Model inputs are dicts mapping each feature name to that column's value array.
train_model_input = {name: train[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}
print(test_model_input)

{'movie_id': array([150, 162, 127,  43,  81, 186,   5, 107,  11, 158, 181, 161, 106,
        35,   6,  37, 140,  71,  83, 172,  38,  72,  66,  15, 119, 104,
        29, 130, 154,  75,  70, 185,  47,   1, 129,  82,  25,  24,  21,
       105]), 'user_id': array([ 29,  44, 103, 102,  29,  67, 120,  59,  13,  49, 167,  89,  80,
         0,  12,  95, 113,  17,  52, 168,  19,  86, 183, 181,  87,  54,
       108, 146,  92,   8,  72,  28, 149, 180, 186,  97,   9,  43,  96,
       121]), 'gender': array([1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0]), 'age': array([2, 3, 2, 0, 2, 2, 4, 2, 2, 2, 1, 2, 2, 3, 2, 5, 4, 6, 2, 3, 1, 5,
       1, 2, 2, 2, 1, 6, 5, 3, 3, 3, 2, 3, 2, 3, 2, 2, 2, 1]), 'occupation': array([ 0, 14,  2,  1,  0,  7,  0,  3,  0,  0, 13, 13,  0,  7, 13, 11,  1,
       16, 11,  1,  4,  1, 14,  0,  3, 19,  4,  1, 19,  0, 16, 18,  4, 16,
        1, 19, 11,  7,  0, 14]), 'zip': array([ 21,  44, 135, 1

In [0]:
# Build a DeepFM regressor and fit it on the training split.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'])
history = model.fit(
    train_model_input,
    train[target].values,
    batch_size=256,
    epochs=10,
    verbose=True,
    validation_split=0.2,
)

# Predict ratings for the test split.
pred_ans = model.predict(test_model_input, batch_size=256)
print("pred_ans is:", pred_ans)

# Report the test error: MSE rounded to 4 decimals, RMSE as its square root.
# NOTE(review): the square root is taken of the already-rounded MSE, so the
# printed RMSE carries that rounding — confirm this is acceptable.
mse = round(mean_squared_error(test[target].values, pred_ans), 4)
rmse = mse ** 0.5
print("test RMSE", rmse)

In [0]:
def get_embedding_weights(dnn_feature_columns,model):
    """Collect the embedding weights of the feature columns from ``model``.

    Args:
        dnn_feature_columns: iterable of feature-column objects. Only columns
            that have an ``embedding_name`` attribute are considered.
        model: object exposing ``get_layer(name)``; the returned layer must
            expose ``get_weights()``.

    Returns:
        dict mapping each column's embedding name (or its ``name`` when
        ``embedding_name`` is None) to the first array returned by
        ``get_weights()`` of the layer called ``"sparse_emb_" + <that name>``.
    """
    weights_by_name = {}
    for column in dnn_feature_columns:
        # Columns without an embedding_name attribute have nothing to look up.
        if not hasattr(column, 'embedding_name'):
            continue
        key = column.name if column.embedding_name is None else column.embedding_name
        layer = model.get_layer("sparse_emb_" + key)
        weights_by_name[key] = layer.get_weights()[0]
    return weights_by_name
    
# Pull the embedding weight arrays out of `model`, keyed by embedding name.
embedding_dict = get_embedding_weights(dnn_feature_columns,model)

# Disabled alternative: look at the 'user_id' embedding instead.
#user_id_emb = embedding_dict['user_id']
#print('user_id_emb is:', user_id_emb)
# NOTE(review): the variable name and the printed label say "item_id", but the
# key read here is 'gender' — confirm which of the two was intended.
item_id_emb = embedding_dict['gender']
print('item_id_emb is:', item_id_emb)