In [1]:
import deepctr

In [2]:
from deepctr.feature_column import build_input_features

In [3]:
from deepctr.layers import DNN
from deepctr.layers.utils import NoMask, combined_dnn_input
from tensorflow.python.keras.models import Model

In [4]:
from itertools import chain

In [6]:
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, create_embedding_matrix, embedding_lookup,get_dense_input, varlen_embedding_lookup, get_varlen_pooling_list, mergeDict

In [43]:
def input_from_feature_columns(features, feature_columns, l2_reg, seed, prefix='', seq_mask_zero=True,
                              support_dense=True, support_group=False, embedding_matrix_dict=None):
    """Look up embeddings and dense inputs for a set of feature columns.

    features: mapping handed to the deepctr lookup helpers (built by
        ``build_input_features`` in this notebook).
    feature_columns: feature column definitions; ``SparseFeat`` and
        ``VarLenSparseFeat`` entries are embedded, and ``get_dense_input``
        collects the dense values.
    l2_reg, seed, prefix, seq_mask_zero: forwarded to
        ``create_embedding_matrix`` when no embedding dict is supplied.
    support_dense: when False, any dense input raises ``ValueError``.
    support_group: when False, the grouped embedding dict is flattened
        into a single list.
    embedding_matrix_dict: pre-built embedding layers to reuse instead of
        creating new ones.

    Returns ``(group_embedding_dict, dense_value_list)``; the first item is
    a list when ``support_group`` is False.
    """
    columns = feature_columns if feature_columns else []
    sparse_feature_columns = [fc for fc in columns if isinstance(fc, SparseFeat)]
    varlen_sparse_feature_columns = [fc for fc in columns if isinstance(fc, VarLenSparseFeat)]

    # Build the embedding tables only when the caller did not share theirs.
    if embedding_matrix_dict is None:
        embedding_matrix_dict = create_embedding_matrix(
            feature_columns, l2_reg, seed, prefix=prefix, seq_mask_zero=seq_mask_zero)

    group_sparse_embedding_dict = embedding_lookup(
        embedding_matrix_dict, features, sparse_feature_columns)

    dense_value_list = get_dense_input(features, feature_columns)
    if len(dense_value_list) > 0 and not support_dense:
        raise ValueError('DenseFeat is not supported in dnn_feature_columns')

    # Variable-length features: look up per-step embeddings, then pool them.
    sequence_embed_dict = varlen_embedding_lookup(
        embedding_matrix_dict, features, varlen_sparse_feature_columns)
    group_varlen_sparse_embedding_dict = get_varlen_pooling_list(
        sequence_embed_dict, features, varlen_sparse_feature_columns)

    group_embedding_dict = mergeDict(group_sparse_embedding_dict, group_varlen_sparse_embedding_dict)
    if support_group:
        return group_embedding_dict, dense_value_list

    flat_embedding_list = list(chain.from_iterable(group_embedding_dict.values()))
    return flat_embedding_list, dense_value_list

In [9]:
import numpy as np
import tensorflow as tf
from deepctr.layers.utils import reduce_max, reduce_mean, reduce_sum, concat_func, div, softmax
from tensorflow.python.keras.initializers import Zeros
from tensorflow.python.keras.layers import Layer

In [10]:
class PoolingLayer(Layer):
    """Pool a list of tensors element-wise with ``sum``, ``mean`` or ``max``.

    Each input tensor gets a new trailing axis, the tensors are joined with
    deepctr's ``concat_func`` (called with its default axis; presumably the
    last one -- confirm against deepctr) and the result is reduced over the
    trailing axis. A single tensor is returned unchanged.

    mode: one of ``'sum'``, ``'mean'``, ``'max'``.
    supports_masking: stored on the layer after the base initializer runs.
    """
    def __init__(self, mode='mean', supports_masking=False, **kwargs):
        if mode not in ['sum', 'mean', 'max']:
            # The message used to omit 'max' although it is an accepted mode.
            raise ValueError('mode must be sum, mean or max')
        # Initialise the Keras base class before assigning attributes.
        super(PoolingLayer, self).__init__(**kwargs)
        self.mode = mode
        self.eps = tf.constant(1e-8, tf.float32)
        # Assigned after the base initializer, as in the original code.
        self.supports_masking = supports_masking

    def build(self, input_shape):
        super(PoolingLayer, self).build(input_shape)  # be sure to call this somewhere

    def call(self, seq_value_len_list, mask=None, **kwargs):
        """Return the pooled tensor (or the sole input when only one is given)."""
        if not isinstance(seq_value_len_list, list):
            seq_value_len_list = [seq_value_len_list]
        if len(seq_value_len_list) == 1:
            # Nothing to pool across.
            return seq_value_len_list[0]
        expanded = [tf.expand_dims(tensor, axis=-1) for tensor in seq_value_len_list]
        stacked = concat_func(expanded)
        # Mutually exclusive modes; __init__ guarantees one of them matches.
        if self.mode == 'mean':
            return reduce_mean(stacked, axis=-1)
        if self.mode == 'sum':
            return reduce_sum(stacked, axis=-1)
        return reduce_max(stacked, axis=-1)

    def get_config(self,):
        """Return the layer config including ``mode`` and ``supports_masking``."""
        config = {'mode': self.mode, 'supports_masking': self.supports_masking}
        base_config = super(PoolingLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    # Backward-compatible alias: the method was originally misspelled, which
    # meant Keras never saw the custom config.
    get_conif = get_config

In [12]:
class SampledSoftmaxLayer(Layer):
    """Compute a per-example softmax loss over sampled negative items.

    sampler_config: dict with key ``'sampler'`` (``'inbatch'``, ``'uniform'``,
        ``'frequency'`` or ``'adaptive'``) and ``'item_count'``; every sampler
        except ``'inbatch'`` also reads ``'num_sampled'``, and ``'frequency'``
        reads ``'distortion'``.
    temperature: the user vectors are divided by this value before scoring.

    ``call`` expects ``[item_embeddings, user_vec, item_idx]`` and returns the
    loss with an extra trailing axis of size 1.
    """
    def __init__(self, sampler_config, temperature=1.0, **kwargs):
        self.sampler_config = sampler_config
        self.temperature = temperature
        self.sampler = self.sampler_config['sampler']
        self.item_count = self.sampler_config['item_count']

        super(SampledSoftmaxLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # The first input is the item embedding table; its first dimension is
        # used as the number of classes.
        self.vocabulary_size = input_shape[0][0]
        # Non-trainable zero bias so sampled_softmax_loss scores by dot product only.
        self.zero_bias = self.add_weight(shape=[self.vocabulary_size], initializer=Zeros, dtype=tf.float32,
                                        trainable=False, name='bias')
        super(SampledSoftmaxLayer, self).build(input_shape)

    @staticmethod
    def _inbatch_softmax_loss(logits, item_count, item_idx):
        """In-batch softmax cross-entropy with a log-frequency correction.

        The notebook called ``inbatch_softmax_cross_entropy_with_logits``
        without defining or importing it, so the 'inbatch' sampler raised
        NameError. This helper keeps the layer self-contained.

        logits: scores of every user in the batch against every in-batch item.
        item_count: per-item frequencies indexed by item id.
        item_idx: item ids of the batch; squeezed on axis 1 like in ``call``.
        """
        counts = np.asarray(item_count, dtype=np.float32)
        # Sampling probability of each in-batch item, proportional to its frequency.
        sampling_prob = tf.gather(tf.constant(counts / counts.sum(), tf.float32),
                                  tf.squeeze(item_idx, axis=1))
        # Subtract log Q so frequent items are not over-penalised as negatives.
        logits = logits - tf.reshape(tf.math.log(sampling_prob), (1, -1))
        # Row i's positive is column i: labels form an identity matrix.
        labels = tf.linalg.diag(tf.ones_like(logits[0]))
        return tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)

    def call(self, inputs_with_item_idx, training=None, **kwargs):
        item_embeddings, user_vec, item_idx = inputs_with_item_idx
        if item_idx.dtype != tf.int64:
            item_idx = tf.cast(item_idx, tf.int64)
        user_vec /= self.temperature
        if self.sampler == 'inbatch':
            # Use the other items of the batch as negatives.
            item_vec = tf.gather(item_embeddings, tf.squeeze(item_idx, axis=1))
            logits = tf.matmul(user_vec, item_vec, transpose_b=True)
            loss = self._inbatch_softmax_loss(logits, self.item_count, item_idx)
        else:
            num_sampled = self.sampler_config['num_sampled']
            # Positional sampler arguments: true_classes, num_true, num_sampled,
            # unique, range_max.
            if self.sampler == 'frequency':
                sampled_values = tf.nn.fixed_unigram_candidate_sampler(item_idx, 1, num_sampled, True,
                                                                      self.vocabulary_size,
                                                                      distortion=self.sampler_config['distortion'],
                                                                      # counts are floored at 1 before being used as unigram weights
                                                                      unigrams=np.maximum(self.item_count, 1).tolist(),
                                                                      seed=None, name=None)
            elif self.sampler == 'adaptive':
                sampled_values = tf.nn.learned_unigram_candidate_sampler(item_idx, 1, num_sampled, True,
                                                                        self.vocabulary_size, seed=None, name=None)
            elif self.sampler == 'uniform':
                try:
                    sampled_values = tf.nn.uniform_candidate_sampler(item_idx, 1, num_sampled, True,
                                                                    self.vocabulary_size, seed=None, name=None)
                except AttributeError:
                    # Fall back when tf.nn does not expose the uniform sampler.
                    sampled_values = tf.random.uniform_candidate_sampler(item_idx, 1, num_sampled, True,
                                                                        self.vocabulary_size, seed=None, name=None)
            else:
                raise ValueError('%s sampler is not supported ' % self.sampler)

            loss = tf.nn.sampled_softmax_loss(weights=item_embeddings, biases=self.zero_bias, labels=item_idx,
                                             inputs=user_vec, num_sampled=num_sampled,
                                             num_classes=self.vocabulary_size, sampled_values=sampled_values)
        return tf.expand_dims(loss, axis=1)

    def compute_output_shape(self, input_shape):
        return (None, 1)

    def get_config(self,):
        config = {'sampler_config': self.sampler_config, 'temperature': self.temperature}
        base_config = super(SampledSoftmaxLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [13]:
class EmbeddingIndex(Layer):
    """Layer that ignores its input and emits a fixed index constant.

    index: value turned into a ``tf.constant`` on every call.
    """

    def __init__(self, index, **kwargs):
        self.index = index
        super().__init__(**kwargs)

    def build(self, input_shape):
        # No weights of its own; defer to the base implementation.
        super().build(input_shape)

    def call(self, x, **kwargs):
        # ``x`` is unused and only connects the layer to the graph.
        return tf.constant(self.index)

    def get_config(self,):
        merged = dict(super().get_config())
        merged.update({'index': self.index})
        return merged

In [14]:
from tensorflow.python.keras.layers import Lambda

In [40]:
from collections import namedtuple
class NegativeSampler(namedtuple('NegativeSampler', ['sampler', 'num_sampled', 'item_name', 'item_count', 'distortion'])):
    """Immutable configuration record for negative sampling.

    sampler: sampler name, one of 'inbatch', 'uniform', 'frequency', 'adaptive'
    num_sampled: number of negative samples per positive sample
    item_name: primary key of the item features
    item_count: global frequency of each item (required for 'inbatch' and 'frequency')
    distortion: skew factor of the unigram probability distribution
    """
    __slots__ = ()

    def __new__(cls, sampler, num_sampled, item_name, item_count=None, distortion=1.0,):
        supported_samplers = ('inbatch', 'uniform', 'frequency', 'adaptive')
        if sampler not in supported_samplers:
            raise ValueError('%s sampler is not supported ' % sampler)

        # Only these two samplers read item frequencies.
        needs_item_count = sampler == 'inbatch' or sampler == 'frequency'
        if needs_item_count and item_count is None:
            raise ValueError('item_count must not be None when using inbatch or frequency sampler')

        return super().__new__(cls, sampler, num_sampled, item_name, item_count, distortion)

In [15]:
def l2_normalize(x, axis=-1):
    """Apply ``tf.nn.l2_normalize`` along ``axis`` inside a Keras ``Lambda`` layer."""
    def _normalize(tensor):
        return tf.nn.l2_normalize(tensor, axis)

    normalize_layer = Lambda(_normalize)
    return normalize_layer(x)
def get_item_embedding(item_embedding, item_input_layer):
    """Gather rows of ``item_embedding`` for the ids in ``item_input_layer``.

    The gather runs inside a Keras ``Lambda`` layer, and axis 1 of the
    gathered tensor is squeezed away afterwards.
    """
    def _gather_rows(item_ids):
        gathered = tf.gather(item_embedding, item_ids)
        return tf.squeeze(gathered, axis=1)

    gather_layer = Lambda(_gather_rows)
    return gather_layer(item_input_layer)

In [51]:
def sampledsoftmaxloss(y_true, y_pred):
    """Keras loss that averages ``y_pred``; ``y_true`` is ignored.

    In this notebook it is compiled onto a model whose output is already the
    per-example loss, so only the mean has to be taken.
    """
    per_example_loss = y_pred
    return K.mean(per_example_loss)

In [49]:
def YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64,32), dnn_activation='relu',
               dnn_use_bn=False, l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, output_activation='linear',
               temperature=0.05, sampler_config=None, seed=1024):
    """Build the YoutubeDNN two-tower retrieval model.

    user_feature_columns: an iterable containing user's features used by the model
    item_feature_columns: an iterable containing item's features used by the model
        (exactly one feature, e.g. the item id, is supported)
    user_dnn_hidden_units: list, list of positive integer or empty list, the layer number and units in each layer of user tower
    dnn_activation: activation function to use in deep net
    dnn_use_bn: bool. whether use batchnormalization before activation or not in deep net
    l2_reg_dnn: float. L2 regularizer strength applied to DNN
    l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
    dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
    output_activation: activation function to use in output layer
    temperature: float. scaling factor applied to the user vector inside the loss layer
    sampler_config: negative sample config (a ``NegativeSampler``); required
    seed: integer. to use as random seed

    return: a keras model instance whose output is the sampled-softmax loss.
        The attributes ``user_input``, ``user_embedding``, ``item_input`` and
        ``item_embedding`` are attached so the towers can be exported separately.
    raises: ValueError if the number of item features is not exactly one or
        if ``sampler_config`` is None.
    """
    # Exactly one item feature is required; an empty list used to fall through
    # to an IndexError on item_feature_columns[0].
    if len(item_feature_columns) != 1:
        raise ValueError('Now YoutubeDNN only support 1 item feature like item_id')
    # Fail early with a clear message instead of AttributeError on None._asdict().
    if sampler_config is None:
        raise ValueError('sampler_config must be provided (e.g. a NegativeSampler instance)')

    item_feature_name = item_feature_columns[0].name
    item_vocabulary_size = item_feature_columns[0].vocabulary_size

    # list(...) lets callers pass tuples or other iterables, as documented.
    embedding_matrix_dict = create_embedding_matrix(list(user_feature_columns) + list(item_feature_columns),
                                                    l2_reg_embedding, seed=seed)

    # User tower: embeddings + dense values -> DNN -> L2-normalised user vector.
    user_features = build_input_features(user_feature_columns)
    user_inputs_list = list(user_features.values())
    user_sparse_embedding_list, user_dense_value_list = input_from_feature_columns(
        user_features, user_feature_columns, l2_reg_embedding, seed=seed,
        embedding_matrix_dict=embedding_matrix_dict)
    user_dnn_input = combined_dnn_input(user_sparse_embedding_list, user_dense_value_list)

    item_features = build_input_features(item_feature_columns)
    item_inputs_list = list(item_features.values())
    user_dnn_out = DNN(user_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, dnn_use_bn,
                       output_activation=output_activation, seed=seed)(user_dnn_input)
    user_dnn_out = l2_normalize(user_dnn_out)

    # Item tower: look up the embedding of every item id to obtain the full
    # item embedding table as a tensor.
    item_index = EmbeddingIndex(list(range(item_vocabulary_size)))(item_features[item_feature_name])

    item_embedding_matrix = embedding_matrix_dict[item_feature_name]
    item_embedding_weight = NoMask()(item_embedding_matrix(item_index))

    pooling_item_embedding_weight = PoolingLayer()([item_embedding_weight])

    pooling_item_embedding_weight = l2_normalize(pooling_item_embedding_weight)
    output = SampledSoftmaxLayer(sampler_config._asdict(), temperature)(
        [pooling_item_embedding_weight, user_dnn_out, item_features[item_feature_name]])
    model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output)

    # Expose both towers so embedding-only sub-models can be built after training.
    setattr(model, 'user_input', user_inputs_list)
    setattr(model, 'user_embedding', user_dnn_out)

    setattr(model, 'item_input', item_inputs_list)
    setattr(model, 'item_embedding',
            get_item_embedding(pooling_item_embedding_weight, item_features[item_feature_name]))

    return model

In [21]:
import numpy as np
import random
# from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm

In [35]:
def gen_data_set(data, seq_max_len=50, negsample=0):
    """Turn an interaction log into next-item training and test samples.

    For every user the interactions are ordered by ``timestamp``; each prefix
    of that history produces one sample whose target is the following item.
    The sample targeting the user's last item goes to the test set, all
    others go to the training set.

    data: DataFrame with columns ``user_id``, ``movie_id``, ``genres``,
        ``rating`` and ``timestamp``. It is not modified.
    seq_max_len: maximum number of most-recent history items kept per sample.
    negsample: number of random negative samples added per positive training
        sample (0 disables negative sampling).

    Returns ``(train_set, test_set)``, two shuffled lists of tuples
    ``(user_id, item_id, label, hist_items, hist_len, hist_genres,
    item_genres, rating)`` with the histories ordered most-recent first.
    Negative samples carry label 0 and rating 0.
    """
    # Sort into a new frame so the caller's DataFrame keeps its row order.
    data = data.sort_values('timestamp')
    item_ids = data['movie_id'].unique()
    # Built once instead of once per user.
    all_item_ids = set(item_ids)
    item_id_genres_map = dict(zip(data['movie_id'].values, data['genres'].values))
    train_set = []
    test_set = []
    for reviewerID, user_rows in tqdm(data.groupby('user_id')):
        pos_list = user_rows['movie_id'].tolist()
        genres_list = user_rows['genres'].tolist()
        rating_list = user_rows['rating'].tolist()

        if negsample > 0:
            # Candidates are the items this user never interacted with.
            candidate_set = list(all_item_ids - set(pos_list))
            neg_list = np.random.choice(candidate_set, size=len(pos_list)*negsample, replace=True)
        for i in range(1, len(pos_list)):
            seq_len = min(i, seq_max_len)
            # Most-recent-first history, truncated to seq_len entries.
            recent_items = pos_list[:i][::-1][:seq_len]
            recent_genres = genres_list[:i][::-1][:seq_len]
            positive = (reviewerID, pos_list[i], 1, recent_items, seq_len, recent_genres,
                        genres_list[i], rating_list[i])
            if i != len(pos_list) - 1:
                train_set.append(positive)
                for negi in range(negsample):
                    neg_item = neg_list[i * negsample + negi]
                    # Trailing 0 is a rating placeholder so negatives have the
                    # same 8 fields as positives (they used to have only 7).
                    train_set.append((reviewerID, neg_item, 0, recent_items, seq_len,
                                      recent_genres, item_id_genres_map[neg_item], 0))
            else:
                # The last interaction of each user is held out for testing.
                test_set.append(positive)
    random.shuffle(train_set)
    random.shuffle(test_set)
    # Report the tuple widths; 0 stands in for an empty split instead of crashing.
    print(len(train_set[0]) if train_set else 0, len(test_set[0]) if test_set else 0)
    return train_set, test_set

In [33]:
def gen_model_input(train_set, user_profile, seq_max_len):
    """Convert sample tuples from ``gen_data_set`` into model input arrays.

    train_set: list of tuples; fields 0-6 are read as user id, item id,
        label, item history, history length, genre history and item genre.
    user_profile: DataFrame indexed by user id with the columns ``gender``,
        ``age``, ``occupation`` and ``zip``.
    seq_max_len: length the history sequences are padded/truncated to.

    Returns ``(model_input, label)`` where ``model_input`` maps feature names
    to arrays and ``label`` is the array of labels.
    """
    train_uid = np.array([line[0] for line in train_set])
    train_iid = np.array([line[1] for line in train_set])
    train_label = np.array([line[2] for line in train_set])
    train_seq = [line[3] for line in train_set]
    train_hist_len = np.array([line[4] for line in train_set])
    # Keep the ragged genre histories as a plain list: wrapping them in
    # np.array raises ValueError on recent NumPy releases (and warned before).
    train_seq_genres = [line[5] for line in train_set]
    train_genres = np.array([line[6] for line in train_set])

    # Zero-pad (or cut) every history at the end to exactly seq_max_len entries.
    train_seq_pad = pad_sequences(train_seq, maxlen=seq_max_len, padding='post', truncating='post', value=0)
    train_seq_genres_pad = pad_sequences(train_seq_genres, maxlen=seq_max_len, padding='post', truncating='post',
                                         value=0)

    train_model_input = {'user_id': train_uid, 'movie_id': train_iid, 'hist_movie_id': train_seq_pad,
                         'hist_genres': train_seq_genres_pad, 'hist_len': train_hist_len, 'genres': train_genres}

    # Align the profile rows with the samples once, then pull each column.
    profile_rows = user_profile.loc[train_uid]
    for key in ['gender', 'age', 'occupation', 'zip']:
        train_model_input[key] = profile_rows[key].values

    return train_model_input, train_label

In [30]:
import pandas as pd
from deepctr.feature_column import SparseFeat, VarLenSparseFeat
# from deepmatch.models import *
# from deepmatch.utils import sampledsoftmaxloss, NegativeSampler
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [54]:
if __name__ == '__main__':
    # End-to-end demo: load the MovieLens sample, train YoutubeDNN and export
    # user/item embeddings for retrieval.
    data = pd.read_csv('./movielens_sample.txt')
    # Keep only the first genre of each '|'-separated genres string.
    data['genres'] = list(map(lambda x:x.split('|')[0], data['genres'].values))

    sparse_features = ['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip', 'genres']
    SEQ_LEN = 50
    #1.Label Encoding for sparse features, and process sequence features with 'gen_data_set' and 'gen_model_input'
    feature_max_idx = {}
    for feature in sparse_features:
        lbe = LabelEncoder()
        # Shift encoded ids by one so that 0 stays free for sequence padding.
        data[feature] = lbe.fit_transform(data[feature]) + 1
        feature_max_idx[feature] = data[feature].max() + 1

    user_profile = data[['user_id', 'gender', 'age', 'occupation', 'zip']].drop_duplicates('user_id')

    item_profile = data[['movie_id']].drop_duplicates('movie_id')

    user_profile.set_index('user_id', inplace=True)
    user_item_list = data.groupby('user_id')['movie_id'].apply(list)

    train_set, test_set = gen_data_set(data, SEQ_LEN, 0)

    train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
    test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)

    #2. count #unique features for each sparse field and generate feature config for sequence feature
    embedding_dim = 16
    user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
                            SparseFeat('gender', feature_max_idx['gender'], embedding_dim),
                            SparseFeat('age', feature_max_idx['age'], embedding_dim),
                            SparseFeat('occupation', feature_max_idx['occupation'], embedding_dim),
                            SparseFeat('zip', feature_max_idx['zip'], embedding_dim),
                            VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                                        embedding_name='movie_id'), SEQ_LEN, 'mean', 'hist_len'),
                            VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim,
                                                        embedding_name='genres'), SEQ_LEN, 'mean', 'hist_len')
                            ]
    item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]

    from collections import Counter

    # Per-item training frequency, indexed by encoded item id, for the
    # frequency-based negative sampler.
    train_counter = Counter(train_model_input['movie_id'])
    item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)]
    sampler_config = NegativeSampler('frequency', num_sampled=5, item_name='movie_id', item_count=item_count)

    #3.Define Model and train
    import tensorflow as tf
    # Compare the numeric major version: comparing version strings is
    # lexicographic, so e.g. '10.0.0' >= '2.0.0' would be False.
    if int(tf.__version__.split('.')[0]) >= 2:
        tf.compat.v1.disable_eager_execution()
    else:
        K.set_learning_phase(True)

    model = YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64, embedding_dim),
                       sampler_config=sampler_config)
    model.compile(optimizer='adam', loss=sampledsoftmaxloss)

    history = model.fit(train_model_input, train_label, batch_size=256, epochs=1, verbose=1, validation_split=0.0,)

    #4. generate user features for testing and full item features for retrieval
    test_user_model_input = test_model_input
    all_item_model_input = {'movie_id':item_profile['movie_id'].values}

    user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
    item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

    user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
    item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

    print(user_embs.shape)
    print(item_embs.shape)

    #5. [Optional] ANN search by faiss and evaluate the result
    

100%|██████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 1496.90it/s]


8 8


  import sys


Train on 227 samples




(3, 16)
(208, 16)


In [28]:
import deepmatch

In [29]:
import tensorflow as tf
# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

2.10.0


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the MovieLens sample file (read as CSV) from the working directory.
data = pd.read_csv('./movielens_sample.txt')

In [3]:
# Preview the first rows to check the columns that were loaded.
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067
