In [1]:
import torch
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [2]:
data = pd.read_csv('./ml-1m_sample.csv')
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067


In [3]:
data['cate_id'] = data['genres'].apply(lambda x:x.split('|')[0])
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip,cate_id
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067,Drama
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067,Animation
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067,Musical
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067,Drama
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067,Animation


In [4]:
def negative_sample(items_cnt_order, ratio, method_id=0):
    """negative sample method for matching model
    items_cnt_order(dict):the item count dict, the keys(item) sorted by value(count) in reverse order.
    ratio(int):negative sample ratio, >=1
    method_id(int, optional):
        {
        0:'random sampling',
        1:'popularity sampling method used in word2vec',
        2:'popularity sampling method by 'log(count+1)+1e-6' ',
        3:'tencent RALM sampling'
        defaults to 0
        }
    
    returns:
       list:sampled negative item list
    """
    items_set = [item for item, count in items_cnt_order.items()]
    if method_id == 0:
        neg_items = np.random.choice(items_set, size=ratio, replace=True)
    elif method_id == 1:
        p_sel = {item: count**0.75 for item, count in items_cnt_order.items()}
        p_value = np.array(list(p_sel.values())) / sum(p_sel.values())
        neg_items = np.random.choice(items_set, size=ratio, replace=True, p=p_value)
    elif method_id == 2:
        p_sel = {item:np.log(count + 1) + 1e-6 for item, count in items_cnt_order.items()}
        p_value = np.array(list(p_sel.values())) / sum(p_sel.values())
        neg_items = np.random.choice(items_set, size=ratio, replace=True, p=p_value)
    elif method_id == 3:
        p_sel = {item: (np.log(k+2) - np.log(k+1)/np.log(len(items_cnt_order)+1)) for item, k in items_cnt_order.items()}
        p_value = np.array(list(p_sel.values())) / sum(p_sel.values())
        neg_items = np.random.choice(items_set, size=ratio, replace=False, p=p_value)
    else:
        raise ValueError('method id should in (0,1,2,3)')
    return neg_items

In [5]:
from collections import Counter, OrderedDict
import tqdm
import random
def generate_seq_feature_match(data, user_col, item_col, time_col, item_attribute_cols=None, sample_method=0,
                              mode=0, neg_ratio=0, min_item=0):
    """
    generate sequence feature and negative sample for match.
    data:the raw data
    user_col:the col name of user_id
    item_col:the col name of item_id
    time_col:the col name of timestamp
    item_attribute_cols(list[str], optional):the other attribute cols of item which you want to generate sequence
                                             features. Defaults to '[]'
    sample_method(int, optional):the negative sample method '{0:'random sampling',
                                                            1:'popularity sampling method used in word2vec',
                                                            2:'popularity sampling method by 'log(count+1)+1e-6',
                                                            3:'tencent RALM sampling'}
                                                            default to 0'
    mode(int,optional):the training mode. '{0:point-wise, 1:pair-wise, 2:list-wise}. default to 0'
    neg_ratio(int, optional):negative sample ratio, >=1, defaults to 0.
    min_item(int, optional):the min item each user must have. defaults to 0.
    
    returns:
    pd.DataFrame:split train and test data with sequence features
    """
    if item_attribute_cols is None:
        item_attribute_cols = []
    if mode == 2: #list wise learning
        assert neg_ratio > 0, 'neg_ratio must be greater than 0 when list-wise learning'
    elif mode == 1:#pair wise learning
        neg_ratio = 1
    print('preprocess data')
    data.sort_values(time_col, inplace=True) #sort by time from old to new
    train_set, test_set = [], []
    n_cold_user = 0
    
    items_cnt = Counter(data[item_col].tolist())
    item_cnt_order = OrderedDict(sorted((items_cnt.items()), key=lambda x:x[1], reverse=True)) #item_id:item_count
    neg_list = negative_sample(item_cnt_order, ratio=data.shape[0]*neg_ratio, method_id=sample_method)
    neg_idx = 0
    for uid, hist in tqdm.tqdm(data.groupby(user_col), desc='generate sequence features'):
        pos_list = hist[item_col].tolist()
        if len(pos_list) < min_item: #drop this user when his pos_items 
            n_cold_user += 1
            continue
        for i in range(1, len(pos_list)):
            hist_item = pos_list[:i]
            sample = [uid, pos_list[i], hist_item, len(hist_item)]
            if len(item_attribute_cols) > 0:
                for attr_col in item_attribute_cols: #the history of item attribute features
                    sample.append(hist[attr_col].tolist()[:i])
            if i != len(pos_list) - 1:
                if mode == 0: #point-wise, the last col is label_col, include label 0 and 1
                    last_col = 'label'
                    train_set.append(sample + [1])
                    for _ in range(neg_ratio):
                        sample[1] = neg_list[neg_idx]
                        neg_idx += 1
                        train_set.append(sample + [0])
                elif mode == 1: #pair-wise, the last col is neg_col, include one negative item
                    last_col = 'neg_items'
                    for _ in range(neg_ratio):
                        sample_copy = copy.deepcopy(sample)
                        sample_copy.append(neg_list[neg_idx])
                        neg_idx += 1
                        train_set.append(sample_copy)
                elif mode == 2:#list-wise, the last col is neg_col, include neg_ratio negative items
                    last_col = 'neg_items'
                    sample.append(neg_list[neg_idx:neg_idx+neg_ratio])
                    neg_idx += neg_ratio
                    train_set.append(sample)
                else:
                    raise ValueError('mode should in (0,1,2)')
            else:
                test_set.append(sample + [1]) #Note:if mode=1 or 2, the label col is useless.
    random.shuffle(train_set)
    random.shuffle(test_set)
    
    print("n_train:%d, n_test:%d" % (len(train_set), len(test_set)))
    print("%d cold start user droped " % (n_cold_user))
    
    attr_hist_col = ['hist_' + col for col in item_attribute_cols]
    df_train = pd.DataFrame(train_set, 
                            columns=[user_col, item_col, 'hist_' + item_col, 'histlen_' + item_col] + attr_hist_col + [last_col])
    df_test = pd.DataFrame(test_set,
                          columns=[user_col, item_col, 'hist_'+item_col, 'histlen_'+item_col]+attr_hist_col+[last_col])
    
    return df_train, df_test

In [7]:
def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.):
    """
    pads sequences(list of list) to the ndarray of same length
    This is an equivalent implementation of tf.keras.preprocessing.sequence.pad_sequences
    """
    assert padding in ['pre', 'post'], 'Invalid padding={}.'.format(padding)
    assert truncating in ['pre', 'post'], 'Invalid truncating={}'.format(truncating)
    
    if maxlen is None:
        maxlen = max(len(x) for x in sequences)
    arr = np.full((len(sequences), maxlen), value, dtype=dtype)
    for idx, x in enumerate(sequences):
        if len(x) == 0:
            continue #empty list
        if truncating == 'pre':
            trunc = x[-maxlen:]
        else:
            trunc = x[:maxlen]
        trunc = np.asarray(trunc, dtype=dtype)
        
        if padding == 'pre':
            arr[idx, -len(trunc):] = trunc
        else:
            arr[idx, :len(trunc)] = trunc
    return arr

In [8]:
def df_to_dict(data):
    """
    convert the dataframe to a dict type input that the network can accept
    """
    data_dict = data.to_dict('list')
    for key in data.keys():
        data_dict[key] = np.array(data_dict[key])
    return data_dict

In [9]:
def gen_model_input(df, user_profile, user_col, item_profile, item_col, seq_max_len, padding='pre', truncating='pre'):
    #merge use_profile and item_profile, pad history sequence feature
    df = pd.merge(df, user_profile, on=user_col, how='left') #how=left to keep samples order same as the input
    df = pd.merge(df, item_profile, on=item_col, how='left')
    for col in df.columns.to_list():
        if col.startswith('hist_'):
            df[col] = pad_sequences(df[col], maxlen=seq_max_len, value=0, padding=padding, truncating=truncating).tolist()
    input_dict = df_to_dict(df)
    return input_dict

In [10]:
def get_movielens_data(data, load_cache=False):
    sparse_features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip', 'cate_id']
    user_col, item_col, label_col = 'user_id', 'movie_id', 'label'
    
    feature_max_idx = {}
    for feature in sparse_features:
        lbe = LabelEncoder()
        data[feature] = lbe.fit_transform(data[feature]) + 1
        feature_max_idx[feature] = data[feature].max() + 1
        if feature == user_col:
            #encode user_id:raw_user_id
            user_map = {encode_id + 1: raw_id for encode_id, raw_id in enumerate(lbe.classes_)}
        if feature == item_col:
            #encode item_id:raw_item_id
            item_map = {encode_id + 1: raw_id for encode_id, raw_id in enumerate(lbe.classes_)}
    np.save('./raw_id_maps.npy', np.array((user_map, item_map), dtype=object))
    
    user_profile = data[['user_id', 'gender', 'age', 'occupation', 'zip']].drop_duplicates('user_id')
    item_profile = data[['movie_id', 'cate_id']].drop_duplicates('movie_id')
    
    if load_cache: #if you have run this script before and saved the preprocessed data
        x_train, y_train, x_test = np.load('./data_cache.npy', allow_pickle=True)
    else:
        #Note:mode=2 means list-wise negative sample generate, saved in last col 'neg_items'
        df_train, df_test = generate_seq_feature_match(data, user_col, item_col, time_col='timestamp',
                                                      item_attribute_cols=[], sample_method=1, mode=2,
                                                      neg_ratio=3, min_item=0)
        
        x_train = gen_model_input(df_train, user_profile, user_col, item_profile, item_col, seq_max_len=50)
        y_train = np.array([0] * df_train.shape[0]) #label=0 means the first pred value is positive sample
        x_test = gen_model_input(df_test, user_profile, user_col, item_profile, item_col, seq_max_len=50)
        np.save('./data_cache.npy', np.array((x_train, y_train, x_test), dtype=object))
    
    user_cols = ['user_id', 'gender', 'age', 'occupation', 'zip']
    
    user_features = [SparseFeature(name, vocab_size=feature_max_idx[name], embed_dim=16) for name in user_cols]
    user_features += [
        SequenceFeature('hist_movie_id', vocab_size=feature_max_idx['movie_id'], embed_dim=16, pooling='mean',
                       shared_with='movie_id')
    ]
    
    item_features = [SparseFeature('movie_id', vocab_size=feature_max_idx['movie_id'], embed_dim=16)]
    neg_item_feature = [SequenceFeature('neg_items', vocab_size=feature_max_idx['movie_id'], embed_dim=16,
                                       pooling='concat', shared_with='movie_id')]
    all_item = df_to_dict(item_profile)
    test_user = x_test
    return user_features, item_features, neg_item_feature, x_train, y_train, all_item, test_user

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split

In [12]:
class TorchDataset(Dataset):
    def __init__(self, x, y):
        super().__init__()
        self.x = x
        self.y = y
    def __getitem__(self, index):
        return {k:v[index] for k, v in self.x.items()}, self.y[index]
    def __len__(self):
        return len(self.y)

In [13]:
class PredictDataset(Dataset):
    def __init__(self, x):
        super().__init__()
        self.x = x
    def __getitem__(self, index):
        return {k:v[index] for k, v in self.x.items()}
    def __len__(self):
        return len(self.x[list(self.x.keys())[0]])

In [27]:
class MatchDataGenerator(object):
    def __init__(self, x, y=[]):
        super().__init__()
        if len(y) != 0:
            self.dataset = TorchDataset(x, y)
        else:#for pair-wise model, trained without given label
            self.dataset = PredictDataset(x)
    def generate_dataloader(self, x_test_user, x_all_item, batch_size, num_workers=0):
        train_dataloader = DataLoader(self.dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
        test_dataset = PredictDataset(x_test_user)
        #shuffle = False to keep same order as ground truth
        test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
        item_dataset = PredictDataset(x_all_item)
        item_dataloader = DataLoader(item_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
        return train_dataloader, test_dataloader, item_dataloader

In [34]:

class ConcatPooling(nn.Module):
    """Keep the origin sequence embedding shape
   
    Shape:
    - Input: `(batch_size, seq_length, embed_dim)`
    - Output: `(batch_size, seq_length, embed_dim)`
    """

    def __init__(self):
        super().__init__()

    def forward(self, x, mask=None):
        return x


class AveragePooling(nn.Module):
    """Pooling the sequence embedding matrix by `mean`.
    
    Shape:
        - Input
            x: `(batch_size, seq_length, embed_dim)`
            mask: `(batch_size, 1, seq_length)`
        - Output: `(batch_size, embed_dim)`
    """

    def __init__(self):
        super().__init__()

    def forward(self, x, mask=None):
        if mask == None:
            return torch.mean(x, dim=1)
        else:
            sum_pooling_matrix = torch.bmm(mask, x).squeeze(1)
            non_padding_length = mask.sum(dim=-1)
            return sum_pooling_matrix / (non_padding_length.float() + 1e-16)


class SumPooling(nn.Module):
    """Pooling the sequence embedding matrix by `sum`.

    Shape:
        - Input
            x: `(batch_size, seq_length, embed_dim)`
            mask: `(batch_size, 1, seq_length)`
        - Output: `(batch_size, embed_dim)`
    """

    def __init__(self):
        super().__init__()

    def forward(self, x, mask=None):
        if mask == None:
            return torch.sum(x, dim=1)
        else:
            return torch.bmm(mask, x).squeeze(1)

In [30]:
import torch
import torch.nn as nn
class EmbeddingLayer(nn.Module):
    def __init__(self, features):
        super().__init__()
        self.features = features
        self.embed_dict = nn.ModuleDict()
        self.n_dense = 0
        
        for fea in features:
            if fea.name in self.embed_dict:#exist
                continue
            if isinstance(fea, SparseFeature) and fea.shared_with == None:
                self.embed_dict[fea.name] = fea.get_embedding_layer()
            elif isinstance(fea, SequenceFeature) and fea.shared_with == None:
                self.embed_dict[fea.name] = fea.get_embedding_layer()
            elif isinstance(fea, DenseFeature):
                self.n_dense += 1
    def forward(self, x, features, squeeze_dim=False):
        sparse_emb, dense_values = [], []
        sparse_exists, dense_exists = False, False
        for fea in features:
            if isinstance(fea, SparseFeature):
                if fea.shared_with == None:
                    sparse_emb.append(self.embed_dict[fea.name](x[fea.name].long()).unsqueeze(1))
                else:
                    sparse_emb.append(self.embed_dict[fea.shared_with](x[fea.name].long()).unsqueeze(1))
            elif isinstance(fea, SequenceFeature):
                if fea.pooling == 'sum':
                    pooling_layer = SumPooling()
                elif fea.pooling == 'mean':
                    pooling_layer = AveragePooling()
                elif fea.pooling == 'concat':
                    pooling_layer = ConcatPooling()
                else:
                    raise ValueError('Sequence pooling method supports only pooling in %s, got %s.' % 
                                     (['sum', 'mean'], fea.pooling))
                fea_mask = InputMask()(x, fea)
                if fea.shared_with == None:
                    sparse_emb.append(pooling_layer(self.embed_dict[fea.name](x[fea.name].long()), fea_mask).unsqueeze(1))
                else:
                    sparse_emb.append(pooling_layer(self.embed_dict[fea.shared_with](x[fea.name].long()), fea_mask).unsqueeze(1))
            else:
                dense_values.append(x[fea.name].float().unsqueeze(1))
        if len(dense_values) > 0:
            dense_exists = True
            dense_values = torch.cat(dense_values, dim=1)
        if len(sparse_emb) > 0:
            sparse_exists = True
            sparse_emb = torch.cat(sparse_emb, dim=1) #[batch_size, num_features, embed_dim]
        if squeeze_dim: #if the emb_dim of sparse_features is different, we must squeeze_dim
            if dense_exists and not sparse_exists:#only input dense features
                return dense_values
            elif not dense_exists and sparse_exists:
                return sparse_emb.flatten(start_dim=1) #squeeze dim to :[batch_size, num_features*embed_dim]
            elif dense_exists and sparse_exists:
                return torch.cat((sparse_emb.flatten(start_dim=1), dense_values), dim=1) #concat dense value with sparse embedding
            else:
                raise ValueError('The input features can note be empty')
        else:
            if sparse_exists:
                return sparse_emb #[batch_size, num_features, embed_dim]
            else:
                raise ValueError('If keep the original shape:[batch_size, num_features, embed_dim], expected \
                                 %s in feature list, got %s' % ('sparseFeatures', features))

In [16]:
def activation_layer(act_name):
    if isinstance(act_name, str):
        if act_name.lower() == 'sigmoid':
            act_layer = nn.Sigmoid()
        elif act_name.lower() == 'relu':
            act_layer = nn.ReLU(inplace=True)
        elif act_name.lower() == 'dice':
            act_layer = Dice()
        elif act_name.lower() == 'prelu':
            act_layer = nn.PReLU()
        elif act_name.lower() == 'softmax':
            act_layer = nn.Softmax(dim=1)
    elif issubclass(act_name, nn.Module):
        act_layer = act_name()
    else:
        raise NotImplementedError
    return act_layer

In [17]:
class MLP(nn.Module):
    def __init__(self, input_dim, output_layer=True, dims=None, dropout=0, activation='relu'):
        super().__init__()
        if dims is None:
            dims = []
        layers = list()
        for i_dim in dims:
            layers.append(nn.Linear(input_dim, i_dim))
            layers.append(nn.BatchNorm1d(i_dim))
            layers.append(activation_layer(activation))
            layers.append(nn.Dropout(p=dropout))
            input_dim = i_dim
        if output_layer:
            layers.append(nn.Linear(input_dim, 1))
        self.mlp = nn.Sequential(*layers)
    def forward(self, x):
        return self.mlp(x)

In [18]:
class RandomNormal(object):
    def __init__(self, mean=0.0, std=1.0):
        self.mean = mean
        self.std = std
    def __call__(self, vocab_size, embed_dim):
        embed = torch.nn.Embedding(vocab_size, embed_dim)
        torch.nn.init.normal_(embed.weight, self.mean, self.std)
        return embed

In [19]:
def get_auto_embedding_dim(num_classes):
    return np.floor(6 * np.pow(num_classes, 0.26))

In [20]:
class SequenceFeature(object):
    def __init__(self, name, vocab_size, embed_dim=None, pooling='mean', shared_with=None, padding_idx=None,
                initializer=RandomNormal(0, 0.0001)):
        self.name = name
        self.vocab_size = vocab_size
        if embed_dim is None:
            self.embed_dim = get_auto_embedding_dim(vocab_size)
        else:
            self.embed_dim = embed_dim
        self.pooling = pooling
        self.shared_with = shared_with
        self.padding_idx = padding_idx
        self.initializer = initializer
    def __repr__(self):
        return f'<SequenceFeature {self.name} with Embedding shape ({self.vocab_size}, {self.embed_dim})>'
    def get_embedding_layer(self):
        if not hasattr(self, 'embed'):
            self.embed = self.initializer(self.vocab_size, self.embed_dim)
        return self.embed

class SparseFeature(object):
    def __init__(self, name, vocab_size, embed_dim=None, shared_with=None, padding_idx=None, 
                 initializer=RandomNormal(0, 0.0001)):
        self.name = name
        self.vocab_size = vocab_size
        if embed_dim is None:
            self.embed_dim = get_auto_embedding_dim(vocab_size)
        else:
            self.embed_dim = embed_dim
        self.shared_with = shared_with
        self.padding_idx = padding_idx
        self.initializer = initializer
    def __repr__(self):
        return f'<SparseFeature {self.name} with Embedding shape ({self.vocab_size}, {self.embed_dim})>'
    def get_embedding_layer(self):
        if not hasattr(self, 'embed'):
            self.embed = self.initializer(self.vocab_size, self.embed_dim)
        return self.embed

class DenseFeature(object):
    def __init__(self, name):
        self.name = name
        self.embed_dim = 1
    def __repr__(self):
        return f'<DenseFeature {self.name}>'

In [32]:
class InputMask(nn.Module):
    """Return inputs mask from given features

    Shape:
        - Input: 
            x (dict): {feature_name: feature_value}, sequence feature value is a 2D tensor with shape:`(batch_size, seq_len)`,\
                      sparse/dense feature value is a 1D tensor with shape `(batch_size)`.
            features (list or SparseFeature or SequenceFeature): Note that the elements in features are either all instances of SparseFeature or all instances of SequenceFeature.
        - Output: 
            - if input Sparse: `(batch_size, num_features)`
            - if input Sequence: `(batch_size, num_features_seq, seq_length)`
    """

    def __init__(self):
        super().__init__()

    def forward(self, x, features):
        mask = []
        if not isinstance(features, list):
            features = [features]
        for fea in features:
            if isinstance(fea, SparseFeature) or isinstance(fea, SequenceFeature):
                if fea.padding_idx != None:
                    fea_mask = x[fea.name].long() != fea.padding_idx
                else:
                    fea_mask = x[fea.name].long() != -1
                mask.append(fea_mask.unsqueeze(1).float())
            else:
                raise ValueError("Only SparseFeature or SequenceFeature support to get mask.")
        return torch.cat(mask, dim=1)

In [21]:
import torch.nn.functional as F
class YoutubeDNN(torch.nn.Module):
    """
    user_features(list[feature class]):training by the user tower module
    item_features(list[feature class]):training by the embedding table, it's the item id feature
    neg_item_feature(list[feature class]):training by the embedding table, it's the negative items id feature
    user_params(dict):the params of the user tower module, keys include:{'dims':list, 'activation':str, 'dropout':float, 'output_layer':bool}
    temperature(float):temperature factor for similarity score, default to 1.0
    """
    def __init__(self, user_features, item_features, neg_item_feature, user_params, temperature=1.0):
        super().__init__()
        self.user_features = user_features
        self.item_features = item_features
        self.neg_item_feature = neg_item_feature
        self.temperature = temperature
        self.user_dims = sum([fea.embed_dim for fea in user_features])
        self.embedding = EmbeddingLayer(user_features + item_features)
        self.user_mlp = MLP(self.user_dims, output_layer=False, **user_params)
        self.mode = None
    def forward(self, x):
        user_embedding = self.user_tower(x)
        item_embedding = self.item_tower(x)
        if self.mode == 'user':
            return user_embedding
        if self.mode == 'item':
            return item_embedding
        #calculate cosine score
        y = torch.mul(user_embedding, item_embedding).sum(dim=2)
        y = y / self.temperature
        return y
    def user_tower(self, x):
        if self.mode == 'item':
            return None
        input_user = self.embedding(x, self.user_features, squeeze_dim=True)
        user_embedding = self.user_mlp(input_user).unsqueeze(1) 
        user_embedding = F.normalize(user_embedding, p=2, dim=2)
        if self.mode == 'user':
            return user_embedding.squeeze(1)
        return user_embedding
    def item_tower(self, x):
        if self.mode == 'user':
            return None
        pos_embedding = self.embedding(x, self.item_features, squeeze_dim=False)
        pos_embedding = F.normalize(pos_embedding, p=2, dim=2)
        if self.mode == 'item':
            return pos_embedding.squeeze(1)
        neg_embeddings = self.embedding(x, self.neg_item_feature, squeeze_dim=False).squeeze(1)
        neg_embeddings = F.normalize(neg_embeddings, p=2, dim=2)
        return torch.cat((pos_embedding, neg_embeddings), dim=1)

In [22]:
from sklearn.metrics import roc_auc_score

In [23]:
class EarlyStopper(object):
    def __init__(self, patience):
        self.patience = patience
        self.trial_counter = 0
        self.best_auc = 0
        self.best_weights = None
    def stop_training(self, val_auc, weights):
        if val_auc > self.best_auc:
            self.best_auc = val_auc
            self.trial_counter = 0
            self.best_weights = copy.deepcopy(weights)
            return False
        elif self.trial_counter + 1 < self.patience:
            self.trial_counter += 1
            return False
        else:
            return True

In [24]:
class MatchTrainer(object):
    """A general trainer for Matching/Retrieval

    Args:
        model (nn.Module): any matching model.
        mode (int, optional): the training mode, `{0:point-wise, 1:pair-wise, 2:list-wise}`. Defaults to 0.
        optimizer_fn (torch.optim): optimizer function of pytorch (default = `torch.optim.Adam`).
        optimizer_params (dict): parameters of optimizer_fn.
        scheduler_fn (torch.optim.lr_scheduler) : torch scheduling class, eg. `torch.optim.lr_scheduler.StepLR`.
        scheduler_params (dict): parameters of optimizer scheduler_fn.
        n_epoch (int): epoch number of training.
        earlystop_patience (int): how long to wait after last time validation auc improved (default=10).
        device (str): `"cpu"` or `"cuda:0"`
        gpus (list): id of multi gpu (default=[]). If the length >=1, then the model will wrapped by nn.DataParallel.
        model_path (str): the path you want to save the model (default="./"). Note only save the best weight in the validation data.
    """

    def __init__(
        self,
        model,
        mode=0,
        optimizer_fn=torch.optim.Adam,
            optimizer_params=None,
        scheduler_fn=None,
        scheduler_params=None,
        n_epoch=10,
        earlystop_patience=10,
        device="cpu",
            gpus=None,
        model_path="./",
    ):
        self.model = model  # for uniform weights save method in one gpu or multi gpu
        if gpus is None:
            gpus = []
        self.gpus = gpus
        if len(gpus) > 1:
            print('parallel running on these gpus:', gpus)
            self.model = torch.nn.DataParallel(self.model, device_ids=gpus)
        self.device = torch.device(device)  #torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        if optimizer_params is None:
            optimizer_params = {
                "lr": 1e-3,
                "weight_decay": 1e-5
            }
        self.mode = mode
        if mode == 0:  #point-wise loss, binary cross_entropy
            self.criterion = torch.nn.BCELoss()  #default loss binary cross_entropy
        elif mode == 1:  #pair-wise loss
            self.criterion = BPRLoss()
        elif mode == 2:  #list-wise loss, softmax
            self.criterion = torch.nn.CrossEntropyLoss()
        else:
            raise ValueError("mode only contain value in %s, but got %s" % ([0, 1, 2], mode))
        self.optimizer = optimizer_fn(self.model.parameters(), **optimizer_params)  #default optimizer
        self.scheduler = None
        if scheduler_fn is not None:
            self.scheduler = scheduler_fn(self.optimizer, **scheduler_params)
        self.evaluate_fn = roc_auc_score  #default evaluate function
        self.n_epoch = n_epoch
        self.early_stopper = EarlyStopper(patience=earlystop_patience)
        self.model_path = model_path

    def train_one_epoch(self, data_loader, log_interval=10):
        self.model.train()
        total_loss = 0
        tk0 = tqdm.tqdm(data_loader, desc="train", smoothing=0, mininterval=1.0)
        for i, (x_dict, y) in enumerate(tk0):
            x_dict = {k: v.to(self.device) for k, v in x_dict.items()}  #tensor to GPU
            y = y.to(self.device)
            if self.mode == 0:
                y = y.float()  #torch._C._nn.binary_cross_entropy expected Float
            else:
                y = y.long()  #
            if self.mode == 1:  #pair_wise
                pos_score, neg_score = self.model(x_dict)
                loss = self.criterion(pos_score, neg_score)
            else:
                y_pred = self.model(x_dict)
                loss = self.criterion(y_pred, y)
            # used for debug
            # if i == 0:
            #     print()
            #     if self.mode == 0:
            #         print('pred: ', [f'{float(each):5.2g}' for each in y_pred.detach().cpu().tolist()])
            #         print('truth:', [f'{float(each):5.2g}' for each in y.detach().cpu().tolist()])
            #     elif self.mode == 2:
            #         pred = y_pred.detach().cpu().mean(0)
            #         pred = torch.softmax(pred, dim=0).tolist()
            #         print('pred: ', [f'{float(each):4.2g}' for each in pred])
            #     elif self.mode == 1:
            #         print('pos:', [f'{float(each):5.2g}' for each in pos_score.detach().cpu().tolist()])
            #         print('neg: ', [f'{float(each):5.2g}' for each in neg_score.detach().cpu().tolist()])

            self.model.zero_grad()
            loss.backward()
            self.optimizer.step()
            total_loss += loss.item()
            if (i + 1) % log_interval == 0:
                tk0.set_postfix(loss=total_loss / log_interval)
                total_loss = 0

    def fit(self, train_dataloader, val_dataloader=None):
        for epoch_i in range(self.n_epoch):
            print('epoch:', epoch_i)
            self.train_one_epoch(train_dataloader)
            if self.scheduler is not None:
                if epoch_i % self.scheduler.step_size == 0:
                    print("Current lr : {}".format(self.optimizer.state_dict()['param_groups'][0]['lr']))
                self.scheduler.step()  #update lr in epoch level by scheduler

            if val_dataloader:
                auc = self.evaluate(self.model, val_dataloader)
                print('epoch:', epoch_i, 'validation: auc:', auc)
                if self.early_stopper.stop_training(auc, self.model.state_dict()):
                    print(f'validation: best auc: {self.early_stopper.best_auc}')
                    self.model.load_state_dict(self.early_stopper.best_weights)
                    break
        torch.save(self.model.state_dict(), os.path.join(self.model_path,
                                                            "model.pth"))  #save best auc model


    def evaluate(self, model, data_loader):
        model.eval()
        targets, predicts = list(), list()
        with torch.no_grad():
            tk0 = tqdm.tqdm(data_loader, desc="validation", smoothing=0, mininterval=1.0)
            for i, (x_dict, y) in enumerate(tk0):
                x_dict = {k: v.to(self.device) for k, v in x_dict.items()}
                y = y.to(self.device)
                y_pred = model(x_dict)
                targets.extend(y.tolist())
                predicts.extend(y_pred.tolist())
        return self.evaluate_fn(targets, predicts)

    def predict(self, model, data_loader):
        model.eval()
        predicts = list()
        with torch.no_grad():
            tk0 = tqdm.tqdm(data_loader, desc="predict", smoothing=0, mininterval=1.0)
            for i, (x_dict, y) in enumerate(tk0):
                x_dict = {k: v.to(self.device) for k, v in x_dict.items()}
                y = y.to(self.device)
                y_pred = model(x_dict)
                predicts.extend(y_pred.tolist())
        return predicts

    def inference_embedding(self, model, mode, data_loader, model_path):
        #inference
        assert mode in ["user", "item"], "Invalid mode={}.".format(mode)
        model.mode = mode
        model.load_state_dict(torch.load(os.path.join(model_path, "model.pth")))
        model = model.to(self.device)
        model.eval()
        predicts = []
        with torch.no_grad():
            tk0 = tqdm.tqdm(data_loader, desc="%s inference" % (mode), smoothing=0, mininterval=1.0)
            for i, x_dict in enumerate(tk0):
                x_dict = {k: v.to(self.device) for k, v in x_dict.items()}
                y_pred = model(x_dict)
                predicts.append(y_pred.data)
        return torch.cat(predicts, dim=0)

In [36]:
user_features, item_features, neg_item_feature, x_train, y_train, all_item, test_user = get_movielens_data(data)
dg = MatchDataGenerator(x=x_train, y=y_train)

model = YoutubeDNN(user_features, item_features, neg_item_feature, user_params={'dims':[128, 64, 16]}, temperature=0.02)

learning_rate = 1e-4
weight_decay = 1e-6
epoch = 10
device = 'cpu'
batch_size = 2048
save_dir = './'
trainer = MatchTrainer(model, mode=2, optimizer_params={'lr':learning_rate, 'weight_decay':weight_decay},
                      n_epoch=epoch, device=device, model_path=save_dir)
train_dl, test_dl, item_dl = dg.generate_dataloader(test_user, all_item, batch_size=batch_size)
trainer.fit(train_dl)

print('inference embedding')
user_embedding = trainer.inference_embedding(model=model, mode='user', data_loader=test_dl, model_path=save_dir)
item_embedding = trainer.inference_embedding(model=model, mode='item', data_loader=item_dl, model_path=save_dir)

print(user_embedding.shape, item_embedding.shape)
#torch.save(user_embedding.data.cpu(), save_dir+'user_embedding.pth')
#torch.save(item_embedding.data.cpu(), save_dir+'item_embedding.pth')
# match_evaluation(user_embedding, item_embedding, test_user, all_item, topk=10)

preprocess data






generate sequence features:   0%|                                                                | 0/2 [00:00<?, ?it/s]



generate sequence features: 100%|███████████████████████████████████████████████████████| 2/2 [00:00<00:00, 987.59it/s]

n_train:96, n_test:2
0 cold start user droped 
epoch: 0






train:   0%|                                                                                     | 0/1 [00:00<?, ?it/s]



train: 100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 66.86it/s]

epoch: 1






train:   0%|                                                                                     | 0/1 [00:00<?, ?it/s]



train: 100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 83.49it/s]

epoch: 2






train:   0%|                                                                                     | 0/1 [00:00<?, ?it/s]



train: 100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 43.56it/s]

epoch: 3






train:   0%|                                                                                     | 0/1 [00:00<?, ?it/s]



train: 100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 47.75it/s]

epoch: 4






train:   0%|                                                                                     | 0/1 [00:00<?, ?it/s]



train: 100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 40.10it/s]

epoch: 5






train:   0%|                                                                                     | 0/1 [00:00<?, ?it/s]



train: 100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.98it/s]

epoch: 6






train:   0%|                                                                                     | 0/1 [00:00<?, ?it/s]



train: 100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 52.77it/s]

epoch: 7






train:   0%|                                                                                     | 0/1 [00:00<?, ?it/s]



train: 100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 40.11it/s]

epoch: 8






train:   0%|                                                                                     | 0/1 [00:00<?, ?it/s]



train: 100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 77.13it/s]

epoch: 9






train:   0%|                                                                                     | 0/1 [00:00<?, ?it/s]



train: 100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 41.78it/s]

inference embedding






user inference:   0%|                                                                            | 0/1 [00:00<?, ?it/s]



user inference: 100%|███████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 334.13it/s]



item inference:   0%|                                                                            | 0/1 [00:00<?, ?it/s]



item inference: 100%|███████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 334.23it/s]

torch.Size([2, 16]) torch.Size([93, 16])


In [56]:
model

YoutubeDNN(
  (embedding): EmbeddingLayer(
    (embed_dict): ModuleDict(
      (user_id): Embedding(3, 16)
      (gender): Embedding(3, 16)
      (age): Embedding(3, 16)
      (occupation): Embedding(3, 16)
      (zip): Embedding(3, 16)
      (movie_id): Embedding(94, 16)
    )
  )
  (user_mlp): MLP(
    (mlp): Sequential(
      (0): Linear(in_features=96, out_features=128, bias=True)
      (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Dropout(p=0, inplace=False)
      (4): Linear(in_features=128, out_features=64, bias=True)
      (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): ReLU(inplace=True)
      (7): Dropout(p=0, inplace=False)
      (8): Linear(in_features=64, out_features=16, bias=True)
      (9): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (10): ReLU(inplace=True)
      (11): Dropout(p=0, inplace=False)
    

In [44]:
type(user_features), type(item_features), type(neg_item_feature), type(x_train), type(y_train), type(all_item), type(test_user)

(list, list, list, dict, numpy.ndarray, dict, dict)

In [37]:
len(x_train), len(y_train), len(x_test)

(10, 96, 10)

In [45]:
user_features

[<SparseFeature user_id with Embedding shape (3, 16)>,
 <SparseFeature gender with Embedding shape (3, 16)>,
 <SparseFeature age with Embedding shape (3, 16)>,
 <SparseFeature occupation with Embedding shape (3, 16)>,
 <SparseFeature zip with Embedding shape (3, 16)>,
 <SequenceFeature hist_movie_id with Embedding shape (94, 16)>]

In [46]:
item_features

[<SparseFeature movie_id with Embedding shape (94, 16)>]

In [48]:
neg_item_feature

[<SequenceFeature neg_items with Embedding shape (94, 16)>]

In [50]:
all_item

{'movie_id': array([35, 37, 43, 32, 78, 36, 34, 92,  3, 79, 86, 82, 44, 56, 40, 21, 30,
        93, 80, 81, 39, 61, 60, 62, 88, 15, 38, 45, 31, 64, 84, 58, 76, 49,
        89, 16, 52, 83,  7, 75, 87, 68, 51, 25, 41, 90, 65,  6, 59, 53,  8,
        46, 50, 91, 74,  5, 18, 23, 14, 70, 55, 24, 28, 57,  4, 26, 29, 22,
        73, 42, 71, 17, 77, 10, 85, 72, 27, 12, 33, 67, 47,  9, 13,  1, 69,
        19, 11, 20, 66, 63, 54, 48,  2]),
 'cate_id': array([ 1,  7,  7,  7,  5,  6,  7,  7,  1,  5,  7,  7,  7,  7,  2,  7,  7,
         1,  7,  7,  7,  7,  1,  6,  5,  7,  7,  5,  5,  5,  5,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  7,  1,  7,  3,  5,  1,  9,  1,  1,  7,  1,
         1,  1,  7,  5,  1,  3,  2,  6,  1,  7,  8,  8,  3,  7,  4,  4,  8,
         5,  1, 10,  3,  5,  4,  3,  5,  3,  3,  1,  7,  7,  7,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  2,  3])}

In [51]:
test_user

{'user_id': array([2, 1]),
 'movie_id': array([50,  2]),
 'hist_movie_id': array([[ 0,  0,  0,  0, 35, 37, 43, 32, 78, 36, 34, 92,  3, 79, 86, 82,
         44, 56, 40, 21, 30, 93, 80, 81, 39, 61, 60, 62, 88, 15, 38, 45,
         31, 64, 84, 58, 76, 49, 89, 16, 52, 83,  7, 75, 68, 90,  6, 59,
          8, 46],
        [25, 41, 65, 53, 91, 34, 74, 32,  5, 18, 23, 14, 70, 55, 58, 82,
         24, 28, 56, 57,  4, 26, 29, 22, 42, 73, 71, 38, 17, 77, 10, 85,
         72, 64, 27, 33, 12, 67, 47,  9, 13,  1, 69, 19, 11, 20, 66, 63,
         48, 54]]),
 'histlen_movie_id': array([46, 52]),
 'neg_items': array([1, 1]),
 'gender': array([2, 1]),
 'age': array([2, 1]),
 'occupation': array([2, 1]),
 'zip': array([2, 1]),
 'cate_id': array([1, 3])}