<a href="https://colab.research.google.com/github/ivoryRabbit/RecSys/blob/master/11_Item2Vec(demo).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Item2Vec

- [Item2Vec: Neural Item Embedding for Collaborative Filtering](https://arxiv.org/vc/arxiv/papers/1603/1603.04259v2.pdf)

In [301]:
import glob
import numpy as np
import pandas as pd
from typing import Callable, Tuple, List
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Embedding, Dot, Flatten

from tensorflow.keras.utils import get_file
import zipfile

In [302]:
def load_data(data_size : str) -> pd.DataFrame:
    ''' load Movie Lens data '''

    if data_size == '1m':
        fname = 'ml-1m.zip'
        data = 'ml-1m/ratings.dat'
    elif data_size == '10m':
        fname = 'ml-10m.zip'
        data = 'ml-10M100K/ratings.dat'
    elif data_size == '20m':
        fname = 'ml-20m.zip'
        data = 'ml-20m/ratings.csv'
    elif data_size == '25m':
        fname = 'ml-25m.zip'
        data = 'ml-25m/ratings.csv'
    if not glob.glob(data):
        origin = f'http://files.grouplens.org/datasets/movielens/{fname}'
        file = get_file(fname, origin)
        zip_ref = zipfile.ZipFile(file, 'r')
        zip_ref.extractall()

    col_dtypes = {'user_id': np.int32, 'movie_id': np.int32, 
                  'rating': np.uint8, 'timestamp': np.int32}
    col_names = list(col_dtypes.keys())
    if data_size in ['20m', '25m']:
        ratings = pd.read_csv(
            data, names = col_names, dtype = col_dtypes, engine = 'python',
            chucksize = 10000
        )
        
    else:
        ratings = pd.read_csv(
            data, names = col_names, dtype = col_dtypes, engine = 'python',
            sep = '|', delimiter = '::'
        )
    print(ratings.shape)
    return ratings

In [303]:
ratings = load_data('1m')
ratings.head()

(1000209, 4)


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [304]:
n_user = ratings.user_id.nunique()
print(f'# of users = {n_user}')

n_item = ratings.movie_id.nunique()
print(f'# of items = {n_item}')

# of users = 6040
# of items = 3706


In [305]:
def binarizer(df: pd.DataFrame, threshold = 4) -> pd.DataFrame:
    df = df.assign(rating = np.where(df.rating >= threshold, 1, 0))
    return df[df.rating > 0.0].reset_index(drop = True)

def make_warm(df: pd.DataFrame, threshold = 5) -> pd.DataFrame: # remove cold starters
    positive = df.groupby('user_id')['movie_id'].count()
    positive = positive.index[positive >= threshold]
    return df[df.user_id.isin(positive)].reset_index(drop = True)

def train_valid_test_split(df: pd.DataFrame, size: float) -> pd.DataFrame:
    train_user, test_user = train_test_split(df.user_id.unique(), test_size = 2 * size, random_state = 777)
    valid_user, test_user = train_test_split(test_user, test_size = 0.5, random_state = 777)
    train, valid, test = map(lambda x: df[df.user_id.isin(x)], (train_user, valid_user, test_user))
    train, valid, test = map(lambda df: df.reset_index(drop = True), (train, valid, test))
    return train, valid, test

def query_relev_split(df: pd.DataFrame, size: float) -> pd.DataFrame:
    timeorder = df.groupby('user_id')['timestamp'].rank(method = 'first', ascending = True)
    seen_cnts = df.groupby('user_id')['movie_id'].transform('count')
    df = df.assign(seen_cnts = seen_cnts, timeorder = timeorder)
    query = df[df.timeorder < df.seen_cnts * (1-size)]
    relev = df[df.timeorder >= df.seen_cnts * (1-size)]
    relev = relev[relev.user_id.isin(query.user_id.unique())]
    query, relev = map(lambda df: df.drop(columns = ['timeorder', 'seen_cnts']), (query, relev))
    query, relev = map(lambda df: df.reset_index(drop = True), (query, relev))
    return query, relev

def list_agg(df: pd.DataFrame) -> pd.DataFrame:
    return df.groupby('user_id', as_index = False)[['movie_id']].agg(list)

In [306]:
data = binarizer(ratings)
data = make_warm(data)
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,1,978300760
1,1,3408,1,978300275
2,1,2355,1,978824291
3,1,1287,1,978302039
4,1,2804,1,978300719


In [307]:
train, _, test = train_valid_test_split(data, size = 0.1)
test_q, test_r = query_relev_split(test, size = 0.2)

In [308]:
class train_generator:
    def __init__(self, batch_size, ns):
        '''
        ns = the number of negative samples for each positive sample
        '''
        self.batch_size = batch_size
        self.ns = ns

    def __call__(self):
        BUFFER_SIZE = 10000
        AUTOTUNE = tf.data.AUTOTUNE
        dataset = tf.data.Dataset.from_tensor_slices(((self.target, self.context), self.label))
        dataset = dataset.shuffle(BUFFER_SIZE).batch(self.batch_size, drop_remainder = False)
        dataset = dataset.cache().prefetch(buffer_size = AUTOTUNE)
        return dataset

    def encoding(self, df: pd.DataFrame) -> pd.DataFrame:
        item_list = df.movie_id.unique()
        self.item2idx_map = {Id: idx for idx, Id in enumerate(item_list)}
        return df.assign(movie_id = lambda df: df.movie_id.map(self.item2idx_map))

    def get_sampling_dist(self, train):
        freq = train.movie_id.value_counts()
        freq = freq.sort_index()
        pos_dist = np.sqrt(1e-5 / freq).to_numpy()
        neg_dist = np.power(freq, 3/4).to_numpy()
        return pos_dist, neg_dist

    def pos_sampling(self, seq, pos_dist):
        prob = pos_dist[None, seq]
        prob = np.repeat(prob, len(seq), axis = 0)
        prob[np.diag_indices(len(seq))] = 0
        prob = prob / prob.sum(axis = 1)
        prob = np.cumsum(prob, axis = 1)
        draw = np.random.rand(len(seq))
        pos_sample = np.diag(np.apply_along_axis(np.searchsorted, 1, prob, draw))
        return pos_sample

    def neg_sampling(self, seq, neg_dist):
        mask = np.ones(neg_dist.size, dtype = bool)
        mask[seq] = False
        prob = neg_dist[mask]
        prob = prob / prob.sum(axis = 0)
        prob = np.cumsum(prob, axis = 0)
        draw = np.random.rand(len(seq), self.ns)
        neg_sample = np.searchsorted(prob, draw)
        return neg_sample

    def fit(self, train):
        self.train = self.encoding(train)
        pos_dist, neg_dist = self.get_sampling_dist(self.train)
        item_list = list_agg(self.train).movie_id
        pos = item_list.apply(lambda x: self.pos_sampling(x, pos_dist)).explode()
        neg = item_list.apply(lambda x: self.neg_sampling(x, neg_dist)).explode()
        self.target = self.train.movie_id.to_numpy()
        self.context = np.vstack(pos + neg).astype(int)
        self.label = np.zeros(shape = self.context.shape, dtype = np.uint8)
        self.label[:, 0] = 1

In [309]:
class Word2Vec(tf.keras.Model):
    def __init__(self, n_item, latent_dim, ns):
        super(Word2Vec, self).__init__()
        self.embedding_layer = Embedding(n_item, latent_dim, name = 'item_embedding')
        self.dot = Dot(axes = (2, 2))
        self.flatten = Flatten()

    def call(self, data):
        targ, cont = data
        e_t = self.embedding_layer(targ)
        e_c = self.embedding_layer(cont)
        s = self.dot([e_t, e_c])
        return self.flatten(s)

In [310]:
def custom_loss(y_true, x_logit):
    ce_logit = tf.nn.softmax_cross_entropy_with_logits(logits = x_logit, labels = y_true)
    return tf.reduce_sum(ce_logit)

In [311]:
ns = 5
batch_size = 512

train_gen = train_generator(batch_size, ns)
train_gen.fit(train)

train_data = train_gen()

In [312]:
latent_dim = 500

model = Word2Vec(n_item, latent_dim, ns)
model.compile(optimizer = 'adam', loss = custom_loss, metrics = ['accuracy'])

In [313]:
model.fit(train_data, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fd6d70d3198>

In [314]:
weight = model.get_layer(name = 'item_embedding').get_weights()[0]

In [315]:
t = train_gen.train
t = list_agg(t)
t.head(5)

Unnamed: 0,user_id,movie_id
0,1,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1,2,"[45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 14, 5..."
2,4,"[51, 112, 113, 114, 36, 40, 77, 86, 91, 19, 10..."
3,6,"[121, 122, 123, 124, 125, 11, 53, 126, 127, 12..."
4,7,"[49, 158, 159, 56, 69, 160, 161, 162, 81, 84, ..."


In [316]:
q = test_q[['user_id', 'movie_id']]
q = q.assign(movie_id = test_q.movie_id.map(train_gen.item2idx_map))
q = q.dropna().astype(int)
q = list_agg(q)
q.head(5)

Unnamed: 0,user_id,movie_id
0,3,"[1303, 1610, 52, 1737, 286, 1027, 453, 36, 73,..."
1,9,"[171, 312, 314, 547, 61, 369, 318, 952, 32, 73..."
2,42,"[49, 417, 312, 313, 419, 69, 160, 84, 231, 86,..."
3,79,"[312, 902, 369, 1107, 1108, 833, 322, 328, 122..."
4,92,"[414, 1721, 1462, 25, 32, 423, 231, 87, 7, 429..."


In [317]:
def get_user_embedding(movie_list, weight):
    return np.mean(weight[movie_list], axis = 0)

In [318]:
train_user_embedding = np.vstack(t.movie_id.apply(lambda x: get_user_embedding(x, weight)))
test_user_embedding = np.vstack(q.movie_id.apply(lambda x: get_user_embedding(x, weight)))

In [319]:
sim = np.matmul(test_user_embedding, train_user_embedding.T)
nn = np.argsort(sim, axis = 1)[:, ::-1][:, :200]

In [320]:
f = lambda x: pd.value_counts(np.hstack(t.movie_id[x])).index[:200]
cands = np.apply_along_axis(f, 1, nn)

In [321]:
top_k = lambda cand, seen: cand[~np.isin(cand, seen)][:100]
pred = [top_k(cand, seen) for cand, seen in zip(cands, q.movie_id)]

In [322]:
idx2item_map = {idx: Id for idx, Id in enumerate(train_gen.item2idx_map)}
pred = pd.DataFrame({'user_id': q.user_id, 'movie_id': pred})
pred = pred.assign(movie_id = pred.movie_id.apply(lambda x: [idx2item_map[y] for y in x]))

In [323]:
class evaluate:
    def __init__(self, true: pd.DataFrame, pred: pd.DataFrame):
        self.true = true
        self.pred = pred
        self.max_K = 10000
        self.idcg = np.cumsum([1.0 / np.log2(i+2) for i in range(self.max_K)])

    def _recall(self, gt: List, rec: List, K = None) -> float: # Recall
        res = [r for r in rec[:K] if r in gt]
        return len(res) / np.min([K, len(gt)])
    
    def _precision(self, gt: List, rec: List, K = None) -> float: # Precision
        res = [r for r in rec[:K] if r in gt]
        return len(res) / len(rec[:K])

    def _AP(self, gt: List, rec: List, K = None) -> float: # Average Precision
        res = 0.0
        for i, r in enumerate(rec[:K]):
            if r in gt:
                res += self._precision(gt, rec[:K], i+1)
        return res / np.min([K, len(gt)])

    def _HR(self, gt: List, rec: List, K = None) -> float: # Hit Rate
        for i, r in enumerate(rec[:K]):
            if r in gt:
                return  1.0
        return 0

    def _RR(self, gt: List, rec: List, K = None) -> float: # Reciprocal Rank
        for i, r in enumerate(rec[:K]):
            if r in gt:
                return  1.0 / (i+1)
        return 0

    def _nDCG(self, gt: List, rec: List, K = None) -> float: # normalized Discounted Cumulative Gain
        dcg = 0.0
        for i, r in enumerate(rec[:K]):
            if r in gt:
                dcg += 1.0 / np.log2(i+2)
        idcg = self.idcg[min([len(gt), K])-1]
        return dcg / idcg

    def _get_item(self, rec: List, K = None) -> float:
        for r in rec[:K]:
            self.uniq_item[r] = self.uniq_item.get(r, 0) + 1

    def __call__(self, K = None):
        self.K = K if K else self.max_K
        self.recall = 0.0
        self.precision = 0.0
        self.MAP = 0.0
        self.HR = 0.0
        self.MRR = 0.0
        self.nDCG = 0.0
        self.uniq_item = {}
        n = self.true.index.size
        for gt, rec in zip(tqdm(self.true.movie_id), self.pred.movie_id):
            self.recall += self._recall(gt, rec, K) / n
            self.precision += self._precision(gt, rec, K) / n
            self.MAP += self._AP(gt, rec, K) / n
            self.HR += self._HR(gt, rec, K) / n
            self.MRR += self._RR(gt, rec, K) / n
            self.nDCG += self._nDCG(gt, rec, K) / n
            self._get_item(rec, K)
        self.CO = len(self.uniq_item) / (n*K) # Coverage
        self.ED = -np.sum([self.uniq_item[i] * np.log(self.uniq_item[i] / (n*K)) for i in self.uniq_item]) / (n*K) # Entropy-Diversity

    def print_all(self):
        K = '@' + str(self.K) if self.K else ''
        print(f'{"Recall":>12}{K} : {self.recall:.5f}',
              f'\n{"Precision":>12}{K} : {self.precision:.5f}',
              f'\n{"MAP":>12}{K} : {self.MAP:.5f}',
              f'\n{"HR":>12}{K} : {self.HR:.5f}', 
              f'\n{"MRR":>12}{K} : {self.MRR:.5f}',
              f'\n{"nDCG":>12}{K} : {self.nDCG:.5f}',
              f'\n{"CO":>12}{K} : {self.CO:.5f}',
              f'\n{"ED":>12}{K} : {self.ED:.5f}')

In [324]:
true = list_agg(test_r)
true.head(5)

Unnamed: 0,user_id,movie_id
0,3,"[1394, 104, 1079, 1259, 2355, 3552, 1304, 2081]"
1,9,"[2268, 1682, 2278, 590, 524, 529, 2294, 1921, ..."
2,42,"[3421, 3868, 1257, 2997, 2134, 1265, 593, 2143..."
3,79,"[3044, 3176, 2686, 2024]"
4,92,"[2054, 2134, 733, 592, 2072, 3107, 2162, 1291,..."


In [325]:
pred.head(5)

Unnamed: 0,user_id,movie_id
0,3,"[110, 2571, 2028, 858, 589, 593, 1240, 318, 12..."
1,9,"[2997, 296, 2396, 110, 1196, 589, 260, 1198, 3..."
2,42,"[2571, 1580, 2916, 2028, 110, 858, 2858, 32, 1..."
3,79,"[2858, 3578, 260, 2762, 2997, 1196, 589, 480, ..."
4,92,"[2858, 1210, 2997, 2028, 1196, 260, 589, 1198,..."


In [326]:
scores = evaluate(true, pred)

scores(K = 10)
scores.print_all()

scores(K = 100)
scores.print_all()

HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))


      Recall@10 : 0.09622 
   Precision@10 : 0.07864 
         MAP@10 : 0.04080 
          HR@10 : 0.46026 
         MRR@10 : 0.18688 
        nDCG@10 : 0.09214 
          CO@10 : 0.02086 
          ED@10 : 3.87926


HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))


      Recall@100 : 0.32565 
   Precision@100 : 0.04744 
         MAP@100 : 0.04858 
          HR@100 : 0.90232 
         MRR@100 : 0.20545 
        nDCG@100 : 0.17948 
          CO@100 : 0.01119 
          ED@100 : 5.57329
