# Popularity

## Experiment

- Using Jaccard Similarity has the best performance

In [None]:
import glob
import numpy as np
import pandas as pd
from typing import Callable, Tuple
from collections import Counter
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

from tensorflow.keras.utils import get_file
import zipfile

In [None]:
def load_data(data_size : str) -> pd.DataFrame:
    ''' load Movie Lens data '''
    if data_size == '100k':
        file = get_file('ml-100k.zip', 'http://files.grouplens.org/datasets/movielens/ml-100k.zip')
        file_name = 'ml-100k/*'
    elif data_size == '1m':
        file = get_file('ml-1m.zip', 'http://files.grouplens.org/datasets/movielens/ml-1m.zip')
        file_name = 'ml-1m/ratings.dat'
    elif data_size == '10m':
        file = get_file('ml-10m.zip', 'http://files.grouplens.org/datasets/movielens/ml-10m.zip')
        file_name = 'ml-10M100K/ratings.dat'
    elif data_size == '20m':
        file = get_file('ml-20m.zip', 'http://files.grouplens.org/datasets/movielens/ml-20m.zip')
        file_name = 'ml-20m/ratings.csv'
    zip_ref = zipfile.ZipFile(file, 'r')
    zip_ref.extractall()

    col_names = ['userId', 'movieId', 'rating', 'timestamp']
    ratings = pd.read_csv(file_name, sep = '|', delimiter = '::', names = col_names, engine = 'python')
    print(ratings.shape)
    return ratings

In [None]:
ratings = load_data('1m')
ratings.head()

(1000209, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## 1. Data Preprocessing

In [None]:
def preprocessing(df: pd.DataFrame, threshold = 3.5) -> pd.DataFrame:
    df = df[df.rating > threshold]
    more_than_4 = df.groupby('userId')['movieId'].nunique()
    more_than_4 = more_than_4.index[more_than_4 > 4]
    df = df[df.userId.isin(more_than_4)]
    return df.reset_index(drop = True)

In [None]:
idx_user_map = ratings.userId.unique()
user_idx_map = {e: i for i, e in enumerate(idx_user_map)}
n_user = idx_user_map.shape[0]
print(f'# of user = {n_user}')

idx_item_map = ratings.movieId.unique()
item_idx_map = {e: i for i, e in enumerate(idx_item_map)}
n_item = idx_item_map.shape[0]
print(f'# of item = {n_item}')

# of user = 6040
# of item = 3706


In [None]:
def Id2idx(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(userId = lambda x: x.userId.map(user_idx_map), 
                     movieId = lambda x: x.movieId.map(item_idx_map))

def idx2Id(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(userId = lambda x: x.userId.apply(lambda x: idx_user_map[x]), 
                     movieId = lambda x: x.movieId.apply(lambda x: idx_item_map[x]))

In [None]:
ratings = preprocessing(ratings)
ratings = Id2idx(ratings)

print(ratings.shape)
ratings.head(5)

(575272, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,5,978300760
1,0,3,4,978300275
2,0,4,5,978824291
3,0,6,5,978302039
4,0,7,5,978300719


In [None]:
def make_interaction(df: pd.DataFrame) -> pd.DataFrame:
    return df.groupby('userId', as_index = False)[['movieId', 'rating']].agg(list)

def data_split(df: pd.DataFrame, test_size = 0.1) -> pd.DataFrame:
    train_user, test_user = train_test_split(ratings.userId.unique(), test_size = test_size, random_state = 7777)
    train, test = map(lambda x: df[df.userId.isin(x)], (train_user, test_user))
    train, test = map(lambda df: df.reset_index(drop = True), (train, test))
    return train, test

def make_evaluation(df: pd.DataFrame, eval_size = 0.2) -> pd.DataFrame:
    query, answer = train_test_split(df, test_size = eval_size, stratify = df.userId, random_state = 7777)
    query, answer = map(lambda df: df.reset_index(drop = True), (query, answer))
    return query, answer

In [183]:
class ItemCF:
    def __init__(self, N: int):
        self.N = N       
    
    def fit(self, df: pd.DataFrame):
        self.df = df
        row = df.userId
        col = df.movieId
        data = np.ones(shape = df.index.size)
        self.mat = csr_matrix((data, (row, col)), shape = (n_user, n_item))
        self.mat_T = self.mat.T.tocsr()
    
    def predict(self, df:pd.DataFrame, similarity = 'dot') -> list:
        res = []
        top_N = 20
        profile = df['movieId']
        for idx in tqdm(df.index):
            y_true = np.zeros(shape = (n_item, ))
            pos = profile[idx]
            y_true[pos] = 1

            sim = self.mat.dot(y_true)
            if similarity == 'cosine':
                a = self.mat.dot(np.ones(n_item))
                b = np.sum(y_true)
                sim /= np.sqrt(a * b)
            elif similarity == 'jaccard':
                a = self.mat.dot(np.ones(n_item))
                b = np.sum(y_true)
                sim /= (a + b - sim)
            elif similarity == 'euclidean':
                a = self.mat.dot(np.ones(n_item))
                b = np.sum(y_true)
                sim = np.sqrt(a + b - 2 * sim)
            y_pred = self.mat_T.dot(sim)
            cand = np.argsort(np.where(y_true == 1, -10, 1) * y_pred)[:-top_N-1:-1]
            res.append(cand)
        return res

In [184]:
def ndcg(gt, rec):
    dcg, idcg = 0.0, 0.0
    for i, r in enumerate(rec):
        idcg += 1.0 / np.log(i + 2)
        if r in gt:
            dcg += 1.0 / np.log(i + 2)
    return dcg / idcg

def recall(gt, rec):
    res = [r for r in rec if r in gt]
    return len(res) / len(gt)

def precision(gt, rec):
    res = [r for r in rec if r in gt]
    return len(res) / len(rec)

def precision_k(gt, rec, k):
    rec_k = rec[:k+1]
    res = [r for r in rec_k if r in gt]
    return len(res) / k

def AP_k(gt, rec, k = 20):
    res = 0
    for i in range(k):
        if rec[i] in gt:
            res += precision_k(gt, rec, i+1)
    return res / min(k, len(gt))

In [185]:
train, test = data_split(ratings)
test_q, test_a = make_evaluation(test)
test_q, test_a = map(make_interaction, (test_q, test_a))

In [186]:
model = ItemCF(20)
model.fit(train)

### 1. Dot Product Similarity

$$ Dot(A, B) := A \cdot B$$

In [187]:
pred = model.predict(test_q)

HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))




In [188]:
def evaluator(true: pd.DataFrame, pred):
    m_ndcg = 0
    m_recall = 0
    m_precision = 0
    map_k = 0
    for i in range(len(pred)):
        rec = pred[i]
        gt = true.at[i, 'movieId']
        m_ndcg += ndcg(gt, rec)
        m_recall += recall(gt, rec)
        m_precision += precision(gt, rec)
        map_k += AP_k(gt, rec)
    m_ndcg /= len(pred)
    m_recall /= len(pred)
    m_precision /= len(pred)
    map_k /= len(pred)
    return m_ndcg, m_recall, m_precision, map_k

In [189]:
m_ndcg, m_recall, m_precision, map_k = evaluator(test_a, pred)
print(f'nDCG: {m_ndcg:.5f}',
      f'\nRecall: {m_recall:.5f}',
      f'\nPrecision: {m_precision:.5f}',
      f'\nMAP: {map_k:.5f}')

nDCG: 0.14051 
Recall: 0.15420 
Precision: 0.12078 
MAP: 0.09095


### 2. Cosine Similarity

$$ Cos(A, B) := \cos(\angle AB) = \frac{A \cdot B}{|A||B|} = \frac{A \cdot B}{\sqrt{\sum A}\sqrt{\sum B}}$$

In [190]:
pred = model.predict(test_q, similarity = 'cosine')

HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))






In [191]:
m_ndcg, m_recall, m_precision, map_k = evaluator(test_a, pred)
print(f'nDCG: {m_ndcg:.5f}',
      f'\nRecall: {m_recall:.5f}',
      f'\nPrecision: {m_precision:.5f}',
      f'\nMAP: {map_k:.5f}')

nDCG: 0.14209 
Recall: 0.15883 
Precision: 0.12202 
MAP: 0.09294


### 3. Jaccard Similarity

$$ \text{J}(A, B) := \frac{A \cdot B}{\sum A + \sum B + A \cdot B}$$

In [159]:
pred = model.predict(test_q, similarity = 'jaccard')

HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))




In [161]:
m_ndcg, m_recall, m_precision, map_k = evaluator(test_a, pred)
print(f'nDCG: {m_ndcg:.5f}',
      f'\nRecall: {m_recall:.5f}',
      f'\nPrecision: {m_precision:.5f}',
      f'\nMAP: {map_k:.5f}')

nDCG: 0.14503 
Recall: 0.16471 
Precision: 0.12475 
MAP: 0.09542


### 4. Euclidean Distance

$$
\begin{aligned}
D(A, B) &:= \sqrt{\sum(A - B)^2}
\\ &= \sqrt{\sum (A^2 - 2AB + B^2)}
\\ &= \sqrt{\sum (A + B - 2AB)}
\\ &= \sqrt{\sum A + \sum B - 2 A \cdot B}
\end{aligned}
$$

Here since A and B are binary vectors, it satisfies $ A ^2 = A $ by the element-wise product. In additional, note that $ A \cdot B = \sum AB $.

In [192]:
pred = model.predict(test_q, similarity = 'euclidean')

HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))




In [193]:
m_ndcg, m_recall, m_precision, map_k = evaluator(test_a, pred)
print(f'nDCG: {m_ndcg:.5f}',
      f'\nRecall: {m_recall:.5f}',
      f'\nPrecision: {m_precision:.5f}',
      f'\nMAP: {map_k:.5f}')

nDCG: 0.11994 
Recall: 0.12982 
Precision: 0.10604 
MAP: 0.07055
