# Popularity

In [1]:
import glob
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from typing import Callable, Tuple, List

from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import get_file
import zipfile

In [2]:
def load_data(data_size : str) -> pd.DataFrame:
    ''' Download (if necessary) and load a MovieLens ratings table.

    Parameters
    ----------
    data_size : str
        Which MovieLens release to load: '1m', '10m', '20m' or '25m'.

    Returns
    -------
    pd.DataFrame
        The ratings file of the requested release. For the '.dat' releases
        the columns are named userId, movieId, rating, timestamp; the '.csv'
        releases are read with their own header row.

    Raises
    ------
    ValueError
        If `data_size` is not one of the supported releases.
    '''

    # release -> (archive file name, ratings file inside the extracted archive)
    datasets = {
        '1m': ('ml-1m.zip', 'ml-1m/ratings.dat'),
        '10m': ('ml-10m.zip', 'ml-10M100K/ratings.dat'),
        '20m': ('ml-20m.zip', 'ml-20m/ratings.csv'),
        '25m': ('ml-25m.zip', 'ml-25m/ratings.csv'),
    }
    if data_size not in datasets:
        raise ValueError(
            f'unsupported data_size {data_size!r}; expected one of {sorted(datasets)}')

    archive_name, file_name = datasets[data_size]
    file = get_file(archive_name, 'http://files.grouplens.org/datasets/movielens/' + archive_name)

    # Extract into the working directory only when the ratings file is missing.
    if not glob.glob(file_name):
        with zipfile.ZipFile(file, 'r') as zip_ref:
            zip_ref.extractall()

    col_names = ['userId', 'movieId', 'rating', 'timestamp']
    if data_size in ['20m', '25m']:
        ratings = pd.read_csv(file_name, engine = 'python')
    else:
        # The .dat files have no header and use '::' as the field separator.
        # `sep` and `delimiter` are aliases, so only one of them may be given;
        # the multi-character separator needs the python engine.
        ratings = pd.read_csv(file_name, sep = '::', names = col_names, engine = 'python')
    print(ratings.shape)
    return ratings

In [3]:
# Load the MovieLens '1m' ratings and preview the first rows.
ratings = load_data('1m')
ratings.head()

(1000209, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# Lookup tables between raw ids and contiguous indices.
# idx_*_map: array position -> raw id; *_idx_map: raw id -> array position.
idx_user_map = ratings['userId'].unique()
n_user = len(idx_user_map)
user_idx_map = dict(zip(idx_user_map, range(n_user)))
print(f'# of user = {n_user}')

idx_item_map = ratings['movieId'].unique()
n_item = len(idx_item_map)
item_idx_map = dict(zip(idx_item_map, range(n_item)))
print(f'# of item = {n_item}')

# of user = 6040
# of item = 3706


In [5]:
def preprocessing(df: pd.DataFrame, threshold = 4, min_items: int = 5) -> pd.DataFrame:
    ''' Keep only the users who interacted with enough distinct items.

    Parameters
    ----------
    df : pd.DataFrame
        Ratings with at least the columns userId and movieId.
    threshold : int
        Not used by this function; kept so existing calls stay valid.
    min_items : int
        Minimum number of distinct movieId values a user needs to be kept.

    Returns
    -------
    pd.DataFrame
        The rows of the remaining users, with a fresh 0..n-1 index.
    '''
    items_per_user = df.groupby('userId')['movieId'].nunique()
    kept_users = items_per_user.index[items_per_user >= min_items]
    return df[df.userId.isin(kept_users)].reset_index(drop = True)

In [6]:
def Id2idx(df: pd.DataFrame) -> pd.DataFrame:
    ''' Return a copy of `df` whose userId / movieId hold contiguous indices.

    The raw ids are translated through the module-level dictionaries
    user_idx_map and item_idx_map.
    '''
    user_index = df.userId.map(user_idx_map)
    item_index = df.movieId.map(item_idx_map)
    return df.assign(userId = user_index, movieId = item_index)

def idx2Id(df: pd.DataFrame) -> pd.DataFrame:
    ''' Return a copy of `df` whose userId / movieId hold the raw ids again.

    Each index is looked up in the module-level arrays idx_user_map and
    idx_item_map.
    '''
    def to_user_id(index):
        return idx_user_map[index]

    def to_item_id(index):
        return idx_item_map[index]

    raw_users = df.userId.apply(to_user_id)
    raw_items = df.movieId.apply(to_item_id)
    return df.assign(userId = raw_users, movieId = raw_items)

In [7]:
# Filter the users first, then replace raw ids by contiguous indices.
ratings = Id2idx(preprocessing(ratings))

print(ratings.shape)
ratings.head(5)

(1000209, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,5,978300760
1,0,1,3,978302109
2,0,2,3,978301968
3,0,3,4,978300275
4,0,4,5,978824291


In [8]:
def train_valid_test_split(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    ''' Split the ratings by user into a train part and a test part.

    The unique userId values are split 90% / 10% with a fixed random_state,
    so every user ends up entirely in one of the two frames. Despite the
    name, no validation part is produced.

    Returns
    -------
    (train, test) : tuple of pd.DataFrame
        Both frames get a fresh 0..n-1 index.
    '''
    train_user, test_user = train_test_split(df.userId.unique(), test_size = 0.1, random_state = 7777)
    train = df[df.userId.isin(train_user)].reset_index(drop = True)
    test = df[df.userId.isin(test_user)].reset_index(drop = True)
    return train, test

def query_answer_split(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    ''' Split each user's rows into a query part (80%) and an answer part (20%).

    The split is stratified on userId with a fixed random_state, so the
    80/20 proportion is applied per user. Both parts are then ordered by
    userId and timestamp and re-indexed from 0.

    Returns
    -------
    (query, answer) : tuple of pd.DataFrame
    '''
    query, answer = train_test_split(df, test_size = 0.2, stratify = df.userId, random_state = 7777)
    sort_keys = ['userId', 'timestamp']
    query = query.sort_values(sort_keys).reset_index(drop = True)
    answer = answer.sort_values(sort_keys).reset_index(drop = True)
    return query, answer

def list_aggregation(df: pd.DataFrame) -> pd.DataFrame:
    ''' Collapse the frame to one row per userId.

    The movieId and rating columns of the result hold, per user, the list of
    that user's values; userId stays a regular column.
    '''
    value_columns = ['movieId', 'rating']
    per_user = df.groupby('userId', as_index = False)
    return per_user[value_columns].agg(list)

In [9]:
class POP:
    ''' Non-personalised popularity recommender.

    Every user receives the same list of the K top-ranked items.

    Parameters
    ----------
    K : int
        Number of items to recommend.
    implicit : bool
        True  -> rank items by the number of interactions they received.
        False -> rank items by their mean rating.
    min_count : int
        Only used when implicit is False: items with fewer ratings than this
        are excluded from the ranking.
    '''
    def __init__(self, K: int, implicit = True, min_count: int = 100):
        self.K = K
        self.implicit = implicit
        self.min_count = min_count

    def fit(self, df: pd.DataFrame):
        ''' Compute the top-K item list from `df` and store it in self.pop.

        `df` needs the columns movieId and userId (implicit) or movieId and
        rating (explicit). The frame itself is kept in self.train.
        '''
        self.train = df
        by_item = self.train.groupby('movieId')
        if self.implicit:
            item_rank = by_item['userId'].count()
        else:
            stats = by_item['rating'].agg(['sum', 'count'])
            stats = stats[stats['count'] >= self.min_count]
            item_rank = stats['sum'] / stats['count']
        item_rank = item_rank.sort_values(ascending = False)
        # The index holds integer movie ids, so slice by position explicitly.
        self.pop = item_rank.iloc[:self.K].index.tolist()

    def predict(self, df: pd.DataFrame) -> pd.DataFrame:
        ''' Return one row per row of `df` with its userId and the top-K list.

        Each row gets its own copy of the list, so editing one prediction
        does not change the others or self.pop.
        '''
        pred = [{'userId': df.at[idx, 'userId'], 'movieId': list(self.pop)}
                for idx in tqdm(df.index)]
        # Naming the columns keeps them present when `df` has no rows.
        return pd.DataFrame(pred, columns = ['userId', 'movieId'])

In [10]:
def Recall(gt: pd.Series, rec: pd.Series, at_K = None) -> float:
    ''' Fraction of the ground-truth items that appear in `rec`.

    gt  : relevant item ids (a list or a Series; values are compared).
    rec : recommended item ids.
    at_K: not used; kept so existing calls stay valid.

    Returns 0.0 when `gt` is empty.
    '''
    if len(gt) == 0:
        return 0.0
    # A set compares item values (not Series index labels) in O(1).
    relevant = set(gt)
    hits = sum(1 for r in rec if r in relevant)
    return hits / len(gt)

def Recall_K(gt: pd.Series, rec: pd.Series, K = 20) -> float:
    ''' Hits among the first K recommendations divided by min(K, len(gt)).

    gt  : relevant item ids (a list or a Series; values are compared).
    rec : recommended item ids, best first.

    Returns 0.0 when the denominator would be zero (empty `gt` or K <= 0).
    '''
    denom = min(K, len(gt))
    if denom <= 0:
        return 0.0
    # A set compares item values (not Series index labels) in O(1).
    relevant = set(gt)
    hits = sum(1 for r in rec[:K] if r in relevant)
    return hits / denom

def Precision(gt: pd.Series, rec: pd.Series) -> float:
    ''' Fraction of the recommended items that are in the ground truth.

    gt  : relevant item ids (a list or a Series; values are compared).
    rec : recommended item ids.

    Returns 0.0 when `rec` is empty.
    '''
    if len(rec) == 0:
        return 0.0
    # A set compares item values (not Series index labels) in O(1).
    relevant = set(gt)
    hits = sum(1 for r in rec if r in relevant)
    return hits / len(rec)

def Precision_K(gt: pd.Series, rec: pd.Series, K = 20) -> float:
    ''' Hits among the first K recommendations divided by K.

    gt  : relevant item ids (a list or a Series; values are compared).
    rec : recommended item ids, best first.

    The denominator is always K, even when `rec` holds fewer than K items.
    Returns 0.0 when K <= 0.
    '''
    if K <= 0:
        return 0.0
    # A set compares item values (not Series index labels) in O(1).
    relevant = set(gt)
    hits = sum(1 for r in rec[:K] if r in relevant)
    return hits / K

def AP(gt: pd.Series, rec: pd.Series) -> float: # Average Precision
    ''' Average precision of `rec` with respect to `gt`.

    For every position of `rec` that holds a relevant item, the precision of
    the list up to and including that position is added; the total is
    divided by len(gt).

    gt  : relevant item ids (a list or a Series; values are compared).
    rec : recommended item ids, best first.

    Returns 0.0 when `gt` is empty.
    '''
    if len(gt) == 0:
        return 0.0
    # A set compares item values (not Series index labels) in O(1).
    relevant = set(gt)
    hits = 0
    total = 0.0
    for rank, r in enumerate(rec, start = 1):
        if r in relevant:
            # A running hit count gives the precision at this rank directly,
            # without rescanning the prefix.
            hits += 1
            total += hits / rank
    return total / len(gt)

def AP_K(gt: pd.Series, rec: pd.Series, K = 20) -> float:
    ''' Average precision over the first K recommendations.

    For every relevant item among the first K positions of `rec`, the
    precision up to and including that position is added; the total is
    divided by min(K, len(gt)).

    gt  : relevant item ids (a list or a Series; values are compared).
    rec : recommended item ids, best first.

    Returns 0.0 when the denominator would be zero (empty `gt` or K <= 0).
    '''
    denom = min(K, len(gt))
    if denom <= 0:
        return 0.0
    # A set compares item values (not Series index labels) in O(1).
    relevant = set(gt)
    hits = 0
    total = 0.0
    for rank, r in enumerate(rec[:K], start = 1):
        if r in relevant:
            # A running hit count gives the precision at this rank directly,
            # without rescanning the prefix.
            hits += 1
            total += hits / rank
    return total / denom

def nRR(gt: pd.Series, rec: pd.Series) -> float: # normalized Reciprocal Rank
    ''' Sum of reciprocal ranks of the hits in `rec`, normalised.

    The normaliser is the same sum for a list in which the hits that were
    found occupy the leading positions.
    NOTE(review): it therefore depends on the number of hits found, not on
    len(gt) - confirm this is the intended definition.

    gt  : relevant item ids (a list or a Series; values are compared).
    rec : recommended item ids, best first.

    Returns 0.0 when `rec` contains no relevant item.
    '''
    # A set compares item values (not Series index labels) in O(1).
    relevant = set(gt)
    rr, irr = 0.0, 0.0
    n_hits = 0
    for i, r in enumerate(rec):
        if r in relevant:
            rr += 1 / (i + 1)
            irr += 1 / (n_hits + 1)
            n_hits += 1
    return rr / irr if n_hits > 0 else 0.0

def nRR_K(gt: pd.Series, rec: pd.Series, K = 20) -> float:
    ''' Normalised reciprocal rank over the first K recommendations.

    Adds 1 / rank for every relevant item among the first K positions and
    divides by the same sum for a list in which those hits occupy the
    leading positions.
    NOTE(review): the normaliser depends on the number of hits found, not on
    min(K, len(gt)) - confirm this is the intended definition.

    gt  : relevant item ids (a list or a Series; values are compared).
    rec : recommended item ids, best first.

    Returns 0.0 when there is no hit among the first K items.
    '''
    # A set compares item values (not Series index labels) in O(1).
    relevant = set(gt)
    rr, irr = 0.0, 0.0
    n_hits = 0
    for i, r in enumerate(rec[:K]):
        if r in relevant:
            rr += 1 / (i + 1)
            irr += 1 / (n_hits + 1)
            n_hits += 1
    return rr / irr if n_hits > 0 else 0.0

def nDCG(gt: pd.Series, rec: pd.Series) -> float: # normalized Discounted Cumulative Gain
    ''' Discounted cumulative gain of `rec` with binary relevance, normalised.

    Every hit at 0-based position i contributes 1 / log(i + 2). The
    normaliser is the same sum for a list in which the hits that were found
    occupy the leading positions. The natural logarithm is used; the base
    cancels in the ratio.
    NOTE(review): the normaliser depends on the number of hits found, not on
    len(gt) - confirm this is the intended definition.

    gt  : relevant item ids (a list or a Series; values are compared).
    rec : recommended item ids, best first.

    Returns 0.0 when `rec` contains no relevant item.
    '''
    # A set compares item values (not Series index labels) in O(1).
    relevant = set(gt)
    dcg, idcg = 0.0, 0.0
    n_hits = 0
    for i, r in enumerate(rec):
        if r in relevant:
            dcg += 1.0 / np.log(i + 2)
            idcg += 1.0 / np.log(n_hits + 2)
            n_hits += 1
    return dcg / idcg if idcg > 0 else 0.0

def nDCG_K(gt: pd.Series, rec: pd.Series, K = 20) -> float:
    ''' Normalised discounted cumulative gain over the first K recommendations.

    Every hit at 0-based position i < K contributes 1 / log(i + 2). The
    normaliser is the same sum for a list in which those hits occupy the
    leading positions. The natural logarithm is used; the base cancels in
    the ratio.
    NOTE(review): the normaliser depends on the number of hits found, not on
    min(K, len(gt)) - confirm this is the intended definition.

    gt  : relevant item ids (a list or a Series; values are compared).
    rec : recommended item ids, best first.

    Returns 0.0 when there is no hit among the first K items.
    '''
    # A set compares item values (not Series index labels) in O(1).
    relevant = set(gt)
    dcg, idcg = 0.0, 0.0
    n_hits = 0
    for i, r in enumerate(rec[:K]):
        if r in relevant:
            dcg += 1.0 / np.log(i + 2)
            idcg += 1.0 / np.log(n_hits + 2)
            n_hits += 1
    return dcg / idcg if n_hits > 0 else 0.0

In [11]:
def evaluation(true: pd.DataFrame, pred: pd.DataFrame, K = 20):
    ''' Average the @K ranking metrics over all rows.

    Rows of `true` and `pred` are paired by position: the movieId cell of
    row i in `true` is the ground-truth list and the movieId cell of row i
    in `pred` is the recommendation list.

    Returns
    -------
    (recall, precision, map, nrr, ndcg) : tuple
        Mean of Recall_K, Precision_K, AP_K, nRR_K and nDCG_K.

    Raises
    ------
    ValueError
        If `true` and `pred` do not have the same number of rows.
    '''
    N = true.index.size
    if pred.index.size != N:
        # zip() would silently drop the surplus rows while still dividing by N.
        raise ValueError(
            f'true has {N} rows but pred has {pred.index.size}; they must be aligned row by row')

    recall = 0
    precision = 0
    mean_ap = 0  # named mean_ap so the builtin map() is not shadowed
    nrr = 0
    ndcg = 0
    for gt, rec in zip(tqdm(true.movieId), pred.movieId):
        recall += Recall_K(gt, rec, K) / N
        precision += Precision_K(gt, rec, K) / N
        mean_ap += AP_K(gt, rec, K) / N
        nrr += nRR_K(gt, rec, K) / N
        ndcg += nDCG_K(gt, rec, K) / N
    return recall, precision, mean_ap, nrr, ndcg

In [12]:
# Hold out test users, split their rows into query / answer parts,
# then collapse both parts to one row of lists per user.
train, test = train_valid_test_split(ratings)
test_q, test_a = query_answer_split(test)
test_q = list_aggregation(test_q)
test_a = list_aggregation(test_a)

## 1. Quantitative Popularity

In [13]:
# Count-based popularity: implicit = True is the default, so items are
# ranked by the number of interactions they received in the training data.
model = POP(K = 20)
model.fit(train)

In [14]:
# Every test user receives the same top-K list.
pred = model.predict(test_q)
pred.head(5)

HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))




Unnamed: 0,userId,movieId
0,9,"[104, 124, 44, 64, 48, 113, 97, 132, 22, 128, ..."
1,14,"[104, 124, 44, 64, 48, 113, 97, 132, 22, 128, ..."
2,18,"[104, 124, 44, 64, 48, 113, 97, 132, 22, 128, ..."
3,22,"[104, 124, 44, 64, 48, 113, 97, 132, 22, 128, ..."
4,42,"[104, 124, 44, 64, 48, 113, 97, 132, 22, 128, ..."


In [15]:
# Score the predictions against the held-out answers (evaluation uses its
# default K = 20) and print each metric with its label right-aligned to 12 characters.
evals = evaluation(test_a, pred)
print(f'{"Recall@20":>12} : {evals[0]:.5f}',
      f'\n{"Precision@20":>12} : {evals[1]:.5f}',
      f'\n{"MAP@20":>12} : {evals[2]:.5f}',
      f'\n{"nRR@20":>12} : {evals[3]:.5f}',
      f'\n{"nDCG@20":>12} : {evals[4]:.5f}')

HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))


   Recall@20 : 0.12515 
Precision@20 : 0.09296 
      MAP@20 : 0.03704 
      nRR@20 : 0.22985 
     nDCG@20 : 0.37157


## 2. Qualitative Popularity

In [16]:
# Rating-based popularity: with implicit = False items are ranked by their
# mean rating in the training data.
model = POP(K = 20, implicit = False)
model.fit(train)

In [17]:
# Every test user receives the same top-K list.
pred = model.predict(test_q)
pred.head(5)

HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))




Unnamed: 0,userId,movieId
0,9,"[1092, 167, 669, 29, 23, 259, 535, 1449, 127, ..."
1,14,"[1092, 167, 669, 29, 23, 259, 535, 1449, 127, ..."
2,18,"[1092, 167, 669, 29, 23, 259, 535, 1449, 127, ..."
3,22,"[1092, 167, 669, 29, 23, 259, 535, 1449, 127, ..."
4,42,"[1092, 167, 669, 29, 23, 259, 535, 1449, 127, ..."


In [18]:
# Score the predictions against the held-out answers (evaluation uses its
# default K = 20) and print each metric with its label right-aligned to 12 characters.
evals = evaluation(test_a, pred)
print(f'{"Recall@20":>12} : {evals[0]:.5f}',
      f'\n{"Precision@20":>12} : {evals[1]:.5f}',
      f'\n{"MAP@20":>12} : {evals[2]:.5f}',
      f'\n{"nRR@20":>12} : {evals[3]:.5f}',
      f'\n{"nDCG@20":>12} : {evals[4]:.5f}')

HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))


   Recall@20 : 0.06093 
Precision@20 : 0.04801 
      MAP@20 : 0.01450 
      nRR@20 : 0.12036 
     nDCG@20 : 0.21929
