<a href="https://colab.research.google.com/github/ivoryRabbit/RecSys/blob/master/1.itemPOP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Item Popularity

## References

- [Improving Aggregate Recommendation Diversity Using Ranking-Based Techniques](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.459.8174&rep=rep1&type=pdf)
- [Effective Latent Models for Binary Feedback in Recommender Systems](http://www.cs.toronto.edu/~mvolkovs/sigir2015_svd.pdf)
- [Novelty and Diversity Metrics for Recommender Systems: Choice, Discovery and Relevance](http://ir.ii.uam.es/rim3/publications/ddr11.pdf)
- [lenskit-python](https://github.com/lenskit/lkpy)

In [29]:
import glob
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from typing import Callable, Tuple, List

from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import get_file
import zipfile

In [30]:
def load_data(data_size : str) -> pd.DataFrame:
    ''' Load a MovieLens ratings table, downloading and extracting it if missing.

    Parameters
    ----------
    data_size : str
        One of '1m', '10m', '20m', '25m'.

    Returns
    -------
    pd.DataFrame
        The ratings table. For the '::'-separated .dat files the columns are
        named userId, movieId, rating, timestamp; the .csv files are read
        with whatever header row they carry.

    Raises
    ------
    ValueError
        If `data_size` is not one of the supported sizes.
    '''
    # archive name on the download server -> ratings file inside the archive
    sources = {
        '1m': ('ml-1m.zip', 'ml-1m/ratings.dat'),
        '10m': ('ml-10m.zip', 'ml-10M100K/ratings.dat'),
        '20m': ('ml-20m.zip', 'ml-20m/ratings.csv'),
        '25m': ('ml-25m.zip', 'ml-25m/ratings.csv'),
    }
    if data_size not in sources:
        raise ValueError(f'unknown data_size {data_size!r}; expected one of {sorted(sources)}')
    fname, data = sources[data_size]

    # Only hit the network when the extracted ratings file is not already
    # present relative to the current working directory.
    if not glob.glob(data):
        origin = f'http://files.grouplens.org/datasets/movielens/{fname}'
        file = get_file(fname, origin)
        with zipfile.ZipFile(file, 'r') as zip_ref:  # close the archive handle when done
            zip_ref.extractall()

    col_names = ['userId', 'movieId', 'rating', 'timestamp']
    if data_size in ['20m', '25m']:
        ratings = pd.read_csv(data, engine = 'python')
    else:
        # `sep` and `delimiter` are aliases in read_csv, so pass only one.
        # A multi-character separator needs the python engine.
        ratings = pd.read_csv(data, sep = '::', names = col_names, engine = 'python')
    print(ratings.shape)
    return ratings

In [31]:
# Load the MovieLens 1M ratings (downloaded on first use) and preview the first rows.
ratings = load_data('1m')
ratings.head()

(1000209, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [32]:
# Dataset cardinalities: distinct users and distinct movies in the ratings table.
n_user = ratings['userId'].nunique()
n_item = ratings['movieId'].nunique()

print(f'# of user = {n_user}')
print(f'# of item = {n_item}')

# of user = 6040
# of item = 3706


In [33]:
def train_valid_test_split(df: pd.DataFrame, holdout_size: float = 0.2, random_state: int = 777) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    ''' Split ratings by user into train / valid / test frames.

    The split is over users, not rows: every rating of a given user lands in
    exactly one of the three frames.

    Parameters
    ----------
    df : ratings table with a `userId` column.
    holdout_size : fraction of users held out of train; the held-out users
        are then divided evenly between valid and test.
    random_state : seed passed to both underlying splits.

    Returns
    -------
    (train, valid, test), each with a fresh 0..n-1 index.
    '''
    users = df.userId.unique()
    train_user, holdout_user = train_test_split(users, test_size = holdout_size, random_state = random_state)
    valid_user, test_user = train_test_split(holdout_user, test_size = 0.5, random_state = random_state)

    def rows_of(user_ids):
        # all ratings belonging to the given users, re-indexed from zero
        return df[df.userId.isin(user_ids)].reset_index(drop = True)

    return rows_of(train_user), rows_of(valid_user), rows_of(test_user)

def query_answer_split(df: pd.DataFrame, query_ratio: float = 0.8) -> Tuple[pd.DataFrame, pd.DataFrame]:
    ''' Split each user's ratings chronologically into a query part and an answer part.

    Within each user, rows are ranked by `timestamp` (ties broken by order of
    appearance). Rows whose rank is strictly below `query_ratio` times the
    user's rating count form the query; the remaining rows form the answer.
    Users that end up with no query rows are dropped from the answer as well.

    Parameters
    ----------
    df : ratings table with `userId`, `movieId` and `timestamp` columns.
        Any index is accepted.
    query_ratio : per-user fraction used as the rank cut-off.

    Returns
    -------
    (query, answer), with the same columns as `df`, the original row order,
    and a fresh 0..n-1 index.
    '''
    by_user = df.groupby(by = 'userId')
    timeorder = by_user['timestamp'].rank(method = 'first', ascending = True)
    # transform keeps the caller's index, so both Series stay aligned with
    # `df` even when its index is not a default RangeIndex
    seen_cnts = by_user['movieId'].transform('count')

    is_query = timeorder < seen_cnts * query_ratio
    query = df[is_query]
    answer = df[~is_query]
    # an answer is only usable if the same user also has query rows
    answer = answer[answer.userId.isin(query.userId.unique())]
    return query.reset_index(drop = True), answer.reset_index(drop = True)

def cut_off(df: pd.DataFrame, threshold = 4) -> pd.DataFrame:
    ''' Keep only the rows whose rating is at least `threshold`, re-indexed from zero. '''
    high_enough = df.rating >= threshold
    kept = df[high_enough]
    return kept.reset_index(drop = True)

def make_warm(df: pd.DataFrame, threshold = 5) -> pd.DataFrame: # remove cold starters
    ''' Keep only users who rated at least `threshold` distinct movies, re-indexed from zero. '''
    movies_per_user = df.groupby('userId')['movieId'].nunique()
    warm_users = movies_per_user[movies_per_user >= threshold].index
    is_warm = df.userId.isin(warm_users)
    return df[is_warm].reset_index(drop = True)

def list_aggregation(df: pd.DataFrame) -> pd.DataFrame:
    ''' Collapse the ratings to one row per user, with `movieId` and `rating` gathered into lists. '''
    per_user = df.groupby('userId', as_index = False)
    listed_cols = ['movieId', 'rating']
    return per_user[listed_cols].agg(list)

Weighted Rating 
$$\text{(WR)} =  \frac{v}{v+m} \cdot R + \frac{m}{v+m} \cdot C$$

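- $v$ is the number of votes (ratings) for the movie
- $m$ is the minimum number of votes required for a movie to be listed (in the code below: the mean plus two standard deviations of the per-movie vote counts)
- $R$ is the average rating of the movie
- $C$ is the mean rating across the whole dataset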

In [34]:
class POP:
    ''' Non-personalised popularity recommender: every user gets the same top-N list.

    implicit = True  ranks movies by how many ratings they received.
    implicit = False ranks movies by the weighted rating
                     (v * R + m * C) / (v + m), where v is the movie's rating
                     count, R its mean rating, C the global mean rating and
                     m = mean(v) + 2 * std(v) over all movies.
    '''
    def __init__(self, implicit = True):
        self.implicit = implicit
        # Set by fit(): a Series indexed by movieId, best-ranked movie first.
        self.pop = None

    def fit(self, df: pd.DataFrame):
        ''' Compute the popularity ranking from a ratings table.

        `df` needs `userId`, `movieId` and `rating` columns. Users rejected by
        make_warm (too few distinct movies) are excluded before counting.
        '''
        _train = make_warm(df)
        ratings_by_item = _train.groupby('movieId')['rating']
        seen_cnt = ratings_by_item.count().rename('seen_cnt')
        if self.implicit:
            item_rank = seen_cnt.sort_values(ascending = False)
        else:
            # m damps the score of movies with few ratings towards the global mean C
            m = seen_cnt.mean() + 2 * seen_cnt.std()
            C = _train.rating.mean()
            mean_rating = ratings_by_item.mean()
            weighted_rating = (seen_cnt * mean_rating + m * C) / (seen_cnt + m)
            item_rank = weighted_rating.rename('weighted_rating').sort_values(ascending = False)
        self.pop = item_rank

    def predict(self, df: pd.DataFrame, N: int) -> pd.DataFrame:
        ''' Recommend the N best-ranked movies to every row of `df`.

        Returns a frame with one row per row of `df`: its `userId` and a
        `movieId` list holding the same top-N ids for everybody. Each row
        gets its own list object, so editing one row's list leaves the
        others untouched.

        Raises RuntimeError if called before fit().
        '''
        if self.pop is None:
            raise RuntimeError('POP.predict() called before fit()')
        # positional slice: the ranking is ordered best-first, its labels are movieIds
        cand = self.pop.iloc[:N].index.tolist()
        users = df['userId'].tolist()
        return pd.DataFrame({'userId': users,
                             'movieId': [list(cand) for _ in users]})

In [35]:
class evaluate:
    ''' Top-K ranking metrics averaged over users.

    `true` and `pred` are frames with one row per user and a `movieId`
    column holding a list of ids (ground truth / ranked recommendations).
    Rows are matched by position. Calling the instance computes recall,
    precision, MAP, MRR and nDCG at the given K and stores them as
    attributes; print_all() prints them.
    '''
    def __init__(self, true: pd.DataFrame, pred: pd.DataFrame):
        self.true = true
        self.pred = pred
        self.max_K = 10000
        # idcg[j] = ideal DCG when j + 1 relevant items fill the top ranks
        self.idcg = np.cumsum(1.0 / np.log(np.arange(2, self.max_K + 2)))

    def _recall(self, gt: List, rec: List, K = None) -> float:
        ''' Hits in the top K divided by min(K, len(gt)); 0.0 for empty ground truth. '''
        K = K if K else self.max_K
        denom = min(K, len(gt))
        if denom == 0:
            return 0.0
        relevant = set(gt)  # constant-time membership tests
        hits = sum(1 for r in rec[:K] if r in relevant)
        return hits / denom

    def _precision(self, gt: List, rec: List, K = None) -> float:
        ''' Hits in the top K divided by the number of items recommended there; 0.0 if none. '''
        K = K if K else self.max_K
        top = rec[:K]
        if len(top) == 0:
            return 0.0
        relevant = set(gt)
        hits = sum(1 for r in top if r in relevant)
        return hits / len(top)

    def _AP(self, gt: List, rec: List, K = None) -> float: # Average Precision
        ''' Sum of precision@(i+1) over hit positions i, divided by min(K, len(gt)). '''
        K = K if K else self.max_K
        denom = min(K, len(gt))
        if denom == 0:
            return 0.0
        relevant = set(gt)
        hits = 0
        res = 0
        for i, r in enumerate(rec[:K]):
            if r in relevant:
                hits += 1
                # precision at cut-off i + 1, from the running hit count
                res += hits / (i + 1)
        return res / denom

    def _RR(self, gt: List, rec: List, K = None) -> float: # Reciprocal Rank
        ''' 1 / rank of the first hit within the top K, or 0 when there is none. '''
        K = K if K else self.max_K
        relevant = set(gt)
        for i, r in enumerate(rec[:K]):
            if r in relevant:
                return  1.0 / (i+1)
        return 0

    def _nDCG(self, gt: List, rec: List, K = None) -> float: # normalized Discounted Cumulative Gain
        ''' Binary-gain DCG of the top K divided by the ideal DCG for min(len(gt), K) hits. '''
        K = K if K else self.max_K
        n_ideal = min(len(gt), K)
        if n_ideal == 0:
            return 0.0
        relevant = set(gt)
        dcg = 0.0
        for i, r in enumerate(rec[:K]):
            if r in relevant:
                dcg += 1.0 / np.log(i+2)
        if n_ideal <= self.max_K:
            idcg = self.idcg[n_ideal - 1]
        else:
            # beyond the precomputed table (only reachable when K > max_K)
            idcg = np.sum(1.0 / np.log(np.arange(2, n_ideal + 2)))
        return dcg / idcg

    def _check_alignment(self):
        ''' Raise ValueError unless `true` and `pred` describe the same users row by row. '''
        if len(self.true) != len(self.pred):
            raise ValueError(f'true has {len(self.true)} rows but pred has {len(self.pred)}')
        if 'userId' in self.true.columns and 'userId' in self.pred.columns:
            if not np.array_equal(self.true['userId'].to_numpy(), self.pred['userId'].to_numpy()):
                raise ValueError('true and pred list different users, or the same users in a different order')

    def __call__(self, K = None):
        ''' Compute every metric at cut-off K (None: up to max_K) and store the user averages. '''
        self._check_alignment()
        self.K = K
        recall = precision = ap = rr = ndcg = 0.0
        for gt, rec in zip(tqdm(self.true.movieId), self.pred.movieId):
            recall += self._recall(gt, rec, K)
            precision += self._precision(gt, rec, K)
            ap += self._AP(gt, rec, K)
            rr += self._RR(gt, rec, K)
            ndcg += self._nDCG(gt, rec, K)
        n = max(self.true.index.size, 1)  # avoid 0 / 0 on an empty evaluation set
        self.recall = recall / n
        self.precision = precision / n
        self.MAP = ap / n
        self.MRR = rr / n
        self.nDCG = ndcg / n

    def print_all(self):
        ''' Print the metrics stored by the most recent call, one per line. '''
        K = '@' + str(self.K) if self.K else ''
        print(f'{"Recall":>12}{K} : {self.recall:.5f}',
              f'\n{"Precision":>12}{K} : {self.precision:.5f}',
              f'\n{"MAP":>12}{K} : {self.MAP:.5f}',
              f'\n{"MRR":>12}{K} : {self.MRR:.5f}',
              f'\n{"nDCG":>12}{K} : {self.nDCG:.5f}')

In [36]:
# Hold out a group of users for testing; the validation split is not used here.
train, _, test = train_valid_test_split(ratings)

# Keep only the held-out ratings that pass cut_off's default threshold.
test = cut_off(test)

# Per test user: split into query / answer rows, then collapse each to one row of lists.
test_q, test_a = (list_aggregation(part) for part in query_answer_split(test))

## 1. Qualitative Popularity

In [37]:
# Rank movies by weighted rating (implicit = False) using the training users.
model = POP(implicit = False)
model.fit(train)

In [38]:
# Recommend the same top-100 list to every test user and preview the result.
pred = model.predict(test_q, N = 100)
pred.head(5)

HBox(children=(FloatProgress(value=0.0, max=603.0), HTML(value='')))




Unnamed: 0,userId,movieId
0,5,"[318, 260, 527, 858, 1198, 50, 2762, 2858, 202..."
1,10,"[318, 260, 527, 858, 1198, 50, 2762, 2858, 202..."
2,15,"[318, 260, 527, 858, 1198, 50, 2762, 2858, 202..."
3,38,"[318, 260, 527, 858, 1198, 50, 2762, 2858, 202..."
4,42,"[318, 260, 527, 858, 1198, 50, 2762, 2858, 202..."


In [39]:
# Score the recommendations against the held-out answers at two cut-offs.
res = evaluate(test_a, pred)

for K in (20, 100):
    res(K = K)
    res.print_all()

HBox(children=(FloatProgress(value=0.0, max=603.0), HTML(value='')))


      Recall@20 : 0.05409 
   Precision@20 : 0.03483 
         MAP@20 : 0.01450 
         nRR@20 : 0.10017 
        nDCG@20 : 0.04632


HBox(children=(FloatProgress(value=0.0, max=603.0), HTML(value='')))


      Recall@100 : 0.18026 
   Precision@100 : 0.02881 
         MAP@100 : 0.02022 
         nRR@100 : 0.10874 
        nDCG@100 : 0.09652


## 2. Quantitative Popularity

In [40]:
# Rank movies by rating count (the default implicit = True) using the training users.
model = POP()
model.fit(train)

In [41]:
# Recommend the same top-100 list to every test user and preview the result.
pred = model.predict(test_q, N = 100)
pred.head(5)

HBox(children=(FloatProgress(value=0.0, max=603.0), HTML(value='')))




Unnamed: 0,userId,movieId
0,5,"[2858, 1196, 260, 1210, 480, 589, 2028, 1270, ..."
1,10,"[2858, 1196, 260, 1210, 480, 589, 2028, 1270, ..."
2,15,"[2858, 1196, 260, 1210, 480, 589, 2028, 1270, ..."
3,38,"[2858, 1196, 260, 1210, 480, 589, 2028, 1270, ..."
4,42,"[2858, 1196, 260, 1210, 480, 589, 2028, 1270, ..."


In [42]:
# Score the recommendations against the held-out answers at two cut-offs.
res = evaluate(test_a, pred)

for K in (20, 100):
    res(K = K)
    res.print_all()

HBox(children=(FloatProgress(value=0.0, max=603.0), HTML(value='')))


      Recall@20 : 0.06435 
   Precision@20 : 0.03864 
         MAP@20 : 0.01385 
         nRR@20 : 0.09888 
        nDCG@20 : 0.04960


HBox(children=(FloatProgress(value=0.0, max=603.0), HTML(value='')))


      Recall@100 : 0.22319 
   Precision@100 : 0.03295 
         MAP@100 : 0.02202 
         nRR@100 : 0.10850 
        nDCG@100 : 0.11288
