<a href="https://colab.research.google.com/github/ivoryRabbit/RecSys/blob/master/10_RankSVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Learning to Rank: SVM ver.

- [Learning to Rank: From Pairwise Approach to Listwise Approach](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-2007-40.pdf)

In [1]:
import glob
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from typing import Callable, Tuple, List

from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

from multiprocessing import Pool
from sklearn.svm import LinearSVC
from xgboost import XGBRanker
from lightgbm import LGBMRanker

from tensorflow.keras.utils import get_file
import zipfile

np.random.seed(777)

In [2]:
def load_data(data_size : str) -> pd.DataFrame:
    ''' Load a MovieLens ratings table, downloading the archive if needed.

    Parameters
    ----------
    data_size : one of '1m', '10m', '20m', '25m'.

    Returns
    -------
    DataFrame with columns user_id, movie_id, rating, timestamp.

    Raises
    ------
    ValueError if `data_size` is not one of the supported sizes.
    '''
    # (archive name, path of the ratings file inside the extracted archive)
    sources = {
        '1m': ('ml-1m.zip', 'ml-1m/ratings.dat'),
        '10m': ('ml-10m.zip', 'ml-10M100K/ratings.dat'),
        '20m': ('ml-20m.zip', 'ml-20m/ratings.csv'),
        '25m': ('ml-25m.zip', 'ml-25m/ratings.csv'),
    }
    if data_size not in sources:
        raise ValueError(f'unknown data_size {data_size!r}; expected one of {sorted(sources)}')
    fname, data = sources[data_size]

    # Only download and extract when the ratings file is not already present.
    if not glob.glob(data):
        origin = f'http://files.grouplens.org/datasets/movielens/{fname}'
        file = get_file(fname, origin)
        # Context manager guarantees the archive handle is closed.
        with zipfile.ZipFile(file, 'r') as zip_ref:
            zip_ref.extractall()

    col_names = ['user_id', 'movie_id', 'rating', 'timestamp']
    if data_size in ['20m', '25m']:
        # These archives ship a CSV with a header row; header = 0 replaces it
        # with our column names instead of reading it as a data row.
        ratings = pd.read_csv(data, header = 0, names = col_names)
    else:
        # The .dat files use '::' as separator (multi-char => python engine).
        # `sep` and `delimiter` are aliases; pandas rejects passing both.
        ratings = pd.read_csv(data, sep = '::', names = col_names, engine = 'python')
    print(ratings.shape)
    return ratings

In [3]:
# Load the MovieLens 1M ratings (downloaded and extracted first if the file is not present locally).
ratings = load_data('1m')
ratings.head()

(1000209, 4)


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# Report the number of distinct users and distinct movies in the ratings table.
print(f'# of users = {ratings.user_id.nunique()}')
print(f'# of items = {ratings.movie_id.nunique()}')

# of users = 6040
# of items = 3706


In [5]:
def binarizer(df: pd.DataFrame, threshold = 4) -> pd.DataFrame:
    ''' Turn explicit ratings into implicit positive feedback.

    Ratings greater than or equal to `threshold` become 1 and all others 0;
    only the rows labelled 1 are returned, with a fresh 0..n-1 index.
    The input frame is not modified.
    '''
    liked = np.where(df.rating >= threshold, 1, 0)
    binary = df.assign(rating = liked)
    keep = binary.rating > 0.0
    return binary.loc[keep].reset_index(drop = True)

def make_warm(df: pd.DataFrame, threshold = 5) -> pd.DataFrame: # remove cold starters
    ''' Keep only the rows of users that have at least `threshold` interactions.

    Interactions are counted as non-null `movie_id` entries per `user_id`.
    The result is re-indexed from 0.
    '''
    counts = df.groupby('user_id')['movie_id'].count()
    warm_users = counts[counts >= threshold].index
    is_warm = df.user_id.isin(warm_users)
    return df.loc[is_warm].reset_index(drop = True)

def train_valid_test_split(df: pd.DataFrame, size: float) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    ''' Split the interactions by user into train / validation / test frames.

    Users (not rows) are partitioned, so every user's rows land in exactly
    one of the three frames. `2 * size` of the users are held out and then
    halved into validation and test; both splits use random_state = 777.

    Parameters
    ----------
    df   : interactions with a `user_id` column.
    size : share of users for validation and, separately, for test. A float
           must satisfy 0 < size < 0.5; other values are passed through to
           `train_test_split` unchanged.

    Returns
    -------
    (train, valid, test), each re-indexed from 0.

    Raises
    ------
    ValueError if `size` is a float outside (0, 0.5).
    '''
    # 2 * size must be a valid fraction for the first split; fail early with
    # a message that names this function's own parameter.
    if isinstance(size, float) and not 0 < size < 0.5:
        raise ValueError(f'size must satisfy 0 < size < 0.5, got {size}')

    users = df.user_id.unique()
    train_user, holdout_user = train_test_split(users, test_size = 2 * size, random_state = 777)
    valid_user, test_user = train_test_split(holdout_user, test_size = 0.5, random_state = 777)

    train, valid, test = (
        df[df.user_id.isin(group)].reset_index(drop = True)
        for group in (train_user, valid_user, test_user)
    )
    return train, valid, test

def query_relev_split(df: pd.DataFrame, size: float) -> Tuple[pd.DataFrame, pd.DataFrame]:
    ''' Split each user's history by time into a query part and a relevant part.

    Within every user, rows are ranked by `timestamp` (ties broken by row
    order). Rows whose rank is below `count * (1 - size)` form the query
    set, the remaining rows form the relevant (held-out) set. Relevant rows
    of users that end up with no query rows are discarded.

    Returns
    -------
    (query, relev), both with the helper columns removed and re-indexed from 0.
    '''
    per_user = df.groupby('user_id')
    annotated = df.assign(
        seen_cnts = per_user['movie_id'].transform('count'),
        timeorder = per_user['timestamp'].rank(method = 'first', ascending = True),
    )
    cutoff = annotated.seen_cnts * (1-size)

    query = annotated[annotated.timeorder < cutoff]
    relev = annotated[annotated.timeorder >= cutoff]
    # A held-out item is only useful if the same user also has query items.
    users_with_query = query.user_id.unique()
    relev = relev[relev.user_id.isin(users_with_query)]

    def _finalize(part: pd.DataFrame) -> pd.DataFrame:
        return part.drop(columns = ['timeorder', 'seen_cnts']).reset_index(drop = True)

    return _finalize(query), _finalize(relev)

def list_agg(df: pd.DataFrame) -> pd.DataFrame:
    ''' Collapse the frame to one row per user holding the list of their movie ids.

    The result has the columns `user_id` and `movie_id`, where `movie_id`
    is a Python list per row.
    '''
    by_user = df.groupby('user_id', as_index = False)
    movies_per_user = by_user[['movie_id']].agg(list)
    return movies_per_user

In [6]:
# Keep ratings >= 4 as positive feedback (default threshold), then drop users
# with fewer than 5 positive interactions (default warm-start threshold).
data = binarizer(ratings)
data = make_warm(data)
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,1,978300760
1,1,3408,1,978300275
2,1,2355,1,978824291
3,1,1287,1,978302039
4,1,2804,1,978300719


In [7]:
# Split users 80/10/10 into train/valid/test; the validation part is not used here.
train, _, test = train_valid_test_split(data, size = 0.1)
# Per test user: rows before the 80% time-rank cutoff are the query, the rest are held out as relevant items.
test_q, test_r = query_relev_split(test, size = 0.2)

## How to generate item features from user feedback

- [Remark](https://github.com/JunsolKim/kakao-arena-melon-ysocio13)

---



For a given user $u \in U$ and an arbitrary item $i \in I$, define the item representation $\mathbb{x}_i = \mathbb{x}_i(u) \in \mathbb{R}^{|U|}$, with one component per user $v \in U$, as follows:

$$
\begin{aligned}
\mathbb{x}_{iv} := \left\{ \begin{array}{ll}
\frac{|I(u) \cap I(v)|}{|I(v)|^{\alpha}} & \text{if } v \in U(i) \\
0 & \text{o.w.}\\
\end{array} \right.
\end{aligned}
$$

where
- $U(i)$ is the set of users that include item $i$
- $I(u)$ is the set of items included by user $u$
- $\alpha$ is a hyperparameter

---

Next, define the similarity $s_{ij} = s_{ij}(u) \in \mathbb{R}$ between two items $i\in I(u)$ and $j\in I$ as follows:

$$ s_{ij} := \frac{\mathbb{x}_i^T\mathbb{x}_j}{|U(i)|^{\beta} |U(j)|^{1- \beta}} $$

where
- $\beta$ is a hyperparameter

---

Finally, for each item $j \in I$, predict $y_j \in \{0, 1\}$ from the input $r_j \in \mathbb{R}^{|I(u)|}$ using a classifier or ranker, where

- $\ r_{j} =\big( s_{1j}, \ s_{2j}, \ldots, \ s_{|I(u)|j}\big)$

- $\begin{aligned}
y_{j} = \left\{ \begin{array}{ll}
1 & \text{if } j \in I(u) \\
0 & \text{o.w.}\\
\end{array} \right.
\end{aligned}
$

In [8]:
class RankSVM:
    ''' Per-user re-ranker built on item-item similarities.

    `fit` stores a binary user-item matrix from the training interactions.
    `predict` then trains one small model per query user: every item is a
    sample whose features are its similarities to the user's profile items
    and whose label says whether the item is in the profile. Items are
    ranked by the model's score, excluding the profile items themselves.

    Parameters
    ----------
    alpha : exponent applied to the number of items of each training user.
    beta  : exponent balancing the popularity of the two items in a similarity.
    ranker_type : 'SVM' (LinearSVC), 'LGB' (LGBMRanker) or 'XGB' (XGBRanker).
                  Any other value raises KeyError.
    '''
    def __init__(self, alpha: float, beta: float, ranker_type = 'SVM'):
        self.alpha = alpha
        self.beta = beta
        self.ranker_type = ranker_type
        self.ranking_model = {
            'LGB': LGBMRanker, 
            'XGB': XGBRanker, 
            'SVM': LinearSVC
        }[self.ranker_type]

    def fit(self, df: pd.DataFrame):
        ''' Build the id <-> index maps and the binary user-item matrix.

        `df` needs the columns `user_id` and `movie_id`. Repeated
        (user, movie) rows are counted once so the matrix stays 0/1 and its
        row/column sums are set sizes.
        '''
        # csr_matrix sums duplicate coordinates, which would produce entries > 1.
        df = df.drop_duplicates(subset = ['user_id', 'movie_id'])

        self.user_idx2id = df.user_id.unique() # list[index] = id
        self.user_id2idx = {e: i for i, e in enumerate(self.user_idx2id)} # dict[id] = index
        self.item_idx2id = df.movie_id.unique() # list[index] = id
        self.item_id2idx = {e: i for i, e in enumerate(self.item_idx2id)} # dict[id] = index
        self.n_user = len(self.user_idx2id)
        self.n_item = len(self.item_idx2id)

        row = df.user_id.map(self.user_id2idx).to_numpy()
        col = df.movie_id.map(self.item_id2idx).to_numpy()
        val = np.ones(shape = row.size)

        # Explicit shape: do not rely on the largest index seen to size the matrix.
        self.mat = csr_matrix((val, (row, col)), shape = (self.n_user, self.n_item), dtype = np.float32)

        # Row/column sums do not depend on the query user; compute them once
        # here instead of once per rerank() call.
        self._users_item = self.mat.sum(axis = 1) # (n_user, 1): items per user
        self._items_user = self.mat.sum(axis = 0) # (1, n_item): users per item

    def rerank(self, profile):
        ''' Rank all items for one user.

        `profile` is the list of item indices (not ids) the user interacted
        with. Returns the ids of the top `self.N` items outside the profile,
        best first. `self.N` is set by `predict`.
        '''
        query = np.zeros(shape = (self.n_item, ), dtype = np.float32)
        pos = np.array(profile)
        query[pos] = 1.0

        users_item = self._users_item
        items_user = self._items_user
        profile_cols = self.mat[:, pos]

        # Per training user v: (number of profile items v has) / |I(v)|^(2 * alpha).
        # NOTE(review): the markdown formula for x_i^T x_j implies the overlap
        # enters squared; here it enters linearly. Confirm which is intended.
        weight = profile_cols.multiply(np.power(users_item, -2*self.alpha)).sum(axis = 1)

        # value[i, j] = sum over users having both profile item i and item j of weight[v].
        value = profile_cols.multiply(weight).T.dot(self.mat)
        # Popularity normalisation; 1e-1 keeps the powers finite for items with no users.
        value = value.multiply(np.power(1e-1 + items_user[:, pos], -self.beta).reshape(-1, 1))
        value = value.multiply(np.power(1e-1 + items_user, self.beta-1))
        sim = value.T.tocsr() # (n_item, len(profile)): one sample per item

        if self.ranker_type == 'SVM':
            ranker = self.ranking_model(tol = 1e-6, max_iter = 500000)
            ranker.fit(sim, query)
            pred_batch = ranker.decision_function(sim)
        else:
            ranker = self.ranking_model(n_jobs = -1)
            # Learning-to-rank estimators need the query grouping; all items
            # of this user form one single query group.
            ranker.fit(sim, query, group = [sim.shape[0]])
            pred_batch = ranker.predict(sim)
        
        # Never recommend what the user already has.
        pred_batch[pos] = -np.inf

        rec = np.argsort(pred_batch)[:-self.N-1:-1]
        rec = self.item_idx2id[rec]
        return rec

    def predict(self, df:pd.DataFrame, N: int, n_jobs = None) -> pd.DataFrame:
        ''' Recommend `N` items for every user in `df`.

        Parameters
        ----------
        df     : query interactions with `user_id` and `movie_id`; movies
                 unknown to the fitted model are dropped.
        N      : number of recommendations per user.
        n_jobs : number of worker processes; None lets multiprocessing use
                 the machine's CPU count.

        Returns
        -------
        DataFrame with `user_id` and `movie_id` (array of recommended ids).
        Users whose query items are all unknown do not appear in the result.
        '''
        self.N = N # read by rerank() inside the worker processes
        df = df.assign(movie_id = df.movie_id.map(self.item_id2idx)).dropna()
        df = df.assign(movie_id = df.movie_id.astype(int))
        # Repeated items would become duplicated feature columns in rerank().
        df = df.drop_duplicates(subset = ['user_id', 'movie_id'])
        df = list_agg(df)
        profile = df['movie_id']
        
        # The context manager tears the pool down even if a worker raises.
        with Pool(n_jobs) as pool:
            results = pool.map(self.rerank, profile)

        pred = [{'user_id': user_id, 'movie_id': rec}
                for user_id, rec in zip(df['user_id'], results)]

        return pd.DataFrame(pred)

In [9]:
class evaluate:
    ''' Top-K ranking metrics for per-user recommendation lists.

    Parameters
    ----------
    true : frame with one row per user; `movie_id` holds the relevant items.
    pred : frame with one row per user; `movie_id` holds the ranked
           recommendations, best first.

    When both frames carry a `user_id` column the rows are matched on it, and
    a user without predictions is scored as an empty recommendation list.
    Otherwise rows are paired by position. Relevant items are treated as a
    set (duplicates count once).

    Calling the instance with a cutoff `K` fills the attributes `recall`,
    `precision`, `MAP`, `HR`, `MRR`, `nDCG`, `CO` (coverage) and `ED`
    (entropy-diversity); `print_all` prints them.
    '''
    def __init__(self, true: pd.DataFrame, pred: pd.DataFrame):
        self.true = true
        self.pred = pred
        self.max_K = 10000
        # idcg[k-1] = DCG of an ideal ranking with k relevant items on top.
        self.idcg = np.cumsum([1.0 / np.log2(i+2) for i in range(self.max_K)])

    def _n_ideal(self, gt: List, K = None) -> int:
        ''' Largest number of hits achievable within the cutoff (K = None: no cutoff). '''
        return len(gt) if K is None else min(K, len(gt))

    def _recall(self, gt: List, rec: List, K = None) -> float: # Recall
        best = self._n_ideal(gt, K)
        if best == 0:
            return 0.0
        hits = sum(1 for r in rec[:K] if r in gt)
        return hits / best
    
    def _precision(self, gt: List, rec: List, K = None) -> float: # Precision
        top = rec[:K]
        if len(top) == 0:
            return 0.0
        hits = sum(1 for r in top if r in gt)
        return hits / len(top)

    def _AP(self, gt: List, rec: List, K = None) -> float: # Average Precision
        best = self._n_ideal(gt, K)
        if best == 0:
            return 0.0
        res = 0.0
        hits = 0
        for i, r in enumerate(rec[:K]):
            if r in gt:
                # Running hit count gives precision@(i+1) without rescanning the prefix.
                hits += 1
                res += hits / (i+1)
        return res / best

    def _HR(self, gt: List, rec: List, K = None) -> float: # Hit Rate
        for r in rec[:K]:
            if r in gt:
                return 1.0
        return 0.0

    def _RR(self, gt: List, rec: List, K = None) -> float: # Reciprocal Rank
        for i, r in enumerate(rec[:K]):
            if r in gt:
                return 1.0 / (i+1)
        return 0.0

    def _nDCG(self, gt: List, rec: List, K = None) -> float: # normalized Discounted Cumulative Gain
        best = self._n_ideal(gt, K)
        if best == 0:
            return 0.0
        dcg = 0.0
        for i, r in enumerate(rec[:K]):
            if r in gt:
                dcg += 1.0 / np.log2(i+2)
        idcg = self.idcg[best-1]
        return dcg / idcg

    def _get_item(self, rec: List, K = None) -> None:
        ''' Count how often each item is recommended within the cutoff. '''
        for r in rec[:K]:
            self.uniq_item[r] = self.uniq_item.get(r, 0) + 1

    def _paired_lists(self) -> List:
        ''' Return (relevant items, recommendations) pairs, one per user of `true`. '''
        if 'user_id' in self.true.columns and 'user_id' in self.pred.columns:
            # Match on user id so that a user missing from `pred`, or a
            # different row order, cannot shift every following comparison.
            rec_by_user = dict(zip(self.pred.user_id, self.pred.movie_id))
            return [(gt, rec_by_user.get(user_id, []))
                    for user_id, gt in zip(self.true.user_id, self.true.movie_id)]
        return list(zip(self.true.movie_id, self.pred.movie_id))

    def __call__(self, K = None):
        ''' Compute all metrics at cutoff `K` (None: use `max_K`, i.e. effectively no cutoff). '''
        self.K = K if K else self.max_K
        K = self.K # the effective cutoff is used everywhere below
        self.recall = 0.0
        self.precision = 0.0
        self.MAP = 0.0
        self.HR = 0.0
        self.MRR = 0.0
        self.nDCG = 0.0
        self.uniq_item = {}
        n = self.true.index.size
        for gt, rec in tqdm(self._paired_lists()):
            gt = set(gt) # constant-time membership tests
            self.recall += self._recall(gt, rec, K) / n
            self.precision += self._precision(gt, rec, K) / n
            self.MAP += self._AP(gt, rec, K) / n
            self.HR += self._HR(gt, rec, K) / n
            self.MRR += self._RR(gt, rec, K) / n
            self.nDCG += self._nDCG(gt, rec, K) / n
            self._get_item(rec, K)
        # Number of recommendation slots actually filled; equals n*K whenever
        # every user received at least K recommendations.
        total = sum(self.uniq_item.values())
        if total == 0:
            self.CO = 0.0
            self.ED = 0.0
        else:
            self.CO = len(self.uniq_item) / total # Coverage
            self.ED = -np.sum([cnt * np.log(cnt / total) for cnt in self.uniq_item.values()]) / total # Entropy-Diversity

    def print_all(self):
        ''' Print the metrics computed by the most recent call. '''
        K = '@' + str(self.K) if self.K else ''
        print(f'{"Recall":>12}{K} : {self.recall:.5f}',
              f'\n{"Precision":>12}{K} : {self.precision:.5f}',
              f'\n{"MAP":>12}{K} : {self.MAP:.5f}',
              f'\n{"HR":>12}{K} : {self.HR:.5f}', 
              f'\n{"MRR":>12}{K} : {self.MRR:.5f}',
              f'\n{"nDCG":>12}{K} : {self.nDCG:.5f}',
              f'\n{"CO":>12}{K} : {self.CO:.5f}',
              f'\n{"ED":>12}{K} : {self.ED:.5f}')

In [10]:
# fit() only builds the id maps and the user-item matrix from the training users;
# the per-user rankers are trained later, inside predict().
model = RankSVM(alpha = 0.6, beta = 0.3, ranker_type = 'SVM')
model.fit(train)

In [11]:
# Ground truth: one row per test user with the list of held-out movie ids.
true = list_agg(test_r)
true.head(5)

Unnamed: 0,user_id,movie_id
0,3,"[1394, 104, 1079, 1259, 2355, 3552, 1304, 2081]"
1,9,"[2268, 1682, 2278, 590, 524, 529, 2294, 1921, ..."
2,42,"[3421, 3868, 1257, 2997, 2134, 1265, 593, 2143..."
3,79,"[3044, 3176, 2686, 2024]"
4,92,"[2054, 2134, 733, 592, 2072, 3107, 2162, 1291,..."


In [12]:
# Recommend 100 movies per test user from their query items (the query items themselves are excluded).
pred = model.predict(test_q, N = 100)
pred.head(5)

Unnamed: 0,user_id,movie_id
0,3,"[1240, 2571, 1036, 589, 110, 1270, 592, 2628, ..."
1,9,"[296, 2997, 2396, 110, 2599, 2329, 356, 3578, ..."
2,42,"[2916, 2571, 2528, 780, 1527, 1580, 1676, 2968..."
3,79,"[3300, 3555, 3354, 3578, 3785, 3753, 3623, 374..."
4,92,"[2078, 594, 2087, 1032, 1029, 919, 3034, 2918,..."


In [13]:
# Compare the held-out items with the recommendations.
scores = evaluate(true, pred)

# Metrics over the top 10 recommendations.
scores(K = 10)
scores.print_all()

# Metrics over the full list of 100 recommendations.
scores(K = 100)
scores.print_all()

HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))


      Recall@10 : 0.11874 
   Precision@10 : 0.09470 
         MAP@10 : 0.05153 
          HR@10 : 0.50662 
         MRR@10 : 0.20790 
        nDCG@10 : 0.11021 
          CO@10 : 0.12781 
          ED@10 : 5.69472


HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))


      Recall@100 : 0.41134 
   Precision@100 : 0.05833 
         MAP@100 : 0.06523 
          HR@100 : 0.95530 
         MRR@100 : 0.22744 
        nDCG@100 : 0.22326 
          CO@100 : 0.03303 
          ED@100 : 6.49351
