<a href="https://colab.research.google.com/github/ivoryRabbit/RecSys/blob/master/3.EASE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EASE

- [Embarrassingly Shallow Autoencoders for Sparse Data](https://arxiv.org/pdf/1905.03375v1.pdf)

## Experiment

In [17]:
import glob
import numpy as np
import pandas as pd
from typing import Callable, Tuple, List
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

from tensorflow.keras.utils import get_file
import zipfile

In [18]:
def load_data(data_size : str) -> pd.DataFrame:
    """Load a MovieLens ratings table, downloading the archive when missing.

    Parameters
    ----------
    data_size : one of '1m', '10m', '20m', '25m'.

    Returns
    -------
    The ratings table. For '1m' and '10m' the columns are named
    userId, movieId, rating, timestamp; for '20m' and '25m' the header
    row of the CSV file is used.

    Raises
    ------
    ValueError : if ``data_size`` is not one of the supported sizes.
    """
    # data_size -> (archive name, path of the ratings file inside the archive)
    sources = {
        '1m': ('ml-1m.zip', 'ml-1m/ratings.dat'),
        '10m': ('ml-10m.zip', 'ml-10M100K/ratings.dat'),
        '20m': ('ml-20m.zip', 'ml-20m/ratings.csv'),
        '25m': ('ml-25m.zip', 'ml-25m/ratings.csv'),
    }
    if data_size not in sources:
        raise ValueError(
            f'unknown data_size {data_size!r}; expected one of {sorted(sources)}'
        )
    fname, data = sources[data_size]

    # Only download/extract when the ratings file is not already on disk.
    if not glob.glob(data):
        origin = f'http://files.grouplens.org/datasets/movielens/{fname}'
        file = get_file(fname, origin)
        # Context manager so the archive handle is closed after extraction.
        with zipfile.ZipFile(file, 'r') as zip_ref:
            zip_ref.extractall()

    col_names = ['userId', 'movieId', 'rating', 'timestamp']
    if data_size in ['20m', '25m']:
        ratings = pd.read_csv(data, engine='python')
    else:
        # The .dat files have no header and use '::' as field separator.
        # `sep` and `delimiter` are aliases, so only one may be given.
        ratings = pd.read_csv(data, sep='::', names=col_names, engine='python')
    print(ratings.shape)
    return ratings

In [19]:
# Load the MovieLens 1M ratings (load_data prints the table shape),
# then preview the first rows.
ratings = load_data('1m')
ratings.head()

(1000209, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [20]:
# Number of distinct users and items in the full ratings table.
n_user, n_item = (ratings[col].nunique() for col in ('userId', 'movieId'))
print(f'# of user = {n_user}')
print(f'# of item = {n_item}')

# of user = 6040
# of item = 3706


In [21]:
def train_valid_test_split(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split the ratings by user into train / valid / test parts.

    The distinct userIds are split 80% / 20% with a fixed seed, and the 20%
    hold-out is split again in half, so every user's rows land in exactly
    one of the three parts.

    Returns
    -------
    (train, valid, test), each re-indexed from 0.
    """
    train_user, holdout_user = train_test_split(df.userId.unique(), test_size=0.2, random_state=777)
    valid_user, test_user = train_test_split(holdout_user, test_size=0.5, random_state=777)

    train, valid, test = (
        df[df.userId.isin(users)].reset_index(drop=True)
        for users in (train_user, valid_user, test_user)
    )
    return train, valid, test

def query_answer_split(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split each user's ratings chronologically into a query and an answer part.

    Within every user, rows are ranked by ``timestamp`` (ties broken by row
    order). Rows whose rank is strictly below 80% of that user's rating count
    form the query; the remaining, most recent rows form the answer. Users
    that end up with no query rows are removed from the answer as well.

    Returns
    -------
    (query, answer) with the same columns as ``df``, each re-indexed from 0.
    """
    per_user = df.groupby(by='userId')
    timeorder = per_user['timestamp'].rank(method='first', ascending=True)
    # transform keeps df's own index, so the counts stay aligned with
    # `timeorder` whatever index df carries (a merge would reset the index).
    seen_cnts = per_user['movieId'].transform('count')

    is_query = timeorder < seen_cnts * 0.8
    query = df[is_query]
    answer = df[~is_query]
    # An answer is only usable when the same user also has a query.
    answer = answer[answer.userId.isin(query.userId.unique())]
    return query.reset_index(drop=True), answer.reset_index(drop=True)

def cut_off(df: pd.DataFrame, threshold = 4) -> pd.DataFrame:
    """Keep the rows whose rating is at least ``threshold``, re-indexed from 0."""
    is_positive = df.rating >= threshold
    kept = df[is_positive]
    return kept.reset_index(drop=True)

def make_warm(df: pd.DataFrame, threshold = 5) -> pd.DataFrame: # remove cold starters
    """Drop users who rated fewer than ``threshold`` distinct movies.

    Returns the remaining rows re-indexed from 0.
    """
    items_per_user = df.groupby('userId')['movieId'].nunique()
    warm_users = items_per_user[items_per_user >= threshold].index
    is_warm = df.userId.isin(warm_users)
    return df[is_warm].reset_index(drop=True)

def list_aggregation(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the table to one row per user, with list-valued movieId and rating columns."""
    per_user = df.groupby('userId', as_index=False)
    listed_columns = ['movieId', 'rating']
    return per_user[listed_columns].agg(list)

In [22]:
class EASE:
    def __init__(self, n_user: int, n_item: int, vanilla = False):
        self.n_user = n_user
        self.n_item = n_item
        self.vanilla = vanilla

    def id2idx(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(userId = lambda x: x.userId.map(self.user_id2idx), 
                         movieId = lambda x: x.movieId.map(self.item_id2idx))

    def idx2id(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(userId = lambda x: x.userId.apply(lambda x: self.user_idx2id[x]), 
                         movieId = lambda x: x.movieId.apply(lambda x: self.item_idx2id[x]))

    def fit(self, df: pd.DataFrame, lamb = None):
        self.user_idx2id = df.userId.unique() # list[index] = id
        self.user_id2idx = {e: i for i, e in enumerate(self.user_idx2id)} # dict[id] = index
        self.item_idx2id = df.movieId.unique() # list[index] = id
        self.item_id2idx = {e: i for i, e in enumerate(self.item_idx2id)} # dict[id] = index
        _train = self.id2idx(df)

        row = _train.userId
        col = _train.movieId
        data = np.ones(shape = _train.index.size)
        shape = (len(self.user_idx2id), self.n_item)

        mat = csr_matrix((data, (row, col)), shape = shape)
        self.G = mat.T.dot(mat).toarray()
        diag_idx = np.diag_indices(self.G.shape[0])
        if self.vanilla:
            self.G[diag_idx] = 0
        else:
            self.G[diag_idx] += lamb
            P = np.linalg.inv(self.G)
            self.B = - P / np.diag(P)
            self.B[diag_idx] = 0

    def predict(self, df:pd.DataFrame, batch_size:int, N: int) -> List:
        n_row = df.index.size

        Ids = np.arange(n_row)
        profile = df.movieId.apply(lambda x: [self.item_id2idx.get(y) for y in x if self.item_id2idx.get(y)])

        pred = []
        steps = int(np.ceil(n_row / batch_size))
        for batch_step in tqdm(range(steps)):
            lower = batch_step * batch_size
            upper = lower + batch_size

            batch_Id = Ids[lower: upper]
            y_true = np.zeros(shape = (batch_Id.size, self.n_item))
            for i, idx in enumerate(batch_Id):
                y_true[i, profile[idx]] = 1.0

            if self.vanilla:
                y_pred = y_true.dot(self.G)
            else:
                y_pred = y_true.dot(self.B)
        
            cand = np.argsort(np.where(y_true == 1.0, -1.0, y_pred), axis = -1)[:, :-N-1:-1]
            cand = self.item_idx2id[cand]
            pred.append(cand)
        pred = np.concatenate(pred)
        pred = [{'userId': df.at[i, 'userId'], 'movieId': pred[i]} for i in range(n_row)]
        return pd.DataFrame(pred)

In [23]:
class evaluate:
    """Top-K ranking metrics averaged over users.

    ``true`` and ``pred`` each hold one row per user with a list-like
    ``movieId`` column (ground-truth items and ranked recommendations);
    rows are paired by position. Calling the instance computes recall,
    precision, MAP, MRR and nDCG at the given K and stores them as attributes.
    """

    def __init__(self, true: pd.DataFrame, pred: pd.DataFrame):
        self.true = true
        self.pred = pred
        self.max_K = 10000
        # idcg[k-1] is the DCG of a ranking whose first k items are all hits.
        self.idcg = np.cumsum(1.0 / np.log(np.arange(2, self.max_K + 2)))

    def _recall(self, gt: List, rec: List, K = None) -> float:
        """Hits in rec[:K] divided by min(K, len(gt)); 0.0 when gt is empty."""
        K = K if K else self.max_K
        denom = min(K, len(gt))
        if denom == 0:
            return 0.0
        relevant = set(gt)
        hits = sum(1 for r in rec[:K] if r in relevant)
        return hits / denom

    def _precision(self, gt: List, rec: List, K = None) -> float:
        """Hits in rec[:K] divided by len(rec[:K]); 0.0 when rec is empty."""
        K = K if K else self.max_K
        top = rec[:K]
        if len(top) == 0:
            return 0.0
        relevant = set(gt)
        hits = sum(1 for r in top if r in relevant)
        return hits / len(top)

    def _AP(self, gt: List, rec: List, K = None) -> float: # Average Precision
        """Sum of precision@(i+1) over hit positions i, divided by min(K, len(gt))."""
        K = K if K else self.max_K
        denom = min(K, len(gt))
        if denom == 0:
            return 0.0
        relevant = set(gt)
        hits = 0
        res = 0.0
        for i, r in enumerate(rec[:K]):
            if r in relevant:
                hits += 1
                # precision@(i+1) equals the running hit count over (i+1)
                res += hits / (i + 1)
        return res / denom

    def _RR(self, gt: List, rec: List, K = None) -> float: # Reciprocal Rank
        """1 / rank of the first hit within rec[:K], or 0 when there is none."""
        K = K if K else self.max_K
        relevant = set(gt)
        for i, r in enumerate(rec[:K]):
            if r in relevant:
                return 1.0 / (i+1)
        return 0

    def _nDCG(self, gt: List, rec: List, K = None) -> float: # normalized Discounted Cumulative Gain
        """DCG of rec[:K] with binary gains, normalised by the ideal DCG."""
        K = K if K else self.max_K
        # Clamp to the precomputed table; 0 would otherwise index idcg[-1].
        n_ideal = min(len(gt), K, self.max_K)
        if n_ideal == 0:
            return 0.0
        relevant = set(gt)
        dcg = 0.0
        for i, r in enumerate(rec[:K]):
            if r in relevant:
                dcg += 1.0 / np.log(i+2)
        return dcg / self.idcg[n_ideal - 1]

    def __call__(self, K = None):
        """Compute all metrics at cut-off K (max_K when K is None or 0).

        Raises
        ------
        ValueError : if ``true`` and ``pred`` have different numbers of rows,
            since rows are paired by position.
        """
        n = self.true.index.size
        if n != self.pred.index.size:
            raise ValueError(
                f'true has {n} rows but pred has {self.pred.index.size}'
            )
        self.K = K
        self.recall = 0.0
        self.precision = 0.0
        self.MAP = 0.0
        self.MRR = 0.0
        self.nDCG = 0.0
        for gt, rec in zip(tqdm(self.true.movieId), self.pred.movieId):
            self.recall += self._recall(gt, rec, K) / n
            self.precision += self._precision(gt, rec, K) / n
            self.MAP += self._AP(gt, rec, K) / n
            self.MRR += self._RR(gt, rec, K) / n
            self.nDCG += self._nDCG(gt, rec, K) / n

    def print_all(self):
        """Print the metrics computed by the last call, labelled with @K."""
        K = '@' + str(self.K) if self.K else ''
        print(f'{"Recall":>12}{K} : {self.recall:.5f}',
              f'\n{"Precision":>12}{K} : {self.precision:.5f}',
              f'\n{"MAP":>12}{K} : {self.MAP:.5f}',
              f'\n{"MRR":>12}{K} : {self.MRR:.5f}',
              f'\n{"nDCG":>12}{K} : {self.nDCG:.5f}')

In [24]:
# Keep only the ratings that pass cut_off's default threshold.
ratings = cut_off(ratings)

# Split by user (the validation part is discarded), then drop cold-start
# users from the training part only.
train, _, test = train_valid_test_split(ratings)
train = make_warm(train)

# Per test user: chronological query / answer split, aggregated to lists.
test_q, test_a = map(list_aggregation, query_answer_split(test))

### 1. Vanilla item-based CF

In [25]:
# Baseline: with vanilla = True, fit() keeps the item co-occurrence matrix
# (diagonal zeroed) as the item-item weights, so no lamb is needed.
model = EASE(n_user, n_item, vanilla = True)
model.fit(train)

In [26]:
# Top-100 recommendations for every test user's query profile,
# scored in batches of 128 users; preview the first rows.
pred = model.predict(test_q, batch_size = 128, N = 100)
pred.head(5)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




Unnamed: 0,userId,movieId
0,5,"[1196, 260, 318, 2762, 1617, 1198, 2028, 527, ..."
1,10,"[1196, 260, 2858, 2571, 593, 2028, 318, 608, 5..."
2,15,"[1196, 260, 593, 589, 608, 1210, 296, 1270, 16..."
3,45,"[260, 1196, 2858, 1198, 2396, 593, 318, 1197, ..."
4,60,"[1196, 260, 2571, 2858, 1198, 2762, 2028, 1210..."


In [27]:
# Score the recommendations against the held-out answers at two cut-offs.
res = evaluate(test_a, pred)

for top_k in (20, 100):
    res(K = top_k)
    res.print_all()

HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))


      Recall@20 : 0.12569 
   Precision@20 : 0.07666 
         MAP@20 : 0.04343 
         nRR@20 : 0.24152 
        nDCG@20 : 0.11315


HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))


      Recall@100 : 0.29400 
   Precision@100 : 0.04725 
         MAP@100 : 0.05232 
         nRR@100 : 0.24773 
        nDCG@100 : 0.17989


### 2. EASE

In [28]:
# EASE proper: fit() adds lamb to the diagonal of the item Gram matrix,
# inverts it and derives the weight matrix B from the inverse.
model = EASE(n_user, n_item)
model.fit(train, lamb = 3000)

In [29]:
# Top-100 recommendations for every test user's query profile,
# scored in batches of 128 users; preview the first rows.
pred = model.predict(test_q, batch_size = 128, N = 100)
pred.head(5)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




Unnamed: 0,userId,movieId
0,5,"[1617, 2762, 223, 2396, 3160, 318, 527, 778, 1..."
1,10,"[1097, 260, 1196, 2797, 595, 1136, 2406, 2987,..."
2,15,"[593, 589, 1617, 733, 2959, 296, 150, 1784, 60..."
3,45,"[2396, 1307, 539, 597, 898, 356, 17, 2797, 267..."
4,60,"[3114, 1580, 480, 780, 733, 2762, 648, 2006, 1..."


In [30]:
# Score the recommendations against the held-out answers at two cut-offs.
res = evaluate(test_a, pred)

for top_k in (20, 100):
    res(K = top_k)
    res.print_all()

HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))


      Recall@20 : 0.17100 
   Precision@20 : 0.09354 
         MAP@20 : 0.05836 
         nRR@20 : 0.26802 
        nDCG@20 : 0.14410


HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))


      Recall@100 : 0.42275 
   Precision@100 : 0.06129 
         MAP@100 : 0.07516 
         nRR@100 : 0.27492 
        nDCG@100 : 0.24365
