<a href="https://colab.research.google.com/github/ivoryRabbit/RecSys/blob/master/3_EASE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EASE

- [Embarrassingly Shallow Autoencoders for Sparse Data](https://arxiv.org/pdf/1905.03375v1.pdf)

## Experiment

In [1]:
import glob
import numpy as np
import pandas as pd
from typing import Callable, Tuple, List
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

from tensorflow.keras.utils import get_file
import zipfile

In [2]:
def load_data(data_size : str) -> pd.DataFrame:
    '''Load a MovieLens ratings table, downloading the archive when needed.

    Parameters
    ----------
    data_size : one of '1m', '10m', '20m', '25m'.

    Returns
    -------
    DataFrame with the columns user_id, movie_id, rating, timestamp
    (one row per rating). The shape is printed as a side effect.

    Raises
    ------
    ValueError : if data_size is not one of the supported names.
    '''
    # archive name -> path of the ratings file once the archive is extracted
    sources = {
        '1m': ('ml-1m.zip', 'ml-1m/ratings.dat'),
        '10m': ('ml-10m.zip', 'ml-10M100K/ratings.dat'),
        '20m': ('ml-20m.zip', 'ml-20m/ratings.csv'),
        '25m': ('ml-25m.zip', 'ml-25m/ratings.csv'),
    }
    if data_size not in sources:
        raise ValueError(
            f'unknown data_size {data_size!r}; expected one of {sorted(sources)}'
        )
    fname, data = sources[data_size]

    # Only hit the network when the extracted ratings file is not already
    # present in the working directory.
    if not glob.glob(data):
        origin = f'http://files.grouplens.org/datasets/movielens/{fname}'
        file = get_file(fname, origin)
        with zipfile.ZipFile(file, 'r') as zip_ref:  # close the handle afterwards
            zip_ref.extractall()

    col_names = ['user_id', 'movie_id', 'rating', 'timestamp']
    if data_size in ['20m', '25m']:
        # These files are comma-separated with their own header row; replace
        # it so every dataset exposes the same column names.
        ratings = pd.read_csv(data, header = 0, names = col_names, engine = 'python')
    else:
        # '::' is a multi-character separator, which needs the python engine.
        # pandas rejects `sep` and `delimiter` given together, so pass one.
        ratings = pd.read_csv(data, sep = '::', names = col_names, engine = 'python')
    print(ratings.shape)
    return ratings

In [3]:
# Load the MovieLens 1M ratings (downloaded and extracted on first use) and
# preview the first rows.
ratings = load_data('1m')
ratings.head()

(1000209, 4)


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# Dense 0-based index for every distinct user id, in order of first appearance.
user_list = ratings['user_id'].unique()
n_user = len(user_list)
user_idx_map = dict(zip(user_list, range(n_user)))
print(f'# of user = {n_user}')

# Same kind of mapping for the movie ids.
item_list = ratings['movie_id'].unique()
n_item = len(item_list)
item_idx_map = dict(zip(item_list, range(n_item)))
print(f'# of item = {n_item}')

# of user = 6040
# of item = 3706


In [5]:
def binarizer(df: pd.DataFrame, threshold = 4) -> pd.DataFrame:
    '''Turn explicit ratings into implicit positive feedback.

    Ratings at or above `threshold` become 1; every other row is discarded.
    The input frame is not modified and the result gets a fresh 0..n-1 index.
    '''
    liked = df.rating >= threshold
    flagged = df.assign(rating = np.where(liked, 1, 0))
    positives = flagged.loc[flagged.rating > 0.0]
    return positives.reset_index(drop = True)

def make_warm(df: pd.DataFrame, threshold = 5) -> pd.DataFrame:
    '''Remove cold-start users.

    Keeps only the rows of users that have at least `threshold` rows with a
    movie_id; the result gets a fresh 0..n-1 index.
    '''
    interactions = df.groupby('user_id')['movie_id'].count()
    warm_users = interactions.index[interactions >= threshold]
    keep = df.user_id.isin(warm_users)
    return df.loc[keep].reset_index(drop = True)

def train_valid_test_split(df: pd.DataFrame, size: float) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    '''Split the interactions by user into train / validation / test frames.

    Users (not rows) are partitioned, so every user's rows end up in exactly
    one of the three frames. A fraction `size` of the users is requested for
    validation and the same fraction for test; the rest is used for training.
    Both splits use a fixed random_state, so the partition is reproducible.

    Returns
    -------
    (train, valid, test), each with a fresh 0..n-1 index.
    '''
    users = df.user_id.unique()
    # Carve out 2 * size of the users first, then halve that hold-out set.
    train_user, holdout_user = train_test_split(users, test_size = 2 * size, random_state = 777)
    valid_user, test_user = train_test_split(holdout_user, test_size = 0.5, random_state = 777)

    train, valid, test = (
        df[df.user_id.isin(part)].reset_index(drop = True)
        for part in (train_user, valid_user, test_user)
    )
    return train, valid, test

def query_relev_split(df: pd.DataFrame, size: float) -> Tuple[pd.DataFrame, pd.DataFrame]:
    '''Split each user's history chronologically into query and relevant parts.

    Within every user the rows are ranked by timestamp (ties broken by row
    order). Rows whose rank is below `count * (1 - size)` form the query
    (the profile fed to the model); the remaining, most recent rows form the
    relevant set (the items to be predicted). Users that end up with an empty
    query are removed from the relevant set as well.

    Returns
    -------
    (query, relev), both with the input's columns and a fresh 0..n-1 index.
    '''
    by_user = df.groupby('user_id')
    timeorder = by_user['timestamp'].rank(method = 'first', ascending = True)
    seen_cnts = by_user['movie_id'].transform('count')

    # Ranks start at 1, so a strict comparison keeps the earliest rows.
    is_query = timeorder < seen_cnts * (1 - size)
    query = df[is_query]
    relev = df[~is_query]
    # A relevant set is useless without a profile to predict from.
    relev = relev[relev.user_id.isin(query.user_id.unique())]
    return query.reset_index(drop = True), relev.reset_index(drop = True)

def list_agg(df: pd.DataFrame) -> pd.DataFrame:
    '''Collapse the frame to one row per user holding the list of movie ids.'''
    per_user = df.groupby('user_id', as_index = False)
    movie_lists = per_user[['movie_id']].agg(list)
    return movie_lists

In [6]:
# Keep only positive feedback (binarizer's default threshold is rating >= 4),
# then drop users with fewer than 5 positives (make_warm's default threshold).
data = binarizer(ratings)
data = make_warm(data)
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,1,978300760
1,1,3408,1,978300275
2,1,2355,1,978824291
3,1,1287,1,978302039
4,1,2804,1,978300719


In [7]:
# Partition the users: 10% for validation, 10% for test, the rest for training.
train, valid, test = train_valid_test_split(data, size = 0.1)
# For each held-out user the earlier interactions form the query profile and
# roughly the latest 20% are the relevant items the model should recover.
valid_q, valid_r = query_relev_split(valid, size = 0.2)
test_q, test_r = query_relev_split(test, size = 0.2)

In [8]:
class EASE:
    def __init__(self, n_user: int, n_item: int, vanilla = False):
        self.n_user = n_user
        self.n_item = n_item
        self.vanilla = vanilla

    def fit(self, df: pd.DataFrame, lamb = None):
        self.user_idx2id = df.user_id.unique() # list[index] = id
        self.user_id2idx = {e: i for i, e in enumerate(self.user_idx2id)} # dict[id] = index
        self.item_idx2id = df.movie_id.unique() # list[index] = id
        self.item_id2idx = {e: i for i, e in enumerate(self.item_idx2id)} # dict[id] = index
        df = df.assign(user_id = df.user_id.map(self.user_id2idx), 
                       movie_id = df.movie_id.map(self.item_id2idx))

        row = df.user_id
        col = df.movie_id
        data = np.ones(shape = df.index.size)
        shape = (len(self.user_idx2id), self.n_item)

        mat = csr_matrix((data, (row, col)), shape = shape)
        self.G = mat.T.dot(mat).toarray()
        diag_idx = np.diag_indices(self.G.shape[0])
        if self.vanilla:
            self.G[diag_idx] = 0
        else:
            self.G[diag_idx] += lamb
            P = np.linalg.inv(self.G)
            self.B = - P / np.diag(P)
            self.B[diag_idx] = 0

    def predict(self, df:pd.DataFrame, batch_size: int, N: int) -> pd.DataFrame:
        df = df.assign(movie_id = df.movie_id.map(self.item_id2idx)).dropna()
        df = df.assign(movie_id = df.movie_id.astype(int))
        df = list_agg(df)

        n_data = df.index.size
        ids = np.arange(n_data)
        profile = df['movie_id']
        
        pred = []
        steps = int(np.ceil(n_data / batch_size))
        for batch_step in tqdm(range(steps)):
            lower = batch_size * batch_step
            upper = batch_size + lower

            batch_id = ids[lower: upper]
            batch = np.zeros(shape = (batch_id.size, self.n_item))
            for i, idx in enumerate(batch_id):
                batch[i, profile[idx]] = 1.0

            if self.vanilla:
                pred_batch = batch.dot(self.G)
            else:
                pred_batch = batch.dot(self.B)
        
            rec = np.argsort(np.where(batch == 1.0, -1.0, pred_batch), axis = 1)[:, :-N-1:-1]
            rec = self.item_idx2id[rec]
            pred.append(rec)
        pred = np.concatenate(pred)
        pred = [{'user_id': df.at[i, 'user_id'], 'movie_id': pred[i]} for i in range(n_data)]
        return pd.DataFrame(pred)

In [9]:
class evaluate:
    '''Top-K ranking metrics for recommendation lists.

    `true` and `pred` are frames with a movie_id column holding, per user,
    the relevant items and the ranked recommendations. When both frames also
    carry a user_id column the rows are matched on it; otherwise they are
    paired by position. Calling the instance computes Recall, Precision, MAP,
    MRR and nDCG averaged over the rows of `true`.
    '''

    def __init__(self, true: pd.DataFrame, pred: pd.DataFrame):
        self.true = true
        self.pred = pred
        self.max_K = 10000
        # idcg[k-1] = ideal DCG for k relevant items ranked first.
        self.idcg = np.cumsum(1.0 / np.log(np.arange(2, self.max_K + 2)))

    def _recall(self, gt: List, rec: List, K = None) -> float:
        '''Hits in the top K divided by min(K, number of relevant items).'''
        K = K if K else self.max_K
        denom = min(K, len(gt))
        if denom == 0:
            return 0.0
        relevant = set(gt)
        hits = sum(1 for r in rec[:K] if r in relevant)
        return hits / denom

    def _precision(self, gt: List, rec: List, K = None) -> float:
        '''Hits in the top K divided by the number of items actually recommended.'''
        K = K if K else self.max_K
        top = rec[:K]
        if len(top) == 0:
            return 0.0
        relevant = set(gt)
        hits = sum(1 for r in top if r in relevant)
        return hits / len(top)

    def _AP(self, gt: List, rec: List, K = None) -> float: # Average Precision
        '''Sum of precision@i over the hit positions, over min(K, len(gt)).'''
        K = K if K else self.max_K
        denom = min(K, len(gt))
        if denom == 0:
            return 0.0
        relevant = set(gt)
        hits = 0
        res = 0.0
        for i, r in enumerate(rec[:K]):
            if r in relevant:
                hits += 1
                # precision at cut-off i+1, from the running hit count
                res += hits / (i + 1)
        return res / denom

    def _RR(self, gt: List, rec: List, K = None) -> float: # Reciprocal Rank
        '''1 / rank of the first hit in the top K, or 0 when there is none.'''
        K = K if K else self.max_K
        relevant = set(gt)
        for i, r in enumerate(rec[:K]):
            if r in relevant:
                return  1.0 / (i+1)
        return 0

    def _nDCG(self, gt: List, rec: List, K = None) -> float: # normalized Discounted Cumulative Gain
        '''DCG of the top K divided by the ideal DCG for min(len(gt), K) hits.'''
        K = K if K else self.max_K
        n_ideal = min(len(gt), K)
        if n_ideal == 0:
            # Nothing relevant: avoid reading idcg[-1].
            return 0.0
        relevant = set(gt)
        dcg = 0.0
        for i, r in enumerate(rec[:K]):
            if r in relevant:
                dcg += 1.0 / np.log(i+2)
        return dcg / self.idcg[n_ideal - 1]

    def _aligned_pairs(self) -> List:
        '''Pair every ground-truth list with the recommendations of the same user.'''
        true, pred = self.true, self.pred
        if 'user_id' in true.columns and 'user_id' in pred.columns:
            rec_by_user = dict(zip(pred.user_id, pred.movie_id))
            # A user without predictions is scored against an empty list
            # instead of shifting every following row.
            return [(gt, rec_by_user.get(user, []))
                    for user, gt in zip(true.user_id, true.movie_id)]
        return list(zip(true.movie_id, pred.movie_id))

    def __call__(self, K = None):
        '''Compute all metrics at cut-off K (None means up to max_K items).

        The averages are stored on the instance as recall, precision, MAP,
        MRR and nDCG; nothing is returned.
        '''
        self.K = K
        self.recall = 0.0
        self.precision = 0.0
        self.MAP = 0.0
        self.MRR = 0.0
        self.nDCG = 0.0
        n = self.true.index.size
        for gt, rec in tqdm(self._aligned_pairs()):
            self.recall += self._recall(gt, rec, K) / n
            self.precision += self._precision(gt, rec, K) / n
            self.MAP += self._AP(gt, rec, K) / n
            self.MRR += self._RR(gt, rec, K) / n
            self.nDCG += self._nDCG(gt, rec, K) / n

    def print_all(self):
        '''Print the metrics computed by the most recent call.'''
        K = '@' + str(self.K) if self.K else ''
        print(f'{"Recall":>12}{K} : {self.recall:.5f}',
              f'\n{"Precision":>12}{K} : {self.precision:.5f}',
              f'\n{"MAP":>12}{K} : {self.MAP:.5f}',
              f'\n{"MRR":>12}{K} : {self.MRR:.5f}',
              f'\n{"nDCG":>12}{K} : {self.nDCG:.5f}')

## 1. Vanilla item-based CF

In [10]:
# Baseline: with vanilla = True the model scores items by raw item-item
# co-occurrence counts, so no lamb is needed.
model = EASE(n_user, n_item, vanilla = True)
model.fit(train)

In [11]:
# Ground truth: one row per test user with the list of held-out movie ids.
true = list_agg(test_r)
true.head(5)

Unnamed: 0,user_id,movie_id
0,3,"[1394, 104, 1079, 1259, 2355, 3552, 1304, 2081]"
1,9,"[2268, 1682, 2278, 590, 524, 529, 2294, 1921, ..."
2,42,"[3421, 3868, 1257, 2997, 2134, 1265, 593, 2143..."
3,79,"[3044, 3176, 2686, 2024]"
4,92,"[2054, 2134, 733, 592, 2072, 3107, 2162, 1291,..."


In [12]:
# Top-100 recommendations per test user, computed from the query part of
# their history in batches of 128 users.
pred = model.predict(test_q, batch_size = 128, N = 100)
pred.head(5)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




Unnamed: 0,user_id,movie_id
0,3,"[2571, 2028, 589, 593, 1270, 110, 2762, 318, 1..."
1,9,"[1196, 260, 1198, 296, 858, 589, 110, 1197, 23..."
2,42,"[2571, 2028, 2858, 110, 593, 2762, 858, 1580, ..."
3,79,"[2858, 260, 1196, 593, 2762, 2571, 608, 2028, ..."
4,92,"[1196, 260, 2858, 1198, 2028, 1210, 589, 527, ..."


In [13]:
# Score the baseline recommendations against the held-out items.
scores = evaluate(true, pred)

# Metrics over the top 20 recommendations.
scores(K = 20)
scores.print_all()

# Metrics over the top 100 recommendations.
scores(K = 100)
scores.print_all()

HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))


      Recall@20 : 0.11487 
   Precision@20 : 0.07127 
         MAP@20 : 0.03673 
         nRR@20 : 0.20011 
        nDCG@20 : 0.09943


HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))


      Recall@100 : 0.28338 
   Precision@100 : 0.04657 
         MAP@100 : 0.04474 
         nRR@100 : 0.20728 
        nDCG@100 : 0.16515


## 2. EASE

In [14]:
# EASE proper: lamb = 3000 is added to the diagonal of the item-item Gram
# matrix before it is inverted.
model = EASE(n_user, n_item)
model.fit(train, lamb = 3000)

In [15]:
# Top-100 recommendations per test user from the EASE weights, using the same
# query profiles and batch size as the baseline.
pred = model.predict(test_q, batch_size = 128, N = 100)
pred.head(5)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




Unnamed: 0,user_id,movie_id
0,3,"[110, 1270, 2571, 2628, 1240, 2028, 1580, 589,..."
1,9,"[296, 2997, 2396, 356, 110, 3578, 1196, 858, 1..."
2,42,"[2571, 2916, 1580, 780, 1527, 110, 2000, 2947,..."
3,79,"[3578, 50, 2858, 3481, 3753, 2762, 3751, 1252,..."
4,92,"[2858, 919, 1265, 2918, 1196, 260, 1210, 1073,..."


In [16]:
# Score the EASE recommendations against the same held-out items.
scores = evaluate(true, pred)

# Metrics over the top 20 recommendations.
scores(K = 20)
scores.print_all()

# Metrics over the top 100 recommendations.
scores(K = 100)
scores.print_all()

HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))


      Recall@20 : 0.15546 
   Precision@20 : 0.08982 
         MAP@20 : 0.05393 
         nRR@20 : 0.25163 
        nDCG@20 : 0.13393


HBox(children=(FloatProgress(value=0.0, max=604.0), HTML(value='')))


      Recall@100 : 0.41717 
   Precision@100 : 0.06012 
         MAP@100 : 0.06951 
         nRR@100 : 0.25936 
        nDCG@100 : 0.23298
