In [1]:
from ast import literal_eval
from os import listdir
from os.path import isfile, join
from scipy.sparse import csr_matrix, load_npz, save_npz
from tqdm import tqdm
from sklearn.preprocessing import normalize

import seaborn as sns
import datetime
import json
import numpy as np
import pandas as pd
import time
import yaml
import scipy.sparse as sparse
from ast import literal_eval

# For Python2 this have to be done
from __future__ import division

# Load Dataframe

In [520]:
df = pd.read_csv('../../data/yelp/Data.csv')

# Split Data

In [521]:
def to_sparse_matrix(df, num_user, num_item, user_col, item_col, rating_col):

    dok = df[[user_col, item_col, rating_col]].copy()
    dok = dok.values
    dok = dok[dok[:, 2] > 0]
    shape = [num_user, num_item]

    return sparse.csr_matrix((dok[:, 2].astype(np.float32), (dok[:, 0], dok[:, 1])), shape=shape)

def leave_one_out_split(df, user_col, ratio, random_state=None):
    grouped = df.groupby(user_col, as_index=False)
    valid = grouped.apply(lambda x: x.sample(frac=ratio, random_state=random_state))
    train = df.loc[~df.index.isin([x[1] for x in valid.index])]
    return train, valid

def time_ordered_split(df, ratio, user_col = None, random_state=None):
    # Sort data based on timestamp
    argsort = np.argsort(df['timestamp'])
    df_ordered = df.reindex(argsort)
    train_offset = int((1-ratio)*len(df_ordered))
    
    train = df_ordered[:train_offset]
    valid = df_ordered[train_offset:]
    return train, valid


def main(enable_validation = False, time_ordered_split_en = True, implicit_en = False):
    df = pd.read_csv('../../data/yelp/' + 'Data.csv')

    num_users = df['UserIndex'].nunique()
    num_items = df['ItemIndex'].nunique()

    # Get timestamp 
    date_time_df = df[['Day','Month','Year']]
    date_time_df.rename(columns={'Year': 'year', 'Month': 'day', 'Day':'month'}, inplace=True)
    date_time = pd.to_datetime(date_time_df)
    df['timestamp'] = date_time

    rating_col = 'rating'
    if implicit_en == True:
        rating_col = 'Binary'
    
    if time_ordered_split_en:
        df_train, df_test = time_ordered_split(df, 0.2)
    else:
        df_train, df_test = leave_one_out_split(df, 'UserIndex', 0.2, random_state=8292)

    if enable_validation:
        if time_ordered_split_en:
            df_train, df_valid = time_ordered_split(df_train, 0.2)
        else:
            df_train, df_valid = leave_one_out_split(df_train, 'UserIndex', 0.2, random_state=8292)
        
        # Clean empty rows
        df_valid = df_valid.dropna().reset_index(drop = True)
        
        # Save
        df_valid.to_csv('../../data/yelp/' + 'Valid.csv')
        R_valid = to_sparse_matrix(df_valid, num_users, num_items, 'UserIndex','ItemIndex', rating_col)
        sparse.save_npz('../../data/yelp/' + 'Rvalid.npz', R_valid)
    
    # Clean empty rows
    df_train = df_train.dropna().reset_index(drop = True)
    df_test = df_test.dropna().reset_index(drop = True)
    
    # Save
    df_train.to_csv('../../data/yelp/'  + 'Train.csv')
    R_train = to_sparse_matrix(df_train, num_users, num_items, 'UserIndex', 'ItemIndex', rating_col)
    sparse.save_npz('../../data/yelp/' + 'Rtrain.npz', R_train)

    df_test.to_csv('../../data/yelp/' + 'Test.csv')
    R_test = to_sparse_matrix(df_test, num_users, num_items, 'UserIndex', 'ItemIndex', rating_col)
    sparse.save_npz('../../data/yelp/' + 'Rtest.npz', R_test)
    
def date_to_timestamp(date, **not_used):
    dt = datetime.datetime.strptime(date, '%Y-%m-%d')
    return time.mktime(dt.timetuple())


In [522]:
df.columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'Unnamed: 0.1.1', u'business_id',
       u'friend_count', u'ghost', u'img_dsc', u'img_url', u'nr',
       u'photo_count', u'rating', u'review_count', u'review_date',
       u'review_id', u'review_language', u'review_text', u'ufc', u'user_id',
       u'user_loc', u'vote_count', u'Updated', u'Year', u'Month', u'Day',
       u'Binary', u'review', u'conca_review', u'keyVector',
       u'keyphrases_indices_length', u'UserIndex', u'ItemIndex'],
      dtype='object')

In [726]:
main(enable_validation = True, time_ordered_split_en = False, implicit_en = False)

# Load Data

In [524]:
# Load Original Data
df_train = pd.read_csv('../../data/yelp/Train.csv')
df_valid = pd.read_csv('../../data/yelp/Valid.csv')
df_test = pd.read_csv('../../data/yelp/Test.csv')
keyphrases = pd.read_csv('../../data/yelp/KeyPhrases.csv')['Phrases'].tolist()

In [525]:
ItemIndex = pd.read_csv('../../data/yelp/ItemIndex.csv')
ItemIndex = ItemIndex.sort_values('ItemIndex').drop_duplicates(subset=['ItemIndex', 'business_id'])

In [526]:
# Load U-I Data 
rtrain = load_npz("../../data/yelp/Rtrain.npz")
rvalid = load_npz("../../data/yelp/Rvalid.npz")
rtest = load_npz("../../data/yelp/Rtest.npz")

In [527]:
rtrain

<2473x10282 sparse matrix of type '<type 'numpy.float32'>'
	with 102741 stored elements in Compressed Sparse Row format>

In [528]:
len(keyphrases)

237

In [640]:
# Generate U_K and I_K
# For validation set
U_K = get_I_K(df_train, row_name = 'UserIndex', shape = (2473, 237))
I_K = get_I_K(df_train, row_name = 'ItemIndex', shape = (10282, 237))
# For test set
U_K_test = get_I_K(df_test, row_name = 'UserIndex', shape = (2473, 237))
I_K_test = get_I_K(df_test, row_name = 'ItemIndex', shape = (10282, 237))

100%|███████████████████████████████████████████████████████████████████████| 102741/102741 [00:06<00:00, 17012.92it/s]
100%|███████████████████████████████████████████████████████████████████████| 102741/102741 [00:05<00:00, 17493.79it/s]
100%|█████████████████████████████████████████████████████████████████████████| 32133/32133 [00:01<00:00, 17636.11it/s]
100%|█████████████████████████████████████████████████████████████████████████| 32133/32133 [00:01<00:00, 17665.20it/s]


In [641]:
# Save
save_npz('../../data/yelp/U_K.npz',U_K)
save_npz('../../data/yelp/I_K.npz',I_K)
save_npz( '../../data/yelp/U_K_test.npz',U_K_test)
save_npz('../../data/yelp/I_K_test.npz',I_K_test)

In [642]:
# Load 
U_K = load_npz('../../data/yelp/U_K.npz')
I_K = load_npz('../../data/yelp/I_K.npz')

In [354]:
# Models
from sklearn.metrics.pairwise import cosine_similarity
def train(matrix_train):
    similarity = cosine_similarity(X=matrix_train, Y=None, dense_output=True)
    return similarity

def get_I_K(df, row_name = 'ItemIndex', shape = (3668,75)):
    rows = []
    cols = []
    vals = []
    for i in tqdm(range(df.shape[0])):
        key_vector = literal_eval(df['keyVector'][i])
        rows.extend([df[row_name][i]]*len(key_vector)) ## Item index
        cols.extend(key_vector) ## Keyword Index
#         if binary:
        vals.extend(np.array([1]*len(key_vector)))
#         else:
#             vals.extend(arr[arr.nonzero()])    
    return csr_matrix((vals, (rows, cols)), shape=shape)


def predict(matrix_train, k, similarity, item_similarity_en = False):
    """
    res = similarity * matrix_train    if item_similarity_en = False
    res = similarity * matrix_train.T  if item_similarity_en = True
    """
    prediction_scores = []
    
    if item_similarity_en:
        matrix_train = matrix_train.transpose()
        
    for user_index in tqdm(range(matrix_train.shape[0])):
        # Get user u's prediction scores to all users
        vector_u = similarity[user_index]

        # Get closest K neighbors excluding user u self
        similar_users = vector_u.argsort()[::-1][1:k+1]
        # Get neighbors similarity weights and ratings
        similar_users_weights = similarity[user_index][similar_users]
        similar_users_ratings = matrix_train[similar_users].toarray()

        prediction_scores_u = similar_users_ratings * similar_users_weights[:, np.newaxis]

        prediction_scores.append(np.sum(prediction_scores_u, axis=0))
    res = np.array(prediction_scores)
    
    if item_similarity_en:
        res = res.transpose()
    
    return res

def prediction(prediction_score, topK, matrix_Train):

    prediction = []

    for user_index in tqdm(range(matrix_Train.shape[0])):
        vector_u = prediction_score[user_index]
        vector_train = matrix_Train[user_index]
        if len(vector_train.nonzero()[0]) > 0:
            vector_predict = sub_routine(vector_u, vector_train, topK=topK)
        else:
            vector_predict = np.zeros(topK, dtype=np.float32)

        prediction.append(vector_predict)

    return np.vstack(prediction)


def sub_routine(vector_u, vector_train, topK=500):

    train_index = vector_train.nonzero()[1]

    vector_u = vector_u

    candidate_index = np.argpartition(-vector_u, topK+len(train_index))[:topK+len(train_index)]
    vector_u = candidate_index[vector_u[candidate_index].argsort()[::-1]]
    vector_u = np.delete(vector_u, np.isin(vector_u, train_index).nonzero()[0])

    return vector_u[:topK]


In [355]:
# Evluation 
def recallk(vector_true_dense, hits, **unused):
    hits = len(hits.nonzero()[0])
    return float(hits)/len(vector_true_dense)

def precisionk(vector_predict, hits, **unused):
    hits = len(hits.nonzero()[0])
    return float(hits)/len(vector_predict)


def average_precisionk(vector_predict, hits, **unused):
    precisions = np.cumsum(hits, dtype=np.float32)/range(1, len(vector_predict)+1)
    return np.mean(precisions)


def r_precision(vector_true_dense, vector_predict, **unused):
    vector_predict_short = vector_predict[:len(vector_true_dense)]
    hits = len(np.isin(vector_predict_short, vector_true_dense).nonzero()[0])
    return float(hits)/len(vector_true_dense)


def _dcg_support(size):
    arr = np.arange(1, size+1)+1
    return 1./np.log2(arr)


def ndcg(vector_true_dense, vector_predict, hits):
    idcg = np.sum(_dcg_support(len(vector_true_dense)))
    dcg_base = _dcg_support(len(vector_predict))
    dcg_base[np.logical_not(hits)] = 0
    dcg = np.sum(dcg_base)
    return dcg/idcg


def click(hits, **unused):
    first_hit = next((i for i, x in enumerate(hits) if x), None)
    if first_hit is None:
        return 5
    else:
        return first_hit/10


def evaluate(matrix_Predict, matrix_Test, metric_names =['R-Precision', 'NDCG', 'Precision', 'Recall', 'MAP'], atK = [5, 10, 15, 20, 50], analytical=False):
    """
    :param matrix_U: Latent representations of users, for LRecs it is RQ, for ALSs it is U
    :param matrix_V: Latent representations of items, for LRecs it is Q, for ALSs it is V
    :param matrix_Train: Rating matrix for training, features.
    :param matrix_Test: Rating matrix for evaluation, true labels.
    :param k: Top K retrieval
    :param metric_names: Evaluation metrics
    :return:
    """
    global_metrics = {
        "R-Precision": r_precision,
        "NDCG": ndcg,
        "Clicks": click
    }

    local_metrics = {
        "Precision": precisionk,
        "Recall": recallk,
        "MAP": average_precisionk
    }

    output = dict()

    num_users = matrix_Predict.shape[0]

    for k in atK:

        local_metric_names = list(set(metric_names).intersection(local_metrics.keys()))
        results = {name: [] for name in local_metric_names}
        topK_Predict = matrix_Predict[:, :k]

        for user_index in tqdm(range(topK_Predict.shape[0])):
            vector_predict = topK_Predict[user_index]
            if len(vector_predict.nonzero()[0]) > 0:
                vector_true = matrix_Test[user_index]
                vector_true_dense = vector_true.nonzero()[1]
                hits = np.isin(vector_predict, vector_true_dense)

                if vector_true_dense.size > 0:
                    for name in local_metric_names:
                        results[name].append(local_metrics[name](vector_true_dense=vector_true_dense,
                                                                 vector_predict=vector_predict,
                                                                 hits=hits))

        results_summary = dict()
        if analytical:
            for name in local_metric_names:
                results_summary['{0}@{1}'.format(name, k)] = results[name]
        else:
            for name in local_metric_names:
                results_summary['{0}@{1}'.format(name, k)] = (np.average(results[name]),
                                                              1.96*np.std(results[name])/np.sqrt(num_users))
        output.update(results_summary)

    global_metric_names = list(set(metric_names).intersection(global_metrics.keys()))
    results = {name: [] for name in global_metric_names}

    topK_Predict = matrix_Predict[:]

    for user_index in tqdm(range(topK_Predict.shape[0])):
        vector_predict = topK_Predict[user_index]

        if len(vector_predict.nonzero()[0]) > 0:
            vector_true = matrix_Test[user_index]
            vector_true_dense = vector_true.nonzero()[1]
            hits = np.isin(vector_predict, vector_true_dense)

            # if user_index == 1:
            #     import ipdb;
            #     ipdb.set_trace()

            if vector_true_dense.size > 0:
                for name in global_metric_names:
                    results[name].append(global_metrics[name](vector_true_dense=vector_true_dense,
                                                              vector_predict=vector_predict,
                                                              hits=hits))

    results_summary = dict()
    if analytical:
        for name in global_metric_names:
            results_summary[name] = results[name]
    else:
        for name in global_metric_names:
            results_summary[name] = (np.average(results[name]), 1.96*np.std(results[name])/np.sqrt(num_users))
    output.update(results_summary)

    return output

def explain_evaluate(matrix_Predict, matrix_Test, metric_names =['R-Precision', 'NDCG', 'Precision', 'Recall', 'MAP'], atK = [5, 10, 15, 20, 50], analytical=False):
    """
    :param matrix_U: Latent representations of users, for LRecs it is RQ, for ALSs it is U
    :param matrix_V: Latent representations of items, for LRecs it is Q, for ALSs it is V
    :param matrix_Train: Rating matrix for training, features.
    :param matrix_Test: Rating matrix for evaluation, true labels.
    :param k: Top K retrieval
    :param metric_names: Evaluation metrics
    :return:
    """
    global_metrics = {
        "R-Precision": r_precision,
        "NDCG": ndcg,
        "Clicks": click
    }

    local_metrics = {
        "Precision": precisionk,
        "Recall": recallk,
        "MAP": average_precisionk
    }

    output = dict()

    num_users = matrix_Predict.shape[0]

    for k in atK:

        local_metric_names = list(set(metric_names).intersection(local_metrics.keys()))
        results = {name: [] for name in local_metric_names}
        topK_Predict = matrix_Predict[:, :k]

        for user_index in tqdm(range(topK_Predict.shape[0])):
            vector_predict = topK_Predict[user_index]
            if len(vector_predict.nonzero()[0]) > 0:
#                 vector_true = matrix_Test[user_index]
                vector_true = np.ravel(matrix_Test.todense()[0])
                vector_true_dense = np.argsort(vector_true)[::-1][:k]
#                 vector_true_dense = vector_true.nonzero()[1]
                hits = np.isin(vector_predict, vector_true_dense)

                if vector_true_dense.size > 0:
                    for name in local_metric_names:
                        results[name].append(local_metrics[name](vector_true_dense=vector_true_dense,
                                                                 vector_predict=vector_predict,
                                                                 hits=hits))

        results_summary = dict()
        if analytical:
            for name in local_metric_names:
                results_summary['{0}@{1}'.format(name, k)] = results[name]
        else:
            for name in local_metric_names:
                results_summary['{0}@{1}'.format(name, k)] = (np.average(results[name]),
                                                              1.96*np.std(results[name])/np.sqrt(num_users))
        output.update(results_summary)

    global_metric_names = list(set(metric_names).intersection(global_metrics.keys()))
    results = {name: [] for name in global_metric_names}

    topK_Predict = matrix_Predict[:]

    for user_index in tqdm(range(topK_Predict.shape[0])):
        vector_predict = topK_Predict[user_index]

        if len(vector_predict.nonzero()[0]) > 0:
#             vector_true = matrix_Test[user_index]
#             vector_true_dense = vector_true.nonzero()[1]
            vector_true = np.ravel(matrix_Test.todense()[0])
            vector_true_dense = np.argsort(vector_true)[::-1][:k]
            hits = np.isin(vector_predict, vector_true_dense)

            # if user_index == 1:
            #     import ipdb;
            #     ipdb.set_trace()

            if vector_true_dense.size > 0:
                for name in global_metric_names:
                    results[name].append(global_metrics[name](vector_true_dense=vector_true_dense,
                                                              vector_predict=vector_predict,
                                                              hits=hits))

    results_summary = dict()
    if analytical:
        for name in global_metric_names:
            results_summary[name] = results[name]
    else:
        for name in global_metric_names:
            results_summary[name] = (np.average(results[name]), 1.96*np.std(results[name])/np.sqrt(num_users))
    output.update(results_summary)

    return output

# Explanation Model

In [356]:
def explain(R,W2,k, model = "Cosine_similarity", item_similarity_en = True):
    """
    k: knn's hyperparameter k
    R: Rating Matrix with size U*I
    r_ij: observed rating with user i and item j 
    s_ij: explanation vector with user i and item j 
    Z: Joint Embedding/Latent Space with size U*U, generate r_ij and s_ij
    W2: Reconstruction matrix with size U*K 
    S: Output explanation prediction matrix with size U*K (dense numpy ndarray)
    """
    Z = train(R) # Cosine similarity as default
    S = predict(W2, k, Z, item_similarity_en=item_similarity_en) 
    if normalize_en == True:       
        return normalize(S) # prediction score
    return S

def predict(matrix_train, k, similarity, item_similarity_en = False, normalize_en = False):
    """
    matrix_train: Rating Matrix with size U*I
    k: knn's hyperparameter k
    similarity: Joint Embedding/Latent Space with size U*U or I*I
    
    res = similarity * matrix_train    if item_similarity_en = False
    res = similarity * matrix_train.T  if item_similarity_en = True
    
    r_ij: observed rating with user i and item j 
    s_ij: explanation vector with user i and item j 
    """
    prediction_scores = []
    
    if item_similarity_en:
        matrix_train = matrix_train.transpose()
        
    for user_index in tqdm(range(matrix_train.shape[0])):
        # Get user u's prediction scores to all users
        vector_u = similarity[user_index]

        # Get closest K neighbors excluding user u self
        similar_users = vector_u.argsort()[::-1][1:k+1]
        # Get neighbors similarity weights and ratings
        similar_users_weights = similarity[user_index][similar_users]
        similar_users_ratings = matrix_train[similar_users].toarray()

        prediction_scores_u = similar_users_ratings * similar_users_weights[:, np.newaxis]

        prediction_scores.append(np.sum(prediction_scores_u, axis=0))
    res = np.array(prediction_scores)
    
    if item_similarity_en:
        res = res.transpose()
    if normalize_en:
        res = normalize(res)
    return res

def explain_prediction(prediction_score, topK, matrix_Train):
    """
    output prediction res of the  top K items/keyphrase/whatever
    """
    prediction = []

    for user_index in tqdm(range(matrix_Train.shape[0])):
        vector_u = prediction_score[user_index]
        vector_train = matrix_Train[user_index]
        if len(vector_train.nonzero()[0]) > 0:
            vector_predict = sub_routine_explain(vector_u, vector_train, topK=topK)
        else:
            vector_predict = np.zeros(topK, dtype=np.float32)

        prediction.append(vector_predict)
    return np.vstack(prediction)
#     return prediction

def sub_routine_explain(vector_u, vector_train, topK=30):
    """
    vector_u: predicted user vector
    vector_train: true user vector
    topK: top k items in vector
    vector_u: top k items predicted
    """
#     train_index = vector_train.nonzero()[1]
#     candidate_index = np.argpartition(-vector_u, topK+75)[:topK+75] #  10 here to make res consistent
#     candidate_index = np.argpartition(-vector_u, 74)[:topK]
#     vector_u = candidate_index[vector_u[candidate_index].argsort()[::-1]]
#     vector_u = np.delete(vector_u, np.isin(vector_u, train_index).nonzero()[0])

    candidate_index = np.argsort(vector_u)[::-1][:topK]
    return candidate_index

def predict_pilot_explanation(explanation_scores, top_keyphrase = 10):
    """
    Used for retrieve the 1st row of prediction scores, used for pilot test
    """
    explanation = []
    for explanation_score in tqdm(explanation_scores):
        explanation.append(np.argsort(explanation_score)[::-1][:top_keyphrase])
    return np.array(explanation)

In [357]:
def explain(R,W2,k):
    """
    R: Rating Matrix with size U*I
    r_ij: observed rating with user i and item j 
    s_ij: explanation vector with user i and item j 
    Z: Joint Embedding/Latent Space with size U*U, generate r_ij and s_ij
    W2: Reconstruction matrix with size U*K 
    S: Output explanation prediction matrix with size U*K
    """
    Z = train(R)
    S = predict(W2, k, Z)
    return normalize(S)

# Evaluation Model
def recall(vector_true_dense, vector_true_predict):
    """
    The fraction of relevant instances that have been retrieved over the total amount of relevant instances
    The length of vector_true_dense and vector_true_predict has to be the same
    Out put recall
    """
    hits = len(np.isin(vector_true_predict, vector_true_dense).nonzero()[0])
    return float(hits)/len(vector_true_dense)

# Total Recall
def recall_all(true_matrix, predict_matrix, topK = 20):
    res = []
    for i in tqdm(range(len(Explanation_res1))):
        true_vector = np.argsort(np.ravel(normalize(true_matrix).todense()[i]))[-topK:]
        predict_vector = np.argsort(predict_matrix[i])[-topK:]
        res.append(recall(true_vector,predict_vector))
    return sum(res)/len(res)

# Evaluation

### explicit with time-ordered split (new)

In [673]:
# ( U_I * I_U ) * U_K
similarity = train(rtrain)
explanation_scores = predict(U_K, 100, similarity)
# explanation =  predict_explanation(explanation_scores)
explanation = explain_prediction(explanation_scores, 100, U_K)

100%|████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:01<00:00, 1840.03it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:00<00:00, 7562.69it/s]


In [674]:
explain_evaluate(explanation, U_K_test, atK=[5,10,20,50])

100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:02<00:00, 932.15it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:02<00:00, 920.01it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:02<00:00, 925.87it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:02<00:00, 943.17it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:02<00:00, 861.96it/s]


{'MAP@10': (0.25586715994268083, 0.001353590654717463),
 'MAP@20': (0.3366685134557135, 0.0011824646421380184),
 'MAP@5': (0.0031510416666666666, 0.0004847510631302727),
 'MAP@50': (0.5610333179971825, 0.0011967631690340385),
 'NDCG': (0.6087433758971473, 0.0013926192605863022),
 'Precision@10': (0.22005208333333334, 0.001578073907642935),
 'Precision@20': (0.27421874999999996, 0.0012901125154105365),
 'Precision@5': (0.013802083333333333, 0.0019980380038774437),
 'Precision@50': (0.47991319444444447, 0.0014071444362840457),
 'R-Precision': (0.47991319444444447, 0.0014071444362840457),
 'Recall@10': (0.22005208333333334, 0.001578073907642935),
 'Recall@20': (0.27421874999999996, 0.0012901125154105365),
 'Recall@5': (0.013802083333333333, 0.0019980380038774437),
 'Recall@50': (0.47991319444444447, 0.0014071444362840457)}

### explicit with time-ordered split

In [127]:
# ( U_I * I_U ) * U_K
similarity = train(rtrain)
explanation_scores = predict(U_K, 100, similarity)
# explanation =  predict_explanation(explanation_scores)
explanation = explain_prediction(explanation_scores, 100, U_K)

100%|████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:01<00:00, 1809.27it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:00<00:00, 7414.56it/s]


In [129]:
explain_evaluate(explanation, U_K_test, atK=[5,10,30,50])

100%|████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:02<00:00, 1053.51it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:02<00:00, 1079.23it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:02<00:00, 1062.59it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:02<00:00, 1019.14it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:02<00:00, 1018.25it/s]


{'MAP@10': (0.49012323675471464, 0.001752167730460859),
 'MAP@30': (0.5868575706982433, 0.0011499213639099082),
 'MAP@5': (0.6690339091490721, 0.0027927379124820764),
 'MAP@50': (0.5560703129913159, 0.0008912113533278475),
 'NDCG': (0.606270925862621, 0.0007611411843798994),
 'Precision@10': (0.28968330134356995, 0.0012569661306957034),
 'Precision@30': (0.42738323736404354, 0.0012872793817335507),
 'Precision@5': (0.4001919385796545, 0.0005016998301376581),
 'Precision@50': (0.39292706333973126, 0.000860039642773129),
 'R-Precision': (0.39292706333973126, 0.000860039642773129),
 'Recall@10': (0.28968330134356995, 0.0012569661306957034),
 'Recall@30': (0.42738323736404354, 0.0012872793817335507),
 'Recall@5': (0.4001919385796545, 0.0005016998301376581),
 'Recall@50': (0.39292706333973126, 0.000860039642773129)}

### Implicit without time-ordered split

In [25]:
# ( U_I * I_U ) * U_K
similarity = train(rtrain)
explanation_scores = predict(U_K, 100, similarity)
# explanation =  predict_explanation(explanation_scores)
explanation = explain_prediction(explanation_scores, 100, U_K)

100%|████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:01<00:00, 2128.07it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:00<00:00, 7607.15it/s]


In [28]:
explain_evaluate(explanation, U_K_test, atK=[5,20,40]) 

100%|█████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:02<00:00, 939.08it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:02<00:00, 889.18it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:02<00:00, 913.09it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:02<00:00, 882.49it/s]


{'MAP@20': (0.45316721230242296, 0.0019350810812115379),
 'MAP@40': (0.5842206191206047, 0.002377891946403873),
 'MAP@5': (0.623118509034002, 0.0029496286180747486),
 'NDCG': (0.7268562260242788, 0.0021141846532503274),
 'Precision@20': (0.3027102005975245, 0.0018155475022447942),
 'Precision@40': (0.5250213401621853, 0.0021673342673273256),
 'Precision@5': (0.3977806231327358, 0.0016873251019215718),
 'R-Precision': (0.5250213401621853, 0.0021673342673273256),
 'Recall@20': (0.3027102005975245, 0.0018155475022447942),
 'Recall@40': (0.5250213401621853, 0.0021673342673273256),
 'Recall@5': (0.3977806231327358, 0.0016873251019215718)}

# Tuning 

TODO

# Item-based Explanation

### explicit with time-oredered split (new)

In [675]:
similarity = train(np.transpose(rtrain))
explanation_scores = predict(I_K, 100, similarity)
# explanation =  predict_explanation(explanation_scores)
explanation = explain_prediction(explanation_scores, 70, I_K)

100%|██████████████████████████████████████████████████████████████████████████| 10282/10282 [00:06<00:00, 1666.18it/s]
100%|██████████████████████████████████████████████████████████████████████████| 10282/10282 [00:01<00:00, 7276.72it/s]


In [708]:
explain_evaluate(explanation, I_K, atK=[5,10,30,50])

100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:40<00:00, 252.01it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:42<00:00, 242.54it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:43<00:00, 235.80it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:44<00:00, 233.16it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:45<00:00, 224.58it/s]


{'MAP@10': (0.6154823600942113, 0.001724659791955),
 'MAP@30': (0.5114275408681419, 0.000994026740346734),
 'MAP@5': (0.6862275706457504, 0.001730412184158496),
 'MAP@50': (0.43082281538800626, 0.0007338316374898346),
 'NDCG': (0.4116329054547845, 0.0006277699440078318),
 'Precision@10': (0.46250134307510476, 0.0015506019823310372),
 'Precision@30': (0.3161419719924071, 0.0007302846569774447),
 'Precision@5': (0.4319974212957989, 0.0015513551750021481),
 'Precision@50': (0.27950145052111314, 0.0005713366085377778),
 'R-Precision': (0.27950145052111314, 0.0005713366085377778),
 'Recall@10': (0.46250134307510476, 0.0015506019823310372),
 'Recall@30': (0.3161419719924071, 0.0007302846569774447),
 'Recall@5': (0.4319974212957989, 0.0015513551750021481),
 'Recall@50': (0.27950145052111314, 0.0005713366085377778)}

### explicit without time-oredered split

In [131]:
similarity = train(np.transpose(rtrain))
explanation_scores = predict(I_K, 100, similarity)
# explanation =  predict_explanation(explanation_scores)
explanation = explain_prediction(explanation_scores, 70, I_K)

100%|████████████████████████████████████████████████████████████████████████████| 7456/7456 [00:03<00:00, 2005.38it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7456/7456 [00:00<00:00, 8034.48it/s]


In [132]:
explain_evaluate(explanation, I_K_test, atK=[5,20,50]) 

100%|█████████████████████████████████████████████████████████████████████████████| 7456/7456 [00:13<00:00, 561.91it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 7456/7456 [00:13<00:00, 560.94it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 7456/7456 [00:13<00:00, 561.87it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 7456/7456 [00:13<00:00, 552.71it/s]


{'MAP@20': (0.18036523094037374, 0.0006643399155876114),
 'MAP@5': (0.18969546954695468, 0.0013948017800600933),
 'MAP@50': (0.204786603725808, 0.0005440288534445658),
 'NDCG': (0.22373220342456343, 0.0005602887409655968),
 'Precision@20': (0.12375112511251127, 0.0007850385182262991),
 'Precision@5': (0.19963996399639963, 0.000215195479214176),
 'Precision@50': (0.1786993699369937, 0.0006504945026639768),
 'R-Precision': (0.1786993699369937, 0.0006504945026639768),
 'Recall@20': (0.12375112511251127, 0.0007850385182262991),
 'Recall@5': (0.19963996399639963, 0.000215195479214176),
 'Recall@50': (0.1786993699369937, 0.0006504945026639768)}

# Understand item-based Explanation

## Get restaurant names correspond to ItemIndex

In [543]:
def get_business_df(path = "../../data/yelp/business.json" ):
    with open(path) as json_file:
        data = json_file.readlines()
        data = list(map(json.loads, data))
    df = pd.DataFrame(data)
    
    return df

def get_restaurant_info(business_df, business_id, name = True, review_count = True, stars = True ):
    output_list = {}
    row_idx = int(business_df.index[business_df['business_id'] == business_id].tolist()[0])
    if name == True:
        output_list['name'] = business_df['name'][row_idx].encode('utf-8').strip()
    if review_count == True:
        output_list['review_count'] = business_df['review_count'][row_idx]
    if stars == True:
        output_list['stars'] = business_df['stars'][row_idx] 
    return output_list

def get_businessid_from_Itemindex(ItemIndex_list, itemindex):
    return ItemIndex_list['business_id'].tolist()[itemindex]


In [545]:
business_df = get_business_df()

In [546]:
get_businessid_from_Itemindex(ItemIndex, 8010)

'l_uAw0K2lkOsyVJATcnwsA'

In [547]:
get_restaurant_info(business_df, 'l_uAw0K2lkOsyVJATcnwsA')

{'name': 'Spicy Mafia', 'review_count': 9, 'stars': 3.0}

In [223]:
rtrain

<2343x7456 sparse matrix of type '<type 'numpy.float32'>'
	with 75764 stored elements in Compressed Sparse Row format>

## Find specific Restaurant

In [571]:
x = ['sushi on bloor' in business_df['name'][i].lower() for i in range(len(business_df))]
for i in np.array(x).nonzero():
    print business_df['name'][i]

35611     Sushi On Bloor
114653    Sushi On Bloor
Name: name, dtype: object


In [296]:
x = ['mafia' in business_df['name'][i].lower() for i in range(len(business_df))]
for i in np.array(x).nonzero():
    print business_df['name'][i]

7031                       Spicy Mafia
28409                 Caricature Mafia
51959               Mafia Mike's Pizza
113184    Cakefacemafia Brows & Beauty
119974                     Spicy Mafia
140192                    Mafia Mike's
156211                  La'Bella MAFIA
180250              Mafia Mike's PIzza
Name: name, dtype: object


In [486]:
x = ['crown prince' in business_df['name'][i].lower() for i in range(len(business_df))]
for i in np.array(x).nonzero():
    print business_df['name'][i]

74627     Crown Prince Fine Dining & Banquet
106186            Crown Princess Fine Dining
Name: name, dtype: object


In [611]:
x = ['red lobster' in business_df['name'][i].lower() for i in range(len(business_df))]
for i in np.array(x).nonzero():
    print business_df['name'][i]

4501                  Red Lobster
9494                  Red Lobster
9599                  Red Lobster
11793                 Red Lobster
15146                 Red Lobster
21612                 Red Lobster
25103                 Red Lobster
37750                 Red Lobster
39311                 Red Lobster
42883                 Red Lobster
46058                 Red Lobster
46742                 Red Lobster
51418                 Red Lobster
51780                 Red Lobster
52374                 Red Lobster
56591                 Red Lobster
66201                 Red Lobster
68260                 Red Lobster
70318                 Red Lobster
79983                 Red Lobster
105382                Red Lobster
108167                Red Lobster
112307                Red Lobster
115861                Red Lobster
119864                Red Lobster
126004                Red Lobster
134059                Red Lobster
140933                Red Lobster
145037                Red Lobster
154559        

In [599]:
x = ['queen and beaver' in business_df['name'][i].lower() for i in range(len(business_df))]
for i in np.array(x).nonzero():
    print business_df['name'][i]

73128    The Queen And Beaver Public House
Name: name, dtype: object


In [700]:
x = ['miku' in business_df['name'][i].lower() for i in range(len(business_df))]
for i in np.array(x).nonzero():
    print business_df['name'][i]

107476                            Miku
116702    Mikush Home Appliance Center
Name: name, dtype: object


## If the restaurant exists in df

In [701]:
business_df['business_id'][107476]

u'0a2O150ytxrDjDzXNfRWkA'

In [702]:
get_itemindex_from_business_id(ItemIndex, '0a2O150ytxrDjDzXNfRWkA')

273

In [723]:
len(np.where(df.ItemIndex == 8010)[0])

2

# Compare Prediction with Ground Truth

In [363]:
def get_itemindex_from_business_id(ItemIndex_list, business_id):
    business_id_list = ItemIndex_list['business_id'].tolist()
    return business_id_list.index(business_id)

## Spicy Mafia (fewer reviews Chinese food)

In [677]:
print get_restaurant_info(business_df, 'l_uAw0K2lkOsyVJATcnwsA')
print get_itemindex_from_business_id(ItemIndex, 'l_uAw0K2lkOsyVJATcnwsA')

{'review_count': 9, 'name': 'Spicy Mafia', 'stars': 3.0}
8010


In [709]:
### Predicted
test_list = list(map(int, explanation[8010])) 
np.array(keyphrases)[test_list]

array(['rice', 'noodle', 'wait', 'thai', 'beef', 'art', 'tea', 'pork',
       'mall', 'chicken', 'soup', 'spicy', 'quick', 'fresh', 'curry',
       'pot', 'meat', 'lunch', 'busy', 'friendly', 'egg', 'dinner',
       'fast', 'fried', 'bar', 'salad', 'vietnamese', 'milk', 'pop',
       'tart', 'fry', 'kimchi', 'cocktail', 'clean', 'downtown', 'chewy',
       'shrimp', 'salt', 'pasta', 'bubble', 'crispy', 'bubble tea',
       'coconut', 'belly', 'tapioca', 'spring roll', 'milk tea', 'fair',
       'tofu', 'wine', 'cake', 'fish', 'pork belly', 'sour', 'yummy',
       'pizza', 'stick', 'chinese', 'sandwich', 'asian', 'fruit', 'bun',
       'bean', 'cheese', 'attentive', 'rib', 'chili', 'tuna', 'wing',
       'crunchy'], dtype='|S19')

In [715]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[rice, noodle, wait, thai, beef, art, tea, pork, mall, chicken, soup, spicy, quick, fresh, curry, pot, meat, lunch, busy, friendly]


In [682]:
### Ground Truth
np.array(keyphrases)[np.argsort(np.ravel(I_K[8010].todense()))[::-1][:70]]

array(['tomato', 'noodle', 'egg', 'fish', 'art', 'pork', 'pot', 'lunch',
       'soup', 'meat', 'tofu', 'busy', 'tart', 'apple', 'strawberry',
       'avocado', 'pop', 'lettuce', 'miso', 'juice', 'skewer', 'scallop',
       'congee', 'calamari', 'cone', 'honey', 'cookie', 'banana',
       'croissant', 'octopus', 'espresso', 'olive', 'donut', 'booth',
       'sesame', 'kimchi', 'oyster', 'bacon', 'mango', 'lamb', 'sashimi',
       'duck', 'pancake', 'matcha', 'latte', 'sausage', 'fruit',
       'cheesecake', 'cocktail', 'bubble', 'patty', 'belly', 'toast',
       'poutine', 'corn', 'coconut', 'vegan', 'lemon', 'wrap', 'tuna',
       'crepe', 'four', 'tempura', 'tapioca', 'accept debit', 'squid',
       'takeout', 'downtown', 'vegetarian', 'birthday'], dtype='|S19')

In [716]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[np.argsort(np.ravel(I_K[8010].todense()))[::-1][:20]]))

[tomato, noodle, egg, fish, art, pork, pot, lunch, soup, meat, tofu, busy, tart, apple, strawberry, avocado, pop, lettuce, miso, juice]


In [681]:
# number of Hit
len(np.where([i in np.array(keyphrases)[np.argsort(np.ravel(I_K[8010].todense()))[::-1][:70]] for i in np.array(keyphrases)[test_list]])[0])

22

## Crown Princess Fine Dining (many reviews Chinese food)

In [654]:
print get_restaurant_info(business_df, 'ovlWOSKVjGecnaPuZLv_OQ')
print get_itemindex_from_business_id(ItemIndex, 'ovlWOSKVjGecnaPuZLv_OQ')

{'review_count': 235, 'name': 'Crown Princess Fine Dining', 'stars': 3.5}
8517


In [717]:
### Predicted
test_list = list(map(int, explanation[8517])) 
np.array(keyphrases)[test_list]

array(['rice', 'art', 'wait', 'thai', 'tea', 'mall', 'fresh', 'soup',
       'pot', 'noodle', 'bar', 'chicken', 'beef', 'friendly', 'dumpling',
       'dinner', 'curry', 'spicy', 'egg', 'pork', 'busy', 'tart', 'lunch',
       'quick', 'fried', 'clean', 'chinese', 'cake', 'fish', 'shrimp',
       'sushi', 'rib', 'meat', 'dim sum', 'dessert', 'tuna', 'pancake',
       'milk', 'fast', 'fair', 'crispy', 'stick', 'downtown', 'store',
       'attentive', 'bun', 'steamed', 'tofu', 'salt', 'salad', 'parking',
       'pop', 'coconut', 'seafood', 'wine', 'wing', 'bean', 'sour',
       'window', 'asian', 'creamy', 'music', 'fry', 'wrap', 'beer',
       'markham', 'comfortable', 'ice cream', 'reasonable', 'scallop'],
      dtype='|S19')

In [718]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[rice, art, wait, thai, tea, mall, fresh, soup, pot, noodle, bar, chicken, beef, friendly, dumpling, dinner, curry, spicy, egg, pork]


In [684]:
### Ground Truth
np.array(keyphrases)[np.argsort(np.ravel(I_K[8517].todense()))[::-1][:70]]

array(['dim sum', 'rice', 'tea', 'art', 'chinese', 'dumpling', 'downtown',
       'pork', 'wait', 'shrimp', 'bun', 'pot', 'fried', 'dinner',
       'chicken', 'congee', 'egg', 'cake', 'dessert', 'tart', 'bbq',
       'mall', 'fancy', 'rib', 'lunch', 'scallop', 'fresh', 'stick',
       'noodle', 'pricey', 'busy', 'markham', 'wrap', 'squid', 'clean',
       'duck', 'salt', 'crispy', 'bean', 'octopus', 'yummy', 'fair',
       'attentive', 'tax', 'french', 'brunch', 'soup', 'refill', 'quick',
       'corn', 'steamed', 'greasy', 'fast', 'curry', 'cheaper', 'solid',
       'quiet', 'sesame', 'meat', 'disappointing', 'asian', 'parking',
       'bar', 'baked', 'reasonable', 'crunchy', 'traditional', 'lobster',
       'milk', 'comfortable'], dtype='|S19')

In [719]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[np.argsort(np.ravel(I_K[8517].todense()))[::-1][:20]]))

[dim sum, rice, tea, art, chinese, dumpling, downtown, pork, wait, shrimp, bun, pot, fried, dinner, chicken, congee, egg, cake, dessert, tart]


In [685]:
# number of Hit
len(np.where([i in np.array(keyphrases)[np.argsort(np.ravel(I_K[8517].todense()))[::-1][:70]] for i in np.array(keyphrases)[test_list]])[0])

47

## The Queen And Beaver Public House (many reviews dessert place)

In [699]:
print get_restaurant_info(business_df, 'd_QHsjv4aPh2BKiHgk_Dcg')
print get_itemindex_from_business_id(ItemIndex, 'd_QHsjv4aPh2BKiHgk_Dcg')

{'review_count': 18, 'name': 'Japango Sushi & Noodle Restaurant', 'stars': 3.5}
6672


In [724]:
test_list = list(map(int, explanation[8764])) 
np.array(keyphrases)[test_list]

array(['art', 'rice', 'mall', 'pot', 'wait', 'bar', 'friendly', 'beer',
       'tea', 'fresh', 'chicken', 'pizza', 'beef', 'salad', 'sandwich',
       'busy', 'quick', 'lunch', 'tart', 'meat', 'egg', 'store', 'dinner',
       'cheese', 'fast', 'pop', 'soup', 'spicy', 'bun', 'pub', 'pork',
       'ice cream', 'bread', 'wing', 'fry', 'dessert', 'tofu', 'cake',
       'coffee', 'crust', 'fried', 'rib', 'stick', 'clean', 'burger',
       'downtown', 'corn', 'roasted', 'cocktail', 'apple', 'tuna', 'bean',
       'wine', 'noodle', 'potato', 'tomato', 'chocolate', 'crispy',
       'movie', 'comfortable', 'bbq', 'shrimp', 'fair', 'helpful', 'thai',
       'salt', 'cozy', 'topped', 'casual', 'chip'], dtype='|S19')

In [725]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[art, rice, mall, pot, wait, bar, friendly, beer, tea, fresh, chicken, pizza, beef, salad, sandwich, busy, quick, lunch, tart, meat]


In [687]:
np.array(keyphrases)[np.argsort(np.ravel(I_K[8764].todense()))[::-1][:70]]

array(['pub', 'beer', 'dinner', 'art', 'pot', 'rice', 'bar', 'burger',
       'chip', 'mall', 'fish', 'egg', 'wait', 'tea', 'fry', 'cozy',
       'fresh', 'brunch', 'beef', 'salt', 'cheese', 'cocktail', 'meat',
       'bacon', 'comfortable', 'downtown', 'friendly', 'potato', 'chair',
       'stick', 'pricey', 'pork', 'bun', 'bread', 'rib', 'lunch',
       'greasy', 'patty', 'toast', 'traditional', 'fast', 'wing', 'quick',
       'salad', 'tomato', 'tart', 'attentive', 'sausage', 'crispy',
       'lamb', 'clean', 'gravy', 'pleasant', 'vegetarian', 'fancy',
       'solid', 'dark', 'dessert', 'latte', 'breakfast', 'seasoned',
       'washroom', 'lemon', 'refreshing', 'bean', 'fried', 'steak',
       'rare', 'ice cream', 'pop'], dtype='|S19')

In [688]:
# number of Hit
len(np.where([i in np.array(keyphrases)[np.argsort(np.ravel(I_K[8764].todense()))[::-1][:70]] for i in np.array(keyphrases)[test_list]])[0])

44

## Red Lobster

In [660]:
print get_restaurant_info(business_df, 'hTdJAjSZtHWwqqh5cCeAfA')
print get_itemindex_from_business_id(ItemIndex, 'hTdJAjSZtHWwqqh5cCeAfA')

{'review_count': 93, 'name': 'Red Lobster', 'stars': 3.0}
7312


In [689]:
# Predicted
test_list = list(map(int, explanation[7312])) 
np.array(keyphrases)[test_list]

array(['rice', 'tea', 'art', 'chicken', 'mall', 'wait', 'friendly',
       'fresh', 'fast', 'pot', 'quick', 'milk', 'clean', 'bubble',
       'bubble tea', 'busy', 'rib', 'dessert', 'fry', 'pop', 'cake',
       'lunch', 'milk tea', 'egg', 'tart', 'salad', 'breakfast', 'beef',
       'cheese', 'dinner', 'soup', 'store', 'noodle', 'fried', 'meat',
       'tapioca', 'bar', 'spicy', 'burger', 'fish', 'curry', 'parking',
       'steak', 'coffee', 'wing', 'fruit', 'chinese', 'corn', 'potato',
       'bread', 'bacon', 'chocolate', 'fair', 'tax', 'mango', 'plaza',
       'helpful', 'attentive', 'bean', 'crispy', 'toast', 'stick', 'bun',
       'reasonable', 'lemon', 'sour', 'juicy', 'asian', 'salt',
       'comfortable'], dtype='|S19')

In [690]:
# Ground Truth
np.array(keyphrases)[np.argsort(np.ravel(I_K[7312].todense()))[::-1][:70]]

array(['lobster', 'wait', 'seafood', 'shrimp', 'friendly', 'fresh', 'art',
       'mall', 'fish', 'pot', 'dinner', 'salad', 'dessert', 'lunch',
       'potato', 'rice', 'tea', 'fried', 'refill', 'stuffed', 'four',
       'quick', 'busy', 'bread', 'dip', 'crispy', 'pasta', 'bar',
       'coconut', 'tart', 'soup', 'fry', 'baked', 'scallop', 'quiet',
       'chocolate', 'cheese', 'parking', 'chicken', 'chip', 'bun', 'rib',
       'tuna', 'greasy', 'salt', 'creamy', 'classic', 'casual',
       'immediately', 'pop', 'pizza', 'crowded', 'beer', 'wing', 'topped',
       'cheesecake', 'strawberry', 'roasted', 'markham', 'latte',
       'steamed', 'frozen', 'meat', 'juicy', 'deep fried', 'ice cream',
       'cookie', 'tomato', 'salmon', 'steak'], dtype='|S19')

In [691]:
# number of Hits 
len(np.where([i in np.array(keyphrases)[np.argsort(np.ravel(I_K[7312].todense()))[::-1][:70]] for i in np.array(keyphrases)[test_list]])[0])

35

## Miku

In [704]:
print get_restaurant_info(business_df, '0a2O150ytxrDjDzXNfRWkA')
print get_itemindex_from_business_id(ItemIndex, '0a2O150ytxrDjDzXNfRWkA')

{'review_count': 604, 'name': 'Miku', 'stars': 4.0}
273


In [705]:
# Predicted
test_list = list(map(int, explanation[273])) 
np.array(keyphrases)[test_list]

array(['rice', 'wait', 'art', 'mall', 'tea', 'fresh', 'cake', 'pot',
       'egg', 'meat', 'pork', 'dessert', 'chicken', 'friendly', 'tart',
       'dinner', 'cheese', 'soup', 'busy', 'ice cream', 'bar', 'fish',
       'noodle', 'quick', 'fried', 'beef', 'lunch', 'spicy', 'salad',
       'pop', 'salt', 'sushi', 'matcha', 'japanese', 'rib', 'shrimp',
       'fast', 'salmon', 'ramen', 'tuna', 'corn', 'creamy', 'crispy',
       'seafood', 'coffee', 'milk', 'latte', 'wing', 'bean', 'oyster',
       'waffle', 'clean', 'bbq', 'lobster', 'brunch', 'fair', 'pricey',
       'parking', 'belly', 'cheesecake', 'attentive', 'sashimi',
       'green tea', 'chocolate', 'cocktail', 'fry', 'pancake', 'yummy',
       'bread', 'pork belly'], dtype='|S19')

In [706]:
# Ground Truth
np.array(keyphrases)[np.argsort(np.ravel(I_K[273].todense()))[::-1][:70]]

array(['sushi', 'rice', 'dessert', 'salmon', 'fish', 'art', 'tea',
       'fresh', 'sashimi', 'green tea', 'cake', 'dinner', 'japanese',
       'matcha', 'lunch', 'tart', 'ice cream', 'tuna', 'wait',
       'chocolate', 'mall', 'scallop', 'miso', 'salad', 'beef', 'latte',
       'birthday', 'seafood', 'friendly', 'attentive', 'pot', 'rib',
       'modern', 'tofu', 'oyster', 'shrimp', 'bar', 'bean', 'pricey',
       'egg', 'cocktail', 'clean', 'nicely', 'lobster', 'fried', 'sesame',
       'baked', 'quick', 'busy', 'wine', 'creamy', 'salt', 'calamari',
       'spicy', 'traditional', 'crunchy', 'seasoned', 'meat', 'pop',
       'dark', 'spacious', 'squid', 'four', 'fruit', 'crispy', 'stick',
       'bacon', 'potato', 'bread', 'topped'], dtype='|S19')

In [707]:
# number of Hits 
len(np.where([i in np.array(keyphrases)[np.argsort(np.ravel(I_K[273].todense()))[::-1][:70]] for i in np.array(keyphrases)[test_list]])[0])

48

# TF-IDF

# Create Synthetic User

In [35]:
# Get top_items items associated with keyphrase_ids 
def item_associated_with_keyphrase(I_K, keyphrase_ids, top_items = 100):
    """
    I_K: Item Keyphrase Matrix
    Keyphrase_ids: top items described by keyphrase_ids
    top_n: how many top items described by the keyphrases will be output 
    output item list (unique)
    """
    res = []
    for keyphrase_id in keyphrase_ids:
        res.append(np.argsort(np.ravel(I_K.todense()[:,keyphrase_id]))[::-1][:top_items])
    return np.unique(res)

# Modify U_U latent Space from U_I
def modify_user_preference(U_I, items, user_id = 0):
    """
    TODO: Fix the function s.t. it will not modify the initial matrix
    """
    U_I[user_id,:] = 0
    for i in items:
        U_I[0,i] = 1
    return U_I

def clear_user_keyphrase(U_K, user_id = 0):
    U_K[user_id,:] = 0

def explain_synthetic_user(rtrain, U_K, I_K, keyphrases, top_keyphrase = 20, user_id = 0, k = 100, top_items = 100, **Not_used):
    """
    
    """
    items = item_associated_with_keyphrase(I_K, keyphrases, top_items = top_items) # 8 is coffee
    U_I = modify_user_preference(rtrain, items, user_id=user_id)
    modified_U_U = train(U_I)
    U_K[user_id, :] = 0
    synthetic_user_keyphrase = normalize(predict(U_K, k, modified_U_U))[user_id]
    return np.argsort(synthetic_user_keyphrase)[::-1][:top_keyphrase]

def modify_user_keyphrase(U_K, keyphrase_ids, normalization = True, keyval = 1, user_id = 0, **Not_Used):
    """
    Change all keyphrase_ids to some fixed number, all others to 0
    Return the U_K matrix with user_id row the synthetic user1
    """
    U_K[user_id,:] = 0
    for keyphrase_id in keyphrase_ids:
        U_K[user_id,keyphrase_id] = keyval
    if normalization == True:
        return normalize(U_K)
    return U_K

In [36]:
# Modify user preference matrix
items = item_associated_with_keyphrase(I_K, [0], top_items = 200) # 'chinese'
U_I = modify_user_preference(rtrain, items, user_id = 0)



In [37]:
# make synthetic user1's keyphrase preference all 0
clear_user_keyphrase(U_K, user_id = 0)



In [42]:
U_K

<2343x233 sparse matrix of type '<type 'numpy.int32'>'
	with 298309 stored elements in Compressed Sparse Row format>

In [43]:
I_K

<7456x233 sparse matrix of type '<type 'numpy.int32'>'
	with 351924 stored elements in Compressed Sparse Row format>

In [44]:
# Get latent user similarity embedding
modified_U_U = train(U_I)
# predict
explanation_scores1 = predict(U_K, 100, modified_U_U)
explanation1 =  predict_pilot_explanation(explanation_scores1, top_keyphrase = 10)

100%|████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:01<00:00, 2151.52it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2343/2343 [00:00<00:00, 97624.32it/s]


In [45]:
explanation1[0]

array([ 50,  60, 155, 205,  57, 160, 197,  61, 164, 226], dtype=int64)

In [65]:
def rank_in_prediction(rtrain, U_K, I_K, top_items = 200, keyphrase = 0,user_i = 0):
    """
    Get the rank for user_i with keyphrase
    TODO: modify so that no need to reload U_K,I_K
    """  
    U_K = load_npz('../../data/yelp/U_K.npz')
    U_K = normalize(U_K)
    rtrain = load_npz("../../data/yelp/Rtrain.npz")
    
    # Modify user preference matrix
    items = item_associated_with_keyphrase(I_K, keyphrase, top_items = top_items) # 'raspberry'
    U_I = modify_user_preference(rtrain, items, user_id = 0)
    
    # make synthetic user1's keyphrase preference all 0
    clear_user_keyphrase(U_K, user_id = 0)
    
    # Get latent user similarity embedding
    modified_U_U = train(U_I)
    # predict
    explanation_scores1 = predict(U_K, 100, modified_U_U)
    explanation1 =  predict_pilot_explanation(explanation_scores1, top_keyphrase = 230)
    return list(explanation1[user_i]).index(keyphrase[0])
    
def evaluate_pilot_test(rtrain,U_K, I_K,keyphrase_list, top_items = 200, user_i = 0):
    # Get the average rank for user_i with keyphrase  
    res1 = 0
    for i in range(75):
        a = rank_in_prediction(rtrain, U_K, I_K, top_items = top_items, keyphrase = [i],user_i = user_i)
        print "keyphrase", keyphrase_list[i], "'s rank is ", a
        res1+= a
    return res1/75

In [None]:
evaluate_pilot_test(rtrain,U_K,I_K,keyphrases)

# Create Synthetic item

In [73]:
# Get top_users items associated with keyphrase_ids 
def users_with_keyphrase_preference(U_K, keyphrase_ids, top_users = 100, norm = True):
    """
    U_K: User Keyphrase Matrix
    Keyphrase_ids: top_users who like keyphrase_ids
    output item list (unique)
    """
    res = []
    if norm:
        U_K = normalize(U_K)
    try:
        for keyphrase_id in keyphrase_ids:
            res.append(np.argsort(np.ravel(U_K.todense()[:,keyphrase_id]))[::-1][:top_users])
    except:
        return np.argsort(np.ravel(U_K.todense()[:,keyphrase_ids]))[::-1][:top_users]
    return np.unique(res)

# Modify I_I latent Space from U_I
def modify_item_history(U_I, users, item_id = 0):
    """
    TODO: Fix the function s.t. it will not modify the initial matrix
    """
    U_I[:,item_id] = 0
    for i in users:
        U_I[i, item_id] = 1
    return normalize(U_I)

def clear_item_keyphrase(I_K, item_id = 0):
    I_K[item_id, :] = 0

def explain_synthetic_item(rtrain, U_K, I_K, keyphrases, top_keyphrase = 20, user_id = 0, k = 100, top_users = 100, **Not_used):
    """
    
    """
    items = item_associated_with_keyphrase(I_K, keyphrases, top_items = top_items) # 8 is coffee
    U_I = modify_user_preference(rtrain, items, user_id=user_id)
    modified_U_U = train(U_I)
    U_K[user_id, :] = 0
    synthetic_user_keyphrase = normalize(predict(U_K, k, modified_U_U))[user_id]
    return np.argsort(synthetic_user_keyphrase)[::-1][:top_keyphrase]

def modify_user_keyphrase(U_K, keyphrase_ids, normalization = True, keyval = 1, user_id = 0, **Not_Used):
    """
    Change all keyphrase_ids to some fixed number, all others to 0
    Return the U_K matrix with user_id row the synthetic user1
    """
    U_K[user_id,:] = 0
    for keyphrase_id in keyphrase_ids:
        U_K[user_id,keyphrase_id] = keyval
    if normalization == True:
        return normalize(U_K)
    return U_K

## Prediction Pipeline

In [67]:
U_K = load_npz('../../data/yelp/U_K.npz')
I_K = load_npz('../../data/yelp/I_K.npz')
U_K = normalize(U_K)
I_K = normalize(I_K)
rtrain = load_npz("../../data/yelp/Rtrain.npz")

In [74]:
# Modify user preference matrix
users = users_with_keyphrase_preference(U_K, 70, top_users = 100, norm = True) # 'raspberry'
U_I = modify_item_history(rtrain, users, item_id = 0)



In [75]:
# make synthetic item1's keyphrase to all 0
clear_item_keyphrase(I_K, item_id = 0)

In [76]:
# Get latent item similarity embedding
modified_I_I = train(np.transpose(U_I))
# predict
explanation_scores1 = predict(I_K, 100, modified_I_I)
explanation1 =  predict_pilot_explanation(explanation_scores1, top_keyphrase = 70)

100%|████████████████████████████████████████████████████████████████████████████| 7456/7456 [00:03<00:00, 2184.59it/s]
100%|██████████████████████████████████████████████████████████████████████████| 7456/7456 [00:00<00:00, 112969.72it/s]


In [77]:
def rank_in_prediction_item(rtrain, U_K, I_K, top_users = 200, keyphrase = 70, item_i = 0):
    """
    Get the rank for user_i with keyphrase
    TODO: modify so that no need to reload U_K,I_K
    """  
    U_K = load_npz('../../data/yelp/U_K.npz')
    I_K = load_npz('../../data/yelp/I_K.npz')
    U_K = normalize(U_K)
    I_K = normalize(I_K)
    rtrain = load_npz("../../data/yelp/Rtrain.npz")
    
    # Modify user preference matrix
    users = users_with_keyphrase_preference(U_K, keyphrase, top_users = top_users, norm = True) # 'raspberry'
    U_I = modify_item_history(rtrain, users, item_id = item_i)
    
    # make synthetic item1's keyphrase to all 0
    clear_item_keyphrase(I_K, item_id = item_i)
    
    # Get latent item similarity embedding
    modified_I_I = train(np.transpose(U_I))
    # predict
    explanation_scores1 = predict(I_K, 100, modified_I_I)
    explanation1 =  predict_pilot_explanation(explanation_scores1, top_keyphrase = 230)
    return list(explanation1[item_i]).index(keyphrase)

    
def evaluate_pilot_test_item(rtrain,U_K, I_K,keyphrase_list, top_users = 200, item_i = 0):
    # Get the average rank for item_i with keyphrase  
    res1 = 0
    for i in range(75):
        a = rank_in_prediction_item(rtrain, U_K, I_K, top_users = top_users, keyphrase = i, item_i = item_i)
        print "keyphrase", keyphrase_list[i], "'s rank is ", a
        res1+= a
    return res1/75

In [78]:
rank_in_prediction_item(rtrain, U_K, I_K, top_users = 200, keyphrase = 10, item_i = 0)

100%|████████████████████████████████████████████████████████████████████████████| 7456/7456 [00:03<00:00, 2163.04it/s]
100%|██████████████████████████████████████████████████████████████████████████| 7456/7456 [00:00<00:00, 112969.72it/s]


32

In [None]:
evaluate_pilot_test_item(rtrain,U_K, I_K,keyphrases, top_users = 200, item_i = 0)