In [1]:
from ast import literal_eval
from os import listdir
from os.path import isfile, join
from scipy.sparse import csr_matrix, load_npz, save_npz
from tqdm import tqdm
from sklearn.preprocessing import normalize

import seaborn as sns
import datetime
import json
import numpy as np
import pandas as pd
import time
import yaml
import scipy.sparse as sparse
from ast import literal_eval

# For Python2 this have to be done
from __future__ import division

# Load Dataframe

In [13]:
df = pd.read_csv('../../data/yelp/Data.csv')

# Split Data

In [14]:
def to_sparse_matrix(df, num_user, num_item, user_col, item_col, rating_col):

    dok = df[[user_col, item_col, rating_col]].copy()
    dok = dok.values
    dok = dok[dok[:, 2] > 0]
    shape = [num_user, num_item]

    return sparse.csr_matrix((dok[:, 2].astype(np.float32), (dok[:, 0], dok[:, 1])), shape=shape)

def leave_one_out_split(df, user_col, ratio, random_state=None):
    grouped = df.groupby(user_col, as_index=False)
    valid = grouped.apply(lambda x: x.sample(frac=ratio, random_state=random_state))
    train = df.loc[~df.index.isin([x[1] for x in valid.index])]
    return train, valid

def time_ordered_split(df, ratio, user_col = None, random_state=None):
    # Sort data based on timestamp
    argsort = np.argsort(df['timestamp'])
    df_ordered = df.reindex(argsort)
    train_offset = int((1-ratio)*len(df_ordered))
    
    train = df_ordered[:train_offset]
    valid = df_ordered[train_offset:]
    return train, valid


def main(enable_validation = False, time_ordered_split_en = True, implicit_en = False):
    df = pd.read_csv('../../data/yelp/' + 'Data.csv')

    num_users = df['UserIndex'].nunique()
    num_items = df['ItemIndex'].nunique()

    # Get timestamp 
    date_time_df = df[['Day','Month','Year']]
    date_time_df.rename(columns={'Year': 'year', 'Month': 'day', 'Day':'month'}, inplace=True)
    date_time = pd.to_datetime(date_time_df)
    df['timestamp'] = date_time

    rating_col = 'rating'
    if implicit_en == True:
        rating_col = 'Binary'
    
    if time_ordered_split_en:
        df_train, df_test = time_ordered_split(df, 0.2)
    else:
        df_train, df_test = leave_one_out_split(df, 'UserIndex', 0.2, random_state=8292)

    if enable_validation:
        if time_ordered_split_en:
            df_train, df_valid = time_ordered_split(df_train, 0.2)
        else:
            df_train, df_valid = leave_one_out_split(df_train, 'UserIndex', 0.2, random_state=8292)
        
        # Clean empty rows
        df_valid = df_valid.dropna().reset_index(drop = True)
        
        # Save
        df_valid.to_csv('../../data/yelp/' + 'Valid.csv')
        R_valid = to_sparse_matrix(df_valid, num_users, num_items, 'UserIndex','ItemIndex', rating_col)
        sparse.save_npz('../../data/yelp/' + 'Rvalid.npz', R_valid)
    
    # Clean empty rows
    df_train = df_train.dropna().reset_index(drop = True)
    df_test = df_test.dropna().reset_index(drop = True)
    
    # Save
    df_train.to_csv('../../data/yelp/'  + 'Train.csv')
    R_train = to_sparse_matrix(df_train, num_users, num_items, 'UserIndex', 'ItemIndex', rating_col)
    sparse.save_npz('../../data/yelp/' + 'Rtrain.npz', R_train)

    df_test.to_csv('../../data/yelp/' + 'Test.csv')
    R_test = to_sparse_matrix(df_test, num_users, num_items, 'UserIndex', 'ItemIndex', rating_col)
    sparse.save_npz('../../data/yelp/' + 'Rtest.npz', R_test)
    
def date_to_timestamp(date, **not_used):
    dt = datetime.datetime.strptime(date, '%Y-%m-%d')
    return time.mktime(dt.timetuple())


In [15]:
df.columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'Unnamed: 0.1.1', u'business_id',
       u'friend_count', u'ghost', u'img_dsc', u'img_url', u'nr',
       u'photo_count', u'rating', u'review_count', u'review_date',
       u'review_id', u'review_language', u'review_text', u'ufc', u'user_id',
       u'user_loc', u'vote_count', u'Updated', u'Year', u'Month', u'Day',
       u'Binary', u'review', u'conca_review', u'keyVector',
       u'keyphrases_indices_length', u'UserIndex', u'ItemIndex'],
      dtype='object')

In [16]:
main(enable_validation = True, time_ordered_split_en = False, implicit_en = True)

# Load Data

In [31]:
# Load Original Data
df_train = pd.read_csv('../../data/yelp/Train.csv')
df_valid = pd.read_csv('../../data/yelp/Valid.csv')
df_test = pd.read_csv('../../data/yelp/Test.csv')
# keyphrases = pd.read_csv('../../data/yelp/KeyPhrases.csv')['Phrases'].tolist()

In [18]:
ItemIndex = pd.read_csv('../../data/yelp/ItemIndex.csv')
ItemIndex = ItemIndex.sort_values('ItemIndex').drop_duplicates(subset=['ItemIndex', 'business_id'])

In [28]:
# Load U-I Data 
rtrain = load_npz("../../data/yelp/Rtrain.npz")
rvalid = load_npz("../../data/yelp/Rvalid.npz")
rtest = load_npz("../../data/yelp/Rtest.npz")

In [5]:
rtrain

<2473x10282 sparse matrix of type '<type 'numpy.float32'>'
	with 102741 stored elements in Compressed Sparse Row format>

In [49]:
rtrain

<2473x10282 sparse matrix of type '<type 'numpy.float32'>'
	with 102741 stored elements in Compressed Sparse Row format>

In [24]:
len(df_train)

102741

In [32]:
len(keyphrases)

334

In [51]:
# Generate U_K and I_K
# For validation set
U_K = get_I_K(df_train, row_name = 'UserIndex', shape = (2473, 334))
I_K = get_I_K(df_train, row_name = 'ItemIndex', shape = (10282, 334))
# For test set
# U_K_test = get_I_K(df_test, row_name = 'UserIndex', shape = (2473, 235))
# I_K_test = get_I_K(df_test, row_name = 'ItemIndex', shape = (10282, 235))

100%|███████████████████████████████████████████████████████████████████████| 102741/102741 [00:05<00:00, 17790.65it/s]
100%|███████████████████████████████████████████████████████████████████████| 102741/102741 [00:05<00:00, 18116.91it/s]


In [79]:
# Save
save_npz('../../data/yelp/U_K.npz',U_K)
save_npz('../../data/yelp/I_K.npz',I_K)
# save_npz( '../../data/yelp/U_K_test.npz',U_K_test)
# save_npz('../../data/yelp/I_K_test.npz',I_K_test)

In [6]:
# Load 
U_K = load_npz('../../data/yelp/U_K.npz')
I_K = load_npz('../../data/yelp/I_K.npz')

In [7]:
U_K

<2473x235 sparse matrix of type '<type 'numpy.int32'>'
	with 252930 stored elements in Compressed Sparse Row format>

In [50]:
# Models
from sklearn.metrics.pairwise import cosine_similarity
def train(matrix_train):
    similarity = cosine_similarity(X=matrix_train, Y=None, dense_output=True)
    return similarity

def get_I_K(df, row_name = 'ItemIndex', shape = (3668,75)):
    rows = []
    cols = []
    vals = []
    for i in tqdm(range(df.shape[0])):
        key_vector = literal_eval(df['keyVector'][i])
        rows.extend([df[row_name][i]]*len(key_vector)) ## Item index
        cols.extend(key_vector) ## Keyword Index
#         if binary:
        vals.extend(np.array([1]*len(key_vector)))
#         else:
#             vals.extend(arr[arr.nonzero()])    
    return csr_matrix((vals, (rows, cols)), shape=shape)


def predict(matrix_train, k, similarity, item_similarity_en = False):
    """
    res = similarity * matrix_train    if item_similarity_en = False
    res = similarity * matrix_train.T  if item_similarity_en = True
    """
    prediction_scores = []
    
    if item_similarity_en:
        matrix_train = matrix_train.transpose()
        
    for user_index in tqdm(range(matrix_train.shape[0])):
        # Get user u's prediction scores to all users
        vector_u = similarity[user_index]

        # Get closest K neighbors excluding user u self
        similar_users = vector_u.argsort()[::-1][1:k+1]
        # Get neighbors similarity weights and ratings
        similar_users_weights = similarity[user_index][similar_users]
        similar_users_ratings = matrix_train[similar_users].toarray()

        prediction_scores_u = similar_users_ratings * similar_users_weights[:, np.newaxis]

        prediction_scores.append(np.sum(prediction_scores_u, axis=0))
    res = np.array(prediction_scores)
    
    if item_similarity_en:
        res = res.transpose()
    
    return res

def prediction(prediction_score, topK, matrix_Train):

    prediction = []

    for user_index in tqdm(range(matrix_Train.shape[0])):
        vector_u = prediction_score[user_index]
        vector_train = matrix_Train[user_index]
        if len(vector_train.nonzero()[0]) > 0:
            vector_predict = sub_routine(vector_u, vector_train, topK=topK)
        else:
            vector_predict = np.zeros(topK, dtype=np.float32)

        prediction.append(vector_predict)

    return np.vstack(prediction)


def sub_routine(vector_u, vector_train, topK=500):

    train_index = vector_train.nonzero()[1]

    vector_u = vector_u

    candidate_index = np.argpartition(-vector_u, topK+len(train_index))[:topK+len(train_index)]
    vector_u = candidate_index[vector_u[candidate_index].argsort()[::-1]]
    vector_u = np.delete(vector_u, np.isin(vector_u, train_index).nonzero()[0])

    return vector_u[:topK]


In [53]:
# Evluation 
def recallk(vector_true_dense, hits, **unused):
    hits = len(hits.nonzero()[0])
    return float(hits)/len(vector_true_dense)

def precisionk(vector_predict, hits, **unused):
    hits = len(hits.nonzero()[0])
    return float(hits)/len(vector_predict)


def average_precisionk(vector_predict, hits, **unused):
    precisions = np.cumsum(hits, dtype=np.float32)/range(1, len(vector_predict)+1)
    return np.mean(precisions)


def r_precision(vector_true_dense, vector_predict, **unused):
    vector_predict_short = vector_predict[:len(vector_true_dense)]
    hits = len(np.isin(vector_predict_short, vector_true_dense).nonzero()[0])
    return float(hits)/len(vector_true_dense)


def _dcg_support(size):
    arr = np.arange(1, size+1)+1
    return 1./np.log2(arr)


def ndcg(vector_true_dense, vector_predict, hits):
    idcg = np.sum(_dcg_support(len(vector_true_dense)))
    dcg_base = _dcg_support(len(vector_predict))
    dcg_base[np.logical_not(hits)] = 0
    dcg = np.sum(dcg_base)
    return dcg/idcg


def click(hits, **unused):
    first_hit = next((i for i, x in enumerate(hits) if x), None)
    if first_hit is None:
        return 5
    else:
        return first_hit/10


def evaluate(matrix_Predict, matrix_Test, metric_names =['R-Precision', 'NDCG', 'Precision', 'Recall', 'MAP'], atK = [5, 10, 15, 20, 50], analytical=False):
    """
    :param matrix_U: Latent representations of users, for LRecs it is RQ, for ALSs it is U
    :param matrix_V: Latent representations of items, for LRecs it is Q, for ALSs it is V
    :param matrix_Train: Rating matrix for training, features.
    :param matrix_Test: Rating matrix for evaluation, true labels.
    :param k: Top K retrieval
    :param metric_names: Evaluation metrics
    :return:
    """
    global_metrics = {
        "R-Precision": r_precision,
        "NDCG": ndcg,
        "Clicks": click
    }

    local_metrics = {
        "Precision": precisionk,
        "Recall": recallk,
        "MAP": average_precisionk
    }

    output = dict()

    num_users = matrix_Predict.shape[0]

    for k in atK:

        local_metric_names = list(set(metric_names).intersection(local_metrics.keys()))
        results = {name: [] for name in local_metric_names}
        topK_Predict = matrix_Predict[:, :k]

        for user_index in tqdm(range(topK_Predict.shape[0])):
            vector_predict = topK_Predict[user_index]
            if len(vector_predict.nonzero()[0]) > 0:
                vector_true = matrix_Test[user_index]
                vector_true_dense = vector_true.nonzero()[1]
                hits = np.isin(vector_predict, vector_true_dense)

                if vector_true_dense.size > 0:
                    for name in local_metric_names:
                        results[name].append(local_metrics[name](vector_true_dense=vector_true_dense,
                                                                 vector_predict=vector_predict,
                                                                 hits=hits))

        results_summary = dict()
        if analytical:
            for name in local_metric_names:
                results_summary['{0}@{1}'.format(name, k)] = results[name]
        else:
            for name in local_metric_names:
                results_summary['{0}@{1}'.format(name, k)] = (np.average(results[name]),
                                                              1.96*np.std(results[name])/np.sqrt(num_users))
        output.update(results_summary)

    global_metric_names = list(set(metric_names).intersection(global_metrics.keys()))
    results = {name: [] for name in global_metric_names}

    topK_Predict = matrix_Predict[:]

    for user_index in tqdm(range(topK_Predict.shape[0])):
        vector_predict = topK_Predict[user_index]

        if len(vector_predict.nonzero()[0]) > 0:
            vector_true = matrix_Test[user_index]
            vector_true_dense = vector_true.nonzero()[1]
            hits = np.isin(vector_predict, vector_true_dense)

            # if user_index == 1:
            #     import ipdb;
            #     ipdb.set_trace()

            if vector_true_dense.size > 0:
                for name in global_metric_names:
                    results[name].append(global_metrics[name](vector_true_dense=vector_true_dense,
                                                              vector_predict=vector_predict,
                                                              hits=hits))

    results_summary = dict()
    if analytical:
        for name in global_metric_names:
            results_summary[name] = results[name]
    else:
        for name in global_metric_names:
            results_summary[name] = (np.average(results[name]), 1.96*np.std(results[name])/np.sqrt(num_users))
    output.update(results_summary)

    return output

def explain_evaluate(matrix_Predict, matrix_Test, metric_names =['R-Precision', 'NDCG', 'Precision', 'Recall', 'MAP'], atK = [5, 10, 15, 20, 50], analytical=False):
    """
    :param matrix_U: Latent representations of users, for LRecs it is RQ, for ALSs it is U
    :param matrix_V: Latent representations of items, for LRecs it is Q, for ALSs it is V
    :param matrix_Train: Rating matrix for training, features.
    :param matrix_Test: Rating matrix for evaluation, true labels.
    :param k: Top K retrieval
    :param metric_names: Evaluation metrics
    :return:
    """
    global_metrics = {
        "R-Precision": r_precision,
        "NDCG": ndcg,
        "Clicks": click
    }

    local_metrics = {
        "Precision": precisionk,
        "Recall": recallk,
        "MAP": average_precisionk
    }

    output = dict()

    num_users = matrix_Predict.shape[0]

    for k in atK:

        local_metric_names = list(set(metric_names).intersection(local_metrics.keys()))
        results = {name: [] for name in local_metric_names}
        topK_Predict = matrix_Predict[:, :k]

        for user_index in tqdm(range(topK_Predict.shape[0])):
            vector_predict = topK_Predict[user_index]
            if len(vector_predict.nonzero()[0]) > 0:
#                 vector_true = matrix_Test[user_index]
                vector_true = np.ravel(matrix_Test.todense()[0])
                vector_true_dense = np.argsort(vector_true)[::-1][:k]
#                 vector_true_dense = vector_true.nonzero()[1]
                hits = np.isin(vector_predict, vector_true_dense)

                if vector_true_dense.size > 0:
                    for name in local_metric_names:
                        results[name].append(local_metrics[name](vector_true_dense=vector_true_dense,
                                                                 vector_predict=vector_predict,
                                                                 hits=hits))

        results_summary = dict()
        if analytical:
            for name in local_metric_names:
                results_summary['{0}@{1}'.format(name, k)] = results[name]
        else:
            for name in local_metric_names:
                results_summary['{0}@{1}'.format(name, k)] = (np.average(results[name]),
                                                              1.96*np.std(results[name])/np.sqrt(num_users))
        output.update(results_summary)

    global_metric_names = list(set(metric_names).intersection(global_metrics.keys()))
    results = {name: [] for name in global_metric_names}

    topK_Predict = matrix_Predict[:]

    for user_index in tqdm(range(topK_Predict.shape[0])):
        vector_predict = topK_Predict[user_index]

        if len(vector_predict.nonzero()[0]) > 0:
#             vector_true = matrix_Test[user_index]
#             vector_true_dense = vector_true.nonzero()[1]
            vector_true = np.ravel(matrix_Test.todense()[0])
            vector_true_dense = np.argsort(vector_true)[::-1][:k]
            hits = np.isin(vector_predict, vector_true_dense)

            # if user_index == 1:
            #     import ipdb;
            #     ipdb.set_trace()

            if vector_true_dense.size > 0:
                for name in global_metric_names:
                    results[name].append(global_metrics[name](vector_true_dense=vector_true_dense,
                                                              vector_predict=vector_predict,
                                                              hits=hits))

    results_summary = dict()
    if analytical:
        for name in global_metric_names:
            results_summary[name] = results[name]
    else:
        for name in global_metric_names:
            results_summary[name] = (np.average(results[name]), 1.96*np.std(results[name])/np.sqrt(num_users))
    output.update(results_summary)

    return output

# Explanation Model

In [54]:
def explain(R,W2,k, model = "Cosine_similarity", item_similarity_en = True):
    """
    k: knn's hyperparameter k
    R: Rating Matrix with size U*I
    r_ij: observed rating with user i and item j 
    s_ij: explanation vector with user i and item j 
    Z: Joint Embedding/Latent Space with size U*U, generate r_ij and s_ij
    W2: Reconstruction matrix with size U*K 
    S: Output explanation prediction matrix with size U*K (dense numpy ndarray)
    """
    Z = train(R) # Cosine similarity as default
    S = predict(W2, k, Z, item_similarity_en=item_similarity_en) 
    if normalize_en == True:       
        return normalize(S) # prediction score
    return S

def predict(matrix_train, k, similarity, item_similarity_en = False, normalize_en = False):
    """
    matrix_train: Rating Matrix with size U*I
    k: knn's hyperparameter k
    similarity: Joint Embedding/Latent Space with size U*U or I*I
    
    res = similarity * matrix_train    if item_similarity_en = False
    res = similarity * matrix_train.T  if item_similarity_en = True
    
    r_ij: observed rating with user i and item j 
    s_ij: explanation vector with user i and item j 
    """
    prediction_scores = []
    
    if item_similarity_en:
        matrix_train = matrix_train.transpose()
        
    for user_index in tqdm(range(matrix_train.shape[0])):
        # Get user u's prediction scores to all users
        vector_u = similarity[user_index]

        # Get closest K neighbors excluding user u self
        similar_users = vector_u.argsort()[::-1][1:k+1]
        # Get neighbors similarity weights and ratings
        similar_users_weights = similarity[user_index][similar_users]
        similar_users_ratings = matrix_train[similar_users].toarray()

        prediction_scores_u = similar_users_ratings * similar_users_weights[:, np.newaxis]

        prediction_scores.append(np.sum(prediction_scores_u, axis=0))
    res = np.array(prediction_scores)
    
    if item_similarity_en:
        res = res.transpose()
    if normalize_en:
        res = normalize(res)
    return res

def explain_prediction(prediction_score, topK, matrix_Train):
    """
    output prediction res of the  top K items/keyphrase/whatever
    """
    prediction = []

    for user_index in tqdm(range(matrix_Train.shape[0])):
        vector_u = prediction_score[user_index]
        vector_train = matrix_Train[user_index]
        if len(vector_train.nonzero()[0]) > 0:
            vector_predict = sub_routine_explain(vector_u, vector_train, topK=topK)
        else:
            vector_predict = np.zeros(topK, dtype=np.float32)

        prediction.append(vector_predict)
    return np.vstack(prediction)
#     return prediction

def sub_routine_explain(vector_u, vector_train, topK=30):
    """
    vector_u: predicted user vector
    vector_train: true user vector
    topK: top k items in vector
    vector_u: top k items predicted
    """
#     train_index = vector_train.nonzero()[1]
#     candidate_index = np.argpartition(-vector_u, topK+75)[:topK+75] #  10 here to make res consistent
#     candidate_index = np.argpartition(-vector_u, 74)[:topK]
#     vector_u = candidate_index[vector_u[candidate_index].argsort()[::-1]]
#     vector_u = np.delete(vector_u, np.isin(vector_u, train_index).nonzero()[0])

    candidate_index = np.argsort(vector_u)[::-1][:topK]
    return candidate_index

def predict_pilot_explanation(explanation_scores, top_keyphrase = 10):
    """
    Used for retrieve the 1st row of prediction scores, used for pilot test
    """
    explanation = []
    for explanation_score in tqdm(explanation_scores):
        explanation.append(np.argsort(explanation_score)[::-1][:top_keyphrase])
    return np.array(explanation)

In [55]:
def explain(R,W2,k):
    """
    R: Rating Matrix with size U*I
    r_ij: observed rating with user i and item j 
    s_ij: explanation vector with user i and item j 
    Z: Joint Embedding/Latent Space with size U*U, generate r_ij and s_ij
    W2: Reconstruction matrix with size U*K 
    S: Output explanation prediction matrix with size U*K
    """
    Z = train(R)
    S = predict(W2, k, Z)
    return normalize(S)

# Evaluation Model
def recall(vector_true_dense, vector_true_predict):
    """
    The fraction of relevant instances that have been retrieved over the total amount of relevant instances
    The length of vector_true_dense and vector_true_predict has to be the same
    Out put recall
    """
    hits = len(np.isin(vector_true_predict, vector_true_dense).nonzero()[0])
    return float(hits)/len(vector_true_dense)

# Total Recall
def recall_all(true_matrix, predict_matrix, topK = 20):
    res = []
    for i in tqdm(range(len(Explanation_res1))):
        true_vector = np.argsort(np.ravel(normalize(true_matrix).todense()[i]))[-topK:]
        predict_vector = np.argsort(predict_matrix[i])[-topK:]
        res.append(recall(true_vector,predict_vector))
    return sum(res)/len(res)

# Evaluation

### explicit with time-ordered split

In [56]:
# ( U_I * I_U ) * U_K
similarity = train(rtrain)
explanation_scores = predict(U_K, 100, similarity)
# explanation =  predict_explanation(explanation_scores)
explanation = explain_prediction(explanation_scores, 100, U_K)

100%|████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:01<00:00, 1707.87it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:00<00:00, 7316.57it/s]


In [742]:
explain_evaluate(explanation, U_K, atK=[5,10,20,50])

100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:02<00:00, 907.52it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:02<00:00, 912.55it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:02<00:00, 896.01it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:02<00:00, 876.64it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:03<00:00, 814.29it/s]


{'MAP@10': (0.30572436135912695, 0.0023586693682974545),
 'MAP@20': (0.45840955394632565, 0.0018530622660002188),
 'MAP@5': (0.0564626736111111, 0.0018899042160193539),
 'MAP@50': (0.7432506572809149, 0.0007380546195515702),
 'NDCG': (0.7156882409582186, 0.000962427851550532),
 'Precision@10': (0.35668402777777775, 0.002292811652556904),
 'Precision@20': (0.34036458333333336, 0.0018187403298631716),
 'Precision@5': (0.14505208333333333, 0.0035186949203039313),
 'Precision@50': (0.5203993055555556, 0.0009750286085532342),
 'R-Precision': (0.5203993055555556, 0.0009750286085532342),
 'Recall@10': (0.35668402777777775, 0.002292811652556904),
 'Recall@20': (0.34036458333333336, 0.0018187403298631716),
 'Recall@5': (0.14505208333333333, 0.0035186949203039313),
 'Recall@50': (0.5203993055555556, 0.0009750286085532342)}

In [57]:
# With Vader Keyphrase
explain_evaluate(explanation, U_K, atK=[5,10,20,50])

100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:03<00:00, 636.88it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:03<00:00, 637.70it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:03<00:00, 640.51it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:03<00:00, 642.84it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:03<00:00, 627.19it/s]


{'MAP@10': (0.3057171275628307, 0.002359294896992973),
 'MAP@20': (0.45840593704817756, 0.0018534890803013125),
 'MAP@5': (0.0564626736111111, 0.0018899042160193539),
 'MAP@50': (0.7096259471824808, 0.001151950788477201),
 'NDCG': (0.7122241082228672, 0.001110137717433586),
 'Precision@10': (0.35668402777777775, 0.002292811652556904),
 'Precision@20': (0.34036458333333336, 0.0018187403298631716),
 'Precision@5': (0.14505208333333333, 0.0035186949203039313),
 'Precision@50': (0.5081076388888889, 0.0011381668693386043),
 'R-Precision': (0.5081076388888889, 0.0011381668693386043),
 'Recall@10': (0.35668402777777775, 0.002292811652556904),
 'Recall@20': (0.34036458333333336, 0.0018187403298631716),
 'Recall@5': (0.14505208333333333, 0.0035186949203039313),
 'Recall@50': (0.5081076388888889, 0.0011381668693386043)}

### implicit with time-ordered split

In [812]:
# ( U_I * I_U ) * U_K
similarity = train(rtrain)
explanation_scores = predict(U_K, 100, similarity)
# explanation =  predict_explanation(explanation_scores)
explanation = explain_prediction(explanation_scores, 100, U_K)

100%|████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:01<00:00, 1967.38it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:00<00:00, 7585.88it/s]


In [813]:
explain_evaluate(explanation, U_K, atK=[5,10,20,50])

100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:02<00:00, 905.53it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:02<00:00, 882.27it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:02<00:00, 844.89it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:02<00:00, 890.21it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:03<00:00, 813.49it/s]


{'MAP@10': (0.311798011739418, 0.0031561433837630068),
 'MAP@30': (0.6306166965447901, 0.0031362038353130012),
 'MAP@5': (0.0601345486111111, 0.002815488470400913),
 'MAP@50': (0.7330563022535985, 0.003249815201998381),
 'NDCG': (0.7132432268933633, 0.001959402705468077),
 'Precision@10': (0.3548177083333333, 0.0028022116113549708),
 'Precision@30': (0.4426070601851852, 0.0020332865390623677),
 'Precision@5': (0.13671875, 0.003680709859694958),
 'Precision@50': (0.5121440972222222, 0.0020030788652286792),
 'R-Precision': (0.5121440972222222, 0.0020030788652286792),
 'Recall@10': (0.3548177083333333, 0.0028022116113549708),
 'Recall@30': (0.4426070601851852, 0.0020332865390623677),
 'Recall@5': (0.13671875, 0.003680709859694958),
 'Recall@50': (0.5121440972222222, 0.0020030788652286792)}

### Implicit without time-ordered split

In [25]:
# ( U_I * I_U ) * U_K
similarity = train(rtrain)
explanation_scores = predict(U_K, 100, similarity)
# explanation =  predict_explanation(explanation_scores)
explanation = explain_prediction(explanation_scores, 100, U_K)

100%|████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:01<00:00, 2128.07it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:00<00:00, 7607.15it/s]


In [28]:
explain_evaluate(explanation, U_K_test, atK=[5,20,40]) 

100%|█████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:02<00:00, 939.08it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:02<00:00, 889.18it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:02<00:00, 913.09it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:02<00:00, 882.49it/s]


{'MAP@20': (0.45316721230242296, 0.0019350810812115379),
 'MAP@40': (0.5842206191206047, 0.002377891946403873),
 'MAP@5': (0.623118509034002, 0.0029496286180747486),
 'NDCG': (0.7268562260242788, 0.0021141846532503274),
 'Precision@20': (0.3027102005975245, 0.0018155475022447942),
 'Precision@40': (0.5250213401621853, 0.0021673342673273256),
 'Precision@5': (0.3977806231327358, 0.0016873251019215718),
 'R-Precision': (0.5250213401621853, 0.0021673342673273256),
 'Recall@20': (0.3027102005975245, 0.0018155475022447942),
 'Recall@40': (0.5250213401621853, 0.0021673342673273256),
 'Recall@5': (0.3977806231327358, 0.0016873251019215718)}

# Tuning 

TODO

# Item-based Explanation

### explicit with time-oredered split 

In [58]:
similarity = train(np.transpose(rtrain))
explanation_scores = predict(I_K, 100, similarity)
# explanation =  predict_explanation(explanation_scores)
explanation = explain_prediction(explanation_scores, 70, I_K)

100%|██████████████████████████████████████████████████████████████████████████| 10282/10282 [00:06<00:00, 1582.82it/s]
100%|██████████████████████████████████████████████████████████████████████████| 10282/10282 [00:01<00:00, 7445.33it/s]


In [825]:
explain_evaluate(explanation, I_K, atK=[5,10,30,50])

100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:44<00:00, 232.73it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:43<00:00, 236.53it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:42<00:00, 244.69it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:42<00:00, 243.45it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:44<00:00, 233.05it/s]


{'MAP@10': (0.28026686614955226, 0.0020168575717200354),
 'MAP@30': (0.37520300707679344, 0.0013170483773279748),
 'MAP@5': (0.0017209636821287305, 0.0003842673027413293),
 'MAP@50': (0.33210548732975054, 0.0009162156264124513),
 'NDCG': (0.3364941611204058, 0.0008128281214049484),
 'Precision@10': (0.24431499460625677, 0.0013591829513627755),
 'Precision@30': (0.27102840704782455, 0.0008001989770720488),
 'Precision@5': (0.002761596548004315, 0.0005082563467914027),
 'Precision@50': (0.24768500539374325, 0.0005864023566947201),
 'R-Precision': (0.24768500539374325, 0.0005864023566947201),
 'Recall@10': (0.24431499460625677, 0.0013591829513627755),
 'Recall@30': (0.27102840704782455, 0.0008001989770720488),
 'Recall@5': (0.002761596548004315, 0.0005082563467914027),
 'Recall@50': (0.24768500539374325, 0.0005864023566947201)}

In [59]:
# With Vader
explain_evaluate(explanation, I_K, atK=[5,10,30,50])

100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [01:00<00:00, 168.72it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [01:01<00:00, 167.46it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:58<00:00, 176.90it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:57<00:00, 178.76it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:58<00:00, 176.44it/s]


{'MAP@10': (0.30612538076616214, 0.0022405766064059534),
 'MAP@30': (0.3778614515602291, 0.0013079230179825654),
 'MAP@5': (0.12744735048254585, 0.0030434484506194712),
 'MAP@50': (0.34609231953227454, 0.0009031312108072494),
 'NDCG': (0.3879557787232804, 0.0007765785785828622),
 'Precision@10': (0.20611344311699498, 0.0010204111243237357),
 'Precision@30': (0.2785670720769203, 0.0007951049318394071),
 'Precision@5': (0.12123560434829406, 0.0019859885794326995),
 'Precision@50': (0.28675277149930045, 0.0005913220980868425),
 'R-Precision': (0.28675277149930045, 0.0005913220980868425),
 'Recall@10': (0.20611344311699498, 0.0010204111243237357),
 'Recall@30': (0.2785670720769203, 0.0007951049318394071),
 'Recall@5': (0.12123560434829406, 0.0019859885794326995),
 'Recall@50': (0.28675277149930045, 0.0005913220980868425)}

### implicit without time-oredered split

In [814]:
similarity = train(np.transpose(rtrain))
explanation_scores = predict(I_K, 100, similarity)
# explanation =  predict_explanation(explanation_scores)
explanation = explain_prediction(explanation_scores, 70, I_K)

100%|██████████████████████████████████████████████████████████████████████████| 10282/10282 [00:05<00:00, 1837.71it/s]
100%|██████████████████████████████████████████████████████████████████████████| 10282/10282 [00:01<00:00, 7765.86it/s]


In [815]:
explain_evaluate(explanation, I_K_test, atK=[5,10,30,50]) 

100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:40<00:00, 251.24it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:40<00:00, 256.84it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:40<00:00, 255.53it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:39<00:00, 261.03it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [00:40<00:00, 251.50it/s]


{'MAP@10': (0.1931789138884608, 0.0031984714634543656),
 'MAP@30': (0.28915286485493175, 0.003933334833419286),
 'MAP@5': (0.17321610931319667, 0.003704519739578873),
 'MAP@50': (0.39472789043500867, 0.001113353205085209),
 'NDCG': (0.3966737410561124, 0.0009858754594146636),
 'Precision@10': (0.15645091693635382, 0.002387857507926541),
 'Precision@30': (0.22858683926645093, 0.0030535196896196216),
 'Precision@5': (0.12323624595469257, 0.0021489295200967933),
 'Precision@50': (0.34073354908306364, 0.0012682162198424524),
 'R-Precision': (0.34073354908306364, 0.0012682162198424524),
 'Recall@10': (0.15645091693635382, 0.002387857507926541),
 'Recall@30': (0.22858683926645093, 0.0030535196896196216),
 'Recall@5': (0.12323624595469257, 0.0021489295200967933),
 'Recall@50': (0.34073354908306364, 0.0012682162198424524)}

# Understand item-based Explanation

## Get restaurant names correspond to ItemIndex

In [60]:
def get_business_df(path = "../../data/yelp/business.json" ):
    with open(path) as json_file:
        data = json_file.readlines()
        data = list(map(json.loads, data))
    df = pd.DataFrame(data)
    
    return df

def get_restaurant_info(business_df, business_id, name = True, review_count = True, stars = True ):
    output_list = {}
    row_idx = int(business_df.index[business_df['business_id'] == business_id].tolist()[0])
    if name == True:
        output_list['name'] = business_df['name'][row_idx].encode('utf-8').strip()
    if review_count == True:
        output_list['review_count'] = business_df['review_count'][row_idx]
    if stars == True:
        output_list['stars'] = business_df['stars'][row_idx] 
    return output_list

def get_businessid_from_Itemindex(ItemIndex_list, itemindex):
    return ItemIndex_list['business_id'].tolist()[itemindex]


In [61]:
business_df = get_business_df()

In [62]:
get_businessid_from_Itemindex(ItemIndex, 8010)

'l_uAw0K2lkOsyVJATcnwsA'

In [63]:
get_restaurant_info(business_df, 'l_uAw0K2lkOsyVJATcnwsA')

{'name': 'Spicy Mafia', 'review_count': 9, 'stars': 3.0}

In [64]:
rtrain

<2473x10282 sparse matrix of type '<type 'numpy.float32'>'
	with 102741 stored elements in Compressed Sparse Row format>

## Find specific Restaurant

In [65]:
x = ['sushi on bloor' in business_df['name'][i].lower() for i in range(len(business_df))]
for i in np.array(x).nonzero():
    print business_df['name'][i]

35611     Sushi On Bloor
114653    Sushi On Bloor
Name: name, dtype: object


In [296]:
x = ['mafia' in business_df['name'][i].lower() for i in range(len(business_df))]
for i in np.array(x).nonzero():
    print business_df['name'][i]

7031                       Spicy Mafia
28409                 Caricature Mafia
51959               Mafia Mike's Pizza
113184    Cakefacemafia Brows & Beauty
119974                     Spicy Mafia
140192                    Mafia Mike's
156211                  La'Bella MAFIA
180250              Mafia Mike's PIzza
Name: name, dtype: object


In [486]:
x = ['crown prince' in business_df['name'][i].lower() for i in range(len(business_df))]
for i in np.array(x).nonzero():
    print business_df['name'][i]

74627     Crown Prince Fine Dining & Banquet
106186            Crown Princess Fine Dining
Name: name, dtype: object


In [611]:
x = ['red lobster' in business_df['name'][i].lower() for i in range(len(business_df))]
for i in np.array(x).nonzero():
    print business_df['name'][i]

4501                  Red Lobster
9494                  Red Lobster
9599                  Red Lobster
11793                 Red Lobster
15146                 Red Lobster
21612                 Red Lobster
25103                 Red Lobster
37750                 Red Lobster
39311                 Red Lobster
42883                 Red Lobster
46058                 Red Lobster
46742                 Red Lobster
51418                 Red Lobster
51780                 Red Lobster
52374                 Red Lobster
56591                 Red Lobster
66201                 Red Lobster
68260                 Red Lobster
70318                 Red Lobster
79983                 Red Lobster
105382                Red Lobster
108167                Red Lobster
112307                Red Lobster
115861                Red Lobster
119864                Red Lobster
126004                Red Lobster
134059                Red Lobster
140933                Red Lobster
145037                Red Lobster
154559        

In [599]:
x = ['queen and beaver' in business_df['name'][i].lower() for i in range(len(business_df))]
for i in np.array(x).nonzero():
    print business_df['name'][i]

73128    The Queen And Beaver Public House
Name: name, dtype: object


In [700]:
x = ['miku' in business_df['name'][i].lower() for i in range(len(business_df))]
for i in np.array(x).nonzero():
    print business_df['name'][i]

107476                            Miku
116702    Mikush Home Appliance Center
Name: name, dtype: object


In [143]:
x = ['nando' in business_df['name'][i].lower() for i in range(len(business_df))]
for i in np.array(x).nonzero():
    print business_df['name'][i]

470                              Fernando's
600                         Shenandoah Mill
4672                                 Nandos
8284                        Fernando's Food
18596                  Nando's Mexican Cafe
28152         Nando's Flame Grilled Chicken
39568                   Hernando's Hideaway
51248                               Nando's
57684                   Nando's Chickenland
59320                               Nando's
60928                   Fernando's Hideaway
66834                               Nando's
72563                               Nando's
78219                    Barry Fernando, MD
78984               San Fernando Apartments
85788                         Viva Fernando
88200                               Nando's
100964               Nando Milano Trattoria
110806                   Rene Fernando, DDS
111236              Fernando's Tree Service
122556                      Fernando's Cafe
127748                              Nando's
131533        Nando's Flame Gril

In [136]:
x = ['gyube' in business_df['name'][i].lower() for i in range(len(business_df))]
for i in np.array(x).nonzero():
    print business_df['name'][i]

64440    Gyubee Japanese BBQ - Downtown
65285     Gyubee Japanese BBQ - Markham
Name: name, dtype: object


## If the restaurant exists in df

In [68]:
def get_itemindex_from_business_id(ItemIndex_list, business_id):
    business_id_list = ItemIndex_list['business_id'].tolist()
    return business_id_list.index(business_id)

In [144]:
business_df['business_id'][4672]

u'l3rDLV3OrQVlLn53q-fYRA'

In [145]:
get_itemindex_from_business_id(ItemIndex, 'l3rDLV3OrQVlLn53q-fYRA')

ValueError: 'l3rDLV3OrQVlLn53q-fYRA' is not in list

In [70]:
len(np.where(df.ItemIndex == 7312)[0])

43

# Compare Prediction with Ground Truth

## Spicy Mafia (fewer reviews Chinese food)

In [749]:
print get_restaurant_info(business_df, 'l_uAw0K2lkOsyVJATcnwsA')
print get_itemindex_from_business_id(ItemIndex, 'l_uAw0K2lkOsyVJATcnwsA')

{'review_count': 9, 'name': 'Spicy Mafia', 'stars': 3.0}
8010


In [783]:
### Predicted TF
test_list = list(map(int, explanation[8010])) 
np.array(keyphrases)[test_list]

array(['noodle', 'wait', 'thai', 'beef', 'tea', 'pork', 'mall', 'chicken',
       'soup', 'spicy', 'quick', 'fresh', 'curry', 'pot', 'meat', 'lunch',
       'busy', 'friendly', 'egg', 'dinner', 'fast', 'fried', 'bar',
       'salad', 'vietnamese', 'milk', 'pop', 'tart', 'fry', 'kimchi',
       'cocktail', 'clean', 'downtown', 'chewy', 'shrimp', 'salt',
       'pasta', 'bubble', 'crispy', 'bubble tea', 'coconut', 'belly',
       'tapioca', 'spring roll', 'milk tea', 'fair', 'tofu', 'wine',
       'cake', 'fish', 'pork belly', 'sour', 'yummy', 'pizza', 'stick',
       'chinese', 'sandwich', 'asian', 'fruit', 'bun', 'bean', 'cheese',
       'attentive', 'rib', 'chili', 'tuna', 'wing', 'crunchy',
       'reasonable', 'creamy'], dtype='|S19')

In [843]:
### Predicted TFIDF
test_list = list(map(int, explanation[8010])) 
np.array(keyphrases)[test_list]

array(['wait', 'fresh', 'mall', 'friendly', 'pot', 'tea', 'chicken',
       'bar', 'cone', 'egg', 'cheese', 'salad', 'beef', 'meat',
       'croissant', 'cocktail', 'sashimi', 'gravy', 'busy', 'quick',
       'plaza', 'congee', 'patty', 'tart', 'english muffin', 'pork',
       'matcha', 'wild boar', 'fast', 'pale ale', 'fried', 'dessert',
       'dinner', 'lunch', 'soup', 'mexico', 'noodle', 'cake', 'clean',
       'salmon', 'dim sum', 'taco', 'octopus', 'smoked', 'brunch',
       'waffle', 'booth', 'steak', 'north york', 'cheesecake', 'spicy',
       'gelato', 'scallop', 'duck', 'pasta', 'cookie', 'general tao',
       'pizza', 'bacon', 'public transit', 'cocktail', 'poutine', 'donut',
       'indian', 'mexican', 'theatre', 'baked', 'bread', 'burger',
       'pork belly'], dtype='|S19')

In [751]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[noodle, wait, thai, beef, tea, pork, mall, chicken, soup, spicy, quick, fresh, curry, pot, meat, lunch, busy, friendly, egg, dinner]


In [844]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[wait, fresh, mall, friendly, pot, tea, chicken, bar, cone, egg, cheese, salad, beef, meat, croissant, cocktail, sashimi, gravy, busy, quick]


In [752]:
### Ground Truth
np.array(keyphrases)[np.argsort(np.ravel(I_K[8010].todense()))[::-1][:70]]

array(['tomato', 'soup', 'meat', 'noodle', 'pork', 'fish', 'egg', 'busy',
       'pot', 'tart', 'lunch', 'tofu', 'juice', 'avocado', 'apple',
       'strawberry', 'pop', 'lettuce', 'miso', 'skewer', 'congee',
       'sashimi', 'scallop', 'cone', 'honey', 'cookie', 'lamb', 'banana',
       'croissant', 'mango', 'bacon', 'octopus', 'espresso', 'olive',
       'donut', 'duck', 'booth', 'matcha', 'calamari', 'latte', 'sausage',
       'fruit', 'cheesecake', 'cocktail', 'oyster', 'bubble', 'belly',
       'toast', 'pancake', 'poutine', 'corn', 'coconut', 'vegan', 'lemon',
       'wrap', 'tuna', 'crepe', 'four', 'tempura', 'sesame', 'patty',
       'kimchi', 'tapioca', 'accept debit', 'squid', 'takeout',
       'downtown', 'vegetarian', 'birthday', 'asian'], dtype='|S19')

In [753]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[np.argsort(np.ravel(I_K[8010].todense()))[::-1][:20]]))

[tomato, soup, meat, noodle, pork, fish, egg, busy, pot, tart, lunch, tofu, juice, avocado, apple, strawberry, pop, lettuce, miso, skewer]


In [784]:
# number of Hit
len(np.where([i in np.array(keyphrases)[np.argsort(np.ravel(I_K[8010].todense()))[::-1][:70]] for i in np.array(keyphrases)[test_list]])[0])

22

## Crown Princess Fine Dining (many reviews Chinese food)

In [754]:
print get_restaurant_info(business_df, 'ovlWOSKVjGecnaPuZLv_OQ')
print get_itemindex_from_business_id(ItemIndex, 'ovlWOSKVjGecnaPuZLv_OQ')

{'review_count': 235, 'name': 'Crown Princess Fine Dining', 'stars': 3.5}
8517


In [785]:
### Predicted TF
test_list = list(map(int, explanation[8517])) 
np.array(keyphrases)[test_list]

array(['wait', 'thai', 'tea', 'mall', 'fresh', 'soup', 'pot', 'noodle',
       'bar', 'chicken', 'beef', 'friendly', 'dumpling', 'dinner',
       'curry', 'spicy', 'egg', 'pork', 'busy', 'tart', 'lunch', 'quick',
       'fried', 'clean', 'chinese', 'cake', 'fish', 'shrimp', 'sushi',
       'rib', 'meat', 'dim sum', 'dessert', 'tuna', 'pancake', 'milk',
       'fast', 'fair', 'crispy', 'stick', 'downtown', 'store',
       'attentive', 'bun', 'steamed', 'tofu', 'salt', 'salad', 'parking',
       'pop', 'coconut', 'seafood', 'wine', 'wing', 'bean', 'sour',
       'window', 'asian', 'creamy', 'music', 'fry', 'wrap', 'beer',
       'markham', 'comfortable', 'ice cream', 'reasonable', 'scallop',
       'yummy', 'bread'], dtype='|S19')

In [845]:
### Predicted TFIDF
test_list = list(map(int, explanation[8517])) 
np.array(keyphrases)[test_list]

array(['mall', 'wait', 'tea', 'fresh', 'pot', 'friendly', 'chicken',
       'tempura', 'salad', 'pork', 'beef', 'noodle', 'meat', 'skewer',
       'cone', 'egg', 'avocado', 'spicy', 'patty', 'tart', 'cocktail',
       'pork bone soup', 'quick', 'busy', 'cheese', 'accept debit',
       'soup', 'balsamic vinegar', 'fried', 'public transit', 'gong cha',
       'dessert', 'lactose intolerant', 'dinner', 'lunch', 'pale ale',
       'wild boar', 'bar', 'alcoholic beverage', 'fast', 'stick', 'thai',
       'cake', 'clean', 'sour', 'burger', 'fish', 'bacon', 'cocktail',
       'fry', 'waffle', 'miso', 'vietnamese', 'chip', 'kimchi', 'lamb',
       'sushi', 'lobster', 'ramen', 'coffee', 'pancake', 'brunch',
       'buffet', 'store', 'bread', 'bubble tea', 'pork belly', 'rib',
       'wrap', 'topped'], dtype='|S19')

In [95]:
### Predicted TFIDF Vader
test_list = list(map(int, explanation[8517])) 
np.array(keyphrases)[test_list]

array(['mall', 'wait', 'amazing service', 'tea', 'fresh', 'pot',
       'friendly', 'stick', 'sour', 'chicken', 'sure everything',
       'pleasant', 'la carnita', 'favourite place', 'balsamic vinegar',
       'hair cut', 'accept debit', 'public transit', 'much fun',
       'great coffee', 'escape room', 'egg', 'great custom service',
       'taste menu', 'super nice', 'awesome place', 'many flavour',
       'great staff', 'nail salon', 'farmer market', 'bang bang',
       'pale ale', 'cheese', 'wild boar', 'gong cha', 'pork bone soup',
       'quick', 'busy', 'cocktail', 'deer garden', 'tart', 'patty',
       'avocado', 'skewer', 'cone', 'tempura', 'spicy', 'meat', 'noodle',
       'beef', 'pork', 'salad', 'vietnamese coffee', 'lactose intolerant',
       'favourit thing', 'alcoholic beverage', 'quick meal',
       'averag price', 'fried', 'bar', 'second chance', 'dessert', 'fast',
       'dry side', 'bit bland', 'soup', 'empty table', 'instant noodle',
       'dinner', 'bad service']

In [756]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[wait, thai, tea, mall, fresh, soup, pot, noodle, bar, chicken, beef, friendly, dumpling, dinner, curry, spicy, egg, pork, busy, tart]


In [846]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[mall, wait, tea, fresh, pot, friendly, chicken, tempura, salad, pork, beef, noodle, meat, skewer, cone, egg, avocado, spicy, patty, tart]


In [96]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[mall, wait, amazing service, tea, fresh, pot, friendly, stick, sour, chicken, sure everything, pleasant, la carnita, favourite place, balsamic vinegar, hair cut, accept debit, public transit, much fun, great coffee]


In [684]:
### Ground Truth
np.array(keyphrases)[np.argsort(np.ravel(I_K[8517].todense()))[::-1][:70]]

array(['dim sum', 'rice', 'tea', 'art', 'chinese', 'dumpling', 'downtown',
       'pork', 'wait', 'shrimp', 'bun', 'pot', 'fried', 'dinner',
       'chicken', 'congee', 'egg', 'cake', 'dessert', 'tart', 'bbq',
       'mall', 'fancy', 'rib', 'lunch', 'scallop', 'fresh', 'stick',
       'noodle', 'pricey', 'busy', 'markham', 'wrap', 'squid', 'clean',
       'duck', 'salt', 'crispy', 'bean', 'octopus', 'yummy', 'fair',
       'attentive', 'tax', 'french', 'brunch', 'soup', 'refill', 'quick',
       'corn', 'steamed', 'greasy', 'fast', 'curry', 'cheaper', 'solid',
       'quiet', 'sesame', 'meat', 'disappointing', 'asian', 'parking',
       'bar', 'baked', 'reasonable', 'crunchy', 'traditional', 'lobster',
       'milk', 'comfortable'], dtype='|S19')

In [757]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[np.argsort(np.ravel(I_K[8517].todense()))[::-1][:20]]))

[dim sum, tea, chinese, dumpling, shrimp, pork, downtown, wait, bun, pot, fried, dinner, chicken, congee, cake, egg, dessert, tart, bbq, mall]


In [786]:
# number of Hit
len(np.where([i in np.array(keyphrases)[np.argsort(np.ravel(I_K[8517].todense()))[::-1][:70]] for i in np.array(keyphrases)[test_list]])[0])

47

## The Queen And Beaver Public House (many reviews dessert place)

In [769]:
print get_restaurant_info(business_df, 'qaNt4vtVdge_S68DVjw5Jg')
print get_itemindex_from_business_id(ItemIndex, 'qaNt4vtVdge_S68DVjw5Jg')

{'review_count': 355, 'name': 'The Queen And Beaver Public House', 'stars': 4.0}
8764


In [787]:
# Predicted TF
test_list = list(map(int, explanation[8764])) 
np.array(keyphrases)[test_list]

array(['mall', 'pot', 'wait', 'bar', 'friendly', 'beer', 'tea', 'fresh',
       'chicken', 'pizza', 'beef', 'salad', 'sandwich', 'busy', 'quick',
       'lunch', 'tart', 'meat', 'egg', 'store', 'dinner', 'cheese',
       'fast', 'pop', 'soup', 'spicy', 'bun', 'pub', 'pork', 'ice cream',
       'bread', 'wing', 'fry', 'dessert', 'tofu', 'cake', 'coffee',
       'crust', 'fried', 'rib', 'stick', 'clean', 'burger', 'downtown',
       'corn', 'roasted', 'cocktail', 'apple', 'tuna', 'bean', 'wine',
       'noodle', 'potato', 'tomato', 'chocolate', 'crispy', 'movie',
       'comfortable', 'bbq', 'shrimp', 'fair', 'helpful', 'thai', 'salt',
       'cozy', 'topped', 'casual', 'chip', 'theatre', 'reasonable'],
      dtype='|S19')

In [847]:
# Predicted TFIDF
test_list = list(map(int, explanation[8764])) 
np.array(keyphrases)[test_list]

array(['mall', 'wait', 'tea', 'pot', 'friendly', 'fresh', 'chicken',
       'salad', 'pork', 'beef', 'egg', 'meat', 'fried', 'tapioca', 'bar',
       'uber eats', 'tart', 'cocktail', 'bubble tea', 'fast', 'quick',
       'soup', 'lunch', 'busy', 'gong cha', 'general tao', 'wild boar',
       'dinner', 'dessert', 'cheese', 'noodle', 'fry', 'vietnamese',
       'cake', 'kimchi', 'clean', 'spicy', 'fried rice', 'sushi',
       'japanese', 'gelato', 'pork bone soup', 'fish', 'matcha',
       'pork belly', 'scallop', 'coffee', 'alcoholic beverage', 'bubble',
       'ramen', 'octopus', 'dietary restriction', 'spring roll', 'congee',
       'lobster', 'poutine', 'skewer', 'store', 'squid', 'tempura',
       'mexican', 'juicy', 'accept debit', 'croissant', 'cone', 'crispy',
       'seafood', 'calamari', 'milk tea', 'rib'], dtype='|S19')

In [788]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[mall, pot, wait, bar, friendly, beer, tea, fresh, chicken, pizza, beef, salad, sandwich, busy, quick, lunch, tart, meat, egg, store]


In [848]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[mall, wait, tea, pot, friendly, fresh, chicken, salad, pork, beef, egg, meat, fried, tapioca, bar, uber eats, tart, cocktail, bubble tea, fast]


In [772]:
np.array(keyphrases)[np.argsort(np.ravel(I_K[8764].todense()))[::-1][:70]]

array(['pub', 'beer', 'dinner', 'pot', 'bar', 'chip', 'burger', 'mall',
       'fish', 'egg', 'cozy', 'tea', 'wait', 'fry', 'fresh', 'beef',
       'brunch', 'cocktail', 'salt', 'cheese', 'bacon', 'stick',
       'comfortable', 'downtown', 'meat', 'friendly', 'chair', 'potato',
       'rib', 'lunch', 'bun', 'bread', 'pork', 'greasy', 'pricey',
       'patty', 'traditional', 'tomato', 'salad', 'wing', 'sausage',
       'quick', 'attentive', 'tart', 'fast', 'toast', 'fancy', 'dark',
       'solid', 'breakfast', 'dessert', 'seasoned', 'crispy', 'washroom',
       'pleasant', 'lemon', 'vegetarian', 'clean', 'latte', 'lamb',
       'gravy', 'refreshing', 'fried', 'reasonable', 'refill', 'french',
       'crowded', 'soup', 'rare', 'bean'], dtype='|S19')

In [773]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[np.argsort(np.ravel(I_K[8764].todense()))[::-1][:20]]))

[pub, beer, dinner, pot, bar, chip, burger, mall, fish, egg, cozy, tea, wait, fry, fresh, beef, brunch, cocktail, salt, cheese]


In [789]:
# number of Hit
len(np.where([i in np.array(keyphrases)[np.argsort(np.ravel(I_K[8764].todense()))[::-1][:70]] for i in np.array(keyphrases)[test_list]])[0])

42

## Red Lobster

In [71]:
print get_restaurant_info(business_df, 'hTdJAjSZtHWwqqh5cCeAfA')
print get_itemindex_from_business_id(ItemIndex, 'hTdJAjSZtHWwqqh5cCeAfA')

{'review_count': 93, 'name': 'Red Lobster', 'stars': 3.0}
7312


In [790]:
# Predicted TF
test_list = list(map(int, explanation[7312])) 
np.array(keyphrases)[test_list]

array(['tea', 'chicken', 'mall', 'wait', 'friendly', 'fresh', 'fast',
       'pot', 'quick', 'milk', 'clean', 'bubble', 'bubble tea', 'busy',
       'rib', 'dessert', 'fry', 'pop', 'cake', 'lunch', 'milk tea', 'egg',
       'tart', 'salad', 'breakfast', 'beef', 'cheese', 'dinner', 'soup',
       'store', 'noodle', 'fried', 'meat', 'tapioca', 'bar', 'spicy',
       'burger', 'fish', 'curry', 'parking', 'steak', 'coffee', 'wing',
       'fruit', 'chinese', 'corn', 'potato', 'bread', 'bacon',
       'chocolate', 'fair', 'tax', 'mango', 'plaza', 'helpful',
       'attentive', 'bean', 'crispy', 'toast', 'stick', 'bun',
       'reasonable', 'lemon', 'sour', 'juicy', 'asian', 'salt',
       'comfortable', 'markham', 'pleasant'], dtype='|S19')

In [72]:
# Predicted TF with Vader
test_list = list(map(int, explanation[7312])) 
np.array(keyphrases)[test_list]

array(['tea', 'chicken', 'mall', 'wait', 'friendly', 'fresh', 'fast',
       'pot', 'quick', 'milk', 'clean', 'bubble', 'bubble tea', 'busy',
       'rib', 'dessert', 'fry', 'pop', 'cake', 'lunch', 'milk tea', 'egg',
       'tart', 'salad', 'breakfast', 'beef', 'cheese', 'dinner', 'soup',
       'store', 'noodle', 'fried', 'meat', 'tapioca', 'bar', 'spicy',
       'burger', 'fish', 'curry', 'parking', 'steak', 'coffee', 'wing',
       'fruit', 'chinese', 'corn', 'potato', 'bread', 'bacon',
       'chocolate', 'fair', 'tax', 'mango', 'plaza', 'helpful',
       'attentive', 'bean', 'crispy', 'toast', 'stick', 'bun',
       'reasonable', 'lemon', 'sour', 'juicy', 'asian', 'salt',
       'comfortable', 'markham', 'pleasant'], dtype='|S20')

In [849]:
# Predicted TFIDF
test_list = list(map(int, explanation[7312])) 
np.array(keyphrases)[test_list]

array(['wait', 'mall', 'tea', 'fresh', 'pot', 'friendly', 'chicken',
       'cheese', 'salad', 'pork', 'beef', 'meat', 'croissant', 'cocktail',
       'tart', 'cake', 'english muffin', 'pork belly', 'pork bone soup',
       'quick', 'clean', 'busy', 'egg', 'accept debit', 'soup',
       'pale ale', 'public transit', 'financial district', 'wild boar',
       'dinner', 'lunch', 'balsamic vinegar', 'dessert', 'mexico',
       'fried', 'uber eats', 'general tao', 'bar', 'grand opening',
       'fast', 'noodle', 'pleasant', 'spicy', 'cocktail', 'sashimi',
       'fry', 'fluffy', 'dim sum', 'oyster', 'tempura', 'fish', 'duck',
       'congee', 'italian', 'scallop', 'soggy', 'octopus', 'smoked',
       'ramen', 'tofu', 'beer', 'sushi', 'taco', 'wine', 'classic',
       'cookie', 'cone', 'pasta', 'rib', 'japanese'], dtype='|S19')

In [91]:
# Predicted TFIDF with Vader
test_list = list(map(int, explanation[7312])) 
np.array(keyphrases)[test_list]

array(['mall', 'wait', 'tea', 'fresh', 'ok nothing', 'pot', 'friendly',
       'chicken', 'pleasant', 'little gem', 'escape room',
       'super helpful', 'great coffee', 'pork bone soup',
       'favourite place', 'great quality', 'mexico', 'taste menu',
       'great custom service', 'la carnita', 'soup', 'amazing food',
       'lunch', 'many flavour', 'croissant', 'farmer market', 'bang bang',
       'sure everything', 'perfect place', 'sushi bar', 'quick',
       'amazing experience', 'wonderful experience',
       'favourite restaurant', 'cake', 'egg', 'cheese', 'deer garden',
       'salad', 'pork', 'beef', 'meat', 'general tao', 'wild boar',
       'financial district', 'pale ale', 'public transit',
       'balsamic vinegar', 'uber eats', 'busy', 'grand opening', 'clean',
       'english muffin', 'accept debit', 'dinner', 'favourit thing',
       'pork belly', 'vietnamese coffee', 'second chance', 'tart', 'fast',
       'instant noodle', 'cocktail', 'high side', 'averag price',


In [791]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[tea, chicken, mall, wait, friendly, fresh, fast, pot, quick, milk, clean, bubble, bubble tea, busy, rib, dessert, fry, pop, cake, lunch]


In [850]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[wait, mall, tea, fresh, pot, friendly, chicken, cheese, salad, pork, beef, meat, croissant, cocktail, tart, cake, english muffin, pork belly, pork bone soup, quick]


In [92]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[mall, wait, tea, fresh, ok nothing, pot, friendly, chicken, pleasant, little gem, escape room, super helpful, great coffee, pork bone soup, favourite place, great quality, mexico, taste menu, great custom service, la carnita]


In [776]:
# Ground Truth
np.array(keyphrases)[np.argsort(np.ravel(I_K[7312].todense()))[::-1][:70]]

array(['lobster', 'wait', 'seafood', 'shrimp', 'friendly', 'fresh',
       'mall', 'fish', 'dinner', 'pot', 'potato', 'tea', 'salad',
       'dessert', 'fried', 'lunch', 'crispy', 'refill', 'dip', 'bar',
       'stuffed', 'quick', 'busy', 'four', 'pasta', 'bread', 'soup',
       'coconut', 'tart', 'baked', 'fry', 'chocolate', 'chip', 'scallop',
       'quiet', 'bun', 'cheese', 'chicken', 'parking', 'rib', 'tuna',
       'creamy', 'greasy', 'salt', 'casual', 'classic', 'immediately',
       'roasted', 'cheaper', 'pizza', 'crowded', 'beer', 'wing', 'cake',
       'topped', 'cheesecake', 'latte', 'strawberry', 'steamed',
       'markham', 'pop', 'meat', 'juicy', 'ice cream', 'tomato', 'cookie',
       'deep fried', 'salmon', 'steak', 'dog'], dtype='|S19')

In [792]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[np.argsort(np.ravel(I_K[7312].todense()))[::-1][:20]]))

[lobster, wait, seafood, shrimp, friendly, fresh, mall, fish, dinner, pot, potato, tea, salad, dessert, fried, lunch, crispy, refill, dip, bar]


In [793]:
# number of Hits 
len(np.where([i in np.array(keyphrases)[np.argsort(np.ravel(I_K[7312].todense()))[::-1][:70]] for i in np.array(keyphrases)[test_list]])[0])

35

## Miku

In [794]:
print get_restaurant_info(business_df, '0a2O150ytxrDjDzXNfRWkA')
print get_itemindex_from_business_id(ItemIndex, '0a2O150ytxrDjDzXNfRWkA')

{'review_count': 604, 'name': 'Miku', 'stars': 4.0}
273


In [799]:
# Predicted TF
test_list = list(map(int, explanation[273])) 
np.array(keyphrases)[test_list]

array(['wait', 'mall', 'tea', 'fresh', 'cake', 'pot', 'egg', 'meat',
       'pork', 'dessert', 'chicken', 'friendly', 'tart', 'dinner',
       'cheese', 'soup', 'busy', 'ice cream', 'bar', 'fish', 'noodle',
       'quick', 'fried', 'beef', 'lunch', 'spicy', 'salad', 'pop', 'salt',
       'sushi', 'matcha', 'japanese', 'rib', 'shrimp', 'fast', 'salmon',
       'ramen', 'tuna', 'corn', 'creamy', 'crispy', 'seafood', 'coffee',
       'milk', 'latte', 'wing', 'bean', 'oyster', 'waffle', 'clean',
       'bbq', 'lobster', 'brunch', 'fair', 'pricey', 'parking', 'belly',
       'cheesecake', 'attentive', 'sashimi', 'green tea', 'chocolate',
       'cocktail', 'fry', 'pancake', 'yummy', 'bread', 'pork belly',
       'curry', 'potato'], dtype='|S19')

In [75]:
# Predicted TF with vader
test_list = list(map(int, explanation[273])) 
np.array(keyphrases)[test_list]

array(['wait', 'mall', 'tea', 'fresh', 'cake', 'pot', 'egg', 'meat',
       'pork', 'dessert', 'chicken', 'friendly', 'tart', 'dinner',
       'cheese', 'soup', 'busy', 'ice cream', 'bar', 'fish', 'noodle',
       'quick', 'fried', 'beef', 'lunch', 'spicy', 'salad', 'pop', 'salt',
       'sushi', 'matcha', 'japanese', 'rib', 'shrimp', 'fast', 'salmon',
       'ramen', 'tuna', 'corn', 'creamy', 'crispy', 'seafood', 'coffee',
       'milk', 'latte', 'wing', 'bean', 'oyster', 'waffle', 'clean',
       'bbq', 'lobster', 'brunch', 'fair', 'pricey', 'parking', 'belly',
       'cheesecake', 'attentive', 'sashimi', 'green tea', 'chocolate',
       'cocktail', 'fry', 'pancake', 'yummy', 'bread', 'pork belly',
       'curry', 'potato'], dtype='|S20')

In [851]:
# Predicted TFIDF
test_list = list(map(int, explanation[273])) 
np.array(keyphrases)[test_list]

array(['mall', 'wait', 'tea', 'fresh', 'friendly', 'pot', 'chicken',
       'bar', 'tart', 'quick', 'meat', 'beef', 'salad', 'cheese', 'egg',
       'busy', 'cocktail', 'gong cha', 'fast', 'fried', 'lunch', 'dinner',
       'soup', 'dessert', 'pork', 'cake', 'noodle', 'spicy', 'pale ale',
       'clean', 'fry', 'wild boar', 'alcoholic beverage', 'general tao',
       'uber eats', 'fish', 'pork bone soup', 'mexico', 'english muffin',
       'poutine', 'burger', 'mexican', 'dim sum', 'gravy', 'coffee',
       'accept debit', 'store', 'bread', 'congee', 'buffet', 'cocktail',
       'rib', 'sandwich', 'gelato', 'brunch', 'vietnamese', 'skewer',
       'sashimi', 'balsamic vinegar', 'croissant', 'bakery', 'tempura',
       'pizza', 'curry', 'indian', 'beer', 'sushi', 'spring roll',
       'donut', 'olive'], dtype='|S19')

In [93]:
# Predicted TFIDF Vader
test_list = list(map(int, explanation[273])) 
np.array(keyphrases)[test_list]

array(['mall', 'wait', 'tea', 'friendly', 'fresh', 'pot', 'chicken',
       'bar', 'favourite place', 'averag price', 'meat',
       'great custom service', 'beef', 'tart', 'cheese', 'egg',
       'nail salon', 'favourit thing', 'salad', 'gong cha', 'fast',
       'cocktail', 'busy', 'soup', 'lunch', 'dinner', 'fried', 'quick',
       'pork', 'dessert', 'escape room', 'cake', 'terrible service',
       'noodle', 'hair cut', 'deer garden', 'board game', 'spicy',
       'pale ale', 'asian legend', 'clean', 'la carnita', 'farmer market',
       'wild boar', 'fry', 'alcoholic beverage', 'general tao', 'fish',
       'pork bone soup', 'uber eats', 'dry side', 'english muffin',
       'dim sum place', 'mexico', 'poutine', 'burger', 'mexican', 'gravy',
       'dim sum', 'swiss chalet', 'vietnamese coffee', 'bread', 'coffee',
       'store', 'congee', 'buffet', 'accept debit', 'skewer', 'bad day',
       'cocktail'], dtype='|S20')

In [800]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[wait, mall, tea, fresh, cake, pot, egg, meat, pork, dessert, chicken, friendly, tart, dinner, cheese, soup, busy, ice cream, bar, fish]


In [852]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[mall, wait, tea, fresh, friendly, pot, chicken, bar, tart, quick, meat, beef, salad, cheese, egg, busy, cocktail, gong cha, fast, fried]


In [94]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[mall, wait, tea, friendly, fresh, pot, chicken, bar, favourite place, averag price, meat, great custom service, beef, tart, cheese, egg, nail salon, favourit thing, salad, gong cha]


In [797]:
# Ground Truth
np.array(keyphrases)[np.argsort(np.ravel(I_K[273].todense()))[::-1][:70]]

array(['sushi', 'dessert', 'salmon', 'fish', 'tea', 'fresh', 'sashimi',
       'green tea', 'cake', 'dinner', 'japanese', 'matcha', 'tart',
       'lunch', 'ice cream', 'tuna', 'chocolate', 'wait', 'scallop',
       'mall', 'miso', 'salad', 'beef', 'latte', 'birthday', 'seafood',
       'friendly', 'attentive', 'pot', 'rib', 'tofu', 'bar', 'shrimp',
       'oyster', 'modern', 'bean', 'egg', 'pricey', 'cocktail', 'clean',
       'nicely', 'fried', 'lobster', 'sesame', 'baked', 'quick', 'wine',
       'salt', 'creamy', 'busy', 'spicy', 'traditional', 'calamari',
       'dark', 'pop', 'meat', 'crunchy', 'seasoned', 'four', 'spacious',
       'squid', 'fruit', 'stick', 'crispy', 'bacon', 'topped', 'bread',
       'potato', 'tomato', 'pleasant'], dtype='|S19')

In [798]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[np.argsort(np.ravel(I_K[273].todense()))[::-1][:20]]))

[sushi, dessert, salmon, fish, tea, fresh, sashimi, green tea, cake, dinner, japanese, matcha, tart, lunch, ice cream, tuna, chocolate, wait, scallop, mall]


In [801]:
# number of Hits 
len(np.where([i in np.array(keyphrases)[np.argsort(np.ravel(I_K[273].todense()))[::-1][:70]] for i in np.array(keyphrases)[test_list]])[0])

47

# Nando's

In [117]:
print get_restaurant_info(business_df, '0a2O150ytxrDjDzXNfRWkA')
print get_itemindex_from_business_id(ItemIndex, '0a2O150ytxrDjDzXNfRWkA')

{'review_count': 604, 'name': 'Miku', 'stars': 4.0}
273


In [118]:
# Predicted TF-IDF Vader
test_list = list(map(int, explanation[273])) 
np.array(keyphrases)[test_list]

array(['mall', 'wait', 'tea', 'friendly', 'fresh', 'pot', 'chicken',
       'bar', 'favourite place', 'averag price', 'meat',
       'great custom service', 'beef', 'tart', 'cheese', 'egg',
       'nail salon', 'favourit thing', 'salad', 'gong cha', 'fast',
       'cocktail', 'busy', 'soup', 'lunch', 'dinner', 'fried', 'quick',
       'pork', 'dessert', 'escape room', 'cake', 'terrible service',
       'noodle', 'hair cut', 'deer garden', 'board game', 'spicy',
       'pale ale', 'asian legend', 'clean', 'la carnita', 'farmer market',
       'wild boar', 'fry', 'alcoholic beverage', 'general tao', 'fish',
       'pork bone soup', 'uber eats', 'dry side', 'english muffin',
       'dim sum place', 'mexico', 'poutine', 'burger', 'mexican', 'gravy',
       'dim sum', 'swiss chalet', 'vietnamese coffee', 'bread', 'coffee',
       'store', 'congee', 'buffet', 'accept debit', 'skewer', 'bad day',
       'cocktail'], dtype='|S20')

In [120]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[mall, wait, tea, friendly, fresh, pot, chicken, bar, favourite place, averag price, meat, great custom service, beef, tart, cheese, egg, nail salon, favourit thing, salad, gong cha]


In [119]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[np.argsort(np.ravel(I_K[273].todense()))[::-1][:20]]))

[wait, chicken, mall, friendly, fresh, pot, tea, la carnita, cheese, cake, egg, fish, burger, gelato, salad, lactose intolerant, beef, meat, chili, refreshing]


# GYUBEE

In [139]:
# 8745
print get_restaurant_info(business_df, 'qR4EIktJfQKc4rnKgBzvtw')
print get_itemindex_from_business_id(ItemIndex, 'qR4EIktJfQKc4rnKgBzvtw')

{'review_count': 106, 'name': 'Gyubee Japanese BBQ - Downtown', 'stars': 4.0}
8745


In [140]:
# Predicted TF-IDF Vader
test_list = list(map(int, explanation[8745])) 
np.array(keyphrases)[test_list]

array(['mall', 'wait', 'fresh', 'tea', 'pot', 'chicken', 'friendly',
       'la carnita', 'favourite place', 'public transit',
       'balsamic vinegar', 'hair cut', 'escape room', 'wild boar',
       'great coffee', 'great custom service', 'favourit thing', 'mexico',
       'pale ale', 'cheese', 'general tao', 'egg', 'lunch', 'salad',
       'beef', 'meat', 'congee', 'tart', 'cocktail', 'busy', 'quick',
       'soup', 'english muffin', 'fast', 'bad taste', 'averag price',
       'bar', 'dessert', 'dinner', 'plus side', 'fried', 'spicy', 'pork',
       'noodle', 'cake', 'gelato', 'clean', 'fry', 'filet mignon',
       'patty', 'bang bang', 'swiss chalet', 'terrible service',
       'nail salon', 'second chance', 'buffet', 'foie gras',
       'great recommend', 'real deal', 'thai', 'coffee', 'separate bill',
       'pizza', 'dim sum', 'extra star', 'mediocre food', 'quick meal',
       'croissant', 'bad day', 'sushi bar'], dtype='|S20')

In [142]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[test_list][:20]))

[mall, wait, fresh, tea, pot, chicken, friendly, la carnita, favourite place, public transit, balsamic vinegar, hair cut, escape room, wild boar, great coffee, great custom service, favourit thing, mexico, pale ale, cheese]


In [141]:
print '[%s]' % ', '.join(map(str, np.array(keyphrases)[np.argsort(np.ravel(I_K[8745].todense()))[::-1][:20]]))

[pot, fresh, mall, wait, tea, chicken, friendly, milk, fried chicken, milk tea, green tea, spring roll, bubble tea, dumpling, fried rice, pork bone soup, parking, shopping, pasta, la carnita]


# TF-IDF

## User-based TF-IDF

In [80]:
def df(U_K):
    """
    Get the df for keyphrase matrix
    """
    DF = {}
    for i in range(U_K.shape[0]):
        vector_u = np.ravel(U_K[i].todense())
        for keyword_idx in range(U_K.shape[1]):
            keyword_frequency = vector_u[keyword_idx]
            if keyword_idx in DF:
                DF[keyword_idx] += keyword_frequency
            else:
                DF[keyword_idx] = keyword_frequency
    return DF

def keyword_popularity_matrix(U_K,df_uk):
    """
    return the U_K normalized by keyphrase frequency
    """
    df = normalize(np.array(df_uk.values()).reshape(1,-1))[0]
    for i in tqdm(range(U_K.shape[0])):
        U_K[i] = U_K[i]/df
#         vector_u = np.ravel(U_K[i].todense())
#         for entry in range(U_K.shape[1]):
#             U_K[i,entry] = U_K[i,entry]/df[entry]
    return normalize(U_K)
    
def tf_idf(U_K,df_uk):
    """
    Change the U_K/I_K keyphrase matrix to TF-IDF.
    where:
    IDF = log(N/(DF+1))
    TF_IDF = (1+log(TF))* IDF
    """
    for i in tqdm(range(U_K.shape[0])):
        vector_u = np.ravel(U_K[i].todense())
        words_count = np.sum(vector_u)
        
        for entry in range(U_K.shape[1]):
            if U_K[i,entry] == 0:
                continue
            else:
                tf = U_K[i,entry]/words_count
                df = df_uk[entry]
                idf = np.log(U_K.shape[0]/(df+1))
                tf_idf = (1+ np.log(tf))*idf
                U_K[i,entry] = tf_idf
    return normalize(U_K)

In [81]:
# Load 
U_K = load_npz('../../data/yelp/U_K.npz')
I_K = load_npz('../../data/yelp/I_K.npz')
# U_K_test = load_npz('../../data/yelp/U_K_test.npz')
# I_K_test = load_npz('../../data/yelp/I_K_test.npz')

In [82]:
df_uk = df(U_K)

In [83]:
U_K_tfidf = tf_idf(U_K, df_uk)

100%|██████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:56<00:00, 43.61it/s]


In [85]:
similarity = train(rtrain)
explanation_scores = predict(U_K_tfidf, 100, similarity)
# explanation =  predict_explanation(explanation_scores)
explanation = explain_prediction(explanation_scores, 70, U_K_tfidf)

100%|████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:01<00:00, 1637.75it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:00<00:00, 7168.12it/s]


In [836]:
explain_evaluate(explanation, U_K_tfidf, atK=[5,10,20,50])

100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:04<00:00, 500.20it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:05<00:00, 456.44it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:05<00:00, 458.73it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:05<00:00, 492.53it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:05<00:00, 478.34it/s]


{'MAP@10': (0.8037531863150353, 0.0025584264470937825),
 'MAP@20': (0.8887167514060423, 0.0006770233228284741),
 'MAP@5': (0.6919010416666667, 0.006329136104857982),
 'MAP@50': (0.8761517727900731, 0.00023810741276318617),
 'NDCG': (0.862984800655276, 0.00015005118952633647),
 'Precision@10': (0.6789496527777779, 0.0016965583911800802),
 'Precision@20': (0.756814236111111, 0.0010347377179239256),
 'Precision@5': (0.6246527777777777, 0.0041900668251206455),
 'Precision@50': (0.7324131944444444, 0.0005940355445510568),
 'R-Precision': (0.7324131944444444, 0.0005940355445510568),
 'Recall@10': (0.6789496527777779, 0.0016965583911800802),
 'Recall@20': (0.756814236111111, 0.0010347377179239256),
 'Recall@5': (0.6246527777777777, 0.0041900668251206455),
 'Recall@50': (0.7324131944444444, 0.0005940355445510568)}

In [86]:
# With Vader keyphrases
explain_evaluate(explanation, U_K_tfidf, atK=[5,10,20,50])

100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:07<00:00, 351.43it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:07<00:00, 349.74it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:07<00:00, 347.72it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:07<00:00, 347.14it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:07<00:00, 344.91it/s]


{'MAP@10': (0.7252943466159611, 0.0034826018579463785),
 'MAP@20': (0.8975108855170579, 0.0006411193458138319),
 'MAP@5': (0.7052025462962963, 0.006176983458372947),
 'MAP@50': (0.8379645827078837, 0.0002830322565324534),
 'NDCG': (0.838787940381189, 0.00017489650280682394),
 'Precision@10': (0.6920138888888889, 0.0019660558509479948),
 'Precision@20': (0.7953993055555555, 0.000999342626346402),
 'Precision@5': (0.6313368055555555, 0.004201135758998467),
 'Precision@50': (0.6735590277777778, 0.0005995899582526865),
 'R-Precision': (0.6735590277777778, 0.0005995899582526865),
 'Recall@10': (0.6920138888888889, 0.0019660558509479948),
 'Recall@20': (0.7953993055555555, 0.000999342626346402),
 'Recall@5': (0.6313368055555555, 0.004201135758998467),
 'Recall@50': (0.6735590277777778, 0.0005995899582526865)}

## Item-based TF-IDF

In [87]:
df_ik = df(I_K)

In [88]:
I_K_tfidf = tf_idf(I_K,df_ik)

100%|████████████████████████████████████████████████████████████████████████████| 10282/10282 [02:25<00:00, 70.47it/s]


In [89]:
similarity = train(np.transpose(rtrain))
explanation_scores = predict(I_K_tfidf, 100, similarity)
# explanation =  predict_explanation(explanation_scores)
explanation = explain_prediction(explanation_scores, 70, I_K_tfidf)

100%|██████████████████████████████████████████████████████████████████████████| 10282/10282 [00:06<00:00, 1651.20it/s]
100%|██████████████████████████████████████████████████████████████████████████| 10282/10282 [00:01<00:00, 7472.38it/s]


In [842]:
explain_evaluate(explanation, I_K_tfidf, atK=[5,10,20,50])

100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [01:08<00:00, 150.27it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [01:08<00:00, 149.16it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [01:07<00:00, 152.02it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [01:07<00:00, 152.07it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [01:07<00:00, 151.56it/s]


{'MAP@10': (0.35213217629153576, 0.0010968833246985527),
 'MAP@20': (0.27081058935900426, 0.0010039121441790747),
 'MAP@5': (0.4431528783835303, 0.002145923498719915),
 'MAP@50': (0.27487874661333556, 0.0015240019863223988),
 'NDCG': (0.31342782756537074, 0.001028350554491715),
 'Precision@10': (0.20875587749396368, 0.000586581336588827),
 'Precision@20': (0.16421400432075234, 0.0015456733985390193),
 'Precision@5': (0.356157072054899, 0.0016005530998861061),
 'Precision@50': (0.2102732240437159, 0.0013700342110130942),
 'R-Precision': (0.2102732240437159, 0.0013700342110130942),
 'Recall@10': (0.20875587749396368, 0.000586581336588827),
 'Recall@20': (0.16421400432075234, 0.0015456733985390193),
 'Recall@5': (0.356157072054899, 0.0016005530998861061),
 'Recall@50': (0.2102732240437159, 0.0013700342110130942)}

In [90]:
# With Vader
explain_evaluate(explanation, I_K_tfidf, atK=[5,10,20,50])

100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [01:38<00:00, 104.87it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [01:37<00:00, 105.14it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [01:38<00:00, 104.65it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [01:38<00:00, 104.45it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10282/10282 [01:38<00:00, 104.81it/s]


{'MAP@10': (0.3507409410095894, 0.0011045160159365245),
 'MAP@20': (0.24377436981475908, 0.0005716380060796823),
 'MAP@5': (0.4456696208764156, 0.002164483417861961),
 'MAP@50': (0.23215810481762375, 0.0007119926667678546),
 'NDCG': (0.22997626464891138, 0.0007149598007652743),
 'Precision@10': (0.20089857213195475, 0.00019460249576673538),
 'Precision@20': (0.10384662727720337, 0.0003505696725099028),
 'Precision@5': (0.3505169867060562, 0.0016703656098363416),
 'Precision@50': (0.1551649433776465, 0.0007062145480243385),
 'R-Precision': (0.1551649433776465, 0.0007062145480243385),
 'Recall@10': (0.20089857213195475, 0.00019460249576673538),
 'Recall@20': (0.10384662727720337, 0.0003505696725099028),
 'Recall@5': (0.3505169867060562, 0.0016703656098363416),
 'Recall@50': (0.1551649433776465, 0.0007062145480243385)}

# Create Synthetic User

In [35]:
# Get top_items items associated with keyphrase_ids 
def item_associated_with_keyphrase(I_K, keyphrase_ids, top_items = 100):
    """
    I_K: Item Keyphrase Matrix
    Keyphrase_ids: top items described by keyphrase_ids
    top_n: how many top items described by the keyphrases will be output 
    output item list (unique)
    """
    res = []
    for keyphrase_id in keyphrase_ids:
        res.append(np.argsort(np.ravel(I_K.todense()[:,keyphrase_id]))[::-1][:top_items])
    return np.unique(res)

# Modify U_U latent Space from U_I
def modify_user_preference(U_I, items, user_id = 0):
    """
    TODO: Fix the function s.t. it will not modify the initial matrix
    """
    U_I[user_id,:] = 0
    for i in items:
        U_I[0,i] = 1
    return U_I

def clear_user_keyphrase(U_K, user_id = 0):
    U_K[user_id,:] = 0

def explain_synthetic_user(rtrain, U_K, I_K, keyphrases, top_keyphrase = 20, user_id = 0, k = 100, top_items = 100, **Not_used):
    """
    
    """
    items = item_associated_with_keyphrase(I_K, keyphrases, top_items = top_items) # 8 is coffee
    U_I = modify_user_preference(rtrain, items, user_id=user_id)
    modified_U_U = train(U_I)
    U_K[user_id, :] = 0
    synthetic_user_keyphrase = normalize(predict(U_K, k, modified_U_U))[user_id]
    return np.argsort(synthetic_user_keyphrase)[::-1][:top_keyphrase]

def modify_user_keyphrase(U_K, keyphrase_ids, normalization = True, keyval = 1, user_id = 0, **Not_Used):
    """
    Change all keyphrase_ids to some fixed number, all others to 0
    Return the U_K matrix with user_id row the synthetic user1
    """
    U_K[user_id,:] = 0
    for keyphrase_id in keyphrase_ids:
        U_K[user_id,keyphrase_id] = keyval
    if normalization == True:
        return normalize(U_K)
    return U_K

In [36]:
# Modify user preference matrix
items = item_associated_with_keyphrase(I_K, [0], top_items = 200) # 'chinese'
U_I = modify_user_preference(rtrain, items, user_id = 0)



In [37]:
# make synthetic user1's keyphrase preference all 0
clear_user_keyphrase(U_K, user_id = 0)



In [42]:
U_K

<2343x233 sparse matrix of type '<type 'numpy.int32'>'
	with 298309 stored elements in Compressed Sparse Row format>

In [43]:
I_K

<7456x233 sparse matrix of type '<type 'numpy.int32'>'
	with 351924 stored elements in Compressed Sparse Row format>

In [44]:
# Get latent user similarity embedding
modified_U_U = train(U_I)
# predict
explanation_scores1 = predict(U_K, 100, modified_U_U)
explanation1 =  predict_pilot_explanation(explanation_scores1, top_keyphrase = 10)

100%|████████████████████████████████████████████████████████████████████████████| 2343/2343 [00:01<00:00, 2151.52it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2343/2343 [00:00<00:00, 97624.32it/s]


In [45]:
explanation1[0]

array([ 50,  60, 155, 205,  57, 160, 197,  61, 164, 226], dtype=int64)

In [65]:
def rank_in_prediction(rtrain, U_K, I_K, top_items = 200, keyphrase = 0,user_i = 0):
    """
    Get the rank for user_i with keyphrase
    TODO: modify so that no need to reload U_K,I_K
    """  
    U_K = load_npz('../../data/yelp/U_K.npz')
    U_K = normalize(U_K)
    rtrain = load_npz("../../data/yelp/Rtrain.npz")
    
    # Modify user preference matrix
    items = item_associated_with_keyphrase(I_K, keyphrase, top_items = top_items) # 'raspberry'
    U_I = modify_user_preference(rtrain, items, user_id = 0)
    
    # make synthetic user1's keyphrase preference all 0
    clear_user_keyphrase(U_K, user_id = 0)
    
    # Get latent user similarity embedding
    modified_U_U = train(U_I)
    # predict
    explanation_scores1 = predict(U_K, 100, modified_U_U)
    explanation1 =  predict_pilot_explanation(explanation_scores1, top_keyphrase = 230)
    return list(explanation1[user_i]).index(keyphrase[0])
    
def evaluate_pilot_test(rtrain,U_K, I_K,keyphrase_list, top_items = 200, user_i = 0):
    # Get the average rank for user_i with keyphrase  
    res1 = 0
    for i in range(75):
        a = rank_in_prediction(rtrain, U_K, I_K, top_items = top_items, keyphrase = [i],user_i = user_i)
        print "keyphrase", keyphrase_list[i], "'s rank is ", a
        res1+= a
    return res1/75

In [None]:
evaluate_pilot_test(rtrain,U_K,I_K,keyphrases)

# Create Synthetic item

In [73]:
# Get top_users items associated with keyphrase_ids 
def users_with_keyphrase_preference(U_K, keyphrase_ids, top_users = 100, norm = True):
    """
    U_K: User Keyphrase Matrix
    Keyphrase_ids: top_users who like keyphrase_ids
    output item list (unique)
    """
    res = []
    if norm:
        U_K = normalize(U_K)
    try:
        for keyphrase_id in keyphrase_ids:
            res.append(np.argsort(np.ravel(U_K.todense()[:,keyphrase_id]))[::-1][:top_users])
    except:
        return np.argsort(np.ravel(U_K.todense()[:,keyphrase_ids]))[::-1][:top_users]
    return np.unique(res)

# Modify I_I latent Space from U_I
def modify_item_history(U_I, users, item_id = 0):
    """
    TODO: Fix the function s.t. it will not modify the initial matrix
    """
    U_I[:,item_id] = 0
    for i in users:
        U_I[i, item_id] = 1
    return normalize(U_I)

def clear_item_keyphrase(I_K, item_id = 0):
    I_K[item_id, :] = 0

def explain_synthetic_item(rtrain, U_K, I_K, keyphrases, top_keyphrase = 20, user_id = 0, k = 100, top_users = 100, **Not_used):
    """
    
    """
    items = item_associated_with_keyphrase(I_K, keyphrases, top_items = top_items) # 8 is coffee
    U_I = modify_user_preference(rtrain, items, user_id=user_id)
    modified_U_U = train(U_I)
    U_K[user_id, :] = 0
    synthetic_user_keyphrase = normalize(predict(U_K, k, modified_U_U))[user_id]
    return np.argsort(synthetic_user_keyphrase)[::-1][:top_keyphrase]

def modify_user_keyphrase(U_K, keyphrase_ids, normalization = True, keyval = 1, user_id = 0, **Not_Used):
    """
    Change all keyphrase_ids to some fixed number, all others to 0
    Return the U_K matrix with user_id row the synthetic user1
    """
    U_K[user_id,:] = 0
    for keyphrase_id in keyphrase_ids:
        U_K[user_id,keyphrase_id] = keyval
    if normalization == True:
        return normalize(U_K)
    return U_K

## Prediction Pipeline

In [67]:
U_K = load_npz('../../data/yelp/U_K.npz')
I_K = load_npz('../../data/yelp/I_K.npz')
U_K = normalize(U_K)
I_K = normalize(I_K)
rtrain = load_npz("../../data/yelp/Rtrain.npz")

In [74]:
# Modify user preference matrix
users = users_with_keyphrase_preference(U_K, 70, top_users = 100, norm = True) # 'raspberry'
U_I = modify_item_history(rtrain, users, item_id = 0)



In [75]:
# make synthetic item1's keyphrase to all 0
clear_item_keyphrase(I_K, item_id = 0)

In [76]:
# Get latent item similarity embedding
modified_I_I = train(np.transpose(U_I))
# predict
explanation_scores1 = predict(I_K, 100, modified_I_I)
explanation1 =  predict_pilot_explanation(explanation_scores1, top_keyphrase = 70)

100%|████████████████████████████████████████████████████████████████████████████| 7456/7456 [00:03<00:00, 2184.59it/s]
100%|██████████████████████████████████████████████████████████████████████████| 7456/7456 [00:00<00:00, 112969.72it/s]


In [77]:
def rank_in_prediction_item(rtrain, U_K, I_K, top_users = 200, keyphrase = 70, item_i = 0):
    """
    Get the rank for user_i with keyphrase
    TODO: modify so that no need to reload U_K,I_K
    """  
    U_K = load_npz('../../data/yelp/U_K.npz')
    I_K = load_npz('../../data/yelp/I_K.npz')
    U_K = normalize(U_K)
    I_K = normalize(I_K)
    rtrain = load_npz("../../data/yelp/Rtrain.npz")
    
    # Modify user preference matrix
    users = users_with_keyphrase_preference(U_K, keyphrase, top_users = top_users, norm = True) # 'raspberry'
    U_I = modify_item_history(rtrain, users, item_id = item_i)
    
    # make synthetic item1's keyphrase to all 0
    clear_item_keyphrase(I_K, item_id = item_i)
    
    # Get latent item similarity embedding
    modified_I_I = train(np.transpose(U_I))
    # predict
    explanation_scores1 = predict(I_K, 100, modified_I_I)
    explanation1 =  predict_pilot_explanation(explanation_scores1, top_keyphrase = 230)
    return list(explanation1[item_i]).index(keyphrase)

    
def evaluate_pilot_test_item(rtrain,U_K, I_K,keyphrase_list, top_users = 200, item_i = 0):
    # Get the average rank for item_i with keyphrase  
    res1 = 0
    for i in range(75):
        a = rank_in_prediction_item(rtrain, U_K, I_K, top_users = top_users, keyphrase = i, item_i = item_i)
        print "keyphrase", keyphrase_list[i], "'s rank is ", a
        res1+= a
    return res1/75

In [78]:
rank_in_prediction_item(rtrain, U_K, I_K, top_users = 200, keyphrase = 10, item_i = 0)

100%|████████████████████████████████████████████████████████████████████████████| 7456/7456 [00:03<00:00, 2163.04it/s]
100%|██████████████████████████████████████████████████████████████████████████| 7456/7456 [00:00<00:00, 112969.72it/s]


32

In [None]:
evaluate_pilot_test_item(rtrain,U_K, I_K,keyphrases, top_users = 200, item_i = 0)

# Extract different words from single review and mutiple reviews

In [8]:
num_reviews = len(df_train)

In [10]:
df_train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,business_id,friend_count,ghost,img_dsc,img_url,nr,...,Month,Day,Binary,review,conca_review,keyVector,keyphrases_indices_length,UserIndex,ItemIndex,timestamp
0,0,0,6,6,Xo1LNzhnwE-ilqsM3ybs9Q,105,False,"['Photo of Happy Lemon - Markham, ON, Canada. ...",['https://s3-media4.fl.yelpcdn.com/bphoto/tu7j...,False,...,23,8,0,"['ordered', 'lemon', 'mango', 'slush', 'lemon'...",ordered lemon mango slush lemon taste strong ...,"[53, 92, 99, 112, 130, 212]",6,2464,5546,2016-08-23
1,1,1,7,7,Xo1LNzhnwE-ilqsM3ybs9Q,171,False,"['Photo of Happy Lemon - Markham, ON, Canada. ...",['https://s3-media3.fl.yelpcdn.com/bphoto/h110...,False,...,2,10,0,"['came', 'sunday', 'afternoon', 'nt', 'busy', ...",came sunday afternoon nt busy came sunday spe...,"[53, 99, 126, 128, 130, 148, 151, 171]",8,1021,5546,2016-10-02
2,2,2,8,8,Xo1LNzhnwE-ilqsM3ybs9Q,239,False,"['Photo of Happy Lemon - Markham, ON, Canada. ...",['https://s3-media4.fl.yelpcdn.com/bphoto/tS6Y...,False,...,6,11,1,"['grapefruit', 'yakult', 'green', 'tea', 'aloe...",grapefruit yakult green tea aloe jelly found ...,"[53, 92, 103, 129, 192]",5,529,5546,2016-11-06
3,3,3,9,9,Xo1LNzhnwE-ilqsM3ybs9Q,10,False,"['Photo of Happy Lemon - Markham, ON, Canada',...",['https://s3-media3.fl.yelpcdn.com/bphoto/rfB0...,False,...,25,9,0,"['saw', 'newly', 'opened', 'bubble', 'tea', 's...",saw newly opened bubble tea shop wanted give ...,"[49, 53, 99, 126, 128, 130, 136, 161, 206, 212]",10,1616,5546,2016-09-25
4,4,4,22,22,Xo1LNzhnwE-ilqsM3ybs9Q,80,False,"['Photo of Happy Lemon - Markham, ON, Canada',...",['https://s3-media1.fl.yelpcdn.com/bphoto/2jVn...,False,...,30,6,1,"['happy', 'lemon', 'become', 'new', 'favourite...",happy lemon become new favourite place sweet ...,"[53, 92, 99, 103, 126, 128, 148, 152, 165, 197]",10,590,5546,2018-06-30


In [18]:
literal_eval(df_train.keyVector[0])

[53, 92, 99, 112, 130, 212]

In [13]:
# high covariance 
literal_eval(df_train['review'][0])

['ordered',
 'lemon',
 'mango',
 'slush',
 'lemon',
 'taste',
 'strong',
 'love',
 'lemon',
 'love',
 'drink',
 'informed',
 'jelly',
 'drink',
 'added',
 'bonus',
 'since',
 'love',
 'drink',
 'topping',
 'jelly',
 'different',
 'compared',
 'bubble',
 'tea',
 'place',
 'soft',
 'chewy',
 'overall',
 'enjoyed',
 'drink']

In [9]:
num_reviews

102741