In [1]:
import sys
sys.path

['/Users/litos/opt/anaconda3/envs/tensorflow_cpu/lib/python36.zip',
 '/Users/litos/opt/anaconda3/envs/tensorflow_cpu/lib/python3.6',
 '/Users/litos/opt/anaconda3/envs/tensorflow_cpu/lib/python3.6/lib-dynload',
 '',
 '/Users/litos/opt/anaconda3/envs/tensorflow_cpu/lib/python3.6/site-packages',
 '/Users/litos/opt/anaconda3/envs/tensorflow_cpu/lib/python3.6/site-packages/IPython/extensions',
 '/Users/litos/.ipython']

In [2]:
from gurobipy import *

from scipy.sparse import csr_matrix, load_npz, save_npz
from tqdm import tqdm
from sklearn.preprocessing import normalize
from collections import *
import datetime
import json
import pandas as pd
import time
# import yaml
import scipy.sparse as sparse
from ast import literal_eval

import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import normalize
from sklearn.linear_model import Ridge
from sklearn.utils.extmath import randomized_svd

from scipy.optimize import minimize

# Utils

In [3]:
def save_dataframe_csv(df, path, name):
    df.to_csv(path+name, index=False)


def load_dataframe_csv(path, name, index_col=None):
    return pd.read_csv(path+name, index_col=index_col)


# Load Data

In [4]:
# Load Original Data
df_train = pd.read_csv('../../data/yelp/Train.csv',encoding='latin-1')
# df_valid = pd.read_csv('../../data/yelp/Valid.csv',encoding='latin-1')
# df_test = pd.read_csv('../../data/yelp/Test.csv',encoding='latin-1')

In [5]:
keyphrases = pd.read_csv('../../data/yelp/KeyPhrases.csv')['Phrases'].tolist()
keyphrase_popularity = np.loadtxt('../data/yelp/'+'keyphrase_popularity.txt', dtype=int)

# Load U-I Data 
rtrain = load_npz("../../data/yelp/Rtrain.npz")
rvalid = load_npz("../../data/yelp/Rvalid.npz")
rtest = load_npz("../../data/yelp/Rtest.npz")

# Load user/item keyphrase data
U_K = load_npz("../../data/yelp/U_K.npz")
I_K = load_npz("../../data/yelp/I_K.npz")

# Models

In [6]:
def get_I_K(df, row_name = 'ItemIndex', shape = (3668,75)):
    rows = []
    cols = []
    vals = []
    for i in tqdm(range(df.shape[0])):
        key_vector = literal_eval(df['keyVector'][i])
        rows.extend([df[row_name][i]]*len(key_vector)) ## Item index
        cols.extend(key_vector) ## Keyword Index
        vals.extend(np.array([1]*len(key_vector)))
    return csr_matrix((vals, (rows, cols)), shape=shape)


In [7]:
# PLREC 
def inhour(elapsed):
    return time.strftime('%H:%M:%S', time.gmtime(elapsed))

def plrec(matrix_train, iteration=4, lamb=80, rank=200, seed=1):
    """
    Function used to achieve generalized projected lrec w/o item-attribute embedding
    :param matrix_train: user-item matrix with shape m*n
    :param iteration: number of power iterations in randomized svd
    :param lamb: parameter of penalty
    :param rank: latent dimension size
    :param seed: the seed of the pseudo random number generator to use when shuffling the data
    :return: prediction in sparse matrix
    """
    print ("Randomized SVD")
    start_time = time.time()
    P, sigma, Qt = randomized_svd(matrix_train,
                                  n_components=rank,
                                  n_iter=iteration,
                                  random_state=seed)

    RQ = matrix_train.dot(sparse.csc_matrix(Qt.T*np.sqrt(sigma)))

    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    print ("Closed-Form Linear Optimization")
    start_time = time.time()
    pre_inv = RQ.T.dot(RQ) + lamb * sparse.identity(rank, dtype=np.float32)
    inverse = sparse.linalg.inv(pre_inv.tocsc())
    Y = inverse.dot(RQ.T).dot(matrix_train)
    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    return np.array(RQ.todense()), np.array(Y.todense()), None

# def predict_vector(rating_vector, train_vector, remove_train=True):
#     dim = len(rating_vector)
#     candidate_index = np.argpartition(-rating_vector, dim-1)[:dim]
#     prediction_items = candidate_index[rating_vector[candidate_index].argsort()[::-1]]
    
#     if remove_train:
#         return np.delete(prediction_items, np.isin(prediction_items, train_vector.nonzero()[1]).nonzero()[0])
#     else:
#         return prediction_items

    
def predict_scores(matrix_U, matrix_V, bias=None,
                   penalize = False,
                   keyphrase_freq = I_K, 
                   critiqued_keyphrase = 0, 
                   matrix_Train = rtrain,
                   alpha = 0):
    prediction = matrix_U.dot(matrix_V.T)
    # Penalize
    if penalize == True:
        items_with_keyphrase = np.ravel(keyphrase_freq.T[critiqued_keyphrase].nonzero()[1])
        items_without_keyphrase = np.setdiff1d(np.arange(matrix_Train.shape[1]), items_with_keyphrase)
        prediction[items_without_keyphrase] = alpha # penalize
    
    return prediction

def predict_vector(rating_vector, train_vector, remove_train=True):
    dim = len(rating_vector)
    candidate_index = np.argpartition(-rating_vector, dim-1)[:dim]
    prediction_items = candidate_index[rating_vector[candidate_index].argsort()[::-1]]
    
    if remove_train:
        return np.delete(prediction_items, np.isin(prediction_items, train_vector.nonzero()[1]).nonzero()[0])
    else:
        return prediction_items


In [8]:
# initial Prediction
def predict_scores(matrix_U, matrix_V, bias=None,
                   penalize = False,
                   keyphrase_freq = I_K, 
                   critiqued_keyphrase = 0, 
                   matrix_Train = rtrain,
                   alpha = 0):
    
    prediction = matrix_U.dot(matrix_V.T)
    # Penalize
    if penalize == True:
        items_with_keyphrase = np.ravel(keyphrase_freq.T[critiqued_keyphrase].nonzero()[1])
        items_without_keyphrase = np.setdiff1d(np.arange(matrix_Train.shape[1]), items_with_keyphrase)
        prediction[items_without_keyphrase] = alpha # penalize
    
    return prediction


In [9]:
# Keyphrase Selection Helpers
def get_valid_keyphrases(keyphrase_freq,top_recommendations,item = None,threshold=50,mutiple_keyphrases_en = False, top_items = None):
    """
    Wrapper function to get either top 1 or top n keyphrases
    """
    if mutiple_keyphrases_en:
        top_keyphrases = []
        for item in top_items:
            top_keyphrases.extend(get_valid_keyphrases_for_one_item(keyphrase_freq,top_recommendations,item,threshold=threshold))
        return np.ravel(list(set(top_keyphrases))) # remove duplicate and reformat to np array
    else:
        return get_valid_keyphrases_for_one_item(keyphrase_freq,top_recommendations,item,threshold=threshold)

def get_valid_keyphrases_for_one_item(keyphrase_freq,top_recommendations, item,threshold=50):
    """
    Get keyphrases of item that make sense
    E.g. if the item has fewer than threshold=50 keyphrases, get all of them
    otherwise get top 50 keyphrases
    """
    keyphrase_length = len(keyphrase_freq[item].nonzero()[1])
    if keyphrase_length<threshold:
        return keyphrase_freq[item].nonzero()[1]
    else:
        keyphrases = np.ravel(keyphrase_freq[top_recommendations[0]].todense())
        top_keyphrases = np.argsort(keyphrases)[::-1][:threshold]
        return top_keyphrases
    
# For keyphrase selecting method # 3 "diff" 
def get_item_keyphrase_freq(keyphrase_freq,item):
    """
    Get item's keyphrase frequency 
    """
    count = keyphrase_freq[item].todense()
    return np.ravel(count/(np.sum(count)+0.001))

def get_all_item_keyphrase_freq(item_keyphrase_freq = I_K):
    res = []
    num_items = item_keyphrase_freq.shape[0]
    for item in range(num_items):
        res.append(np.ravel(get_item_keyphrase_freq(item_keyphrase_freq,item)))
    return np.array(res)

def get_keyphrase_popularity(df,keyphrases):
    """
    Get keyphrase popularity (count) from dataframe
    """
    keyphrase_popularity = np.zeros(len(keyphrases)) #initialize
    for i in range(len(df)):
        keyphrase_vector = literal_eval(df['keyVector'][i])
        keyphrase_popularity[keyphrase_vector] += 1 # count
    return keyphrase_popularity

In [10]:
all_item_keyphrase_freq = get_all_item_keyphrase_freq()

In [11]:
# One hot encoding of critiquing
def get_critiqued_UK(user_keyphrase_frequency,user_index,critiqued_keyphrase):
    """
    user_keyphrase_frequency is the U_K matrix (csr sparse matrix)
    return the one-hot encoding of the critique
    """
    U_K_cp = user_keyphrase_frequency.copy()
    U_K_cp[user_index] = 0
    U_K_cp[user_index,critiqued_keyphrase] = 1
    return U_K_cp

def project_one_hot_encoding(reg, user_keyphrase_frequency,user_index = 0,critiqued_keyphrase = 0, normalize_en = True):
    """
    Return the projection on user_sim space from one-hot encoding of critiqued keyphrase
    The res[user_index] should be target embedding row
    """
    critiqued_matrix = get_critiqued_UK(user_keyphrase_frequency, user_index, critiqued_keyphrase)
    res = reg.predict(critiqued_matrix)
    if normalize_en:
        res = normalize((res))
    return res

In [12]:
# Upper bound method 
def get_all_affected_items(wanted_keyphrases,keyphrase_freq):
    res = []
    for keyphrase in wanted_keyphrases:
        items = np.ravel(keyphrase_freq.T[keyphrase].nonzero()[1])
        res.extend(items)
    return np.array(list(set(res)))
    
def select_only_wanted_keyphrase(top_recommendations, wanted_keyphrases, keyphrase_freq, matrix_Train = rtrain):
    all_items_with_keyphrases = get_all_affected_items(wanted_keyphrases,keyphrase_freq)
    affected_items = np.setdiff1d(np.arange(matrix_Train.shape[1]), all_items_with_keyphrases) # Get all other keyphrases
    top_recommendations[~np.in1d(top_recommendations, affected_items)]
    return top_recommendations

def pruning(prediction_score, 
           wanted_keyphrases_random, 
           top_recommendations, 
           keyphrase_freq, 
           matrix_Train = rtrain,
           alpha = 0):
    items_with_keyphrase = get_all_affected_items(wanted_keyphrases_random, keyphrase_freq)
    #Return the unique values in ar1 that are not in ar2.
    items_without_keyphrase = np.setdiff1d(np.arange(matrix_Train.shape[1]), items_with_keyphrase)
#     print (items_without_keyphrase)
    print (sum(prediction_score[items_without_keyphrase]))
    score = np.copy(prediction_score)
    score[items_without_keyphrase] = alpha # penalize
    return score

# Utils

In [13]:
# Utility function for getting restaurant info from ItemIndex
def get_business_df(path = "../../data/yelp/business.json" ):
    with open(path,encoding="utf8") as json_file:
        data = json_file.readlines()
        data = list(map(json.loads, data))
    df = pd.DataFrame(data)
    
    return df

def get_restaurant_info(business_df, business_id, name = True, review_count = True, stars = True ):
    output_list = {}
    row_idx = int(business_df.index[business_df['business_id'] == business_id].tolist()[0])
    if name == True:
        output_list['name'] = business_df['name'][row_idx].encode('utf-8').strip()
    if review_count == True:
        output_list['review_count'] = business_df['review_count'][row_idx]
    if stars == True:
        output_list['stars'] = business_df['stars'][row_idx] 
    return output_list

# def get_businessid_from_Itemindex(ItemIndex_list, itemindex):
#     return ItemIndex_list['business_id'].tolist()[itemindex]

def get_restaurant_name(df_train, business_df, ItemIndex):
    rows = np.where(df_train['ItemIndex'] == ItemIndex)
    if len(rows)!= 0:
        business_id = df_train.loc[rows[0][0]]['business_id']
        item_info = get_restaurant_info(business_df, business_id)
        return item_info['name']
    return "NOT_FOUND"

# Evaluation 

In [14]:
# Evluation 
def recallk(vector_true_dense, hits, **unused):
    hits = len(hits.nonzero()[0])
    return float(hits)/len(vector_true_dense)

def precisionk(vector_predict, hits, **unused):
    hits = len(hits.nonzero()[0])
    return float(hits)/len(vector_predict)


def average_precisionk(vector_predict, hits, **unused):
    precisions = np.cumsum(hits, dtype=np.float32)/range(1, len(vector_predict)+1)
    return np.mean(precisions)


def r_precision(vector_true_dense, vector_predict, **unused):
    vector_predict_short = vector_predict[:len(vector_true_dense)]
    hits = len(np.isin(vector_predict_short, vector_true_dense).nonzero()[0])
    return float(hits)/len(vector_true_dense)


def _dcg_support(size):
    arr = np.arange(1, size+1)+1
    return 1./np.log2(arr)


def ndcg(vector_true_dense, vector_predict, hits):
    idcg = np.sum(_dcg_support(len(vector_true_dense)))
    dcg_base = _dcg_support(len(vector_predict))
    dcg_base[np.logical_not(hits)] = 0
    dcg = np.sum(dcg_base)
    return dcg/idcg


def click(hits, **unused):
    first_hit = next((i for i, x in enumerate(hits) if x), None)
    if first_hit is None:
        return 5
    else:
        return first_hit/10


def evaluate(matrix_Predict, matrix_Test, metric_names =['R-Precision', 'NDCG', 'Precision', 'Recall', 'MAP'], atK = [5, 10, 15, 20, 50], analytical=False):
    """
    :param matrix_U: Latent representations of users, for LRecs it is RQ, for ALSs it is U
    :param matrix_V: Latent representations of items, for LRecs it is Q, for ALSs it is V
    :param matrix_Train: Rating matrix for training, features.
    :param matrix_Test: Rating matrix for evaluation, true labels.
    :param k: Top K retrieval
    :param metric_names: Evaluation metrics
    :return:
    """
    global_metrics = {
        "R-Precision": r_precision,
        "NDCG": ndcg,
        "Clicks": click
    }

    local_metrics = {
        "Precision": precisionk,
        "Recall": recallk,
        "MAP": average_precisionk
    }

    output = dict()

    num_users = matrix_Predict.shape[0]

    for k in atK:

        local_metric_names = list(set(metric_names).intersection(local_metrics.keys()))
        results = {name: [] for name in local_metric_names}
        topK_Predict = matrix_Predict[:, :k]

        for user_index in tqdm(range(topK_Predict.shape[0])):
            vector_predict = topK_Predict[user_index]
            if len(vector_predict.nonzero()[0]) > 0:
                vector_true = matrix_Test[user_index]
                vector_true_dense = vector_true.nonzero()[1]
                hits = np.isin(vector_predict, vector_true_dense)

                if vector_true_dense.size > 0:
                    for name in local_metric_names:
                        results[name].append(local_metrics[name](vector_true_dense=vector_true_dense,
                                                                 vector_predict=vector_predict,
                                                                 hits=hits))

        results_summary = dict()
        if analytical:
            for name in local_metric_names:
                results_summary['{0}@{1}'.format(name, k)] = results[name]
        else:
            for name in local_metric_names:
                results_summary['{0}@{1}'.format(name, k)] = (np.average(results[name]),
                                                              1.96*np.std(results[name])/np.sqrt(num_users))
        output.update(results_summary)

    global_metric_names = list(set(metric_names).intersection(global_metrics.keys()))
    results = {name: [] for name in global_metric_names}

    topK_Predict = matrix_Predict[:]

    for user_index in tqdm(range(topK_Predict.shape[0])):
        vector_predict = topK_Predict[user_index]

        if len(vector_predict.nonzero()[0]) > 0:
            vector_true = matrix_Test[user_index]
            vector_true_dense = vector_true.nonzero()[1]
            hits = np.isin(vector_predict, vector_true_dense)

            # if user_index == 1:
            #     import ipdb;
            #     ipdb.set_trace()

            if vector_true_dense.size > 0:
                for name in global_metric_names:
                    results[name].append(global_metrics[name](vector_true_dense=vector_true_dense,
                                                              vector_predict=vector_predict,
                                                              hits=hits))

    results_summary = dict()
    if analytical:
        for name in global_metric_names:
            results_summary[name] = results[name]
    else:
        for name in global_metric_names:
            results_summary[name] = (np.average(results[name]), 1.96*np.std(results[name])/np.sqrt(num_users))
    output.update(results_summary)

    return output



# Critiquing Pipline

In [15]:
business_df = get_business_df()

In [16]:
keyphrase_popularity = np.loadtxt('../data/yelp/'+'keyphrase_popularity.txt', dtype=int)

In [17]:
Y, RQt, Bias = plrec(rtrain,
                    iteration = 10,
                    lamb = 200,
                    rank = 200)
RQ = RQt.T
reg = LinearRegression().fit(normalize(U_K), Y)

Randomized SVD
Elapsed: 00:00:00
Closed-Form Linear Optimization
Elapsed: 00:00:00


In [18]:
# Set up dataframe 

# post_ranki is post rank with different lambda ratio for combining pre-post User similarity matrix 

columns = ['user_id', 'target_item', 'item_name', 'iter', 'pre_rank', 
           'top_prediction_item_name',
           'post_rank_random_all',
           'post_rank_random_upper',
           'random_scores',
           'post_rank_pop_all',
           'post_rank_pop_upper',
           'pop_scores',
           'post_rank_diff_all',
           'post_rank_diff_upper',
           'diff_scores',
           'critiqued_keyphrase_random',
           'keyphrase_name_random',
           'critiqued_keyphrase_pop',
           'keyphrase_name_pop',
           'critiqued_keyphrase_diff',
           'keyphrase_name_diff',
           'num_existing_keyphrases',
           'pure_pruning_rank'] 
df = pd.DataFrame(columns=columns)
row = {}

## LP Objectives

In [19]:
def Average(initial_prediction_u, keyphrase_freq, affected_items, unaffected_items, num_keyphrases, query, test_user, item_latent, reg, all_equal = True):
    critiqued_vector = np.zeros(keyphrase_freq.shape[1])
    
    for q in query:
        critiqued_vector[q] = 1
#         critiqued_vector[q] = keyphrase_freq[test_user,q]
        
    num_critiques = len(query)
    
    # Get item latent for updating prediction
    W2 = reg.coef_
    W = item_latent.dot(W2)
    
    optimal_lambda = 1 # weight all critiquing equally
    lambdas = [optimal_lambda]*num_critiques
    
    # Record lambda values 
    for k in range(num_critiques):
        critiqued_vector[query[k]] *= optimal_lambda

    critique_score = predict_scores(matrix_U=reg.predict(critiqued_vector.reshape(1, -1)),
                                    matrix_V=item_latent)

    if all_equal:
        # weight initial and each critiquing equally 
        new_prediction = initial_prediction_u/(num_critiques) + critique_score.flatten()
    else:
        # weight intial and combined critiquing equally
        new_prediction = initial_prediction_u + critique_score.flatten() 
#     print (len(new_prediction))
    return new_prediction, lambdas   


In [20]:
def LP1SimplifiedOptimize(initial_prediction_u, keyphrase_freq, affected_items, unaffected_items, num_keyphrases, query, test_user, item_latent, reg):

    critiqued_vector = np.zeros(keyphrase_freq.shape[1])

    for q in query:
#         critiqued_vector[q] = 1 # set critiqued/boosted keyphrase to 1
        critiqued_vector[q] = max(2*keyphrase_freq[test_user , q],1)

    num_critiques = len(query)

    W2 = reg.coef_
    W = item_latent.dot(W2)

    num_affected_items = len(affected_items)
    num_unaffected_items = len(unaffected_items)

    start_time = time.time()

    # Model
    m = Model("LP1Simplified") # Create gurobi model with name

    # Assignment variables
    lambs = []

    for k in range(num_critiques):
        lambs.append(m.addVar(lb=-1,
                              ub=1,
                              vtype=GRB.CONTINUOUS,
                              name="lamb%d" % query[k]))

    m.setObjective(quicksum(initial_prediction_u[affected_item] * num_unaffected_items + quicksum(lambs[k] * critiqued_vector[query[k]] * W[affected_item][query[k]] * num_unaffected_items for k in range(num_critiques)) for affected_item in affected_items) - quicksum(initial_prediction_u[unaffected_item] * num_affected_items + quicksum(lambs[k] * critiqued_vector[query[k]] * W[unaffected_item][query[k]] * num_affected_items for k in range(num_critiques)) for unaffected_item in unaffected_items), GRB.MINIMIZE)

    # Optimize
    m.optimize()

    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    lambdas = []
    for k in range(num_critiques):
        optimal_lambda = m.getVars()[k].X
        lambdas.append(optimal_lambda)
        critiqued_vector[query[k]] *= optimal_lambda

#     modified_user_laten = reg.predict(critiqued_vector.reshape(1, -1)) + Y
#     new_prediction = predict_scores(matrix_U=modified_user_laten,
#                                     matrix_V=item_latent)
    critique_score = predict_scores(matrix_U=reg.predict(critiqued_vector.reshape(1, -1)),
                                    matrix_V=item_latent)
    new_prediction = initial_prediction_u + critique_score.flatten()

    return new_prediction, lambdas
    
    

In [21]:
def LP1SumToOneOptimize(initial_prediction_u, keyphrase_freq, affected_items, unaffected_items, num_keyphrases, query, test_user, item_latent, reg):

    critiqued_vector = np.zeros(keyphrase_freq.shape[1])

    for q in query:
        critiqued_vector[q] = 1 # set critiqued/boosted keyphrase to 1
#         critiqued_vector[q] = -keyphrase_freq[test_user][q]

    num_critiques = len(query)

    W2 = reg.coef_
    W = item_latent.dot(W2)

    num_affected_items = len(affected_items)
    num_unaffected_items = len(unaffected_items)

    start_time = time.time()

    # Model
    m = Model("LP1SumToOneOptimize")

    # Assignment variables
    lambs = []
    for k in range(1+num_critiques):
        lambs.append(m.addVar(lb=0,
                              ub=1,
                              vtype=GRB.CONTINUOUS,
                              name="lamb%d" % k))

    m.addConstr((sum(lambs[k] for k in range(1+num_critiques)) == 1), name="sum_to_one")

    m.setObjective(quicksum(lambs[0] * initial_prediction_u[affected_item] * num_unaffected_items + quicksum(lambs[k+1] * critiqued_vector[query[k]] * W[affected_item][query[k]] * num_unaffected_items for k in range(num_critiques)) for affected_item in affected_items) - quicksum(lambs[0] * initial_prediction_u[unaffected_item] * num_affected_items + quicksum(lambs[k+1] * critiqued_vector[query[k]] * W[unaffected_item][query[k]] * num_affected_items for k in range(num_critiques)) for unaffected_item in unaffected_items), GRB.MINIMIZE)

    # Optimize
    m.optimize()

    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    lambdas = []
    for k in range(1+num_critiques):
        optimal_lambda = m.getVars()[k].X
        lambdas.append(optimal_lambda)

    for k in range(num_critiques):
        critiqued_vector[query[k]] *= lambdas[k+1]
    
    modified_user_laten = reg.predict(critiqued_vector.reshape(1, -1)) + self.Y
    new_prediction = predict_scores(matrix_U=modified_user_laten,
                                    matrix_V=item_latent)
#     critique_score = predict_scores(matrix_U=reg.predict(critiqued_vector.reshape(1, -1)),
#                                     matrix_V=item_latent)
#     new_prediction = lambdas[0]*initial_prediction_u + critique_score.flatten()

    return new_prediction, lambdas


## RankSVM Objective

In [61]:
U_K.shape[1]

235

In [319]:
#### !!!!!!!!!!!!
#### Below is incorrect version
#### Totally incorrect

def rankSVM(initial_prediction_u, keyphrase_freq, affected_items, unaffected_items, num_keyphrases, query, test_user, item_latent, reg, item_keyphrase_freq = I_K):
    critiqued_vector = np.zeros(keyphrase_freq.shape[1])

    for q in query:
#         critiqued_vector[q] = 1 # set critiqued/boosted keyphrase to 1
        critiqued_vector[q] = max(keyphrase_freq[test_user , q],1)
        print ('critiqued_vector setting: ',critiqued_vector[q])

    num_critiques = len(query)

    W2 = reg.coef_
    W = item_latent.dot(W2)

    num_affected_items = len(affected_items)
    num_unaffected_items = len(unaffected_items)

    start_time = time.time()

    # Model
    m = Model("LP2RankSVM")
    
    # Assignment variables
    lambs = []
    us = []
    xis = []
    # weight w
    for k in range(num_critiques):
        lambs.append(m.addVar(lb=-1,
                              ub=1,
                              vtype=GRB.CONTINUOUS,
                              name="lamb%d" % k))
    # dummy variable u for absolute lamb
    for k in range(num_critiques):
        us.append(m.addVar(vtype=GRB.CONTINUOUS,
                          name="u%d" % k))
    
    # slack variables xi
    for k in range(num_critiques):
        for i in range(num_affected_items):
            for j in range(num_unaffected_items):
                xis.append(m.addVar(lb = 0, 
                                    vtype = GRB.CONTINUOUS,
                                    name = "xi_%d_%d_%d" % (i,j,k) ))
      
    ## constraints
    # constraints for dummy variable
    for k in range(num_critiques):
        m.addConstr(us[k] >= lambs[k])
        m.addConstr(us[k] >= -lambs[k])
    
    # constraints for rankSVM 
    for k in range(num_critiques):
        for i in range(num_affected_items):
            for j in range(num_unaffected_items):
                m.addConstr(lambs[k]*item_keyphrase_freq[affected_items[i] , query[k]] >= lambs[k]*item_keyphrase_freq[unaffected_items[i] , query[k]] + 1 - xis[k*(num_affected_items*num_unaffected_items) + i*num_unaffected_items + j] , name = "constraints%d_%d_%d" % (k,i,j))
#                 m.addConstr(lambs[k]*item_keyphrase_freq[affected_items[i]][query[k]] >= lambs[k]*item_keyphrase_freq[unaffected_items[i]][query[k]] + 1 - xis[k*(num_affected_items*num_unaffected_items) + i*num_unaffected_items + j] , name = "constraints%d_%d_%d" % (k,i,j))
    
    C = 1 #regularization parameter (trading-off margin size against training error
    m.setObjective(quicksum(us) + C * quicksum(xis) ,GRB.MINIMIZE)
                
    # Optimize
    m.optimize()

    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    lambdas = []
    for k in range(num_critiques):
        optimal_lambda = m.getVarByName("lamb%d" % k).X
        lambdas.append(optimal_lambda)
        
    for k in range(num_critiques):
        critiqued_vector[query[k]] *= lambdas[k]
        
    critique_score = predict_scores(matrix_U=reg.predict(critiqued_vector.reshape(1, -1)),
                                    matrix_V=item_latent)
    new_prediction = initial_prediction_u + critique_score.flatten() # now adding initial to combined critiquing 
    return new_prediction, lambdas

    

In [230]:
#### See https://www.overleaf.com/read/wwftdhpcmxnx
#### For the RankSVM math

def rankSVM(initial_prediction_u, keyphrase_freq, affected_items, unaffected_items, num_keyphrases, 
            query, test_user, item_latent, reg, user_latent_embedding, item_keyphrase_freq = I_K):
    critiques = query # fix this variable name later
    
#     critiqued_vector = np.zeros(keyphrase_freq.shape[1])
    
#     for c in critiques:
# #         critiqued_vector[q] = 1 # set critiqued/boosted keyphrase to 1
#         critiqued_vector[c] = max(keyphrase_freq[test_user , c],1)
# #         print ('critiqued_vector setting: ',critiqued_vector[q])
#     print ('affected items, ',affected_items)
#     print ('unaffected items, ',unaffected_items)
    
    num_critiques = len(critiques)

    W2 = reg.coef_
    W = item_latent.dot(W2)

    num_affected_items = len(affected_items)
    num_unaffected_items = len(unaffected_items)

    start_time = time.time()

    # Model
    m = Model("LP2RankSVM")
    m.setParam('OutputFlag', 0)
    
    # Assignment variables
    thetas = []
    us = []
    xis = []
    # weight thetas
    for k in range(num_critiques + 1):
        thetas.append(m.addVar(lb=-1,
                              ub=1,
                              vtype=GRB.CONTINUOUS,
                              name="theta%d" % k))
    thetas = np.array(thetas)
    # dummy variable u for absolute theta
    for k in range(num_critiques + 1):
        us.append(m.addVar(vtype=GRB.CONTINUOUS,
                          name="u%d" % k))
        
    # slack variables xi
    for i in range(num_affected_items):
        for j in range(num_unaffected_items):
            xis.append(m.addVar(lb = 0, 
                                vtype = GRB.CONTINUOUS,
                                name = "xi_%d_%d" % (i,j) ))
      
    ## constraints
    # constraints for dummy variable u's
    for k in range(num_critiques+1):
        m.addConstr(us[k] >= thetas[k])
        m.addConstr(us[k] >= -thetas[k])
        
    
    ## Pre-calculate critique embedding
    u_i = Y[test_user]
    phi_js = []
    phi_jprimes = []
    k_cis = []
    
#     user_latent_embedding = [u_i]
    
#     # concat all latent embeddings
#     for k in range(num_critiques):
#         # Get critique vector 
#         c = critiques[k] 
#         critiqued_vector = np.zeros(keyphrase_freq.shape[1])
#         critiqued_vector[c] = max(keyphrase_freq[test_user , c],1)
        
#         # map user critique to user latent embedding
#         k_ci = reg.predict(critiqued_vector.reshape(1, -1)).flatten()
        
#         # concat u_i and k_cis
#         user_latent_embedding.append(k_ci)
        
#     user_latent_embedding = np.array(user_latent_embedding) # convert to numpy array for dot product
    
#     print ('user_latent_embedding size: ', user_latent_embedding.shape )
#     print ('v_j size: ', RQ[j].shape)
#     print ('-----')

    user_latent_embedding = np.array(user_latent_embedding)
#     print ('user latent embedding shape: ', user_latent_embedding.shape)
    # constraints for rankSVM 
    for j in range(num_affected_items):
        for j_ in range(num_unaffected_items):
            m.addConstr( thetas.dot(user_latent_embedding.dot(RQ[affected_items[j]])) >= thetas.dot(user_latent_embedding.dot(RQ[unaffected_items[j_]])) + 1 - xis[j*num_affected_items + j_], name = "constraints%d_%d" % (j,j_))
#             print ('item j embedding :',user_latent_embedding.dot(RQ[j]) )
#             print ('item j_ embedding:',user_latent_embedding.dot(RQ[j_]) )
    lamb = 5 #regularization parameter (trading-off margin size against training error
    m.setObjective(quicksum(us) + lamb * quicksum(xis), GRB.MINIMIZE)
                
    # Optimize
    m.optimize()

#     print("Elapsed: {}".format(inhour(time.time() - start_time)))

    thetas = []
    for k in range(num_critiques+1):
        optimal_theta = m.getVarByName("theta%d" % k).X
        thetas.append(optimal_theta)
        
#     print ('optimal thetas: ',thetas)


    critiqued_vector = np.zeros(keyphrase_freq.shape[1])
    
    # Combine weights to critiqued vector
    for c in critiques:
#         critiqued_vector[q] = 1 # set critiqued/boosted keyphrase to 1
        critiqued_vector[c] = max(keyphrase_freq[test_user , c],1)
    for k in range(num_critiques):
        critiqued_vector[critiques[k]] *= thetas[k+1]
    
    # Get rating score
    critique_score = predict_scores(matrix_U=reg.predict(critiqued_vector.reshape(1, -1)),
                                    matrix_V=item_latent)
    new_prediction = thetas[0]*initial_prediction_u + critique_score.flatten()
    
    return new_prediction, thetas

In [229]:
#### See https://www.overleaf.com/read/wwftdhpcmxnx
#### For the RankSVM math

def rankSVM2(initial_prediction_u, keyphrase_freq, affected_items, unaffected_items, num_keyphrases, 
            query, test_user, item_latent, reg, user_latent_embedding, item_keyphrase_freq = I_K):
    critiques = query # fix this variable name later
    
#     critiqued_vector = np.zeros(keyphrase_freq.shape[1])
    
#     for c in critiques:
# #         critiqued_vector[q] = 1 # set critiqued/boosted keyphrase to 1
#         critiqued_vector[c] = max(keyphrase_freq[test_user , c],1)
# #         print ('critiqued_vector setting: ',critiqued_vector[q])
#     print ('affected items, ',affected_items)
    
    num_critiques = len(critiques)

    W2 = reg.coef_
    W = item_latent.dot(W2)

    num_affected_items = len(affected_items)
    num_unaffected_items = len(unaffected_items)

    start_time = time.time()

    # Model
    m = Model("LP2RankSVM2")
    m.setParam('OutputFlag', 0) # set to 1 for outputing details
    
    # Assignment variables
    thetas = []
    us = []
    xi_pos = []
    xi_neg = []
    # weight thetas
    for k in range(num_critiques + 1):
        thetas.append(m.addVar(lb=-1,
                              ub=1,
                              vtype=GRB.CONTINUOUS,
                              name="theta%d" % k))
    thetas = np.array(thetas)
    
    # dummy variable u for absolute theta
    for k in range(num_critiques + 1):
        us.append(m.addVar(vtype=GRB.CONTINUOUS,
                          name="u%d" % k))
        
    # slack variables xi
    for i in range(num_affected_items):
        xi_pos.append(m.addVar(lb = 0, 
                                vtype = GRB.CONTINUOUS,
                                name = "xi_pos%d" % i ))
    for i in range(num_unaffected_items):
        xi_neg.append(m.addVar(lb = 0, 
                                vtype = GRB.CONTINUOUS,
                                name = "xi_neg%d" % i ))
        
    ## constraints
    # constraints for dummy variable u's
    for k in range(num_critiques+1):
        m.addConstr(us[k] >= thetas[k])
        m.addConstr(us[k] >= -thetas[k])
        
    ## Pre-calculate critique embedding
#     u_i = Y[test_user]
#     phi_js = []
#     phi_jprimes = []
#     k_cis = []
    
#     user_latent_embedding = [u_i]
    
#     # concat all latent embeddings
#     for k in range(num_critiques):
#         # Get critique vector 
#         c = critiques[k] 
#         critiqued_vector = np.zeros(keyphrase_freq.shape[1])
#         critiqued_vector[c] = max(keyphrase_freq[test_user , c],1)
        
#         # map user critique to user latent embedding
#         k_ci = reg.predict(critiqued_vector.reshape(1, -1)).flatten()
        
#         # concat u_i and k_cis
#         user_latent_embedding.append(k_ci)
        
#     user_latent_embedding = np.array(user_latent_embedding) # convert to numpy array for dot product
    
#     print ('user_latent_embedding size: ', user_latent_embedding.shape )
#     print ('v_j size: ', RQ[j].shape)
#     print ('-----')

    user_latent_embedding = np.array(user_latent_embedding)
#     print ('user latent embedding shape: ', user_latent_embedding.shape)
    
    ## constraints for rankSVM 
    
    # Affected items rank higher
    for j in range(num_affected_items):
        m.addConstr( thetas.dot(user_latent_embedding.dot(RQ[affected_items[j]])) >= initial_prediction_u[affected_items[j]] + 1 - xi_pos[j], name = "pos_constraint%d" % j )
    
    # Unaffected items rank lower
    for j in range(num_unaffected_items):
        m.addConstr( initial_prediction_u[unaffected_items[j]] - thetas.dot(user_latent_embedding.dot(RQ[unaffected_items[j]])) >=  1 - xi_neg[j], name = "neg_constraint%d" % j )
    
            
    # objective
    lamb = 5 #regularization parameter (trading-off margin size against training error
    m.setObjective(quicksum(us) + lamb * (quicksum(xi_pos)+quicksum(xi_neg)), GRB.MINIMIZE)
                
    # Optimize
    m.optimize()

#     print("Elapsed: {}".format(inhour(time.time() - start_time)))

    thetas = []
    for k in range(num_critiques+1):
        optimal_theta = m.getVarByName("theta%d" % k).X
        thetas.append(optimal_theta)
        
#     print ('optimal thetas: ',thetas)
    critiqued_vector = np.zeros(keyphrase_freq.shape[1])
    
    # Combine weights to critiqued vector
    for c in critiques:
#         critiqued_vector[c] = 1 # set critiqued/boosted keyphrase to 1
        critiqued_vector[c] = max(keyphrase_freq[test_user , c],1)
    for k in range(num_critiques):
        critiqued_vector[critiques[k]] *= thetas[k+1]
    
    # Get rating score
    critique_score = predict_scores(matrix_U=reg.predict(critiqued_vector.reshape(1, -1)),
                                    matrix_V=item_latent)
    new_prediction = thetas[0]*initial_prediction_u + critique_score.flatten()
    
    return new_prediction, thetas

# Main

In [249]:
class LP1Simplified(object):
    def __init__(self, keyphrase_freq, item_keyphrase_freq, row, matrix_Train, matrix_Test, test_users,
                 target_ranks, num_items_sampled, num_keyphrases, df,
                 max_iteration_threshold, keyphrase_popularity, dataset_name,
                 model, parameters_row, keyphrases_names, keyphrase_selection_method, max_wanted_keyphrase, **unused):
        self.keyphrase_freq = keyphrase_freq
        self.item_keyphrase_freq = item_keyphrase_freq
        self.row = row
        self.matrix_Train = matrix_Train
        self.num_users, self.num_items = matrix_Train.shape
        self.matrix_Test = matrix_Test
        self.test_users = test_users
        self.target_ranks = target_ranks
        self.num_items_sampled = num_items_sampled
        self.num_keyphrases = num_keyphrases
        self.df = df
        self.max_iteration_threshold = max_iteration_threshold
        self.keyphrase_popularity = keyphrase_popularity
        self.dataset_name = dataset_name
        self.model = model
        self.parameters_row = parameters_row
        self.keyphrase_selection_method = keyphrase_selection_method
        self.max_wanted_keyphrase = max_wanted_keyphrase
        
        
        self.keyphrases_names = keyphrases_names

    def start_critiquing(self):
#         self.get_initial_predictions() # No need to do it every time
        self.RQ = RQ
        Yt = Y.T 
        self.Y = Y

        self.reg = reg

        self.prediction_scores = predict_scores(matrix_U=self.RQ,
                                                matrix_V=self.Y,
                                                bias=Bias).T
        
        for user in tqdm(self.test_users):
            start_time = time.time()
            # User id starts from 0
            self.row['user_id'] = user
            
            initial_prediction_items = predict_vector(rating_vector=self.prediction_scores[user],
                                                            train_vector=self.matrix_Train[user],
                                                            remove_train=True)
            # For keyphrase selection method 'diff' 
            top_recommended_keyphrase_freq = get_item_keyphrase_freq(self.item_keyphrase_freq,item = initial_prediction_items[0])
            
            # The iteration will stop if the wanted item is in top n
            for target_rank in self.target_ranks:
                self.row['target_rank'] = target_rank
                
                # Pick wanted items in test items
                candidate_items = self.matrix_Test[user].nonzero()[1]
                train_items = self.matrix_Train[user].nonzero()[1]
                wanted_items = np.setdiff1d(candidate_items, train_items)
                
#                 # Pick only num_items_sampled items
#                 try:
#                     wanted_items = random.sample(wanted_items,num_items_sampled)
#                 except:
#                     pass 
                for item in wanted_items:
                    # Item id starts from 0
                    self.row['item_id'] = item
                    try:
                        self.row['item_name'] = get_restaurant_name(df_train, business_df,item)
                    except:
                        self.row['item_name'] = 'NOT_FOUND'
                    # Set the wanted item's initial rank as None
                    self.row['item_rank'] = None
                    # Set the wanted item's initial prediction score as None
                    self.row['item_score'] = None
                    
                    if self.keyphrase_selection_method == "random" or self.keyphrase_selection_method == "pop":
                        # Get the item's existing keyphrases (we can boost)
                        remaining_keyphrases = self.item_keyphrase_freq[item].nonzero()[1]
                    if self.keyphrase_selection_method == "diff":
                        # For keyphrase selection method 'diff' 
                        target_keyphrase_freq = get_item_keyphrase_freq(self.item_keyphrase_freq,item = item)
                        diff_keyphrase_freq = target_keyphrase_freq - top_recommended_keyphrase_freq
                        remaining_keyphrases = np.argsort(np.ravel(diff_keyphrase_freq))[::-1][:self.max_wanted_keyphrase]
                        
#                    print("The number of remaining_keyphrases is {}. remaining_keyphrases are: {}".format(len(remaining_keyphrases), remaining_keyphrases))
                    self.row['num_existing_keyphrases'] = len(remaining_keyphrases)
                    if len(remaining_keyphrases) == 0:
                        break
                    self.row['iteration'] = 0
                    self.row['critiqued_keyphrase'] = None
                    self.row['result'] = None
                    self.df = self.df.append(self.row, ignore_index=True)

                    query = []
                    affected_items = np.array([])
                    
                    # Set up latent embedding
                    user_latent_embedding = [Y[user]]
                    
                    for iteration in range(self.max_iteration_threshold):
                        self.row['iteration'] = iteration + 1
                                                
                        if self.keyphrase_selection_method == "pop":
                            # Always critique the most popular keyphrase
                            critiqued_keyphrase = remaining_keyphrases[np.argmax(self.keyphrase_popularity[remaining_keyphrases])]
    #                        print("remaining keyphrases popularity: {}".format(self.keyphrase_popularity[remaining_keyphrases]))
                        elif self.keyphrase_selection_method == "random":
                            critiqued_keyphrase = np.random.choice(remaining_keyphrases, size=1, replace=False)[0]
            
                        elif self.keyphrase_selection_method == "diff":
                            critiqued_keyphrase = remaining_keyphrases[0]
#                             print ('critiqued_keyphrase', critiqued_keyphrase)
                        
                        self.row['critiqued_keyphrase'] = critiqued_keyphrase
                        self.row['critiqued_keyphrase_name'] = keyphrases_names[critiqued_keyphrase]
                        query.append(critiqued_keyphrase)

                        # Get affected items (items have critiqued keyphrase)
                        current_affected_items = self.item_keyphrase_freq[:, critiqued_keyphrase].nonzero()[0]
                        affected_items = np.unique(np.concatenate((affected_items, current_affected_items))).astype(int)
                        unaffected_items = np.setdiff1d(range(self.num_items), affected_items)

                        if iteration == 0:
                            prediction_items = initial_prediction_items #calculated once for each user

                        affected_items_mask = np.in1d(prediction_items, affected_items)
                        affected_items_index_rank = np.where(affected_items_mask == True)
                        unaffected_items_index_rank = np.where(affected_items_mask == False)

                        import copy
                        
                        
                        ## concat critique embeddings to user latent embedding
                        # Get critique vector 
                        critiqued_vector = np.zeros(self.keyphrase_freq.shape[1])
                        critiqued_vector[c] = max(self.keyphrase_freq[user , critiqued_keyphrase],1)
                        # map user critique to user latent embedding
                        k_ci = reg.predict(critiqued_vector.reshape(1, -1)).flatten()
                        user_latent_embedding.append(k_ci)
#                         print ('user latent embedding shape: ', np.array(user_latent_embedding).shape)

                        
#                         prediction_scores_u, lambdas = LP1SimplifiedOptimize(initial_prediction_u=self.prediction_scores[user],
#                                                                              keyphrase_freq=copy.deepcopy(self.keyphrase_freq),
#                                                                              affected_items=np.intersect1d(affected_items, prediction_items[affected_items_index_rank[0][:20]]),
#                                                                              unaffected_items=np.intersect1d(unaffected_items, prediction_items[unaffected_items_index_rank[0][:20]]),
#                                                                              num_keyphrases=self.num_keyphrases,
#                                                                              query=query,
#                                                                              test_user=user,
#                                                                              item_latent=self.RQ,
#                                                                              reg=self.reg)
#                         prediction_scores_u, lambdas = LP1SumToOneOptimize(initial_prediction_u=self.prediction_scores[user],
#                                                                            keyphrase_freq=copy.deepcopy(self.keyphrase_freq),
#                                                                            affected_items=np.intersect1d(affected_items, prediction_items[affected_items_index_rank[0][:20]]),
#                                                                            unaffected_items=np.intersect1d(unaffected_items, prediction_items[unaffected_items_index_rank[0][:20]]),
#                                                                            num_keyphrases=self.num_keyphrases,
#                                                                            query=query,
#                                                                            test_user=user,
#                                                                            item_latent=self.RQ,
#                                                                            reg=self.reg)
                        prediction_scores_u, lambdas = Average(initial_prediction_u=self.prediction_scores[user],
                                                                             keyphrase_freq=copy.deepcopy(self.keyphrase_freq),
                                                                             affected_items=np.intersect1d(affected_items, prediction_items[affected_items_index_rank[0][:20]]),
                                                                             unaffected_items=np.intersect1d(unaffected_items, prediction_items[unaffected_items_index_rank[0][:20]]),
                                                                             num_keyphrases=self.num_keyphrases,
                                                                             query=query,
                                                                             test_user=user,
                                                                             item_latent=self.RQ,
                                                                             reg=self.reg)
#                         prediction_scores_u, lambdas = rankSVM(initial_prediction_u=self.prediction_scores[user],
#                                                                              keyphrase_freq=copy.deepcopy(self.keyphrase_freq),
#                                                                              affected_items=np.intersect1d(affected_items, prediction_items[affected_items_index_rank[0][:20]]),
#                                                                              unaffected_items=np.intersect1d(unaffected_items, prediction_items[unaffected_items_index_rank[0][:20]]),
#                                                                              num_keyphrases=self.num_keyphrases,
#                                                                              query=query,
#                                                                              test_user=user,
#                                                                              item_latent=self.RQ,
#                                                                              reg=self.reg,
#                                                                              user_latent_embedding = user_latent_embedding,
#                                                                              item_keyphrase_freq = all_item_keyphrase_freq
#                                                                              )
#                         prediction_scores_u, lambdas = rankSVM2(initial_prediction_u=self.prediction_scores[user],
#                                                                              keyphrase_freq=copy.deepcopy(self.keyphrase_freq),
#                                                                              affected_items=np.intersect1d(affected_items, prediction_items[affected_items_index_rank[0][:20]]),
#                                                                              unaffected_items=np.intersect1d(unaffected_items, prediction_items[unaffected_items_index_rank[0][:20]]),
#                                                                              num_keyphrases=self.num_keyphrases,
#                                                                              query=query,
#                                                                              test_user=user,
#                                                                              item_latent=self.RQ,
#                                                                              reg=self.reg,
#                                                                              user_latent_embedding = user_latent_embedding,
#                                                                              item_keyphrase_freq = all_item_keyphrase_freq
#                                                                              )    
                    
#                         item_keyphrase_freq = get_all_item_keyphrase_freq()
                        
                        self.row['lambda'] = lambdas
                        prediction_items = predict_vector(rating_vector=prediction_scores_u,
                                                          train_vector=self.matrix_Train[user],
                                                          remove_train=False)
                        recommended_items = prediction_items
                        
                        # Current item rank
                        item_rank = np.where(recommended_items == item)[0][0]

                        self.row['item_rank'] = item_rank
                        self.row['item_score'] = prediction_scores_u[item]

                        if item_rank + 1 <= target_rank:
                            # Items is ranked within target rank
                            self.row['result'] = 'successful'
                            self.df = self.df.append(self.row, ignore_index=True)
                            break
                        else:
                            remaining_keyphrases = np.setdiff1d(remaining_keyphrases, critiqued_keyphrase)
                            # Continue if more keyphrases and iterations remained
                            if len(remaining_keyphrases) > 0 and self.row['iteration'] < self.max_iteration_threshold:
                                self.row['result'] = None
                                self.df = self.df.append(self.row, ignore_index=True)
                            else:
                                # Otherwise, mark fail
                                self.row['result'] = 'fail'
                                self.df = self.df.append(self.row, ignore_index=True)
                                break
#                         break ## For Testing LP Objective
        
            print("User ", user ,"Elapsed: {}".format(inhour(time.time() - start_time)))
        return self.df


    def get_initial_predictions(self):
        self.RQ, Yt, Bias = plrec(self.matrix_Train,
                                       iteration=self.parameters_row['iter'],
                                       lamb=self.parameters_row['lambda'],
                                       rank=self.parameters_row['rank'])
        self.Y = Yt.T

        self.reg = LinearRegression().fit(self.keyphrase_freq, self.RQ)

        self.prediction_scores = predict_scores(matrix_U=self.RQ,
                                                matrix_V=self.Y,
                                                bias=Bias)



In [250]:
row = {}
matrix_Train = rtrain
matrix_Test = rtest
test_users = np.arange(50)
# test_users = [1]
target_ranks = [20, 50]
num_items_sampled = 5
num_keyphrases = 235
df = pd.DataFrame(row)
max_iteration_threshold = 20
keyphrase_popularity = keyphrase_popularity
dataset_name = "yelp"
model = "plrec"
parameters_row = {'iter': 10,
                  'lambda':200,
                  'rank':200}
keyphrases_names = keyphrases
keyphrase_selection_method = 'diff'
max_wanted_keyphrase = 20

critiquing_model = LP1Simplified(keyphrase_freq=U_K,
                                item_keyphrase_freq=I_K,
                                row=row,
                                matrix_Train=matrix_Train,
                                matrix_Test=matrix_Test,
                                test_users=test_users,
                                target_ranks=target_ranks,
                                num_items_sampled=num_items_sampled,
                                num_keyphrases=num_keyphrases,
                                df=df,
                                max_iteration_threshold=max_iteration_threshold,
                                keyphrase_popularity=keyphrase_popularity,
                                dataset_name=dataset_name,
                                model=model,
                                parameters_row=parameters_row,
                                keyphrases_names = keyphrases_names,
                                keyphrase_selection_method = keyphrase_selection_method,
                                max_wanted_keyphrase = max_wanted_keyphrase)
df = critiquing_model.start_critiquing()

table_path = '../tables/critiquing/multi_step_critiquing/yelp/avg/'
name = 'avg_50user.csv'
save_dataframe_csv(df, table_path, name)



  0%|          | 0/50 [00:00<?, ?it/s][A[A

  2%|▏         | 1/50 [00:07<06:08,  7.51s/it][A[A

User  0 Elapsed: 00:00:07




  4%|▍         | 2/50 [00:44<12:58, 16.22s/it][A[A

User  1 Elapsed: 00:00:36




  6%|▌         | 3/50 [00:52<10:49, 13.82s/it][A[A

User  2 Elapsed: 00:00:08




  8%|▊         | 4/50 [01:01<09:28, 12.37s/it][A[A

User  3 Elapsed: 00:00:08




 10%|█         | 5/50 [01:04<07:17,  9.72s/it][A[A

User  4 Elapsed: 00:00:03




 12%|█▏        | 6/50 [01:09<06:00,  8.19s/it][A[A

User  5 Elapsed: 00:00:04




 14%|█▍        | 7/50 [01:14<05:07,  7.15s/it][A[A

User  6 Elapsed: 00:00:04




 16%|█▌        | 8/50 [01:19<04:40,  6.67s/it][A[A

User  7 Elapsed: 00:00:05




 18%|█▊        | 9/50 [01:26<04:35,  6.71s/it][A[A

User  8 Elapsed: 00:00:06




 20%|██        | 10/50 [01:40<05:56,  8.91s/it][A[A

User  9 Elapsed: 00:00:14




 22%|██▏       | 11/50 [01:51<06:09,  9.48s/it][A[A

User  10 Elapsed: 00:00:10




 24%|██▍       | 12/50 [02:00<05:57,  9.41s/it][A[A

User  11 Elapsed: 00:00:09




 26%|██▌       | 13/50 [02:07<05:15,  8.54s/it][A[A

User  12 Elapsed: 00:00:06




 28%|██▊       | 14/50 [02:16<05:16,  8.78s/it][A[A

User  13 Elapsed: 00:00:09




 30%|███       | 15/50 [02:25<05:13,  8.95s/it][A[A

User  14 Elapsed: 00:00:09




 32%|███▏      | 16/50 [02:34<05:04,  8.95s/it][A[A

User  15 Elapsed: 00:00:08




 34%|███▍      | 17/50 [02:55<06:49, 12.40s/it][A[A

User  16 Elapsed: 00:00:20




 36%|███▌      | 18/50 [03:16<08:06, 15.20s/it][A[A

User  17 Elapsed: 00:00:21




 38%|███▊      | 19/50 [03:26<06:59, 13.52s/it][A[A

User  18 Elapsed: 00:00:09




 40%|████      | 20/50 [03:48<08:03, 16.10s/it][A[A

User  19 Elapsed: 00:00:22




 42%|████▏     | 21/50 [04:04<07:42, 15.93s/it][A[A

User  20 Elapsed: 00:00:15




 44%|████▍     | 22/50 [04:16<06:55, 14.82s/it][A[A

User  21 Elapsed: 00:00:12




 46%|████▌     | 23/50 [04:21<05:17, 11.76s/it][A[A

User  22 Elapsed: 00:00:04
User  23 Elapsed: 00:00:00




 50%|█████     | 25/50 [04:28<03:53,  9.35s/it][A[A

User  24 Elapsed: 00:00:07




 52%|█████▏    | 26/50 [05:04<06:54, 17.26s/it][A[A

User  25 Elapsed: 00:00:35




 54%|█████▍    | 27/50 [05:41<08:56, 23.31s/it][A[A

User  26 Elapsed: 00:00:37




 56%|█████▌    | 28/50 [05:49<06:48, 18.59s/it][A[A

User  27 Elapsed: 00:00:07




 58%|█████▊    | 29/50 [05:57<05:22, 15.36s/it][A[A

User  28 Elapsed: 00:00:07




 60%|██████    | 30/50 [06:08<04:41, 14.05s/it][A[A

User  29 Elapsed: 00:00:11




 62%|██████▏   | 31/50 [06:15<03:51, 12.19s/it][A[A

User  30 Elapsed: 00:00:07




 64%|██████▍   | 32/50 [06:24<03:21, 11.22s/it][A[A

User  31 Elapsed: 00:00:08




 66%|██████▌   | 33/50 [06:35<03:09, 11.12s/it][A[A

User  32 Elapsed: 00:00:10
User  33 Elapsed: 00:00:00




 70%|███████   | 35/50 [06:46<02:21,  9.43s/it][A[A

User  34 Elapsed: 00:00:10




 72%|███████▏  | 36/50 [07:00<02:31, 10.80s/it][A[A

User  35 Elapsed: 00:00:13




 74%|███████▍  | 37/50 [07:18<02:46, 12.82s/it][A[A

User  36 Elapsed: 00:00:17




 76%|███████▌  | 38/50 [07:55<04:01, 20.08s/it][A[A

User  37 Elapsed: 00:00:37




 78%|███████▊  | 39/50 [08:00<02:52, 15.67s/it][A[A

User  38 Elapsed: 00:00:05




 80%|████████  | 40/50 [08:10<02:18, 13.81s/it][A[A

User  39 Elapsed: 00:00:09




 82%|████████▏ | 41/50 [08:17<01:47, 11.95s/it][A[A

User  40 Elapsed: 00:00:07




 84%|████████▍ | 42/50 [08:36<01:52, 14.00s/it][A[A

User  41 Elapsed: 00:00:18
User  42 Elapsed: 00:00:00




 88%|████████▊ | 44/50 [08:42<01:04, 10.69s/it][A[A

User  43 Elapsed: 00:00:05
User  44 Elapsed: 00:00:00




 92%|█████████▏| 46/50 [08:56<00:38,  9.59s/it][A[A

User  45 Elapsed: 00:00:14




 94%|█████████▍| 47/50 [09:08<00:30, 10.20s/it][A[A

User  46 Elapsed: 00:00:11




 96%|█████████▌| 48/50 [09:17<00:19,  9.89s/it][A[A

User  47 Elapsed: 00:00:09




 98%|█████████▊| 49/50 [11:14<00:42, 42.16s/it][A[A

User  48 Elapsed: 00:01:57




100%|██████████| 50/50 [11:32<00:00, 13.85s/it][A[A

User  49 Elapsed: 00:00:17





In [242]:
# ranksvm 1 
df.head(40)

Unnamed: 0,critiqued_keyphrase,item_id,item_name,item_rank,item_score,iteration,num_existing_keyphrases,result,target_rank,user_id,critiqued_keyphrase_name,lambda
0,,78.0,b'The Works Gourmet Burger Bistro',,,0.0,20.0,,20.0,0.0,,
1,48.0,78.0,b'The Works Gourmet Burger Bistro',7041.0,-0.177066,1.0,20.0,,20.0,0.0,burger,"[-1.0, -0.925324640481886]"
2,4.0,78.0,b'The Works Gourmet Burger Bistro',5548.0,-0.0507418,2.0,20.0,,20.0,0.0,fry,"[1.0, -1.0, 0.7307518624865557]"
3,5.0,78.0,b'The Works Gourmet Burger Bistro',7121.0,-0.32054,3.0,20.0,,20.0,0.0,fried,"[1.0, -1.0, 0.5613626369228537, 1.0]"
4,7.0,78.0,b'The Works Gourmet Burger Bistro',6470.0,-0.169562,4.0,20.0,,20.0,0.0,dinner,"[1.0, -1.0, 1.0, 1.0, -0.4462269212507798]"
5,25.0,78.0,b'The Works Gourmet Burger Bistro',675.0,0.600058,5.0,20.0,,20.0,0.0,vegetarian,"[0.7629022137921742, -1.0, -1.0, -1.0, -1.0, 0..."
6,27.0,78.0,b'The Works Gourmet Burger Bistro',406.0,0.657362,6.0,20.0,,20.0,0.0,bbq,"[0.54962052931279, -1.0, -1.0, -1.0, -1.0, 0.5..."
7,34.0,78.0,b'The Works Gourmet Burger Bistro',707.0,0.521767,7.0,20.0,,20.0,0.0,potato,"[1.0, -1.0, -1.0, -1.0, -1.0, 0.62469214374814..."
8,35.0,78.0,b'The Works Gourmet Burger Bistro',597.0,0.724383,8.0,20.0,,20.0,0.0,crispy,"[1.0, -1.0, -1.0, -1.0, -1.0, 0.80149860895964..."
9,49.0,78.0,b'The Works Gourmet Burger Bistro',1696.0,0.297518,9.0,20.0,,20.0,0.0,cheese,"[1.0, -1.0, -1.0, -1.0, -1.0, 0.98762462846839..."


In [237]:
df.head(50)

Unnamed: 0,critiqued_keyphrase,item_id,item_name,item_rank,item_score,iteration,num_existing_keyphrases,result,target_rank,user_id,critiqued_keyphrase_name,lambda
0,,78.0,b'The Works Gourmet Burger Bistro',,,0.0,20.0,,20.0,0.0,,
1,48.0,78.0,b'The Works Gourmet Burger Bistro',7123.0,-0.152876,1.0,20.0,,20.0,0.0,burger,"[1.0, -0.5137705260013593]"
2,4.0,78.0,b'The Works Gourmet Burger Bistro',2668.0,0.0135268,2.0,20.0,,20.0,0.0,fry,"[1.0, 0.0, 0.23857817773147239]"
3,5.0,78.0,b'The Works Gourmet Burger Bistro',7112.0,-0.199599,3.0,20.0,,20.0,0.0,fried,"[1.0, 0.0, 0.0, 0.6629007437362784]"
4,7.0,78.0,b'The Works Gourmet Burger Bistro',6753.0,-0.169066,4.0,20.0,,20.0,0.0,dinner,"[1.0, 0.0, 0.39049164436270845, 1.0, 0.0]"
5,25.0,78.0,b'The Works Gourmet Burger Bistro',810.0,0.453526,5.0,20.0,,20.0,0.0,vegetarian,"[1.0, -1.0, -1.0, -1.0, -1.0, 0.623968061728974]"
6,27.0,78.0,b'The Works Gourmet Burger Bistro',446.0,0.637728,6.0,20.0,,20.0,0.0,bbq,"[1.0, -1.0, -1.0, -1.0, -1.0, 0.56032592019824..."
7,34.0,78.0,b'The Works Gourmet Burger Bistro',703.0,0.532229,7.0,20.0,,20.0,0.0,potato,"[1.0, -1.0, -1.0, -1.0, -1.0, 0.63431157329170..."
8,35.0,78.0,b'The Works Gourmet Burger Bistro',594.0,0.74042,8.0,20.0,,20.0,0.0,crispy,"[1.0, -1.0, -1.0, -1.0, -1.0, 0.81624412238244..."
9,49.0,78.0,b'The Works Gourmet Burger Bistro',1540.0,0.348976,9.0,20.0,,20.0,0.0,cheese,"[1.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, -1.0,..."


In [24]:
df

Unnamed: 0,critiqued_keyphrase,item_id,item_name,item_rank,item_score,iteration,num_existing_keyphrases,result,target_rank,user_id,critiqued_keyphrase_name,lambda
0,,75.0,b'Salad King Restaurant',,,0.0,20.0,,20.0,1.0,,
1,2,75.0,b'Salad King Restaurant',3,4.43009,1.0,20.0,successful,20.0,1.0,thai,[1]
2,,78.0,b'The Works Gourmet Burger Bistro',,,0.0,20.0,,20.0,1.0,thai,[1]
3,48,78.0,b'The Works Gourmet Burger Bistro',7343,-0.502279,1.0,20.0,,20.0,1.0,burger,[1]
4,4,78.0,b'The Works Gourmet Burger Bistro',5525,-0.0278124,2.0,20.0,,20.0,1.0,fry,"[1, 1]"
...,...,...,...,...,...,...,...,...,...,...,...,...
1589,121,7354.0,b'Spectacle',2008,0.150263,16.0,20.0,,50.0,1.0,dumpling,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
1590,136,7354.0,b'Spectacle',2001,0.168824,17.0,20.0,,50.0,1.0,store,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1591,150,7354.0,b'Spectacle',2135,0.146572,18.0,20.0,,50.0,1.0,clean,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1592,157,7354.0,b'Spectacle',2304,0.123868,19.0,20.0,,50.0,1.0,greeted,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [206]:
table_path = '../tables/critiquing/multi_step_critiquing/yelp/ranksvm/'
name = 'ranksvm2test.csv'
# save_dataframe_csv(df, table_path, name)
df = load_dataframe_csv(table_path,name)

In [243]:
def avg_successful_rate(df):
    num_runs = len(np.where(df['iteration'] == 0)[0])
    num_success = len(np.where(df['result'] == 'successful')[0])
    
    return num_success/num_runs
def avg_length(df,include_fail = True):
    num_runs = len(np.where(df['iteration'] == 0)[0])
    return (len(df)-num_runs)/num_runs

In [251]:
# df_5 =  df[df['target_rank'] == 5]
# df_10 = df[df['target_rank'] == 10]
df_20 = df[df['target_rank'] == 20]
df_50 = df[df['target_rank'] == 50]

In [245]:
# Ranksvm 1 50 users 20 topaffected lamb = 5
print (avg_length(df_20))
print (avg_successful_rate(df_20))
print (avg_length(df_50))
print (avg_successful_rate(df_50))

18.271084337349397
0.10090361445783133
16.94277108433735
0.1822289156626506


In [239]:
# Ranksvm2 50 users 20topaffected lamb = 5
print (avg_length(df_20))
print (avg_successful_rate(df_20))
print (avg_length(df_50))
print (avg_successful_rate(df_50))

18.370481927710845
0.09789156626506024
17.049698795180724
0.17620481927710843


In [252]:
# Avg
print (avg_length(df_20))
print (avg_successful_rate(df_20))
print (avg_length(df_50))
print (avg_successful_rate(df_50))

16.003012048192772
0.2756024096385542
14.045180722891565
0.39457831325301207


In [219]:
# Ranksvm 2 bot 20 affected
print (avg_length(df_20))
print (avg_successful_rate(df_20))
print (avg_length(df_50))
print (avg_successful_rate(df_50))

20.0
0.0
19.08888888888889
0.06666666666666667


In [290]:
# Rating obj
print (avg_length(df_20))
print (avg_successful_rate(df_20))
print (avg_length(df_50))
print (avg_successful_rate(df_50))

19.377777777777776
0.08888888888888889
18.88888888888889
0.15555555555555556


In [258]:
# top20items
print (avg_length(df_20))
print (avg_successful_rate(df_20))
print (avg_length(df_50))
print (avg_successful_rate(df_50))

18.466666666666665
0.15555555555555556
17.177777777777777
0.26666666666666666
