In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import math

from ast import literal_eval
from collections import defaultdict
import statistics

In [2]:
# Data loading and saving
def save_dataframe_csv(df, path, name):
    """Write ``df`` as a CSV file at ``path + name`` without the index column.

    ``path`` and ``name`` are joined by plain string concatenation, so
    ``path`` must already end with a separator when ``name`` is a bare
    file name.
    """
    destination = path + name
    df.to_csv(destination, index=False)

def load_dataframe_csv(path, name, index_col=None):
    """Read the CSV file at ``path + name`` and return it as a DataFrame.

    ``path`` and ``name`` are joined by plain string concatenation.
    ``index_col`` is forwarded unchanged to ``pd.read_csv``.
    """
    source = path + name
    return pd.read_csv(source, index_col=index_col)

# Get average length and success rate
def get_average_length(df, n):
    """Average the per-user mean of ``iteration`` for target rank ``n``.

    Only rows whose ``result`` is 'successful' or 'fail' and whose
    ``target_rank`` equals ``n`` are considered. ``iteration`` is first
    averaged per ``user_id``; the per-user means are then summarised.

    Returns:
        Tuple ``(mean, half_width)`` where ``half_width`` is
        ``1.96 * std / sqrt(number of users)`` using the population standard
        deviation (``np.std`` default). 1.96 is presumably the normal 95%
        z-value.
    """
    finished = df[df['result'].isin(['successful', 'fail'])]
    at_rank = finished[finished['target_rank'] == n]
    per_user = at_rank.groupby('user_id', as_index=False).agg({'iteration': 'mean'})
    lengths = per_user['iteration'].to_numpy()

    mean_length = np.average(lengths)
    half_width = 1.96 * np.std(lengths) / np.sqrt(len(lengths))
    return (mean_length, half_width)

def get_success_num(df, n):
    """Count the rows with ``result == 'successful'`` and ``target_rank == n``."""
    succeeded = df['result'] == 'successful'
    at_rank = df['target_rank'] == n
    return len(df[succeeded & at_rank])

def get_fail_num(df, n):
    """Count the rows with ``result == 'fail'`` and ``target_rank == n``."""
    failed = df['result'] == 'fail'
    at_rank = df['target_rank'] == n
    return len(df[failed & at_rank])

def get_success_rate(df, n):
    """Average the per-user share of successful rows for target rank ``n``.

    Only rows whose ``result`` is 'successful' or 'fail' and whose
    ``target_rank`` equals ``n`` are considered. For every ``user_id`` the
    fraction of 'successful' entries is computed; those fractions are then
    summarised.

    Returns:
        Tuple ``(mean, half_width)`` where ``half_width`` is
        ``1.96 * std / sqrt(number of users)`` using the population standard
        deviation (``np.std`` default).
    """
    finished = df[df['result'].isin(['successful', 'fail'])]
    at_rank = finished[finished['target_rank'] == n]
    outcomes_per_user = (
        at_rank.groupby('user_id')['result']
        .apply(list)
        .reset_index(name='result')['result']
    )

    def _share_successful(outcomes):
        # Fraction of this user's finished sessions that succeeded.
        return outcomes.count("successful") / len(outcomes)

    rates = outcomes_per_user.apply(_share_successful).to_numpy()
    mean_rate = np.average(rates)
    half_width = 1.96 * np.std(rates) / np.sqrt(len(rates))
    return (mean_rate, half_width)

# For creating latex table
def get_2_metric(df, topk = 1, print_result = True):
    """Compute average length and success rate of ``df`` at cut-off ``topk``.

    For ``topk != 1`` the frame is first rewritten by ``change_target_rank``;
    for ``topk == 1`` it is used as given.

    Returns:
        ``None`` when ``print_result`` is true (both metrics are printed
        instead); otherwise the pair
        ``(get_average_length(...), get_success_rate(...))``, in that order.
    """
    if topk != 1:
        df = change_target_rank(df,topk=topk)

    if not print_result:
        return get_average_length(df,topk), get_success_rate(df,topk)

    print ('avg_length: ',get_average_length(df,topk))
    print ('suc_rate: ',get_success_rate(df,topk))
    return None


def latex_row(df, table_name = "success_rate", value_precision = 3, uncertainty_precision = 4, topks = None):
    """Build one table row of "value +/- uncertainty" cells, keyed by top-k cut-off.

    Args:
        df: Result frame passed on to ``get_2_metric``.
        table_name: "success_rate" selects the success rate; any other value
            selects the average length.
        value_precision: Number of decimals the value is rounded to.
        uncertainty_precision: Number of decimals the uncertainty is rounded to.
        topks: Cut-offs to evaluate. Defaults to the module-level ``columns``
            list, which is what this function always used before.

    Returns:
        Dict mapping each cut-off to the string
        ``<value>$\\pm$<uncertainty>`` (LaTeX plus-minus macro between the
        two rounded numbers).
    """
    if topks is None:
        # Backward-compatible default: the notebook-level list of cut-offs.
        topks = columns

    row = {}
    for topk in topks:
        # get_2_metric returns (average length, success rate), in that order.
        avg_length, success_rate = get_2_metric(df, topk = topk, print_result=False)
        metric = success_rate if table_name == "success_rate" else avg_length
        row[topk] = str(round(metric[0],value_precision)) + r'$\pm$' + str(round(metric[1],uncertainty_precision))
    return row
def latex_table_row(df, table_name = "success_rate", value_precision = 3, uncertainty_precision = 4):
    """Print a one-row LaTeX table of the chosen metric at cut-offs 1, 5, 10, 20, 50.

    Args:
        df: Result frame passed on to ``latex_row``.
        table_name: "success_rate" for the success-rate table; any other
            value yields the average-length table (see ``latex_row``).
        value_precision: Number of decimals for the value.
        uncertainty_precision: Number of decimals for the uncertainty.

    Returns:
        None. The LaTeX source is printed with ``escape=False`` so the
        ``$\\pm$`` in each cell is emitted verbatim.
    """
    # Header order of the printed table. latex_row itself iterates the
    # module-level ``columns`` list; keys of ``row`` outside this list are
    # not shown.
    columns = [1,5,10,20,50]
    row = latex_row(df,
                    table_name = table_name,
                    value_precision = value_precision,
                    uncertainty_precision = uncertainty_precision)
    # DataFrame.append was removed in pandas 2.0, so the single-row frame is
    # built directly instead of appending to an empty one.
    table = pd.DataFrame([row], columns=columns)
    print (table.to_latex(escape=False))

# Get tuning result 
def get_best_metric_with_single_lambda(data_path, lambs, dataset_name = "yelp", keyphrase_selection_method = "random",topk = 20, return_all = False, top_affected = None):
    """Load the tuning table for every lambda and report the metrics at ``topk``.

    For each ``lamb`` in ``lambs`` the file
    ``data_path + '../tables/<dataset_name>/tune/tuning_at_lamb_<lamb>_with_<method>.csv'``
    is read; when ``top_affected`` is given, the directory
    ``tune_new_objective/topk_<top_affected>/`` is used in place of ``tune/``.

    Args:
        data_path: Prefix concatenated in front of the relative table path.
        lambs: Indexable sequence of lambda values to evaluate.
        dataset_name: Dataset sub-directory under ``../tables/``.
        keyphrase_selection_method: Suffix used in the file name.
        topk: Cut-off handed to ``get_2_metric``.
        return_all: If true, return the metrics for every lambda.
        top_affected: Optional value selecting the ``tune_new_objective`` tables.

    Returns:
        ``(avg_lengths, suc_rates)`` lists when ``return_all`` is true;
        otherwise ``(best_lambda, avg_length, suc_rate)`` for the lambda with
        the highest mean success rate (first one on ties, per ``np.argmax``).

    Raises:
        ValueError: from ``np.argmax`` when ``lambs`` is empty and
            ``return_all`` is false.
    """
    avg_lengths = []
    suc_rates = []
    for lamb in lambs:
        file_name = 'tuning_at_lamb_' + str(lamb) + '_with_' + keyphrase_selection_method + '.csv'
        if top_affected is not None:
            tune_dir = '../tables/' + dataset_name + '/tune_new_objective/topk_' + str(top_affected) + '/'
        else:
            tune_dir = '../tables/' + dataset_name + '/tune/'
        df = load_dataframe_csv(data_path, tune_dir + file_name)
        avg_length, suc_rate = get_2_metric(df, topk = topk, print_result = False)

        avg_lengths.append(avg_length)
        suc_rates.append(suc_rate)

    if return_all:
        return avg_lengths, suc_rates

    # Each suc_rate is a (mean, half-width) pair; rank lambdas by the mean.
    mean_rates = [suc_rate[0] for suc_rate in suc_rates]
    best = int(np.argmax(mean_rates))
    return lambs[best], avg_lengths[best], suc_rates[best]

# Top-k cut-offs evaluated for the result tables; latex_row iterates this list.
columns = [1, 5, 10, 20, 50]

In [3]:
def get_drop_index(df, topk = 20):
    """Collect the row labels that follow the first top-``topk`` hit of a run.

    The frame is scanned in order using the labels ``0 .. len(df) - 1``. A
    row with ``iteration == 0`` starts a new run (presumably a new
    critiquing session; confirm against the data producer). Within a run,
    the first row whose ``item_rank`` is at most ``topk`` is kept, and every
    later row of the same run is reported for removal.

    Returns:
        List of integer row labels to drop.
    """
    rows_to_drop = []
    target_reached = False
    for row in range(len(df)):
        if df["iteration"][row] == 0:
            # New run: forget that the previous one already hit the target.
            target_reached = False
        if df['item_rank'][row] <= topk and not target_reached:
            target_reached = True
        elif target_reached:
            rows_to_drop.append(row)
    return rows_to_drop

def change_target_rank(df,topk = 20):
    """Return a copy of ``df`` re-evaluated against the cut-off ``topk``.

    Rows reported by ``get_drop_index`` are removed and the index is
    renumbered. ``target_rank`` is then set to ``topk`` on every row, and
    ``result`` becomes "successful" wherever ``item_rank <= target_rank``.
    Other ``result`` values are left as they were. ``df`` itself is not
    modified.
    """
    surplus_rows = get_drop_index(df,topk = topk)
    retargeted = df.drop(surplus_rows).reset_index(drop=True)
    retargeted['target_rank'] = topk

    # Visit only the rows that meet the new target, in index order.
    reached = (retargeted['item_rank'] <= retargeted['target_rank']).to_numpy()
    for row in retargeted.index[reached]:
        retargeted.at[row,"result"] = "successful"
    return retargeted

In [4]:
def get_df_freq(df, precision = 1, theta_col = "theta"):
    """
    Count how often each theta value occurs across the entire dataframe.

    Every cell of ``df[theta_col]`` is parsed with ``ast.literal_eval`` and is
    expected to evaluate to an iterable of numbers (e.g. the string
    ``"[0.1, 0.25]"``).

    Args:
        df: Frame holding the stringified theta lists.
        precision: Number of decimals each value is rounded to before
            counting; ``None`` counts the raw values.
        theta_col: Name of the column to read.

    Returns:
        ``defaultdict(int)`` mapping (rounded) value -> number of occurrences.
    """
    theta_freq = defaultdict(int)
    # Iterate over the cell values themselves rather than df[theta_col][i], so
    # the frame does not need a default 0..n-1 index.
    for raw_theta in df[theta_col]:
        for value in literal_eval(raw_theta):
            key = value if precision is None else round(value, precision)
            theta_freq[key] += 1
    return theta_freq

# Yelp

## UAC

In [6]:
# Yelp, uac_random: success-rate table first, then average-length table
df = load_dataframe_csv(path="../tables/reproduce/yelp/uac_random.csv", name="")
latex_table_row(df, "success_rate", value_precision=4, uncertainty_precision=3)
latex_table_row(df, "avg_length", value_precision=3, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &               20 &                50 \\
\midrule
0 &  0.0046$\pm$0.003 &  0.0369$\pm$0.012 &  0.0561$\pm$0.016 &  0.083$\pm$0.018 &  0.1367$\pm$0.024 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &               1  &               5  &               10 &               20 &               50 \\
\midrule
0 &  9.964$\pm$0.027 &  9.701$\pm$0.103 &  9.545$\pm$0.135 &  9.308$\pm$0.154 &  8.864$\pm$0.195 \\
\bottomrule
\end{tabular}



In [7]:
# Yelp, uac_diff: success-rate table first, then average-length table
df = load_dataframe_csv(path="../tables/reproduce/yelp/uac_diff.csv", name="")
latex_table_row(df, "success_rate", value_precision=4, uncertainty_precision=3)
latex_table_row(df, "avg_length", value_precision=3, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &               20 &                50 \\
\midrule
0 &  0.0066$\pm$0.004 &  0.0353$\pm$0.012 &  0.0533$\pm$0.015 &  0.086$\pm$0.018 &  0.1422$\pm$0.025 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &               1  &               5  &               10 &               20 &               50 \\
\midrule
0 &  9.955$\pm$0.029 &  9.711$\pm$0.098 &  9.573$\pm$0.128 &  9.277$\pm$0.157 &  8.808$\pm$0.208 \\
\bottomrule
\end{tabular}



## BAC

In [10]:
# Yelp, bac_random: success-rate table first, then average-length table
df = load_dataframe_csv(path="../tables/reproduce/yelp/bac_random.csv", name="")
latex_table_row(df, "success_rate", value_precision=4, uncertainty_precision=3)
latex_table_row(df, "avg_length", value_precision=3, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  0.0036$\pm$0.003 &  0.0309$\pm$0.011 &  0.0475$\pm$0.015 &  0.0719$\pm$0.017 &  0.1153$\pm$0.021 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &               1  &               5  &              10 &              20 &               50 \\
\midrule
0 &  9.968$\pm$0.025 &  9.723$\pm$0.096 &  9.58$\pm$0.131 &  9.357$\pm$0.15 &  8.974$\pm$0.187 \\
\bottomrule
\end{tabular}



In [11]:
# Yelp, bac_diff: success-rate table first, then average-length table
df = load_dataframe_csv(path="../tables/reproduce/yelp/bac_diff.csv", name="")
latex_table_row(df, "success_rate", value_precision=4, uncertainty_precision=3)
latex_table_row(df, "avg_length", value_precision=3, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &               1  &                5  &               10 &                20 &              50 \\
\midrule
0 &  0.004$\pm$0.003 &  0.0322$\pm$0.011 &  0.046$\pm$0.014 &  0.0755$\pm$0.017 &  0.12$\pm$0.022 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &               1  &               5  &               10 &               20 &               50 \\
\midrule
0 &  9.965$\pm$0.026 &  9.715$\pm$0.096 &  9.593$\pm$0.126 &  9.326$\pm$0.153 &  8.924$\pm$0.197 \\
\bottomrule
\end{tabular}



## LLC Score

In [12]:
# Yelp, llcscore_random: success-rate table first, then average-length table
df = load_dataframe_csv(path="../tables/reproduce/yelp/llcscore_random.csv", name="")
latex_table_row(df, "success_rate", value_precision=4, uncertainty_precision=3)
latex_table_row(df, "avg_length", value_precision=3, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &               1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  0.015$\pm$0.007 &  0.0519$\pm$0.014 &  0.0667$\pm$0.016 &  0.0983$\pm$0.019 &  0.1609$\pm$0.026 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &               1  &               5  &               10 &               20 &               50 \\
\midrule
0 &  9.919$\pm$0.048 &  9.616$\pm$0.117 &  9.479$\pm$0.134 &  9.226$\pm$0.154 &  8.708$\pm$0.205 \\
\bottomrule
\end{tabular}



In [13]:
# Yelp, llcscore_diff: success-rate table first, then average-length table
df = load_dataframe_csv(path="../tables/reproduce/yelp/llcscore_diff.csv", name="")
latex_table_row(df, "success_rate", value_precision=4, uncertainty_precision=3)
latex_table_row(df, "avg_length", value_precision=3, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  0.0049$\pm$0.005 &  0.0339$\pm$0.011 &  0.0517$\pm$0.015 &  0.0772$\pm$0.017 &  0.1276$\pm$0.022 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &               1  &               5  &              10 &               20 &               50 \\
\midrule
0 &  9.957$\pm$0.042 &  9.705$\pm$0.102 &  9.551$\pm$0.13 &  9.328$\pm$0.151 &  8.858$\pm$0.196 \\
\bottomrule
\end{tabular}



## LLC Rank

In [14]:
# Yelp, llcrank_lamb200_top10_random: success-rate table first, then average-length table
df = load_dataframe_csv(path="../tables/reproduce/yelp/llcrank_lamb200_top10_random.csv", name="")
latex_table_row(df, "success_rate", value_precision=4, uncertainty_precision=3)
latex_table_row(df, "avg_length", value_precision=3, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &                1  &               5  &               10 &                20 &                50 \\
\midrule
0 &  0.0161$\pm$0.008 &  0.069$\pm$0.017 &  0.0893$\pm$0.02 &  0.1224$\pm$0.023 &  0.2058$\pm$0.027 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &               1  &              5  &               10 &               20 &               50 \\
\midrule
0 &  9.905$\pm$0.052 &  9.542$\pm$0.12 &  9.363$\pm$0.146 &  9.123$\pm$0.161 &  8.546$\pm$0.198 \\
\bottomrule
\end{tabular}



In [15]:
# Yelp, llcrank_lamb200_top10_diff: success-rate table first, then average-length table
df = load_dataframe_csv(path="../tables/reproduce/yelp/llcrank_lamb200_top10_diff.csv", name="")
latex_table_row(df, "success_rate", value_precision=4, uncertainty_precision=3)
latex_table_row(df, "avg_length", value_precision=3, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &               20 &                50 \\
\midrule
0 &  0.0196$\pm$0.009 &  0.0585$\pm$0.016 &  0.0849$\pm$0.019 &  0.134$\pm$0.024 &  0.2156$\pm$0.031 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &               1  &               5  &               10 &               20 &               50 \\
\midrule
0 &  9.896$\pm$0.052 &  9.627$\pm$0.105 &  9.438$\pm$0.128 &  9.098$\pm$0.158 &  8.487$\pm$0.224 \\
\bottomrule
\end{tabular}



# Beer

## UAC

In [5]:
# Beer, uac_random: success-rate table first, then average-length table
df = load_dataframe_csv(path="../tables/reproduce/beer/uac_random.csv", name="")
latex_table_row(df, "success_rate", value_precision=4, uncertainty_precision=3)
latex_table_row(df, "avg_length", value_precision=3, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &               20 &                50 \\
\midrule
0 &  0.0171$\pm$0.023 &  0.0274$\pm$0.025 &  0.0686$\pm$0.037 &  0.106$\pm$0.041 &  0.2074$\pm$0.054 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &              1  &               5  &               10 &               20 &               50 \\
\midrule
0 &  6.44$\pm$0.492 &  6.396$\pm$0.495 &  6.301$\pm$0.502 &  6.208$\pm$0.511 &  5.879$\pm$0.521 \\
\bottomrule
\end{tabular}



In [51]:
# Beer, uac_diff: success-rate table first, then average-length table
df = load_dataframe_csv(path="../tables/reproduce/beer/uac_diff.csv", name="")
latex_table_row(df, "success_rate", value_precision=4, uncertainty_precision=3)
latex_table_row(df, "avg_length", value_precision=3, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  0.0286$\pm$0.032 &  0.0528$\pm$0.035 &  0.0792$\pm$0.039 &  0.1093$\pm$0.042 &  0.2142$\pm$0.054 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &               1  &               5  &               10 &               20 &               50 \\
\midrule
0 &  9.806$\pm$0.216 &  9.622$\pm$0.263 &  9.349$\pm$0.331 &  9.075$\pm$0.359 &  8.082$\pm$0.484 \\
\bottomrule
\end{tabular}

