# Keyphrase Boosting 

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import math

In [2]:
def save_dataframe_csv(df, path, name):
    """Write ``df`` as a CSV file at ``path + name``, omitting the index column."""
    destination = path + name
    df.to_csv(destination, index=False)

def load_dataframe_csv(path, name, index_col=None):
    """Read the CSV file at ``path + name`` into a DataFrame.

    ``index_col`` is forwarded unchanged to ``pandas.read_csv``.
    """
    source = path + name
    return pd.read_csv(source, index_col=index_col)

In [3]:
def get_average_length(df, n):
    """Return ``(mean, half_width)`` of the per-user mean ``iteration``.

    Only rows whose ``result`` is ``'successful'`` or ``'fail'`` and whose
    ``target_rank`` equals ``n`` are used. The ``iteration`` values are
    averaged per ``user_id`` first, then across users. ``half_width`` is
    1.96 times the standard error of those per-user means (presumably a 95%
    normal-approximation interval).
    """
    finished = df['result'].isin(['successful', 'fail'])
    at_rank = df['target_rank'] == n
    per_user = df[finished & at_rank].groupby('user_id')['iteration'].mean().to_numpy()
    half_width = 1.96 * np.std(per_user) / np.sqrt(len(per_user))
    return (np.average(per_user), half_width)

def get_success_num(df, n):
    """Count the rows with ``result == 'successful'`` and ``target_rank == n``."""
    is_hit = (df['target_rank'] == n) & (df['result'] == 'successful')
    return int(is_hit.sum())

def get_fail_num(df, n):
    """Count the rows with ``result == 'fail'`` and ``target_rank == n``."""
    is_miss = (df['target_rank'] == n) & (df['result'] == 'fail')
    return int(is_miss.sum())

def get_success_rate(df, n):
    """Return ``(mean, half_width)`` of the per-user success rate.

    Only rows whose ``result`` is ``'successful'`` or ``'fail'`` and whose
    ``target_rank`` equals ``n`` are used. For every ``user_id`` the rate is
    the fraction of those rows marked ``'successful'``; the rates are then
    averaged across users. ``half_width`` is 1.96 times the standard error
    of the per-user rates (presumably a 95% normal-approximation interval).
    """
    finished = df['result'].isin(['successful', 'fail'])
    selected = df[finished & (df['target_rank'] == n)]
    # The mean of a 0/1 indicator per user equals count("successful") / len.
    # This avoids groupby(as_index=False).apply(list).reset_index(name=...),
    # whose return type is not the same on every pandas version.
    is_success = (selected['result'] == 'successful').astype(float)
    successful_rate = is_success.groupby(selected['user_id']).mean().to_numpy()
    half_width = 1.96 * np.std(successful_rate) / np.sqrt(len(successful_rate))
    return (np.average(successful_rate), half_width)



In [4]:
def get_2_metric(df, topk = 1, print_result = True):
    """Compute the average length and success rate of ``df`` at ``topk``.

    When ``topk`` is not 1 the frame is first passed through
    ``change_target_rank``. With ``print_result`` true both metrics are
    printed and ``None`` is returned; otherwise the pair
    ``(average_length, success_rate)`` is returned, each element being the
    tuple produced by the corresponding helper.
    """
    frame = df if topk == 1 else change_target_rank(df, topk=topk)
    if not print_result:
        return get_average_length(frame, topk), get_success_rate(frame, topk)
    print('avg_length: ', get_average_length(frame, topk))
    print('suc_rate: ', get_success_rate(frame, topk))
    return

In [107]:
def get_best_metric_with_single_lambda(data_path, lambs, dataset_name = "yelp", keyphrase_selection_method = "random",topk = 20, return_all = False, top_affected = None):
    """Evaluate one tuning table per lambda and pick the best lambda.

    For every value in ``lambs`` the table
    ``../tables/tuning_lambda/[topk_<top_affected>/]<dataset_name>/tuning_<dataset_name>_at_lamb_<lamb>_with_<keyphrase_selection_method>.csv``
    (relative to ``data_path``) is loaded and scored with ``get_2_metric``
    at ``topk``. The ``topk_<top_affected>/`` directory is used only when
    ``top_affected`` is given.

    Returns:
        With ``return_all`` true: ``(avg_lengths, suc_rates)``, two lists
        aligned with ``lambs``. Otherwise ``(lamb, avg_length, suc_rate)``
        for the lambda with the highest mean success rate (first one wins
        on ties, as ``np.argmax`` does).
    """
    base_dir = '../tables/tuning_lambda/'
    if top_affected is not None:
        base_dir += 'topk_' + str(top_affected) + '/'

    avg_lengths = []
    suc_rates = []
    for lamb in lambs:
        table_name = (base_dir + dataset_name + '/tuning_' + dataset_name
                      + '_at_lamb_' + str(lamb) + '_with_'
                      + keyphrase_selection_method + '.csv')
        df = load_dataframe_csv(data_path, table_name)
        avg_length, suc_rate = get_2_metric(df, topk=topk, print_result=False)
        avg_lengths.append(avg_length)
        suc_rates.append(suc_rate)

    if return_all:
        return avg_lengths, suc_rates

    # Each suc_rate is (mean, half_width); rank the lambdas by the mean only.
    best = int(np.argmax([rate[0] for rate in suc_rates]))
    return lambs[best], avg_lengths[best], suc_rates[best]

In [6]:
def get_drop_index(df, topk = 20):
    """Return the index labels of rows that follow the first top-``topk`` hit.

    Rows are scanned in order. A row with ``iteration == 0`` starts a new
    run. Within a run, the first row whose ``item_rank`` is below ``topk``
    is kept, and every later row of that run is reported for removal.

    The returned values are labels taken from ``df.index`` (suitable for
    ``DataFrame.drop``), so frames without a 0..n-1 index are handled too.
    """
    drop_index = []
    reached = False
    rows = zip(df.index, df["iteration"].to_numpy(), df["item_rank"].to_numpy())
    for label, iteration, rank in rows:
        if iteration == 0:
            # A new run begins: forget the hit of the previous one.
            reached = False
        if reached:
            drop_index.append(label)
        elif rank < topk:
            reached = True
    return drop_index
def change_target_rank(df,topk = 20):
    """Return a copy of ``df`` re-scored against a target rank of ``topk``.

    The rows reported by ``get_drop_index`` are removed and the index is
    renumbered. ``target_rank`` is set to ``topk`` on every remaining row,
    and ``result`` becomes ``"successful"`` wherever ``item_rank`` is below
    the target rank. The input frame is not modified.
    """
    rows_to_drop = get_drop_index(df, topk=topk)
    trimmed = df.drop(rows_to_drop).reset_index(drop=True)
    trimmed['target_rank'] = topk
    reached = trimmed['item_rank'] < trimmed['target_rank']
    trimmed.loc[reached, 'result'] = "successful"
    return trimmed

In [146]:
table_path = "../tables/reproducing/beer/lp1simplified_top100_sample_25users.csv"
df = load_dataframe_csv(table_path,"")    

In [187]:
table_path = "../tables/reproducing/beer/lp1simplified_top100_50users_random.csv"
df = load_dataframe_csv(table_path,"")    

In [151]:
# new rating random 
print ("top1 result:")
get_2_metric(df, topk = 1, print_result= True)
print ('top20 result:')
df_20 = change_target_rank(df,topk=20)
get_2_metric(df_20, topk = 20, print_result= True)

top1 result:
avg_length:  (19.74962962962963, 0.29107414958393807)
suc_rate:  (0.022623456790123456, 0.020989635923557414)
top20 result:
avg_length:  (17.746574074074076, 1.1476013615327938)
suc_rate:  (0.1270356803690137, 0.0616629059833558)


In [167]:
# average random
table_path = "../tables/reproducing/beer/avg_top100_sample_25users.csv"
df = load_dataframe_csv(table_path,"")  

In [168]:
 print ("top1 result:")
get_2_metric(df, topk = 1, print_result= True)
print ('top20 result:')
get_2_metric(df_20, topk = 20, print_result= True)

top1 result:
avg_length:  (19.958333333333332, 0.07994717584709247)
suc_rate:  (0.005208333333333333, 0.009993396980886561)
top20 result:
avg_length:  (17.770470787382553, 1.0594595506005924)
suc_rate:  (0.17312225719088464, 0.07581358661361602)


In [169]:
# RankSVM3 random
dataset_name = "beer"
lamb = 0.01
keyphrase_selection_method = "random"
table_path = '../tables/tuning_lambda/'+dataset_name+'/tuning_'+dataset_name+'_at_lamb_'+ str(lamb)+'_with_'+ keyphrase_selection_method+'.csv'
df = load_dataframe_csv(table_path,"")     

In [170]:
 print ("top1 result:")
get_2_metric(df, topk = 1, print_result= True)
print ('top20 result:')
get_2_metric(df_20, topk = 20, print_result= True)

top1 result:
avg_length:  (19.92509259259259, 0.09356016847696566)
suc_rate:  (0.00558641975308642, 0.006657888556071205)
top20 result:
avg_length:  (17.770470787382553, 1.0594595506005924)
suc_rate:  (0.17312225719088464, 0.07581358661361602)


In [137]:
get_2_metric(df, topk = 1, print_result= True)

avg_length:  (19.308145135547093, 0.510198578509671)
suc_rate:  (0.04709948366811111, 0.033635644108119644)


In [140]:
df_20 = change_target_rank(df,topk=20)
get_2_metric(df_20, topk = 20, print_result= True)

avg_length:  (16.003874304707637, 1.4707664579856916)
suc_rate:  (0.22928394408786568, 0.08431886915092991)


In [141]:
dataset_name = "beer"
lamb = 1
keyphrase_selection_method = "random"
table_path = '../tables/tuning_lambda/'+dataset_name+'/tuning_'+dataset_name+'_at_lamb_'+ str(lamb)+'_with_'+ keyphrase_selection_method+'.csv'
df = load_dataframe_csv(table_path,"")

In [142]:
get_2_metric(df, topk = 1, print_result= True)

avg_length:  (19.870587437254105, 0.14267671270303392)
suc_rate:  (0.018618572785239453, 0.02253631479002173)


In [143]:
df_20 = change_target_rank(df,topk=20)
get_2_metric(df_20, topk = 20, print_result= True)

avg_length:  (18.15803243053243, 1.0057396691448985)
suc_rate:  (0.10936757686757687, 0.05489371263025239)


# Table for Overleaf

In [121]:
def latex_row(df, table_name = "success_rate", value_precision = 3, uncertainty_precision = 4, columns = None):
    """Build one table row of ``value$\\pm$uncertainty`` strings keyed by top-k.

    Args:
        df: frame accepted by ``get_2_metric``.
        table_name: ``"success_rate"`` selects the success-rate metric; any
            other value selects the average length.
        value_precision: number of decimals kept for the metric value.
        uncertainty_precision: number of decimals kept for the uncertainty.
        columns: top-k cut-offs to evaluate; defaults to ``(1, 5, 10, 20, 50)``.

    Returns:
        dict mapping each cut-off to its formatted string.
    """
    if columns is None:
        columns = (1, 5, 10, 20, 50)
    row = {}
    for topk in columns:
        # get_2_metric returns (average_length, success_rate), in that order.
        avg_length, suc_rate = get_2_metric(df, topk=topk, print_result=False)
        metric = suc_rate if table_name == "success_rate" else avg_length
        row[topk] = str(round(metric[0], value_precision)) + r'$\pm$' + str(round(metric[1], uncertainty_precision))
    return row

In [122]:
def latex_table_row(df, table_name = "success_rate", value_precision = 3, uncertainty_precision = 4):
    """Print a one-row LaTeX table of the chosen metric at top-1/5/10/20/50.

    The row content comes from ``latex_row``; the arguments are forwarded to
    it unchanged. The table is printed with ``escape=False`` so the
    ``$\\pm$`` markup in the cells is emitted verbatim.
    """
    columns = [1, 5, 10, 20, 50]
    row = latex_row(df,
                    table_name=table_name,
                    value_precision=value_precision,
                    uncertainty_precision=uncertainty_precision)
    # Build the frame directly: DataFrame.append no longer exists in pandas 2.x.
    table = pd.DataFrame([row], columns=columns)
    print(table.to_latex(escape=False))

In [123]:
latex_table_row(df, table_name = "success_rate", value_precision = 3, uncertainty_precision = 4)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &               10 &                20 &                50 \\
\midrule
0 &  0.015$\pm$0.0218 &  0.069$\pm$0.0508 &  0.127$\pm$0.076 &  0.195$\pm$0.0835 &  0.243$\pm$0.0842 \\
\bottomrule
\end{tabular}



# Yelp

## Random

### Avg

In [127]:
# average random
table_path = "../tables/reproducing/yelp/average_50_users_random.csv"
df = load_dataframe_csv(table_path,"")   

In [128]:
latex_table_row(df, table_name = "success_rate", value_precision = 4, uncertainty_precision = 3)
latex_table_row(df, table_name = "avg_length", value_precision = 3, uncertainty_precision = 3)

\begin{tabular}{llllll}
\toprule
{} &               1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  0.0158$\pm$0.01 &  0.0552$\pm$0.019 &  0.0845$\pm$0.024 &  0.1316$\pm$0.029 &  0.2153$\pm$0.039 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  19.036$\pm$0.357 &  18.603$\pm$0.377 &  18.155$\pm$0.465 &  17.454$\pm$0.524 &  16.145$\pm$0.659 \\
\bottomrule
\end{tabular}



In [237]:
# avg random 
print ("top1 result:")
get_2_metric(df, topk = 1, print_result= True)

print ("top5 result:")
get_2_metric(df, topk = 5, print_result= True)

print ("top10 result:")
get_2_metric(df, topk = 10, print_result= True)

print ('top20 result:')
get_2_metric(df, topk = 20, print_result= True)

print ('top50 result:')
get_2_metric(df, topk = 50, print_result= True)

top1 result:
avg_length:  (19.03559877104511, 0.3572120449980587)
suc_rate:  (0.015840222299200627, 0.009591226262474677)
top5 result:
avg_length:  (18.60254403037814, 0.37723308447177045)
suc_rate:  (0.0551611808861551, 0.01883349549045509)
top10 result:
avg_length:  (18.15496984411071, 0.465279928173111)
suc_rate:  (0.0845436633942826, 0.024319431068433355)
top20 result:
avg_length:  (17.453566172962457, 0.5241739324116091)
suc_rate:  (0.13157701856489268, 0.028787120650399345)
top50 result:
avg_length:  (16.14465360276119, 0.6586009718201147)
suc_rate:  (0.21532475636964804, 0.03851832752271652)


### Rating 

In [133]:
table_path = "../tables/reproducing/yelp/lp1simplified_top100_50users_random.csv"
df = load_dataframe_csv(table_path,"")   

In [134]:
latex_table_row(df, table_name = "success_rate", value_precision = 4, uncertainty_precision = 3)
latex_table_row(df, table_name = "avg_length", value_precision = 3, uncertainty_precision = 3)

\begin{tabular}{llllll}
\toprule
{} &                1  &               5  &               10 &                20 &                50 \\
\midrule
0 &  0.0111$\pm$0.009 &  0.022$\pm$0.013 &  0.0384$\pm$0.02 &  0.0665$\pm$0.024 &  0.1224$\pm$0.034 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &               5  &                10 &                20 &                50 \\
\midrule
0 &  19.029$\pm$0.338 &  18.88$\pm$0.347 &  18.624$\pm$0.399 &  18.133$\pm$0.514 &  17.178$\pm$0.611 \\
\bottomrule
\end{tabular}



In [235]:
# new rating random 
print ("top1 result:")
get_2_metric(df, topk = 1, print_result= True)

print ("top5 result:")
get_2_metric(df, topk = 5, print_result= True)

print ("top10 result:")
get_2_metric(df, topk = 10, print_result= True)

print ('top20 result:')
get_2_metric(df, topk = 20, print_result= True)

print ('top50 result:')
get_2_metric(df, topk = 50, print_result= True)

top1 result:
avg_length:  (19.028766511600622, 0.3384511962364016)
suc_rate:  (0.011059850643183978, 0.008876222642763183)
top5 result:
avg_length:  (18.880373392374167, 0.3470727930073836)
suc_rate:  (0.022017180350513685, 0.012945332297826436)
top10 result:
avg_length:  (18.624498653078376, 0.3991413461197096)
suc_rate:  (0.03837325118026873, 0.019772108658030165)
top20 result:
avg_length:  (18.13328310793094, 0.5137145355516685)
suc_rate:  (0.06651036776165775, 0.023884815401298304)
top50 result:
avg_length:  (17.177852856780877, 0.6105745073551999)
suc_rate:  (0.12241955522786277, 0.034405594316651876)


### RankSVM3

In [106]:
# Top 20
get_best_metric_with_single_lambda(data_path = "", 
                                   lambs = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 50, 70, 90, 100, 200, 500, 1000, 10000, 100000],
                                   dataset_name = "yelp", 
                                   keyphrase_selection_method = "random",
                                   topk = 20, 
                                   return_all = False)

(10,
 (17.93324644749051, 0.44290528302809007),
 (0.12766005370236533, 0.03531259989093568))

In [227]:
table_path = "../tables/tuning_lambda/yelp/tuning_yelp_at_lamb_30_with_random.csv"
df = load_dataframe_csv(table_path,"")   

In [233]:
# ranksvm3 random 
print ("top1 result:")
get_2_metric(df, topk = 1, print_result= True)

print ("top5 result:")
get_2_metric(df, topk = 5, print_result= True)

print ("top10 result:")
get_2_metric(df, topk = 10, print_result= True)

print ('top20 result:')
get_2_metric(df, topk = 20, print_result= True)

print ('top50 result:')
get_2_metric(df, topk = 50, print_result= True)

top1 result:
avg_length:  (19.034609094943203, 0.33549197179709683)
suc_rate:  (0.022188999272332605, 0.012952897978773284)
top5 result:
avg_length:  (18.52658623330192, 0.38175761171278694)
suc_rate:  (0.06590186454221543, 0.0256616527244365)
top10 result:
avg_length:  (18.103375154257506, 0.44793462922060046)
suc_rate:  (0.090069070792755, 0.031621431885388136)
top20 result:
avg_length:  (17.36721675455035, 0.5360957700454426)
suc_rate:  (0.14679590824456665, 0.035741131818330704)
top50 result:
avg_length:  (15.821626805474844, 0.7154176801839158)
suc_rate:  (0.25878101731429903, 0.052692963315011165)


In [7]:
## top 100
get_best_metric_with_single_lambda(data_path = "", 
                                   lambs = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 50, 70, 90, 100, 200, 500, 1000, 10000, 100000],
                                   dataset_name = "yelp", 
                                   keyphrase_selection_method = "random",
                                   topk = 20, 
                                   return_all = False)

(0.1,
 (17.74606602943935, 0.4742818250689491),
 (0.11851021570448815, 0.03132887341800652))

In [8]:
table_path = "../tables/tuning_lambda/yelp/tuning_yelp_at_lamb_0.1_with_random.csv"
df = load_dataframe_csv(table_path,"")   
# ranksvm3 random 
print ("top1 result:")
get_2_metric(df, topk = 1, print_result= True)

print ("top5 result:")
get_2_metric(df, topk = 5, print_result= True)

print ("top10 result:")
get_2_metric(df, topk = 10, print_result= True)

print ('top20 result:')
get_2_metric(df, topk = 20, print_result= True)

print ('top50 result:')
get_2_metric(df, topk = 50, print_result= True)

top1 result:
avg_length:  (19.160659241190714, 0.3416172378009567)
suc_rate:  (0.010282946423297301, 0.00928726298792895)
top5 result:
avg_length:  (18.711211581962353, 0.34252710474322823)
suc_rate:  (0.05088513744215499, 0.019464453019866164)
top10 result:
avg_length:  (18.216013514778993, 0.39560407979512496)
suc_rate:  (0.08492479513377348, 0.025993019241378326)
top20 result:
avg_length:  (17.74606602943935, 0.4742818250689491)
suc_rate:  (0.11851021570448815, 0.03132887341800652)
top50 result:
avg_length:  (16.612341300961013, 0.6677124645629706)
suc_rate:  (0.19409974733633142, 0.04419279273860129)


In [7]:
## top 50
get_best_metric_with_single_lambda(data_path = "", 
                                   lambs = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 50, 70, 90, 100, 200, 500, 1000, 10000, 100000],
                                   dataset_name = "yelp", 
                                   keyphrase_selection_method = "random",
                                   topk = 20, 
                                   return_all = False)

(10,
 (17.93324644749051, 0.44290528302809007),
 (0.12766005370236533, 0.03531259989093568))

In [140]:
get_best_metric_with_single_lambda(data_path = "", 
                                   lambs = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 50, 70, 90, 100, 200, 500, 1000, 10000, 100000],
                                   dataset_name = "yelp", 
                                   keyphrase_selection_method = "random",
                                   topk = 20, 
                                   return_all = False,
                                   top_affected=50)

(10,
 (17.93324644749051, 0.44290528302809007),
 (0.12766005370236533, 0.03531259989093568))

In [136]:
# top 10
table_path = "../tables/tuning_lambda/topk_10/yelp/tuning_yelp_at_lamb_70_with_random.csv"
df = load_dataframe_csv(table_path,"")   

latex_table_row(df, table_name = "success_rate", value_precision = 4, uncertainty_precision = 3)
latex_table_row(df, table_name = "avg_length", value_precision = 3, uncertainty_precision = 3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  0.0226$\pm$0.012 &  0.0764$\pm$0.027 &  0.1097$\pm$0.033 &  0.1623$\pm$0.044 &  0.2587$\pm$0.046 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &               10 &                20 &                50 \\
\midrule
0 &  18.983$\pm$0.342 &  18.416$\pm$0.413 &  17.998$\pm$0.45 &  17.336$\pm$0.566 &  15.944$\pm$0.681 \\
\bottomrule
\end{tabular}



In [141]:
# top 20
table_path = "../tables/tuning_lambda/topk_20/yelp/tuning_yelp_at_lamb_100000_with_random.csv"
df = load_dataframe_csv(table_path,"")   

latex_table_row(df, table_name = "success_rate", value_precision = 4, uncertainty_precision = 3)
latex_table_row(df, table_name = "avg_length", value_precision = 3, uncertainty_precision = 3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  0.0223$\pm$0.014 &  0.0501$\pm$0.022 &  0.0986$\pm$0.033 &  0.1622$\pm$0.041 &  0.2545$\pm$0.054 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &               50 \\
\midrule
0 &  19.089$\pm$0.342 &  18.696$\pm$0.366 &  18.071$\pm$0.452 &  17.389$\pm$0.499 &  15.93$\pm$0.755 \\
\bottomrule
\end{tabular}



In [142]:
# top 50
table_path = "../tables/tuning_lambda/topk_50/yelp/tuning_yelp_at_lamb_10_with_random.csv"
df = load_dataframe_csv(table_path,"")   

latex_table_row(df, table_name = "success_rate", value_precision = 4, uncertainty_precision = 3)
latex_table_row(df, table_name = "avg_length", value_precision = 3, uncertainty_precision = 3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  0.0293$\pm$0.015 &  0.0598$\pm$0.024 &  0.0832$\pm$0.028 &  0.1277$\pm$0.035 &  0.2119$\pm$0.043 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &               10 &                20 &                50 \\
\midrule
0 &  19.013$\pm$0.334 &  18.741$\pm$0.345 &  18.31$\pm$0.398 &  17.933$\pm$0.443 &  16.818$\pm$0.547 \\
\bottomrule
\end{tabular}



In [143]:
# top 100
table_path = "../tables/tuning_lambda/topk_100/yelp/tuning_yelp_at_lamb_0.1_with_random.csv"
df = load_dataframe_csv(table_path,"")   

latex_table_row(df, table_name = "success_rate", value_precision = 4, uncertainty_precision = 3)
latex_table_row(df, table_name = "avg_length", value_precision = 3, uncertainty_precision = 3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  0.0103$\pm$0.009 &  0.0509$\pm$0.019 &  0.0849$\pm$0.026 &  0.1185$\pm$0.031 &  0.1941$\pm$0.044 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  19.161$\pm$0.342 &  18.711$\pm$0.343 &  18.216$\pm$0.396 &  17.746$\pm$0.474 &  16.612$\pm$0.668 \\
\bottomrule
\end{tabular}



## Diff

### Avg

In [21]:
# average diff
table_path = "../tables/reproducing/yelp/average_50_users_diff.csv"
df = load_dataframe_csv(table_path,"")   

# avg diff 
print ("top1 result:")
get_2_metric(df, topk = 1, print_result= True)

print ("top5 result:")
get_2_metric(df, topk = 5, print_result= True)

print ("top10 result:")
get_2_metric(df, topk = 10, print_result= True)

print ('top20 result:')
get_2_metric(df, topk = 20, print_result= True)

print ('top50 result:')
get_2_metric(df, topk = 50, print_result= True)

top1 result:
avg_length:  (18.984194443841186, 0.30060481206045847)
suc_rate:  (0.07804148017669353, 0.023076297560144423)
top5 result:
avg_length:  (17.656210925512397, 0.5623516044007251)
suc_rate:  (0.15401386399081243, 0.03873249194652888)
top10 result:
avg_length:  (16.905572819572853, 0.6508707049449296)
suc_rate:  (0.19554000976887637, 0.04234699877460717)
top20 result:
avg_length:  (15.944536041387906, 0.7545039677122095)
suc_rate:  (0.2516436886986423, 0.04341759823817416)
top50 result:
avg_length:  (13.867099421980633, 0.9210663061370019)
suc_rate:  (0.3887600489540521, 0.05534350943383171)


In [154]:
# average diff
table_path = "../tables/reproducing/yelp/average_50_users_diff.csv"
df = load_dataframe_csv(table_path,"")   

latex_table_row(df, table_name = "success_rate", value_precision = 4, uncertainty_precision = 3)
latex_table_row(df, table_name = "avg_length", value_precision = 3, uncertainty_precision = 3)

\begin{tabular}{llllll}
\toprule
{} &               1  &               5  &                10 &                20 &                50 \\
\midrule
0 &  0.078$\pm$0.023 &  0.154$\pm$0.039 &  0.1955$\pm$0.042 &  0.2516$\pm$0.043 &  0.3888$\pm$0.055 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  18.984$\pm$0.301 &  17.656$\pm$0.562 &  16.906$\pm$0.651 &  15.945$\pm$0.755 &  13.867$\pm$0.921 \\
\bottomrule
\end{tabular}



### Rating

In [261]:
table_path = "../tables/reproducing/yelp/lp1simplified_top100_50users_diff.csv"
df = load_dataframe_csv(table_path,"")  

# rating diff 
print ("top1 result:")
get_2_metric(df, topk = 1, print_result= True)

print ("top5 result:")
get_2_metric(df, topk = 5, print_result= True)

print ("top10 result:")
get_2_metric(df, topk = 10, print_result= True)

print ('top20 result:')
get_2_metric(df, topk = 20, print_result= True)

print ('top50 result:')
get_2_metric(df, topk = 50, print_result= True)

top1 result:
avg_length:  (19.69089649072865, 0.18087670240336048)
suc_rate:  (0.023072894065372863, 0.013403120863550031)
top5 result:
avg_length:  (19.326414132720082, 0.3118213441873929)
suc_rate:  (0.04504713057765285, 0.020165883247737514)
top10 result:
avg_length:  (18.869447096133477, 0.4238667770155522)
suc_rate:  (0.08097138144755589, 0.02678793192479579)
top20 result:
avg_length:  (18.145935761808524, 0.5495414647468356)
suc_rate:  (0.11822160990622423, 0.034817509020406645)
top50 result:
avg_length:  (17.053157619589847, 0.687032093938541)
suc_rate:  (0.1754022981737008, 0.03968158044397756)


In [155]:
table_path = "../tables/reproducing/yelp/lp1simplified_top100_50users_diff.csv"
df = load_dataframe_csv(table_path,"")  

latex_table_row(df, table_name = "success_rate", value_precision = 4, uncertainty_precision = 3)
latex_table_row(df, table_name = "avg_length", value_precision = 3, uncertainty_precision = 3)

\begin{tabular}{llllll}
\toprule
{} &                1  &              5  &               10 &                20 &               50 \\
\midrule
0 &  0.0231$\pm$0.013 &  0.045$\pm$0.02 &  0.081$\pm$0.027 &  0.1182$\pm$0.035 &  0.1754$\pm$0.04 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &               20 &                50 \\
\midrule
0 &  19.691$\pm$0.181 &  19.326$\pm$0.312 &  18.869$\pm$0.424 &  18.146$\pm$0.55 &  17.053$\pm$0.687 \\
\bottomrule
\end{tabular}



### RankSVM3

In [9]:
## top 20
get_best_metric_with_single_lambda(data_path = "", 
                                   lambs = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 50, 70, 90, 100, 200, 500, 1000, 10000, 100000],
                                   dataset_name = "yelp", 
                                   keyphrase_selection_method = "diff",
                                   topk = 20, 
                                   return_all = False)

(1,
 (15.8854973973395, 0.7586524035818217),
 (0.27935166067275036, 0.048777170782023295))

In [11]:
table_path = "../tables/tuning_lambda/yelp/tuning_yelp_at_lamb_1_with_diff.csv"
df = load_dataframe_csv(table_path,"")   
# ranksvm3 diff 
print ("top1 result:")
get_2_metric(df, topk = 1, print_result= True)

print ("top5 result:")
get_2_metric(df, topk = 5, print_result= True)

print ("top10 result:")
get_2_metric(df, topk = 10, print_result= True)

print ('top20 result:')
get_2_metric(df, topk = 20, print_result= True)

print ('top50 result:')
get_2_metric(df, topk = 50, print_result= True)

top1 result:
avg_length:  (18.849091372005372, 0.351516857654142)
suc_rate:  (0.10413941605723809, 0.028097781907989814)
top5 result:
avg_length:  (17.599548578552735, 0.5829075988289408)
suc_rate:  (0.16126934211299535, 0.04017829502830795)
top10 result:
avg_length:  (16.89499000120256, 0.65274502563568)
suc_rate:  (0.20826815928907394, 0.042430761938265255)
top20 result:
avg_length:  (15.8854973973395, 0.7586524035818217)
suc_rate:  (0.27935166067275036, 0.048777170782023295)
top50 result:
avg_length:  (14.120772580654041, 0.8887500473478734)
suc_rate:  (0.38747810324275855, 0.05615032543929048)


In [108]:
get_best_metric_with_single_lambda(data_path = "", 
                                   lambs = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 50, 70, 90, 100, 200, 500, 1000, 10000, 100000],
                                   dataset_name = "yelp", 
                                   keyphrase_selection_method = "diff",
                                   topk = 20, 
                                   return_all = False,
                                  top_affected=10)

(1,
 (15.73286308956781, 0.7457676901025765),
 (0.27937594815707617, 0.044958993185112504))

In [10]:
table_path = "../tables/tuning_lambda/yelp/tuning_yelp_at_lamb_0.1_with_diff.csv"
df = load_dataframe_csv(table_path,"")   
# ranksvm3 diff 
print ("top1 result:")
get_2_metric(df, topk = 1, print_result= True)

print ("top5 result:")
get_2_metric(df, topk = 5, print_result= True)

print ("top10 result:")
get_2_metric(df, topk = 10, print_result= True)

print ('top20 result:')
get_2_metric(df, topk = 20, print_result= True)

print ('top50 result:')
get_2_metric(df, topk = 50, print_result= True)

top1 result:
avg_length:  (18.843504412581257, 0.3506871248384387)
suc_rate:  (0.08511449869135963, 0.024419740386181714)
top5 result:
avg_length:  (17.582124907779438, 0.5934202205454729)
suc_rate:  (0.16149050173775908, 0.04158940627209419)
top10 result:
avg_length:  (16.58167493873553, 0.6982054686562067)
suc_rate:  (0.21801127340861826, 0.045932618580687604)
top20 result:
avg_length:  (15.67059917912026, 0.7653716069626618)
suc_rate:  (0.2767144099936198, 0.04598138117907132)
top50 result:
avg_length:  (13.72779260783114, 0.9052273274898496)
suc_rate:  (0.39964310500301203, 0.05572815872981459)


In [173]:
get_best_metric_with_single_lambda(data_path = "", 
                                   lambs = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 50, 70, 90, 100, 200, 500, 1000, 10000, 100000],
                                   dataset_name = "yelp", 
                                   keyphrase_selection_method = "diff",
                                   topk = 20, 
                                   return_all = False,
                                   top_affected=20)

(1,
 (15.8854973973395, 0.7586524035818217),
 (0.27935166067275036, 0.048777170782023295))

In [158]:
# top 10
table_path = "../tables/tuning_lambda/topk_10/yelp/tuning_yelp_at_lamb_1_with_diff.csv"
df = load_dataframe_csv(table_path,"")   

latex_table_row(df, table_name = "success_rate", value_precision = 4, uncertainty_precision = 3)
latex_table_row(df, table_name = "avg_length", value_precision = 3, uncertainty_precision = 3)

\begin{tabular}{llllll}
\toprule
{} &               1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  0.1093$\pm$0.03 &  0.1716$\pm$0.038 &  0.2232$\pm$0.044 &  0.2794$\pm$0.045 &  0.4032$\pm$0.057 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &               5  &                10 &                20 &                50 \\
\midrule
0 &  18.806$\pm$0.354 &  17.452$\pm$0.59 &  16.688$\pm$0.657 &  15.733$\pm$0.746 &  13.743$\pm$0.952 \\
\bottomrule
\end{tabular}



In [174]:
# top 20
table_path = "../tables/tuning_lambda/topk_20/yelp/tuning_yelp_at_lamb_1_with_diff.csv"
df = load_dataframe_csv(table_path,"")   

latex_table_row(df, table_name = "success_rate", value_precision = 4, uncertainty_precision = 3)
latex_table_row(df, table_name = "avg_length", value_precision = 3, uncertainty_precision = 3)

\begin{tabular}{llllll}
\toprule
{} &                1  &               5  &                10 &                20 &                50 \\
\midrule
0 &  0.1041$\pm$0.028 &  0.1613$\pm$0.04 &  0.2083$\pm$0.042 &  0.2794$\pm$0.049 &  0.3875$\pm$0.056 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &              5  &                10 &                20 &                50 \\
\midrule
0 &  18.849$\pm$0.352 &  17.6$\pm$0.583 &  16.895$\pm$0.653 &  15.885$\pm$0.759 &  14.121$\pm$0.889 \\
\bottomrule
\end{tabular}



In [163]:
# top 50
table_path = "../tables/tuning_lambda/topk_50/yelp/tuning_yelp_at_lamb_0.5_with_diff.csv"
df = load_dataframe_csv(table_path,"")   

latex_table_row(df, table_name = "success_rate", value_precision = 4, uncertainty_precision = 3)
latex_table_row(df, table_name = "avg_length", value_precision = 3, uncertainty_precision = 3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &              10 &                20 &                50 \\
\midrule
0 &  0.0896$\pm$0.026 &  0.1616$\pm$0.039 &  0.21$\pm$0.045 &  0.2706$\pm$0.044 &  0.4081$\pm$0.059 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &               10 &                20 &                50 \\
\midrule
0 &  18.804$\pm$0.345 &  17.572$\pm$0.576 &  16.68$\pm$0.697 &  15.708$\pm$0.763 &  13.536$\pm$0.994 \\
\bottomrule
\end{tabular}



In [164]:
# top 100
table_path = "../tables/tuning_lambda/topk_100/yelp/tuning_yelp_at_lamb_0.1_with_diff.csv"
df = load_dataframe_csv(table_path,"")   

latex_table_row(df, table_name = "success_rate", value_precision = 4, uncertainty_precision = 3)
latex_table_row(df, table_name = "avg_length", value_precision = 3, uncertainty_precision = 3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &               10 &                20 &                50 \\
\midrule
0 &  0.0851$\pm$0.024 &  0.1615$\pm$0.042 &  0.218$\pm$0.046 &  0.2767$\pm$0.046 &  0.3996$\pm$0.056 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  18.844$\pm$0.351 &  17.582$\pm$0.593 &  16.582$\pm$0.698 &  15.671$\pm$0.765 &  13.728$\pm$0.905 \\
\bottomrule
\end{tabular}



# Beer

## Random

### Avg

In [139]:
table_path = "../tables/reproducing/beer/avg_50users_random.csv"
df = load_dataframe_csv(table_path,"") 

latex_table_row(df, table_name = "success_rate", value_precision = 4, uncertainty_precision = 3)
latex_table_row(df, table_name = "avg_length", value_precision = 3, uncertainty_precision = 3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &               50 \\
\midrule
0 &  0.0199$\pm$0.023 &  0.0449$\pm$0.032 &  0.0627$\pm$0.034 &  0.1217$\pm$0.047 &  0.1796$\pm$0.06 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  19.757$\pm$0.278 &  19.542$\pm$0.331 &  19.129$\pm$0.496 &  18.462$\pm$0.579 &  17.545$\pm$0.775 \\
\bottomrule
\end{tabular}



In [259]:
# average random (beer)
table_path = "../tables/reproducing/beer/avg_50users_random.csv"
df = load_dataframe_csv(table_path, "")

# For each target-rank cutoff, get_2_metric prints the average length and the
# success rate, each as (mean, 1.96 * std / sqrt(n)).
for heading, cutoff in (
    ("top1 result:", 1),
    ("top5 result:", 5),
    ("top10 result:", 10),
    ("top20 result:", 20),
    ("top50 result:", 50),
):
    print(heading)
    get_2_metric(df, topk=cutoff, print_result=True)

top1 result:
avg_length:  (19.757354497354495, 0.2779927766784297)
suc_rate:  (0.019944755680049796, 0.02258021880252023)
top5 result:
avg_length:  (19.5425, 0.3305248261097751)
suc_rate:  (0.04490948231144309, 0.032302378695209445)
top10 result:
avg_length:  (19.12864197530864, 0.49555452117671855)
suc_rate:  (0.0627489884842826, 0.03415633080480409)
top20 result:
avg_length:  (18.462072491778372, 0.5792931262511614)
suc_rate:  (0.12167948065006888, 0.04664426913400822)
top50 result:
avg_length:  (17.54504050234442, 0.7749913409029133)
suc_rate:  (0.17959713670497984, 0.059711756932172046)


### Rating

In [144]:
# Beer, random selection, "rating" run (lp1simplified_top100) from the reproducing tables.
table_path = "../tables/reproducing/beer/lp1simplified_top100_50users_random.csv"
df = load_dataframe_csv(table_path, "")  # whole path goes in `path`; `name` is left empty

# One latex_table_row call per metric; only the value precision differs between the two.
for metric_name, value_digits in (("success_rate", 4), ("avg_length", 3)):
    latex_table_row(df, table_name=metric_name, value_precision=value_digits, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &               20 &                50 \\
\midrule
0 &  0.0226$\pm$0.021 &  0.0664$\pm$0.052 &  0.0839$\pm$0.055 &  0.127$\pm$0.062 &  0.1724$\pm$0.063 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &               1  &                5  &                10 &                20 &               50 \\
\midrule
0 &  19.75$\pm$0.291 &  18.817$\pm$0.904 &  18.415$\pm$1.043 &  17.747$\pm$1.148 &  16.975$\pm$1.18 \\
\bottomrule
\end{tabular}



In [238]:
# rating random (beer)
table_path = "../tables/reproducing/beer/lp1simplified_top100_50users_random.csv"
df = load_dataframe_csv(table_path, "")

# For each target-rank cutoff, get_2_metric prints the average length and the
# success rate, each as (mean, 1.96 * std / sqrt(n)).
for heading, cutoff in (
    ("top1 result:", 1),
    ("top5 result:", 5),
    ("top10 result:", 10),
    ("top20 result:", 20),
    ("top50 result:", 50),
):
    print(heading)
    get_2_metric(df, topk=cutoff, print_result=True)

top1 result:
avg_length:  (19.74962962962963, 0.29107414958393807)
suc_rate:  (0.022623456790123456, 0.020989635923557414)
top5 result:
avg_length:  (18.816529982363313, 0.9036814563363051)
suc_rate:  (0.06635361552028218, 0.05151346941071546)
top10 result:
avg_length:  (18.414528218694883, 1.0432722209167709)
suc_rate:  (0.08388447971781306, 0.054973102609672884)
top20 result:
avg_length:  (17.746574074074076, 1.1476013615327938)
suc_rate:  (0.1270356803690137, 0.0616629059833558)
top50 result:
avg_length:  (16.975411968206085, 1.1800766837586478)
suc_rate:  (0.1723705180077729, 0.0629333496347026)


### RankSVM3

In [242]:
# Evaluate every lambda in the grid for beer / random selection at the top-20 cutoff.
# This call is the cell's last expression, so the notebook echoes its return value.
get_best_metric_with_single_lambda(
    lambs=[
        0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30,
        50, 70, 90, 100, 200, 500, 1000, 10000, 100000,
    ],
    data_path="",
    dataset_name="beer",
    keyphrase_selection_method="random",
    topk=20,
    return_all=False,
)

(0.01,
 (17.770470787382553, 1.0594595506005924),
 (0.17312225719088464, 0.07581358661361602))

In [240]:
# ranksvm3 random (beer, lambda = 0.01)
table_path = "../tables/tuning_lambda/beer/tuning_beer_at_lamb_0.01_with_random.csv"
df = load_dataframe_csv(table_path, "")

# For each target-rank cutoff, get_2_metric prints the average length and the
# success rate, each as (mean, 1.96 * std / sqrt(n)).
for heading, cutoff in (
    ("top1 result:", 1),
    ("top5 result:", 5),
    ("top10 result:", 10),
    ("top20 result:", 20),
    ("top50 result:", 50),
):
    print(heading)
    get_2_metric(df, topk=cutoff, print_result=True)

top1 result:
avg_length:  (19.92509259259259, 0.09356016847696566)
suc_rate:  (0.00558641975308642, 0.006657888556071205)
top5 result:
avg_length:  (19.250091451758117, 0.7464982051921484)
suc_rate:  (0.06044437044437044, 0.04870071177690609)
top10 result:
avg_length:  (18.499294686794688, 1.007070361355241)
suc_rate:  (0.10716259049592382, 0.06418012749394501)
top20 result:
avg_length:  (17.770470787382553, 1.0594595506005924)
suc_rate:  (0.17312225719088464, 0.07581358661361602)
top50 result:
avg_length:  (15.926921571529418, 1.3104969760739227)
suc_rate:  (0.27958704512626076, 0.08501716689840598)


In [19]:
# Top 100
# NOTE(review): labelled "Top 100", yet top_affected is not passed and keeps its
# default — confirm which tables this run actually read.
# Evaluate every lambda in the grid for beer / random selection at the top-20 cutoff.
get_best_metric_with_single_lambda(
    lambs=[
        0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30,
        50, 70, 90, 100, 200, 500, 1000, 10000, 100000,
    ],
    data_path="",
    dataset_name="beer",
    keyphrase_selection_method="random",
    topk=20,
    return_all=False,
)

(90,
 (18.067917295123173, 0.7857402024453743),
 (0.15895253802116543, 0.06744632683682779))

In [12]:
# ranksvm3 random (beer, lambda = 90)
table_path = "../tables/tuning_lambda/beer/tuning_beer_at_lamb_90_with_random.csv"
df = load_dataframe_csv(table_path, "")

# For each target-rank cutoff, get_2_metric prints the average length and the
# success rate, each as (mean, 1.96 * std / sqrt(n)).
for heading, cutoff in (
    ("top1 result:", 1),
    ("top5 result:", 5),
    ("top10 result:", 10),
    ("top20 result:", 20),
    ("top50 result:", 50),
):
    print(heading)
    get_2_metric(df, topk=cutoff, print_result=True)

top1 result:
avg_length:  (19.854256854256853, 0.23845579617408874)
suc_rate:  (0.01653038319704986, 0.02234804478743511)
top5 result:
avg_length:  (19.71609948693282, 0.2806533031737103)
suc_rate:  (0.026636203302869973, 0.023767272842814605)
top10 result:
avg_length:  (19.415035300574516, 0.35420278491294976)
suc_rate:  (0.0514120538140146, 0.029034846490640264)
top20 result:
avg_length:  (18.067917295123173, 0.7857402024453743)
suc_rate:  (0.15895253802116543, 0.06744632683682779)
top50 result:
avg_length:  (15.89804251485624, 1.167381071237386)
suc_rate:  (0.2958496006535222, 0.08830123898824303)


In [151]:
# Evaluate every lambda in the grid for beer / random selection at the top-20 cutoff,
# this time with top_affected=100 passed explicitly.
get_best_metric_with_single_lambda(
    lambs=[
        0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30,
        50, 70, 90, 100, 200, 500, 1000, 10000, 100000,
    ],
    data_path="",
    dataset_name="beer",
    keyphrase_selection_method="random",
    topk=20,
    top_affected=100,
    return_all=False,
)

(90,
 (18.067917295123173, 0.7857402024453743),
 (0.15895253802116543, 0.06744632683682779))

In [153]:
# top 10: beer, random selection, lambda = 0.05, read from the topk_10 tuning directory.
table_path = "../tables/tuning_lambda/topk_10/beer/tuning_beer_at_lamb_0.05_with_random.csv"
df = load_dataframe_csv(table_path, "")  # whole path goes in `path`; `name` is left empty

# One latex_table_row call per metric; only the value precision differs between the two.
for metric_name, value_digits in (("success_rate", 4), ("avg_length", 3)):
    latex_table_row(df, table_name=metric_name, value_precision=value_digits, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &               10 &                20 &               50 \\
\midrule
0 &  0.0146$\pm$0.022 &  0.0685$\pm$0.051 &  0.127$\pm$0.076 &  0.1953$\pm$0.083 &  0.243$\pm$0.084 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  19.768$\pm$0.349 &  19.012$\pm$0.689 &  18.657$\pm$0.768 &  17.277$\pm$1.215 &  16.328$\pm$1.232 \\
\bottomrule
\end{tabular}



In [148]:
# top 20: beer, random selection, lambda = 70, read from the topk_20 tuning directory.
table_path = "../tables/tuning_lambda/topk_20/beer/tuning_beer_at_lamb_70_with_random.csv"
df = load_dataframe_csv(table_path, "")  # whole path goes in `path`; `name` is left empty

# One latex_table_row call per metric; only the value precision differs between the two.
for metric_name, value_digits in (("success_rate", 4), ("avg_length", 3)):
    latex_table_row(df, table_name=metric_name, value_precision=value_digits, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &               10 &                20 &                50 \\
\midrule
0 &  0.0143$\pm$0.022 &  0.0692$\pm$0.052 &  0.102$\pm$0.059 &  0.1928$\pm$0.089 &  0.2665$\pm$0.087 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  19.768$\pm$0.358 &  19.115$\pm$0.645 &  18.672$\pm$0.768 &  17.359$\pm$1.277 &  15.987$\pm$1.339 \\
\bottomrule
\end{tabular}



In [150]:
# top 50: beer, random selection, lambda = 90, read from the topk_50 tuning directory.
table_path = "../tables/tuning_lambda/topk_50/beer/tuning_beer_at_lamb_90_with_random.csv"
df = load_dataframe_csv(table_path, "")  # whole path goes in `path`; `name` is left empty

# One latex_table_row call per metric; only the value precision differs between the two.
for metric_name, value_digits in (("success_rate", 4), ("avg_length", 3)):
    latex_table_row(df, table_name=metric_name, value_precision=value_digits, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  0.0151$\pm$0.022 &  0.0516$\pm$0.038 &  0.0864$\pm$0.055 &  0.1861$\pm$0.082 &  0.3138$\pm$0.098 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  19.818$\pm$0.284 &  19.294$\pm$0.549 &  18.884$\pm$0.736 &  17.835$\pm$0.971 &  16.015$\pm$1.228 \\
\bottomrule
\end{tabular}



In [152]:
# top 100: beer, random selection, lambda = 90, read from the topk_100 tuning directory.
table_path = "../tables/tuning_lambda/topk_100/beer/tuning_beer_at_lamb_90_with_random.csv"
df = load_dataframe_csv(table_path, "")  # whole path goes in `path`; `name` is left empty

# One latex_table_row call per metric; only the value precision differs between the two.
for metric_name, value_digits in (("success_rate", 4), ("avg_length", 3)):
    latex_table_row(df, table_name=metric_name, value_precision=value_digits, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &               20 &                50 \\
\midrule
0 &  0.0165$\pm$0.022 &  0.0266$\pm$0.024 &  0.0514$\pm$0.029 &  0.159$\pm$0.067 &  0.2958$\pm$0.088 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &                20 &                50 \\
\midrule
0 &  19.854$\pm$0.238 &  19.716$\pm$0.281 &  19.415$\pm$0.354 &  18.068$\pm$0.786 &  15.898$\pm$1.167 \\
\bottomrule
\end{tabular}



## Diff

### Avg

In [22]:
# average diff (beer)
# The original second comment read "avg random", but the file loaded is the diff run.
table_path = "../tables/reproducing/beer/avg_50users_diff.csv"
df = load_dataframe_csv(table_path, "")

# For each target-rank cutoff, get_2_metric prints the average length and the
# success rate, each as (mean, 1.96 * std / sqrt(n)).
for heading, cutoff in (
    ("top1 result:", 1),
    ("top5 result:", 5),
    ("top10 result:", 10),
    ("top20 result:", 20),
    ("top50 result:", 50),
):
    print(heading)
    get_2_metric(df, topk=cutoff, print_result=True)

top1 result:
avg_length:  (19.00496204412871, 0.7102924877807976)
suc_rate:  (0.08116280596672754, 0.06397039354138734)
top5 result:
avg_length:  (17.597803461970127, 1.1630273528838047)
suc_rate:  (0.15395452949374516, 0.07644532943557877)
top10 result:
avg_length:  (16.913637504470838, 1.3778430179926433)
suc_rate:  (0.1794888408613899, 0.07794923950722438)
top20 result:
avg_length:  (15.759583361446104, 1.5417415098045595)
suc_rate:  (0.23833228137149703, 0.08512265198286892)
top50 result:
avg_length:  (13.755153258143455, 1.6641026938409798)
suc_rate:  (0.35802075012859325, 0.09329036252246106)


In [165]:
# Beer, diff selection, "avg" run from the reproducing tables.
table_path = "../tables/reproducing/beer/avg_50users_diff.csv"
df = load_dataframe_csv(table_path, "")  # whole path goes in `path`; `name` is left empty

# One latex_table_row call per metric; only the value precision differs between the two.
for metric_name, value_digits in (("success_rate", 4), ("avg_length", 3)):
    latex_table_row(df, table_name=metric_name, value_precision=value_digits, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &                1  &               5  &                10 &                20 &               50 \\
\midrule
0 &  0.0812$\pm$0.064 &  0.154$\pm$0.076 &  0.1795$\pm$0.078 &  0.2383$\pm$0.085 &  0.358$\pm$0.093 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &               1  &                5  &                10 &               20 &                50 \\
\midrule
0 &  19.005$\pm$0.71 &  17.598$\pm$1.163 &  16.914$\pm$1.378 &  15.76$\pm$1.542 &  13.755$\pm$1.664 \\
\bottomrule
\end{tabular}



### Rating

In [262]:
# rating diff (beer)
table_path = "../tables/reproducing/beer/lp1simplified_top100_50users_diff.csv"
df = load_dataframe_csv(table_path, "")

# For each target-rank cutoff, get_2_metric prints the average length and the
# success rate, each as (mean, 1.96 * std / sqrt(n)).
for heading, cutoff in (
    ("top1 result:", 1),
    ("top5 result:", 5),
    ("top10 result:", 10),
    ("top20 result:", 20),
    ("top50 result:", 50),
):
    print(heading)
    get_2_metric(df, topk=cutoff, print_result=True)

top1 result:
avg_length:  (19.784197530864194, 0.2891664582151433)
suc_rate:  (0.011358024691358024, 0.01521928727448122)
top5 result:
avg_length:  (18.845101410934745, 0.6689630742150717)
suc_rate:  (0.08013227513227514, 0.05432908711783081)
top10 result:
avg_length:  (18.30531746031746, 1.0395149608590142)
suc_rate:  (0.09081128747795415, 0.0548829961707191)
top20 result:
avg_length:  (17.59619488536155, 1.1588406332651495)
suc_rate:  (0.1290299823633157, 0.0614884697126393)
top50 result:
avg_length:  (16.925220717916797, 1.1975878514748912)
suc_rate:  (0.16381600788463535, 0.06354448635932486)


In [166]:
# Beer, diff selection, "rating" run (lp1simplified_top100) from the reproducing tables.
table_path = "../tables/reproducing/beer/lp1simplified_top100_50users_diff.csv"
df = load_dataframe_csv(table_path, "")  # whole path goes in `path`; `name` is left empty

# One latex_table_row call per metric; only the value precision differs between the two.
for metric_name, value_digits in (("success_rate", 4), ("avg_length", 3)):
    latex_table_row(df, table_name=metric_name, value_precision=value_digits, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &                10 &               20 &                50 \\
\midrule
0 &  0.0114$\pm$0.015 &  0.0801$\pm$0.054 &  0.0908$\pm$0.055 &  0.129$\pm$0.061 &  0.1638$\pm$0.064 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &                1  &                5  &               10 &                20 &                50 \\
\midrule
0 &  19.784$\pm$0.289 &  18.845$\pm$0.669 &  18.305$\pm$1.04 &  17.596$\pm$1.159 &  16.925$\pm$1.198 \\
\bottomrule
\end{tabular}



### RankSVM3

In [20]:
# top 20
# Evaluate every lambda in the grid for beer / diff selection at the top-20 cutoff.
# This call is the cell's last expression, so the notebook echoes its return value.
get_best_metric_with_single_lambda(
    lambs=[
        0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30,
        50, 70, 90, 100, 200, 500, 1000, 10000, 100000,
    ],
    data_path="",
    dataset_name="beer",
    keyphrase_selection_method="diff",
    topk=20,
    return_all=False,
)

(0.0001,
 (15.759583361446104, 1.5417415098045595),
 (0.23833228137149703, 0.08512265198286892))

In [246]:
# ranksvm3 diff (beer, lambda = 0.0001)
table_path = "../tables/tuning_lambda/beer/tuning_beer_at_lamb_0.0001_with_diff.csv"
df = load_dataframe_csv(table_path, "")

# For each target-rank cutoff, get_2_metric prints the average length and the
# success rate, each as (mean, 1.96 * std / sqrt(n)).
for heading, cutoff in (
    ("top1 result:", 1),
    ("top5 result:", 5),
    ("top10 result:", 10),
    ("top20 result:", 20),
    ("top50 result:", 50),
):
    print(heading)
    get_2_metric(df, topk=cutoff, print_result=True)

top1 result:
avg_length:  (19.00496204412871, 0.7102924877807976)
suc_rate:  (0.08116280596672754, 0.06397039354138734)
top5 result:
avg_length:  (17.597803461970127, 1.1630273528838047)
suc_rate:  (0.15395452949374516, 0.07644532943557877)
top10 result:
avg_length:  (16.913637504470838, 1.3778430179926433)
suc_rate:  (0.1794888408613899, 0.07794923950722438)
top20 result:
avg_length:  (15.759583361446104, 1.5417415098045595)
suc_rate:  (0.23833228137149703, 0.08512265198286892)
top50 result:
avg_length:  (13.755153258143455, 1.6641026938409798)
suc_rate:  (0.35802075012859325, 0.09329036252246106)


In [13]:
# Top 100
# NOTE(review): labelled "Top 100", yet top_affected is not passed and keeps its
# default — confirm which tables this run actually read.
# Evaluate every lambda in the grid for beer / diff selection at the top-20 cutoff.
get_best_metric_with_single_lambda(
    lambs=[
        0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30,
        50, 70, 90, 100, 200, 500, 1000, 10000, 100000,
    ],
    data_path="",
    dataset_name="beer",
    keyphrase_selection_method="diff",
    topk=20,
    return_all=False,
)

(0.0001,
 (15.759583361446104, 1.5417415098045595),
 (0.23833228137149703, 0.08512265198286892))

In [14]:
# ranksvm3 diff (beer, lambda = 0.0001)
table_path = "../tables/tuning_lambda/beer/tuning_beer_at_lamb_0.0001_with_diff.csv"
df = load_dataframe_csv(table_path, "")

# For each target-rank cutoff, get_2_metric prints the average length and the
# success rate, each as (mean, 1.96 * std / sqrt(n)).
for heading, cutoff in (
    ("top1 result:", 1),
    ("top5 result:", 5),
    ("top10 result:", 10),
    ("top20 result:", 20),
    ("top50 result:", 50),
):
    print(heading)
    get_2_metric(df, topk=cutoff, print_result=True)

top1 result:
avg_length:  (19.00496204412871, 0.7102924877807976)
suc_rate:  (0.08116280596672754, 0.06397039354138734)
top5 result:
avg_length:  (17.597803461970127, 1.1630273528838047)
suc_rate:  (0.15395452949374516, 0.07644532943557877)
top10 result:
avg_length:  (16.913637504470838, 1.3778430179926433)
suc_rate:  (0.1794888408613899, 0.07794923950722438)
top20 result:
avg_length:  (15.759583361446104, 1.5417415098045595)
suc_rate:  (0.23833228137149703, 0.08512265198286892)
top50 result:
avg_length:  (13.755153258143455, 1.6641026938409798)
suc_rate:  (0.35802075012859325, 0.09329036252246106)


In [172]:
# Evaluate every lambda in the grid for beer / diff selection at the top-20 cutoff,
# this time with top_affected=100 passed explicitly.
get_best_metric_with_single_lambda(
    lambs=[
        0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30,
        50, 70, 90, 100, 200, 500, 1000, 10000, 100000,
    ],
    data_path="",
    dataset_name="beer",
    keyphrase_selection_method="diff",
    topk=20,
    top_affected=100,
    return_all=False,
)

(0.0001,
 (15.759583361446104, 1.5417415098045595),
 (0.23833228137149703, 0.08512265198286892))

In [168]:
# Beer, diff selection, lambda = 0.0001, read from the topk_10 tuning directory.
table_path = "../tables/tuning_lambda/topk_10/beer/tuning_beer_at_lamb_0.0001_with_diff.csv"
df = load_dataframe_csv(table_path, "")  # whole path goes in `path`; `name` is left empty

# One latex_table_row call per metric; only the value precision differs between the two.
for metric_name, value_digits in (("success_rate", 4), ("avg_length", 3)):
    latex_table_row(df, table_name=metric_name, value_precision=value_digits, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &                1  &               5  &                10 &                20 &               50 \\
\midrule
0 &  0.0812$\pm$0.064 &  0.154$\pm$0.076 &  0.1795$\pm$0.078 &  0.2383$\pm$0.085 &  0.358$\pm$0.093 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &               1  &                5  &                10 &               20 &                50 \\
\midrule
0 &  19.005$\pm$0.71 &  17.598$\pm$1.163 &  16.914$\pm$1.378 &  15.76$\pm$1.542 &  13.755$\pm$1.664 \\
\bottomrule
\end{tabular}



In [170]:
# Beer, diff selection, lambda = 0.0001, read from the topk_20 tuning directory.
table_path = "../tables/tuning_lambda/topk_20/beer/tuning_beer_at_lamb_0.0001_with_diff.csv"
df = load_dataframe_csv(table_path, "")  # whole path goes in `path`; `name` is left empty

# One latex_table_row call per metric; only the value precision differs between the two.
for metric_name, value_digits in (("success_rate", 4), ("avg_length", 3)):
    latex_table_row(df, table_name=metric_name, value_precision=value_digits, uncertainty_precision=3)

\begin{tabular}{llllll}
\toprule
{} &                1  &               5  &                10 &                20 &               50 \\
\midrule
0 &  0.0812$\pm$0.064 &  0.154$\pm$0.076 &  0.1795$\pm$0.078 &  0.2383$\pm$0.085 &  0.358$\pm$0.093 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &               1  &                5  &                10 &               20 &                50 \\
\midrule
0 &  19.005$\pm$0.71 &  17.598$\pm$1.163 &  16.914$\pm$1.378 &  15.76$\pm$1.542 &  13.755$\pm$1.664 \\
\bottomrule
\end{tabular}



In [None]:
# Beer, diff selection, lambda = 0.0001, read from the topk_50 tuning directory.
table_path = "../tables/tuning_lambda/topk_50/beer/tuning_beer_at_lamb_0.0001_with_diff.csv"
df = load_dataframe_csv(table_path, "")  # whole path goes in `path`; `name` is left empty

# One latex_table_row call per metric; only the value precision differs between the two.
for metric_name, value_digits in (("success_rate", 4), ("avg_length", 3)):
    latex_table_row(df, table_name=metric_name, value_precision=value_digits, uncertainty_precision=3)

# CD 

## Random

### Average

In [263]:
# average random (cd)
table_path = "../tables/reproducing/cd/avg_50users_random.csv"
df = load_dataframe_csv(table_path, "")

# For each target-rank cutoff, get_2_metric prints the average length and the
# success rate, each as (mean, 1.96 * std / sqrt(n)).
for heading, cutoff in (
    ("top1 result:", 1),
    ("top5 result:", 5),
    ("top10 result:", 10),
    ("top20 result:", 20),
    ("top50 result:", 50),
):
    print(heading)
    get_2_metric(df, topk=cutoff, print_result=True)

top1 result:
avg_length:  (15.330252525252524, 0.9154090905344333)
suc_rate:  (0.02, 0.02715855666268)
top5 result:
avg_length:  (14.565560217560218, 1.0099158700747741)
suc_rate:  (0.06098290598290598, 0.04143737957985024)
top10 result:
avg_length:  (13.179090132090131, 1.0861944539214539)
suc_rate:  (0.1436153846153846, 0.05652049994804877)
top20 result:
avg_length:  (11.815252525252527, 1.2129030520469177)
suc_rate:  (0.2342820512820513, 0.076645945997584)
top50 result:
avg_length:  (9.189067599067599, 1.2666736042577258)
suc_rate:  (0.43297824397824397, 0.09498761451865143)


### Rating

In [265]:
# rating random (cd)
table_path = "../tables/reproducing/cd/lp1simplified_top100_random.csv"
df = load_dataframe_csv(table_path, "")

# For each target-rank cutoff, get_2_metric prints the average length and the
# success rate, each as (mean, 1.96 * std / sqrt(n)).
for heading, cutoff in (
    ("top1 result:", 1),
    ("top5 result:", 5),
    ("top10 result:", 10),
    ("top20 result:", 20),
    ("top50 result:", 50),
):
    print(heading)
    get_2_metric(df, topk=cutoff, print_result=True)

top1 result:
avg_length:  (14.728030303030302, 0.9805020354671313)
suc_rate:  (0.04488888888888889, 0.032916648249033766)
top5 result:
avg_length:  (13.20812432012432, 1.124248430608588)
suc_rate:  (0.15496581196581197, 0.06096709454739042)
top10 result:
avg_length:  (12.257836829836831, 1.2004344612045517)
suc_rate:  (0.19645687645687646, 0.06751396921668301)
top20 result:
avg_length:  (11.25860606060606, 1.1764097751062257)
suc_rate:  (0.26455089355089356, 0.07639058068222888)
top50 result:
avg_length:  (9.500571872571873, 1.263131624214617)
suc_rate:  (0.379055167055167, 0.0906789451734067)


### RankSVM3

In [249]:
# Evaluate every lambda in the grid for cd / random selection at the top-20 cutoff.
# This call is the cell's last expression, so the notebook echoes its return value.
get_best_metric_with_single_lambda(
    lambs=[
        0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30,
        50, 70, 90, 100, 200, 500, 1000, 10000, 100000,
    ],
    data_path="",
    dataset_name="cd",
    keyphrase_selection_method="random",
    topk=20,
    return_all=False,
)

(1,
 (11.445679875679877, 1.1777676820860519),
 (0.25874358974358974, 0.0814482211780933))

In [251]:
# ranksvm3 random (cd, lambda = 1)
table_path = "../tables/tuning_lambda/cd/tuning_cd_at_lamb_1_with_random.csv"
df = load_dataframe_csv(table_path, "")

# For each target-rank cutoff, get_2_metric prints the average length and the
# success rate, each as (mean, 1.96 * std / sqrt(n)).
for heading, cutoff in (
    ("top1 result:", 1),
    ("top5 result:", 5),
    ("top10 result:", 10),
    ("top20 result:", 20),
    ("top50 result:", 50),
):
    print(heading)
    get_2_metric(df, topk=cutoff, print_result=True)

top1 result:
avg_length:  (15.51269696969697, 0.8872857281923647)
suc_rate:  (0.00044444444444444447, 0.0008623560033670605)
top5 result:
avg_length:  (14.507654234654234, 1.0675023525914091)
suc_rate:  (0.06887179487179487, 0.04190257852605433)
top10 result:
avg_length:  (12.904295260295262, 1.0185065526439159)
suc_rate:  (0.17385470085470087, 0.06176702370735052)
top20 result:
avg_length:  (11.445679875679877, 1.1777676820860519)
suc_rate:  (0.25874358974358974, 0.0814482211780933)
top50 result:
avg_length:  (9.476691530691532, 1.095483144627841)
suc_rate:  (0.4123628593628594, 0.07778894528541312)


In [15]:
# Top 100
# NOTE(review): labelled "Top 100", yet top_affected is not passed and keeps its
# default — confirm which tables this run actually read.
# Evaluate every lambda in the grid for cd / random selection at the top-20 cutoff.
get_best_metric_with_single_lambda(
    lambs=[
        0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30,
        50, 70, 90, 100, 200, 500, 1000, 10000, 100000,
    ],
    data_path="",
    dataset_name="cd",
    keyphrase_selection_method="random",
    topk=20,
    return_all=False,
)

(0.05,
 (11.773568764568765, 1.1759729184744652),
 (0.25685470085470086, 0.08114215225680596))

In [18]:
# ranksvm3 random (cd, lambda = 0.05)
table_path = "../tables/tuning_lambda/cd/tuning_cd_at_lamb_0.05_with_random.csv"
df = load_dataframe_csv(table_path, "")

# For each target-rank cutoff, get_2_metric prints the average length and the
# success rate, each as (mean, 1.96 * std / sqrt(n)).
for heading, cutoff in (
    ("top1 result:", 1),
    ("top5 result:", 5),
    ("top10 result:", 10),
    ("top20 result:", 20),
    ("top50 result:", 50),
):
    print(heading)
    get_2_metric(df, topk=cutoff, print_result=True)

top1 result:
avg_length:  (15.506919191919193, 0.886678937380012)
suc_rate:  (0.0008888888888888889, 0.001724712006734121)
top5 result:
avg_length:  (14.379252525252527, 0.9900297215182119)
suc_rate:  (0.08622222222222221, 0.05041660445273174)
top10 result:
avg_length:  (12.718671328671327, 1.13926426316782)
suc_rate:  (0.194982905982906, 0.07531617532858065)
top20 result:
avg_length:  (11.773568764568765, 1.1759729184744652)
suc_rate:  (0.25685470085470086, 0.08114215225680596)
top50 result:
avg_length:  (9.480096348096346, 1.060067319269504)
suc_rate:  (0.39652758352758355, 0.07904671649034066)


## Diff

### Average

In [266]:
# average diff (cd)
# The original comments read "average random" / "avg random", but the file
# loaded is the diff run.
table_path = "../tables/reproducing/cd/avg_50users_diff.csv"
df = load_dataframe_csv(table_path, "")

# For each target-rank cutoff, get_2_metric prints the average length and the
# success rate, each as (mean, 1.96 * std / sqrt(n)).
for heading, cutoff in (
    ("top1 result:", 1),
    ("top5 result:", 5),
    ("top10 result:", 10),
    ("top20 result:", 20),
    ("top50 result:", 50),
):
    print(heading)
    get_2_metric(df, topk=cutoff, print_result=True)

top1 result:
avg_length:  (19.80688888888889, 0.36858341191401006)
suc_rate:  (0.010444444444444445, 0.019404574377332576)
top5 result:
avg_length:  (18.88533333333333, 0.7358294917565255)
suc_rate:  (0.0782222222222222, 0.0461964521495956)
top10 result:
avg_length:  (17.133923076923075, 1.1366983255576752)
suc_rate:  (0.1738717948717949, 0.06721606111703819)
top20 result:
avg_length:  (15.454273504273504, 1.4980372058706233)
suc_rate:  (0.24950427350427348, 0.08179478911531693)
top50 result:
avg_length:  (11.965618492618495, 1.6157171906561767)
suc_rate:  (0.44497824397824404, 0.08505796618004202)


### Rating

In [267]:
# rating diff (cd)
# The original comment read "rating random", but the file loaded is the diff run.
table_path = "../tables/reproducing/cd/lp1simplified_top100_diff.csv"
df = load_dataframe_csv(table_path, "")

# For each target-rank cutoff, get_2_metric prints the average length and the
# success rate, each as (mean, 1.96 * std / sqrt(n)).
for heading, cutoff in (
    ("top1 result:", 1),
    ("top5 result:", 5),
    ("top10 result:", 10),
    ("top20 result:", 20),
    ("top50 result:", 50),
):
    print(heading)
    get_2_metric(df, topk=cutoff, print_result=True)

top1 result:
avg_length:  (19.108, 0.626048286455783)
suc_rate:  (0.05244444444444443, 0.03473969923038027)
top5 result:
avg_length:  (17.29931623931624, 1.0772688452562702)
suc_rate:  (0.15252136752136752, 0.05982914064609926)
top10 result:
avg_length:  (16.317216783216782, 1.2821504540765372)
suc_rate:  (0.20125174825174824, 0.06886181784401578)
top20 result:
avg_length:  (15.20365268065268, 1.3950310563510468)
suc_rate:  (0.2574568764568765, 0.07410571071925763)
top50 result:
avg_length:  (12.858071484071486, 1.5746680337512364)
suc_rate:  (0.39329448329448324, 0.08459444984284296)


### RankSVM3

In [255]:
# Evaluate every lambda in the grid for cd / diff selection at the top-20 cutoff.
# This call is the cell's last expression, so the notebook echoes its return value.
get_best_metric_with_single_lambda(
    lambs=[
        0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30,
        50, 70, 90, 100, 200, 500, 1000, 10000, 100000,
    ],
    data_path="",
    dataset_name="cd",
    keyphrase_selection_method="diff",
    topk=20,
    return_all=False,
)

(0.5,
 (15.701196581196582, 1.3644102903578286),
 (0.24994871794871792, 0.07898130125943605))

In [256]:
# ranksvm3 diff (cd, lambda = 0.5)
# The original comment read "ranksvm3 random", but the file loaded is the diff run.
table_path = "../tables/tuning_lambda/cd/tuning_cd_at_lamb_0.5_with_diff.csv"
df = load_dataframe_csv(table_path, "")

# For each target-rank cutoff, get_2_metric prints the average length and the
# success rate, each as (mean, 1.96 * std / sqrt(n)).
for heading, cutoff in (
    ("top1 result:", 1),
    ("top5 result:", 5),
    ("top10 result:", 10),
    ("top20 result:", 20),
    ("top50 result:", 50),
):
    print(heading)
    get_2_metric(df, topk=cutoff, print_result=True)

top1 result:
avg_length:  (19.80377777777778, 0.36860848708114)
suc_rate:  (0.010888888888888887, 0.019444421546653188)
top5 result:
avg_length:  (18.973444444444443, 0.6850892426333598)
suc_rate:  (0.06744444444444443, 0.044688520031628744)
top10 result:
avg_length:  (17.21054700854701, 1.112399545918299)
suc_rate:  (0.1645384615384615, 0.06600664793263698)
top20 result:
avg_length:  (15.701196581196582, 1.3644102903578286)
suc_rate:  (0.24994871794871792, 0.07898130125943605)
top50 result:
avg_length:  (12.702292152292152, 1.5786776267931855)
suc_rate:  (0.40538228438228435, 0.0864953934289889)


In [17]:
# Top 100
# NOTE(review): labelled "Top 100", yet top_affected is not passed and keeps its
# default — confirm which tables this run actually read.
# Evaluate every lambda in the grid for cd / diff selection at the top-20 cutoff.
get_best_metric_with_single_lambda(
    lambs=[
        0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30,
        50, 70, 90, 100, 200, 500, 1000, 10000, 100000,
    ],
    data_path="",
    dataset_name="cd",
    keyphrase_selection_method="diff",
    topk=20,
    return_all=False,
)

(0.0001,
 (15.86731623931624, 1.361310585524252),
 (0.23463247863247866, 0.07622905884921852))

In [20]:
# ranksvm3 diff (cd, lambda = 0.0001)
# The original comment read "ranksvm3 random", but the file loaded is the diff run.
table_path = "../tables/tuning_lambda/cd/tuning_cd_at_lamb_0.0001_with_diff.csv"
df = load_dataframe_csv(table_path, "")

# For each target-rank cutoff, get_2_metric prints the average length and the
# success rate, each as (mean, 1.96 * std / sqrt(n)).
for heading, cutoff in (
    ("top1 result:", 1),
    ("top5 result:", 5),
    ("top10 result:", 10),
    ("top20 result:", 20),
    ("top50 result:", 50),
):
    print(heading)
    get_2_metric(df, topk=cutoff, print_result=True)

top1 result:
avg_length:  (19.80377777777778, 0.36860848708114)
suc_rate:  (0.010444444444444445, 0.019404574377332576)
top5 result:
avg_length:  (18.894777777777776, 0.7700799848253058)
suc_rate:  (0.06299999999999999, 0.044439059055945106)
top10 result:
avg_length:  (17.291888888888888, 1.1146541174414977)
suc_rate:  (0.158, 0.06447999514750465)
top20 result:
avg_length:  (15.86731623931624, 1.361310585524252)
suc_rate:  (0.23463247863247866, 0.07622905884921852)
top50 result:
avg_length:  (12.53661693861694, 1.640370317447069)
suc_rate:  (0.41451048951048947, 0.08677753802432024)
