In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import math

In [2]:
def save_dataframe_csv(df, path, name):
    df.to_csv(path+name, index=False)

def load_dataframe_csv(path, name, index_col=None):
    return pd.read_csv(path+name, index_col=index_col)

In [4]:
def get_average_length(df, n):
    df_s_f = df[(df['result'] == 'successful') | (df['result'] == 'fail')]
    iteration = df_s_f[df_s_f['target_rank']==n].groupby('user_id', as_index=False).agg({'iteration':'mean'})['iteration'].to_numpy()
    return (np.average(iteration), 1.96*np.std(iteration)/np.sqrt(len(iteration)))

def get_success_num(df, n):
    return len(df[(df['result'] == 'successful') & (df['target_rank'] == n)])

def get_fail_num(df, n):
    return len(df[(df['result'] == 'fail') & (df['target_rank'] == n)])

def get_success_rate(df, n):
    df_s_f = df[(df['result'] == 'successful') | (df['result'] == 'fail')]
    df_list_result = df_s_f[df_s_f['target_rank']==n].groupby('user_id', as_index=False)['result'].apply(list).reset_index(name='result')
    successful_rate = df_list_result['result'].apply(lambda r: r.count("successful")/len(r)).to_numpy()
    return (np.average(successful_rate), 1.96*np.std(successful_rate)/np.sqrt(len(successful_rate)))



In [5]:
def get_2_metric(df, topk = 20, print_result = True):
    if print_result:
        print ('avg_length: ',get_average_length(df,topk))
        print ('suc_rate: ',get_success_rate(df,topk))
        return 
    else:
        return get_average_length(df,topk), get_success_rate(df,topk)

In [6]:
def get_best_metric_with_single_lambda(data_path, lambs, topk = 20, return_all = False):
    avg_lengths = []
    suc_rates = []
    for lamb in lambs:
        
        table_name = 'tuning_lamb_'+ str(lamb) + '.csv'
        df = load_dataframe_csv(data_path, table_name)
        avg_length, suc_rate = get_2_metric(df, topk = topk, print_result = False)
#         print ('avg_length', avg_length)
#         print ('suc_rate', suc_rate)
        avg_lengths.append(avg_length)
        suc_rates.append(suc_rate)
        
    if return_all:
        return avg_lengths, suc_rates
    else:
        suc_rates_temp = [suc_rate[0] for suc_rate in suc_rates]
        optimal_lambda_index = np.argmax(suc_rates_temp)
#         print (optimal_lambda_index)
        return lambs[optimal_lambda_index], avg_lengths[optimal_lambda_index], suc_rates[optimal_lambda_index]

In [8]:
dataset_name = "beer"
lamb = 1
keyphrase_selection_method = "diff"
table_path = '../tables/tuning_lambda/'+dataset_name+'/tuning_'+dataset_name+'_at_lamb_'+ str(lamb)+'_with_'+ keyphrase_selection_method+'.csv'
df = load_dataframe_csv(table_path,"")     
# get_2_metric(df,topk = 20)

In [93]:
df['target_rank'] = 20

In [94]:
np.where(df['item_rank'] < df['target_rank'])

(array([  43,   44,   45,   46,   47,   59,   60,   61,   62,   64,   65,
          66,   67,   68,   72,   73,   75,   76,   77,   78,   79,   80,
          83,   85,   86,  450,  451,  452,  453,  454,  455,  456,  457,
         458,  459,  460,  461,  462,  463,  464,  487,  529,  613,  614,
         615,  616,  643,  644,  645,  646,  647,  648,  649,  650,  651,
         652,  653,  654,  655,  656,  657,  658,  975,  976,  977,  978,
         979,  980,  981,  982,  983,  984,  985,  986,  987,  988,  989,
         990,  991,  992,  993,  994, 1164, 1165, 1479, 1480, 1482, 1483,
        1484, 1485, 1486, 1487, 1488, 1489, 1490, 1491, 1492, 1493, 1519,
        1587, 1590, 1591, 1592, 1593, 1594, 1595, 1596, 1597, 1598, 1599,
        1600, 1601, 1602, 1603, 1647, 1648, 1649, 1693, 1694, 1696, 1698,
        1733, 1734, 1735, 1736, 1737, 1738, 1739, 1740, 1741, 1742, 1743,
        1744, 1745, 1746, 1747, 1748, 1749, 1750, 1773, 1774, 1839, 1840,
        1841, 1842, 1843, 1844, 1856, 

In [98]:
df.drop(df.index[[0,1]])

Unnamed: 0,user_id,item_id,item_name,item_rank,item_score,iteration,critiqued_keyphrase,target_rank,num_existing_keyphrases,result,theta,critiqued_keyphrase_name
2,0.0,1154.0,,335.0,0.458133,2.0,0.0,20,20.0,,"[1.0, 0.9656370855944116, 0.9145810691480878]",roast
3,0.0,1154.0,,332.0,1.280821,3.0,2.0,20,20.0,,"[1.0, 1.0, 1.0, 1.0]",caramel
4,0.0,1154.0,,341.0,1.339045,4.0,4.0,20,20.0,,"[1.0, 1.0, 1.0, 1.0, 1.0]",toast
5,0.0,1154.0,,340.0,1.658527,5.0,6.0,20,20.0,,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",chocolate
6,0.0,1154.0,,348.0,1.564336,6.0,8.0,20,20.0,,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",coffee
...,...,...,...,...,...,...,...,...,...,...,...,...
6594,49.0,3559.0,,848.0,7.119859,16.0,52.0,20,20.0,,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",watery
6595,49.0,3559.0,,856.0,6.798543,17.0,65.0,20,20.0,,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",banana
6596,49.0,3559.0,,844.0,6.891129,18.0,71.0,20,20.0,,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",clove
6597,49.0,3559.0,,822.0,7.068369,19.0,73.0,20,20.0,,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",coriander


In [100]:
def get_drop_index(df, topk = 20):
    drop_index = []
    iter_flag = 0 

    for i in range(len(df)):
        if df["iteration"][i] == 0:
            iter_flag = 0
        if df['item_rank'][i] < topk and iter_flag != 0:
            iter_flag = 1
        elif iter_flag == 1:
            drop_index.append(i)
    return drop_index
            
            
            

In [None]:
drop_list = get_drop_index(df,topk = 20)

In [99]:
df["iteration"][0]

0.0

In [52]:
new_df = pd.DataFrame({})
new_df = new_df.append(df.loc[1,:])

In [53]:
new_df

Unnamed: 0,critiqued_keyphrase,critiqued_keyphrase_name,item_id,item_name,item_rank,item_score,iteration,num_existing_keyphrases,result,target_rank,theta,user_id
1,44.0,brown,1154.0,,353.0,0.86956,1.0,20.0,,1.0,"[1.0, 2.0]",0.0


In [56]:
new_df.set_value(1,"result","successful")

  """Entry point for launching an IPython kernel.


Unnamed: 0,critiqued_keyphrase,critiqued_keyphrase_name,item_id,item_name,item_rank,item_score,iteration,num_existing_keyphrases,result,target_rank,theta,user_id
1,44.0,brown,1154.0,,353.0,0.86956,1.0,20.0,successful,1.0,"[1.0, 2.0]",0.0
0,,,,,,,,,successful,,,


In [86]:
def generate_topk_dataframe_from_top1(df, topk = 20):
    """
    Generate target_rank = topk result table from top1 table 
    """
    new_df = pd.DataFrame({})
    for i in range(len(df)):
        # Start of a iteration, append that line 
        if df["iteration"][i] == 0:
            temp_df = pd.DataFrame({})
            temp_df = temp_df.append(df.loc[i])
#             print (temp_df)
        else:
            # if target rank not satisfied, append
            if df["item_rank"][i] >= topk:
                temp_df = temp_df.append(df.loc[i])
            # if target rank satisfied, termination
            else:
                df.set_value(i,"result","successful")  
                temp_df = temp_df.append(df.loc[i])
        new_df = new_df.append(temp_df)
        print (new_df)
#         break 
        
    return new_df

In [77]:
temp_df = pd.DataFrame({})
temp_df.append(df.loc[1,:])

Unnamed: 0,critiqued_keyphrase,critiqued_keyphrase_name,item_id,item_name,item_rank,item_score,iteration,num_existing_keyphrases,result,target_rank,theta,user_id
1,44.0,brown,1154.0,,353.0,0.86956,1.0,20.0,,1.0,"[1.0, 2.0]",0.0


In [65]:
df

Unnamed: 0,user_id,item_id,item_name,item_rank,item_score,iteration,critiqued_keyphrase,target_rank,num_existing_keyphrases,result,theta,critiqued_keyphrase_name
0,0.0,1154.0,,,,0.0,,1.0,20.0,,,
1,0.0,1154.0,,353.0,0.869560,1.0,44.0,1.0,20.0,,"[1.0, 2.0]",brown
2,0.0,1154.0,,335.0,0.458133,2.0,0.0,1.0,20.0,,"[1.0, 0.9656370855944116, 0.9145810691480878]",roast
3,0.0,1154.0,,332.0,1.280821,3.0,2.0,1.0,20.0,,"[1.0, 1.0, 1.0, 1.0]",caramel
4,0.0,1154.0,,341.0,1.339045,4.0,4.0,1.0,20.0,,"[1.0, 1.0, 1.0, 1.0, 1.0]",toast
...,...,...,...,...,...,...,...,...,...,...,...,...
6594,49.0,3559.0,,848.0,7.119859,16.0,52.0,1.0,20.0,,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",watery
6595,49.0,3559.0,,856.0,6.798543,17.0,65.0,1.0,20.0,,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",banana
6596,49.0,3559.0,,844.0,6.891129,18.0,71.0,1.0,20.0,,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",clove
6597,49.0,3559.0,,822.0,7.068369,19.0,73.0,1.0,20.0,,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",coriander


In [33]:
get_best_metric_with_single_lambda(data_path='../tables/critiquing/multi_step_critiquing/beer/ranksvm3/',
                                   lambs = [0.001,0.01,0.1,0.5,1,10,30,50,70,90,100,1000],
                                   topk=50,
                                   return_all=False)

(1,
 (15.140380697733638, 2.2842677996023304),
 (0.294980264833206, 0.13289815128618654))

In [35]:
table_path = '../tables/reproducing/beer/lp1simplified_top100_sample_25users.csv'
df = load_dataframe_csv(table_path,"")     
get_2_metric(df,topk = 1)

avg_length:  (9.026735138499843, 1.499916829103878)
suc_rate:  (0.009523809523809525, 0.018216801361705958)
