In [None]:
%matplotlib inline

import copy
import os
import sys

# pandas is used as `pd` throughout the notebook; import it explicitly instead
# of relying on a wildcard import below to leak the name.
import pandas as pd

# Put the notebook's working directory on sys.path before the project imports.
nb_dir = os.getcwd()
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

from plotlib.loaders import *
from plotlib.plotters import *


from phdconf import config
from phdconf.config import *

In [None]:
# Load the topics from the configured AUS topic file. `queries` is used as a
# mapping further down (its .keys() seed the "all queries" portion).
queries = load_queries(config.AUS_TOPIC_PATH)
# Split the topics into groups by type and by focus.
# NOTE(review): presumably each group is a collection of query ids — confirm
# against the loader implementations.
broad, specific = load_query_types(queries)
law, fact, generic = load_query_focus_types(queries)

In [None]:
# Per-collection settings handed to the loaders below. Only the AUS collection
# is active; the SIGIR entries are left commented out at the end of each line.
index_names = ['filtered-phrasestop']#, config.SIGIR_INDEX_NAME]
qrel_paths = [config.AUS_QREL_PATH]#, config.SIGIR_QREL_PATH]
rel_levels = [config.AUS_REL_LEVEL]#, config.SIGIR_REL_LEVEL]
# Passed to plot_tune_1d_comp together with its two series (baseline, KLI sweep).
# NOTE(review): presumably used as legend labels — confirm against the plotter.
display_names = ['unweighted', 'weighted']#, 'SIGIR']

In [None]:
# Cross-validation folds. Later cells pass them to cross_validation and index
# each fold as fold[0] / fold[1] (see plot_qry_diff).
tt_folds = read_folds(AUS_FOLDS)

In [None]:
# Directory (under BASE_DIR) that holds the query-term-weighting run files.
run_path = os.path.join(BASE_DIR, 'qry-weights')
# File names of the term-weighting runs to evaluate.
runs = ['idf-term-weight.run', 'ictf-term-weight.run', 
        'emb-term-ictf-weight.run', 'emb-term-idf-weight.run', 'emb-term-weight.run',
        'emb-term-diff-ictf-weight.run', 'emb-term-diff-idf-weight.run', 'emb-term-diff-weight.run', 
        'avg-cooccur-weight.run', 'coocur-covariance.run']


# Short labels for the runs above. They are later zipped with the frames loaded
# from `runs`, so the two lists must stay index-aligned (presumably the loader
# keeps the file order — confirm against load_dfs).
run_names = ['idf', 'ictf', 'emb-ictf', 'emb-idf', 'emb', 'emb-diff-ictf', 'emb-diff-idf', 'emb-diff', 'avg-cooccur', 'co-covar']

In [None]:
# Load the KLI term-weight runs used by the tuning plot below.
# NOTE(review): the trailing 0, 100, 1 look like sweep start / stop / step that
# fill the '{1}' slot of the file-name template — confirm against load_1d_dfs.
# NOTE(review): this call passes rel_levels[0], while the baseline loads pass
# the whole rel_levels list — confirm which form load_1d_dfs expects.
kli_dfs = load_1d_dfs(index_names, qrel_paths, run_path, 'kli-term-weight-{1}.run', rel_levels[0], 0, 100, 1)

In [None]:
# Baseline: the unigram Dirichlet-prior run at the single setting 1050
# (the range arguments are 1050, 1050, 1). It is loaded twice: once without
# per_query (`base_df`) and once with per_query=True (`base_query`); [0][0]
# picks the first entry of the first index.
# NOTE(review): base_df is later indexed by metric name and formatted as a
# number, so it presumably holds one aggregate value per metric — confirm.
base_df = load_1d_dfs(index_names, qrel_paths, os.path.join(BASE_DIR, 'preprocessing', 'dirichlet_prior'), 'case-topics-{0}-unigram_dir_mu_{1:.2f}.run', rel_levels, 1050, 1050, 1)[0][0]
base_query = load_1d_dfs(index_names, qrel_paths, os.path.join(BASE_DIR,'preprocessing', 'dirichlet_prior'), 'case-topics-{0}-unigram_dir_mu_{1:.2f}.run', rel_levels, 1050, 1050, 1, per_query=True)[0][0]
# base_query.sort_index(inplace=True)

In [None]:
# Plot the KLI sweep against the baseline. The single baseline result is
# repeated once per entry of kli_dfs[0] so both series have the same length.
plot = plot_tune_1d_comp(display_names, RERANK_METRICS, [[base_df for x in range(len(kli_dfs[0]))], kli_dfs[0]], 0, 100, 1, ylims=RERANK_YLIMS, legend_x=0.99)
# plot.savefig('figures/kli-term-weighting.pdf')

In [None]:
# Reload the same KLI runs with per_query=True for the cross-validation and
# per-query difference plot that follow.
# NOTE: this rebinds `kli_dfs`; re-running the tuning-plot cell after this one
# would feed it the per-query frames instead of the ones it was written for.
kli_dfs = load_1d_dfs(index_names, qrel_paths, run_path, 'kli-term-weight-{1}.run', rel_levels[0], 0, 100, 1, per_query=True)

In [None]:
# Cross-validate the KLI sweep and store the result as one row per method.
kli_df = pd.DataFrame(columns=config.METRIC_NAMES)

# The loop variable is deliberately NOT called `runs`: that name holds the
# global list of run-file names, and rebinding it here would silently break
# any later cell that loads files from `runs`.
for ab, kli_sweep in zip(['KLI'], kli_dfs):
    # Only the first element of cross_validation's result is kept as the row.
    cross = cross_validation(kli_sweep, tt_folds, config.METRIC_NAMES, base_query)
    kli_df.loc[ab] = cross[0]

# cv_df.loc['R'] = ['{:.4f}'.format(base_df[m]) for m in config.METRIC_NAMES]
# print(kli_df.drop('unjudged@20',axis='columns').rename(config.METRIC_NAMES, axis='columns').to_latex())
kli_df

In [None]:
def plot_qry_diff(runs, qry_df, metrics, folds):
    """Box-plot per-query differences between fold-selected runs and a baseline.

    Starting from a deep copy of ``runs[0]``, every run is scored on the
    queries in ``fold[1]`` of each fold (mean of each metric). Whenever that
    mean beats the best mean seen so far for the metric, the run's values for
    the queries in ``fold[0]`` are copied into the result. The baseline is then
    subtracted and the metric columns are drawn as a box plot with a dashed
    zero line.

    Args:
        runs: sequence of per-query DataFrames indexed by query id, each with
            a column for every key of ``metrics``.
        qry_df: baseline per-query DataFrame with the same layout.
            NOTE: it is sorted by index *in place* (caller-visible side effect,
            kept for backward compatibility).
        metrics: mapping of metric column name -> display label.
        folds: sequence of folds; each fold is indexed as ``fold[0]`` (queries
            whose values are copied) and ``fold[1]`` (queries used for scoring).

    Returns:
        Whatever ``axhline`` returns for the zero line drawn on the box plot.
    """
    # A list (not a dict_keys view) is the supported way to select columns.
    metric_cols = list(metrics)
    # NOTE(review): one running best per metric is shared by all runs AND all
    # folds, so a later fold is only updated when it beats every earlier fold.
    # The commented-out `max_inds[j][f]` in the original suggests a per-fold
    # best may have been intended — confirm before changing.
    best_mean = [0.0] * len(metric_cols)
    qry_res = runs[0].copy(deep=True)

    for run in runs:
        for fold in folds:
            scored = run[run.index.isin(fold[1])]
            for j, m in enumerate(metric_cols):
                v = scored[m].mean()
                if v > best_mean[j]:
                    best_mean[j] = v
                    held_out = run.loc[run.index.isin(fold[0]), m]
                    for ind, item in held_out.items():
                        # Single .loc call: the original chained form
                        # `qry_res.loc[ind][m] = item` writes to a temporary
                        # row and can be silently discarded.
                        qry_res.loc[ind, m] = item

    qry_df.sort_index(inplace=True)
    qry_res.sort_index(inplace=True)

    qry_comp_df = qry_res - qry_df
    box_axes = qry_comp_df[metric_cols].rename(metrics, axis='columns').plot.box(
        fontsize=15,
        boxprops=dict(linestyle='-', linewidth=2),
        medianprops=dict(linestyle='-', linewidth=2),
        color=dict(boxes='black', whiskers='black', medians='b', caps='r'),
        figsize=(16, 4),
    )
    qry_comp_fig = box_axes.axhline(y=0, xmin=0.0, xmax=1.0, linestyle='--', linewidth=1.0, color='grey')
    return qry_comp_fig

# Draw the per-query difference box plot for the KLI sweep against the baseline.
# NOTE: plot_qry_diff sorts `base_query` by index in place as a side effect.
kli_diff = plot_qry_diff(kli_dfs[0], base_query, RERANK_METRICS, tt_folds)
# kli_diff.get_figure().savefig('figures/ausnl-kli-weight-qry-diff.pdf')

In [None]:
# Same run-file list as the one declared next to `run_path`; restated here so
# this cell does not depend on `runs` still holding that list.
runs = ['idf-term-weight.run', 'ictf-term-weight.run', 
        'emb-term-ictf-weight.run', 'emb-term-idf-weight.run', 'emb-term-weight.run',
        'emb-term-diff-ictf-weight.run', 'emb-term-diff-idf-weight.run', 'emb-term-diff-weight.run', 
        'avg-cooccur-weight.run', 'coocur-covariance.run']

# Per-query evaluation of every run file against the AUS qrels.
dfs = load_dfs(config.AUS_QREL_PATH, config.AUS_REL_LEVEL, run_path, runs, per_query=True)
# dfs = [base_query] + dfs

In [None]:
# Results table: one row per term-weighting run, one column per metric.
cv_df = pd.DataFrame(columns=config.METRIC_NAMES)

# Each run is wrapped in a one-element list before being handed to
# cross_validation; only the first element of its result is stored as the row.
for ab, r in zip(run_names, [[x] for x in dfs]):
    cross = cross_validation(r, tt_folds, config.METRIC_NAMES, base_query)
    cv_df.loc[ab] = cross[0]
    
# Baseline row: the base_df metric values formatted as 4-decimal strings.
cv_df.loc['$R$'] = ['{:.4f}'.format(base_df[m]) for m in config.METRIC_NAMES]
# KLI row copied from the table built in the KLI cross-validation cell.
cv_df.loc['kli'] = kli_df.loc['KLI']
# write_table('tables/term-weighting', bold_max(cv_df).drop('unjudged@20',axis='columns').rename(config.METRIC_NAMES, axis='columns').to_latex(escape=False))

In [None]:
# Metric-name -> label mapping without the 'unjudged@20' entry; work on a deep
# copy so config.METRIC_NAMES itself is left untouched.
metrics = copy.deepcopy(config.METRIC_NAMES)
metrics.pop('unjudged@20')

# Per-query change of the second loaded run (dfs[1]) relative to the baseline.
diff = dfs[1] - base_query

# Box plot of those differences per metric, with a dashed grey line at zero.
# `box` ends up holding the return value of axhline.
box = (
    diff[metrics.keys()]
    .rename(metrics, axis='columns')
    .plot.box(
        fontsize=15,
        boxprops=dict(linestyle='-', linewidth=2),
        medianprops=dict(linestyle='-', linewidth=2),
        color=dict(boxes='black', whiskers='black', medians='b', caps='r'),
        figsize=(16, 4),
    )
    .axhline(y=0, xmin=0.0, xmax=1.0, linestyle='--', linewidth=1.0, color='grey')
)
# box.get_figure().savefig('figures/ausnl-ictf-weight.pdf')

In [None]:
# Per-query results for every term-weighting run, consumed by the win/loss
# counting below.
# NOTE(review): this is the same load_dfs call (identical arguments) as the one
# that builds `dfs`; the two could probably share a single load — confirm that
# nothing mutates either list first.
query_dfs = load_dfs(config.AUS_QREL_PATH, config.AUS_REL_LEVEL, run_path, runs, per_query=True)

In [None]:
# Metric columns inspected by default when counting per-query changes.
keys = ['recip_rank', 'recall_20', 'recall_100', 'ndcg', 'rbp@0.80']

def cnt_increases(run, base, keys=keys):
    """Count, per metric and per query, where ``run`` beats or trails ``base``.

    The two frames are subtracted (``run - base``) and every row of the result
    is inspected for the columns in ``keys``. A difference that is neither
    ``> 0`` nor ``< 0`` (exact ties, and NaN) counts as "unchanged".

    Args:
        run: per-query DataFrame of the run under test.
        base: per-query DataFrame of the baseline, same layout as ``run``.
        keys: metric column names to inspect.

    Returns:
        A 5-tuple ``(increases, decreases, inc_all, dec_all, all_same)``:
          * increases / decreases: lists parallel to ``keys`` with the number
            of rows whose difference is > 0 / < 0 for that metric.
          * inc_all: row labels with at least one increase and no decrease
            (unchanged metrics are allowed).
          * dec_all: row labels where every metric strictly decreased.
          * all_same: row labels with no increase and no decrease.
        The three label lists are mutually exclusive.
    """
    increases = [0] * len(keys)
    decreases = [0] * len(keys)
    inc_all = []
    dec_all = []
    all_same = []

    diff = run - base

    for label, row in diff.iterrows():
        any_up = any_down = any_flat = False
        for i, key in enumerate(keys):
            delta = row[key]
            if delta > 0.0:
                increases[i] += 1
                any_up = True
            elif delta < 0.0:
                decreases[i] += 1
                any_down = True
            else:
                any_flat = True

        if any_down and not (any_up or any_flat):
            dec_all.append(label)
        elif any_up and not any_down:
            inc_all.append(label)
        elif not (any_up or any_down):
            # Fix: a decrease now disqualifies a row from "all same". The
            # original never cleared its flag on a decrease, so all-decrease
            # rows and zero/negative mixes were reported as unchanged.
            all_same.append(label)

    return increases, decreases, inc_all, dec_all, all_same

# Accumulators, one entry per frame in query_dfs (same order):
#   runs_inc / runs_dec - per-metric increase / decrease counts
#   rai / rad / sad     - the inc_all / dec_all / all_same label lists
runs_inc, runs_dec = [], []
rai, rad, sad = [], [], []
for run in query_dfs:
    a, b, c, d, e = cnt_increases(run, base_query)
    runs_inc += [a]
    runs_dec += [b]
    rai += [c]
    rad += [d]
    sad += [e]

In [None]:
# Interleave the per-run increase and decrease counts (inc, dec, inc, dec, ...)
# so the rows line up with the (run name, '+' / '-') MultiIndex built below.
comb = [counts for pair in zip(runs_inc, runs_dec) for counts in pair]
mi = pd.MultiIndex.from_product([run_names, ['+', '-']], names=['', 'Change'])
# write_table('tables/weighting-changes', (pd.DataFrame(comb, index=mi, columns=['RR', 'R@20', 'R@100', 'NDCG', 'RBP'])/len(base_query.index)).round(4).rename(metrics, axis='columns').to_latex(escape=False))

In [None]:
# Share of baseline queries that each run improved ('+', from rai) or hurt
# ('-', from rad), one row per run name.
ch_df = pd.DataFrame(
    [(len(gained), len(lost)) for gained, lost in zip(rai, rad)],
    columns=['+', '-'],
    index=run_names,
) / len(base_query.index)

In [None]:
# Per-metric difference to the baseline for the query with index label 8,
# one row per run.
# NOTE(review): 8 is a hard-coded query id; why this query was picked is not
# recorded here.
pd.DataFrame([x.loc[8, : ]-base_query.loc[8, : ] for x in query_dfs], index=run_names)

In [None]:
def get_portions(queries, portions):
    """Return, for each portion, the share of it that is covered by ``queries``.

    Args:
        queries: iterable of query ids; an id that appears more than once is
            counted every time it appears.
        portions: sequence of sized containers of query ids (e.g. sets).

    Returns:
        A list with one float per portion: the number of ``queries`` found in
        that portion divided by the portion's size. An empty portion yields
        0.0 rather than raising ZeroDivisionError.
    """
    # Materialise once so one-shot iterables can be scanned for every portion.
    queries = list(queries)
    overlap = []
    for portion in portions:
        size = len(portion)
        if size == 0:
            # Nothing to cover; avoid dividing by zero.
            overlap.append(0.0)
            continue
        hits = sum(1 for q in queries if q in portion)
        overlap.append(float(hits) / float(size))
    return overlap

# Query groups used to break the results down, in the order
# all / broad / specific / law / fact / generic. The order matches the column
# labels used when the breakdown table is printed.
query_portions = [set(queries.keys()), set(broad), set(specific), set(law), set(fact), set(generic)]

In [None]:
# For every run, the share of each query group that falls in its inc_all
# (rai) and dec_all (rad) lists.
inc_t = [get_portions(x, query_portions) for x in rai]
dec_t = [get_portions(x, query_portions) for x in rad]
# Interleave as inc, dec, inc, dec, ... so the rows match the
# (run name, '+' / '-') MultiIndex `mi`.
cbqt = [None]*(len(runs_inc)+len(runs_dec))
cbqt[::2] = inc_t
cbqt[1::2] = dec_t
# Emit the breakdown as LaTeX source (hence print rather than rich display).
print((pd.DataFrame(cbqt, index=mi, columns=['all', 'broad','specific', 'law', 'fact', 'generic']).round(4)).to_latex())