In [None]:
%matplotlib inline

import sys 
import os
import copy

# Put the notebook's working directory on the import path exactly once,
# presumably so the project-local packages imported below resolve — confirm.
nb_dir = os.getcwd()
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)

from plotlib.loaders import *
from plotlib.plotters import *

from phdconf import config
from phdconf.config import *
from phdconf import stop

In [None]:
# Load the topics found at config.AUS_TOPIC_PATH, then split them into two
# groups via load_query_types (presumably broad vs. specific queries — the
# helper is defined in plotlib/phdconf and is not visible here).
queries = load_queries(config.AUS_TOPIC_PATH)
broad, specific = load_query_types(queries)

In [None]:
# Identifiers of the index variants compared in the preprocessing experiment.
index_names = [
    'auspdfs',
    'flattened-stop',
    'filtered-stop',
    'filtered-hyphen',
    'filtered-phrasestop',
]
# Every index is evaluated against the same AUS qrels and relevance level,
# so both lists simply repeat the shared value once per index.
qrel_paths = [config.AUS_QREL_PATH for _ in index_names]
rel_levels = [config.AUS_REL_LEVEL for _ in index_names]
# Labels handed to the plotting/cross-validation cells, positionally aligned
# with index_names.
display_names = ['doc', 'format', 'filtered', 'hyphen', 'phrasestop']

In [None]:
# Directory of the Dirichlet-prior run files for the preprocessing experiment.
dir_path = os.path.join(BASE_DIR, 'preprocessing', 'dirichlet_prior/')

# Sweep of the mu parameter: start, end and step passed to load_1d_dfs.
mu_start, mu_end, mu_increment = 300.0, 3000.0, 50.0

# Scores for every index over the mu sweep.
dir_dfs = load_1d_dfs(
    index_names,
    qrel_paths,
    dir_path,
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu_start,
    mu_end,
    mu_increment,
)

In [None]:
# Tuning curves over mu for each index variant.
# NOTE(review): the meaning of the positional values 0.985 and 0.495 is
# defined by plot_tune_1d_comp, which is not visible in this notebook.
dir_fig = plot_tune_1d_comp(
    display_names,
    RERANK_METRICS,
    dir_dfs,
    mu_start,
    mu_end,
    mu_increment,
    0.985,
    0.495,
    ylims=RERANK_YLIMS,
)

In [None]:
# dir_fig.savefig('figures/ausnl-preprocessing.pdf')

In [None]:
def select_1d_max_stat_sig(display_names, dfs, start, increment, name, base_qry, base_df, path, metrics=None):
    """Pick the best setting of a 1-D parameter sweep per (system, metric) and mark significance.

    Parameters
    ----------
    display_names : sequence of str
        One identifier per system. It labels the result columns and fills the
        ``{0}`` slot of ``path`` when the winning run is re-loaded.
    dfs : sequence of sequences
        ``dfs[i][j]`` holds the scores of system ``i`` at parameter value
        ``start + j * increment``; it must expose ``.index`` (metric keys) and
        item access by metric key (e.g. a pandas Series).
    start, increment : float
        Origin and step of the sweep, used to turn position ``j`` into a value.
    name : str
        Label under which the winning parameter value is reported.
    base_qry
        Per-query baseline scores, indexable by raw metric key; it is the
        first sample of the paired test.
    base_df
        Unused; kept so existing callers keep working.
    path : str
        Format template receiving ``(system, parameter_value)``; the formatted
        path is passed to ``load_dfs`` to fetch the per-query scores of the
        winning run.
    metrics : dict, optional
        Maps raw metric keys to display names. Metrics that are not keys of
        this mapping are ignored. When omitted, every metric found in ``dfs``
        is kept under its own key (previously ``None`` raised ``TypeError``).

    Returns
    -------
    pandas.DataFrame
        Rows are metric display names (ordered like ``metrics.values()``),
        columns are ``(system, '-')`` for the best score and
        ``(system, name)`` for the parameter value that achieved it. Scores
        are formatted to four decimals and suffixed with ``$^{**}$`` when
        p < 0.01 or ``$^{*}$`` when p < 0.05 according to ``stats.ttest_rel``
        against ``base_qry``. No test is run when the winning parameter value
        is 0, and the ``'Unjudged@20'`` row is left as an unformatted number.
    """
    if metrics is None:
        # Identity mapping over every metric present, in first-seen order.
        metrics = {}
        for i in range(len(display_names)):
            for scores in dfs[i]:
                for m in scores.index:
                    metrics.setdefault(m, m)

    measure_max = {}
    for i, system in enumerate(display_names):
        for j, scores in enumerate(dfs[i]):
            param = '{0:.2f}'.format(j*increment+start)
            for m in scores.index:
                if m not in metrics:
                    continue
                key = (system, metrics[m])
                val = scores[m]
                # Strict '<' keeps the earliest parameter value on ties.
                if key not in measure_max or measure_max[key]['-'] < val:
                    measure_max[key] = {'-': val, name: param}

    # Display name -> raw metric key, needed to index the per-query frames.
    back_metric = {v: k for k, v in metrics.items()}
    for (system, metric), best in measure_max.items():
        if metric == 'Unjudged@20':
            continue
        marker = ''
        param = float(best[name])
        if param != 0.00:
            comp = load_dfs(config.AUS_QREL_PATH, config.AUS_REL_LEVEL, '', [path.format(system, param)], per_query=True)[0]
            raw_metric = back_metric[metric]
            p = stats.ttest_rel(base_qry[raw_metric], comp[raw_metric]).pvalue
            if p < 0.01:
                marker = '$^{**}$'
            elif p < 0.05:
                marker = '$^{*}$'
        best['-'] = '{0:.4f}'.format(best['-']) + marker

    max_df = pd.DataFrame.from_dict(measure_max).stack().unstack(level=0)
    return max_df.reindex(list(metrics.values()))

In [None]:
# Baseline for the preprocessing comparison: the 'auspdfs' index at mu = 300,
# loaded once per query (base_qry) and once without per_query (base_df).
mu = 300
base_qry = load_1d_dfs(
    ['auspdfs'],
    qrel_paths,
    dir_path,
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu,
    mu,
    50.0,
    per_query=True,
)[0][0]
base_df = load_1d_dfs(
    ['auspdfs'],
    qrel_paths,
    dir_path,
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu,
    mu,
    50.0,
)[0][0]

In [None]:
# Best mu per (index, metric) with significance markers, transposed so that
# each index becomes a row.
# The sweep origin/step must match the values used to load dir_dfs, so the
# shared mu_start / mu_increment are used instead of repeating the literals.
# The label is a raw string: '\m' is not a valid escape sequence and triggers
# a warning in a normal string literal.
len_max = select_1d_max_stat_sig(
    index_names,
    dir_dfs,
    mu_start,
    mu_increment,
    r'$\mu$',
    base_qry,
    base_df,
    os.path.join(dir_path, 'case-topics-{0}-unigram_dir_mu_{1:.2f}.run'),
    RERANK_METRICS,
).T

In [None]:
# Emit the tuning table as LaTeX source without the 'Unjudged@20' column;
# escape=False keeps the LaTeX markup in the cells intact.
print(len_max.drop(columns=['Unjudged@20']).to_latex(escape=False))

## Stemming and stopwords

In [None]:
# Scores for the stopword/stemming index variants over the same mu sweep.
# NOTE(review): qrel_paths and rel_levels were sized for the five
# preprocessing indexes while only four names are passed here — presumably
# load_1d_dfs pairs them positionally; confirm.
stopword_dfs = load_1d_dfs(
    ['nostop', 'allstop', 'smallstop', 'stemmed-smallstop'],
    qrel_paths,
    os.path.join(BASE_DIR, 'dirichlet_prior'),
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu_start,
    mu_end,
    mu_increment,
)

In [None]:
# Tuning curves over mu for the stopword/stemming variants. Unlike the
# preprocessing figure, no ylims argument is passed here.
dir2_fig = plot_tune_1d_comp(
    ['none', 'top', 'manual', 'manual-stemmed'],
    RERANK_METRICS,
    stopword_dfs,
    mu_start,
    mu_end,
    mu_increment,
    0.985,
    0.495,
)

In [None]:
# dir2_fig.savefig('figures/ausnl-stopwords.pdf')

In [None]:
# Baseline for the stopword comparison: the 'nostop' index at mu = 300,
# loaded once per query (base_qry) and once without per_query (base_df).
mu = 300
base_qry = load_1d_dfs(
    ['nostop'],
    qrel_paths,
    os.path.join(BASE_DIR, 'dirichlet_prior'),
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu,
    mu,
    50.0,
    per_query=True,
)[0][0]
base_df = load_1d_dfs(
    ['nostop'],
    qrel_paths,
    os.path.join(BASE_DIR, 'dirichlet_prior'),
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu,
    mu,
    50.0,
)[0][0]

# Best mu per (index, metric) with significance markers, one row per index.
# The sweep origin/step must match the values used to load stopword_dfs, so
# mu_start / mu_increment replace the repeated literals. The label is a raw
# string because '\m' is not a valid escape sequence in a normal literal.
# NOTE(review): `metrics` is not defined in this notebook — presumably it
# arrives through one of the star imports; the preprocessing table passes
# RERANK_METRICS in the same position. Confirm which mapping is intended.
len_max = select_1d_max_stat_sig(
    ['nostop', 'allstop', 'smallstop', 'stemmed-smallstop'],
    stopword_dfs,
    mu_start,
    mu_increment,
    r'$\mu$',
    base_qry,
    base_df,
    os.path.join(BASE_DIR, 'dirichlet_prior', 'case-topics-{0}-unigram_dir_mu_{1:.2f}.run'),
    metrics,
).T

In [None]:
# Display the stopword/stemming tuning table assigned to len_max in the previous cell.
len_max

In [None]:
# Copy of the metric-name mapping without 'recall_100' and 'unjudged@20';
# its keys select the columns shown in the boxplot.
om = copy.deepcopy(config.METRIC_NAMES)
del om['recall_100']
del om['unjudged@20']

# Per-query score difference: 'stemmed-smallstop' at mu = 1600 minus
# 'smallstop' at mu = 1650 (presumably each variant's tuned mu — confirm).
# Both operands are loaded with per_query=True. The first call previously
# omitted it, so aggregate scores were subtracted from per-query scores and
# the boxplot did not show per-query differences.
qry_comp_df = (
    load_1d_dfs(
        ['stemmed-smallstop'],
        qrel_paths,
        os.path.join(BASE_DIR, 'dirichlet_prior'),
        'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
        rel_levels,
        1600.0,
        1600.0,
        50.0,
        per_query=True,
    )[0][0]
    - load_1d_dfs(
        ['smallstop'],
        qrel_paths,
        os.path.join(BASE_DIR, 'dirichlet_prior'),
        'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
        rel_levels,
        1650,
        1650,
        50.0,
        per_query=True,
    )[0][0]
)

# Boxplot of the differences per metric with a dashed zero reference line.
# NOTE: the name is bound to whatever axhline() returns (the reference line),
# not to a Figure.
qry_comp_fig = (
    qry_comp_df[list(om)]
    .rename(RERANK_METRICS, axis='columns')
    .plot.box(
        fontsize=15,
        boxprops=dict(linestyle='-', linewidth=2),
        medianprops=dict(linestyle='-', linewidth=2),
        color=dict(boxes='black', whiskers='black', medians='b', caps='r'),
        figsize=(16, 4),
    )
    .axhline(y=0, xmin=0.0, xmax=1.0, linestyle='--', linewidth=1.0, color='grey')
)

In [None]:
# Load the folds from AUS_FOLDS; tt_folds is passed to cross_validation in the cells below.
tt_folds = read_folds(AUS_FOLDS)

In [None]:
# Reload the stopword/stemming sweep, this time with per-query scores, for
# the cross-validation cell. This rebinds stopword_dfs.
stopword_dfs = load_1d_dfs(
    ['nostop', 'allstop', 'smallstop', 'stemmed-smallstop'],
    qrel_paths,
    os.path.join(BASE_DIR, 'dirichlet_prior'),
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu_start,
    mu_end,
    mu_increment,
    per_query=True,
)

In [None]:
# Baseline for the stopword cross-validation: the 'nostop' index at mu = 300,
# loaded once per query (base_qry) and once without per_query (base_df).
mu = 300
base_qry = load_1d_dfs(
    ['nostop'],
    qrel_paths,
    os.path.join(BASE_DIR, 'dirichlet_prior'),
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu,
    mu,
    50.0,
    per_query=True,
)[0][0]
base_df = load_1d_dfs(
    ['nostop'],
    qrel_paths,
    os.path.join(BASE_DIR, 'dirichlet_prior'),
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu,
    mu,
    50.0,
)[0][0]

In [None]:
# One row per stopword/stemming variant, one column per RERANK_METRICS entry.
stop_df = pd.DataFrame(columns=RERANK_METRICS)

# Labels are positionally aligned with the index order used to load stopword_dfs.
for ab, runs in zip(('none', 'top', 'manual', 'manual-stemmed'), stopword_dfs):
    cross = cross_validation(runs, tt_folds, RERANK_METRICS, base_qry)
    # Only the first element of the cross-validation result is tabulated.
    stop_df.loc[ab] = cross[0]

In [None]:
# write_table('tables/ausnl-stopwords', bold_max(stop_df).rename(columns=metrics).drop('Unjudged@20',axis='columns').to_latex(escape=False))

In [None]:
# Reload the preprocessing sweep, this time with per-query scores, for the
# cross-validation cell. This rebinds dir_dfs.
dir_dfs = load_1d_dfs(
    index_names,
    qrel_paths,
    dir_path,
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu_start,
    mu_end,
    mu_increment,
    per_query=True,
)

In [None]:
# Baseline for the preprocessing cross-validation: the 'auspdfs' index at
# mu = 300, loaded once per query (base_qry) and once without per_query (base_df).
mu = 300
base_qry = load_1d_dfs(
    ['auspdfs'],
    qrel_paths,
    dir_path,
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu,
    mu,
    50.0,
    per_query=True,
)[0][0]
base_df = load_1d_dfs(
    ['auspdfs'],
    qrel_paths,
    dir_path,
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu,
    mu,
    50.0,
)[0][0]

In [None]:
# One row per preprocessing variant, one column per RERANK_METRICS entry.
# This rebinds stop_df, replacing the stopword table built earlier.
stop_df = pd.DataFrame(columns=RERANK_METRICS)

for ab, runs in zip(display_names, dir_dfs):
    cross = cross_validation(runs, tt_folds, RERANK_METRICS, base_qry)
    # Only the first element of the cross-validation result is tabulated.
    stop_df.loc[ab] = cross[0]

In [None]:
# write_table('tables/ausnl-preprocessing', bold_max(stop_df).rename(columns=metrics).drop('Unjudged@20',axis='columns').to_latex(escape=False))