In [1]:
%matplotlib inline

import sys 
import os
import copy

# Make modules that sit next to the notebook importable; this has to run
# before the project imports that follow.
nb_dir = os.getcwd()
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)

from plotlib.loaders import *
from plotlib.plotters import *

from phdconf import config
from phdconf import stop

# Forwarded as `ylims=` to plot_tune_1d_comp for the preprocessing figure.
# NOTE(review): presumably one y-axis limit per plotted metric panel — confirm.
ylims = [
    0.67,
    0.45,
    0.60,
    0.59,
    0.38,
]

In [2]:
# Load the topic set from the configured path.
queries = load_queries(config.AUS_TOPIC_PATH)
# Split the topics into two groups via load_query_types.
# NOTE(review): neither `broad` nor `specific` is used in the cells visible here.
broad, specific = load_query_types(queries)

In [3]:
# Index variants compared in the preprocessing experiment. `display_names`
# holds the label used for each index in figures and tables; the two lists
# are paired by position.
index_names = [
    'auspdfs',
    'flattened-stop',
    'filtered-stop',
    'filtered-hyphen',
    'filtered-phrasestop',
]
display_names = [
    'doc',
    'format',
    'filtered',
    'hyphen',
    'phrasestop',
]

# Every index is evaluated against the same qrels and relevance level.
qrel_paths = [config.AUS_QREL_PATH for _ in index_names]
rel_levels = [config.AUS_REL_LEVEL for _ in index_names]

In [4]:
# Root folder holding the generated run files. expanduser('~') resolves the
# home directory even when the HOME environment variable is not set, where
# os.environ["HOME"] would raise KeyError.
BASE_DIR = os.path.join(os.path.expanduser('~'), 'phd-generated')

In [5]:
# Work on an independent copy so changes made in this notebook never touch
# config.METRIC_NAMES. The mapping is used below as {raw metric name: label}.
metrics = copy.deepcopy(config.METRIC_NAMES)
# Disabled manual entry, kept for reference:
# metrics['unjudged@20'] = 'Unjudged@20'

In [6]:
# Grid of Dirichlet-prior mu values swept by the run files.
mu_start, mu_end, mu_increment = 300.0, 3000.0, 50.0

# Folder with the preprocessing runs; file names follow
# case-topics-<index>-unigram_dir_mu_<mu>.run
dir_path = os.path.join(BASE_DIR, 'preprocessing', 'dirichlet_prior/')

dir_dfs = load_1d_dfs(
    index_names,
    qrel_paths,
    dir_path,
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu_start,
    mu_end,
    mu_increment,
)

In [7]:
# Tuning curves over mu for every preprocessing variant.
# NOTE(review): 0.985 and 0.495 are positional layout arguments whose meaning
# is defined by plot_tune_1d_comp — confirm before changing them.
dir_fig = plot_tune_1d_comp(
    display_names,
    metrics,
    dir_dfs,
    mu_start,
    mu_end,
    mu_increment,
    0.985,
    0.495,
    ylims=ylims,
)

<Figure size 1152x432 with 7 Axes>

In [8]:
# savefig does not create missing directories, so make sure the output folder
# exists when the notebook is run from a fresh checkout.
os.makedirs('figures', exist_ok=True)
dir_fig.savefig('figures/ausnl-preprocessing.pdf')

In [9]:
def select_1d_max_stat_sig(display_names, dfs, start, increment, name, base_qry, base_df, path, metrics=None):
    """Pick the best parameter value per (index, metric) and flag significance.

    For every index and every metric the highest score over the parameter
    sweep is selected. Unless the winning parameter value is 0, the per-query
    results of the winning run are loaded and compared with ``base_qry``
    using a paired t-test; the score is then suffixed with ``$^{**}$``
    (p < 0.01) or ``$^{*}$`` (p < 0.05).

    Args:
        display_names: One name per entry of ``dfs``. It labels the result
            and is also substituted for ``{0}`` in ``path``.
        dfs: ``dfs[i][j]`` holds the scores of index ``i`` at parameter value
            ``start + j * increment``, indexed by raw metric name.
        start: Parameter value of the first run in each sweep.
        increment: Step between consecutive runs of a sweep.
        name: Row label under which the winning parameter value is reported.
        base_qry: Per-query baseline results, addressable by raw metric name.
        base_df: Unused; kept so existing calls keep working.
        path: Run-file template with ``{0}`` (index name) and ``{1}``
            (parameter value) placeholders.
        metrics: Mapping from raw metric name to display label. Metrics that
            are not in the mapping are ignored. When ``None``, every metric
            found in ``dfs`` is reported under its own name.

    Returns:
        A DataFrame indexed by metric label (in the order of ``metrics``)
        whose columns are ``(display name, '-')`` for the formatted best
        score and ``(display name, name)`` for the winning parameter value.
    """
    if metrics is None:
        # The original default crashed on `m not in None`; fall back to an
        # identity mapping over every metric present in the runs.
        metrics = {}
        for runs in dfs:
            for run in runs:
                for m in run.index:
                    metrics.setdefault(m, m)

    measure_max = {}
    for label, runs in zip(display_names, dfs):
        for step, run in enumerate(runs):
            param = '{0:.2f}'.format(step * increment + start)
            for m in run.index:
                if m not in metrics:
                    continue
                key = (label, metrics[m])
                val = run[m]
                # Strict '<' keeps the earliest parameter value on ties.
                if key not in measure_max or measure_max[key]['-'] < val:
                    measure_max[key] = {'-': val, name: param}

    back_metric = {v: k for k, v in metrics.items()}
    for (label, metric_label), best in measure_max.items():
        # This metric is left untested and its score stays an unformatted number.
        if metric_label == 'Unjudged@20':
            continue
        score = '{0:.4f}'.format(best['-'])
        param = float(best[name])
        # A winning parameter value of 0 is reported without a significance
        # test (presumably because it coincides with the baseline — confirm).
        if param != 0.00:
            comp = load_dfs(config.AUS_QREL_PATH, config.AUS_REL_LEVEL, '', [path.format(label, param)], per_query=True)[0]
            raw_metric = back_metric[metric_label]
            p = stats.ttest_rel(base_qry[raw_metric], comp[raw_metric]).pvalue
            if p < 0.01:
                score += '$^{**}$'
            elif p < 0.05:
                score += '$^{*}$'
        best['-'] = score

    max_df = pd.DataFrame.from_dict(measure_max).stack().unstack(level=0)
    return max_df.reindex(list(metrics.values()))

In [10]:
# Baseline run: the unprocessed 'auspdfs' index at mu = 300, loaded once per
# query (input to the significance tests) and once with per_query left at its default.
mu = 300
_baseline_args = (
    ['auspdfs'],
    qrel_paths,
    dir_path,
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu,
    mu,
    50.0,
)
base_qry = load_1d_dfs(*_baseline_args, per_query=True)[0][0]
base_df = load_1d_dfs(*_baseline_args)[0][0]

In [11]:
# Best mu per (index, metric) with significance markers against the baseline.
# index_names (not display_names) are passed because the names are substituted
# into the run-file template. The grid is taken from mu_start / mu_increment so
# it always matches the sweep dir_dfs was loaded with, and the row label is a
# raw string because '\m' is not a valid escape sequence.
len_max = select_1d_max_stat_sig(
    index_names,
    dir_dfs,
    mu_start,
    mu_increment,
    r'$\mu$',
    base_qry,
    base_df,
    os.path.join(dir_path, 'case-topics-{0}-unigram_dir_mu_{1:.2f}.run'),
    metrics,
).T

In [12]:
# Emit the table as LaTeX; escape=False keeps the embedded math markup intact.
_len_max_latex = (
    len_max
    .drop(['Unjudged@20'], axis='columns')
    .to_latex(escape=False)
)
print(_len_max_latex)

\begin{tabular}{llllllll}
\toprule
               &   &       RR &   ERR@20 &           R@20 &          R@100 &           NDCG &           RBP \\
\midrule
auspdfs & $\mu$ &   900.00 &  3000.00 &        2950.00 &        3000.00 &        3000.00 &       2950.00 \\
               & - &   0.5905 &   0.3683 &  0.4592$^{**}$ &  0.6946$^{**}$ &  0.4616$^{**}$ &  0.3260$^{*}$ \\
filtered-hyphen & $\mu$ &  1200.00 &  1200.00 &        2550.00 &        1250.00 &        1450.00 &       1500.00 \\
               & - &   0.5828 &   0.3811 &  0.4591$^{**}$ &  0.7040$^{**}$ &  0.4612$^{**}$ &        0.3156 \\
filtered-phrasestop & $\mu$ &   600.00 &  1050.00 &        2900.00 &        1450.00 &        3000.00 &       2950.00 \\
               & - &   0.5718 &   0.3794 &  0.4730$^{**}$ &  0.7320$^{**}$ &  0.4691$^{**}$ &        0.3169 \\
filtered-stop & $\mu$ &  1250.00 &  1250.00 &        2000.00 &        1400.00 &        1450.00 &       1450.00 \\
               & - &   0.5886 &   0.3818 &  0.4609$^{*

## Stemming and stopwords

In [13]:
# Same mu sweep for the stop-word / stemming index variants.
# NOTE(review): qrel_paths and rel_levels were sized for the five entries of
# index_names while four indexes are passed here — confirm load_1d_dfs
# tolerates the extra entry.
stopword_dfs = load_1d_dfs(
    ['nostop', 'allstop', 'smallstop', 'stemmed-smallstop'],
    qrel_paths,
    os.path.join(BASE_DIR, 'dirichlet_prior'),
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu_start,
    mu_end,
    mu_increment,
)

In [14]:
# Tuning curves for the stop-word variants; the labels are positional
# counterparts of the index names used to load stopword_dfs.
dir2_fig = plot_tune_1d_comp(
    ['none', 'top', 'manual', 'manual-stemmed'],
    metrics,
    stopword_dfs,
    mu_start,
    mu_end,
    mu_increment,
    0.985,
    0.495,
)

<Figure size 1152x432 with 7 Axes>

In [15]:
# savefig does not create missing directories, so make sure the output folder
# exists when the notebook is run from a fresh checkout.
os.makedirs('figures', exist_ok=True)
dir2_fig.savefig('figures/ausnl-stopwords.pdf')

In [16]:
# Baseline for the stop-word comparison: the 'nostop' index at mu = 300.
mu = 300
_nostop_args = (
    ['nostop'],
    qrel_paths,
    os.path.join(BASE_DIR, 'dirichlet_prior'),
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu,
    mu,
    50.0,
)
base_qry = load_1d_dfs(*_nostop_args, per_query=True)[0][0]
base_df = load_1d_dfs(*_nostop_args)[0][0]

# Best mu per (index, metric) with significance markers. The grid comes from
# mu_start / mu_increment so it matches the sweep stopword_dfs was loaded
# with, and the row label is a raw string because '\m' is not a valid escape
# sequence.
len_max = select_1d_max_stat_sig(
    ['nostop', 'allstop', 'smallstop', 'stemmed-smallstop'],
    stopword_dfs,
    mu_start,
    mu_increment,
    r'$\mu$',
    base_qry,
    base_df,
    os.path.join(BASE_DIR, 'dirichlet_prior', 'case-topics-{0}-unigram_dir_mu_{1:.2f}.run'),
    metrics,
).T

In [17]:
# Show the stop-word table: per index, the best mu and the (marked) best score.
len_max

Unnamed: 0,Unnamed: 1,RR,ERR@20,R@20,R@100,NDCG,RBP,Unjudged@20
allstop,$\mu$,400.0,950.0,2800.00,2950.00,1500.00,2350.00,300.0
allstop,-,0.5895,0.3695,0.4716$^{**}$,0.6982$^{**}$,0.4580$^{**}$,0.3203$^{**}$,2.75789
nostop,$\mu$,1350.0,2850.0,2500.00,2850.00,3000.00,3000.00,300.0
nostop,-,0.5913,0.3733,0.4664$^{**}$,0.6985$^{**}$,0.4582$^{**}$,0.3154$^{**}$,3.86316
smallstop,$\mu$,600.0,1650.0,2200.00,3000.00,3000.00,2850.00,300.0
smallstop,-,0.5915,0.3711,0.4841$^{**}$,0.7013$^{**}$,0.4620$^{**}$,0.3217$^{**}$,3.03158
stemmed-smallstop,$\mu$,1600.0,1600.0,2350.00,2900.00,2900.00,2850.00,300.0
stemmed-smallstop,-,0.5831,0.3996,0.4529$^{**}$,0.6940,0.4735$^{**}$,0.3185$^{**}$,5.30526


In [18]:
# Metrics shown in the box plot: everything except recall@100 and unjudged@20.
om = copy.deepcopy(config.METRIC_NAMES)
del om['recall_100']
del om['unjudged@20']

_stop_dir = os.path.join(BASE_DIR, 'dirichlet_prior')
_run_template = 'case-topics-{0}-unigram_dir_mu_{1:.2f}.run'

# Both operands must be per-query results for the difference to be a paired,
# query-by-query comparison. The original loaded the stemmed run without
# per_query=True, which subtracted each query's score from an aggregate value.
stemmed_qry = load_1d_dfs(['stemmed-smallstop'], qrel_paths, _stop_dir, _run_template, rel_levels, 1600.0, 1600.0, 50.0, per_query=True)[0][0]
manual_qry = load_1d_dfs(['smallstop'], qrel_paths, _stop_dir, _run_template, rel_levels, 1650, 1650, 50.0, per_query=True)[0][0]
qry_comp_df = stemmed_qry - manual_qry

# Box plot of the per-query differences with a dashed reference line at zero.
qry_comp_ax = qry_comp_df[list(om)].rename(metrics, axis='columns').plot.box(
    fontsize=15,
    boxprops=dict(linestyle='-', linewidth=2),
    medianprops=dict(linestyle='-', linewidth=2),
    color=dict(boxes='black', whiskers='black', medians='b', caps='r'),
    figsize=(16, 4),
)
# qry_comp_fig keeps its original binding: the return value of axhline.
qry_comp_fig = qry_comp_ax.axhline(y=0, xmin=0.0, xmax=1.0, linestyle='--', linewidth=1.0, color='grey')

  return array(a, dtype, copy=False, order=order)


<Figure size 1152x288 with 1 Axes>

In [19]:
# Fold definitions passed to cross_validation below; the file path is relative
# to the notebook's working directory.
tt_folds = read_folds('ausnl-folds.txt')

In [20]:
# Reload the stop-word sweep with per-query results for cross-validation.
# This rebinds stopword_dfs, replacing the results loaded earlier.
stopword_dfs = load_1d_dfs(
    ['nostop', 'allstop', 'smallstop', 'stemmed-smallstop'],
    qrel_paths,
    os.path.join(BASE_DIR, 'dirichlet_prior'),
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu_start,
    mu_end,
    mu_increment,
    per_query=True,
)

In [21]:
# Baseline for the stop-word cross-validation: the 'nostop' index at mu = 300.
mu = 300
_nostop_args = (
    ['nostop'],
    qrel_paths,
    os.path.join(BASE_DIR, 'dirichlet_prior'),
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu,
    mu,
    50.0,
)
base_qry = load_1d_dfs(*_nostop_args, per_query=True)[0][0]
base_df = load_1d_dfs(*_nostop_args)[0][0]

In [22]:
# One row per stop-word variant; the columns are the keys of `metrics`.
stop_df = pd.DataFrame(columns=metrics)

# The labels are paired by position with the index names used to load
# stopword_dfs.
for ab, runs in zip(['none', 'top', 'manual', 'manual-stemmed'], stopword_dfs):
    cross = cross_validation(runs, tt_folds, metrics, base_qry)
    # Only the first element of the cross_validation result becomes the row.
    stop_df.loc[ab] = cross[0]

In [23]:
# Bold the best entries, switch to display labels, drop the unjudged column
# and write the result out as LaTeX.
write_table(
    'tables/ausnl-stopwords',
    bold_max(stop_df)
    .rename(columns=metrics)
    .drop('Unjudged@20', axis='columns')
    .to_latex(escape=False),
)

In [24]:
# Reload the preprocessing sweep with per-query results for cross-validation.
# This rebinds dir_dfs, replacing the results loaded earlier.
dir_dfs = load_1d_dfs(
    index_names,
    qrel_paths,
    dir_path,
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu_start,
    mu_end,
    mu_increment,
    per_query=True,
)

In [25]:
# Baseline for the preprocessing cross-validation: 'auspdfs' at mu = 300.
mu = 300
_baseline_args = (
    ['auspdfs'],
    qrel_paths,
    dir_path,
    'case-topics-{0}-unigram_dir_mu_{1:.2f}.run',
    rel_levels,
    mu,
    mu,
    50.0,
)
base_qry = load_1d_dfs(*_baseline_args, per_query=True)[0][0]
base_df = load_1d_dfs(*_baseline_args)[0][0]

In [26]:
# One row per preprocessing variant; the columns are the keys of `metrics`.
# NOTE(review): this rebinds `stop_df`, which previously held the stop-word
# table.
stop_df = pd.DataFrame(columns=metrics)

for ab, runs in zip(display_names, dir_dfs):
    cross = cross_validation(runs, tt_folds, metrics, base_qry)
    # Only the first element of the cross_validation result becomes the row.
    stop_df.loc[ab] = cross[0]

In [27]:
# Bold the best entries, switch to display labels, drop the unjudged column
# and write the result out as LaTeX.
write_table(
    'tables/ausnl-preprocessing',
    bold_max(stop_df)
    .rename(columns=metrics)
    .drop('Unjudged@20', axis='columns')
    .to_latex(escape=False),
)