In [1]:
%matplotlib inline

import sys 
import os 
import numpy as np

nb_dir = os.getcwd()
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

from plotlib.loaders import *
from plotlib.plotters import *

from phdconf import config 

from sklearn.linear_model import LinearRegression 
import scipy.stats

ylims=[0.67, 0.45, 0.57, 0.51, 0.38]

In [2]:
queries = load_queries(config.AUS_TOPIC_PATH)
broad, specific = load_query_types(queries)

BASE_DIR = os.path.join(os.environ["HOME"], 'phd-generated')

In [3]:
base_df = load_1d_dfs(['filtered-phrasestop'], [config.AUS_QREL_PATH], os.path.join(BASE_DIR, 'preprocessing', 'dirichlet_prior'), 'case-topics-{0}-unigram_dir_mu_{1:.2f}.run', ['1'], 1050, 1050, 1)[0][0]
base_query = load_1d_dfs(['filtered-phrasestop'], [config.AUS_QREL_PATH], os.path.join(BASE_DIR, 'preprocessing', 'dirichlet_prior'), 'case-topics-{0}-unigram_dir_mu_{1:.2f}.run', ['1'], 1050, 1050, 1, per_query=True)[0][0]

## Generate prior files for run

In [4]:
class Linker:
    def __init__(self, path: str, id_path: str):
        self._lookup = {}
        with open(id_path) as f:
            for line in f:
                parts = line.split()
                self._lookup[parts[0].upper()] = int(parts[1])

        self._lookup_ind = ['']*len(self._lookup)
        for k, v in self._lookup.items():
            self._lookup_ind[v] = k
        
        self._inlinks = [None]*len(self._lookup)
        self._outlinks = [None]*len(self._lookup)

        with open(path) as f:
            for line in f:
                parts = list(map(int, line.split()))
                
                if self._inlinks[parts[1]] == None:
                    self._inlinks[parts[1]] = []
                self._inlinks[parts[1]].append((parts[0], parts[2]))

                if self._outlinks[parts[0]] == None:
                    self._outlinks[parts[0]] = []
                self._outlinks[parts[0]].append((parts[1], parts[2]))
                
    def ids(self):
        return self._lookup_ind
    
    def out_links(self): 
        return self._outlinks
    
    def in_links(self): 
        return self._inlinks
                
    def get_links_for_id(_id: str, vals, count:bool = False):
        ind = self._lookup.get(_id.toupper(), None)
        if ind != None:
            if count:
                _sum = 0
                for i in vals[ind]:
                    _sum += i[1]
                return _sum
            return len(vals[ind])
        return 0

linker = Linker(os.path.join(os.environ["HOME"], "go/src/cit-extract/links.txt"), os.path.join(os.environ["HOME"], "go/src/cit-extract/id-lookup.txt"))

In [5]:
doc_lens = load_doclen_lookup(os.path.join(BASE_DIR, 'filtered-phrasestop-doc_lens.txt'))

In [6]:
# get list of documents and relevance list 
def get_qrel_rel_docs(path: str): 
    out = set()
    with open(path) as f:
        for line in f:
            parts = line.strip().split()
            if int(parts[3]) > 0:
                out.add(parts[2])
                
    return out

def get_bin(v, bins): 
    for i in range(1, len(bins)): 
#         print(v, bins)
        if v < bins[i]: 
            return i-1
    return 0 

def get_rel_vals(qrels, lookup, bins):
    vals = [0.0] * len(qrels)
    for i, q in enumerate(qrels):
        vals[i] = lookup.get(q, True)
    
    bin_cnt = [0] * (len(bins)-1)
    for i, v in enumerate(vals): 
        bin_cnt[get_bin(v, bins)] +=1
    
    return bin_cnt
    
def get_buckets(data):
    print(data[:10])
    df = pd.DataFrame(data)
    bucketed, bins = pd.cut(df[0], 100, retbins=True)
    return bucketed, bins


In [7]:
buckets = 54

rel_docs = get_qrel_rel_docs(config.AUS_QREL_PATH)

in_links = linker.in_links()
out_links = linker.out_links()

inlink = [len(x) if x is not None else 0 for x in in_links]
outlink = [len(x) if x is not None else 0 for x in out_links]

sum_in = []
for x in in_links: 
    if x is not None:
        sum_in.append(sum([y[1] for y in x]))
    else:
        sum_in.append(0)
sum_out = []
for x in out_links:
    if x is not None:
        sum_out.append(sum([y[1] for y in x]))
    else:
        sum_out.append(0)

rel = [True if x in rel_docs else False for x in linker.ids()]

labels = [x for x in range(1, buckets+1)]

len_df = pd.DataFrame({'id': [x.upper() for x in doc_lens.keys()], 'lens': list(doc_lens.values())}).set_index(['id'])

In [8]:
df = pd.DataFrame(data={'id': linker.ids(), 'inlink': inlink, 'outlink': outlink, 'rel': rel}).set_index(['id'])
df = pd.merge(df, len_df, how='outer', left_index=True, right_index=True)
df.drop('CITATION', inplace=True)
df.drop('', inplace=True)
df['present'] = [True if x in doc_lens else False for x in df.index]
# # normalize
df.sort_values(inplace=True, by='inlink')
df['ibins'], ibins = pd.qcut(df['inlink'], buckets, retbins=True, duplicates='drop')
df.sort_values(inplace=True, by='outlink')
df['obins'], obins = pd.qcut(df['outlink'], buckets, retbins=True, duplicates='drop')
df.sort_values(inplace=True, by='lens')
df['lbins'], lbins = pd.qcut(df[df['present'] == 1.0]['lens'], buckets, retbins=True)
# df=(df-df.min())/(df.max()-df.min())

In [9]:
# print(len(df.lens.dropna()))
# print(df['inlink'].sort_values().rank(method='first').head(20))
# df[df['rel'] == True]['inlink'].sort_values().tail(20)
# print(df['ibins'].fillna(pd.Interval(-0.001, 1.0, closed='right')).apply(bucket_df['prob']))
# print(df['inlink'].sort_values().head(20))
# print(df['outlink'].sort_values().head(20))
# print(len(df.inlink.dropna()))
# print(len(df['lbins'].dropna()))

In [10]:
# print(len_df.loc['2015QCA244'])
# print(df.loc['2015QCA244'])
# print(df.loc['2010FCA1'])
# print(df.loc['2010FCA2'])
df[df['inlink'] > 800]

Unnamed: 0_level_0,inlink,outlink,rel,lens,present,ibins,obins,lbins
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
193655CLR499,824.0,0.0,False,,False,"(17.0, 824.0]","(-0.001, 1.0]",


In [11]:
# _, ibins = pd.qcut(df['inlink'].rank(method='first'), buckets, retbins=True)
# _, obins = pd.qcut(df['outlink'].rank(method='first'), buckets, retbins=True)
# _, lbins = pd.qcut(df[df['present'] == True]['lens'].sort_values().rank(method='first'), buckets, retbins=True)
rel_i = get_rel_vals(rel_docs, df['inlink'].to_dict(), ibins)
rel_o = get_rel_vals(rel_docs, df['outlink'].to_dict(), obins)
rel_l = get_rel_vals(rel_docs, df['lens'].to_dict(), lbins)


In [12]:
cols = ['ibins', 'obins', 'lbins']
orig_cols = ['inlink', 'outlink', 'lens']
for col, ocol, bins, rel in zip(cols, orig_cols, [ibins, obins, lbins], [rel_i, rel_o, rel_l]):
    bucket_df = df[col].value_counts(sort=False).to_frame()
#     bucket_df = df['lbins'].value_counts(sort=False).to_frame()
    bucket_df['rel'] = rel
    fig = plt.figure() 
    fig.set_size_inches(16, 6)
    ax = fig.add_subplot(111)

    bucket_df['prob'] =  bucket_df['rel']/bucket_df[col]
    print([int(x) for x in bucket_df.index.categories.left.tolist()])
    
#    print('prob', bucket_df.loc[bucket_df.index[0]]['prob'])

    ### determine prob for each 
    to_val = 0.99 * bucket_df['rel'].sum()
#    print(to_val)
    s = 0
    stop_bucket = 0
    for v in bucket_df['rel']: 
        s += v
        stop_bucket += 1
        if s >= to_val:
            break

    if stop_bucket < 2: 
        stop_bucket = 2
        
#  print(stop_bucket)

#     def func(x, a, b):
#         return a * np.exp(-b * x)

    pre_bins = [x+bins[1]/2 for x in bins[:stop_bucket]]
#     popt, pcov = scipy.optimize.curve_fit(func, pre_bins, bucket_df['prob'][:stop_bucket].values)
#     print(bucket_df['prob'][:stop_bucket].values)
#     print(popt, pcov)

#     fitted = func(np.array(pre_bins), *popt)
#     fitted = np.concatenate((fitted, [fitted[-1]] * (len(bins) - len(fitted)-1)))


    poly = np.poly1d(np.polyfit(pre_bins, bucket_df['prob'][:stop_bucket].values, 2))
    poly_vals = [poly(x) for x in pre_bins]
    poly_vals += [poly_vals[-1]] * (len(bins) - len(poly_vals) -1)
    bucket_df['poly'] = poly_vals

    ax.plot([x for x in range(0, len(bins)-1)], poly_vals)

    sns.scatterplot([x for x in range(len(bins)-1)], bucket_df['prob'], alpha=0.8, ax=ax, markers=['X'], style=0, color='black', s=40)
    ticks = [str(int(x)) for x in bucket_df.index.categories.left.tolist()]
    ax.set_xticks(range(len(ticks)))
    ax.set_xticklabels(ticks, rotation=270)
    ax.set_ylabel('P(R|B)', size=20)
    ax.tick_params(labelsize=15)
    ax.get_legend().remove()
    
    df[col[:1]+'prob'] = df[col].fillna(bucket_df.index[0]).apply(bucket_df['prob'])
    df[col[:1]+'poly'] = df[col].fillna(bucket_df.index[0]).apply(bucket_df['poly'])
    fig.savefig('figures/aus-' + ocol+'-prob-rel.pdf')


[0, 1, 2, 3, 4, 5, 6, 8, 10, 17]
[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 12, 15, 19, 26]
[0, 45, 106, 156, 200, 244, 283, 322, 360, 401, 438, 477, 514, 556, 596, 637, 681, 724, 769, 816, 861, 911, 962, 1019, 1072, 1130, 1192, 1256, 1323, 1390, 1462, 1535, 1616, 1700, 1792, 1886, 1989, 2100, 2217, 2339, 2473, 2616, 2774, 2955, 3173, 3398, 3676, 3987, 4370, 4857, 5512, 6458, 7957, 11199]


<Figure size 1152x432 with 1 Axes>

<Figure size 1152x432 with 1 Axes>

<Figure size 1152x432 with 1 Axes>

In [13]:
def save_res_file(out_path: str, results):
    with open(out_path, 'w') as f:
        for q, q_res in sorted(results.items(), key=lambda x: x[0]):
            q_res = sorted(zip(q_res[0], q_res[1]), key=lambda x: x[1], reverse=True)
            cut = len(q_res)
            if cut > CUTOFF:
                cut = CUTOFF
            for i, res in enumerate(q_res[:cut]):
                f.write('{0} Q0 {1} {2} {3} t\n'.format(q, res[0], i, res[1]))

def create_res_file(path: str, df: pd.DataFrame):
    cols = ['inlink', 'outlink', 'lens', 'iprob', 'oprob', 'lprob', 'ipoly', 'opoly', 'lpoly']
    results = []
    for c in cols:
        results.append({})
    
    with open(path) as f:
        for line in f:
            parts = line.split()
            q = int(parts[0])
            doc = parts[2].upper()
            row = df.loc[doc]
            for i, c in enumerate(cols):
                vals = results[i].get(q, [{}, []])
                vals[0][doc] = len(vals[1])
                vals[1].append(row[c])
                results[i][q] = vals

    for r, c in zip(results, cols):
        save_res_file(os.path.join(BASE_DIR, 'links', '{0}-res.txt'.format(c)), r)
   
    
create_res_file(os.path.join(BASE_DIR, 'preprocessing', 'dirichlet_prior', 'case-topics-filtered-phrasestop-unigram_dir_mu_1050.00.run'), df)

## Eval

In [14]:
inter = Interpolater(os.path.join(BASE_DIR, 'preprocessing', 'dirichlet_prior', 'case-topics-filtered-phrasestop-unigram_dir_mu_1050.00.run'), normalize=True)

names = ['inlink', 'outlink', 'lens', 'iprob', 'oprob', 'lprob', 'ipoly', 'opoly', 'lpoly']
dfs = []
for d in names: 
    interped_dfs = []
    for _lambda in np.arange(0, 1.0, 0.01):
        inter.interpolate(os.path.join(BASE_DIR, 'links', d+'-res.txt'), _lambda, 'tmp.run')
        interped_dfs.append(load_dfs(config.AUS_QREL_PATH, config.AUS_REL_LEVEL, '', ['tmp.run'], False)[0])
    dfs.append(interped_dfs)

In [6]:
to = 30
metrics = copy.copy(config.METRIC_NAMES)
del metrics['recall_100']
# link_text_fig = plot_tune_1d_comp(['base']+names, config.METRIC_NAMES, 
#                     [[base_df for x in range(to)]] + [x[:to] for x in dfs], 0.00, (to-1)/100, 0.01)

In [16]:
len_fig = plot_tune_1d_comp(['base', 'len', 'prob', 'poly'], metrics, 
                   [[base_df for x in range(to)]] + [x[:to] for x in [x for x in dfs[2::3]]], 0.00, (to-1)/100, 0.01, legend_x=0.92, styles=['--'], ylims=ylims)

len_fig.savefig('figures/ausnl-len-interp.pdf')

<Figure size 1152x432 with 6 Axes>

In [17]:
link_fig = plot_tune_1d_comp(['base']+[x for i, x in enumerate(names[::]) if i % 3 != 2], metrics, 
                    [[base_df for x in range(to)]] + [x[:to] for x in [x for i, x in enumerate(dfs[::]) if i % 3 != 2]], 0.00, (to-1)/100, 0.01, legend_x = 0.92, styles=['--'], ylims=ylims)

link_fig.savefig('figures/ausnl-citation-interp.pdf')

<Figure size 1152x432 with 6 Axes>

In [18]:
base_qry = load_1d_dfs(['filtered-phrasestop'], [config.AUS_QREL_PATH], os.path.join(BASE_DIR, 'preprocessing', 'dirichlet_prior'), 'case-topics-{0}-unigram_dir_mu_{1:.2f}.run', ['1'], 1050, 1050, 1, per_query=True)[0][0]

In [19]:
def select_1d_max_with_interp(display_names, metric_names, dfs, start, increment, name, interp, base_qry, base_df, path, metrics=None):
    measure_max = {}
    for i in range(len(display_names)):
        for j in range(len(dfs[i])):
            for m in dfs[i][j].index:
                if m not in metrics: 
                    continue 
                val = dfs[i][j][m]-base_df[m]
                if (display_names[i], metrics[m]) not in measure_max: 
                    measure_max[(display_names[i], metrics[m])] = {'-': val, name: '{0:.2f}'.format(j*increment+start)}
                else: 
                    if measure_max[(display_names[i], metrics[m])]['-'] < val:
                        measure_max[(display_names[i], metrics[m])] = {'-': val, name: '{0:.2f}'.format(j*increment+start)}

    back_metric = {v: k for k, v in metrics.items()}
    for k, v in measure_max.items():
        if k[1] == 'Unjudged@20':
            continue
        _l = float(v[name])
        if _l == 0.00:
            v['-'] = '{0:.4f}'.format(v['-'])
        else:
            interp.interpolate(path.format(k[0]), _l, 'tmp.run')
            comp = load_dfs(config.AUS_QREL_PATH, config.AUS_REL_LEVEL, '', ['tmp.run'], per_query=True)[0]
            p = stats.ttest_rel(base_qry[back_metric[k[1]]], comp[back_metric[k[1]]]).pvalue
            if p < 0.01:
                v['-'] = '{0:.4f}'.format(v['-'])+'$^{**}$'
            elif p < 0.05:
                v['-'] = '{0:.4f}'.format(v['-'])+'$^{*}$'
            else:
                v['-'] = '{0:.4f}'.format(v['-'])
        
    max_df = pd.DataFrame.from_dict(measure_max).stack().unstack(level=0)
    return max_df.reindex(list(metrics.values()))



In [20]:
len_max = select_1d_max_with_interp([x for x in names[2::3]], metrics.keys(), [x for x in dfs[2::3]], 0.0, 0.01, '$\lambda$', inter, base_qry, base_df, os.path.join(BASE_DIR, 'links', '{0}-res.txt'), metrics=metrics).T

In [21]:
print(len_max.drop(['Unjudged@20'], axis='columns').to_latex(escape=False))

\begin{tabular}{lllllll}
\toprule
      &   &      RR &  ERR@20 &    R@20 &    NDCG &     RBP \\
\midrule
lens & $\lambda$ &    0.08 &    0.14 &    0.14 &    0.08 &    0.09 \\
      & - &  0.0044 &  0.0017 &  0.0130 &  0.0059 &  0.0056 \\
lpoly & $\lambda$ &    0.17 &    0.21 &    0.14 &    0.17 &    0.15 \\
      & - &  0.0235 &  0.0117 &  0.0153 &  0.0142 &  0.0138 \\
lprob & $\lambda$ &    0.19 &    0.16 &    0.13 &    0.16 &    0.15 \\
      & - &  0.0202 &  0.0044 &  0.0287 &  0.0152 &  0.0171 \\
\bottomrule
\end{tabular}



In [22]:
link_max = select_1d_max_with_interp([x for i, x in enumerate(names[::]) if i % 3 != 2], metrics.keys(), [x for i, x in enumerate(dfs[::]) if i % 3 != 2], 0.0, 0.01, '$\lambda$', inter, base_qry, base_df, os.path.join(BASE_DIR, 'links', '{0}-res.txt'), metrics=metrics).T
print(link_max.drop(['Unjudged@20'], axis='columns').to_latex(escape=False))

\begin{tabular}{lllllll}
\toprule
        &   &      RR &  ERR@20 &    R@20 &          NDCG &     RBP \\
\midrule
inlink & $\lambda$ &    0.00 &    0.00 &    0.00 &          0.00 &    0.00 \\
        & - &  0.0000 &  0.0000 &  0.0000 &        0.0000 &  0.0000 \\
ipoly & $\lambda$ &    0.16 &    0.00 &    0.04 &          0.00 &    0.08 \\
        & - &  0.0011 &  0.0000 &  0.0027 &        0.0000 &  0.0008 \\
iprob & $\lambda$ &    0.06 &    0.06 &    0.01 &          0.00 &    0.06 \\
        & - &  0.0082 &  0.0000 &  0.0010 &        0.0000 &  0.0016 \\
opoly & $\lambda$ &    0.13 &    0.12 &    0.09 &          0.12 &    0.12 \\
        & - &  0.0280 &  0.0305 &  0.0118 &  0.0227$^{*}$ &  0.0128 \\
oprob & $\lambda$ &    0.12 &    0.17 &    0.09 &          0.11 &    0.11 \\
        & - &  0.0217 &  0.0283 &  0.0085 &  0.0188$^{*}$ &  0.0142 \\
outlink & $\lambda$ &    0.00 &    0.00 &    0.00 &          0.00 &    0.00 \\
        & - &  0.0000 &  0.0000 &  0.0000 &        0.0000 &  0.000

In [23]:
inter = Interpolater(os.path.join(BASE_DIR, 'preprocessing', 'dirichlet_prior', 'case-topics-filtered-phrasestop-unigram_dir_mu_1050.00.run'), normalize=True)

names = ['inlink', 'outlink', 'lens', 'iprob', 'oprob', 'lprob', 'ipoly', 'opoly', 'lpoly']
dfs = []
for d in names: 
    interped_dfs = []
    for _lambda in np.arange(0, 1.0, 0.01):
        inter.interpolate(os.path.join(BASE_DIR, 'links', d+'-res.txt'), _lambda, 'tmp.run')
        interped_dfs.append(load_dfs(config.AUS_QREL_PATH, config.AUS_REL_LEVEL, '', ['tmp.run'], per_query=True)[0])
    dfs.append(interped_dfs)

In [24]:
tt_folds = read_folds('ausnl-folds.txt')

In [25]:
metrics = copy.copy(config.METRIC_NAMES)
del metrics['recall_100']

ntlm_df = pd.DataFrame(columns=metrics)

for ab, runs in zip(names, dfs):
    ntlm_cross = cross_validation(runs, tt_folds, metrics, base_qry)
    ntlm_df.loc[ab] = ntlm_cross[0]

In [26]:
ntlm_df.loc['$R$'] = base_query.mean().round(4)
link_df = ntlm_df.reindex(['$R$', 'inlink', 'iprob', 'ipoly', 'outlink', 'oprob', 'opoly'])
write_table('tables/ausnl-link-prior', bold_max(link_df).rename(columns=metrics).drop(['Unjudged@20'],axis='columns').to_latex(escape=False))

In [27]:
len_df = ntlm_df.reindex(['$R$', 'lens', 'lprob', 'lpoly'])
write_table('tables/ausnl-len-prior', bold_max(len_df).rename(columns=metrics).drop(['Unjudged@20'],axis='columns').to_latex(escape=False))

In [28]:
om = copy.copy(config.METRIC_NAMES)
del om['recall_100']
del om['unjudged@20']

for r in ['lens', 'lprob']:
    inter.interpolate(os.path.join(BASE_DIR, 'links', r+'-res.txt'), 0.14, 'tmp.run')
    b = load_dfs(config.AUS_QREL_PATH, config.AUS_REL_LEVEL, '', ['tmp.run'], per_query=True)[0]
    qry_comp_df = b-base_qry
    qry_comp_fig = qry_comp_df[om.keys()].rename(metrics, axis='columns').plot.box(fontsize=15, boxprops=dict(linestyle='-', linewidth=2), medianprops=dict(linestyle='-', linewidth=2), color=dict(boxes='black', whiskers='black', medians='b', caps='r'), figsize=(16, 4)).axhline(y=0, xmin=0.0, xmax=1.0, linestyle='--', linewidth=1.0, color='grey')
    qry_comp_fig.get_figure().savefig('figures/ausnl-qry-comp-{0}.pdf'.format(r))



  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)


<Figure size 1152x288 with 1 Axes>

<Figure size 1152x288 with 1 Axes>

In [4]:
inter = Interpolater(os.path.join(BASE_DIR, 'preprocessing', 'dirichlet_prior', 'case-topics-filtered-phrasestop-unigram_dir_mu_1050.00.run'), normalize=True)

names = ['oprob']
dfs = []
for d in names: 
    interped_dfs = []
    for _lambda in np.arange(0, 1.0, 0.01):
        inter.interpolate(os.path.join(BASE_DIR, 'links', d+'-res.txt'), _lambda, 'tmp.run')
        interped_dfs.append(load_dfs(config.AUS_QREL_PATH, config.AUS_REL_LEVEL, '', ['tmp.run'], per_query=True)[0])
    dfs.append(interped_dfs)

In [7]:

def plot_diff(names, metric_names, dfs, start, end, increment, broad, narrow, legend_x: float=0.96, legend_y: float=0.46, styles=[], ylims=[]): 

    r = int(len(metric_names)/2)
    c = r
    if c == r: 
        r-=1
    if len(metric_names)%2 != 0:
        c += 1 
    fig, axs = plt.subplots(r, c)
    fig.set_size_inches(16, 6)
    x = np.arange(start, end+increment, increment)
    cnt = 0 
    row = 0
    print(len(x))
    for m in metric_names:
            for i, df in enumerate(dfs):
                s = None 
                if i < len(styles): 
                    s = styles[i]
                
                axs[row, cnt].plot(x, [y[m].mean() for y in df], linestyle=s)
                axs[row, cnt].plot(x, [y[y.index.isin(broad)][m].mean() for y in df])
                axs[row, cnt].plot(x, [y[y.index.isin(narrow)][m].mean() for y in df])

            axs[row, cnt].set_ylabel(metric_names[m],fontsize=18)

            axs[row, cnt].tick_params(labelsize=12)
            axs[row, cnt].yaxis.set_major_formatter(FormatStrFormatter('%.4f'))
            cnt += 1 
            if cnt >= c: 
                cnt = 0 
                row += 1 
                
    for i in range(len(ylims)):
        plt.gcf().get_axes()[i].set_ylim(ymax=ylims[i])
    
    if len(metric_names) % 2 != 0: 
        fig.delaxes(axs[row, -1])

    fig.legend(names + ['broad', 'specific'], bbox_to_anchor=[legend_x, legend_y], frameon=True, ncol=2, prop={"size": 15}).get_frame().set_edgecolor('black')
        
    fig.tight_layout()
    return fig

bs_plot = plot_diff(['all'], metrics, dfs, 0, 0.99, 0.01, broad, specific, styles=['--'])
bs_plot.savefig('figures/ausnl-oprob-qtype.pdf')

100


<Figure size 1152x432 with 6 Axes>

In [8]:
interped_df = dfs[0][10]

In [9]:
def count_changes(df, base, broad, specific, metrics):
    new_df = df - base 
    totals = {}
    b_df = new_df[new_df.index.isin(broad)]
    s_df = new_df[new_df.index.isin(specific)]
    for m in metrics:
        totals[m] = {}
        for l, d in zip(['b', 's'], [b_df, s_df]):
            totals[m][(l, '+')] = d[d[m] > 0][m].count()/len(d)
            totals[m][(l, '-')] = d[d[m] < 0][m].count()/len(d)
    
#     print(pd.DataFrame(totals))
    print(pd.DataFrame(totals).rename(metrics, axis='columns').drop("Unjudged@20", axis='columns').round(4))

count_changes(interped_df, dfs[0][0], broad, specific, metrics)

         RR  ERR@20    R@20    NDCG     RBP
b +  0.2400  0.4800  0.3200  0.6000  0.5200
  -  0.1200  0.3600  0.2000  0.3200  0.4000
s +  0.2143  0.4143  0.1000  0.5000  0.3857
  -  0.2429  0.3429  0.2143  0.3143  0.3857


In [10]:
def plot_per_qry(df, base, broad, metric, disp):
    cmp = df-base
    cmp['type'] = pd.Series({k: queries[k]['type'] for k in cmp.index})
    sort = cmp.sort_values(metric, ascending=True)
    mask = sort['type'] == 'broad'
    colors = np.array(['b']*len(sort))
    colors[mask.values] = 'r'
    
    fig = plt.figure() 
    ax = fig.add_subplot(111)
    fig.set_size_inches(16, 4)
    sns.barplot(x=df.index, y=metric, data=cmp, order=sort.index, ax=ax, palette=colors)
    ax.set_xticklabels([])
    ax.set_ylabel(disp, fontsize=15)
    ax.tick_params(labelsize=12)
    return fig
    

pq_diff = plot_per_qry(interped_df, base_query, queries, 'err@20', 'ERR@20')
pq_diff.savefig('figures/ausnl-oprob-qtype-err.pdf')

pq_diff = plot_per_qry(interped_df, base_query, queries, 'rbp@0.80', 'RBP')
pq_diff.savefig('figures/ausnl-oprob-qtype-ndcg.pdf')

<Figure size 1152x288 with 1 Axes>

<Figure size 1152x288 with 1 Axes>

In [11]:
def plot_len_correlation(queries, df, metrics):
    lens = {k: len(queries[k]['topic'].split()) for k in queries}
    df['lens'] = df.index.map(lens)
    
    fig = plt.figure() 
    ax = fig.add_subplot(111)
    fig.set_size_inches(16, 8)
    for m in metrics.keys():
        sns.regplot(x=df['lens'], y=df[m], ax=ax, truncate=False)
#         ax.scatter(x=df['lens'], y=df[m])
#         g, b = np.polyfit(df['lens'], df[m], 1)
#         ax.plot(df['lens'], g*df['lens']+b)
    ax.set_xlabel('Query length', fontsize=15)
    ax.set_ylabel('Increase above $R$', fontsize=15)
    ax.tick_params(labelsize=12)
    ax.legend(list(metrics.values()), frameon=True, fontsize=12).get_frame().set_edgecolor("black")
    
    return fig
    
len_corr_fig = plot_len_correlation(queries, interped_df, {'err@20': 'ERR@20', 'rbp@0.80': 'RBP'})
len_corr_fig.savefig('figures/ausnl-len-corr.pdf')

<Figure size 1152x576 with 1 Axes>

In [12]:
def correlation(queries, df, metrics):
    lens = {k: len(queries[k]['topic'].split()) for k in queries}
    df['lens'] = df.index.map(lens)
    out_df = pd.DataFrame()
    out_df['pearson'] = df[['lens', *list(metrics.keys())]].corr(method='pearson')['lens'].T
    out_df['kendall'] = df[['lens', *list(metrics.keys())]].corr(method='kendall')['lens'].T
    return out_df.drop(['lens'], axis='index').rename(config.METRIC_NAMES, axis='index').round(4).to_latex()
    
write_table('tables/link-correlation', correlation(queries, interped_df, config.METRIC_NAMES))