# Link text based scoring methods

In [None]:
%matplotlib inline

import sys 
import os 
import numpy as np

nb_dir = os.getcwd()
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

from plotlib.loaders import *
from plotlib.plotters import *

from phdconf import config 
from phdconf.config import *

In [None]:
# Load the topics from config.AUS_TOPIC_PATH and split them into the two
# groups returned by load_query_types.
# NOTE(review): `broad` and `specific` are not referenced again in the cells
# visible here — confirm they are still needed.
queries = load_queries(config.AUS_TOPIC_PATH)
broad, specific = load_query_types(queries)

In [None]:
# Directory containing the '*-linktext-{variant}.run' files that the cells
# below interpolate with the baseline run.
link_dir = os.path.join(BASE_DIR, 'anchor-text', 'dirichlet_prior')

In [None]:
# qrel_paths / rel_levels are sized to match index_names and are passed to
# load_1d_dfs when the baseline is loaded.
index_names = ['case-topics']
qrel_paths = [config.AUS_QREL_PATH] * len(index_names)
rel_levels = [config.AUS_REL_LEVEL] * len(index_names)
# display_names[0] labels the baseline; display_names[1:] are the link-text
# variants substituted into the '-linktext-{0}.run' file names.
display_names = ['base', 'indegree', 'sum-indegree', 'outdegree', 'sum-outdegree', 'comb', 'comb-sum']

# NOTE(review): `topics` is not referenced in the cells visible here.
topics = 'case-topics'

# Index of the last interpolation step that is plotted: the plot cell slices
# each series with [:to+1] and uses to/100 as the upper lambda bound.
to = 19

In [None]:
# Baseline run at a single fixed mu (passed as both the 5th and 6th positional
# range arguments of load_1d_dfs, with 1 as the following argument).
# presumably these are start / end / step of a mu sweep — TODO confirm.
mu = 1050
base_run_dir = os.path.join(BASE_DIR, 'preprocessing', 'dirichlet_prior')
base_run_pattern = 'case-topics-{0}-unigram_dir_mu_{1:.2f}.run'

# First element of the first result list for the default load_1d_dfs call.
base_df = load_1d_dfs(
    ['filtered-phrasestop'], qrel_paths, base_run_dir, base_run_pattern,
    rel_levels, mu, mu, 1,
)[0][0]

# Same run, loaded with per_query=True.
base_qry = load_1d_dfs(
    ['filtered-phrasestop'], qrel_paths, base_run_dir, base_run_pattern,
    rel_levels, mu, mu, 1, per_query=True,
)[0][0]

In [None]:
# Interpolater built on the mu=1050 baseline run, with normalize=True.
inter = Interpolater(os.path.join(BASE_DIR, 'preprocessing', 'dirichlet_prior', 'case-topics-filtered-phrasestop-unigram_dir_mu_1050.00.run'), normalize=True)

# dfs[k][s] holds the evaluation of link-text variant display_names[1:][k]
# at lambda step s (lambda = 0.00, 0.01, ..., 0.99).
dfs = []
for variant in display_names[1:]:
    variant_run = os.path.join(link_dir, 'case-topics-filtered-phrasestop-unigram_dir_mu_1050.00-linktext-{0}.run'.format(variant))
    per_lambda = []
    for weight in np.arange(0, 1.0, 0.01):
        # 'tmp.run' is handed to interpolate and then read back by load_dfs,
        # so every step reuses the same file name in the working directory.
        inter.interpolate(variant_run, weight, 'tmp.run')
        evaluated = load_dfs(config.AUS_QREL_PATH, config.AUS_REL_LEVEL, '', ['tmp.run'])
        per_lambda.append(evaluated[0])
    dfs.append(per_lambda)

In [None]:
# Tuning curves for the first to+1 lambda steps; the baseline is repeated so
# that it has one entry per plotted step like every other series.
plot_labels = ['base', 'inlink', 'weight-inlink', 'outlink', 'weight-outlink', 'comb', 'weight-comb']
baseline_series = [base_df] * (to + 1)
plotted_series = [baseline_series] + [variant_dfs[:to + 1] for variant_dfs in dfs]
link_text_fig = plot_tune_1d_comp(
    plot_labels, RERANK_METRICS, plotted_series,
    0.00, (to)/100, 0.01,
    legend_x=0.995, ylims=RERANK_YLIMS, styles=['--'],
)

In [None]:
# link_text_fig.savefig('figures/ausnl-linktext-interp.pdf')

In [None]:
# Select the best interpolation setting for each link-text variant and
# transpose the result (presumably one entry per metric — TODO confirm
# against select_1d_max_with_interp).
# The lambda label is a raw string: '\l' is not a recognised escape sequence,
# so the value is unchanged but the invalid-escape warning goes away.
link_max = select_1d_max_with_interp(
    display_names[1:], dfs, 0.0, 0.01, r'$\lambda$', inter, base_qry, base_df, 1050,
    os.path.join(link_dir, 'case-topics-filtered-phrasestop-unigram_dir_mu_1050.00-linktext-{0}.run'),
    config.AUS_QREL_PATH, config.AUS_REL_LEVEL, metrics=RERANK_METRICS,
).T

In [None]:
# Print the table as LaTeX without the 'Unjudged@20' column; escape=False
# leaves LaTeX markup in the labels (e.g. the lambda header) unescaped.
print(link_max.drop(['Unjudged@20'], axis='columns').to_latex(escape=False))

In [None]:
# Fresh Interpolater on the same mu=1050 baseline run, with normalize=True.
inter = Interpolater(os.path.join(BASE_DIR, 'preprocessing', 'dirichlet_prior', 'case-topics-filtered-phrasestop-unigram_dir_mu_1050.00.run'), normalize=True)

names = ['iprob', 'oprob']
# Rebinds dfs: from here on it holds the sweeps for the '<name>-res.txt'
# score files under BASE_DIR/links instead of the link-text variants.
dfs = []
for score_name in names:
    score_file = os.path.join(BASE_DIR, 'links', score_name+'-res.txt')
    per_lambda = []
    for weight in np.arange(0, 1.0, 0.01):
        # Same 'tmp.run' scratch file is reused for every lambda step.
        inter.interpolate(score_file, weight, 'tmp.run')
        evaluated = load_dfs(config.AUS_QREL_PATH, config.AUS_REL_LEVEL, '', ['tmp.run'], False)
        per_lambda.append(evaluated[0])
    dfs.append(per_lambda)

In [None]:
# Same selection as for the link-text variants, applied to the
# '<name>-res.txt' score files; the transposed table is displayed below.
# The lambda label is a raw string: '\l' is not a recognised escape sequence,
# so the value is unchanged but the invalid-escape warning goes away.
cit_max = select_1d_max_with_interp(
    names, dfs, 0.0, 0.01, r'$\lambda$', inter, base_qry, base_df, 1050,
    os.path.join(BASE_DIR, 'links', '{0}-res.txt'),
    config.AUS_QREL_PATH, config.AUS_REL_LEVEL, metrics=RERANK_METRICS,
).T
cit_max

## Compare to citation effectiveness

In [None]:
# max for err@20
om = copy.copy(config.METRIC_NAMES)
del om['recall_100']
del om['unjudged@20']

text_comps =['sum-indegree', 'sum-outdegree']
link_comps = ['iprob', 'oprob']
runs = ['in', 'out']

cols = om.keys()

a = pd.DataFrame()
b = pd.DataFrame()

for i in range(len(link_comps)):
    interps = [float(x) for x in link_max.loc[text_comps[i], '$\lambda$'].values]
    for j, c in zip(interps, cols):
        inter.interpolate(os.path.join(link_dir, 'case-topics-filtered-phrasestop-unigram_dir_mu_1050.00-linktext-{0}.run'.format(text_comps[i])), j, 'tmp.run')
        a[c] = load_dfs(config.AUS_QREL_PATH, config.AUS_REL_LEVEL, '', ['tmp.run'], per_query=True)[0][c]
    
    b_interps = [float(x) for x in cit_max.loc[link_comps[i], '$\lambda$'].values]
    for j, c in zip(b_interps, cols):
        inter.interpolate(os.path.join(BASE_DIR, 'links', link_comps[i]+'-res.txt'), j, 'tmp.run')
        b[c] = load_dfs(config.AUS_QREL_PATH, config.AUS_REL_LEVEL, '', ['tmp.run'], per_query=True)[0][c]
    
    qry_comp_df = a-b
    qry_comp_fig = qry_comp_df[om.keys()].rename(RERANK_METRICS, axis='columns').plot.box(fontsize=15, boxprops=dict(linestyle='-', linewidth=2), medianprops=dict(linestyle='-', linewidth=2), color=dict(boxes='black', whiskers='black', medians='b', caps='r'), figsize=(16, 4)).axhline(y=0, xmin=0.0, xmax=1.0, linestyle='--', linewidth=1.0, color='grey')
    # qry_comp_fig.get_figure().savefig('figures/ausnl-link-{0}-qry-comp.pdf'.format(runs[i]))


In [None]:
# Fresh Interpolater on the mu=1050 baseline run, with normalize=True.
inter = Interpolater(os.path.join(BASE_DIR, 'preprocessing', 'dirichlet_prior', 'case-topics-filtered-phrasestop-unigram_dir_mu_1050.00.run'), normalize=True)

# Rebinds dfs once more: the link-text sweep again, this time loaded with
# per_query=True for the cross-validation below.
dfs = []
for variant in display_names[1:]:
    variant_run = os.path.join(link_dir, 'case-topics-filtered-phrasestop-unigram_dir_mu_1050.00-linktext-{0}.run'.format(variant))
    per_lambda = []
    for weight in np.arange(0, 1.0, 0.01):
        # Same 'tmp.run' scratch file is reused for every lambda step.
        inter.interpolate(variant_run, weight, 'tmp.run')
        evaluated = load_dfs(config.AUS_QREL_PATH, config.AUS_REL_LEVEL, '', ['tmp.run'], per_query=True)
        per_lambda.append(evaluated[0])
    dfs.append(per_lambda)

In [None]:
# Folds read from AUS_FOLDS; passed to cross_validation in the next cell.
tt_folds = read_folds(AUS_FOLDS)

In [None]:
# Result table: one row per link-text variant plus the baseline row '$R$',
# with columns taken from RERANK_METRICS.
ntlm_df = pd.DataFrame(columns=RERANK_METRICS)

variant_keys = ['indegree', 'sum-indegree', 'outdegree', 'sum-outdegree', 'comb', 'comb-sum']
for variant, variant_dfs in zip(variant_keys, dfs):
    # Only the first element of the cross_validation result is tabulated.
    ntlm_df.loc[variant] = cross_validation(variant_dfs, tt_folds, RERANK_METRICS, base_qry)[0]

ntlm_df.loc['$R$'] = base_df.round(4)

# Switch from run-file names to the labels used in the write-up, then put the
# baseline first. 'comb' is already the final label, so it is not remapped.
row_labels = {
    'indegree': 'inlink',
    'outdegree': 'outlink',
    'sum-indegree': 'weighted-inlink',
    'sum-outdegree': 'weighted-outlink',
    'comb-sum': 'weighted-comb',
}
row_order = ['$R$', 'inlink', 'weighted-inlink', 'outlink', 'weighted-outlink', 'comb', 'weighted-comb']
ntlm_df = ntlm_df.rename(index=row_labels).reindex(row_order)
# write_table('tables/ausnl-linktext', bold_max(ntlm_df).rename(columns=RERANK_METRICS).drop('Unjudged@20',axis='columns').to_latex(escape=False))

In [None]:
def read_count_file(path: str) -> dict:
    """Parse a whitespace-separated file of per-query integer counts.

    Every non-blank line holds integers: a query id followed by values that
    are consumed two at a time. Pairs at even positions (0, 2, ...) are
    collected as "in" pairs and pairs at odd positions (1, 3, ...) as "out"
    pairs. Blank lines are skipped.

    Args:
        path: Location of the count file.

    Returns:
        A dict mapping each query id to four lists, in this order: first
        members of the "in" pairs, second members of the "in" pairs, first
        members of the "out" pairs, second members of the "out" pairs.
        If a query id occurs on several lines, the last line wins.

    Raises:
        ValueError: If a token cannot be parsed as an integer.
        IndexError: If a line has an odd number of values after the id.
    """
    out = {}
    with open(path) as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # A blank line (e.g. a trailing newline) carries no data.
                continue
            q, *vals = map(int, tokens)
            pairs = [vals[i:i + 2] for i in range(0, len(vals), 2)]
            in_pairs = pairs[0::2]
            out_pairs = pairs[1::2]
            out[q] = [
                [p[0] for p in in_pairs],
                [p[1] for p in in_pairs],
                [p[0] for p in out_pairs],
                [p[1] for p in out_pairs],
            ]

    return out

# Parse BASE_DIR/anchor-text/counts.txt into a dict keyed by query id.
counts = read_count_file(os.path.join(BASE_DIR, 'anchor-text', 'counts.txt'))

In [None]:
# One row per query id, one column per list returned by read_count_file;
# each cell therefore holds a list, not a scalar.
count_df = pd.DataFrame.from_dict(counts, orient='index', columns=['inlinks', 'pin', 'outlinks', 'pout'])

In [None]:
# NOTE(review): count_df cells are lists, so a column mean may not be
# well-defined here — confirm this cell is still needed. The divisor 100 is
# not explained anywhere in this file.
count_df.mean()/100

In [None]:
# Scratch check of the chunking used in read_count_file: split xs into
# consecutive chunks of `size`, then take every second chunk starting at
# index 1 (here: [[3, 4], [7, 8]]).
xs = [1, 2, 3, 4, 5, 6, 7, 8]
size = 2
xs = [xs[i:i+size] for i in range(0, len(xs), size)]
xs[1::2]

In [None]:
# Collapse each per-query list to its sum so every cell is a scalar; the
# resulting frame is only displayed, not stored.
pd.DataFrame.from_dict({k: [sum(x) for x in v] for k, v in counts.items()}, orient='index', columns=['inlinks', 'pin', 'outlinks', 'pout'])

In [None]:
def sum_relevant(qrel_path: str, res_path: str, counts):
    """Collect count values for the judged documents of a run.

    The qrels file is read as whitespace-separated lines whose fields 0, 2
    and 3 are query id, document id and relevance label; any label other
    than '0' counts as relevant. The run file is read as whitespace-separated
    lines whose fields 0, 2 and 3 are query id, document id and rank.

    For every run line whose document is judged, the value at index ``rank``
    of each list in ``counts[query]`` is appended to the matching output
    list. Unjudged documents are skipped, as are queries that have no
    judgements at all and blank lines in either file.

    NOTE(review): the rank field is used directly as a list index — confirm
    the run files use 0-based ranks.

    Args:
        qrel_path: Path to the qrels file.
        res_path: Path to the run (results) file.
        counts: Mapping from query id to a sequence of up to four lists
            indexable by rank, e.g. the output of ``read_count_file``.

    Returns:
        A tuple ``(rel, nonrel, numrel)``: ``rel`` and ``nonrel`` are arrays
        built from four lists (one per list in ``counts[query]``) holding the
        values for relevant and non-relevant documents, and ``numrel`` is
        ``[number of relevant hits, number of non-relevant hits]``.

    Raises:
        KeyError: If a judged query is missing from ``counts``.
        IndexError: If a rank is outside a count list.
    """
    # query id -> (relevant doc ids, non-relevant doc ids)
    qrels = {}
    with open(qrel_path) as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue
            judged = qrels.setdefault(int(parts[0]), (set(), set()))
            judged[0 if parts[3] != '0' else 1].add(parts[2])

    rel = [[] for _ in range(4)]
    nonrel = [[] for _ in range(4)]
    numrel = [0, 0]
    with open(res_path) as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue
            q = int(parts[0])
            judged = qrels.get(q)
            if judged is None:
                # No judgements for this query: treat all of its documents
                # as unjudged instead of failing with KeyError.
                continue
            doc = parts[2]
            r = int(parts[3])
            if doc in judged[0]:
                bucket, slot = rel, 0
            elif doc in judged[1]:
                bucket, slot = nonrel, 1
            else:
                continue
            numrel[slot] += 1
            for i, x in enumerate(counts[q]):
                bucket[i].append(x[r])

    return np.array(rel), np.array(nonrel), numrel
                
            
    
# Collect the count values of judged documents in the 'indegree' link-text run.
# NOTE(review): this path uses mu_3000.00 while every other cell in this
# notebook uses mu_1050.00 — confirm that is intended.
rel_stats, non_rel_stats, proportion = sum_relevant(config.AUS_QREL_PATH, os.path.join(link_dir, 'case-topics-filtered-phrasestop-unigram_dir_mu_3000.00-linktext-indegree.run'), counts)

In [None]:
# Divides every collected value by the number of relevant hits.
# NOTE(review): this is element-wise, not an aggregate; if a per-column mean
# was intended it would need a sum over the hit axis first — confirm.
rel_stats/proportion[0]

In [None]:
# Divides every collected value by the number of non-relevant hits.
# NOTE(review): this is element-wise, not an aggregate; if a per-column mean
# was intended it would need a sum over the hit axis first — confirm.
non_rel_stats/proportion[1]