In [2]:
# import everything as needed
%matplotlib inline

import itertools
import numpy as np
from matplotlib import pyplot as plt
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D, axes3d
from matplotlib import cm
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import math
import csv
import re
import string
from matplotlib.artist import setp
import subprocess
import os
import copy
from matplotlib.ticker import FormatStrFormatter

from IPython.display import set_matplotlib_formats
# set_matplotlib_formats('png')

from trectools import TrecQrel, TrecRun, TrecEval, procedures, TrecPool
import statistics

from plotlib.loaders import *
from plotlib.plotters import *

from phdconf import config 


#Set general plot properties
sns.set()
sns.set_context("paper")
sns.set_color_codes("pastel")

sns.set_context({"figure.figsize": (16, 10)})
plt.style.use('seaborn-white')


In [2]:
# BASE_DIR = '/Users/danlocke/go/src/github.com/dan-locke/phd/experiments/'
# QREL_DIR = '/Users/danlocke/go/src/github.com/dan-locke/phd-data/'
# qrel_path = 'sigir_qrels.txt'

# Return a tuple of (qry_id, unjudged at 100, nonrel at 100, unjudged at 10, nonrel at 10) 
def get_unjudged(qrel_path: str, path: str):
    
    args = ['trec_eval', '-q', qrel_path, path, '-m', 'relstring.100']
#     args[0] = os.path.join(TREC_FILE_PATH, args[0])
    res = subprocess.check_output(args)
    ret = []
    for line in res.decode('utf-8').split('\n'):
        if line == '':
            break
        parts = line.split()
        # add 1 for the quotation mark
        unjudged = parts[2].count('-')
        nonrel = parts[2].count('0')
            
        # add 1 for the quotation mark
        unjudged_at_10 = parts[2][:11].count('-')
        nonrel_at_10 = parts[2][:11].count('0')
    
        ret.append((parts[1], unjudged, nonrel, unjudged_at_10, nonrel_at_10))
        
    return ret
    
# print(get_unjudged(os.path.join(QREL_DIR, qrel_dirs, qrel_path), os.path.join(BASE_DIR, 'dirichlet_prior/sigir-unigram_dir_mu_1350.00.run')))

In [3]:
emb_names = ['all-paras-lower-100-cbow-minoccur-50', 'sigir-100-minoccur-300']
model = 'mnzexp'
# rel_level = ['1', '2']

# display_names = ['AUS', 'SIGIR']
# index_names = ['flattened',  'sigir']
# qrel_paths = ['comb-aus.txt', 'comb-sigir.txt']
# qrel_dirs = ['aus', 'sigir']

rel_level = ['1', '1']

display_names = ['AUS', 'FILTERED', 'HYPHEN', 'PHRASESTOP', 'PDF']
index_names = ['flattened-stop',  'filtered-stop', 'filtered-hyphen', 'filtered-phrasestop', 'auspdfs']
qrel_paths = [config.AUS_QREL_PATH] * len(display_names)
qrel_dirs = ['aus']*len(display_names)

# QREL_DIR = '/Users/danlocke/go/src/github.com/dan-locke/phd-data/'
# BASE_DIR = '/Users/danlocke/go/src/github.com/dan-locke/phd/experiments/'

jm_path = 'jelinek_mercer'
lambda_start = 0.0
lambda_end = 1.05
increment = 0.05

def load_1d_dfs(index_names, qrel_paths, results_path, run_format, start, end, increment):
    dfs = []
    iterator =  np.arange(start, end, increment)
    for i, ind in enumerate(index_names):
        temp = []
        for l in iterator:
            res = get_unjudged(qrel_paths[i], os.path.join(results_path, run_format.format(ind, l)))
            temp.append(res)
        dfs.append(temp)
    
    return dfs 

def load_unjudged_df(qrel_path, rel_level, path, runs):
    dfs = []
    for r in runs:
        dfs.append(get_unjudged(qrel_path, os.path.join(path, r)))
    
    return dfs

def load_1d_emb_dfs(index_names, qrel_paths, results_path, run_format, rel_levels, emb_names, model, start, end, increment, per_query=False):
    dfs = []
    iterator =  np.arange(start, end+increment, increment)
    for i, ind in enumerate(index_names):
        temp = []
        for l in iterator:
            # num neighbours  
            it_temp = []
            for j in range(1, 21): 
                it_temp.append(get_unjudged(os.path.join(config.DATA_DIR, qrel_dirs[i], qrel_paths[i]), os.path.join(results_path, run_format.format(ind, emb_names[i], model, l, j))))
            temp.append(it_temp)
        dfs.append(temp)
    
    return dfs 

# unjudged_jm = load_1d_dfs(index_names, qrel_paths, os.path.join(os.environ["HOME"], 'phd-generated', 'preprocessing', 'dirichlet_prior'), 'case-topics-{0}-unigram_dir_mu_{1:.2f}.run', 300, 3000, 50)

# unjudged_jm = load_1d_dfs(index_names, qrel_paths, os.path.join(BASE_DIR, jm_path), '{0}-unigram_jm_lambda_{1:.2f}.run', lambda_start, lambda_end, increment)
unjudged_jm = load_1d_dfs(index_names[3:4], qrel_paths, os.path.join(os.environ["HOME"], 'phd-generated', 'plm'), 'case-topics-{0}-plm-dir-mu-300.00-sigma-{1:.2f}.run', 10, 200, 10)
# tlm_dfs = load_1d_emb_dfs(index_names, qrel_paths, os.path.join(BASE_DIR, model), '{0}-{1}-{2}-dir-mu-{3:.2f}-neighbours-{4}.run', rel_level, emb_names, model, 2400, 2450, 50)

In [4]:
# runs = ['idf-term-weight.run', 'ictf-term-weight.run', 
#         'emb-term-ictf-weight.run', 'emb-term-idf-weight.run', 'emb-term-weight.run',
#         'emb-term-diff-ictf-weight.run', 'emb-term-diff-idf-weight.run', 'emb-term-diff-weight.run', 
#         'avg-cooccur-weight.run', 'coocur-covariance.run']
# unjudged = load_unjudged_df(config.AUS_QREL_PATH, config.AUS_REL_LEVEL, '/home/danlocke/phd-generated/qry-weights', runs)
# avg_nested(unjudged, 3, 10)

In [13]:
def avg_nested(l, i, f):
    avg = []
    for y in l:
        avg.append(statistics.mean([(f - x[i]) for x in y]))
    return avg

def plot_unjudged_1d(index_names, df, start, end, increment): 
    fig, axs = plt.subplots(2, len(index_names))
    fig.set_size_inches(16, 10)
    for i in range(len(index_names)):
        unjudged = avg_nested(df[i], 1, 100)
        unjudged_at_10 = avg_nested(df[i], 3, 10)

        for j in range(2):
            if j == 0:
                axs[j, i].plot(np.arange(start, end, increment), unjudged)
                
                if i == 0: 
                    axs[j, i].set_ylabel("Avg judged @ 100", fontsize=20)

            else:
                axs[j, i].plot(np.arange(start, end, increment), unjudged_at_10)
                
                axs[j, i].set_xlabel(index_names[i], fontsize=20)
                
                if i == 0:
                    axs[j, i].set_ylabel("Avg judged @ 10", fontsize=20)
    
            axs[j, i].tick_params(labelsize=15)
            axs[j, i].yaxis.set_major_formatter(FormatStrFormatter('%1.2f'))
        
    fig.tight_layout()
    
plot_unjudged_1d(display_names[:2], unjudged_jm, 10, 200, 10)
# plot_unjudged_1d(display_names, [x[0] for x in tlm_dfs], 1, 21, 1)

IndexError: list index out of range

<Figure size 1152x720 with 4 Axes>

In [None]:
dir_path = 'dirichlet_prior'
mu_start = 1000.0
mu_end = 3050.0
mu_increment = 50.0

unjudged_dir = load_1d_dfs(index_names, qrel_paths, os.path.join(config.BASE_DIR, dir_path), 'case-topics-{0}-unigram_dir_mu_{1:.2f}.run', mu_start, mu_end, mu_increment)
plot_unjudged_1d(display_names, unjudged_dir, mu_start, mu_end, mu_increment)

In [None]:
from collections.abc import Iterable
import io

def read_res_file(path: str, seen, depth):
    with open(path) as f: 
        cnt = 1
        prev = None 
        for line in f:                
            parts = line.strip().split()
            if cnt > depth:
                if prev == parts[0]:
                    continue 
                else:
                    cnt = 1

            if cnt == 1: 
                    prev = parts[0]

            if parts[0] not in seen:
                seen[parts[0]] = set()
            seen[parts[0]].add(parts[2])
                
            cnt += 1
            
    return seen
    
# read_res_file(os.path.join(BASE_DIR, 'jelinek_mercer', 'ussc-unigram_jm_lambda_0.00.run'), {})

index = 'sigir'

def get_unassessed_baselines(index: str, exclude, depth: int):
    dirs = ['bm25', 'dirichlet_prior', 'jelinek_mercer', 'tfidf']
    formats = ['{0}-unigram_bm25_k1_{1:.2f}_b_{2:.2f}.run', '{0}-unigram_dir_mu_{1:.2f}.run', '{0}-unigram_jm_lambda_{1:.2f}.run', '{0}-unigram_tfidf.run']
    iterators = [[x for x in itertools.product(np.arange(1.2, 3.05, 0.05), np.arange(0.05, 1.05, 0.05))], np.arange(1000, 3050, 50), np.arange(0, 1, 0.05), [1]]
    all_top = {}
    for i, d in enumerate(dirs): 
        for it in iterators[i]:
            if isinstance(it, Iterable):
                all_top = read_res_file(os.path.join(BASE_DIR, d, formats[i].format(index, *it)), all_top, depth)
            else:
                all_top = read_res_file(os.path.join(BASE_DIR, d, formats[i].format(index, it)), all_top, depth)
                
    total = 0 
    ex = 0 
    
    out = io.StringIO()
    for k in all_top:
        total += len(all_top[k])
        for j in all_top[k]:
            if j in exclude[k]:
                ex += 1
            else:
                out.write('{0} 0 {1}\n'.format(k, j))

    print('To assess:', total)
    print('Exclude:', ex)

    return out
    

In [101]:
def load_qrel(path: str):
    qrels = {}
    with open(path) as f:
        for line in f:
            parts = line.strip().split()
            if parts[0] not in qrels:
                qrels[parts[0]] = set()
            qrels[parts[0]].add(parts[2])
            
    return qrels

In [3]:
qrels = TrecQrel(config.AUS_QREL_PATH)

In [4]:
# , 'sdm'

# BASE_LINK_DIR = '/Users/danlocke/go/src/github.com/dan-locke/phd/link-rerank'

# dirs = ['dirichlet_prior', 'tfidf', 'jelinek_mercer']
dirs = ['dirichlet_prior']
runs = []
for d in dirs: 
#     flattened*0.[6-9]0
    runs.append(procedures.list_of_runs_from_path('/home/danlocke/phd-generated/jelinek_mercer', 'case-topics-filtered-phrasestop-unigram_*.run'))
#     runs.append(procedures.list_of_runs_from_path('/home/danlocke/phd-generated/reduction/dirichlet_prior', '*-0.7*.run'))
#     runs.append(procedures.list_of_runs_from_path('/home/danlocke/phd-generated/reduction/dirichlet_prior', '*-0.8*.run'))
#     runs.append(procedures.list_of_runs_from_path('/home/danlocke/phd-generated/reduction/dirichlet_prior', '*-0.9*.run'))
#     runs.append(procedures.list_of_runs_from_path(os.path.join('/home/danlocke/phd-generated/','dirichlet_prior'), "*para-rerank*.run")) 
#     runs.append(procedures.list_of_runs_from_path(os.path.join('/home/danlocke/phd-generated/','dirichlet_prior'), "*paras-rerank*.run")) 
#     runs.append(procedures.list_of_runs_from_path(os.path.join('/home/danlocke/phd-generated', 'bm25'), "case-topics-filtered-phrasestop*.run"))    
#     runs.append(procedures.list_of_runs_from_path(os.path.join('/home/danlocke/go/src/github.com/dan-locke/phd/python-scripts'), "tiny*"))    
#     runs.append(procedures.list_of_runs_from_path(os.path.join('/home/danlocke/phd-generated/plm'), "case-topics-filtered-phrasestop-plm-dir-mu-300.00*-[1-9][0-5].00.run")) 

Found 21 runs in path /home/danlocke/phd-generated/jelinek_mercer


In [5]:
results = []
for i in np.arange(0.1, 1.1, 0.1):
    run_res = []
    for j in range(len(runs)):
        temp = []
        for k in range(len(runs[j])):
            te = TrecEval(runs[j][k], qrels)
            rbp, residuals = te.get_rbp(i)
            temp.append((rbp, residuals))
        run_res.append(temp)
    results.append(run_res)

In [6]:

avg_rbp = []
avg_res = []

for i in results: 
    #  this is the p value 
    rbp_v = []
    res_v = []
    for j in i: 
        # this is the model          
        #  want to take the average of each ... 
        if len(j) == 0:
            continue 
        r = statistics.mean([x[0] for x in j])
        rbp_v.append(r)
        inter = statistics.mean([x[1] for x in j])
        res_v.append(r + inter)
#         res_v.append(statistics.mean([x[1] for x in j]))
        
        
    avg_rbp.append(rbp_v)
    avg_res.append(res_v)
        


In [7]:
for i in range(len(avg_rbp[0])):
    fig, axs = plt.subplots()
    fig.set_size_inches(16, 10)
    plt.plot(np.arange(0.1, 1.1, 0.1), [x[i] for x in avg_rbp])
    plt.fill_between(np.arange(0.1, 1.1, 0.1), [x[i] for x in avg_rbp], [x[i] for x in avg_res], alpha=0.5)


<Figure size 1152x720 with 1 Axes>

In [8]:
# run_names = ['{0}-unigram_bm25_k1_1.20_b_0.75.run', '{0}-unigram_dir_mu_2000.00.run', '{0}-unigram_jm_lambda_0.30.run', 
#              '{0}-unigram_tfidf.run', '{0}-sdm-dir-mu-2400.00-weights-0.00-0.50-0.50-window-8.run']
# dirs = ['bm25', 'dirichlet_prior', 'jelinek_mercer', 'tfidf', 'sdm']
# index = 'flattened'
# fig, axs = plt.subplots()
# fig.set_size_inches(16, 10)

# parsed_runs = []

# for i, r in enumerate(run_names): 
#     run = TrecRun(os.path.join(BASE_DIR, dirs[i], r.format(index)))
#     parsed_runs.append(run)

#     te = TrecEval(run, qrels)
#     rbp = []
#     res = []
#     for i in np.arange(0.1, 1.1, 0.1):
#         r1, r2 = te.get_rbp(i)
#         rbp.append(r1)
#         res.append(r2)

#     rbp = np.array(rbp)
#     res = np.array(res)

#     plt.plot(np.arange(0.1, 1.1, 0.1), rbp)
#     plt.fill_between(np.arange(0.1, 1.1, 0.1), rbp, rbp+res, alpha=0.2)
# plt.legend(['bm25', 'dir', 'jm', 'tfidf', 'sdm'])

In [9]:
flattened_runs = [y for x in runs for y in x]

In [17]:
from trectools import TrecPoolMaker

# pool1 = TrecPoolMaker().make_pool(runs[0], strategy="rbp", topX=10)
pool1 = TrecPoolMaker().make_pool(flattened_runs, strategy="rbp", topX=5)
# pool2 = TrecPoolMaker().make_pool(flattened_runs, strategy="rrf", topX=20, rrf_den=60) 

# Export documents to be judged using Relevation! visual assessing system
# pool1.export_document_list(filename="mypool.txt", with_format="relevation")

In [18]:
# avg_rbp_coverage = []
# # avg_rrf_coverage = []
# # Check to see which pool covers better my run r1
# for i in range(len(runs)):
#     avg_rbp_coverage.append(pool1.check_coverage(runs[i], topX=5))
# #     avg_rrf_coverage.append(pool2.check_coverage(parsed_runs[i], topX=10)) 

In [19]:
# print(statistics.mean(avg_rbp_coverage))
# print(statistics.mean(avg_rrf_coverage))

In [20]:
total_to_do = 0

for k in pool1.pool:
    total_to_do += len(pool1.pool[k])
    
print(total_to_do)

475


In [21]:
exclude_list = {}

with open(config.AUS_QREL_PATH) as f:
    for line in f:
        parts = line.strip().split()
        if int(parts[0]) not in exclude_list:
            exclude_list[int(parts[0])] = set()
        exclude_list[int(parts[0])].add(parts[2])


# with open(os.path.join(BASE_DIR, 'sigir_qrels.txt')) as f:
#     for line in f:
#         parts = line.strip().split()
#         if int(parts[0]) not in exclude_list:
#             exclude_list[int(parts[0])] = set()
#         exclude_list[int(parts[0])].add(int(parts[2]))


In [22]:
print('3783492' in exclude_list[1])
print(exclude_list.keys())

False
dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 49, 50, 51, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 69, 70, 72, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 102, 103, 104, 105, 106, 107, 108, 110, 111, 112, 113, 114, 116, 117, 118, 17, 58, 71, 73, 109, 115, 48, 119, 120, 121, 122, 123, 124, 125, 127, 128, 129, 130, 131, 133, 134, 135, 136, 137, 138, 139])


In [23]:
# need to rerun for 43 as had typo 
to_assess = 0

for i in range(1, 140):
    key = i

    if key not in pool1.pool:
        continue
    for item in pool1.pool[key]:
        if key not in exclude_list: 
            print('{0} 0 {1} 0'.format(key, item))
            to_assess += 1
        elif item not in exclude_list[key]:
            print('{0} 0 {1} 0'.format(key, item))
            to_assess += 1
            
print(to_assess)

1 0 2002QSC33 0
1 0 2017FCA999 0
1 0 2015FCA776 0
1 0 2018FCA1095 0
1 0 2011FCA1429 0
2 0 2002QCA393 0
2 0 2006QCA334 0
2 0 2001QCA469 0
2 0 2003QCA40 0
2 0 2004QCA242 0
3 0 2018FCAFC67 0
4 0 2003FCA738 0
4 0 2000QCA45 0
4 0 2002QCA96 0
5 0 2001QCA46 0
5 0 2000FCA386 0
5 0 2007FCAFC116 0
6 0 2002QCA393 0
6 0 2003QCA153 0
6 0 2006QCA426 0
6 0 2001QCA469 0
6 0 2004QCA242 0
8 0 2001QCA523 0
8 0 2003QCA251 0
8 0 2002QCA311 0
8 0 2005QDC466 0
8 0 2007QCA109 0
10 0 2001FCA1538 0
12 0 2001FCA1545 0
12 0 2003QCA1 0
12 0 2006QCA391 0
13 0 2002QCA362 0
13 0 2006QCA571 0
13 0 2003FCA1429 0
15 0 2018FCA827 0
15 0 2008FCA396 0
17 0 2003QCA354 0
17 0 2006QCA545 0
17 0 2006QCA250 0
17 0 2014QSC261 0
17 0 2003QCA496 0
19 0 2011FCA1278 0
21 0 2016QSC243 0
21 0 2004QDC70 0
23 0 2000FCA1171 0
23 0 2007FCA1997 0
23 0 2015FCAFC191 0
23 0 2004FCA1111 0
23 0 2004QSC333 0
24 0 2009QCA47 0
24 0 2009QSC427 0
24 0 2009QCA77 0
25 0 2005FCA1651 0
27 0 2004QCA173 0
27 0 2010QCA185 0
27 0 2012FCA624 0
27 0 2000QCA27