In [4]:
# manual stemming based phrase expansions
%matplotlib inline

import sys 
import os 


# Put the current working directory on the module search path (presumably so
# the local `plotlib` / `phdconf` imports below resolve — confirm). The
# membership check keeps re-running this cell from appending duplicates.
nb_dir = os.getcwd()
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

from plotlib.loaders import *
from plotlib.plotters import *

from phdconf import config 

In [5]:
def load_queries(path: str):
    """Read a topics JSON file and index its topics by id.

    Args:
        path: Location of a JSON document with a top-level ``'topics'`` list
            whose entries each carry ``'id'``, ``'topic'`` and ``'type'``.

    Returns:
        A dict mapping every topic id to ``{'topic': ..., 'type': ...}``.
        Any other keys present on a topic entry are dropped.
    """
    with open(path) as handle:
        payload = json.load(handle)

    return {
        entry['id']: {'topic': entry['topic'], 'type': entry['type']}
        for entry in payload['topics']
    }

def load_qrel(path: str, rel: bool=False):
    """Load a qrels file into a ``topic id -> set of document ids`` mapping.

    Every non-blank line is split on whitespace: field 0 is the topic id
    (parsed as ``int``), field 2 the document id and field 3 the relevance
    label. Blank or whitespace-only lines are skipped.

    Args:
        path: Location of the qrels file.
        rel: When ``False`` (default) every assessed document is collected.
            When ``True`` only documents whose label is exactly ``'1'`` are
            collected.
            NOTE(review): graded labels such as '2' or '3' are therefore
            excluded when ``rel=True`` — confirm this is intended.

    Returns:
        dict mapping ``int`` topic ids to sets of document-id strings. A topic
        seen in the file is always present, with an empty set if none of its
        documents pass the filter.
    """
    qrels = {}
    with open(path) as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line: indexing parts[0] would raise IndexError.
                continue

            docs = qrels.setdefault(int(parts[0]), set())
            if not rel or parts[3] == '1':
                docs.add(parts[2])

    return qrels

def load_relevant_docs(path: str):
    """Parse the JSON file at ``path`` and return the decoded object unchanged."""
    with open(path) as source:
        docs = json.loads(source.read())
    return docs
        

In [6]:
# Load the topics stored in case-topics.json, keyed by topic id.
# NOTE(review): absolute, machine-specific path — the cell only runs where
# this checkout exists at exactly this location.
queries = load_queries('/home/danlocke/go/src/github.com/dan-locke/phd-data/case-topics.json')

In [11]:
# Size of whatever topic set `queries` currently holds (the case topics, if the
# case-topics.json cell ran just before this one).
print(len(queries.keys()))
# NOTE: this rebinds `queries` — from here on it holds the legislative queries,
# and the previously loaded topic set is no longer reachable under this name.
queries = load_queries('/home/danlocke/go/src/github.com/dan-locke/phd-data/legislative-queries.json')
print(len(queries.keys()))

95
39


In [4]:
# Tally the filtered qrels file: `l` ends up as the number of non-blank lines
# and `r` maps each relevance label (4th whitespace-separated field) to how
# often it occurs.
l = 0
r = {}

with open('/home/danlocke/go/src/github.com/dan-locke/phd-data/aus/filtered-qrels.txt') as f:
    for line in f:
        parts = line.split()
        if not parts:
            # Nothing but whitespace on this line.
            continue

        label = parts[3]
        r[label] = r.get(label, 0) + 1
        l += 1
        

In [5]:
# Total number of assessed (non-blank) qrels lines, followed by the
# (relevance label, count) pairs sorted by label.
print(l, sorted(r.items()))

10732 [('0', 9534), ('1', 476), ('2', 396), ('3', 326)]


In [7]:
def plot_lens(rel, path='figures/aus-rel-counts.pdf'):
    """Draw a bar chart of assessment counts per relevance label and save it.

    Args:
        rel: Sequence of ``(label, count)`` pairs; labels go on the x axis and
            counts become the bar heights. It is iterated twice, so pass a
            list rather than a one-shot iterator.
        path: Where the figure is written. Defaults to the location that was
            previously hard-coded, so existing calls behave as before.
    """
    labels = [pair[0] for pair in rel]
    counts = [pair[1] for pair in rel]

    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])
    ax.bar(labels, counts)
    ax.set_xlabel('Relevance', fontsize=20)
    ax.set_ylabel('# assessments', fontsize=20)
    ax.tick_params(axis='both', which='major', labelsize=20)
    # The axes cover the entire figure, so tick and axis labels lie outside
    # the canvas; a tight bounding box keeps them in the saved file.
    fig.savefig(path, bbox_inches='tight')
    
# Plot the per-label counts held in `r`, ordered by relevance label.
plot_lens(sorted(r.items()))

<Figure size 1152x720 with 1 Axes>

In [7]:
# NOTE(review): the recorded run of this cell raised FileNotFoundError for this
# path, so `rel_docs` is only defined if the file has since been restored or
# the path corrected. The term-count loop further down reads
# rel_docs[doc_id]['Terms'].
rel_docs = load_relevant_docs('/home/danlocke/go/src/github.com/dan-locke/phd/experiments/relevant-docs.json')

FileNotFoundError: [Errno 2] No such file or directory: '/home/danlocke/go/src/github.com/dan-locke/phd/experiments/relevant-docs.json'

In [72]:
# a lazy simple version of the query tokenization
import re

# For the first topic only (note the `break`): sum the term counts over all of
# the topic's judged documents and show the 50 most frequent terms, flagging
# which of them also occur in the query text.
# NOTE(review): `qrels` is not assigned in any cell visible here — presumably
# the result of `load_qrel(...)`; confirm it is defined before running this.
for topic_id, topic in queries.items():
    print(topic_id, topic)
    # Lower-cased alphanumeric tokens of the query text.
    terms = set(re.sub('[^0-9a-zA-Z]+', ' ', topic['topic']).lower().split())
    print(terms)

    total = {}
    for rel_doc_id in qrels[topic_id]:
        for term, count in rel_docs[rel_doc_id]['Terms'].items():
            total[term] = total.get(term, 0) + count

    df = pd.DataFrame({'term': list(total.keys()), 'count': list(total.values())})
    df = df.sort_values('count', ascending=False)
    # Build the flag from the column itself so it stays aligned with the
    # sorted rows. Wrapping a list in a new pd.Series would get a fresh
    # 0..n-1 index and be matched by label, attaching flags to wrong terms.
    df['query_term'] = df['term'].isin(terms)
    print(df[:50])
    # TODO: bar plot of the top terms, coloured by `query_term`.
    break

1 {'topic': 'What is the effect of reinstating a company that was in liquidation as regards money that may be recovered?', 'type': 'specific'}
{'of', 'money', 'may', 'as', 'effect', 'was', 'is', 'be', 'recovered', 'company', 'liquidation', 'what', 'the', 'reinstating', 'that', 'a', 'regards', 'in'}
                term  count  query_term
494        companies    259       False
327    reinstatement    230       False
274            order    216       False
140     deregistered    174       False
26               601    169       False
276           orders    157       False
401               up    141       False
65              asic    140       False
49                ah    131       False
1055    commissioner    127       False
20                 3    123       False
1864      plaintiffs    122       False
9                  2    121       False
141   deregistration    114       False
235       liquidator    105       False
324     registration    100       False
0                  1