# Define Stoplist and Docno Arrays

In [1]:
import copy

# Read the stop-word list, one word per line. The context manager closes the
# file as soon as it has been read; the bare open() used before kept the
# handle open for the lifetime of the kernel.
docno_arr = []
stop_arr = []
with open('./reference/stoplist.txt') as stoplist:
    for line in stoplist:
        stop_arr.append(line.strip())

# Extra words appended to the list read from the file
# (presumably boilerplate from the query phrasing -- TODO confirm).
stop_arr = stop_arr + ['document', 'discuss', 'report', 'include', 'describe', 'identify', 'cite', 'predict', 'new', 'two', 'state']

# Create Index

In [2]:
from elasticsearch import Elasticsearch
# Client for the Elasticsearch node on localhost:9200; used by every cell below.
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

# Create the 'ap_data' index with a custom analyzer and a single 'text' field.
# NOTE(review): ignore=400 presumably suppresses the "index already exists"
# error so the cell can be re-run -- confirm against the client version in use.
es.indices.create(index = 'ap_data', ignore=400, body= {
    "settings" : {
        "number_of_shards": 1,
        "number_of_replicas": 1,
        # Must be >= the "size": 30000 used by the search requests below.
        "max_result_window" : 30000,
        "analysis": {
            "filter": {
                # Stop filter driven by the stop_arr list built in the first cell.
                "english_stop": {
                    "type": "stop",
                    "stopwords": stop_arr
                },
                "my_stemmer": {
                    "type": "stemmer",
                    "name": "english"
                }
            },
            "analyzer": {
                # Pipeline: standard tokenizer -> lowercase -> stop words -> stemmer.
                "stopped": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "english_stop",
                        "my_stemmer"
                    ]
                }
            }
      }
    },
    "mappings": {
        "properties": {
            # The document body; analyzed with "stopped" and stored with term
            # vectors, which the es.termvectors calls below read.
            "text": {
                "type": "text",
                "fielddata": True,
                "analyzer": "stopped",
                "index_options": "positions",
                "term_vector": "yes"
            }
        }
    }
})

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'ap_data'}

# Document Indexing

In [3]:
from elasticsearch.helpers import bulk
from tqdm import tqdm
import os

# Every document id yielded by parse_docs, in indexing order.
docno_arr = []
# One entry per indexed file: the return value of bulk_index().
res = []

def parse_docs(docs):
    """Yield one Elasticsearch bulk action per ``<DOC>`` element in ``docs``.

    Each action is ``{'_index': 'ap_data', '_id': <DOCNO>, 'text': <text>}``
    where ``text`` is the concatenation of every ``<TEXT>...</TEXT>`` section
    of the document, each stripped and terminated by a newline.

    The input is scanned with a moving cursor instead of being re-sliced after
    every document, and a missing closing tag is treated as "runs to the end of
    the enclosing span". Previously ``find()`` returned -1 in that case, which
    silently chopped the last character and duplicated text. A ``<DOC>``
    without a complete ``<DOCNO>`` is skipped because it has no usable id.

    Side effect: every yielded id is appended to the module-level
    ``docno_arr`` (as the generator is consumed).
    """
    doc_open, doc_close = '<DOC>', '</DOC>'
    no_open, no_close = '<DOCNO>', '</DOCNO>'
    text_open, text_close = '<TEXT>', '</TEXT>'

    cursor = 0
    while True:
        start = docs.find(doc_open, cursor)
        if start == -1:
            return
        end = docs.find(doc_close, start)
        if end == -1:
            # Unterminated document: take everything up to end of input.
            end = len(docs)
        body = docs[start + len(doc_open):end]
        cursor = end + len(doc_close)

        no_start = body.find(no_open)
        no_end = body.find(no_close, no_start) if no_start != -1 else -1
        if no_start == -1 or no_end == -1:
            continue
        docno = body[no_start + len(no_open):no_end].strip()

        pieces = []
        text_cursor = 0
        while True:
            t_start = body.find(text_open, text_cursor)
            if t_start == -1:
                break
            t_start += len(text_open)
            t_end = body.find(text_close, t_start)
            if t_end == -1:
                # Unterminated section: take the rest of the document body.
                t_end = len(body)
            pieces.append(body[t_start:t_end].strip() + '\n')
            text_cursor = t_end + len(text_close)

        docno_arr.append(docno)
        yield {
            '_index': 'ap_data',
            '_id': docno,
            'text': ''.join(pieces)
        }

def bulk_index(filename):
    """Read ``filename`` (decoded as ISO-8859-1) and bulk-index its documents.

    The whole file is read into memory, parsed by ``parse_docs`` and handed to
    ``bulk``; whatever ``bulk`` returns is passed back to the caller.
    """
    with open(filename, mode="r", encoding="ISO-8859-1") as handle:
        raw = handle.read()
    actions = parse_docs(raw)
    return bulk(es, actions)

def docs_index(filepath):
    """Bulk-index every entry of directory ``filepath``.

    Shows a tqdm progress bar and appends each ``bulk_index`` result to the
    module-level ``res`` list.
    """
    entries = os.listdir(filepath)
    for name in tqdm(entries):
        full_path = os.path.join(filepath, name)
        outcome = bulk_index(full_path)
        res.append(outcome)

In [4]:
# Index every file found in the AP89 collection directory.
docs_index('../AP_DATA/ap89_collection')

100%|██████████| 365/365 [01:16<00:00,  5.59it/s]


# Query Retrieval

In [5]:
import re

# Build query_dict: query number -> list of analyzed query terms.
query_dict = {}
word_arr = []
query_arr = []

# The context manager closes the query file once it has been consumed;
# the bare open() used before never released the handle.
with open('./reference/querylist.txt') as querylist:
    for line in querylist:
        if line.strip() == '':
            continue
        # First whitespace-separated token, stripped of punctuation, is the query number.
        queryno = re.sub('[^A-Za-z0-9]+', '', line.split()[0])
        # Everything after the first '.' is the query text.
        modified = line[line.find(".")+1:]
        # Run the text through the index's own analyzer so query terms match
        # the tokens stored in the index.
        terms = es.indices.analyze(index='ap_data', body={'analyzer': 'stopped', 'text': modified})
        query_arr = [term['token'] for term in terms['tokens']]
        # Drop analyzed tokens that are themselves in the stop list.
        word_arr = [word for word in query_arr if word not in stop_arr]
        query_dict[queryno] = word_arr

# Leave the scratch lists empty, as before.
word_arr = []
query_arr = []
print(query_dict)

{'59': ['weather', 'event', 'directli', 'caus', 'least', 'fatal', 'locat'], '77': ['poach', 'wildlif'], '94': ['crime', 'aid', 'comput'], '85': ['alleg', 'corrupt', 'public', 'govern', 'worldwid'], '95': ['comput', 'applic', 'crime', 'solv'], '91': ['armi', 'advanc', 'weapon', 'system'], '56': ['lend', 'prime', 'rate'], '71': ['incurs', 'border', 'area', 'militari', 'guerrilla'], '64': ['polit', 'motiv', 'hostag', 'take'], '62': ['militari', 'coup', "d'etat", 'attempt', 'successfulli'], '93': ['support', 'nation', 'rifl', 'associ', 'nra', 'asset'], '99': ['iran', 'contra', 'affair'], '58': ['rail', 'strike'], '54': ['contract', 'agreement', 'launch', 'commerci', 'satellit'], '87': ['current', 'crimin', 'offic', 'fail', 'financi', 'institut'], '100': ['non', 'communist', 'regul', 'transfer', 'high', 'tech', 'good', 'technolog', 'undesir', 'nation'], '89': ['invest', 'opec', 'downstream', 'oper'], '61': ['israel', 'iran', 'contra', 'affair'], '68': ['safeti', 'manufactur', 'employe', 'in

# Define Corpus Statistics Helpers

In [6]:
from collections import Counter

def doc_length(doc_id):
    """Return the number of whitespace-separated tokens in the stored text of ``doc_id``."""
    hit = es.get(index='ap_data', id=doc_id)
    raw_text = hit['_source']['text']
    return len(raw_text.split())

def term_freq(term, doc_id):
    """Return the frequency of ``term`` in the ``text`` term vector of ``doc_id``.

    Returns 0 when the term does not appear in the document's term vector.
    """
    vectors = es.termvectors(index='ap_data', id=doc_id, fields='text')
    entry = vectors['term_vectors']['text']['terms'].get(term)
    return 0 if entry is None else entry['term_freq']

def field_stat():
    """Return the ``field_statistics`` of the ``text`` field.

    The statistics are read from the term-vector response of one fixed
    document, 'AP890101-0001'.
    """
    probe_id = 'AP890101-0001'
    response = es.termvectors(index='ap_data', id=probe_id, fields='text')
    return response['term_vectors']['text']['field_statistics']

def corpus_length():
    """Return the total number of indexed tokens in the ``text`` field.

    Uses ``sum_ttf`` (sum of total term frequencies), the same statistic the
    notebook divides by the document count to obtain ``avg_len``. The previous
    ``sum_doc_freq`` is the sum of document frequencies, i.e. it counts each
    term once per document rather than once per occurrence.
    """
    return field_stat()['sum_ttf']

def tf_q(queryno, word):
    """Return how many times ``word`` occurs among the terms of query ``queryno``."""
    return Counter(query_dict[queryno])[word]

def doc_filter(word):
    """Return the ids of the documents matching ``word`` (at most 30000 hits)."""
    request = {"query": {"match": {"text": word}}, "size": 30000}
    hits = es.search(index='ap_data', body=request)['hits']['hits']
    ids = []
    for hit in hits:
        ids.append(hit['_id'])
    return ids

def doc_freq(word):
    """Return the number of documents whose ``text`` matches ``word``."""
    match_query = {"query": {"match": {"text": word}}}
    response = es.count(index='ap_data', body=match_query)
    return response['count']

def vocab_size():
    """Return the ``cardinality`` aggregation over the ``text`` field as an int."""
    aggregation = {"aggs": {"unique_terms": {"cardinality": {"field": "text"}}}}
    response = es.search(index='ap_data', size=0, body=aggregation)
    unique = response['aggregations']['unique_terms']['value']
    return int(unique)

In [7]:
# Number of documents currently in the 'ap_data' index.
total_docs = es.count(index='ap_data')['count']

# Average document length: 'sum_ttf' from the field statistics divided by the
# document count.
avg_len = field_stat()['sum_ttf'] / total_docs

# Pre-compute Document Length

In [8]:
# Cache doc_length() for every indexed document so the scoring loops below
# read lengths from memory instead of issuing one es.get per lookup.
# NOTE(review): doc_length() counts whitespace tokens of the raw text, while
# avg_len is derived from 'sum_ttf' of the analyzed field -- the two look like
# they are on different scales; confirm this is intended.
doc_len_info = {}
for docno in tqdm(docno_arr):
    doc_len_info[docno] = doc_length(docno)
# Spot-check one entry (shown as the cell output).
doc_len_info['AP890101-0001']

100%|██████████| 84679/84679 [04:41<00:00, 301.06it/s]


1040

# Pre-compute Term Frequency

In [None]:
# Flat list of {'id', 'word', 'tf'} records shared by the scoring functions.
tf_info = []
def precompute_tf(query, tf_info):
    """Append term-frequency records for every query term to ``tf_info``.

    For each distinct term in ``query`` (a dict of query number -> list of
    terms), every document returned by ``doc_filter`` contributes one record
    ``{'id': docno, 'word': term, 'tf': term_freq(term, docno)}``.

    A term is fetched only once: terms already present in ``tf_info`` or
    shared by several queries are skipped. Previously such terms were
    re-fetched document by document and stored as duplicate records, which
    the consumers never read because they take the first match.
    """
    done = {entry['word'] for entry in tf_info}
    for key, words in query.items():
        print(key, words)
        for word in words:
            if word in done:
                continue
            done.add(word)
            for docno in doc_filter(word):
                freq = {'id': docno, 'word': word, 'tf': term_freq(word, docno)}
                tf_info.append(freq)

precompute_tf(query_dict, tf_info)

# Models

In [22]:
def okapi_tf(tf, doc_len):
    """Okapi TF: ``tf / (tf + 0.5 + 1.5 * doc_len / avg_len)``.

    ``avg_len`` is the module-level average document length.
    """
    length_ratio = doc_len/avg_len
    denominator = tf + 0.5 + 1.5 * length_ratio
    return tf / denominator

def tf_idf(tf, doc_len, df, total_docs):
    """Okapi TF weighted by ``log(total_docs / df)``."""
    weight = okapi_tf(tf, doc_len)
    idf = math.log(total_docs / df)
    return weight * idf

def okapi_bm25(tf, tf_q, doc_len, df, k1, k2, b, total_docs, avg_len):
    """Okapi BM25 contribution of one term to one document.

    Product of three factors: the idf ``log((N + 0.5) / (df + 0.5))``, the
    length-normalised document term frequency (parameters ``k1`` and ``b``),
    and the query term frequency factor (parameter ``k2``).
    """
    idf = math.log((total_docs + 0.5) / (df + 0.5))
    length_norm = (1-b) + b * (doc_len / avg_len)
    doc_part = (tf + k1 * tf) / (tf + k1 * length_norm)
    query_part = (tf_q + k2 * tf_q) / (tf_q + k2)
    return idf * doc_part * query_part

def unigram_lm_laplace(tf, doc_len, voc_size):
    """Log of the Laplace-smoothed term probability ``(tf + 1) / (doc_len + voc_size)``."""
    smoothed_count = tf + 1
    smoothed_total = doc_len + voc_size
    return math.log(smoothed_count / smoothed_total)

def unigram_lm_jm(tf, doc_len, all_tf, all_len, lmbda):
    """Jelinek-Mercer mixture of a document model and a background model.

    The document model is ``tf / doc_len``; the background model is
    ``(all_tf - tf) / (all_len - doc_len)``. Returns the interpolated value
    ``lmbda * document + (1 - lmbda) * background`` (not its logarithm).
    """
    doc_model = float(tf) / doc_len
    rest_tf = float(all_tf - tf)
    rest_len = all_len - doc_len
    corpus_model = rest_tf / rest_len
    return lmbda * doc_model + (1 - lmbda) * corpus_model

## Helper Methods

In [23]:
def rank_scores(key, scores, out, limit=1000):
    """Write the top-scoring documents of one query to ``out`` and reset ``scores``.

    Parameters:
        key: query number written in the first column.
        scores: mapping of document id -> score; emptied before returning.
        out: writable text stream.
        limit: maximum number of lines to write (default 1000, the cutoff that
            was previously hard-coded).

    One line is written per document, best score first, in the form
    ``<key> Q0 <docno> <rank> <score> Exp`` with ranks starting at 1.
    The builtins ``iter`` and ``str`` are no longer shadowed by local names.
    """
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    for rank, (docno, score) in enumerate(ranked[:max(limit, 0)], start=1):
        line = '{} Q0 {} {} {} Exp'.format(key, docno, rank, score)
        out.write(line+"\n")
    scores.clear()

In [24]:
def add_score(docno, model_scores, score):
    """Accumulate ``score`` under ``docno`` in ``model_scores``.

    A document seen for the first time is stored with ``score`` as its value;
    otherwise ``score`` is added to the value already present.
    """
    if docno not in model_scores:
        model_scores[docno] = score
        return
    model_scores[docno] += score

# Define ES Built-In Scoring

In [25]:
import math
from collections import defaultdict

def compute_es_built_in(query, out_path='./es-built-in.txt'):
    """Rank documents for every query with Elasticsearch's own match score.

    Parameters:
        query: dict of query number -> list of query terms.
        out_path: file the ranking is appended to. Defaults to the path that
            was previously hard-coded; the parameter exists because the
            pseudo-relevance feedback code calls this function with a second,
            path argument, which the one-parameter signature rejected with a
            TypeError.

    For each term a ``match`` search (up to 30000 hits) is run and the hit
    scores are summed per document; the per-query totals are written through
    ``rank_scores``. The file is opened in append mode, so re-running adds to
    existing content, and it is closed even if a search raises.
    """
    es_built_in_scores = defaultdict(lambda: 0.0)
    with open(out_path, "a") as out:
        for key, words in tqdm(query.items(), position=0, leave=True, desc='Computing ES Built-In'):
            for word in words:
                res = es.search(index='ap_data', body={"query": {"match": {"text": word}}, "size": 30000})
                for hit in res['hits']['hits']:
                    add_score(hit['_id'], es_built_in_scores, hit['_score'])
            rank_scores(key, es_built_in_scores, out)

# Define Vector Space Model Scoring

In [26]:
def compute_vs_models(query):
    """Score every query with Okapi TF, TF-IDF and Okapi BM25.

    ``query`` is a dict of query number -> list of terms. For each term, the
    documents returned by ``doc_filter`` are scored using the precomputed
    ``tf_info`` records and ``doc_len_info`` lengths; per-query totals are
    appended to ``./okapi-tf.txt``, ``./tf-idf.txt`` and ``./okapi-bm25.txt``.

    Changes from the earlier version: term frequencies are looked up through a
    per-term dict instead of a linear scan per document, the per-term values
    ``df`` and ``qf`` are computed once per term, and the output files are
    closed even when an error interrupts the run. A document without a
    ``tf_info`` record now raises KeyError (previously StopIteration).
    """
    okapi_tf_scores = defaultdict(lambda: 0.0)
    tf_idf_scores = defaultdict(lambda: 0.0)
    okapi_bm25_scores = defaultdict(lambda: 0.0)

    with open('./okapi-tf.txt', "a") as okapi_tf_out, \
         open('./tf-idf.txt', "a") as tf_idf_out, \
         open('./okapi-bm25.txt', "a") as okapi_bm25_out:
        for key, words in tqdm(query.items(), position=0, leave=True, desc='Computing Vector Space models'):
            for word in words:
                # First record wins, matching the previous first-match lookup.
                tf_by_doc = {}
                for element in tf_info:
                    if element['word'] == word:
                        tf_by_doc.setdefault(element['id'], element['tf'])
                docs = doc_filter(word)
                df = len(docs)
                qf = tf_q(key, word)
                for docno in docs:
                    tf = tf_by_doc[docno]
                    doc_len = doc_len_info[docno]
                    add_score(docno, okapi_tf_scores, okapi_tf(tf, doc_len))
                    add_score(docno, tf_idf_scores, tf_idf(tf, doc_len, df, total_docs))
                    add_score(docno, okapi_bm25_scores, okapi_bm25(tf, qf, doc_len, df, 1.2, 100, 0.75, total_docs, avg_len))
            rank_scores(key, okapi_tf_scores, okapi_tf_out)
            rank_scores(key, tf_idf_scores, tf_idf_out)
            rank_scores(key, okapi_bm25_scores, okapi_bm25_out)

# Define Language Model Scoring

In [27]:
def compute_unigram_lm_laplace(query):
    """Score every query with the Laplace-smoothed unigram language model.

    For each query term, every indexed document (``docno_arr``) receives
    ``log((tf + 1) / (doc_len + V))``, where ``tf`` is 0 for documents that do
    not contain the term. Per-query totals are appended to
    ``./unigram-lm-laplace.txt``.

    Fixes relative to the earlier version:
    - documents that do not contain the term are scored with their real
      length; they were scored with ``doc_len = 0``, which gave every such
      document the same score regardless of its length;
    - membership is tested against a set rather than a list, removing a
      linear scan per document;
    - term frequencies come from a per-term dict (KeyError instead of
      StopIteration when a record is missing);
    - the output file is closed even if an error occurs.
    """
    laplace_scores = defaultdict(lambda: 0.0)
    voc_size = vocab_size()
    with open('./unigram-lm-laplace.txt', "a") as out:
        for key, words in tqdm(query.items(), position=0, leave=True, desc='Computing Language Models(Unigram-LM-Laplace)'):
            for word in words:
                tf_by_doc = {}
                for element in tf_info:
                    if element['word'] == word:
                        tf_by_doc.setdefault(element['id'], element['tf'])
                matching = set(doc_filter(word))
                for docno in docno_arr:
                    tf = tf_by_doc[docno] if docno in matching else 0
                    score = unigram_lm_laplace(tf, doc_len_info[docno], voc_size)
                    add_score(docno, laplace_scores, score)
            rank_scores(key, laplace_scores, out)

def compute_unigram_lm_jm(query):
    """Score every query with the Jelinek-Mercer smoothed unigram model.

    For each query term, the documents returned by ``doc_filter`` are scored
    with ``unigram_lm_jm`` (lambda = 0.7) and the values are summed per
    document; per-query totals are appended to ``./unigram-lm-jm.txt``.

    Fixes relative to the earlier version:
    - ``all_len`` is the total length of all documents in ``doc_len_info``.
      It used to accumulate ``len(docs)`` once per matching document, i.e.
      the square of the match count, which is unrelated to any length and
      could make the background denominator zero or negative;
    - the unused locals and the repeated ``lmbda`` assignment are gone;
    - term frequencies come from a per-term dict (KeyError instead of
      StopIteration when a record is missing);
    - the output file is closed even if an error occurs.

    ``all_tf`` is summed over the documents returned by ``doc_filter``, which
    returns at most 30000 hits.
    NOTE(review): scores are sums of interpolated probabilities, not of their
    logarithms -- confirm this is the intended ranking function.
    """
    jelinek_scores = defaultdict(lambda: 0.0)
    lmbda = .7
    all_len = sum(doc_len_info.values())
    with open('./unigram-lm-jm.txt', "a") as out:
        for key, words in tqdm(query.items(), position=0, leave=True, desc='Computing Language Models(Unigram-LM-JM)'):
            for word in words:
                tf_by_doc = {}
                for element in tf_info:
                    if element['word'] == word:
                        tf_by_doc.setdefault(element['id'], element['tf'])
                docs = doc_filter(word)
                all_tf = sum(tf_by_doc[docno] for docno in docs)
                for docno in docs:
                    score = unigram_lm_jm(tf_by_doc[docno], doc_len_info[docno], all_tf, all_len, lmbda)
                    add_score(docno, jelinek_scores, score)
            rank_scores(key, jelinek_scores, out)

def compute_lang_models(query):
    """Run both unigram language models (Laplace, then Jelinek-Mercer) on ``query``."""
    for scorer in (compute_unigram_lm_laplace, compute_unigram_lm_jm):
        scorer(query)

# Pseudo-relevance Feedback

In [28]:
def retrieve_k_docs(key, scores, k, term_dict):
    """Count, per term, how many of the ``k`` best documents contain it.

    Parameters:
        key: query number (not used by the computation; kept for callers).
        scores: mapping of document id -> score.
        k: number of top-scoring documents to inspect.
        term_dict: mapping updated in place; each term found in a document's
            ``text`` term vector has its count increased by one per document.

    The cutoff is now ``min(k, len(scores))``. It used to compare the number
    of candidates against a literal 1000, so any ``k`` other than 1000 was
    ignored whenever fewer than 1000 documents were available. A term-vector
    response without a ``text`` entry contributes no terms instead of raising.
    """
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    for docno, _score in ranked[:max(k, 0)]:
        response = es.termvectors(index='ap_data', id=docno, fields='text')
        terms = response.get('term_vectors', {}).get('text', {}).get('terms', {})
        # The keys of a term vector are distinct, so each term is counted once per document.
        for term in terms:
            if term not in term_dict:
                term_dict[term] = 1
            else:
                term_dict[term] += 1

In [29]:
def compute_pseudo_feedback(query):
    """Expand each query with terms from its top-ranked documents and re-rank.

    Works on a deep copy of ``query`` (dict of query number -> list of terms).
    For every query: documents are scored with Elasticsearch's match score,
    the terms of the top 1000 documents are counted by ``retrieve_k_docs``,
    and up to two of the 20 most widespread terms not already in the query
    are appended to it. The expanded queries are then passed to
    ``compute_es_built_in`` together with the path ``./feedback1.txt``.

    Fixes relative to the earlier version:
    - score and term counters are created fresh for every query; they used to
      be cleared only when exactly two terms had been added, so a query that
      gained fewer terms leaked its counts into the next one;
    - fewer than 20 candidate terms no longer raises IndexError;
    - all 20 candidates are considered (the last one used to be skipped).
    """
    top_docs = 1000
    candidate_terms = 20
    terms_to_add = 2

    new_query_dict = copy.deepcopy(query)
    for key, words in tqdm(new_query_dict.items(), position=0, leave=True, desc='Proceeding Pseudo-relevance Feedback...'):
        term_dict = defaultdict(lambda: 0.0)
        psdo_fdbck_scores = defaultdict(lambda: 0.0)
        for word in words:
            res = es.search(index='ap_data', body={"query": {"match": {"text": word}}, "size": 30000})
            for hit in res['hits']['hits']:
                add_score(hit['_id'], psdo_fdbck_scores, hit['_score'])
        retrieve_k_docs(key, psdo_fdbck_scores, top_docs, term_dict)
        ranks = sorted(term_dict.items(), key=lambda x: x[1], reverse=True)
        added = 0
        for term, _doc_count in ranks[:candidate_terms]:
            if added == terms_to_add:
                break
            # `words` is the list stored in new_query_dict[key]; appending expands the query.
            if term not in words:
                words.append(term)
                added += 1
    compute_es_built_in(new_query_dict, './feedback1.txt')

# Pseudo-relevance Feedback using significant terms

In [40]:
def sgni_term(word):
    """Return ``(key, score)`` of the second ``significant_terms`` bucket for ``word``.

    Runs a ``terms`` query on ``text`` for ``word`` with a ``significant_terms``
    aggregation over the same field and reads the bucket at index 1.
    NOTE(review): index 1 presumably skips a first bucket that is the word
    itself -- confirm. Fewer than two buckets raises IndexError.
    """
    request = {
        "query": {"terms": {"text": [word]}},
        "aggregations": {
            "significantCrimeTypes": {"significant_terms": {"field": "text"}}
        },
        "size": 0,
    }
    response = es.search(index='ap_data', body=request)
    buckets = list(response['aggregations']['significantCrimeTypes']['buckets'])
    second = buckets[1]
    return second['key'], second['score']

In [44]:
def significant_terms(query):
    """Append the best significant term to each query, in place, and return ``query``.

    For every query, ``sgni_term`` is called on each of its terms; the
    returned term with the highest score is appended to the query's term list
    unless it is already there.
    """
    for key in query:
        candidates = defaultdict(lambda: 0.0)
        for word in query[key]:
            term, score = sgni_term(word)
            candidates[term] = score
        ordered = sorted(candidates.items(), key=lambda pair: pair[1], reverse=True)
        best_term = ordered[0][0]
        if best_term not in query[key]:
            query[key].append(best_term)
    return query

In [51]:
def compute_sgni_pseudo_feedback(query):
    """Expand each query with its best significant term and re-rank with ES scoring.

    Works on a deep copy of ``query``; the copy is expanded by
    ``significant_terms`` and ranked by ``compute_es_built_in``.

    The earlier version also ran ``precompute_tf`` into a local list that was
    discarded on return. ``compute_es_built_in`` never reads term-frequency
    records, so that step only cost time and has been removed.

    NOTE(review): ``compute_es_built_in`` is called without an output path, so
    the ranking lands in the same file as the baseline ES run -- confirm
    whether a separate file is wanted.
    """
    new_query_dict = significant_terms(copy.deepcopy(query))
    compute_es_built_in(new_query_dict)

# Compute Models

In [37]:
# Run every retrieval model over the parsed queries; each call writes its
# ranking to its own output file.
# NOTE(review): the progress bars captured below are not in this call order,
# so the saved output appears to come from an earlier or out-of-order run --
# re-run from a fresh kernel before relying on it.
compute_es_built_in(query_dict)
compute_vs_models(query_dict)
compute_lang_models(query_dict)
compute_pseudo_feedback(query_dict)
compute_sgni_pseudo_feedback(query_dict)

Proceeding Pseudo-relevance Feedback...: 100%|██████████| 25/25 [03:34<00:00,  9.08s/it]
Computing ES Built-In: 100%|██████████| 25/25 [05:34<00:00, 12.11s/it]
Pre-computing newly appended term frequencies: 100%|██████████| 23/23 [1:13:58<00:00, 295.78s/it]
Computing ES Built-In: 100%|██████████| 25/25 [02:13<00:00,  5.58s/it]
Computing ES Built-In: 100%|██████████| 25/25 [02:14<00:00,  5.71s/it]
Computing Vector Space models: 100%|██████████| 25/25 [07:48<00:00, 13.59s/it]
Computing Language Models(Unigram-LM-Laplace): 100%|██████████| 25/25 [24:31<00:00, 54.48s/it]
Computing Language Models(Unigram-LM-JM): 100%|██████████| 25/25 [12:23<00:00, 18.72s/it]
