# Assignment 1 
## Task 2

In [20]:
# Imports
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import string
from collections import defaultdict
from elasticsearch7 import Elasticsearch
from elasticsearch7.client import IndicesClient
import time
import math

#### Preprocessing

In [21]:
ps = PorterStemmer()
stoplist = '../IR_data/AP_DATA/stoplist.txt'
with open(stoplist, 'r') as f:
    stop_words = set(f.read().split())
def stem_text_and_remove_stopwords(tokens):
    processed_tokens = []
    for word in tokens: 
        w = word.strip()
        w = ps.stem(w)
        if w.lower() not in stop_words:
            processed_tokens.append(w)
    return ' '.join(processed_tokens)

In [22]:
def process_text(text):
    tokens = word_tokenize(text)
    processed_text = stem_text_and_remove_stopwords(tokens)
    return processed_text

In [23]:
text_map = defaultdict(str)
folder = '../IR_data/AP_DATA/ap89_collection'
count = 0
for filename in os.listdir(folder):
    file_path = os.path.join(folder, filename)
    with open(file_path, 'rb') as f:
        content = f.read().decode('iso-8859-1')
    doc_regex = r'<DOC>(.*?)</DOC>'
    for doc in re.findall(doc_regex, content, re.S):
        docno = re.search(r'<DOCNO>(.*?)</DOCNO>', doc).group(1).strip()      
        for each in re.findall(r'<TEXT>(.*?)</TEXT>', doc, re.S):    
            text_map[docno]+= ' ' + process_text(each)
                
print("Number of documents: ", len(text_map))


Number of documents:  84678


### ElasticSearch - Indexing data

In [24]:
es = Elasticsearch('http://localhost:9200/', timeout=60)
print(es.ping())

True


In [25]:
index_name = 'ap89_collection'

configurations = {
    "settings" : {
        "number_of_shards": 1,
        "number_of_replicas": 1,
        "analysis": {
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords_path": "my_stoplist.txt"
                }
            },
            "analyzer": {
                "stopped": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "english_stop"
                    ]
                }
            }
      }
    },
    "mappings": {
        "properties": {
            "content": {
                "type": "text",
                "fielddata": True,
                "analyzer": "stopped",
                "index_options": "positions"
            }
        }
    }
}

In [26]:
es.indices.create(index=index_name, body=configurations)

  es.indices.create(index=index_name, body=configurations)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'ap89_collection'}

In [27]:
def add_data(_id, text):
    es.index(index=index_name, document={'content':text},id=_id)

In [28]:
for key in text_map: 
    add_data(key, text_map[key])
print("Successfully index data!")

Successfully index data!


In [29]:
es.indices.refresh(index=index_name)

{'_shards': {'total': 2, 'successful': 1, 'failed': 0}}

## Task 3 - Retreival Models

In [30]:
# Preprocess queries
query_stop_words = stop_words.union(('document', 'noncommunist', 'locat', 'least', 'countri', 'second', 'unsubstanti', 'worldwid', 'exist', 
                               'product', 'preliminari', 'perpetr', 'aid', 'success', 'predict', 'describ', 'identifi', 'make', 'undesir',
                               'level', 'determin', 'perform', 'platform', 'someth', 'side', 'effort', 'standard', 'motiv',
                               'controversi', 'measur', 'tent', 'sign', 'individu', 'develop', 'nation', 'pend',
                               'includ', 'result', 'anticip', 'support', 'ani', 'ha', 'directli', 'border' ,'area', 'base',
                              'affair', 'ongo', 'method', 'sinc', 'system', 'candid', 'specifi', 'advanc', 'polit', 'attempt', 'asset'
                              , 'organ','u s'))
def query_stem_text_and_remove_stopwords(tokens):
    processed_tokens = []
    for word in tokens: 
        w = ps.stem(word)
        w = w.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))).strip()
        if w!='' and w.lower() not in query_stop_words and w not in processed_tokens:
            processed_tokens.append(w)
    return ' '.join(processed_tokens)
    
def process_query(text):
    tokens = word_tokenize(text)
    processed_tokens = query_stem_text_and_remove_stopwords(tokens)
    return processed_tokens

In [31]:
query_file = '../IR_data/AP_DATA/query_desc.51-100.short.txt'
query_map = {}
with open(query_file, 'r') as f: 
    query_content = f.read().split('\n')
for line in query_content:
    dot_index = line.index('.')
    query_map[line[:dot_index]] = process_query(line[dot_index+1:].strip())
for k,v in query_map.items():
    print(k,v)

85 alleg corrupt public offici government jurisdict
59 weather caus fatal
56 prime lend rate
71 incurs militari forc guerrilla
64 hostage tak
62 militari coup d etat
93 rifl associ nra
99 iran contra
58 rail strike
77 poach wildlif
54 contract agreement reserv launch commerci satellit
87 current crimin action offic fail financi institut
94 crime comput
100 non communist industri regul transfer high tech good dual us technolog
89 invest opec downstream oper
61 israel iran contra
95 comput applic crime solv
68 studi concern safeti manufactur employe instal worker fine diamet fiber insul
57 mci bell
97 instanc fiber optic technolog
98 produc fiber optic equip
60 salari incent pay contrast sole basi senior longev
80 1988 presidenti
63 machin translat
91 acquisit armi weapon


### ES Builtin

In [32]:
ic = IndicesClient(es)
print(es.ping())

True


In [33]:
def ES_Search(query):
    res_es_search = es.search(index=index_name, query={'match': {'content': query}}, size=1000)
    return res_es_search

def process_res(res, query_num):    
    output = ''
    for rank, data in enumerate(res['hits']['hits']):
        output += query_num + ' Q0 ' + str(data['_id']) + ' ' + str(rank+1) + ' ' + str(data['_score']) + ' Exp\n'
    return output
        

In [34]:
output_path = '../Deliverables/results/'

def output_txt(filename, string):
    with open(output_path+filename+'.txt', 'w') as f: 
        f.write(string)

In [35]:
start = time.time()
filename = 'es_built_in'
final_output = ''
for query_num, query in query_map.items():
    res = ES_Search(query)
    print("Completed query number", query_num)
    output = process_res(res, query_num)
    final_output+=output
output_txt(filename, final_output)
end = time.time()
print("Completed total run in time", round((end-start), 2), "seconds")

Completed query number 85
Completed query number 59
Completed query number 56
Completed query number 71
Completed query number 64
Completed query number 62
Completed query number 93
Completed query number 99
Completed query number 58
Completed query number 77
Completed query number 54
Completed query number 87
Completed query number 94
Completed query number 100
Completed query number 89
Completed query number 61
Completed query number 95
Completed query number 68
Completed query number 57
Completed query number 97
Completed query number 98
Completed query number 60
Completed query number 80
Completed query number 63
Completed query number 91
Completed total run in time 1.67 seconds


In [36]:
def process_txt(scores):
    output=''
    for query_id, doc_ids in scores.items():
        for rank, (doc_id, score) in enumerate(doc_ids):
            output += query_id + ' Q0 ' + str(doc_id) + ' ' + str(rank+1) + ' ' + str(score) + ' Exp\n'
    return output

#### Getting tf, df, ttf from ES

In [37]:
ids = list(text_map.keys())
number_of_docs = len(ids)
def get_term_doc_frequencies():
    size = 3000
    term_frequencies = {}
    doc_frequencies = {}
    term_ttf = {}
    
    for i in range((number_of_docs // size)+1):
        body = {
            "ids": ids[size*i:min(number_of_docs,size*(i+1))],
            "parameters": {
                "fields": ["content"],
                "offsets": "false",
                "payloads": "false",
                "positions": "false",
                "term_statistics": "true",
                "field_statistics": "false"
            }    
        }
        term_vectors = es.mtermvectors(index=index_name, body=body)
        for doc in term_vectors['docs']:
            doc_id = doc['_id']
            if 'term_vectors' in doc and 'content' in doc['term_vectors']:
                terms = doc['term_vectors']['content']['terms']
                term_frequencies[doc_id] = {}
                doc_frequencies[doc_id] = {}
                term_ttf[doc_id] = {}
                for term, info in terms.items():
                    term_frequencies[doc_id][term] = info['term_freq'] 
                    doc_frequencies[doc_id][term] = info['doc_freq']
                    term_ttf[doc_id][term] = info['ttf']
    return term_frequencies, doc_frequencies, term_ttf

term_frequencies, doc_frequencies, term_ttf = get_term_doc_frequencies()

In [38]:
# Calculate average length of document
avg_len_d = 0
for k,v in term_frequencies.items():
    avg_len_d+=len(v)
avg_len_d = avg_len_d//len(term_frequencies) #170.072
avg_len_d

170

In [39]:
# Getting Vocabulary size from ES
num_unique_words = es.search(index=index_name, body={ "aggs": { "unique_terms": { "cardinality": { "field": "content", "precision_threshold": 40000}}} })['aggregations']['unique_terms']['value']
num_unique_words

  num_unique_words = es.search(index=index_name, body={ "aggs": { "unique_terms": { "cardinality": { "field": "content", "precision_threshold": 40000}}} })['aggregations']['unique_terms']['value']


182041

In [40]:
#Okapi TF
def calculate_okapi_TF(tf, doc_length, avg_len_d):
    return tf / (tf + 0.5 + 1.5 * (doc_length / avg_len_d))

#TFIDF
def calculate_tfidf(tf, df, doc_length, num_doc, avg_len_d):
    term_f = (tf / (tf + 0.5 + 1.5 * (doc_length / avg_len_d)))
    idf = math.log(num_doc / df) if df else 0
    return  term_f * idf

#Okapi BM25
def compute_okapi_bm25(tf, df, doc_length, avg_len_d, num_doc):
    k1, k2, b = 1.2, 1,0.75
    return math.log((num_doc + 0.5) / (df + 0.5)) * ((tf + k1 * tf) / (tf + k1 * ((1 - b) + b * (doc_length / avg_len_d)))) * ((tf + k2 * tf)/(tf + k2))

#Unigram LM with Laplace smoothing
def compute_unigram_lml(tf, doc_length, num_unique_words):
    if tf!=0:
        return math.log((tf + 1) / (doc_length + num_unique_words))
    else: 
        return -1000

# Unigram LM with Jelinek-Mercer smoothing
def compute_unigram_lmjm(tf, ttf, doc_length, num_unique_words, lambda_const):
    if tf!=0:
        return math.log(lambda_const * (tf / doc_length) + (1 - lambda_const) * (ttf / num_unique_words))
    else: 
        return -1000
    

In [41]:
#Rank documents and get top 1000 for each query
def rank_documents(query_map, term_frequencies, doc_frequencies, term_ttf, avg_len_d, model):
    model_scores = defaultdict(dict)
    for doc_id, doc in term_frequencies.items():
        len_d = len(doc)
        num_doc = len(term_frequencies.keys())
        for query_id, query in query_map.items():
            score = 0
            for term in query.split():
                tf, df, ttf = 0, 0, 0
                if term in doc: 
                    tf = doc[term]
                    df = doc_frequencies[doc_id][term]
                    ttf = term_ttf[doc_id][term]
                if model=='okapi_tf':
                    score += calculate_okapi_TF(tf, len_d, avg_len_d)
                elif model=='tfidf':
                    score += calculate_tfidf(tf, df, len_d, num_doc, avg_len_d)
                elif model=='okapi_bm25':
                    score += compute_okapi_bm25(tf, df, len_d, avg_len_d, num_doc)
                elif model=='unigram_lml':
                    score += compute_unigram_lml(tf, len_d, num_unique_words)
                elif model=='unigram_lmjm':
                    score += compute_unigram_lmjm(tf, ttf, len_d, num_unique_words, 0.5)
            if score!=0:
                if doc_id not in model_scores[query_id]:
                    model_scores[query_id][doc_id] = score
                else:
                    model_scores[query_id][doc_id] += score
    ranked_documents = {}
    for query_id, doc_scores in model_scores.items():
        ranked_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:1000]
        ranked_documents[query_id] = ranked_docs
    ranked_documents
    return ranked_documents
            

## Save to files

### Okapi TF

In [42]:
filename = 'okapi_tf'
okapi_tf_ranked_documents = rank_documents(query_map, term_frequencies, doc_frequencies, term_ttf, avg_len_d, model='okapi_tf')
output = process_txt(okapi_tf_ranked_documents)
output_txt(filename, output)

### TFIDF

In [43]:
filename = 'tfidf'
tfidf_ranked_documents = rank_documents(query_map, term_frequencies, doc_frequencies, term_ttf, avg_len_d, model='tfidf')
output = process_txt(tfidf_ranked_documents)
output_txt(filename, output)

### Okapi BM25

In [44]:
filename = 'okapi_bm25'
okapi_bm25_ranked_documents = rank_documents(query_map, term_frequencies, doc_frequencies, term_ttf, avg_len_d, model='okapi_bm25')
output = process_txt(okapi_bm25_ranked_documents)
output_txt(filename, output)

### Unigram LM with Laplace smoothing

In [45]:
filename = 'unigram_lml'
unigram_lml_ranked_documents = rank_documents(query_map, term_frequencies, doc_frequencies, term_ttf, avg_len_d, model='unigram_lml')
output = process_txt(unigram_lml_ranked_documents)
output_txt(filename, output)

### Unigram LM with Jelinek-Mercer smoothing

In [46]:
filename = 'unigram_lmjm'
unigram_lmjm_ranked_documents = rank_documents(query_map, term_frequencies, doc_frequencies, term_ttf, avg_len_d, model='unigram_lmjm')
output = process_txt(unigram_lmjm_ranked_documents)
output_txt(filename, output)

## Task 5 - Pseudo-relevance Feedback

In [47]:
def get_relevance_feedback_terms(k = 10):
    relevance_feedback_terms = defaultdict(list)
    for query_id, ranked_docs in tfidf_ranked_documents.items():
        top = []
        for doc_id, tfidf in ranked_docs[:3]:
            top.extend(term_frequencies[doc_id].items())
        relevance_feedback_terms[query_id] = [item[0] for item in sorted(top, key=lambda x: (x[1]), reverse=True)[:k] if len(item[0])>2]
    return dict(relevance_feedback_terms)

In [48]:
relevance_feedback_terms = get_relevance_feedback_terms()
query_map_with_relevance_feedback = query_map.copy()
for query_id, query in query_map_with_relevance_feedback.items():
    relevance = query.split()
    i = 0
    while len(relevance)<5:
        relevance.append(relevance_feedback_terms[query_id][i])
        i+=1
    new_query = ' '.join(relevance)
    query_map_with_relevance_feedback[query_id]=new_query
query_map_with_relevance_feedback

{'85': 'alleg corrupt public offici government jurisdict',
 '59': 'weather caus fatal tornado concentr',
 '56': 'prime lend rate rate bank',
 '71': 'incurs militari forc guerrilla south',
 '64': 'hostage tak hostag lebanon unit',
 '62': 'militari coup d etat noriega',
 '93': 'rifl associ nra gun nra',
 '99': 'iran contra north iran mees',
 '58': 'rail strike rail union strike',
 '77': 'poach wildlif gator eleph allig',
 '54': 'contract agreement reserv launch commerci satellit',
 '87': 'current crimin action offic fail financi institut',
 '94': 'crime comput comput comput chip',
 '100': 'non communist industri regul transfer high tech good dual us technolog',
 '89': 'invest opec downstream oper million',
 '61': 'israel iran contra north north',
 '95': 'comput applic crime solv comput',
 '68': 'studi concern safeti manufactur employe instal worker fine diamet fiber insul',
 '57': 'mci bell mci amp amp',
 '97': 'instanc fiber optic technolog fiber',
 '98': 'produc fiber optic equip fiber

### Okapi TF Pseudo-relevance Feedback

In [49]:
filename = 'okapi_tf_rel_feedback'
okapi_tf_rel_feedback_ranked_documents = rank_documents(query_map_with_relevance_feedback, term_frequencies, doc_frequencies, term_ttf, avg_len_d, model='okapi_tf')
output = process_txt(okapi_tf_rel_feedback_ranked_documents)
output_txt(filename, output)

###  ElasticSearch aggs

In [50]:
def get_rel_feedback_aggs(query):
    significant_terms = es.search(index=index_name, 
                                  body = {
                                      "aggregations" : {
                                          "significantTerms" : {
                                              "significant_terms" : {
                                                  "field" : "content"
                                              }
                                          }
                                      },
                                       "query" : {
                                          "terms" : {
                                              "content" : [query.split()[-2]]
                                          }
                                      },
                                      "size": 0
                                  }
                                 )
    return significant_terms

In [51]:
query_map_with_relevance_feedback_aggs = query_map.copy()
for query_id, query in query_map_with_relevance_feedback_aggs.items():
    significant_terms_response = get_rel_feedback_aggs(query)
    significant_terms=[]
    for i in significant_terms_response['aggregations']['significantTerms']['buckets']: 
        if len(i['key'])>2:
            significant_terms.append(i['key'])
    relevance = query.split()
    i = 0
    while len(relevance)<3:
        relevance.insert(0, significant_terms[i])
        i+=1
    new_query = ' '.join(relevance)
    query_map_with_relevance_feedback_aggs[query_id]=new_query
query_map_with_relevance_feedback_aggs

  significant_terms = es.search(index=index_name,


{'85': 'alleg corrupt public offici government jurisdict',
 '59': 'weather caus fatal',
 '56': 'prime lend rate',
 '71': 'incurs militari forc guerrilla',
 '64': 'hostage hostage tak',
 '62': 'militari coup d etat',
 '93': 'rifl associ nra',
 '99': 'iran iran contra',
 '58': 'rail rail strike',
 '77': 'poach poach wildlif',
 '54': 'contract agreement reserv launch commerci satellit',
 '87': 'current crimin action offic fail financi institut',
 '94': 'crime crime comput',
 '100': 'non communist industri regul transfer high tech good dual us technolog',
 '89': 'invest opec downstream oper',
 '61': 'israel iran contra',
 '95': 'comput applic crime solv',
 '68': 'studi concern safeti manufactur employe instal worker fine diamet fiber insul',
 '57': 'mci mci bell',
 '97': 'instanc fiber optic technolog',
 '98': 'produc fiber optic equip',
 '60': 'salari incent pay contrast sole basi senior longev',
 '80': '1988 1988 presidenti',
 '63': 'machin machin translat',
 '91': 'acquisit armi weapon'

### Okapi TF Pseudo-relevance Feedback using ElasticSearch aggs "significant terms"

In [52]:
filename = 'okapi_tf_rel_feedback_aggs'
okapi_tf_rel_feedback_aggs_ranked_documents = rank_documents(query_map_with_relevance_feedback_aggs, term_frequencies, doc_frequencies, term_ttf, avg_len_d, model='okapi_tf')
output = process_txt(okapi_tf_rel_feedback_aggs_ranked_documents)
output_txt(filename, output)