# Assignment 1 
## Task 2

In [1]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import string
from collections import defaultdict
from elasticsearch7 import Elasticsearch
from elasticsearch7.client import IndicesClient
import time
import math
import pickle
import json

ps = PorterStemmer()
stoplist = '../../IR_data/AP_DATA/stoplist.txt'
with open(stoplist, 'r') as f:
    stop_words = set(f.read().split())
def stem_text_and_remove_stopwords(tokens):
    processed_tokens = []
    for word in tokens: 
        w = word.strip()
        w = ps.stem(w)
        if w.lower() not in stop_words:
            processed_tokens.append(w)
    return ' '.join(processed_tokens)

In [2]:
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))
def stem_text_and_remove_stopwords(tokens):
    processed_tokens = []
    for word in tokens: 
        w = ps.stem(word)
        w = w.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))).strip()
        if w!='' and w.lower() not in stop_words and w not in processed_tokens:
            processed_tokens.append(w)
    return ' '.join(processed_tokens)
def process_text(text):
    tokens = word_tokenize(text)
    processed_text = stem_text_and_remove_stopwords(tokens)
    return processed_text

In [3]:
text_map = defaultdict(str)
folder = '../IR_data/AP_DATA/ap89_collection'
count = 0
for filename in os.listdir(folder):
    file_path = os.path.join(folder, filename)
    with open(file_path, 'rb') as f:
        content = f.read().decode('iso-8859-1')
    doc_regex = r'<DOC>(.*?)</DOC>'
    for doc in re.findall(doc_regex, content, re.S):
        docno = re.search(r'<DOCNO>(.*?)</DOCNO>', doc).group(1).strip()      
        for each in re.findall(r'<TEXT>(.*?)</TEXT>', doc, re.S):    
            text_map[docno]+= ' ' + process_text(each)
                
print(len(text_map))


84678


### ElasticSearch

In [4]:
es = Elasticsearch('http://localhost:9200/', timeout=60)
print(es.ping())

True


In [5]:
index_name = 'ap89_collection0'

configurations = {
    "settings" : {
        "number_of_shards": 1,
        "number_of_replicas": 1,
        "analysis": {
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords_path": "my_stoplist.txt"
                }
            },
            "analyzer": {
                "stopped": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "english_stop"
                    ]
                }
            }
      }
    },
    "mappings": {
        "properties": {
            "content": {
                "type": "text",
                "fielddata": True,
                "analyzer": "stopped",
                "index_options": "positions"
            }
        }
    }
}

## Task 3

In [6]:
query_stop_words = stop_words.union(('document', 'noncommunist', 'locat', 'least', 'countri', 'second', 'unsubstanti', 'worldwid', 'exist', 
                               'product', 'preliminari', 'perpetr', 'aid', 'success', 'predict', 'describ', 'identifi', 'make', 'undesir',
                               'level', 'determin', 'perform', 'platform', 'someth', 'side', 'effort', 'standard', 'motiv',
                               'controversi', 'measur', 'tent', 'sign', 'individu', 'develop', 'nation', 'pend',
                               'includ', 'result', 'anticip', 'support', 'ani', 'ha', 'directli', 'border' ,'area', 'base',
                              'affair', 'ongo', 'method', 'sinc', 'system', 'candid', 'specifi', 'advanc', 'polit', 'attempt', 'asset'
                              , 'organ','u s'))
def query_stem_text_and_remove_stopwords(tokens):
    processed_tokens = []
    for word in tokens: 
        w = ps.stem(word)
        w = w.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))).strip()
        if w!='' and w.lower() not in query_stop_words and w not in processed_tokens:
            processed_tokens.append(w)
    return ' '.join(processed_tokens)
    
def process_query(text):
    tokens = word_tokenize(text)
    processed_tokens = query_stem_text_and_remove_stopwords(tokens)
    return processed_tokens

In [7]:
query_file = '../IR_data/AP_DATA/query_desc.51-100.short.txt'
query_map = {}
with open(query_file, 'r') as f: 
    query_content = f.read().split('\n')
for line in query_content:
    dot_index = line.index('.')
    query_map[line[:dot_index]] = process_query(line[dot_index+1:].strip())
for k,v in query_map.items():
    print(k,v)

85 discuss alleg taken corrupt public offici government jurisdict
59 report type weather event caus one fatal
56 prime lend rate report actual move
71 report incurs land air water one militari forc guerrilla group
64 report event hostage tak
62 report militari coup d etat either
93 must rifl associ nra
99 iran contra
58 rail strike report
77 report poach use certain type wildlif
54 cite contract agreement reserv launch commerci satellit
87 report current crimin action offic fail financi institut
94 must crime comput
100 non communist industri state regul transfer high tech good dual us technolog
89 must invest opec member state downstream oper
61 discuss role israel iran contra
95 must comput applic crime solv
68 report actual studi even concern safeti manufactur employe instal worker fine diamet fiber use insul
57 discuss mci bell breakup
97 must instanc fiber optic technolog actual use
98 must produc fiber optic equip
60 either one use salari incent pay contrast sole basi senior long

### ES Builtin

In [8]:
ic = IndicesClient(es)
print(es.ping())

True


In [1]:
def get_score(query):
    res = es.search(index=index_name, query={'match': {'content': query}}, size=2000, scroll='2m')
    scroll_id = res['_scroll_id']
    scores = {}
    while True:
        for hit in res['hits']['hits']:
            doc_id = hit['_id']
            score = hit['_score']
            scores[doc_id] = score
        if len(res['hits']['hits']) == 0:
            break
        res = es.scroll(scroll_id=scroll_id, scroll='2m')
    return scores

def save_es_scores():    
    for q_id in query_map:
        scores = get_score(query_map[q_id])
        with open(f'data/scores/es_{q_id}.pickle', 'wb') as file:
            pickle.dump(scores, file)
        
save_es_scores()

NameError: name 'query_map' is not defined

In [49]:
ids = list(text_map.keys())
number_of_docs = len(ids)
def get_term_doc_frequencies():
    size = 3000
    term_frequencies = {}
    doc_frequencies = {}
    term_ttf = {}
    
    for i in range((number_of_docs // size)+1):
        body = {
            "ids": ids[size*i:min(number_of_docs,size*(i+1))],
            "parameters": {
                "fields": ["content"],
                "offsets": "false",
                "payloads": "false",
                "positions": "false",
                "term_statistics": "true",
                "field_statistics": "false"
            }
        }
        term_vectors = es.mtermvectors(index=index_name, body=body)
        for doc in term_vectors['docs']:
            doc_id = doc['_id']
            if 'term_vectors' in doc and 'content' in doc['term_vectors']:
                terms = doc['term_vectors']['content']['terms']
                term_frequencies[doc_id] = {}
                doc_frequencies[doc_id] = {}
                term_ttf[doc_id] = {}
                for term, info in terms.items():
                    term_frequencies[doc_id][term] = info['term_freq']
                    doc_frequencies[doc_id][term] = info['doc_freq']
                    term_ttf[doc_id][term] = info['ttf']
    return term_frequencies, doc_frequencies, term_ttf

term_frequencies, doc_frequencies, term_ttf = get_term_doc_frequencies()

In [50]:
avg_len_d = 0
for k,v in term_frequencies.items():
    avg_len_d+=len(v)
avg_len_d = avg_len_d//len(term_frequencies) #170.072
avg_len_d

170

In [51]:
num_unique_words = es.search(index=index_name, body={ "aggs": { "unique_terms": { "cardinality": { "field": "content", "precision_threshold": 40000}}} })['aggregations']['unique_terms']['value']
num_unique_words

  num_unique_words = es.search(index=index_name, body={ "aggs": { "unique_terms": { "cardinality": { "field": "content", "precision_threshold": 40000}}} })['aggregations']['unique_terms']['value']


182041

In [53]:
def calculate_okapi_TF(tf, doc_length, avg_len_d):
    return tf / (tf + 0.5 + 1.5 * (doc_length / avg_len_d))

def calculate_tfidf(tf, df, doc_length, num_doc, avg_len_d):
    term_f = (tf / (tf + 0.5 + 1.5 * (doc_length / avg_len_d)))
    idf = math.log(num_doc / df) if df else 0
    return  term_f * idf

def compute_okapi_bm25(tf, df, doc_length, avg_len_d, num_doc):
    k1, k2, b = 1.2, 1,0.75
    return math.log((num_doc + 0.5) / (df + 0.5)) * ((tf + k1 * tf) / (tf + k1 * ((1 - b) + b * (doc_length / avg_len_d)))) * ((tf + k2 * tf)/(tf + k2))

def compute_unigram_lml(tf, doc_length, num_unique_words):
    if tf!=0:
        return math.log((tf + 1) / (doc_length + num_unique_words))
    else: 
        return -1000
    
def compute_unigram_lmjm(tf, ttf, doc_length, num_unique_words, lambda_const):
    if tf!=0:
        return math.log(lambda_const * (tf / doc_length) + (1 - lambda_const) * (ttf / num_unique_words))
    else: 
        return -1000
    

In [54]:
def get_scores(query_map, term_frequencies, doc_frequencies, term_ttf, avg_len_d, model):
    model_scores = defaultdict(dict)
    for doc_id, doc in term_frequencies.items():
        len_d = len(doc)
        num_doc = len(term_frequencies.keys())
        for query_id, query in query_map.items():
            score = 0
            for term in query.split():
                tf, df, ttf = 0, 0, 0
                if term in doc: 
                    tf = doc[term]
                    df = doc_frequencies[doc_id][term]
                    ttf = term_ttf[doc_id][term]
                if model=='okapi_tf':
                    score += calculate_okapi_TF(tf, len_d, avg_len_d)
                elif model=='tfidf':
                    score += calculate_tfidf(tf, df, len_d, num_doc, avg_len_d)
                elif model=='okapi_bm25':
                    score += compute_okapi_bm25(tf, df, len_d, avg_len_d, num_doc)
                elif model=='unigram_lml':
                    score += compute_unigram_lml(tf, len_d, num_unique_words)
                elif model=='unigram_lmjm':
                    score += compute_unigram_lmjm(tf, ttf, len_d, num_unique_words, 0.5)
            if score!=0:
                if doc_id not in model_scores[query_id]:
                    model_scores[query_id][doc_id] = score
                else:
                    model_scores[query_id][doc_id] += score
    return model_scores
            

In [57]:
def save_scores(model_scores, filename):
    for q_id, scores in model_scores.items():
        with open(f'data/scores/{filename}/{filename}_{q_id}.pickle', 'wb') as file:
            pickle.dump(scores, file)

In [59]:
filename = 'okapi_tf'
okapi_tf_model_scores = get_scores(query_map, term_frequencies, doc_frequencies, term_ttf, avg_len_d, model='okapi_tf')
save_scores(okapi_tf_model_scores, filename)


In [60]:
filename = 'tfidf'
tfidf_ranked_model_scores = get_scores(query_map, term_frequencies, doc_frequencies, term_ttf, avg_len_d, model='tfidf')
save_scores(tfidf_ranked_model_scores, filename)


In [61]:
filename = 'okapi_bm25'
okapi_bm25_model_scores = get_scores(query_map, term_frequencies, doc_frequencies, term_ttf, avg_len_d, model='okapi_bm25')
save_scores(okapi_bm25_model_scores, filename)


In [62]:
filename = 'unigram_lml'
unigram_lml_model_scores = get_scores(query_map, term_frequencies, doc_frequencies, term_ttf, avg_len_d, model='unigram_lml')
save_scores(unigram_lml_model_scores, filename)


In [63]:
filename = 'unigram_lmjm'
unigram_lmjm_model_scores = get_scores(query_map, term_frequencies, doc_frequencies, term_ttf, avg_len_d, model='unigram_lmjm')
save_scores(unigram_lmjm_model_scores, filename)