# [Parse, Stem, and Tokenize]

In [2]:
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, PorterStemmer
import pprint
import json
import nltk
import os
import re

docs_dict = {}
unqno = 1
unq_dict = {}


stoplist = open('../HW1/reference/stoplist.txt')

stop_arr = []
for line in stoplist:
    stop_arr.append(line.strip())

def parse_docs(docs):
    while '<DOC>' in docs:
        text = ""
        docend = docs.find('</DOC>')
        substr = docs[:docend]
        d_stt = substr.find('<DOCNO>') + len('<DOCNO>')
        d_end = substr.find('</DOCNO>')
        docno = substr[d_stt:d_end].strip()
        while "<TEXT>" in substr:
            t_stt = substr.find('<TEXT>') + len('<TEXT>')
            t_end = substr.find('</TEXT>')
            text = text + substr[t_stt:t_end].strip() + '\n'
            substr = substr[t_end + len('</TEXT>'):]
        docs = docs[docend + len('</DOC>'):]
        docs_dict[docno] = text

def read_file(filename):
    with open(filename, "r", encoding="ISO-8859-1") as f:
        return f.read()

def stemmed_tokens(docno, text):
    global unqno
    count = 1
    token_arr = []
    temp_arr = []
    stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens if token != "'s"]
    stemmed_tokens = [element for item in stemmed_tokens for element in item.split('-')]
    for i in stemmed_tokens:
        i = i.lower()
        if i not in stop_arr and re.search('[a-zA-Z0-9]', str(i)):
            if i not in unq_dict.keys():
                arr = [i, unqno, docno, count]
                unq_dict[i] = unqno
                unqno += 1
                count += 1
            else:
                arr = [i, unq_dict[i], docno, count]
                count += 1
            token_arr.append(arr)
    return token_arr

def unstemmed_tokens(docno, text):
    global unqno
    count = 1
    token_arr = []
    temp_arr = []
    stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    for i in tokens:
        i = i.lower()
        if i not in stop_arr and re.search('[a-zA-Z0-9]', str(i)):
            if i not in unq_dict.keys():
                arr = [i, unqno, docno, count]
                unq_dict[i] = unqno
                unqno += 1
                count += 1
            else:
                arr = [i, unq_dict[i], docno, count]
                count += 1
            token_arr.append(arr)
    return token_arr

In [4]:
def bulk_index(filepath):
    for file in tqdm(os.listdir(filepath), position=0, leave=True):
        parse_docs(read_file(os.path.join(filepath, file)))

bulk_index('../AP_DATA/ap89_collection')

len(docs_dict)

100%|██████████| 365/365 [00:03<00:00, 101.60it/s]


84678

In [4]:
def stemmed_dict():
    stem_dict = {}
    global unq_dict
    for key, words in tqdm(docs_dict.items(), position=0, leave=True, desc='Stemming and tokenizing...'):
        stem_dict[key] = stemmed_tokens(key, words)
    return stem_dict

def unstemmed_dict():
    unstem_dict = {}
    global unq_dict
    for key, words in tqdm(docs_dict.items(), position=0, leave=True, desc='Stemming and tokenizing...'):
        unstem_dict[key] = unstemmed_tokens(key, words)
    return unstem_dict

In [5]:
stemmed_dict = stemmed_dict()
# unstemmed_dict = unstemmed_dict()

Stemming and tokenizing...: 100%|██████████| 84678/84678 [22:15<00:00, 63.39it/s]


# [Inverted Index]

## Create partial lists

In [6]:
def inverted_index(stem_dict):
    offset = 0
    count = 1
    temp_dict = {}
    for docno, tuples in tqdm(stem_dict.items(), position=0, leave=True, desc='Creating inverted indexes...'):
        position_dict = {}
        for tpl in tuples:
            if tpl[0] not in position_dict.keys():
                position_dict[tpl[0]] = [n[3] for n in tuples if n[0] == tpl[0]]
        for tpl in tuples:
            if tpl[0] not in temp_dict.keys():
                temp_dict[tpl[0]] = [[docno, len(position_dict[tpl[0]]), position_dict[tpl[0]]]]
            elif tpl[2] not in [n[0] for n in temp_dict[tpl[0]]]:
                temp_dict[tpl[0]].append([docno, len(position_dict[tpl[0]]), position_dict[tpl[0]]])
        offset += 1
        if offset == 1000:
            filename = "./batches/batch"+str(count)+".txt"
            count += 1
            f = open(filename, 'w', encoding="ISO-8859-1")
            f.write(json.dumps(temp_dict, separators=(',', ':')))
            temp_dict = {}
            offset = 0
    filename = "./batches/batch"+str(count)+".txt"
    f = open(filename, 'w', encoding="ISO-8859-1")
    f.write(json.dumps(temp_dict, separators=(',', ':')))
    temp_dict = {}

In [7]:
inverted_index(stemmed_dict)
# inverted_index(unstemmed_dict)

Creating inverted indexes...: 100%|██████████| 84678/84678 [09:44<00:00, 144.79it/s]


# [Merge]

## Merge using Offset and Catalogs

In [134]:
def before_merge():
    catalog_counter = 1
    str1 = "./batches/batch"+ str(catalog_counter) + ".txt"

    f = open(str1, 'a')
    offset = 0
    i = 0
    count = 0
    dict_catalog = {}
    for k in tqdm(stemmed_dict, position=0, leave=True):
        if (i == 1):
            offset = offset + len(k) + len(" ")
            size = len(str(stemmed_dict[k])) + len("\n")
            str_catalog = str(k) + " " + str(offset) + " " + str(size) + "\n"
            dict_catalog[k] = [offset, size]
            offset = offset + size
        else:
            offset = len(k) + len(" ")
            size = len(str(stemmed_dict[k])) + len("\n")
            str_catalog = str(k) + " " + str(offset) + " " + str(size) + "\n"
            dict_catalog[k] = [offset, size]
            offset = offset + size
            i = 1
        g.write(str(k) + " " + str(stemmed_dict[k]) + "\n")
        f.write(str_catalog)
        count+=1
        if count == 1000:
            catalog_counter += 1
            str1 = "./Data/Catalog/"+"catalog"+ str(catalog_counter) + ".txt"
            filename = "./Data/Index/"+"inverted_index" + str(catalog_counter) + ".txt"
            f = open(str1, 'a')
            g = open(filename, 'a')
            count = 0
    g.close()
    f.close()

In [182]:
def create_catalog(catalog_counter):
    index_str = "./indices/inverted_index" + str(catalog_counter) + ".txt"
    batch_str = "./batches/batch" + str(catalog_counter) + ".txt"
    catalog_str = "./catalogs/catalog" + str(catalog_counter) + ".txt"
    with open(batch_str) as file:
        index_dict = json.loads(file.read())
    catalog_f = open(catalog_str, 'a')
    index_f = open(index_str, 'a')
    offset = 0
    i = 0
    count = 0
    catalog_dict = {}
    for key in index_dict:
        offset = offset + len(key) + len(" ")
        size = len(str(index_dict[key])) + len("\n")
        str_catalog = str(key) + " " + str(offset) + " " + str(size) + "\n"
        catalog_dict[key] = [offset, size]
        offset = offset + size
        index_f.write(str(key) + " " + str(index_dict[key]) + "\n")
        catalog_f.write(str_catalog)
    count += 1

In [183]:
before_merge()
for i in range(85):
    create_catalog(i+1)

In [5]:
import ast

def merge_using_catalogs(counter, merged_dict):
    temp_dict={}
    catalog_str = "./catalogs/catalog" + str(counter) + ".txt"
    index_str = "./indices/inverted_index" + str(counter) + ".txt"
    catalog_dict = {}
    iList1 = open(index_str, 'r')
    with open(catalog_str) as fh:
        for line in fh:
            word, description = line.strip().split(' ', 1)
            catalog_dict[word] = description.split()
    for i in catalog_dict.keys():
        iList1.seek(0)
        list_offset = catalog_dict[i]
        o = int(list_offset[0])
        s = list_offset[1]
        s = int(s)
        iList1.seek(o)
        data1 = iList1.read(s)
        data1 = data1.rstrip()
        data1 = ast.literal_eval(data1)
        temp_dict[i]=data1
    for key in temp_dict.keys():
        if key in merged_dict:
            for i in range(len(temp_dict[key])):
                merged_dict[key].append(temp_dict[key][i])
        else:
            merged_dict[key] = temp_dict[key]

def single_inverted_index(num_files):
    merged_dict = {}
    for i in tqdm(range(num_files)):
        merge_using_catalogs(i+1, merged_dict)
    return merged_dict

In [8]:
def merge_dict(dict1, dict2):
   merged_dict = {**dict1, **dict2}
   for key, value in merged_dict.items():
       if key in dict1 and key in dict2:
            for i in range(len(dict1[key])):
                merged_dict[key].append(dict1[key][i])
   return merged_dict

In [6]:
merged_dict = single_inverted_index(85)

100%|██████████| 85/85 [03:57<00:00,  2.80s/it]


## Merge using **kwargs

In [9]:
merged={}
for file in tqdm(os.listdir('./batches'), position=0, leave=True):
    if file != '.DS_Store':
        with open(os.path.join('./batches', file), encoding="ISO-8859-1") as file:
            dict2 = json.load(file)
            merged = merge_dict(merged, dict2)

f = open('merged.txt', 'w', encoding="ISO-8859-1")
f.write(json.dumps(merged))

100%|██████████| 86/86 [02:47<00:00,  1.95s/it]


453995399

In [10]:
import json

with open("merged.txt") as file:
    data = json.loads(file.read())

# [Compress and Decompress]

## Compress Using GZip Library

In [1]:
import gzip

fp = open("merged.txt","rb")
data = fp.read()
bindata = bytearray(data)
with gzip.open("merged.txt.gz", "wb") as f:
    f.write(bindata)

In [3]:
import json

with gzip.open("merged.txt.gz", "rb") as f:
	data = json.loads(f.read())

## Create Uncompressed Index

In [6]:
docno_map = {}
word_map = {}
docno_counter = 0
word_counter = 0
for docno in docs_dict:
    docno_map[docno] = docno_counter
    docno_counter += 1
for word in merged_dict:
    word_map[word] = word_counter
    word_counter += 1
for key in merged_dict:
    for item in merged_dict[key]:
        item[0] = docno_map[item[0]]

In [8]:
def change_key_name(merged_dict):
    for key in tqdm(merged_dict, desc='Changing key name'):
        if str(key) in word_map.keys() and str(key) in merged_dict.keys():
            merged_dict[int(word_map[key])] = merged_dict[str(key)]
            del merged_dict[str(key)]

In [9]:
def save_uncompressed_index(merged_dict):
    file_dict = {}
    for key in tqdm(merged_dict, position=0, leave=True, desc='saving uncompressed index'):
        file_dict[key] = []
        for item in merged_dict[key]:
            for i in item:
                if isinstance(i, int):
                    file_dict[key].append(i)
                else:
                    for a in i:
                        file_dict[key].append(a)
    f = open('./merge_test.txt', 'w', encoding="ISO-8859-1")
    f.write(json.dumps(file_dict, separators=(',', ':')))

In [12]:
change_key_name(merged_dict)



Changing key name:   0%|          | 0/188734 [00:00<?, ?it/s][A[A

Changing key name: 100%|██████████| 188734/188734 [00:00<00:00, 1117231.63it/s]


In [13]:
save_uncompressed_index(merged_dict)

Changing key name:  84%|████████▎ | 157887/188734 [00:20<00:00, 525139.69it/s]
saving uncompressed index:  81%|████████  | 153153/188734 [00:14<00:00, 112700.77it/s]
saving uncompressed index: 100%|██████████| 188734/188734 [00:14<00:00, 13020.25it/s]


# [Variables]

In [26]:
from collections import Counter

def term_freq(word, docno):
    return next(item[1] for item in data[word] if item[0] == docno)

def doc_freq(word):
    if data.get(word) is not None:
        return len(data[word])
    else:
        return 0

def doc_length(docno):
    return len(stemmed_dict[docno])

# def doc_length(docno):
#     return len(unstemmed_dict[docno])

def tf_q(queryno, word):
    count = Counter(query_dict[queryno])
    return count[word]

def avg_length():
    total_len = 0
    for docno in docs_dict.keys():
        total_len += doc_length(docno)
    return total_len / total_docs

def doc_filter(word):
    if data.get(word) is not None:
        return [n[0] for n in data[word]]
    else:
        return None

total_docs = len(docs_dict)
vocab_size = len(unq_dict)
avg_len = avg_length()

In [27]:
print('term frequency', term_freq('new', 'AP890101-0001'))
print('document frequency', doc_freq('transcript'))
print('document length', doc_length('AP890101-0001'))
print('average length', avg_len)
print('total # documents', total_docs)
print('vocabulary size', vocab_size)

term frequency 2
document frequency 288
document length 581
average length 261.25758756701856
total # documents 84678
vocabulary size 188734


# [Retrieve Queries]

## Queries for stemmed index

In [3]:
import nltk

querylist = open('../AP_DATA/query_desc.51-100.short.txt')
query_dict = {}
word_arr = []
query_arr = []

stemmer = PorterStemmer()

for line in querylist:
    if line.strip() != '':
        queryno = re.sub('[^A-Za-z0-9]+', '', line.split()[0])
        query = line.split()[1:]
        modified = line[line.find(".")+1:]
        tokens = word_tokenize(modified)
        stemmed_tokens = [stemmer.stem(token) for token in tokens]
        stemmed_tokens = [element for item in stemmed_tokens for element in item.split('-')]
        for i in stemmed_tokens:
            if i not in stop_arr and re.search('[a-zA-Z0-9]', str(i)):
                word_arr.append(i)
        query_dict[queryno] = word_arr
        word_arr = []
        query_arr = []
print(query_dict)

{'85': ['document', 'discuss', 'alleg', 'measur', 'taken', 'corrupt', 'public', 'offici', 'ani', 'government', 'jurisdict', 'worldwid'], '59': ['document', 'report', 'type', 'weather', 'event', 'ha', 'directli', 'caus', 'least', 'fatal', 'locat'], '56': ['document', 'includ', 'predict', 'prime', 'lend', 'rate', 'report', 'actual', 'prime', 'rate', 'move'], '71': ['document', 'report', 'incurs', 'land', 'air', 'water', 'border', 'area', 'countri', 'militari', 'forc', 'second', 'countri', 'guerrilla', 'group', 'base', 'second', 'countri'], '64': ['document', 'report', 'event', 'result', 'polit', 'motiv', 'hostage', 'tak'], '62': ['document', 'report', 'militari', 'coup', "d'etat", 'attempt', 'success', 'ani', 'countri'], '93': ['document', 'describ', 'identifi', 'support', 'nation', 'rifl', 'associ', 'nra', 'asset'], '99': ['document', 'identifi', 'develop', 'iran', 'contra', 'affair'], '58': ['document', 'predict', 'anticip', 'rail', 'strike', 'report', 'ongo', 'rail', 'strike'], '77': 

## Queries for unstemmed index

In [4]:
import nltk

querylist = open('../AP_DATA/query_desc.51-100.short.txt')
query_dict = {}
word_arr = []
query_arr = []

stemmer = PorterStemmer()
# stemmer = SnowballStemmer('english')
stop_arr.append('document')
# stop_arr.append('discuss')

for line in querylist:
    if line.strip() != '':
        queryno = re.sub('[^A-Za-z0-9]+', '', line.split()[0])
        query = line.split()[1:]
        modified = line[line.find(".")+1:]
        tokens = word_tokenize(modified)
        for i in tokens:
            if i not in stop_arr and re.search('[a-zA-Z0-9]', str(i)):
                word_arr.append(i)
        query_dict[queryno] = word_arr
        word_arr = []
        query_arr = []
print(query_dict)

{'85': ['Document', 'discuss', 'allegations', 'measures', 'taken', 'corrupt', 'public', 'officials', 'governmental', 'jurisdiction', 'worldwide'], '59': ['Document', 'report', 'type', 'weather', 'event', 'directly', 'caused', 'least', 'fatality', 'location'], '56': ['Document', 'prediction', 'prime', 'lending', 'rate', 'report', 'actual', 'prime', 'rate', 'move'], '71': ['Document', 'report', 'incursions', 'land', 'air', 'water', 'border', 'area', 'country', 'military', 'forces', 'second', 'country', 'guerrilla', 'group', 'based', 'second', 'country'], '64': ['Document', 'report', 'event', 'result', 'politically', 'motivated', 'hostage-taking'], '62': ['Document', 'report', 'military', 'coup', "d'etat", 'attempted', 'successful', 'country'], '93': ['Document', 'describe', 'identify', 'supporters', 'National', 'Rifle', 'Association', 'NRA', 'assets'], '99': ['Document', 'identify', 'development', 'Iran-Contra', 'Affair'], '58': ['Document', 'predict', 'anticipate', 'rail', 'strike', 're

In [14]:
def add_score(docno, model_scores, score):
    if docno in model_scores:
        model_scores[docno] += score
    else:
        model_scores[docno] = score

def rank_scores(key, scores, out):
    if len(scores) < 1000:
        iter = len(scores)
    else:
        iter = 1000
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for j in range(iter):
        str = ('{} Q0 {} {} {} Exp'
            .format(key, sorted_scores[j][0], j+1, sorted_scores[j][1]))
        out.write(str+"\n")
    scores.clear()

# Models & computing functions

In [15]:
def okapi_tf(tf, doc_len):
    return tf / (tf + 0.5 + 1.5 * (doc_len/avg_len))

def tf_idf(tf, doc_len, df, total_docs):
    return okapi_tf(tf, doc_len) * math.log(total_docs / df)

def okapi_bm25(tf, tf_q, doc_len, df, k1, k2, b, total_docs, avg_len):
    return math.log((total_docs + 0.5) / (df + 0.5)) * \
           ((tf + k1 * tf) / (tf + k1* ((1-b) + b * (doc_len / avg_len)))) * \
           ((tf_q + k2 * tf_q) / (tf_q + k2))

def unigram_lm_laplace(tf, doc_len, voc_size):
    return math.log((tf + 1) / (doc_len + voc_size))

In [16]:
from collections import defaultdict

def compute_okapi_tf(query):
    okapi_tf_out = open('./okapi-tf.txt', "a")
    okapi_tf_scores = defaultdict(lambda: 0.0)
    for key, words in tqdm(query.items(), position=0, leave=True, desc='Computing Okapi-TF'):
        for word in words:
            docs = doc_filter(word)
            if docs is not None:
                for docno in docs:
                    tf = term_freq(word, docno)
                    doc_len = doc_length(docno)
                    add_score(docno, okapi_tf_scores, okapi_tf(tf, doc_len))
        rank_scores(key, okapi_tf_scores, okapi_tf_out)
    okapi_tf_out.close()

In [17]:
from collections import defaultdict

def compute_tf_idf(query):
    tf_idf_out = open('./tf-idf.txt', "a")
    tf_idf_scores = defaultdict(lambda: 0.0)
    for key, words in tqdm(query.items(), position=0, leave=True, desc='Computing TF-IDF'):
        for word in words:
            docs = doc_filter(word)
            df = doc_freq(word)
            if docs is not None:
                for docno in docs:
                    tf = term_freq(word, docno)
                    doc_len = doc_length(docno)
                    add_score(docno, tf_idf_scores, tf_idf(tf, doc_len, df, total_docs))
        rank_scores(key, tf_idf_scores, tf_idf_out)
    tf_idf_out.close()

In [18]:
from collections import defaultdict
import math

def compute_okapi_bm25(query):
    okapi_bm25_out = open('./okapi-bm25.txt', "a")
    okapi_bm25_scores = defaultdict(lambda: 0.0)
    for key, words in tqdm(query.items(), position=0, leave=True, desc='Computing Okapi-BM25'):
        for word in words:
            docs = doc_filter(word)
            if docs is not None:
                for docno in docs:
                    df = len(docs)
                    tf = term_freq(word, docno)
                    doc_len = doc_length(docno)
                    qf = tf_q(key, word)
                    add_score(docno, okapi_bm25_scores, okapi_bm25(tf, qf, doc_len, df, 1.2, 100, 0.75, total_docs, avg_len))
        rank_scores(key, okapi_bm25_scores, okapi_bm25_out)
    okapi_bm25_out.close()

In [19]:
def compute_unigram_lm_laplace(query):
    laplace_scores = defaultdict(lambda: 0.0)
    out = open('./unigram-lm-laplace.txt', "a")
    for key,words in tqdm(query.items(), position=0, leave=True, desc='Computing Language Models(Unigram-LM-Laplace)'):
        for word in words:
            docs=doc_filter(word)
            for docno in docs_dict.keys():
                if docs is not None:
                    if docno in docs:
                        tf = term_freq(word, docno)
                        doc_len = doc_length(docno)
                    else:
                        tf = 0
                        doc_len = 0
                score = unigram_lm_laplace(tf, doc_len, vocab_size)
                add_score(docno, laplace_scores, score)
        rank_scores(key, laplace_scores, out)
    out.close()

# Compute VSM (TF-IDF)

In [36]:
compute_tf_idf(query_dict)

Computing TF-IDF: 100%|██████████| 25/25 [02:45<00:00,  6.61s/it]


# Compute Okapi-BM25

In [37]:
compute_okapi_bm25(query_dict)

Computing Okapi-BM25: 100%|██████████| 25/25 [02:54<00:00,  6.98s/it]


# Compute Laplace

In [None]:
compute_unigram_lm_laplace(query_dict)

# [Proximity Search]

In [23]:
def diff(blurb):
    return max(blurb) - min(blurb)

def compute_min_range(possible_combos):
    blurb_list = []
    for blurb in possible_combos:
        blurb_list.append(diff(blurb))
    return min(blurb_list)

def rank_and_retrieve(key, scores):
    top_k_scores = {}
    if len(scores) < 1000:
        iter = len(scores)
    else:
        iter = 1000
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for j in range(iter):
        # str = ('{} Q0 {} {} {} Exp'
        #     .format(key, sorted_scores[j][0], j+1, sorted_scores[j][1]))
        # print(str+"\n")
        top_k_scores[sorted_scores[j][0]] = sorted_scores[j][1]
    return top_k_scores

def proximity_search(min_range, num_terms, docno):
    return (1500 - min_range) * num_terms / (doc_length(docno) + vocab_size)

In [29]:
import itertools
from collections import defaultdict

prox_scores = defaultdict(lambda: 0.0)
out = open('./proximity-search.txt', "a")

countt = 0
keys = [*query_dict]

for qryno in tqdm(query_dict.keys(), position=0, leave=True):
    for docno in stemmed_dict.keys():
        num_terms = 0
        q_dict = {}
        w_list = [n[0] for n in stemmed_dict[docno]]
        for q in query_dict[qryno]:
            if q in w_list:
                num_terms += 1
        for q in query_dict[qryno]:
            if q in w_list and num_terms > 0:
                q_dict[q] = [n[3] for n in stemmed_dict[docno] if n[0] == q]
        if q_dict != {}:
            possible_combos = [values for values in q_dict.values()]
            possible_combos = list(itertools.product(*possible_combos))
            min_range = compute_min_range(possible_combos)
            add_score(docno, prox_scores, proximity_search(min_range, num_terms, docno))
    rank_scores(keys[countt], prox_scores, out)
    countt += 1

100%|██████████| 25/25 [02:12<00:00,  5.31s/it]


# [Proxmity Search using Feedback]

## Stemmed

In [33]:
import itertools
from collections import defaultdict
    
def compute_proximity_search(query):
    countt = 0
    keys = [*query_dict]
    proximity_out = open('./proximity-search.txt', "a")
    tf_idf_scores = defaultdict(lambda: 0.0)
    prox_scores = defaultdict(lambda: 0.0)
    for key, words in tqdm(query.items(), position=0, leave=True, desc='Computing Proximity Search'):
        for word in words:
            docs = doc_filter(word)
            if docs is not None:
                for docno in docs:
                    df = len(docs)
                    tf = term_freq(word, docno)
                    doc_len = doc_length(docno)
                    qf = tf_q(key, word)
                    add_score(docno, tf_idf_scores, tf_idf(tf, doc_len, df, total_docs))
        top_k_scores = rank_and_retrieve(key, tf_idf_scores)
        for docno in top_k_scores.keys():
            num_terms = 0
            q_dict = {}
            w_list = [n[0] for n in stemmed_dict[docno]]
            for q in query_dict[key]:
                if q in w_list:
                    num_terms += 1
            for q in query_dict[key]:
                if q in w_list and num_terms > 0:
                    q_dict[q] = [n[3] for n in stemmed_dict[docno] if n[0] == q]
            if q_dict != {}:
                possible_combos = [values for values in q_dict.values()]
                possible_combos = list(itertools.product(*possible_combos))
                min_range = compute_min_range(possible_combos)
                if min_range != 0:
                    add_score(docno, prox_scores, proximity_search(min_range, num_terms, docno))
        for docno in top_k_scores.keys():
            top_k_scores[docno] += prox_scores[docno]
        rank_scores(keys[countt], top_k_scores, proximity_out)
        prox_scores.clear()
        top_k_scores.clear()
        tf_idf_scores.clear()
        countt += 1

compute_proximity_search(query_dict)

Computing Proximity Search: 100%|██████████| 25/25 [15:13<00:00, 36.55s/it]


## Unstemmed

In [35]:
import itertools
from collections import defaultdict

def compute_proximity_search_unstemmed(query):
    countt = 0
    keys = [*query_dict]
    proximity_out = open('./proximity-search-unstemmed.txt', "a")
    tf_idf_scores = defaultdict(lambda: 0.0)
    prox_scores = defaultdict(lambda: 0.0)
    for key, words in tqdm(query.items(), position=0, leave=True, desc='Computing Proximity Search'):
        for word in words:
            docs = doc_filter(word)
            if docs is not None:
                for docno in docs:
                    df = len(docs)
                    tf = term_freq(word, docno)
                    doc_len = doc_length(docno)
                    qf = tf_q(key, word)
                    add_score(docno, tf_idf_scores, tf_idf(tf, doc_len, df, total_docs))
        top_k_scores = rank_and_retrieve(key, tf_idf_scores)
        for docno in top_k_scores.keys():
            num_terms = 0
            q_dict = {}
            w_list = [n[0] for n in unstemmed_dict[docno]]
            for q in query_dict[key]:
                if q in w_list:
                    num_terms += 1
            for q in query_dict[key]:
                if q in w_list and num_terms > 0:
                    q_dict[q] = [n[3] for n in unstemmed_dict[docno] if n[0] == q]
            if q_dict != {}:
                possible_combos = [values for values in q_dict.values()]
                possible_combos = list(itertools.product(*possible_combos))
                min_range = compute_min_range(possible_combos)
                if min_range != 0:
                    add_score(docno, prox_scores, proximity_search(min_range, num_terms, docno))
        for docno in top_k_scores.keys():
            top_k_scores[docno] += prox_scores[docno]
        rank_scores(keys[countt], top_k_scores, proximity_out)
        prox_scores.clear()
        top_k_scores.clear()
        tf_idf_scores.clear()
        countt += 1

compute_proximity_search_unstemmed(query_dict)

Computing Proximity Search: 100%|██████████| 25/25 [03:00<00:00,  7.24s/it]
