In [1]:
import os
import re
import nltk
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /Users/harsh/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/harsh/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/harsh/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Question 1: Preprocessing

In [6]:
def extract_data(folder, new_folder):
    files = os.listdir(folder)
    for file in files:
        path = os.path.join(os.getcwd(), folder, file)
        with open(path) as fp:
            soup = BeautifulSoup(fp, 'html.parser')
            text = soup.findAll("text")[0].text
            title = soup.findAll("title")[0].text
            final_text = title + " " + text
        new_file_path = os.path.join(os.getcwd(), new_folder, file)
        with open(new_file_path, "w") as fw:
            fw.write(final_text)
            fw.close()

In [7]:
new_folder = 'Dataset'
try:
    os.mkdir(new_folder)
except:
    print(new_folder, " already exists.")
extract_data('CSE508_Winter2023_Dataset', new_folder)


Dataset  already exists.


In [None]:
def preprocess(text):

    text = text.lower()

    tokens = word_tokenize(text)
    
    final = [word for word in tokens if word not in stop_words]

    tokens = [word for word in final if word not in string.punctuation]
     
    final = [word for word in tokens if len(re.findall(r'\s+', word)) == 0]
    
    return final

In [9]:
L = []
idtoName = {}
files = os.listdir("Dataset")
for i, file in enumerate(files):
    idtoName[i] = file
    path = os.path.join(os.getcwd(), "Dataset", file)
    with open(path) as fp:
        text = fp.read()
        L.append(preprocess(text))

# Question 2: Boolean Queries

## Unigram Inverted Index

In [20]:
def unigram_inverted_index(doc_list):
    uni_inv_idx = {}
    for doc_id, tokens in enumerate(doc_list):
        for idx, token in enumerate(tokens):
            if token in uni_inv_idx:
                if doc_id not in uni_inv_idx[token]:
                    uni_inv_idx[token].append(doc_id)
            else:
                uni_inv_idx[token] = [doc_id]

    return uni_inv_idx

In [21]:
uni_inv_idx = unigram_inverted_index(L)

<h3>Saving to Pickle

In [22]:
import pickle
filehandler = open("uni_inv_idx.obj","wb")
pickle.dump(uni_inv_idx, filehandler)
filehandler.close()

<h3>Loading from Pickle

In [23]:
file = open("uni_inv_idx.obj",'rb')
uni_inv_idx = pickle.load(file)
file.close()

# Queries

In [24]:
def andQuery(L1, L2):
    ans = []
    comparison = 0
    len1, len2 = len(L1), len(L2)
    
    i , j = 0, 0
    while i < len1 and j < len2:
        if L1[i] == L2[j]:
            ans.append(L1[i])
            i += 1
            j += 1
        elif L1[i] < L2[j]:
            i += 1
        else:
            j += 1
        comparison += 1
    
    return ans, comparison

In [25]:
def andNotQuery(L1, L2):
    _L2 = [i for i in range(1400) if i not in L2]   
    return andQuery(L1, _L2)

In [26]:
def orQuery(L1, L2):
    ans = []
    comparison = 0
    len1, len2 = len(L1), len(L2)
    
    i , j = 0, 0
    while i < len1 and j < len2:
        if L1[i] == L2[j]:
            ans.append(L1[i])
            i += 1
            j += 1
        elif L1[i] < L2[j]:
            ans.append(L1[i])
            i += 1
        else:
            ans.append(L2[j])
            j += 1
            
        comparison += 1
    
    while i < len1:
        ans.append(L1[i])
        i += 1
    
    while j < len2:
        ans.append(L2[j])
        j += 1

    return ans, comparison

In [27]:
def orNotQuery(L1, L2):
    _L2 = [i for i in range(1400) if i not in L2]   
    return orQuery(L1, _L2)

## General Queries

In [28]:
def process_query(i, query, operator, comparisons):
    if i == len(operator):
        return query, comparisons
    
    res = []
    comp = 0

    if 'OR' in operator[i] and 'NOT' not in operator[i]:
        # print('OR')
        res, comp = orQuery(query[0], query[1])

    if 'AND' in operator[i] and 'NOT' not in operator[i]:
        # print('AND')
        res, comp = andQuery(query[0], query[1])

    if 'AND NOT' in operator[i]:
        # print('AND NOT')
        res, comp = andNotQuery(query[0], query[1])
    
    if 'OR NOT' in operator[i]:
        # print('OR NOT')
        res, comp = orNotQuery(query[0], query[1])
    
    del query[ : 2]
    query.insert(0, res)
    comparisons += comp
    
    return process_query(i + 1, query, operator, comparisons)


# Unigram query input and output

In [84]:
def unigram_queries(queries, operand, uni_inv_idx):
    queries_expression, no_of_docs, doc_names, no_of_comp = list(), list(), list(), list()
    for idx in range(len(queries)):
        op = operand[idx]
        ip1 = queries[idx]
        op = op.split(',')
        ip1 = preprocess(ip1)
        query = [uni_inv_idx[i] if i in uni_inv_idx else [] for i in ip1]

        if len(query) != len(op) + 1:
            queries_expression.append("Inappropriate query !!!. Input Mismatch")
            no_of_docs.append(-1)
            doc_names.append(list())
            no_of_comp.append(-1)
            continue

        sent = ""
        p, q = 0, 0
        for idx in range(len(query) + len(op)):
            if idx % 2 == 0:
                sent += ip1[p] + " "
                p += 1
            else:
                sent += op[q] + " "
                q += 1

        comparisons = 0
        output, comparisons = process_query(0, query, op, comparisons)
        docs = [idtoName[i] for i in output[0]]
        queries_expression.append(sent)
        no_of_docs.append(len(output[0]))
        doc_names.append(docs)
        no_of_comp.append(comparisons)
    
    return queries_expression, no_of_docs, doc_names, no_of_comp

    

In [86]:
N = int(input("Enter the number of queries."))
count = 1
queries = []
operand = []
while count <= N:
    query = input("Query : ")
    oper = input("Operand : ")
    queries.append(query)
    operand.append(oper)
    count +=1

queries_expression, no_of_docs, doc_names, no_of_comp = unigram_queries(queries, operand, uni_inv_idx)
for idx in range(len(queries)):
    print(f"Query {idx + 1}: ", queries_expression[idx])
    print(f"Number of documents retrieved for query {idx + 1}: ", no_of_docs[idx])
    print(f"Names of the documents retrieved for query {idx + 1}: ", doc_names[idx])
    print(f"Number of comparisons required for query {idx + 1}: ", no_of_comp[idx])

Enter the number of queries.1
Query : one two three
Operand : OR,AND
Query 1:  one OR two AND three 
Number of documents retrieved for query 1:  36
Names of the documents retrieved for query 1:  ['cranfield0750', 'cranfield0536', 'cranfield0641', 'cranfield0679', 'cranfield1357', 'cranfield1392', 'cranfield1351', 'cranfield0609', 'cranfield0059', 'cranfield0456', 'cranfield1125', 'cranfield0140', 'cranfield1092', 'cranfield0773', 'cranfield0582', 'cranfield0364', 'cranfield0791', 'cranfield1049', 'cranfield0844', 'cranfield0620', 'cranfield0673', 'cranfield0476', 'cranfield0672', 'cranfield0006', 'cranfield0454', 'cranfield1145', 'cranfield0064', 'cranfield0856', 'cranfield0660', 'cranfield1325', 'cranfield0266', 'cranfield1144', 'cranfield1036', 'cranfield0785', 'cranfield0948', 'cranfield0187']
Number of comparisons required for query 1:  890


# Question 3: Phrase Queries

## Bigram Inverted Index

In [48]:
def bigram_inverted_index(L, files):
    bi_inv_idx = {}
    for doc_id, tokens in enumerate(L):
        for idx, _ in enumerate(tokens):
#             print(tokens[idx])
            if idx <= len(tokens) - 2:
                bigram_word = tokens[idx] + " " + tokens[idx + 1]
                if bigram_word not in bi_inv_idx:
                    bi_inv_idx[bigram_word] = list()
                    bi_inv_idx[bigram_word].append(doc_id)
                else:
                    if doc_id not in bi_inv_idx[bigram_word]:
                        bi_inv_idx[bigram_word].append(doc_id)
                    
    return bi_inv_idx

In [49]:
bi_inv_idx = bigram_inverted_index(L, files)

In [52]:
filehandler = open("bi_inv_idx.obj","wb")
pickle.dump(bi_inv_idx, filehandler)
filehandler.close()

In [53]:
file = open("bi_inv_idx.obj",'rb')
bi_inv_idx = pickle.load(file)
file.close()

## Bigram Queries

In [78]:
def bigram_queries(queries, bi_inv_idx):
    no_of_docs = []
    doc_names = []
    for query in queries:
        query = preprocess(query)
#         print(query)
        bigram_words  = []
        for idx in range(len(query)):
            if idx <= len(query) - 2:
                bigram_words.append(query[idx] + " " + query[idx + 1])
#         print(bigram_words)
        operand = []  
        for idx in range(len(bigram_words) - 1):
            operand.append('AND')

#         print(operand)
        query_doc_list = [bi_inv_idx[i] if i in bi_inv_idx else [] for i in bigram_words]
#         print(query_doc_list)
        
        comparisons = 0
        output, _ = process_query(0, query_doc_list, operand, comparisons)
        docs = [idtoName[i] for i in output[0]]
        no_of_docs.append(len(output[0]))
        doc_names.append(docs)
        
    return no_of_docs, doc_names

        

## Final Query Input and Output

In [80]:
N = int(input("Enter the number of queries."))
count = 1
queries = []
while count <= N:
    query = input("Query : ")
    queries.append(query)
    count +=1

no_of_docs, doc_names = bigram_queries(queries, bi_inv_idx)
for idx in range(len(queries)):
    print(f"Number of documents retrieved for query {idx + 1} using bigram inverted index: ", no_of_docs[idx])
    print(f"Names of documents retrieved for query  {idx + 1} using bigram inverted index: ", doc_names[idx])
    


Enter the number of queries.1
Query : free-flight measurements static
Number of documents retrieved for query 1 using bigram inverted index:  10
Names of documents retrieved for query  1 using bigram inverted index:  ['cranfield1011', 'cranfield1010', 'cranfield1003', 'cranfield1004', 'cranfield1005', 'cranfield1009', 'cranfield1007', 'cranfield1000', 'cranfield1006', 'cranfield1008']
