# Importing Modules

In [1]:
import numpy as np
import re

# Construction of Matrix

In [2]:
def termIncidenceMatrix(l,n):
    """Build a boolean term-document incidence matrix from text files.

    Every file is read, lower-cased, and each character outside ``[a-z0-9]``
    is replaced by a space before splitting on whitespace, so terms are
    lower-case alphanumeric tokens.

    Parameters
    ----------
    l : iterable of str
        Paths of the documents to index; document ``j`` becomes column ``j``.
    n : int
        Number of columns of the matrix (the first ``n`` documents of ``l``).

    Returns
    -------
    mat : numpy.ndarray of bool, shape (len(terms), n)
        ``mat[i][j]`` is True when ``terms[i]`` occurs in document ``j``.
    terms : list of str
        All distinct terms found across the documents, sorted.
    """
    docs = []
    terms = set()

    for path in l:
        # The context manager closes the handle as soon as the text is read,
        # instead of leaving the file object to be cleaned up implicitly.
        with open(path,'r') as handle:
            text = handle.read().lower()
        content = re.sub("[^a-z0-9]"," ",text).split()
        terms.update(content)
        docs.append(set(content))

    terms = sorted(terms)
    # Starts all-False, so only membership needs to be written below.
    mat = np.zeros((len(terms),n),dtype=bool)
    for i, term in enumerate(terms):
        for j in range(n):
            mat[i][j] = term in docs[j]
    return mat,terms

# Postfix Building

In [3]:
def precedence(op):
    """Return the binding strength of a query token.

    'NOT' binds tightest (3), followed by 'AND' (2) and 'OR' (1).
    Any other token (for example a parenthesis or a search term) ranks 0.
    """
    ranking = (('NOT', 3), ('AND', 2), ('OR', 1))
    for token, rank in ranking:
        if op == token:
            return rank
    return 0

In [4]:
def postfix(l):
    """Convert a tokenised infix Boolean query into postfix order.

    Parameters
    ----------
    l : iterable of str
        Query tokens: the operators 'AND', 'OR', 'NOT', the parentheses
        '(' and ')', and search terms (anything else).

    Returns
    -------
    list of str
        The same operators and terms in postfix (reverse Polish) order,
        with parentheses removed.

    Raises
    ------
    ValueError
        If the parentheses in ``l`` are not balanced.
    """
    pf = []
    stack = []
    for tok in l:
        if tok == 'NOT':
            # Unary prefix operator: its operand has not been seen yet, so
            # nothing on the stack may be emitted because of it. Popping on
            # equal precedence turned "NOT NOT x" into "NOT x NOT".
            stack.append(tok)
        elif tok in ('AND', 'OR'):
            # Left-associative binary operators: emit every stacked operator
            # that binds at least as tightly. '(' ranks 0 and stops the loop.
            while stack and precedence(tok) <= precedence(stack[-1]):
                pf.append(stack.pop())
            stack.append(tok)
        elif tok == '(':
            stack.append(tok)
        elif tok == ')':
            while stack and stack[-1] != '(':
                pf.append(stack.pop())
            if not stack:
                raise ValueError("unbalanced parentheses: unexpected ')'")
            stack.pop()  # discard the matching '('
        else:
            pf.append(tok)
    while stack:
        top = stack.pop()
        if top == '(':
            # An unclosed '(' would otherwise leak into the output as a term.
            raise ValueError("unbalanced parentheses: missing ')'")
        pf.append(top)
    return pf

In [5]:
def getIncidences(s,d,n):
    """Resolve an operand to its incidence vector.

    Parameters
    ----------
    s : str or other
        A search term to look up, or an already computed value, which is
        handed back unchanged.
    d : mapping
        Lookup table from term to incidence vector.
    n : int
        Length of the all-False vector returned for a term missing from ``d``.

    Returns
    -------
    ``d[s]`` for a known term, ``np.zeros(n, dtype=bool)`` for an unknown
    term, or ``s`` itself when it is not a string.
    """
    if not isinstance(s,str):
        return s
    try:
        return d[s]
    except KeyError:
        # Only a missing term means "occurs in no document"; any other
        # failure is a real error and must not be masked as an empty result.
        return np.zeros(n,dtype=bool)

# Query Evaluation

In [6]:
def queryEval(query,d,n):
    """Evaluate a Boolean query against the term incidence vectors.

    The query is split on whitespace, converted to postfix order with
    ``postfix`` and evaluated with a stack. Terms are resolved through
    ``getIncidences``, so a term missing from ``d`` matches no document.

    Parameters
    ----------
    query : str
        Whitespace-separated tokens: terms, 'AND', 'OR', 'NOT', '(' and ')'.
    d : mapping
        Lookup table from term to incidence vector. The vectors must support
        ``~``, ``&`` and ``|`` (presumably boolean NumPy arrays of length
        ``n`` -- confirm against the caller).
    n : int
        Number of documents; length of the vector used for unknown terms.

    Returns
    -------
    The incidence vector of the documents satisfying the query.

    Raises
    ------
    ValueError
        If the query is empty, an operator lacks operands, or operands are
        left over without an operator joining them.
    """
    pf = postfix(query.split())
    stack = []
    for tok in pf:
        if tok == 'NOT':
            if not stack:
                raise ValueError("malformed query: 'NOT' has no operand")
            r = ~getIncidences(stack.pop(),d,n)
        elif tok in ('AND','OR'):
            if len(stack) < 2:
                raise ValueError("malformed query: '"+tok+"' needs two operands")
            op1 = getIncidences(stack.pop(),d,n)
            op2 = getIncidences(stack.pop(),d,n)
            r = op1&op2 if tok == 'AND' else op1|op2
        else:
            # Terms stay as strings until an operator consumes them.
            stack.append(tok)
            continue
        stack.append(r)
    if len(stack) != 1:
        # Empty query, or operands with no operator joining them.
        raise ValueError("malformed query: expected exactly one result")
    # Resolving here also covers a query that is a single bare term.
    return getIncidences(stack.pop(),d,n)

In [7]:
# Non-interactive alternative to the prompts below:
# n = 4
# l = np.array(["doc1.txt","doc2.txt","doc3.txt","doc4.txt"])
n = int(input("Enter the no. of documents : "))
# Ask for each file name in turn; kept as an array so it can be indexed
# with a boolean mask.
l = np.array([input("Enter the doc - "+str(k)+" name : ") for k in range(1, n+1)])
TIM,terms = termIncidenceMatrix(l,n)
print("the generated matrix is : \n",TIM.astype(int))
# Map each term to its row of the matrix.
d = dict(zip(terms,TIM))


Enter the no. of documents : 4
Enter the doc - 1 name : doc1.txt
Enter the doc - 2 name : doc2.txt
Enter the doc - 3 name : doc3.txt
Enter the doc - 4 name : doc4.txt
the generated matrix is : 
 [[0 1 0 1]
 [1 1 1 1]
 [1 0 0 0]
 [0 1 0 0]
 [0 0 0 1]
 [0 0 1 0]
 [0 0 1 0]
 [1 0 0 1]
 [1 0 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 1 0]
 [1 1 1 0]
 [1 1 1 0]
 [0 0 1 0]
 [1 0 0 1]
 [0 1 0 0]
 [0 0 0 1]
 [1 0 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 0 1]
 [0 0 1 0]
 [1 0 0 0]
 [0 0 0 1]
 [0 1 0 0]
 [0 1 1 1]
 [0 0 1 0]
 [0 0 0 1]]


In [8]:
# Prompt shown to the user; the tokeniser splits on whitespace only, hence
# the spacing rules.
rules = """
The term incidence matrix has been created.
Rules for entering the query:
 1.There should be a single space between operand and operator
 2.There should be space before and after '(' , ')'
 3.Boolean operators should be in capital and words in small letters.
Enter the query:
"""
query = input(rules)
# Boolean mask with one entry per document in `l`.
resultDoc = queryEval(query,d,n).astype(bool)
matches = l[resultDoc]
if len(matches) == 0:
    print("No matching documents")
else:
    print(matches)


The term insurance matrix have been created.
Rules for entering the query:
 1.There should be a single space between operand and operator
 2.There should be space before And after '(' , ')'
 3.Boolean operators should be in capital and words in small letters.
Enter the query:
words AND pen OR NOT sheet
['doc2.txt' 'doc3.txt']
