In [98]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import sent_tokenize , word_tokenize
import glob
import re
import os
import numpy as np
import sys
Stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [99]:
document=[
    "Doc1.txt",
    "Doc2.txt",
    "Doc3.txt",
    "Doc4.txt",
    "Doc5.txt"
]

In [100]:
index = 1
files_with_index = {}
text=[]
for doc in document:
    print(doc)
    fname = doc
    file = open(doc , "r") #opening the document
    text.append(file.read()) #reading documents 
    files_with_index[index] = os.path.basename(fname)
    index = index + 1

Doc1.txt
Doc2.txt
Doc3.txt
Doc4.txt
Doc5.txt


In [101]:
docmnt = [w[:-4] for w in document]  #slicing doc filename to store as DocID in dict
docs = {docmnt[i]: text[i].lower() for i in range(len(document))} #creating dictionary with key:DocID 
                                                                   #value as text read from docs
print(docs)

{'Doc1': 'information retrieval systems is used with database systems', 'Doc2': 'information is in storage storage', 'Doc3': 'digital speech systems can be used in synthesis and systems', 'Doc4': 'speech filtering, speech retrieval systems are applications of information retrieval', 'Doc5': 'database management system is used for storage storage'}


In [102]:
#importing necessary libraries for calculations
import glob  
import math
import string
import numpy as np
import pandas as pd


from collections import Counter
from collections import OrderedDict

In [103]:
def pre_process(docs):
    stop = stopwords.words('english') + list(string.punctuation) + ['\n']
    words = []
    for doc in docs.values():
        for word in word_tokenize(doc.lower().strip()): 
            if not word in stop:
                words.append(word)
    return words  

wordlist = pre_process(docs)   #preprocessing the dictionary values
print(wordlist)

['information', 'retrieval', 'systems', 'used', 'database', 'systems', 'information', 'storage', 'storage', 'digital', 'speech', 'systems', 'used', 'synthesis', 'systems', 'speech', 'filtering', 'speech', 'retrieval', 'systems', 'applications', 'information', 'retrieval', 'database', 'management', 'system', 'used', 'storage', 'storage']


In [104]:
# Bag of Words (bow)
bow = list(set(wordlist))  # using set for unique words
bow

['retrieval',
 'synthesis',
 'management',
 'speech',
 'information',
 'applications',
 'used',
 'filtering',
 'system',
 'storage',
 'digital',
 'systems',
 'database']

Question 2. Construct a Boolean Model for the words by considering documents 1, 2, 3,4 and 5.

In [105]:
def booleanModel(bow, doc_dict):
    boolean_dict = {}
    for doc_id in doc_dict.keys():
        boolean_dict[doc_id] = {}

    for word in bow:
        for doc_id, doc in doc_dict.items():
            if doc.count(word) > 1:
                boolean_dict[doc_id][word] = 1 #if the word is present assigning 1
            else:
                boolean_dict[doc_id][word] = doc.count(word) #else 0
    return boolean_dict

In [106]:
# Boolean Vector Values
boolean_dict = booleanModel(bow, docs)
bool_vect = pd.DataFrame(boolean_dict)
bool_vect

Unnamed: 0,Doc1,Doc2,Doc3,Doc4,Doc5
retrieval,1,0,0,1,0
synthesis,0,0,1,0,0
management,0,0,0,0,1
speech,0,0,1,1,0
information,1,1,0,1,0
applications,0,0,0,1,0
used,1,0,1,0,1
filtering,0,0,0,1,0
system,1,0,1,1,1
storage,0,1,0,0,1


In [107]:
def queryVector(query, bow):   #query vector
    q_bool = {}
    for word in bow:
        if word in query:
            q_bool[word] = 1 #when query word is present in bow 
        else:
            q_bool[word] = 0
    return q_bool

In [108]:
def simpleMatch(q_bool, dictinry):  #for simple match
    doc_ids = []
    for dc_id, doc in dictinry.items():
        for word in doc:
            if q_bool[word] == 1 and doc[word] == 1: #when both value matches to 1
                if dc_id not in doc_ids:
                    doc_ids.append(dc_id)
    return doc_ids

In [109]:
def weightMatch(q_bool, dictinry): #for weighted match
    doc_ids = []
    doc_counts = {}
    for d_id, doc in dictinry.items():
        c = 0
        for word in doc:
            if q_bool[word] == 1 and doc[word] == 1:
                c += 1
        doc_counts[d_id] = c
    sorted_d = sorted(doc_counts.items(), key = lambda kv:kv[1], reverse = True)
    
    for d in sorted_d:
        doc_ids.append(d[0])
        
    return doc_ids

In [110]:
def booleanQuery(query, dictinry, bow, match_type):
    doc_ids = []
    query_list = query.lower().split()
    q_bool = queryVector(query_list, bow)
    if match_type.lower() == "simple":
        doc_ids = simpleMatch(q_bool, dictinry)

    elif match_type.lower() == "weighted":
        doc_ids = weightMatch(q_bool, dictinry)

    for i in doc_ids:
        print(i)

Question 2.a. Retrieve the documents for the Boolean query “Information Retrieval Synthesis” using simple match.

In [111]:
booleanQuery('Information Retrieval Synthesis', boolean_dict, bow, 'simple')  

Doc1
Doc2
Doc3
Doc4


Question 2.b. Retrieve the documents for the Boolean query “Database Retrieval Storage” using weighted match. (Rank the documents in the order of relevance)

In [112]:
booleanQuery('Database Retrieval Storage', boolean_dict, bow, 'weighted')  # Ranked in order of relevance

Doc1
Doc5
Doc2
Doc4
Doc3


Question 3. Construct a vector space model to build the term weights. Compute the TF-IDF and identify the most important terms across the documents.

In [113]:
# Term frequency in each document (TF)
def termFrequency(bow, doc_dict):
    tf_documents = {}
    for doc_id in doc_dict.keys():
        tf_documents[doc_id] = {}
    
    for word in bow:
        for doc_id,doc in doc_dict.items():
            tf_documents[doc_id][word] = doc.count(word) / len(doc.split()) #occurence in docid/total words in that doc
    return tf_documents

In [114]:
# Document frequency (DF)
def docFrequeny(bow, doc_dict):
    df = {}
    for word in bow:
        frq = 0
        for doc in doc_dict.values():
            # if word in doc.lower().split():
            if word in word_tokenize(doc.lower().strip()):
                frq = frq + 1
        df[word] = frq  
    return df    

In [115]:
# Inverse Document Frequency (IDF): log(N/ni)
def IDF(bow, doc_freq, length):
    idf= {} 
    for word in bow:     #total docs in which word is present
        idf[word] = np.log10(length / doc_freq[word])
    return idf

In [116]:
# TF-IDF (weights: tf*idf)
def tfIdf(bow,tf, idf_scr, doc_dict):
    tf_idf_scr = {}
    for doc_id in doc_dict.keys():
        tf_idf_scr[doc_id] = {}
    for word in bow:
        for doc_id,doc in doc_dict.items():
            tf_idf_scr[doc_id][word] = tf[doc_id][word] * idf_scr[word]
    return tf_idf_scr

In [117]:
tf = termFrequency(bow, docs)  # term frequency
df = docFrequeny(bow, docs)  # doc frequency
idf = IDF(bow, df, len(docs))  # inverse doc frequency
tf_idf = tfIdf(bow, tf, idf, docs)  # TF-IDF
# TF-IDF
tfidf_df = pd.DataFrame(tf_idf)
tfidf_df

Unnamed: 0,Doc1,Doc2,Doc3,Doc4,Doc5
retrieval,0.049743,0.0,0.0,0.079588,0.0
synthesis,0.0,0.0,0.069897,0.0,0.0
management,0.0,0.0,0.0,0.0,0.087371
speech,0.0,0.0,0.039794,0.079588,0.0
information,0.027731,0.04437,0.0,0.022185,0.0
applications,0.0,0.0,0.0,0.069897,0.0
used,0.027731,0.0,0.022185,0.0,0.027731
filtering,0.0,0.0,0.0,0.069897,0.0
system,0.174743,0.0,0.139794,0.069897,0.087371
storage,0.0,0.159176,0.0,0.0,0.099485


In [118]:
def vectorSpaceModel(query, doc_dict, tfidf_scr):
    bools = ['and', 'or', 'not']
    query_bow = []
    for word in query.lower().split():
        if word not in query_bow and word not in bools:
            query_bow.append(word)

    query_wc = {}
    for word in query_bow:
        query_wc[word] = query.lower().split().count(word)
    
    importance_scores = {}
    if 'or' in query.lower().split():        
        for doc_id in doc_dict.keys():
            score = 0
            for word in query_bow:
                score += query_wc[word] * tfidf_scr[doc_id][word]
            importance_scores[doc_id] = score
    if 'and' in query.lower().split():
        word1 = query_bow[0]
        word2 = query_bow[1]
        for doc_id in doc_dict.keys():
            score = 0
            if word1 in doc_dict[doc_id] and word2 in doc_dict[doc_id]:
                score += query_wc[word1] * tfidf_scr[doc_id][word1]  # Word before AND
                score += query_wc[word2] * tfidf_scr[doc_id][word2]  # Word after AND
            importance_scores[doc_id] = score
    
    if 'not' in query.lower().split():
        word1 = query_bow[0]
        word2 = query_bow[1]
        for doc_id in doc_dict.keys():
            score = 0
            if word1 in doc_dict[doc_id] and word2 not in doc_dict[doc_id]:
                score += query_wc[word1] * tfidf_scr[doc_id][word1]  # Word before AND
                score += query_wc[word2] * tfidf_scr[doc_id][word2]  # Word after AND NOT
            importance_scores[doc_id] = score
            
    sorted_value = OrderedDict(sorted(importance_scores.items(), key=lambda x: x[1], reverse = True))
    ranked = {k: sorted_value[k] for k in list(sorted_value)}
    return ranked

In [119]:
def doc_rank(query_dict):
    for k in query_dict.keys():
        print(k)

Question 3.a. Rank all the documents in the collection for the query “Speech” AND “Systems”? (Rank the documents in the order of relevance)

In [120]:
q1 = vectorSpaceModel('Speech AND Systems', docs, tf_idf)
doc_rank(q1) # by weights

Doc4
Doc3
Doc1
Doc2
Doc5


Question 3.b. Rank all the documents in the collection for the query “Database” OR “Systems”? (Rank the documents in the order of relevance)

In [121]:
q2 = vectorSpaceModel('Database OR Systems', docs, tf_idf)
doc_rank(q2)

Doc1
Doc5
Doc3
Doc4
Doc2


Question 3.c. Rank all the documents in the collection for the query contains “Systems” AND NOT “Information” (Rank the documents in the order of relevance)

In [122]:
q3 = vectorSpaceModel('Systems AND NOT Information', docs, tf_idf)
doc_rank(q3)

Doc3
Doc1
Doc2
Doc4
Doc5


Question 4. Compute the cosine similarities between docs 1 and docs 2.

In [123]:
np.array(bool_vect['Doc1'])

array([1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1])

In [124]:
np.array(bool_vect['Doc2'])

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0])

In [125]:
def cosine_similarity(doc1, doc2):
    D1 = np.array(bool_vect[doc1])
    D2 = np.array(bool_vect[doc2])
    numerator = np.sum(np.multiply(D1, D2))  #  A . B
    denominator = math.sqrt(np.sum(np.square(D1))*np.sum(np.square(D2)))  #||A||X||B||
    return numerator / denominator

In [126]:
cosine_similarity('Doc1', 'Doc2')

0.2886751345948129

Question 5. Compute Dice Co-efficient between docs 3 and docs 4.

In [127]:
np.array(bool_vect['Doc3'])

array([0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0])

In [128]:
np.array(bool_vect['Doc4'])

array([1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0])

In [129]:
def dice_coeff(doc1, doc2):
    D1 = np.array(bool_vect[doc1])
    D2 = np.array(bool_vect[doc2])
    numerator = 2 * (np.sum(np.multiply(D1, D2)))  # 2 * |A inter B|
    denominator = np.sum(np.square(D1))
    denominator += np.sum(np.square(D2)) #|A| + |B|
    return numerator / denominator

In [130]:
dice_coeff('Doc3', 'Doc4')

0.46153846153846156

Question 6. Compute the Jaccard co-efficient between docs 4 and docs 5.

In [131]:
np.array(bool_vect['Doc4'])

array([1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0])

In [132]:
np.array(bool_vect['Doc5'])

array([0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1])

In [133]:
def jaccardcoeff(doc1, doc2):
    D1 = np.array(bool_vect[doc1])
    D2 = np.array(bool_vect[doc2])
    numerator = np.sum(np.multiply(D1, D2))  # |A intersection B|
    addn = np.sum(np.square(D1))
    addn += np.sum(np.square(D2)) #|A U B|
    denominator = addn - numerator
    return numerator / denominator

In [134]:
jaccardcoeff('Doc4', 'Doc5')

0.09090909090909091