In [1]:
import nltk
import os
import pandas as pd
import numpy as np
import re
import math as m
from collections import Counter
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop_list = stopwords.words('english')

In [2]:
import math

# Declaring variables for file path
in_path = 'corpus'
out_path = 'prep_corpus'

# Declaring variables for query files
query = 'queries.txt'
preproc_query = 'preprocessed_queries.txt'

In [3]:
if not os.path.isdir(out_path):
    os.mkdir(out_path)
filenames = os.listdir(in_path)
st = PorterStemmer()
shortword = re.compile(r'\W*\b\w{1,2}\b')

In [4]:
print(filenames)

['doc332', 'doc130', 'doc12', 'doc320', 'doc466', 'doc71', 'doc410', 'doc295', 'doc663', 'doc515', 'doc465', 'doc114', 'doc566', 'doc593', 'doc55', 'doc481', 'doc30', 'doc602', 'doc407', 'doc581', 'doc471', 'doc380', 'doc303', 'doc246', 'doc451', 'doc330', 'doc608', 'doc499', 'doc267', 'doc490', 'doc479', 'doc366', 'doc288', 'doc449', 'doc284', 'doc507', 'doc210', 'doc39', 'doc48', 'doc394', 'doc343', 'doc543', 'doc397', 'doc589', 'doc604', 'doc647', 'doc596', 'doc337', 'doc232', 'doc77', 'doc512', 'doc448', 'doc273', 'doc41', 'doc289', 'doc115', 'doc669', 'doc275', 'doc575', 'doc61', 'doc248', 'doc560', 'doc541', 'doc639', 'doc473', 'doc383', 'doc227', 'doc306', 'doc276', 'doc684', 'doc419', 'doc222', 'doc600', 'doc431', 'doc131', 'doc474', 'doc280', 'doc338', 'doc440', 'doc92', 'doc461', 'doc80', 'doc270', 'doc673', 'doc324', 'doc510', 'doc229', 'doc656', 'doc535', 'doc151', 'doc650', 'doc611', 'doc364', 'doc659', 'doc203', 'doc329', 'doc233', 'doc118', 'doc331', 'doc316', 'doc661', 

In [5]:
print("Total Number of Documents :{}".format(len(filenames)))

Total Number of Documents :690


In [6]:
def tokenize(data):
    lines = data.lower()
    lines = re.sub('[^A-Za-z]+', ' ', lines)
    tokens = lines.split()
    clean_tokens = [word for word in tokens if word not in stop_list]
    stem_tokens = [st.stem(word) for word in clean_tokens]
    clean_stem_tokens = [word for word in stem_tokens if word not in stop_list]
    clean_stem_tokens = ' '.join(map(str,  clean_stem_tokens))
    clean_stem_tokens = shortword.sub('', clean_stem_tokens)
    return clean_stem_tokens

def extractTokens(beautSoup, tag):
    textData = beautSoup.findAll(tag)
    textData = ''.join(map(str, textData))
    textData = textData.replace(tag, '')
    textData = tokenize(textData)
    return textData

In [7]:
for fname in filenames:
    infilepath = in_path + '/' + fname
    outfilepath = out_path + '/' + fname
    with open(infilepath) as infile:
        with open(outfilepath, 'w') as outfile:
            fileData = infile.read()
            soup = BeautifulSoup(fileData)
            title = extractTokens(soup, 'title')
            text = extractTokens(soup, 'text')
            outfile.write(title)
            outfile.write(" ")
            outfile.write(text)
        outfile.close()
    infile.close()

In [8]:
all_docs = []

for fname in filenames:
    outfilepath = out_path + '/' + fname
    with open(outfilepath) as file:
        fileData = file.read()
        all_docs.append(fileData)

In [9]:
no_of_docs=len(all_docs)

for i in range(no_of_docs):
    tokens = all_docs[i].split()
    print(tokens)
    

['similitud', 'hyperson', 'real', 'flow', 'slender', 'bodi', 'blunt', 'nose', 'basi', 'hyperson', 'small', 'perturb', 'theori', 'law', 'similitud', 'hyperson', 'inviscid', 'flow', 'field', 'thin', 'slender', 'bodi', 'examin', 'restrict', 'ideal', 'gase', 'constant', 'specif', 'heat', 'bodi', 'point', 'nose', 'remov', 'steadi', 'plane', 'axisymmetr', 'flow', 'consid', 'inspect', 'govern', 'system', 'equat', 'show', 'similitud', 'law', 'exist', 'flow', 'field', 'local', 'thermal', 'equilibrium', 'free', 'stream', 'atmospher', 'flow', 'ideal', 'constant', 'specif', 'heat', 'requir', 'free', 'stream', 'atmospher', 'composit', 'pressur', 'densiti', 'replac', 'requir', 'ratio', 'specif', 'heat', 'flow', 'blunt', 'wedg', 'cone', 'special', 'law', 'similitud', 'obtain', 'applic', 'similar', 'rule', 'examin', 'case', 'hyperson', 'flow', 'ideal', 'flat', 'plate', 'blunt', 'lead', 'edg', 'case', 'equilibrium', 'air', 'flow', 'wedg', 'possibl', 'simul', 'nonequilibrium', 'flow', 'slender', 'thin',

In [10]:
frequency_map = {}
for i in range(no_of_docs):
    tokens = all_docs[i].split()
    for w in tokens:
        try:
            frequency_map[w].add(i)
        except:
            frequency_map[w] = {i}
for i in frequency_map:
    frequency_map[i] = len(frequency_map[i])


In [11]:
print(frequency_map)

{'similitud': 8, 'hyperson': 106, 'real': 16, 'flow': 430, 'slender': 60, 'bodi': 167, 'blunt': 77, 'nose': 59, 'basi': 36, 'small': 117, 'perturb': 15, 'theori': 224, 'law': 32, 'inviscid': 54, 'field': 107, 'thin': 41, 'examin': 30, 'restrict': 25, 'ideal': 28, 'gase': 24, 'constant': 102, 'specif': 36, 'heat': 184, 'point': 130, 'remov': 4, 'steadi': 48, 'plane': 54, 'axisymmetr': 24, 'consid': 124, 'inspect': 1, 'govern': 27, 'system': 42, 'equat': 200, 'show': 105, 'exist': 61, 'local': 68, 'thermal': 51, 'equilibrium': 36, 'free': 124, 'stream': 130, 'atmospher': 39, 'requir': 74, 'composit': 13, 'pressur': 273, 'densiti': 54, 'replac': 20, 'ratio': 141, 'wedg': 24, 'cone': 52, 'special': 39, 'obtain': 224, 'applic': 117, 'similar': 99, 'rule': 7, 'case': 182, 'flat': 101, 'plate': 121, 'lead': 97, 'edg': 100, 'air': 105, 'possibl': 78, 'simul': 14, 'nonequilibrium': 11, 'also': 153, 'behaviour': 8, 'non': 28, 'linear': 67, 'mani': 19, 'phenomena': 21, 'occur': 55, 'world': 1, 'a

In [14]:
frequency_map

{'similitud': 8,
 'hyperson': 106,
 'real': 16,
 'flow': 430,
 'slender': 60,
 'bodi': 167,
 'blunt': 77,
 'nose': 59,
 'basi': 36,
 'small': 117,
 'perturb': 15,
 'theori': 224,
 'law': 32,
 'inviscid': 54,
 'field': 107,
 'thin': 41,
 'examin': 30,
 'restrict': 25,
 'ideal': 28,
 'gase': 24,
 'constant': 102,
 'specif': 36,
 'heat': 184,
 'point': 130,
 'remov': 4,
 'steadi': 48,
 'plane': 54,
 'axisymmetr': 24,
 'consid': 124,
 'inspect': 1,
 'govern': 27,
 'system': 42,
 'equat': 200,
 'show': 105,
 'exist': 61,
 'local': 68,
 'thermal': 51,
 'equilibrium': 36,
 'free': 124,
 'stream': 130,
 'atmospher': 39,
 'requir': 74,
 'composit': 13,
 'pressur': 273,
 'densiti': 54,
 'replac': 20,
 'ratio': 141,
 'wedg': 24,
 'cone': 52,
 'special': 39,
 'obtain': 224,
 'applic': 117,
 'similar': 99,
 'rule': 7,
 'case': 182,
 'flat': 101,
 'plate': 121,
 'lead': 97,
 'edg': 100,
 'air': 105,
 'possibl': 78,
 'simul': 14,
 'nonequilibrium': 11,
 'also': 153,
 'behaviour': 8,
 'non': 28,
 'lin

In [15]:
N=690
term_matrix = {}
# print(N)
for key in sorted(frequency_map):
    x=(N - frequency_map[key] + 0.5)/(frequency_map[key] + 0.5)
    term_matrix[key] = math.log(x, 2) 
    
term_matrix

{'abbrevi': 8.844444240792825,
 'abil': 8.105384749247602,
 'abl': 7.253190908704789,
 'ablat': 7.253190908704789,
 'abovement': 8.844444240792825,
 'abrupt': 8.105384749247602,
 'abruptli': 8.105384749247602,
 'absenc': 8.105384749247602,
 'absent': 8.844444240792825,
 'absolut': 6.961581237141928,
 'absorb': 7.253190908704789,
 'absorpt': 7.617860981241781,
 'abstract': 8.105384749247602,
 'academ': 8.844444240792825,
 'acceler': 5.353280513589814,
 'accept': 6.961581237141928,
 'accommod': 8.105384749247602,
 'accompani': 6.164642333361623,
 'accomplish': 6.509906931997541,
 'accord': 5.266251118517337,
 'accordingli': 6.961581237141928,
 'account': 3.664843277589852,
 'accru': 8.844444240792825,
 'accumul': 8.844444240792825,
 'accur': 3.8957928867251885,
 'accuraci': 4.164424823065223,
 'achiev': 5.266251118517337,
 'ackeret': 7.617860981241781,
 'acoust': 6.718467013116808,
 'acquir': 8.844444240792825,
 'acr': 8.844444240792825,
 'across': 4.648204934712976,
 'acrothermoelast': 

Creating TF-IDF table

In [16]:
def TF(wordDict):
    
#     max_fi = 0
#     for key, val in wordDict.items():
#         if val>max_fi:
#             max_fi = val
    
    newDict = {}
    for key, val in wordDict.items():
        if val>0:
            
            #log normalization
            newDict[key] = 1+ math.log(val,2)
            
            
            #double normalization K
            #newDict[key] = 0.5 * ((0.5*val)/ max_fi)
    
    return newDict

In [17]:
dictList = [dict() for x in range(N)]
for i in range(N):
    dictList[i] = dict.fromkeys(wordSet, 0) 
    
    for word in BOW('prep_corpus/doc'+str(i+1)):
        dictList[i][word]+=1

NameError: name 'wordSet' is not defined

In [None]:
def computeIDF(dictList):
    idfDict = {}
    N = len(dictList)
    print(N)
    
    idfDict = dict.fromkeys(dictList[0].keys(), 0)
    
    for doc in dictList:
        for word, val in doc.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        #inverse frequency
        idfDict[word] = math.log((N / float(val)), 2)
        
        #inverse frequency smooth
        #idfDict[word] = math.log((1+(N/float(val))), 2)
        
        #probabilistic inverse frequency
        #idfDict[word] = math.log(((N-val)/val), 2)
        
    return idfDict

In [None]:
idfs = computeIDF(dictList)
idfs

In [None]:
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return tfidf

In [None]:
finalTF = [dict() for x in range(N)]
finalTFIDF = [dict() for x in range(N)]
for i in range(N):
    finalTF[i] = TF(dictList[i])
    
finalTF

In [None]:
for i in range(N):
    finalTFIDF[i] = computeTFIDF(finalTF[i], idfs)

In [None]:
import pandas as pd
df = pd.DataFrame(finalTFIDF)
df

In [None]:
import numpy as np
  

In [None]:
df = df.replace(np.nan, 0)
df

In [None]:
query = "flow past"
queryTerms = query.split(' ')
for i in range(len(queryTerms)):
    queryTerms[i] = queryTerms[i].lower()
    
queryTerms  

In [None]:
queryCount = dict.fromkeys(queryTerms, 0) 
for term in queryTerms:
    queryCount[term]+=1
    
queryCount

BM1

In [None]:

result = []
for i in range(N):
    relval = 0
    for word in queryTerms:
        table_val = df.at[i, word]
        if table_val>0:
            relval=relval+int(term_matrix[word])
    
    result.append(relval)

In [None]:
result

In [None]:
ranking_dict = {}

for i in range(N):
    ranking_dict[i] = result[i]

ranking_dict 

In [None]:
from heapq import nlargest
final_result = nlargest(3, ranking_dict, key = ranking_dict.get)
final_result

In [None]:
print("Top 3 documents: ")
for doc in final_result:
    print("Document", doc, ": ", result[doc])

In [None]:
for doc in final_result:
    loc = 'prep_corpus/doc'+str(doc+1)
    file = open(loc, 'r')
    for line in file: 
        print(line)

In [None]:
dictList[0]

In [None]:
def term_freq_factor_Fij(term, document):
    K1 = 5
    S1 = K1+1
    fij = dictList[document][term]    
    Fij = (S1 * fij)/(K1 + fij)
    
    return Fij

In [None]:
dictList[0]

In [None]:
term_freq_factor_Fij('face', 0)

In [None]:
all_doc_len = []
for i in range(N):
    all_doc_len.append(len(BOW('prep_corpus/doc'+str(i+1))))
    
all_doc_len 

In [None]:
from statistics import mean
avg_doc_len = mean(all_doc_len)
avg_doc_len

In [None]:
def doc_len_normal(term, document):
    
    K1 = 5
    S1 = K1+1
    fij = dictList[document][term]
    doc_length = all_doc_len[document]
    
    F_dash = (S1 * fij)/(((K1 * doc_length)/avg_doc_len) + fij)
    
    return F_dash

In [None]:
doc_len_normal('flow', 3)

In [None]:
def correction_factor(document, query_length):
    K2 = 0
    doc_length = all_doc_len[document]
    G = K2 * query_length * ((avg_doc_len - doc_length)/(avg_doc_len + doc_length))
    
    return G
    

In [None]:
correction_factor(686, 2)

In [None]:
def term_freq_within_query(term, queryCount):
    K3 = 10
    S3 = K3 + 1
    
    Fiq = (S3 * queryCount[term])/(K3 + queryCount[term])
    
    return Fiq
    

In [None]:
term_freq_within_query('flow', queryCount)

In [None]:
def Bij(term, document):
    K1 = 1
    b = 0.99
    num = (K1 + 1)* dictList[document][term]
    
    den = (K1 * ((1 - b) + (b*all_doc_len[document])/avg_doc_len)) + dictList[document][term]
    
    return (num/den)

In [None]:
querylength = len(queryTerms)
querylength

In [None]:
queryTerms

In [None]:
BM = []
for i in range(N):
    G = correction_factor(i, querylength)
    relval = 0
    for word in queryTerms:
        table_val = df.at[i, word]
        if table_val > 0:
            #BM11
            relval = relval + doc_len_normal(word, i) * term_freq_within_query(word, queryCount) * int(term_matrix[word])
            
            #BM15
            #relval = relval + term_freq_factor_Fij(word, i) * term_freq_within_query(word, queryCount) * term_matrix[word]
    relval = G + relval
    BM.append(relval)

In [None]:
BM

In [None]:
BM_ranking = {}

for i in range(N):
    BM_ranking[i] = BM[i]

BM_ranking 

In [None]:

final_result = nlargest(3, BM_ranking, key = BM_ranking.get)
final_result

In [None]:
print("Top 3 documents: ")
for doc in final_result:
    print("Document", doc, ": ", BM_ranking[doc])

In [None]:
for doc in final_result:
    loc = 'prep_corpus/doc'+str(doc+1)
    file = open(loc, 'r')
    for line in file: 
        print(line)

In [None]:
K1 = 5

In [None]:
BM = []
for i in range(N):
    relval = 0
    for word in queryTerms:
        table_val = df.at[i, word]
        if table_val > 0:
            #Simpler BM15
            #relval = relval + (((K1+1)* dictList[i][word])/K1 + dictList[i][word]) * term_matrix[word]
            
            #Simpler BM11
            relval = relval + (((K1+1)* dictList[i][word])/((K1*all_doc_len[i])/avg_doc_len) + dictList[i][word])* int(term_matrix[word])
            
            #BM25
            #relval += Bij(term, i) * term_matrix[word]
    relval = G + relval
    BM.append(relval)

In [None]:
result = sorted(BM_ranking.items(), key = lambda kv:(kv[1], kv[0]))

In [None]:
result[-3:]