# Imports

In [1]:
!pip install nltk



In [2]:
import nltk
import os
import pandas as pd
import numpy as np
import re
import math as m
from collections import Counter
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop_list = stopwords.words('english')


 # Declare directories

In [3]:
# Declaring variables for file path
in_path = 'corpus'
out_path = 'prep_corpus'

# Declaring variables for query files
query = 'queries.txt'
preproc_query = 'preprocessed_queries.txt'



In [4]:
if not os.path.isdir(out_path):
    os.mkdir(out_path)
filenames = os.listdir(in_path)
st = PorterStemmer()
shortword = re.compile(r'\W*\b\w{1,2}\b')

In [5]:
print("Total Number of Documents :{}".format(len(filenames)))

Total Number of Documents :690


In [6]:
print(filenames)

['doc332', 'doc130', 'doc12', 'doc320', 'doc466', 'doc71', 'doc410', 'doc295', 'doc663', 'doc515', 'doc465', 'doc114', 'doc566', 'doc593', 'doc55', 'doc481', 'doc30', 'doc602', 'doc407', 'doc581', 'doc471', 'doc380', 'doc303', 'doc246', 'doc451', 'doc330', 'doc608', 'doc499', 'doc267', 'doc490', 'doc479', 'doc366', 'doc288', 'doc449', 'doc284', 'doc507', 'doc210', 'doc39', 'doc48', 'doc394', 'doc343', 'doc543', 'doc397', 'doc589', 'doc604', 'doc647', 'doc596', 'doc337', 'doc232', 'doc77', 'doc512', 'doc448', 'doc273', 'doc41', 'doc289', 'doc115', 'doc669', 'doc275', 'doc575', 'doc61', 'doc248', 'doc560', 'doc541', 'doc639', 'doc473', 'doc383', 'doc227', 'doc306', 'doc276', 'doc684', 'doc419', 'doc222', 'doc600', 'doc431', 'doc131', 'doc474', 'doc280', 'doc338', 'doc440', 'doc92', 'doc461', 'doc80', 'doc270', 'doc673', 'doc324', 'doc510', 'doc229', 'doc656', 'doc535', 'doc151', 'doc650', 'doc611', 'doc364', 'doc659', 'doc203', 'doc329', 'doc233', 'doc118', 'doc331', 'doc316', 'doc661', 

# Preprocessing

In [7]:
def tokenize(data):
    lines = data.lower()
    lines = re.sub('[^A-Za-z]+', ' ', lines)
    tokens = lines.split()
    clean_tokens = [word for word in tokens if word not in stop_list]
    stem_tokens = [st.stem(word) for word in clean_tokens]
    clean_stem_tokens = [word for word in stem_tokens if word not in stop_list]
    clean_stem_tokens = ' '.join(map(str,  clean_stem_tokens))
    clean_stem_tokens = shortword.sub('', clean_stem_tokens)
    return clean_stem_tokens

def extractTokens(beautSoup, tag):
    textData = beautSoup.findAll(tag)
    textData = ''.join(map(str, textData))
    textData = textData.replace(tag, '')
    textData = tokenize(textData)
    return textData

In [8]:
for fname in filenames:
    infilepath = in_path + '/' + fname
    outfilepath = out_path + '/' + fname
    with open(infilepath) as infile:
        with open(outfilepath, 'w') as outfile:
            fileData = infile.read()
            soup = BeautifulSoup(fileData)
            title = extractTokens(soup, 'title')
            text = extractTokens(soup, 'text')
            outfile.write(title)
            outfile.write(" ")
            outfile.write(text)
        outfile.close()
    infile.close()

In [9]:
all_docs = []

for fname in filenames:
    outfilepath = out_path + '/' + fname
    with open(outfilepath) as file:
        fileData = file.read()
        all_docs.append(fileData)

In [10]:
all_docs

['similitud hyperson real flow slender bodi blunt nose basi hyperson small perturb theori law similitud hyperson inviscid flow field thin slender bodi examin restrict ideal gase constant specif heat bodi point nose remov steadi plane axisymmetr flow consid inspect govern system equat show similitud law exist flow field local thermal equilibrium free stream atmospher flow ideal constant specif heat requir free stream atmospher composit pressur densiti replac requir ratio specif heat flow blunt wedg cone special law similitud obtain applic similar rule examin case hyperson flow ideal flat plate blunt lead edg case equilibrium air flow wedg possibl simul nonequilibrium flow slender thin bodi also point',
 'behaviour non linear system mani phenomena occur world around govern nonlinear relationship develop mathemat scienc difficulti nonlinear analysi hinder formul nonlinear concept would permit understand phenomena present articl progress understand behavior nonlinear system review attempt 

# convert to tokens

In [11]:
no_of_docs=len(all_docs)

for i in range(no_of_docs):
    tokens = all_docs[i].split()
    print(tokens)

['similitud', 'hyperson', 'real', 'flow', 'slender', 'bodi', 'blunt', 'nose', 'basi', 'hyperson', 'small', 'perturb', 'theori', 'law', 'similitud', 'hyperson', 'inviscid', 'flow', 'field', 'thin', 'slender', 'bodi', 'examin', 'restrict', 'ideal', 'gase', 'constant', 'specif', 'heat', 'bodi', 'point', 'nose', 'remov', 'steadi', 'plane', 'axisymmetr', 'flow', 'consid', 'inspect', 'govern', 'system', 'equat', 'show', 'similitud', 'law', 'exist', 'flow', 'field', 'local', 'thermal', 'equilibrium', 'free', 'stream', 'atmospher', 'flow', 'ideal', 'constant', 'specif', 'heat', 'requir', 'free', 'stream', 'atmospher', 'composit', 'pressur', 'densiti', 'replac', 'requir', 'ratio', 'specif', 'heat', 'flow', 'blunt', 'wedg', 'cone', 'special', 'law', 'similitud', 'obtain', 'applic', 'similar', 'rule', 'examin', 'case', 'hyperson', 'flow', 'ideal', 'flat', 'plate', 'blunt', 'lead', 'edg', 'case', 'equilibrium', 'air', 'flow', 'wedg', 'possibl', 'simul', 'nonequilibrium', 'flow', 'slender', 'thin',

# Calculating Df values for each term in vocabulary

In [12]:
DF = {}
for i in range(no_of_docs):
    tokens = all_docs[i].split()
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}
for i in DF:
    DF[i] = len(DF[i])

In [13]:
print(DF)

{'similitud': 8, 'hyperson': 106, 'real': 16, 'flow': 430, 'slender': 60, 'bodi': 167, 'blunt': 77, 'nose': 59, 'basi': 36, 'small': 117, 'perturb': 15, 'theori': 224, 'law': 32, 'inviscid': 54, 'field': 107, 'thin': 41, 'examin': 30, 'restrict': 25, 'ideal': 28, 'gase': 24, 'constant': 102, 'specif': 36, 'heat': 184, 'point': 130, 'remov': 4, 'steadi': 48, 'plane': 54, 'axisymmetr': 24, 'consid': 124, 'inspect': 1, 'govern': 27, 'system': 42, 'equat': 200, 'show': 105, 'exist': 61, 'local': 68, 'thermal': 51, 'equilibrium': 36, 'free': 124, 'stream': 130, 'atmospher': 39, 'requir': 74, 'composit': 13, 'pressur': 273, 'densiti': 54, 'replac': 20, 'ratio': 141, 'wedg': 24, 'cone': 52, 'special': 39, 'obtain': 224, 'applic': 117, 'similar': 99, 'rule': 7, 'case': 182, 'flat': 101, 'plate': 121, 'lead': 97, 'edg': 100, 'air': 105, 'possibl': 78, 'simul': 14, 'nonequilibrium': 11, 'also': 153, 'behaviour': 8, 'non': 28, 'linear': 67, 'mani': 19, 'phenomena': 21, 'occur': 55, 'world': 1, 'a

In [14]:
vocab_size = len(DF)
print(vocab_size)

3278


In [15]:
vocab = [term for term in DF]
print(vocab)

['similitud', 'hyperson', 'real', 'flow', 'slender', 'bodi', 'blunt', 'nose', 'basi', 'small', 'perturb', 'theori', 'law', 'inviscid', 'field', 'thin', 'examin', 'restrict', 'ideal', 'gase', 'constant', 'specif', 'heat', 'point', 'remov', 'steadi', 'plane', 'axisymmetr', 'consid', 'inspect', 'govern', 'system', 'equat', 'show', 'exist', 'local', 'thermal', 'equilibrium', 'free', 'stream', 'atmospher', 'requir', 'composit', 'pressur', 'densiti', 'replac', 'ratio', 'wedg', 'cone', 'special', 'obtain', 'applic', 'similar', 'rule', 'case', 'flat', 'plate', 'lead', 'edg', 'air', 'possibl', 'simul', 'nonequilibrium', 'also', 'behaviour', 'non', 'linear', 'mani', 'phenomena', 'occur', 'world', 'around', 'nonlinear', 'relationship', 'develop', 'mathemat', 'scienc', 'difficulti', 'analysi', 'hinder', 'formul', 'concept', 'would', 'permit', 'understand', 'present', 'articl', 'progress', 'behavior', 'review', 'attempt', 'made', 'result', 'way', 'may', 'appli', 'gener', 'problem', 'structur', 'aer

# Calculating tf-idf values for each term in the vocabulary

In [26]:

tf_idf = {}

def log_inverse_tfIdf():
    doc = 0
    for i in range(no_of_docs):
        tokens = all_docs[i].split()
        counter = Counter(tokens)
        words_count = len(tokens)

        for token in np.unique(tokens):
            tf = counter[token]/words_count
            df = DF[token] if token in vocab else 0
            idf = np.log((no_of_docs)/(df))
            tf_idf[doc, token] = tf*idf

        doc += 1

def log_inverse_smooth_tfIdf():
    doc = 0
    for i in range(no_of_docs):
        tokens = all_docs[i].split()
        counter = Counter(tokens)
        words_count = len(tokens)

        for token in np.unique(tokens):
            tf = counter[token]/words_count
            df = DF[token] if token in vocab else 0
            idf = np.log(1+(no_of_docs)/(df))
            tf_idf[doc, token] = tf*idf

        doc += 1
        
def log_prob_tfIdf():
    doc = 0
    for i in range(no_of_docs):
        tokens = all_docs[i].split()
        counter = Counter(tokens)
        words_count = len(tokens)

        for token in np.unique(tokens):
            tf = counter[token]/words_count
            df = DF[token] if token in vocab else 0
            idf = np.log((no_of_docs-df)/(df))
            tf_idf[doc, token] = tf*idf

        doc += 1
        
log_inverse_smooth_tfIdf()
        
# new_tf_idf={}
# def tf_idf_log_inverse():

#     for i in range(no_of_docs):
#         token=docs_data[i]
#         words=len(token)
#         count=Counter(token)

#         for x in np.unique(token):
#             tf=np.log(1+count[x])
#             df=DF[x] if x in vocabulary else 0
#             idf=np.log(1+(len(vocabulary)/(df)))
#             new_tf_idf[cnt,x]=tf*idf

In [28]:
tf_idf

{(0, 'air'): 0.019097941174498194,
 (0, 'also'): 0.01609933053405113,
 (0, 'applic'): 0.018218393712001818,
 (0, 'atmospher'): 0.05524739784677381,
 (0, 'axisymmetr'): 0.03200782199992112,
 (0, 'basi'): 0.028339915814798923,
 (0, 'blunt'): 0.06505702017491242,
 (0, 'bodi'): 0.061714871931359425,
 (0, 'case'): 0.02956193843079593,
 (0, 'composit'): 0.03764535409763328,
 (0, 'cone'): 0.02507646721307503,
 (0, 'consid'): 0.01775168679620286,
 (0, 'constant'): 0.0386714826137765,
 (0, 'densiti'): 0.024745820644045447,
 (0, 'edg'): 0.019498705278046945,
 (0, 'equat'): 0.014084000907341026,
 (0, 'equilibrium'): 0.056679831629597846,
 (0, 'examin'): 0.05996327981788577,
 (0, 'exist'): 0.02368426214708324,
 (0, 'field'): 0.037887280081666204,
 (0, 'flat'): 0.019416768404965985,
 (0, 'flow'): 0.0812800830227716,
 (0, 'free'): 0.03550337359240572,
 (0, 'gase'): 0.03200782199992112,
 (0, 'govern'): 0.030936216741465013,
 (0, 'heat'): 0.04409843258622311,
 (0, 'hyperson'): 0.07608151289555908,
 (0

# Forming document vectors using the tf-idf values

In [29]:
D = np.zeros((no_of_docs, vocab_size))
for i in tf_idf:
    ind = vocab.index(i[1])
    D[i[0]][ind] = tf_idf[i]

In [30]:
import numpy as np

print(np.matrix(D))

[[0.16863312 0.07608151 0.03572667 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.15204976 0.         0.        ]
 [0.         0.         0.         ... 0.         0.10378    0.10378   ]]


In [31]:
def gen_vector(tokens):
    Q = np.zeros((len(vocab)))
    
    counter = Counter(tokens)
    words_count = len(tokens)

    query_weights = {}
    
    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = DF[token] if token in vocab else 0
        idf = m.log((no_of_docs+1)/(df+1))

        try:
            ind = vocab.index(token)
            Q[ind] = tf*idf
        except:
            pass
    return Q

# Generating Ranked List using cosine similarity as closeness measure

In [32]:
def cosine_sim(x, y):
    cos_sim = np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))
    
    return cos_sim

In [33]:
def cosine_similarity(k, query):
    tokens = query.split()
    d_cosines = []
    query_vector = gen_vector(tokens)
    
    for d in D:
        d_cosines.append(cosine_sim(query_vector, d))
        
    if k == 0:
        out = np.array(d_cosines).argsort()[::-1]
    else:
        out = np.array(d_cosines).argsort()[-k:][::-1]
    return out

In [34]:
query_file = open(preproc_query, 'r')
queries = query_file.readlines()
queries

['investig made wave system creat static pressur \n',
 'vortic heat transfer\n',
 'absenc vortic\n',
 'gener effect flow field\n',
 '\n']

In [35]:
def list_of_docs(k):
    cos_sims = []
    for i in range(len(queries)):
        cs = [i, cosine_similarity(k, queries[i])]
        cos_sims.append(cs)    
    return cos_sims

# Rank list

In [36]:
nos=0
list_of_docs(nos)

  cos_sim = np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))


[[0,
  array([468, 461, 164, 307, 372, 248,  37, 341, 237, 446, 176, 137, 194,
         331,   1,  90, 285, 638, 236, 147, 549, 643, 181, 348, 502, 379,
         122, 553, 208, 519, 421, 352, 598, 197, 337, 126, 242, 543, 264,
         470, 617, 548, 193, 310, 531, 357, 625, 494, 241,   9, 589, 476,
         162,  34, 469, 185, 389,   5, 659, 426, 283, 608, 597, 411, 277,
         537,  11, 261,   8, 550, 201, 355, 490,  79, 439, 161, 165, 251,
         128, 576, 329, 282, 204, 220, 425, 190, 347, 414, 507, 117, 247,
         325, 124, 376,  96, 308, 150,  98, 163, 395,  71, 272, 437, 654,
         525, 547, 246, 641, 660, 672,  25, 330, 110,  24, 430, 647,  87,
         104, 344,  76, 114, 301,   4,  48,  86, 378, 271, 323, 166, 610,
         358, 173, 653, 311, 685, 223, 514,  60,   0, 157, 399, 491, 590,
         322, 569, 221, 459,  95, 119, 130,  14, 428, 484, 418, 263, 256,
         375, 186, 377, 493, 281, 234, 343, 218, 140,  26, 268,  58, 229,
         184, 105, 415,  51, 232,