In [44]:
import os
import re
import nltk
import pandas as pd
from tabulate import tabulate
import string
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from numpy.linalg import norm

from collections import Counter
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /Users/harsh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/harsh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/harsh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Preprocessing

In [45]:
def extract_data(folder, new_folder):
    files = os.listdir(folder)
    count = 0
    for file in files:
        path = os.path.join(os.getcwd(), folder, file)
        with open(path) as fp:
            
            soup = BeautifulSoup(fp, 'html.parser')
            if count < 5:
                print("\033[1m" + "Before Extraction : " + "\033[0m", soup)
            text = soup.findAll("text")[0].text
            title = soup.findAll("title")[0].text
            final_text = title + " " + text
            if count < 5:
                print("\033[1m" + "After Extraction : " + "\033[0m" , final_text)
                count += 1
            
        new_file_path = os.path.join(os.getcwd(), new_folder, file)
        with open(new_file_path, "w") as fw:
            fw.write(final_text)
            fw.close()

In [46]:
new_folder = 'Dataset'
try:
    os.mkdir(new_folder)
except:
    print(new_folder, " already exists.")
file_map = extract_data('CSE508_Winter2023_Dataset', new_folder)

Dataset  already exists.
[1mBefore Extraction : [0m <doc>
<docno>
1223
</docno>
<title>
inviscid-incompressible-flow theory of static two-dimensional
solid jets, in proximity to the ground .
</title>
<author>
strand,t.
</author>
<biblio>
j. ae. scs. 1962, 170.
</biblio>
<text>
  the inviscid-incompressible-flow theory of static two-dimensional
solid jets impinging orthogonally on the ground is presented
using conformal mapping methods .
  it is shown that the thrust of a solid jet at constant power
initially decreases as the ground is approached .  the magnitude
of the thrust out of ground effect is regained only at a very
low height-to-jet width ratio (approximately 0.55) .  the maximuin
decrease is about 6 percent .  the ground effect on solid
jets is thus largely unfavorable .
</text>
</doc>

[1mAfter Extraction : [0m 
inviscid-incompressible-flow theory of static two-dimensional
solid jets, in proximity to the ground .
 
  the inviscid-incompressible-flow theory of static two-d

In [47]:
def preprocess(text, flag):
    if flag:
        print("\033[1m" + "Before lower case text : " + "\033[0m" , text)
        
    text = text.lower()
    if flag:
        print("\033[1m" + "After lower case text and before tokenization : "+ "\033[0m", text)

    tokens = word_tokenize(text)
    if flag:
        print("\033[1m" + "After tokenization and before stopwords removal : "+ "\033[0m", tokens)
    
    final = [word for word in tokens if word not in stop_words]
    
    if flag:
        print("\033[1m" + "After stopwords removal and before punctuations removal : "+ "\033[0m", final)

    tokens = [word for word in tokens if word not in string.punctuation]
    
    if flag:
        print("\033[1m" + "After punctuations and before blank space token removal : "+ "\033[0m", tokens)
     
    final = [word for word in tokens if len(re.findall(r'\s+', word)) == 0]
    
    if flag:
        print("\033[1m" + "After blank space token removal : "+ "\033[0m", final)
    
    return final

In [48]:
L = []
idtoName = {}
files = os.listdir("Dataset")
count = 0

for i, file in enumerate(files):
    idtoName[i] = file
    path = os.path.join(os.getcwd(), "Dataset", file)
    
    with open(path) as fp:
        text = fp.read()
        if count < 5:
            print("**********************************************************************************************")
            print("**********************************************************************************************")
            print(f"Printing Statement : {count+1}")
            L.append(preprocess(text, True))
            print("**********************************************************************************************")
            print("**********************************************************************************************")
            count += 1
        else:
            L.append(preprocess(text, False))

**********************************************************************************************
**********************************************************************************************
Printing Statement : 1
[1mBefore lower case text : [0m 
inviscid-incompressible-flow theory of static two-dimensional
solid jets, in proximity to the ground .
 
  the inviscid-incompressible-flow theory of static two-dimensional
solid jets impinging orthogonally on the ground is presented
using conformal mapping methods .
  it is shown that the thrust of a solid jet at constant power
initially decreases as the ground is approached .  the magnitude
of the thrust out of ground effect is regained only at a very
low height-to-jet width ratio (approximately 0.55) .  the maximuin
decrease is about 6 percent .  the ground effect on solid
jets is thus largely unfavorable .

[1mAfter lower case text and before tokenization : [0m 
inviscid-incompressible-flow theory of static two-dimensional
solid jets, i

## TF-IDF

In [49]:
class TfIDf:
    
    def __init__(self, tokens, tf_weighting = "log_norm"):
        
        self.tokens = tokens
        self.idf = {}
        self.tf_idf = None
        self.v = None
        self.n = len(tokens)
        self.vocab = None
        self.tf_weighting = tf_weighting
        self.calc_tf_idf()
        
    def normalize_tf(self, tf, doc):
        # tf = 0 will not occur. We are not computing for those terms.
        if self.tf_weighting == 'binary':
            return 1
        
        elif self.tf_weighting == 'raw_count':
            return tf
        
        elif self.tf_weighting == 'term_frequency':
            return tf / (len(doc))
        
        elif self.tf_weighting == 'log_norm':
            return np.log ( 1 + tf)
        
        else:
            return 0.5 + 0.5 * (tf / max(list(Counter(doc).values())))
            
        
    def calc_tf_idf(self):
        
        term_freq = {}
        # Go through each document. Create frequency dictionary for each doc
        # Update term frequency dictionary with raw count.
        for i in range(self.n):
            freq = Counter(self.tokens[i])
            for term in freq.elements():
                if term not in term_freq:
                    term_freq[term] = {i:freq[term]}
                else:
                    term_freq[term][i] = freq[term]
        
        self.v = len(term_freq)
        self.vocab = list(term_freq.keys())
        
        # Create idf
        for term in term_freq:
            self.idf[term] = np.log(self.n/len(term_freq[term]) + 1)
        
        # Initialize TF-IDF
        self.tf_idf = np.zeros((self.n, self.v))
        
        # Calculate TF-IDF for terms which are present in doc else tf-idf is always 0.
        for term in term_freq:
            for doc in term_freq[term]:
                tf =  self.normalize_tf(term_freq[term][doc], self.tokens[doc])
                self.tf_idf[doc][self.vocab.index(term)] = tf * self.idf[term]
    
    def query(self, query_doc, num_doc = 5):
        
        q_freq = Counter(query_doc)
        q_vec = np.zeros(self.v).reshape(-1,1)

        for term in q_freq:
            if term in self.vocab:
                tf = self.normalize_tf(q_freq[term], query_doc)
                q_vec[self.vocab.index(term)] = tf * self.idf[term]
                
        # Dot product of tf-idf and query vector
        # Sort it in descending order along with its document id
        # Result is (score, doc-id)
        if norm(q_vec) == 0 or norm(self.tf_idf) == 0:
            result = sorted(zip((self.tf_idf@q_vec).flatten(), np.arange(self.n)), reverse = True, key = lambda x: x[0])
        else:
            result = sorted(zip(((self.tf_idf@q_vec)/(norm(self.tf_idf)*norm(q_vec))).flatten(), np.arange(self.n)), reverse = True, key = lambda x: x[0])


        return result[0:num_doc]
                         

In [50]:
def queries_tf_idf_score(queries):
    binary_obj = TfIDf(L,"binary")
    raw_count_obj = TfIDf(L,"raw_count")
    term_frequency_obj = TfIDf(L,"term_frequency")
    log_norm_obj = TfIDf(L,"log_norm")
    double_norm_obj = TfIDf(L,"double_norm")
    df_list = []
    for query in queries:
        query_token = preprocess(query, False)
        ll = []
        for obj in [binary_obj, raw_count_obj, term_frequency_obj, log_norm_obj, double_norm_obj]:
            result = obj.query(query_token, 5)
            df = pd.DataFrame(result, columns=['Score', 'Document'])
            df['Document'] = df['Document'].apply(lambda x: idtoName[x])
            ll.append(df)
        df_list.append(ll)
    return df_list
        
        

In [51]:
N = int(input("Enter the number of queries. "))
count = 1
queries = []
operand = []
while count <= N:
    query = input("Query : ")
    queries.append(query)
    count +=1

df_list = queries_tf_idf_score(queries)
for idx in range(len(queries)):
    print(f"Query {idx + 1}: ", queries[idx])
    print(f"Documents retrieved for query {idx + 1} for binary: \n", 
          tabulate(df_list[idx][0], headers='keys', tablefmt='psql'))
    print(f"Documents retrieved for query {idx + 1} for raw count: \n", 
          tabulate(df_list[idx][1], headers='keys', tablefmt='psql'))
    print(f"Documents retrieved for query {idx + 1} for term frequency: \n", 
          tabulate(df_list[idx][2], headers='keys', tablefmt='psql'))
    print(f"Documents retrieved for query {idx + 1} for log normalization: \n", 
          tabulate(df_list[idx][3], headers='keys', tablefmt='psql'))
    print(f"Documents retrieved for query {idx + 1} for double normalization: \n", 
          tabulate(df_list[idx][4], headers='keys', tablefmt='psql'))

Enter the number of queries. 1
Query : Which document should this query belong to? Maybe our algorithm can help figure it out.
Query 1:  Which document should this query belong to? Maybe our algorithm can help figure it out.
Documents retrieved for query 1 for binary: 
 +----+------------+---------------+
|    |      Score | Document      |
|----+------------+---------------|
|  0 | 0.00353159 | cranfield0153 |
|  1 | 0.00319108 | cranfield0928 |
|  2 | 0.00277138 | cranfield0962 |
|  3 | 0.00250726 | cranfield0798 |
|  4 | 0.00250726 | cranfield0499 |
+----+------------+---------------+
Documents retrieved for query 1 for raw count: 
 +----+------------+---------------+
|    |      Score | Document      |
|----+------------+---------------|
|  0 | 0.00359466 | cranfield0962 |
|  1 | 0.00307852 | cranfield0928 |
|  2 | 0.00295812 | cranfield0914 |
|  3 | 0.00253147 | cranfield0798 |
|  4 | 0.00250967 | cranfield1147 |
+----+------------+---------------+
Documents retrieved for query 1 

## Jacard Coefficient

In [52]:
def jaccard_coefficient(queries):
    sets_doc ={}
    for i in range(len(L)):
        sets_doc[i] = set(L[i])
    df_list = []
    for query in queries:
        query_token = preprocess(query, False)
        query_token = set(query_token)
        jc = np.zeros(len(L))
        for doc in sets_doc:
            jc[doc] = len(sets_doc[doc] & query_token) / len(sets_doc[doc] | query_token)
        result = sorted(zip(jc, np.arange(len(L))), reverse = True, key = lambda x: x[0])
        df = pd.DataFrame(result[0:10], columns=['Score', 'Document'])
        df['Document'] = df['Document'].apply(lambda x: idtoName[x])
        df_list.append(df)
    return df_list

In [53]:
N = int(input("Enter the number of queries."))
count = 1
queries = []
operand = []
while count <= N:
    query = input("Query : ")
    queries.append(query)
    count +=1

df_list = jaccard_coefficient(queries)
for idx in range(len(queries)):
    print(f"Query {idx + 1}: ", queries[idx])
    print(f"Documents retrieved for query {idx + 1} using Jaccard Coefficient: \n", 
          tabulate(df_list[idx], headers='keys', tablefmt='psql'))
   

Enter the number of queries.1
Query : Which document should this query belong to? Maybe our algorithm can help figure it out.
Query 1:  Which document should this query belong to? Maybe our algorithm can help figure it out.
Documents retrieved for query 1 using Jaccard Coefficient: 
 +----+-----------+---------------+
|    |     Score | Document      |
|----+-----------+---------------|
|  0 | 0.1       | cranfield0281 |
|  1 | 0.0754717 | cranfield0835 |
|  2 | 0.0740741 | cranfield0153 |
|  3 | 0.0704225 | cranfield1275 |
|  4 | 0.0694444 | cranfield0637 |
|  5 | 0.0677966 | cranfield0855 |
|  6 | 0.0666667 | cranfield0909 |
|  7 | 0.0657895 | cranfield1388 |
|  8 | 0.0638298 | cranfield0506 |
|  9 | 0.0632911 | cranfield0469 |
+----+-----------+---------------+
