# **NOTE**: If the user want to run the program in the collab then it is better to download the abstract zip and upload it in the google drive in a folder called Data. Inside the Data create an Abstract folder. The Abstract folder should contain all the documents.

In [None]:
import multiprocessing
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse
import requests
from google.colab import drive
import re
import nltk
nltk.download("all")
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np

<hr>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## DOCUMENTING


### 1. Writing the file to local drive

In [None]:
drive.mount("/content/gdrive/", force_remount=True)

Mounted at /content/gdrive/


In [None]:
!pwd

/content


In [None]:
%cd gdrive/MyDrive
%cd Data/
# %cd /content/gdrive/MyDrive/Data

/content/gdrive/MyDrive
/content/gdrive/MyDrive/Data


In [None]:
def write_to_file(text):
  global count
  file = open(f"doc.txt","a")
  file.write(text)
  file.write("\n")
  file.close()

In [None]:
class MultiThreadedCrawler:
  def __init__(self, seed_url):
    self.seed_url = seed_url
    self.root_url = '{}://{}'.format(urlparse(self.seed_url).scheme, urlparse(self.seed_url).netloc)
    self.pool = ThreadPoolExecutor(max_workers=5)
    self.scraped_pages = set([])
    self.crawl_queue = Queue()
    self.crawl_queue.put(self.seed_url)
  
  def parse_links(self, html):
    soup = BeautifulSoup(html, 'html.parser')
    Anchor_Tags = soup.find_all('a', href=True)
    for link in Anchor_Tags:
      url = link['href']
      if url.startswith('/') or url.startswith(self.root_url):
        url = urljoin(self.root_url, url)
        if url not in self.scraped_pages:
          self.crawl_queue.put(url)

  def scrape_info(self, html):
    global count
    soup = BeautifulSoup(html, "html5lib")
    web_page_paragraph_contents = soup('p')
    text = ''
    for para in web_page_paragraph_contents:
      if not ('https:' in str(para.text)):
        text = text + str(para.text).strip()
    # print(f'\n <---Text Present in The WebPage is --->\n', text, '\n')

# ---------------------------------------Modify-----------------------------------
    write_to_file(text)
# ---------------------------------------Modify-----------------------------------

    return

  def post_scrape_callback(self, res):
    result = res.result()
    if result and result.status_code == 200:
      self.parse_links(result.text)
      self.scrape_info(result.text)

  def scrape_page(self, url):
    try:
      res = requests.get(url, timeout=(3, 30))
      return res
    except requests.RequestException:
      return

  def run_web_crawler(self):
    while True:
      try:
        # print("\n Name of the current executing process: ",
					# multiprocessing.current_process().name, '\n')
        target_url = self.crawl_queue.get(timeout=60)
        if target_url not in self.scraped_pages:
          # print("Scraping URL: {}".format(target_url))
          self.current_scraping_url = "{}".format(target_url)
          self.scraped_pages.add(target_url)
          job = self.pool.submit(self.scrape_page, target_url)
          job.add_done_callback(self.post_scrape_callback)

      except Empty:
        return
      except Exception as e:
        print(e)
        continue

  def info(self):
    print('\n Seed URL is: ', self.seed_url, '\n')
    print('Scraped pages are: ', self.scraped_pages, '\n')


if __name__ == '__main__':
  cc = MultiThreadedCrawler("https://arxiv.org/abs/2212.04486")
  cc.run_web_crawler()
  cc.info()


 Seed URL is:  https://arxiv.org/abs/2212.04486 

Scraped pages are:  {'https://arxiv.org/search/cs?searchtype=author&query=Parsa%2C+B', 'https://arxiv.org/pdf/2211.10904', 'https://arxiv.org/prevnext?id=2212.04486&function=prev&context=cs.AI', 'https://arxiv.org/format/2212.06642', 'https://arxiv.org/pdf/1708.05904', 'https://arxiv.org/pdf/2107.08345', 'https://arxiv.org/search/cs?searchtype=author&query=Lamont%2C+S', 'https://arxiv.org/format/2212.05412', 'https://arxiv.org/abs/2212.06795', 'https://arxiv.org/pdf/2212.06923', 'https://arxiv.org/search/cs?searchtype=author&query=Kanan%2C+C', 'https://arxiv.org/search/cs?searchtype=author&query=Aghasi%2C+H', 'https://arxiv.org/search/cs?searchtype=author&query=Fei-Fei%2C+L', 'https://arxiv.org/pdf/1904.09307', 'https://arxiv.org/abs/2206.07036', 'https://arxiv.org/help/config_browser#pdf', 'https://arxiv.org/pdf/2212.04832', 'https://arxiv.org/format/2212.05510', 'https://arxiv.org/search/quant-ph?searchtype=author&query=Caro%2C+M+C',

### 2. Breaking the file into individual documents

In [None]:
N = 0

In [None]:
def get_title(text):
    return text[12:-8]

In [None]:
with open("doc.txt",'r',encoding="utf8") as read_file:

    write_title = False
    write_content = False
    global N

    for line in read_file:
        if re.search("\[pdf, other\]",line):
            N += 1
            # Documents are stored inside Abstracts Folder
            write_file = open(f"Abstracts//Doc{N}.txt",'w',encoding="utf8")
            write_file.write(get_title(line.strip()))
            write_file.write("\n")
            write_title = True

        if write_title and re.search("▽ More",line):
            write_content = True
            continue

        if write_content and re.search("△ Less",line):
            write_content = False
            write_title = False
            write_file.close()
            if N == 2000:
              break

        if write_content:
            write_file.write(line.strip())

# CREATING THE INVERTED INDEX

In [None]:
# Stop-words list
STOPWORDS = stopwords.words("english")

In [None]:
# Check whether the word only contains alphabet
# Ex - Apple, app100, app100le is accepted, 100 is not accepted
def is_unnecessary(word):
    if word.isalnum():
        return False
    return True

In [None]:
# Remove punctuations from the word
# Ex - End. = End, sky-fire = skyfire
def remove_punctuations(word):
    while True:
        if re.search("\.",word):
            word = word.replace(".",'')
        elif re.search(",",word):
            word = word.replace(",",'')
        elif re.search("-",word):
            word = word.replace("-",'')
        elif re.search("\/",word):
            word = word.replace("/",'')
        elif re.search("=",word):
            word = word.replace("=",'')
        elif re.search("\(",word):
            word = word.replace("(",'')
        elif re.search("\)",word):
            word = word.replace(")",'')
        elif re.search("'",word):
            word = word.replace("'",'')
        elif re.search('"',word):
            word = word.replace('"','')
        else:
            break
    return word

In [None]:
# To perform porter stemming on the words
def porter_stemming(token_list):
    stemming_tokens = []
    ps = PorterStemmer()
    for token in token_list:
        stemming_tokens.append(ps.stem(token))
    return stemming_tokens

In [None]:
# Getting all the word in the sentence, removing punctuation, stopwords and checking if it starts with letter
def get_words(sentence):
    global STOPWORDS
    words_raw = sentence.split()
    word_list = []
    for word in words_raw:
        word = remove_punctuations(word)
        if not is_unnecessary(word) and word not in STOPWORDS:
            word_list.append(word.lower())
    return word_list

In [None]:
# Get all the words in the collection
def get_documents_word_list():
    documents_word_list = []
    for i in range(1,N):
        with open(f"Abstracts//Doc{i}.txt",'r',encoding="UTF-8") as file:
            for line in file:
                words = get_words(line)
                for word in words:
                    documents_word_list.append(word)
    return documents_word_list 

In [None]:
# Generate all the (term,DocId) pairs
def get_all_pairs(term_list):
    all_pairs = []
    for i in range(1,N):
        with open(f"Abstracts//Doc{i}.txt",'r',encoding="UTF-8") as file:
    
            document_word_list = []
            for line in file:
                words_line = get_words(line)
                for word in words_line:
                    document_word_list.append(word)

            stemmed_document_words = porter_stemming(document_word_list)

            for term in term_list:
                if term in stemmed_document_words:
                    all_pairs.append((term,str(i)))
    return all_pairs

In [None]:
# Getting the word list from all the documents
documents_word_list = get_documents_word_list()

# Tokens after peforming porter stemming
tokens = porter_stemming(documents_word_list)

# Vocabulary terms
term_list = set(tokens)

In [None]:
# All (term, DocId) Pairs
all_pairs = get_all_pairs(term_list)

# Creating the postings list
vocabulary = {}
term_list = sorted(list(term_list))
for term in term_list:
    vocabulary[term] = []

for tup in all_pairs:
    vocabulary[tup[0]].append(tup[1])

In [None]:
# Inverted Index
Inverted_Index = {}
for k,v in vocabulary.items():
    posting_list = [str(len(v))]
    posting_list += v
    Inverted_Index[k] = posting_list

In [None]:
# Wtiting the vocabulary and posting list in a file
def write_dict_post_to_file():

    global Inverted_Index

    dictionary_file = open('dictionary.txt','w',encoding="UTF-8")
    posting_list_file = open('posting_list.txt','w',encoding="UTF-8")

    for key in Inverted_Index.keys():
        dictionary_file.write(key)
        dictionary_file.write("\n")

    for key in Inverted_Index:
        posting_list = " ".join(Inverted_Index[key])
        posting_list_file.write(posting_list)
        posting_list_file.write("\n")

    dictionary_file.close()
    posting_list_file.close()

# Uncomment to write to file
# write_dict_post_to_file()

## COSINE SIMILARITY

In [None]:
# Get the inverted index from the file
def get_inverted_index():
    index = {}
    term_list = []
    with open("dictionary.txt","r",encoding="UTF-8") as file:
        for line in file:
            word = line.strip()
            index[word] = 0
            term_list.append(word)
    with open("posting_list.txt","r",encoding="UTF-8") as file:
        i = 0
        for line in file:
            index[term_list[i]] = line.split()
            i += 1
    return(index)

Inverted_Index = get_inverted_index()

In [None]:
# Get all the words present in the document
def get_document_words(id):
    word_list = []
    with open(f"Abstracts//Doc{id}.txt",'r',encoding="UTF-8") as file:
        for line in file:
            words = get_words(line)
            for word in words:
                word_list.append(word)
    return word_list

In [None]:
# Vocabulary Terms
term_list = list(Inverted_Index.keys())

# Number of Documents
#!!!! will be updated when re-run
# N = 3535
# Number of Terms in Vocabulary
M = len(term_list)

# An inverse_term_list mapping term to index
inverse_term_list = {}
for i in range(M):
    inverse_term_list[term_list[i]] = i

In [None]:
# Counting the Frequency
def count_freq(words_list):
    freq = {}
    for word in words_list:
        if word in freq:
            freq[word] += 1
        else:
            freq[word] = 1
    return freq

In [None]:
N=2000

In [None]:
# Generating TF Matrix for all the Document
def tf_matrix():
    TF_MATRIX = [[0 for _ in range(M)] for __ in range(N)]
    for i in range(N):
        doc_words = get_document_words(i+1)
        word_freq = count_freq(doc_words)
        for j in range(M):
            if term_list[j] in word_freq:
                TF_MATRIX[i][j] = word_freq[term_list[j]] 
    return TF_MATRIX

TF_SCORES = tf_matrix()

In [None]:
# Getting TF score from the TF Matrix
def tf_score(docID,term):
    global TF_SCORES, term_list, inverse_term_list

    doc = docID-1
    t = inverse_term_list[term]
    
    return TF_SCORES[doc][t]

In [None]:
# Getting IDF score
def idf_score(term):
    global Inverted_Index

    df = int(Inverted_Index[term][0])
    idf = math.log(N/df)
    return idf

In [None]:
# TF-IDF score 
def tf_idf_score(docID,term):
    
    tf = tf_score(docID,term)
    idf = idf_score(term)
    return tf*idf

In [None]:
# Calculating Dot Product between two vectors
def dot_product(v1,v2):
    global M
    val = 0
    for i in range(M):
        val += v1[i]*v2[i]
    return val

In [None]:
# Calculating magnitude of a vector
def magnitude(v):
    squared_sum = sum([x*x for x in v])
    return math.sqrt(squared_sum)

In [None]:
# Vectorizing Query or Document and futher normalizing the values
def vector_normalize(docID=None,terms=None,isQuery=False):
    global term_list, M
    hashmap = {t:0 for t in term_list}

    if isQuery:
        for t in terms:
            if t in hashmap:
                hashmap[t] += 1
    else:
        for t in term_list:
            hashmap[t] = tf_idf_score(docID,t)

    m = magnitude(hashmap.values())
    
    norm_vector = [0]*M
    if m:
        for i in range(M):
            norm_vector[i] = hashmap[term_list[i]]/m
    
    return norm_vector

In [None]:
# Creating Term Document Matrix to store the TF-IDF score 
def term_document_matrix():
    global M,N
    TERM_DOCUMENT_MATRIX = []
    for i in range(N):
        TERM_DOCUMENT_MATRIX.append(vector_normalize(i+1))
    return TERM_DOCUMENT_MATRIX


TERM_DOCUMENT_MATRIX = term_document_matrix()

In [None]:
# Calculating Cosine Similarity between two vectors
def cosine_similarity(v1,v2):
    m1 = magnitude(v1)
    m2 = magnitude(v2)
    m = m1*m2
    if m:
        sim = dot_product(v1,v2)/m
        return sim
    return 0

In [None]:
# To calculate the cosine similarity between query and documents
def query_doc_similarity(q):
    global TERM_DOCUMENT_MATRIX, N

    QUERY_DOC_SIMILAR = [0 for _ in range(N)]

    for i in range(N):
        doc = TERM_DOCUMENT_MATRIX[i]
        QUERY_DOC_SIMILAR[i] = cosine_similarity(q,doc)
    return QUERY_DOC_SIMILAR

In [None]:
# Order by most similar document
def order_by_similar(array):
    sim_score = np.array(array)
    top_sim = np.argsort(sim_score)
    return top_sim[::-1]

In [None]:
# Retriving the Top 10 Document Descriptions
def get_abstract_descriptions(ids):
    description = set()
    for id in ids:
        if len(description)>=10:
            break

        with open(f"Abstracts/Doc{id+1}.txt","r",encoding="utf8") as file:
            title = ''
            text = ''
            for line in file:
                if not title:
                    title = line.strip()
                else:
                    text = line[:200]
            description.add((title,text))
    return description

## Okapi BM25

In [None]:
k = 1.2
b = 0.75

In [None]:
# Get the length of the document
def get_doc_length(docId):
    words = get_document_words(docId)
    return len(words)

In [None]:
# An array to score each document length
DOC_LENGTH = [0]*N
SUM = 0
for i in range(N):
    DOC_LENGTH[i] = get_doc_length(i+1)
    SUM += DOC_LENGTH[i]

# Storing the average length of the document
AVG_DOC_LENGTH = SUM/N

In [None]:
# An IDF Score for BM25
def IDF(term):
    global Inverted_Index,N

    df = int(Inverted_Index[term][0])
    idf = math.log((N - df + 0.5) / (df + 0.5))
    return idf

In [None]:
# Score using BM25
def rsv(q,doc):
    global TF_SCORES, k, b, DOC_LENGTH, AVG_DOC_LENGTH, inverse_term_list
    score = 0
    for t in q:
        t_index = inverse_term_list[t]
        tf = TF_SCORES[doc-1][t_index]
        num = tf * (k+1)
        deno = tf + k*( (1-b) + (b*DOC_LENGTH[doc-1]/AVG_DOC_LENGTH))
        idf = IDF(t)
        score += idf*num/deno
    return score

In [None]:
# BM25 score for each document given the query
def BM25(q):
    bm25_scores = []
    for i in range(N):
        bm25_scores.append(rsv(q,i+1))
    return bm25_scores

In [None]:
def run_cosine(query):
    query = query.split()
    stemmed_query = porter_stemming(query)
    vectorized_query = vector_normalize(terms=stemmed_query,isQuery=True)

    QUERY_DOC_SIMILAR = query_doc_similarity(vectorized_query)

    similar_doc = order_by_similar(QUERY_DOC_SIMILAR)

    titles = get_abstract_descriptions(similar_doc)

    return titles

In [None]:
def run_bm25(query):
    q = []
    for t in query.split():
        if t not in STOPWORDS:
            q.append(t)
    q = porter_stemming(q)

    bm25_scores = BM25(q)

    bm25_sim_doc = order_by_similar(bm25_scores)
    
    titles = get_abstract_descriptions(bm25_sim_doc)
    
    return titles
    

In [None]:
def show_description(description):
    count = 1
    for des in description:
        print("=="*15,count,"=="*15)
        print(des[0])
        print(des[1])
        count += 1

In [112]:
# Storing The query
QUERY = input("Enter QUERY: ")
# QUERY = "Elastic Scaling"

print("*"*50)
titles = run_cosine(QUERY)
show_description(titles)
print("*"*50)

print("*"*50)
bm_titles = run_bm25(QUERY)
show_description(titles)
print("*"*50)

Enter QUERY: Cloud Computing
**************************************************
Machine Learning for Performance Prediction of Spark Cloud Applications
Big data applications and analytics are employed in many sectors for a variety of goals: improving customers satisfaction, predicting market behavior or improving processes in public health. These app
Adaptive Graph Convolution for Point Cloud Analysis

Skyplane: Optimizing Transfer Cost and Throughput Using Cloud-Aware Overlays
Cloud applications are increasingly distributing data across multiple regions and cloud providers. Unfortunately, wide-area bulk data transfers are often slow, bottlenecking applications. We demonstra
Point-DAE: Denoising Autoencoders for Self-supervised Point Cloud Learning

Adversarial Attack by Limited Point Cloud Surface Modifications
Recent research has revealed that the security of deep neural networks that directly process 3D point clouds to classify objects can be threatened by adversarial samples. Altho

In [113]:
goodness = 0

for i in bm_titles:
  if i in titles:
    print("Match    >>>", "Title: ", i[0])
    goodness += 1
  else:
    print("Unmatches")

print("There were "+str(goodness)+"/10 matches found")


Unmatches
Match    >>> Title:  Skyplane: Optimizing Transfer Cost and Throughput Using Cloud-Aware Overlays
Match    >>> Title:  Adaptive Graph Convolution for Point Cloud Analysis
Match    >>> Title:  Adversarial Attack by Limited Point Cloud Surface Modifications
Unmatches
Match    >>> Title:  Point-DAE: Denoising Autoencoders for Self-supervised Point Cloud Learning
Unmatches
Match    >>> Title:  Machine Learning for Performance Prediction of Spark Cloud Applications
Match    >>> Title:  PointCA: Evaluating the Robustness of 3D Point Cloud Completion Models Against Adversarial Examples
Match    >>> Title:  Cloud-Device Collaborative Adaptation to Continual Changing Environments in the Real-world
There were 7/10 matches found
