In [1]:
import undetected_chromedriver as uc
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from datetime import datetime
import pandas as pd
import time
from tqdm import tqdm
from sys import stdout

# Queries that will be run against the unt.edu site search.
search_keys = [
    "Willis",
    "Sycamore",
    "Discovery Park",
    "Welch",
    "Union",
    "General Academic Building",
    "Gateway center",
    "Research",
    "Computer Science",
    "Data Engineering",
]

# One search-results URL per query; spaces in a query become '+' in the URL.
url_list = [
    'https://www.unt.edu/search-results?search=' + key.replace(' ', '+') + '&sa=Search'
    for key in search_keys
]

# Collects the text of every scraped element across all queries and pages.
contents = []

# For every query URL, open the search page, click through the result pager and
# append the text of each `gsc-thumbnail-inside` element to `contents`.
MAX_RESULT_PAGES = 10  # pager buttons 1..10 are tried for each query

for url in url_list:
    driver = uc.Chrome(use_subprocess=True,)
    try:
        driver.get(url)

        # Block until the search widget container is present in the DOM.
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//*[@class='gsc-wrapper']")))

        # XPath positions are 1-based, so iterate over page numbers directly
        # instead of bumping a 0-based loop variable inside the loop.
        for page in range(1, MAX_RESULT_PAGES + 1):
            try:
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//*[@class='gsc-cursor']/div["+str(page)+"]"))).click()

                # NOTE(review): presumably gives the widget time to swap in the new
                # page before the results are read; confirm an explicit wait is not enough.
                time.sleep(1)
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//*[@class='gsc-wrapper']")))
                elements = driver.find_elements(By.XPATH, "//*[@class='gsc-thumbnail-inside']")
                for element in elements:
                    contents.append(element.text)
            except Exception:
                # Best effort: any Selenium failure (e.g. the pager button for this
                # page never becomes clickable) ends paging for this query.
                # Catching Exception rather than using a bare `except:` lets
                # KeyboardInterrupt / SystemExit propagate.
                break
    finally:
        # Always shut the browser down; otherwise every query leaves a Chrome
        # process running.
        driver.quit()

First: 
    Query sets:
        1. Willis
        2. Sycamore
        3. Discovery Park
        4. Welch
        5. Union
        6. General Academic Building
        7. Gateway center
        8. Research
        9. Computer Science
        10. Data Engineering
Run each query on the original unt.edu site search and scrape the search results it returns.

Second: Save the scraped results to a CSV file.

In [4]:
import pandas as pd
# Drop the empty strings from the scraped contents and write the remaining
# texts to a single-column CSV (the DataFrame index is written as well).
texts = [content for content in contents if content != '']
df = pd.DataFrame({'texts': texts})
df.to_csv('resources/text.csv', index=True, encoding='utf-8')

Third: Create a dictionary of words (indexing) for the vector space retrieval model, and then build a search engine on top of that model.

In [5]:
import os
import sys
import pickle
import math

import pandas
from collections import Counter
from utils import textprocessing, helpers
# Index data: resolve the working directories, load the stopwords and read the
# scraped documents from resources/text.csv.

print('Indexing....')

resources_path = os.path.join(os.getcwd(), 'resources')
data_path = os.path.join(os.getcwd(), 'data')

# The resources directory holds the inputs (dataset and stopwords); without it
# there is nothing to index.
if not os.path.isdir(resources_path):
    print('ERROR: The {} is not a directory or does not exist'.format(
        resources_path))
    sys.exit(1)

# Create the output directory if needed; exist_ok avoids the check-then-create
# race of a separate exists() test.
os.makedirs(data_path, exist_ok=True)

# Get dataset path and stopwords file
dataset_path = os.path.join(resources_path, 'text.csv')
stopwords_file = os.path.join(resources_path, 'stopwords_en.txt')

# Get stopwords set
stopwords = helpers.get_stopwords(stopwords_file)

# keep_default_na=False + dtype=str: by default pandas converts cells such as
# "NA", "null" or "None" to float NaN, which would then be handed to the text
# preprocessing as a non-string. Every scraped text must stay a string.
df = pandas.read_csv(dataset_path, keep_default_na=False, dtype={'texts': str})

# Index with [] rather than .get(): a missing column raises a clear KeyError
# instead of silently producing None.
print(df['texts'])
docs = list(df['texts'])


# One Counter per document, built from the preprocessed words of that document.
corpus = [
    Counter(textprocessing.preprocess_text(document, stopwords))
    for document in docs
]

# Compute idf over the whole corpus, then weight and normalise every bag.
# The helpers' return values are ignored, so they presumably update each
# Counter in place (verify against utils.helpers).
idf = helpers.compute_idf(corpus)
for bag in corpus:
    helpers.compute_weights(idf, bag)
    helpers.normalize(bag)

inverted_index = helpers.build_inverted_index(idf, corpus)

# Output files written into the data directory.
docs_file = os.path.join(data_path, 'docs.pickle')
inverted_index_file = os.path.join(data_path, 'inverted_index.pickle')
dictionary_file = os.path.join(data_path, 'dictionary.txt')

# Serialize data
with open(docs_file, 'wb') as f:
    pickle.dump(docs, f)

with open(inverted_index_file, 'wb') as f:
    pickle.dump(inverted_index, f)

# Write one dictionary term per line. The encoding is given explicitly (the
# scraped CSV is UTF-8 as well): with the platform default encoding a
# non-ASCII term can raise UnicodeEncodeError on non-UTF-8 locales.
with open(dictionary_file, 'w', encoding='utf-8') as f:
    for word in idf.keys():
        f.write(word + '\n')

print('Index done.')


Indexing....
0            Willis Library - University Libraries - UNT
1      Reserving a Study Space - University Libraries...
2                                          UNT Libraries
3                    Spaces - University Libraries - UNT
4                           Willis Library: Fourth Floor
                             ...                        
548                               Mechanical Engineering
549    Dataflow based Near Data Computing Achieves Ex...
550    Dataflow based Near-Data Processing using Coar...
551    * The Program Inventory displays the minimum n...
552    An Industrial Case Study About Test Failure Pr...
Name: texts, Length: 553, dtype: object
Index done.


Finally: Run a query with one of the same search keys on the newly created vector space retrieval search engine.

In [3]:
import pickle
import os
import sys
import math
from utils import textprocessing
from utils import helpers
from collections import Counter
# Query: load the pickled documents and inverted index from ./data, and the
# stopwords from ./resources.
working_dir = os.getcwd()
stopwords_file = os.path.join(working_dir, 'resources', 'stopwords_en.txt')
docs_file = os.path.join(working_dir, 'data', 'docs.pickle')
inverted_index_file = os.path.join(working_dir, 'data', 'inverted_index.pickle')

# Deserialize data
with open(docs_file, 'rb') as docs_fh:
    docs = pickle.load(docs_fh)
with open(inverted_index_file, 'rb') as index_fh:
    inverted_index = pickle.load(index_fh)

stopwords = helpers.get_stopwords(stopwords_file)

# Every term that has an entry in the inverted index.
dictionary = set(inverted_index.keys())

# Read the query interactively from the user.
query_input = input("Query: ")

# Preprocess the query and count only the terms the index knows about.
query_terms = textprocessing.preprocess_text(query_input, stopwords)
query = Counter(term for term in query_terms if term in dictionary)

# Replace each raw count with idf * (1 + log(count)).
for term, count in list(query.items()):
    query[term] = inverted_index[term]['idf'] * (1 + math.log(count))

helpers.normalize(query)

# One [document index, score] pair per document, all starting at zero.
scores = [[doc_id, 0] for doc_id, _ in enumerate(docs)]

# For every query term, add query weight * posting weight to each document
# listed in that term's postings list.
for term, query_weight in query.items():
    for doc_id, doc_weight in inverted_index[term]['postings_list']:
        scores[doc_id][1] += query_weight * doc_weight

# Highest score first.
scores.sort(key=lambda pair: pair[1], reverse=True)

print('----- Results query: ('+ query_input +')------ ')
for rank, (doc_id, similarity) in enumerate(scores, start=1):
    # The list is sorted, so the first zero score ends the useful results.
    if similarity == 0:
        break
    print('{}. {} - {}'.format(rank, docs[doc_id], similarity))

----- Results query: (Computer Science)------ 
1. Computer Science & Engineering - 0.8810942888417447
2. Department of Computer Science and Engineering - 0.7547141503480617
3. Department of Computer Science and Engineering - University of ... - 0.7304441293993396
4. Contact Us | Computer Science and Engineering - 0.6908853157399746
5. Contact Us | Computer Science and Engineering - 0.6908853157399746
6. Discovery Park B205 | Computing for Arts + Sciences - 0.6684611170637984
7. GATE 141 | Computing for Arts + Sciences - 0.5357695888096836
8. Departmental Staff | Computer Science and Engineering - 0.520615437958739
9. Computer Labs - University Libraries - UNT - 0.42836244440554433
10. Computer Labs - University Libraries - UNT - 0.42836244440554433
11. The Student Computer Lab at Discovery Park | College of Information - 0.3228691794000635
12. Social Science | University of North Texas - 0.311183794929343
13. Information Science | College of Information - 0.3041174301061119
14. UNT Com