Web crawler for publications from the School of Economics, Finance and Accounting of Coventry University

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time

def mycrawler(maxcount):
    """Crawl the school's publication listing and save the details to disk.

    Fetches the listing pages ``?page=0`` up to ``?page=maxcount - 1``,
    follows the link found in every ``<h3 class="title">`` heading and
    collects the title, abstract, authors, publication link, author
    profile links and publication date of each publication.

    The accumulated records are written to ``database.json`` after every
    publication, so an interrupted crawl still leaves the data gathered
    so far on disk.

    Args:
        maxcount: number of listing pages to fetch, starting at page 0.

    Returns:
        None. The crawled records are stored in ``database.json``.

    Raises:
        requests.RequestException: if a page cannot be fetched, including
            when the server does not answer within the timeout.
    """
    listing_url = (
        "https://pureportal.coventry.ac.uk/en/organisations/"
        "school-of-economics-finance-and-accounting/publications/?page="
    )
    request_timeout = 30  # seconds; without it a stalled server hangs the crawl

    def text_or_blank(tag):
        # Missing elements get the same " " placeholder the abstract uses.
        return " " if tag is None else tag.get_text()

    crawled_data = []  # one dictionary per publication
    count = 0
    while count < maxcount:
        url = listing_url + str(count)
        print("fetching: ", url)
        count += 1
        html = requests.get(url, timeout=request_timeout)
        time.sleep(1)  # pause between requests (the site's Crawl-Delay is 1)
        listing = BeautifulSoup(html.text, "html.parser")
        for heading in listing.find_all("h3", {"class": "title"}):
            anchor = heading.a
            publication_link = None if anchor is None else anchor.get("href")
            if not publication_link:
                # A heading without a link has no detail page to fetch.
                continue
            html = requests.get(publication_link, timeout=request_timeout)
            detail = BeautifulSoup(html.text, "html.parser")
            time.sleep(1)
            author_links = [
                link.get("href")
                for link in detail.find_all("a", {"class": "link person"})
            ]
            crawled_data.append({
                "publication title": heading.get_text(),
                "abstract": text_or_blank(
                    detail.find("div", {"class": "textblock"})),
                "authors": text_or_blank(
                    detail.find("p", {"class": "relations persons"})),
                "publication link": publication_link,
                "authors link": author_links,
                "publication date": text_or_blank(
                    detail.find("span", {"class": "date"})),
            })
            # Checkpoint: rewrite the whole file after every publication.
            with open("database.json", "w") as jsonfile:
                json.dump(crawled_data, jsonfile, indent=4)
    print(f"Crawling Completed with {count} number of pages")
        
# Crawl a single listing page (page index 0); results go to database.json.
mycrawler(1)

In [None]:
# with open("database.json", "r") as jsonfile:
#     for lines in jsonfile:
#         print(lines)

In [None]:
User-Agent: *
Crawl-Delay: 1
Disallow: /*?*format=rss
Disallow: /*?*export=xls
Sitemap: https://pureportal.coventry.ac.uk/sitemap.xml

Word Processor

In [6]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
import string
# Shared Porter stemmer instance, used by word_processor for every token.
ps = PorterStemmer()
# English stop words from the NLTK corpus (needs the 'stopwords' data, see
# the commented-out nltk.download call above).
sw = stopwords.words('english')


def word_processor(tokens):
    """Normalise a piece of text into a list of stemmed, non-stop-word tokens.

    The input is converted to a string, lower-cased and stripped of ASCII
    punctuation, then tokenised; stop words are dropped and every
    remaining token is reduced to its Porter stem.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    normalised = str(tokens).lower().translate(strip_punctuation)
    return [ps.stem(term) for term in word_tokenize(normalised) if term not in sw]

Inverted Index

In [7]:
import json

def inverted_index(data):
    """Build the inverted index of the crawled documents and save it.

    Every string field of every document is passed through
    ``word_processor``; for each resulting term the index records in
    which documents it occurs and how often. The result is written to
    ``inv_index.json`` in the form
    ``{term: [[document position, occurrences], ...]}``.

    Args:
        data: list of publication dictionaries; a document is identified
            by its position in this list.

    Returns:
        None. The index is stored in ``inv_index.json``.
    """
    from collections import Counter, defaultdict

    postings = defaultdict(Counter)  # term -> {document position: occurrences}
    # enumerate gives each record its own position; data.index(document)
    # returned the first *equal* record, so duplicates were mis-attributed.
    for doc_index, document in enumerate(data):
        for value in document.values():
            if isinstance(value, list):  # author profile links, not indexed
                continue
            # NOTE(review): this skips any text that merely contains "http",
            # not only URL fields - confirm that this is intended.
            if 'http' in value:
                continue
            for word in word_processor(value):
                postings[word][doc_index] += 1

    # Postings are listed in order of first occurrence, i.e. by ascending
    # document position.
    inv_index = {word: list(counts.items()) for word, counts in postings.items()}
    with open("inv_index.json", "w") as jsonfile:
        json.dump(inv_index, jsonfile)

Query Processor

In [12]:
import math

def query_processor(data,query,index):
    """Print the documents matching *query*, ranked by summed tf-idf score.

    The query is normalised with ``word_processor``. For every query term
    found in ``index`` each posting contributes
    ``log10(1 + term count) * log10(1 + number of documents / number of postings)``,
    rounded to two decimals. The contributions are summed per document
    and the documents are printed from highest to lowest total.

    Args:
        data: list of publication dictionaries.
        query: the raw search string.
        index: mapping of term to a list of (document position, count) pairs.
    """
    total_docs = len(data)
    query_terms = word_processor(query)
    matched_terms = []   # query terms that occur in the index
    scores_by_doc = {}   # document position -> list of per-term scores
    for term in query_terms:
        if term not in index:
            continue
        matched_terms.append(term)
        postings = index[term]
        for posting in postings:
            tf = math.log10(1 + posting[1])
            idf = math.log10(1 + total_docs / len(postings))
            scores_by_doc.setdefault(posting[0], []).append(round(tf * idf, 2))

    totals = {doc_id: round(sum(parts), 2) for doc_id, parts in scores_by_doc.items()}
    # Stable sort: documents with equal totals keep their discovery order.
    ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    retrieved_id = [doc_id for doc_id, _ in ranked]

    print(f"your search:  {query}")
    print(f"Relevant words in your search {matched_terms}")
    print(" ")
    print(f"Your search returned {len(retrieved_id)} result")
    print(" ")
    for doc_id in retrieved_id:
        record = data[doc_id]
        print("Publication Title: ", record['publication title'])
        print("Publication link: ", record['publication link'])
        print("Abstract: ", record['abstract'][:400] + '............')
        print("Authors: ", record['authors'])
        print("Publication Date: ", record['publication date'])
        print(" ")

Get Results

In [18]:
import json

def get_query():
    """Load the stored data and index, prompt for a query and run the search.

    Reads ``database.json`` and ``inv_index.json`` from the working
    directory, asks the user for a search string and hands all three to
    ``query_processor``.

    Returns:
        Whatever ``query_processor`` returns.

    Raises:
        FileNotFoundError: if either JSON file does not exist.
        json.JSONDecodeError: if either file is not valid JSON.
    """
    # Context managers close the files even when json.load raises.
    with open('database.json') as database_file:
        data = json.load(database_file)
    with open('inv_index.json') as index_file:
        index = json.load(index_file)
    query = input("Enter query to search: ")
    return query_processor(data, query, index)

# Prompt for a search query and print the ranked results.
get_query()

your search:  the school of science publications
Relevant words in your search ['school', 'scienc', 'public']
 
Your search returned 73 result
 
Publication Title:  Can Citizen Science Increase Trust in Research ? A Case Study of Delineating Polish Metropolitan Areas
Publication link:  https://pureportal.coventry.ac.uk/en/publications/can-citizen-science-increase-trust-in-research-a-case-study-of-de
Abstract:  We assess the relationship between citizens’ participation in scientific research and public trust in research results within social sciences. We conduct an online citizen science quasi- experiment concerning the delineation of metropolitan areas of Poland’s two major cities. It consists of two stages. In stage one, participants in one region are exposed to citizen science and directly involved in............
Authors:  Baptiste Bedessem, Bogna Gawronska-Nowak, Piotr Lis
Publication Date:  17 May 2021
 
Publication Title:  Marketing for higher education institutions: determinants 