Web crawler for publications from the School of Economics, Finance and Accounting at Coventry University

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time

def _text_or_blank(node):
    """Return the text of a parsed element, or " " when the element is missing."""
    return " " if node is None else node.get_text()


def mycrawler(maxcount):
    """Crawl publication listing pages and store the results in database.json.

    Listing pages ``?page=0`` through ``?page=maxcount - 1`` are fetched. Every
    publication linked from a listing is then fetched in turn to collect its
    title, abstract, authors, author profile links and date. The crawler
    sleeps for one second after every request.

    Elements that are absent from a publication page (abstract, authors,
    date) are stored as a single space instead of aborting the crawl.

    Args:
        maxcount: number of listing pages to crawl.

    Raises:
        requests.exceptions.RequestException: if a request fails or times out.
            Records collected before the failure are still written to disk.
    """
    from urllib.parse import urljoin

    listing_url = (
        "https://pureportal.coventry.ac.uk/en/organisations/"
        "school-of-economics-finance-and-accounting/publications/?page="
    )
    request_timeout = 30  # seconds; without it a stalled server hangs the crawl
    crawled_data = []
    count = 0
    while count < maxcount:
        url = listing_url + str(count)
        print("fetching: ", url)
        count += 1
        html = requests.get(url, timeout=request_timeout)
        time.sleep(1)
        listing = BeautifulSoup(html.text, "html.parser")
        saved_before = len(crawled_data)
        try:
            for p in listing.find_all("h3", {"class": "title"}):
                anchor = p.a
                if anchor is None or not anchor.get("href"):
                    continue  # heading without a link: nothing to follow
                # urljoin leaves absolute links untouched and resolves
                # relative ones against the listing page.
                publication_link = urljoin(url, anchor.get("href"))
                page = requests.get(publication_link, timeout=request_timeout)
                soup = BeautifulSoup(page.text, "html.parser")
                time.sleep(1)
                crawled_data.append({
                    "publication title": p.get_text(),
                    "abstract": _text_or_blank(
                        soup.find("div", {"class": "textblock"})
                    ),
                    "authors": _text_or_blank(
                        soup.find("p", {"class": "relations persons"})
                    ),
                    "publication link": publication_link,
                    "authors link": [
                        link.get("href")
                        for link in soup.find_all("a", {"class": "link person"})
                    ],
                    "publication date": _text_or_blank(
                        soup.find("span", {"class": "date"})
                    ),
                })
        finally:
            # Persist once per listing page (not once per publication), and
            # also when a page is interrupted by an error, so that collected
            # records are never lost.
            if len(crawled_data) > saved_before:
                with open("database.json", "w") as jsonfile:
                    json.dump(crawled_data, jsonfile, indent=4)
    print(f"Crawling Completed with {count} number of pages")
        
# Crawl listing pages 0 through 12 and write the collected records to database.json.
mycrawler(13)

In [None]:
User-Agent: *
Crawl-Delay: 1
Disallow: /*?*format=rss
Disallow: /*?*export=xls
Sitemap: https://pureportal.coventry.ac.uk/sitemap.xml

Word Processor

In [None]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import string
# Shared stemmer instance used by word_processor.
ps = PorterStemmer()
# English stop words held in a set: word_processor tests membership once per
# token, and a set answers that in constant time instead of scanning a list.
sw = set(stopwords.words('english'))


def word_processor(tokens):
    """Normalise text into a list of stemmed tokens with stop words removed.

    The input is converted to a string, lower-cased and stripped of all
    characters in ``string.punctuation`` before being tokenised. Tokens found
    in the stop-word collection ``sw`` are dropped and the rest are stemmed
    with ``ps``.

    Args:
        tokens: the text to process (any value; it is passed through ``str``).

    Returns:
        list of stemmed tokens in their original order.
    """
    lowered = str(tokens).lower()
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = lowered.translate(punctuation_remover)
    return [ps.stem(term) for term in word_tokenize(cleaned) if term not in sw]

Inverted Index

In [None]:
import json

# Load the crawled publication records. The context manager closes the file
# handle as soon as the JSON has been parsed.
with open('database.json') as file:
    data = json.load(file)


def inverted_index(data):
    """Build an inverted index over the text fields of the crawled records.

    Each record's position in ``data`` is its document id. List values (the
    author profile links) and string values that are themselves URLs are not
    indexed; every other value is run through ``word_processor``.

    Args:
        data: list of publication records (dicts), e.g. as loaded from
            database.json.

    Returns:
        dict mapping each processed term to a list of
        ``(doc_id, occurrences)`` tuples, in ascending ``doc_id`` order.
    """
    postings = {}  # term -> {doc_id: number of occurrences in that document}
    # enumerate supplies the position directly; looking it up with
    # data.index() is linear per field and sends duplicate records to the
    # id of the first equal record.
    for doc_index, document in enumerate(data):
        for value in document.values():
            if isinstance(value, list):
                continue  # profile links are stored as a list
            # Skip only fields that are URLs, not text that mentions one.
            if isinstance(value, str) and value.startswith(("http://", "https://")):
                continue
            for word in word_processor(value):
                counts = postings.setdefault(word, {})
                counts[doc_index] = counts.get(doc_index, 0) + 1
    return {word: list(counts.items()) for word, counts in postings.items()}

# Build the inverted index for the loaded records (the result is not stored here).
inverted_index(data)

Query Processor

In [None]:
import math

def query_processor(data, query, index):
    """Rank the documents matching a query by tf-idf and print the results.

    For each processed query term found in ``index``, every posting
    contributes ``log10(1 + count) * log10(1 + N / df)`` rounded to two
    decimals, where ``N`` is ``len(data)`` and ``df`` is the number of
    documents containing the term. A document's score is the sum of its
    contributions, rounded to two decimals.

    Args:
        data: list of publication records; document ids index into it.
        query: the raw query text.
        index: mapping of term -> list of ``(doc_id, count)`` tuples, as
            produced by ``inverted_index``.

    Returns:
        list of matching document ids ordered by descending score.
    """
    doc_len = len(data)
    score_dict = {}  # doc_id -> list of per-term tf-idf contributions
    for word in word_processor(query):
        postings = index.get(word)
        if not postings:
            continue
        # The idf depends only on the term, so compute it once per term.
        idf = math.log10(1 + doc_len / len(postings))
        for doc_id, occurrences in postings:
            tf = math.log10(1 + occurrences)
            score_dict.setdefault(doc_id, []).append(round(tf * idf, 2))

    weighted_score = {
        doc_id: round(sum(scores), 2) for doc_id, scores in score_dict.items()
    }
    ranked = sorted(weighted_score.items(), key=lambda item: item[1], reverse=True)
    retrieved_id = [doc_id for doc_id, _ in ranked]

    print(f"Your search returned {len(retrieved_id)} result")
    print(" ")
    for doc_id in retrieved_id:
        record = data[doc_id]
        print("Publication Title: ", record['publication title'])
        print("Publication link: ", record['publication link'])
        print("Abstract: ", record['abstract'][:400] + '............')
        print("Authors: ", record['authors'])
        print("Publication Date: ", record['publication date'])
        print(" ")
    # Hand the ranking back so callers receive it instead of None.
    return retrieved_id

Get Results

In [None]:
import json

def get_query():
    """Prompt for a search query and run it against the crawled database.

    Loads the records from database.json, builds the inverted index with
    ``inverted_index`` and passes the query to ``query_processor``, which
    prints the matches.

    Returns:
        the value returned by ``query_processor``.
    """
    # The context manager closes the file handle once the JSON is parsed.
    with open('database.json') as file:
        data = json.load(file)
    query = input("Enter query to search: ")
    index = inverted_index(data)
    return query_processor(data, query, index)

# Prompt for a query and print the ranked matches.
get_query()
