Improvement list:

1. Tell how many articles were found with the search term
2. Clock all the phases and find bottlenecks
3. Investigate why some pages show reference list and others don't, see if there is a way to go around it

https://journals.sagepub.com/action/doSearch?content=articlesChapters&countTerms=true&target=default&field1=AllField&text1=circular+economy&field2=AllField&text2=plastic+packaging&field3=AllField&text3=value+creation

In [1]:
import concurrent.futures
import requests
from bs4 import BeautifulSoup
import time
import json
import re

In [2]:
MAX_THREADS = 30

In [3]:
search_query = ["circular economy", "plastic packaging", "value creation"]

https://httpbin.org/anything

In [4]:
#headers = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 
#    "Accept-Encoding": "gzip, deflate, br", 
#    "Accept-Language": "en-US,en;q=0.9,fi;q=0.8", 
#    "Host": "httpbin.org", 
#    "Sec-Fetch-Dest": "document", 
#    "Sec-Fetch-Mode": "navigate", 
#    "Sec-Fetch-Site": "none", 
#    "Upgrade-Insecure-Requests": "1", 
#    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36", 
#    "X-Amzn-Trace-Id": "Root=1-5f997c9a-1b62026a26061a4035776665"
#  }

# Classes

In [5]:
class SearchResult:
    
    def __init__(self, search_query, articles):
        self.search_query = search_query
        self.articles = articles

In [6]:
class Article:
    
    def __init__(self, name, link = "", references = []):
        self.__name = self.generalize_article_name(name)
        self.__link = link
        self.__references = [self.generalize_article_name(ref) for ref in references]
        
    def get_name(self):
        return self.__name
    
    def get_link(self):
        return self.__link
    
    def get_references(self):
        return self.__references
        
    def set_name(self, name):
        self.__name = self.generalize_article_name(name)
        
    def set_link(self, link):
        self.__link = link
    
    def set_references(self, references):
        self.__references = [self.generalize_article_name(ref) for ref in references]
        
    def generalize_article_name(self, name):
        """Generalize the name of an article reference into a simplified version to avoid having different names that 
        are linked to the same article
    
        Keyword arguments:
        name -- the name of the reference
        """
        alphanumeric = re.sub("[^\w\s]", "", name)
        return alphanumeric.lower()

# Functions

In [7]:
def create_base_url(search_query):
    url = "https://journals.sagepub.com/action/doSearch?content=articlesChapters&countTerms=true&target=default"
    num = len(search_query)
    for i in range(num):
        pre_term = "&field" + str(i+1) + "=AllField&text" + str(i+1) + "="
        search_term = search_query[i].replace(" ", "+")
        url = url + pre_term + search_term
    return url

In [8]:
def create_page_list(base_url, page_size, page_amount):
    return [base_url + "&pageSize=" + str(page_size) + "&startPage=" + str(i) for i in range(page_amount)]

In [9]:
def create_search_urls(search_query, page_size, page_amount):
    """Create an url from the given search query

    Keyword arguments:
    search_query -- a list of given search words
    amount -- the amount of articles mathicng the searh qyery to be shown
    """
    url = create_base_url(search_query)
    url_pages = create_page_list(url, page_size, page_amount)
    return url_pages
    

In [10]:
def find_names_links(soup):
    """Create an article dictionary that contains article names as its keys and article 
    paths as its keys.

    Keyword arguments:
    soup -- a webpage parsed with Beautifulsoup
    """
    sage_url = "https://journals.sagepub.com"
    articles = []
    for d in soup.find_all("span", attrs = {"class": "art_title"}):
        name = d.text
        link = sage_url + d.a["href"]
        articles.append(Article(name, link))
    return articles

In [11]:
def get_web_page_content(url):
    print(url)
    session = requests.Session()
    # session.headers.update(headers)
    try:
        resp = session.get(url)
    except requests.exceptions.ConnectionError:
        r.status_code = "Connection refused"
    content = resp.content
    soup = BeautifulSoup(content)
    time.sleep(0.25)
    return soup

In [12]:
def get_relevant_articles(search_queries):
    threads = min(MAX_THREADS, len(search_queries))
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        page_contents = list(executor.map(get_web_page_content, search_queries))
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        articles = list(executor.map(find_names_links, page_contents))

    articles = [article for article_list in articles for article in article_list]
    return articles

In [13]:
def find_references(page_content):
    """Make a list of the references found from a webpage content

    Keyword arguments:
    content -- the webpage content parsed with BeautifulSoup
    """
    references = []
    for d in page_content.find_all("span", attrs = {"class": "NLM_article-title"}):
        references.append(d.text)
    return references

In [31]:
def get_relevant_references(articles):
    threads = min(MAX_THREADS, len(articles))
    links = [article.get_link() for article in articles]
    print(len(links))
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        page_contents = list(executor.map(get_web_page_content, links))
    #print(page_contents)
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        references = list(executor.map(find_references, page_contents))
    print(references)
    for i in range(len(articles)):
        articles[i].set_references(references[i])
    
    return articles

In [32]:
def main(search_query, pages, artcles_per_page):
    # PHASE1
    t0 = time.time()
    urls = create_search_urls(search_query, pages, artcles_per_page)
    articles = get_relevant_articles(urls)
    # contents = download_stories(story_urls)
    t1 = time.time()
    print(f"{t1-t0} seconds to get {pages * artcles_per_page} relevant articles.")
    # PHASE2
    t0 = time.time()
    articles = get_relevant_references(articles)
    t1 = time.time()
    print(f"{t1-t0} seconds to get {pages * artcles_per_page} relevant references.")
    return articles

In [33]:
articles = main(search_query, 3, 4)

https://journals.sagepub.com/action/doSearch?content=articlesChapters&countTerms=true&target=default&field1=AllField&text1=circular+economy&field2=AllField&text2=plastic+packaging&field3=AllField&text3=value+creation&pageSize=3&startPage=0
https://journals.sagepub.com/action/doSearch?content=articlesChapters&countTerms=true&target=default&field1=AllField&text1=circular+economy&field2=AllField&text2=plastic+packaging&field3=AllField&text3=value+creation&pageSize=3&startPage=1
https://journals.sagepub.com/action/doSearch?content=articlesChapters&countTerms=true&target=default&field1=AllField&text1=circular+economy&field2=AllField&text2=plastic+packaging&field3=AllField&text3=value+creation&pageSize=3&startPage=2
https://journals.sagepub.com/action/doSearch?content=articlesChapters&countTerms=true&target=default&field1=AllField&text1=circular+economy&field2=AllField&text2=plastic+packaging&field3=AllField&text3=value+creation&pageSize=3&startPage=3
1.8701679706573486 seconds to get 12 rel

In [34]:
articles[0].get_name(), articles[0].get_link(), articles[0].get_references()

('circular economy 3d printing and the biosphere rules',
 'https://journals.sagepub.com/doi/full/10.1177/0008125618759684',
 [])

In [None]:
folder = "data/search_results/"

In [None]:
file_name = "_".join(search_query) + ".json"; file_name

In [None]:
json.dump(articles, open(file_name, "w"))