In [None]:
import requests
import json
from bs4 import BeautifulSoup
from tqdm import tqdm

Improvement list:

1. How to make content fetching faster? - Investigate the sagepub page content and see if it is possible to fetch and search only a part of the whole content

In [8]:
# The words of the search; they are used both to build the search url
# and to name the json files the results are stored in.
search_query = [
    "technology",
    "for",
    "peacebuilding",
]

# Get Relevant Articles

In [9]:
def parse_url_content(url, timeout=30, parser="html.parser"):
    """Fetch a webpage and parse its content with BeautifulSoup

    Keyword arguments:
    url -- the webpage url
    timeout -- the number of seconds to wait for the server before giving up
    (default 30); requests waits indefinitely when no timeout is given
    parser -- the name of the parser BeautifulSoup should use
    (default "html.parser")

    Returns the parsed page as a BeautifulSoup object. The HTTP status code of
    the response is not checked, so an error page is parsed like any other.
    """
    response = requests.get(url, timeout=timeout)
    # Name the parser explicitly: when BeautifulSoup has to guess, it warns and
    # picks whichever parser happens to be installed, so the parse tree can
    # differ from one machine to another.
    soup = BeautifulSoup(response.content, parser)
    return soup

In [10]:
def create_sagepub_url_from_search_query(search_query, amount):
    """Create a sagepub search url from the given search query

    Keyword arguments:
    search_query -- a list of given search words
    amount -- the amount of articles matching the search query to be shown

    Returns the search url as a string. Every search word is percent-encoded,
    so characters such as "&", "#" or spaces inside a word cannot break the
    query string.
    """
    from urllib.parse import quote_plus

    website = "https://journals.sagepub.com/action/doSearch?filterOption=allJournal&AllField="
    # Words are joined with "+", which is how a space is written in a query string.
    encoded_query = "+".join(quote_plus(word) for word in search_query)
    return website + encoded_query + "&pageSize=" + str(amount)
    

In [11]:
def create_article_link_dict(soup):
    """Create an article dictionary that contains article names as its keys and article
    paths as its values.

    Keyword arguments:
    soup -- a webpage parsed with Beautifulsoup

    Returns a dictionary {article name: article path}. Title elements that do
    not contain a link are skipped. If two articles share the same name, the
    later one replaces the earlier one.
    """
    articles = {}
    for title_span in soup.find_all("span", attrs={"class": "art_title"}):
        anchor = title_span.a
        # A title without a link (or a link without href) gives us nothing to
        # follow later, so skip it instead of failing the whole search.
        if anchor is None:
            continue
        href = anchor.get("href")
        if not href:
            continue
        articles[title_span.text] = href
    return articles

In [13]:
def search_articles(search_query, amount = 200):
    """Search from sagepub website articles that match a given search query

    Keyword arguments:
    search_query -- a list of given search words
    amount -- the amount of matching articles to be requested (default 200)

    Returns a dictionary that has article names as its keys and article paths
    as its values.
    """
    # The url builder defined in this notebook is
    # create_sagepub_url_from_search_query; the name create_url is not defined
    # anywhere in the notebook and fails with a NameError on a fresh kernel.
    url = create_sagepub_url_from_search_query(search_query, amount)
    soup = parse_url_content(url)
    articles = create_article_link_dict(soup)
    return articles

In [14]:
# Run the search and report how many articles were found
articles = search_articles(search_query)
print(len(articles))

186


In [15]:
# Name of the json file the found articles are stored in
file_name = "{}_articles.json".format("_".join(search_query))
file_name

'technology_for_peacebuilding_articles.json'

In [16]:
# Write inside a context manager so the file is flushed and closed as soon as
# the dump is done, before any later cell reads it back.
with open(file_name, 'w') as articles_file:
    json.dump(articles, articles_file)


In [17]:
# Read inside a context manager so the file handle is closed after loading
with open(file_name) as articles_file:
    articles = json.load(articles_file)
print(len(articles))


186


# Get Relevant References

In [20]:
def create_article_url(path):
    """Build the full url of an article on the sagepub website

    Keyword arguments:
    path -- the article specific path that is appended to the sagepub url
    """
    base = "https://journals.sagepub.com"
    article_url = base + path
    return article_url

In [21]:
def get_article_reference_list(content):
    """Collect the reference titles found from a webpage content

    Keyword arguments:
    content -- the webpage content parsed with BeautifulSoup

    Returns a list with the text of every span element that has the class
    "NLM_article-title", in the order find_all returns them.
    """
    title_spans = content.find_all("span", attrs={"class": "NLM_article-title"})
    return [span.text for span in title_spans]

In [22]:
def create_reference_dict(articles):
    """Map every article name to the list of references mentioned in that article

    Keyword arguments:
    articles -- a dictionary of articles where the key is the article name and the item is
    the link to the article

    Every article page is fetched and parsed in turn, and a tqdm progress bar
    is shown while the articles are processed.
    """
    return {
        title: get_article_reference_list(
            parse_url_content(create_article_url(path))
        )
        for title, path in tqdm(articles.items())
    }

In [23]:
# Fetches and parses the page of every article in turn, so this cell is slow
references = create_reference_dict(articles)

100%|██████████| 186/186 [09:54<00:00,  3.57s/it]


In [25]:
# Name of the json file the collected references are stored in
file_name = "{}_references.json".format("_".join(search_query))
file_name

'technology_for_peacebuilding_references.json'

In [26]:
# Write inside a context manager so the file is flushed and closed as soon as
# the dump is done, before any later cell reads it back.
with open(file_name, 'w') as references_file:
    json.dump(references, references_file)

In [27]:
# Read inside a context manager so the file handle is closed after loading
with open(file_name) as references_file:
    references = json.load(references_file)
