## Purpose.

Here we collect abstracts and titles from [*International Journal of Business Strategy*](https://ijbs-journal.org/IJBS-JOURNAL/Default.aspx).  This information is to be used as part of an attempt to apply text classification to charting the progression of business strategy.

## Approach.

Titles and abstracts are available without paywall login. But we have to do this in ***four stages***.  

⓵ **Get the URL for each volume**. We start with the journal's home page.  Our essential information is embedded in a frame, depicted below, on the right-hand side of the page. 

⓶ **Get a list of issue URLs**.  Each volume page contains thumbnail images of individual issues. These include URLs to the individual issues. 

⓷ **Collect lists of titles**.  Follow each issue's URL to its issue table of contents. The tables of contents contain titles, as well as URLs to pages for individual articles.

⓸ **Collect abstracts**.  Abstracts are accessible from individual articles. We have to get the abstracts from these individual-article pages.






In [1]:
# Import libraries
import requests as req
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import itertools as it
import json
import io
from copy import deepcopy
import datetime

In [2]:
# We use lots of list comprehensions, which drive requests.get operations.
# We need to "throttle" these, so as to avoid the appearance of a DDoS
# attack.  We accomplish this with the sleep_get function below.
def sleep_get(url, headers, timeout = 30):
    """Pause for a random interval, then issue an HTTP GET request.

    The pause is drawn uniformly from the interval [0.5, 2.3) seconds, so
    that successive requests are spaced out irregularly.

    Parameters
    ----------
    url : str
        Address to request.
    headers : dict
        HTTP headers sent with the request.
    timeout : float or tuple, optional
        Forwarded to ``req.get``.  Without a timeout a stalled connection
        would block the whole harvest indefinitely.  Defaults to 30 seconds.

    Returns
    -------
    The response object returned by ``req.get``.

    Raises
    ------
    Whatever ``req.get`` raises, including its timeout exception when the
    server does not answer within ``timeout``.
    """
    time.sleep(np.random.uniform(low = 0.5,
                                 high = 2.3))
    return req.get(url,
                   headers = headers,
                   timeout = timeout)
#
# Partition a list into a specified number of bins.  Our inputs
# are:
# ⧐ parted_list is the list to be partitioned;
# ⧐ partition_counts specifies the number of bins into which
#   parted_list is divided.
# We produce an enumerated dictionary of the list partitions.
def partition_list(parted_list, partition_counts):
    """Sort a list and split it into a fixed number of consecutive bins.

    Every bin holds ``ceil(len(parted_list) / partition_counts)`` items,
    except that the trailing bins receive whatever is left over, which may
    be fewer items or none at all.

    Parameters
    ----------
    parted_list : sequence
        Items to partition.  They are sorted with ``np.sort`` first.
    partition_counts : int
        Number of bins to produce; must be at least 1.

    Returns
    -------
    dict
        Maps each bin index ``0 .. partition_counts - 1`` to the list of
        items in that bin.

    Raises
    ------
    ValueError
        If ``partition_counts`` is smaller than 1.
    """
    if partition_counts < 1:
        raise ValueError('partition_counts must be a positive integer')
    sorted_items = np.sort(np.array(parted_list))
    bin_len = int(np.ceil(len(sorted_items) / partition_counts))
    # Slices are clipped at the end of the array, so bins that would start
    # or end beyond the data come back short or empty instead of raising
    # an IndexError.
    return {part: list(sorted_items[part * bin_len:(part + 1) * bin_len])
            for part in range(partition_counts)}





In [5]:
# Present a desktop-browser User-Agent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}

# Download the journal's landing page and show its parsed markup as the
# cell's output.
BeautifulSoup(
    sleep_get('https://ijbs-journal.org/IJBS-JOURNAL/', headers=headers).content,
    'lxml',
)

<!DOCTYPE html>
<html lang="en">
<head>
<style type="text/css">

								
	.banner-logo         {																			position:relative; TOP:20px; 
								LEFT:84px; 
																						WIDTH:44px; HEIGHT:44px; z-index: 3;														     			}
																						
	.banner-before-Tit   {																			-webkit-column-width: 	45px; 	-moz-column-width: 45px; column-width: 	45px;														     	}

								

	.top-banner-Title    {					color: #000; 
								font-size: 30pt; 
								width: 800px; 														}

								

	.bot-banner-ISSNs    {					color: #000; 
								font-size: 12pt;																}

	.top-banner-gradient {					background: linear-gradient(180deg, #669934, #9CCF60); 								height: 63px;	}
	.bot-banner-gradient {					background: linear-gradient(180deg, #9CCF60, #669934); 								height: 63px;	}				</style>
<title>IJBS-JOURNAL</title>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="css/bootstrap.min

In [None]:
# Example of a per-issue abstracts PDF on the IJBS site, kept for reference only
# (as a bare line it was a syntax error that prevented this cell from running):
#   https://ijbs-journal.org/IJBS-JOURNAL/Documents/Abstracts/IJBS-12-4_Abstracts.pdf
# NOTE(review): the request below targets emerald.com (ISSN 0275-6668) rather than
# the ijbs-journal.org site described in the introduction — confirm which source
# is intended.
#
# Site root; the relative hrefs harvested from the pages are appended to it.
jbs_url = 'https://www.emerald.com'
headers = {
    'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }
# Fetch the publication's root page and parse it for the issue links.
jbs_html = sleep_get(jbs_url + '/insight/publication/issn/0275-6668',
                     headers = headers)
jbs_soup = BeautifulSoup(jbs_html.content, 'lxml')



In [None]:
# 🅐 Build the master dictionary of issues from the root page, which links to
#    every issue.  Each key is the link's title with periods removed and
#    lower-cased; each value starts as a one-entry dictionary holding the
#    link's href under 'ToC_href'.
jbs_title_abstr = dict(
    (
        issue_link.attrs.get('title').replace('.', '').lower(),
        {'ToC_href': issue_link.attrs.get('href')},
    )
    for issue_link in jbs_soup.find_all('a', {'class': 'intent_tocIssueLink'})
)



In [None]:


# 🅑 Now get the article titles and the URLs to their abstract pages.
#    ⓵ First define this via a function that parses the issue ToC and returns an
#       enumerated dictionary.
def get_issue_ToC(root_url, issue):
    """Return the articles listed in one issue's table of contents.

    Parameters
    ----------
    root_url : str
        Site root that is prepended to the issue's relative 'ToC_href'.
    issue : hashable
        Key of the issue in the module-level ``jbs_title_abstr`` dictionary.

    Returns
    -------
    dict
        Enumerated dictionary ``{0: {...}, 1: {...}, ...}`` whose values hold
        the lower-cased article title under 'art_title' and the href of the
        title's link under 'art_href'.

    Raises
    ------
    AttributeError
        If ``issue`` is not a key of ``jbs_title_abstr``, if the page has no
        'publicationTabContent' container, or if a heading contains no link.
    """
    # Use the caller-supplied root rather than a module-level global, so the
    # `root_url` argument actually takes effect.
    toc_url = root_url + jbs_title_abstr.get(issue).get('ToC_href')
    toc_soup = BeautifulSoup(sleep_get(toc_url,
                                       headers = headers).content,
                             'lxml')
    toc_panel = toc_soup.find('div', {'id' : 'publicationTabContent'})
    articles = []
    for entry in toc_panel.find_all('div', {'class' : 'col'}):
        # Look the heading up once; columns without an <h4> are not articles.
        heading = entry.find('h4')
        if heading is None:
            continue
        articles.append({'art_title' : heading.text.lower(),
                         'art_href' : heading.find('a').attrs.get('href')})
    return dict(enumerate(articles))
#
#    ⓶ Next, cycle through the issues.  Each issue's table of contents is
#       stored under 'issue_toc' in its entry of the master dictionary.
#       Issues whose ToC cannot be harvested are recorded in `issues_missed`
#       so they can be retried; the others go to `issues_done`.
issues_done = list()
issues_missed = list()
for issue in jbs_title_abstr:
    try:
        jbs_title_abstr.get(issue).update({'issue_toc' : get_issue_ToC(jbs_url, issue)})
    except Exception as err:
        # Catch Exception rather than everything, so that a keyboard/kernel
        # interrupt still stops the loop, and report why the issue failed.
        print('{} failure at {} ({!r})'.format(issue,
                                               datetime.datetime.now(datetime.timezone.utc).strftime('%y-%m-%d, %H%M%SZ'),
                                               err))
        issues_missed.append(issue)
    else:
        print('{}, {} articles, success at {}'.format(issue,
                                                      str(len(jbs_title_abstr.get(issue).get('issue_toc'))),
                                                      datetime.datetime.now(datetime.timezone.utc).strftime('%y-%m-%d, %H%M%SZ')))
        issues_done.append(issue)

print('done')

# Save what has been harvested so far.
with io.open('./data/jbs_title_abstr.json', 'w', encoding = 'utf-8') as f:
    json.dump(jbs_title_abstr,
              f,
              ensure_ascii = False,
              indent = 4)

In [None]:
# Retry the issues whose table of contents could not be harvested earlier.
# We iterate over a snapshot of `issues_missed`: appending to the list while
# looping over it would never terminate for an issue that keeps failing.
still_missed = list()
for issue in list(issues_missed):
    try:
        jbs_title_abstr.get(issue).update({'issue_toc' : get_issue_ToC(jbs_url, issue)})
    except Exception as err:
        print('{} failure at {} ({!r})'.format(issue,
                                               datetime.datetime.now(datetime.timezone.utc).strftime('%y-%m-%d, %H%M%SZ'),
                                               err))
        still_missed.append(issue)
    else:
        print('{}, {} articles, success at {}'.format(issue,
                                                      str(len(jbs_title_abstr.get(issue).get('issue_toc'))),
                                                      datetime.datetime.now(datetime.timezone.utc).strftime('%y-%m-%d, %H%M%SZ')))
        issues_done.append(issue)
#
# Keep only the issues that failed again, so that this cell can be re-run
# until the list is empty.
issues_missed[:] = still_missed

print('done')

In [None]:
# Spot check: draw one issue at random, then one article from that issue's
# table of contents, and print what has been collected for it.
issue = np.random.choice(a=list(jbs_title_abstr), size=1).item(0)
sampled_toc = jbs_title_abstr.get(issue).get('issue_toc')
article = np.random.choice(a=list(sampled_toc.keys()), size=1).item(0)
sampled_entry = sampled_toc.get(article)
title = sampled_entry.get('art_title')
url = jbs_url + sampled_entry.get('art_href')
print(f'Issue : {issue}\nArticle : {article}\nTitle : {title}\nURL : {url}')


In [None]:
# 🅒 Harvest the abstracts themselves.  Every article has a page of its own.
#    ⓵ The function below takes the URL of an article page and extracts the
#       abstract and the publication date from it.
def get_article_abstract(article_url):
    """Fetch an article page and return its abstract and publication date.

    Parameters
    ----------
    article_url : str
        Full URL of the article page.

    Returns
    -------
    dict
        'abstract' holds the lower-cased text of the first paragraph of each
        abstract section, joined with single spaces; 'pub_date' holds the
        second field of the publication-date text when split on ': '.
    """
    # Download and parse the article page.
    page = BeautifulSoup(sleep_get(article_url, headers = headers).content, 'lxml')
    #
    # Locate the publication-date element and split its text on ': '.
    date_column = page.find('div', {'class' : 'col-12 col-md-6'})
    date_span = date_column.find('span', {'class' : 'intent_journal_publication_date'})
    date_fields = date_span.text.split(': ')
    #
    # An abstract may be spread over several sections; gather one paragraph
    # from each of them.
    paragraphs = []
    for section in page.find_all('section', {'class' : 'intent_sub_content Abstract__block__text'}):
        paragraphs.append(section.find('p').text.lower())
    #
    # Hand back the joined abstract together with the publication date.
    return {'abstract' : ' '.join(paragraphs),
            'pub_date' : date_fields[1]}
#
#   ⓶ Next, cycle through issues and articles.  We harvest the pub dates and abstracts,
#      adding them to each article's dictionary value.




In [None]:
def _utc_stamp():
    """Return the current UTC time formatted with '%y-%m-%d, %H%M%SZ'."""
    return datetime.datetime.now(datetime.timezone.utc).strftime('%y-%m-%d, %H%M%SZ')


def _save_progress():
    """Write the master dictionary to ./data/jbs_title_abstr.json."""
    with io.open('./data/jbs_title_abstr.json', 'w', encoding = 'utf-8') as f:
        json.dump(jbs_title_abstr,
                  f,
                  ensure_ascii = False,
                  indent = 4)


article_count = 0

for issue in issues_done:
    article_complete = list()
    article_missed = list()
    for article in jbs_title_abstr.get(issue).get('issue_toc').keys():
        # ⓐ Get the attributes we have thus far for the specific article.  This is
        #    a dictionary item in our master dictionary, jbs_title_abstr.  It is
        #    looked up before the `try`, so the failure branch always refers to
        #    the current article.
        article_value = jbs_title_abstr.get(issue).get('issue_toc').get(article)
        #
        # ⓑ Get the abstract and the publication date, using the function
        #    `get_article_abstract`.  This takes the full article URL, built from
        #    the previously-obtained URL fragment, and returns a dictionary object.
        #    Only Exception is caught, so a keyboard/kernel interrupt still stops
        #    the harvest.
        try:
            abstr_pubdate = get_article_abstract(jbs_url + article_value.get('art_href'))
        except Exception as err:
            article_missed.append(article_value.get('art_title'))
            print('Missed article {!r}: {!r}'.format(article_value.get('art_title'), err))
            continue
        #
        # ⓒ Add the abstract and publication date to the article's dictionary value.
        #   We use the dictionary.update method, which is an in-place operation.
        article_value.update(abstr_pubdate)
        #
        # ⓓ Add the article title to the `article_complete` list, for
        #    progress/completion-accounting purposes.
        article_complete.append(article_value.get('art_title'))
        #
        # ⓔ Increment the article counter.
        article_count += 1
        #
        # ⓕ Save the dictionary as a json file each tenth article.  A failed
        #    checkpoint is reported but does not stop the harvest, and it no
        #    longer lists the article as missed.
        if article_count % 10 == 0:
            try:
                _save_progress()
            except OSError as err:
                print(f'Could not write checkpoint at article {article_count}: {err!r}')
            else:
                time_now = _utc_stamp()
                print(f'Writing {article_count}th article to json at time {time_now}.')
    print('{}, {} articles, success at {}'.format(issue,
                                                  str(len(article_complete)),
                                                  _utc_stamp()))

    #
    # ⓖ Add the article_complete, article_missed lists to our issue-dictionary object.
    #    Again, use the in-place operation dictionary.update.
    jbs_title_abstr.get(issue).update({'article_complete' : article_complete,
                                       'article_missed' : article_missed})
    #
# Final save once every issue has been processed.
_save_progress()
time_now = _utc_stamp()
print(f'Writing {article_count}th article to json at time {time_now}.')
#
print('Done at {}'.format(_utc_stamp()))

In [None]:
# Display the number of articles whose abstract and publication date were
# successfully added to the master dictionary.
article_count