In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

import logging
import time
import dateparser

In [2]:
# https://realpython.com/python-logging/
# Write DEBUG-and-above records to app.log, each prefixed with a timestamp and level.
# filemode='w' opens the file for writing, so an app.log left by a previous run is overwritten.
logging.basicConfig(level = logging.DEBUG, filename='app.log', filemode='w', format='%(asctime)s - %(levelname)s - %(message)s')

# Program Structure

* Scrape sites
    * Open page
    * Look for a way to build a list
    * Look for attributes within each list
    * Combine into a dictionary
    * Add dictionary to 'data' list
* Build CSV

Assume run every day from a server at 6AM - what implications does this have?
* It means this will be run automatically without someone always watching it
* No manual input
* Nobody is available to handle errors manually, so the program has to be robust
* Are there any implications on how recent CFPs should be?


Assumptions
* Journals won't have so many CFPs they spill over into more pages

In [3]:
class CFP:
    """Simple record holding the attributes of one call for papers (CFP)."""

    def __init__(self, journal, title, authors, due, link):
        # Keep every constructor argument, unchanged, as an instance attribute
        # of the same name.
        (self.journal,
         self.title,
         self.authors,
         self.due,
         self.link) = (journal, title, authors, due, link)

# Building Helpers

In [4]:
# This will be a list of dictionaries to build our dataframe.
# This is more efficient than continuously 'appending' to our dataframe
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.append.html
# Each get_cfp_* function returns a list of dicts, which is added here via data.extend(...).
data = []

# helper function to scrape a site with a url
# This is important because this script will be running automatically, so it needs to be robust
# Error catching from https://stackoverflow.com/questions/16511337/correct-way-to-try-except-using-python-requests-module
# Best practice for raising errors https://stackoverflow.com/questions/2052390/manually-raising-throwing-an-exception-in-python

def scraper(url, timeout=30, max_retries=10, retry_delay=60):
    """Download ``url`` and return the page parsed as a BeautifulSoup document.

    A timeout is treated as transient: the request is repeated up to
    ``max_retries`` more times, sleeping ``retry_delay`` seconds before each
    retry. Every other request failure, including HTTP error statuses reported
    by ``raise_for_status``, is logged and re-raised immediately.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            it a stalled connection would block forever and the retry logic
            below would never run.
        max_retries: Number of additional attempts after the first one times out.
        retry_delay: Seconds to sleep before each retry.

    Returns:
        A ``BeautifulSoup`` object built from the response body with
        ``html.parser``.

    Raises:
        requests.exceptions.Timeout: If every attempt timed out.
        requests.exceptions.RequestException: For any other request failure
            (connection error, too many redirects, HTTP error status, ...).
    """
    total_attempts = max(0, max_retries) + 1

    for attempt in range(1, total_attempts + 1):
        try:
            res = requests.get(url, timeout=timeout)
            # Check the status on every attempt, retries included.
            res.raise_for_status()
        except requests.exceptions.Timeout as e:
            logging.warning(e)
            if attempt == total_attempts:
                # Fail loudly instead of silently returning None to the caller.
                logging.error('Giving up on %s after %d timed out attempts', url, total_attempts)
                raise
            time.sleep(retry_delay)
        except requests.exceptions.RequestException as e:
            # HTTPError and TooManyRedirects are subclasses of RequestException,
            # so this single handler logs and re-raises all of them.
            logging.error(e)
            raise
        else:
            return BeautifulSoup(res.content, 'html.parser')

# Scraping
**TODO:**
* Turn things into modularised functions
* Add more testing to make sure sites have CFPs, etc.
* Add error checking (what if they don't have a due date, etc.)

## Information Systems Journal (ISJ)
* https://onlinelibrary.wiley.com/journal/13652575
* https://onlinelibrary.wiley.com/page/journal/13652575/homepage/special_issues.htm

### Notes
* Cloudflare blocks automated requests to this page, so use Blair's downloaded HTML snapshot instead

### Ingesting HTML from Moodle extract

In [5]:
def get_cfp_isj():
    """Collect the calls for papers of the Information Systems Journal (ISJ).

    The page is read from the local snapshot 'Information Systems Journal.html'
    (saved from
    https://onlinelibrary.wiley.com/page/journal/13652575/homepage/special_issues.htm)
    rather than fetched over the network.

    Returns:
        A list with one dict per CFP, using the keys 'Journal', 'URL', 'Title'
        and 'Due Date'. 'Due Date' is whatever ``dateparser.parse`` returns for
        the text of the row's second cell. The list is empty when the
        "Call for Papers" table cannot be found or has no rows.

    Raises:
        OSError: If the snapshot file cannot be opened.
    """
    l = []

    # Close the snapshot file as soon as it has been parsed.
    with open('Information Systems Journal.html', 'r') as html:
        soup = BeautifulSoup(html, 'html.parser')

    # Assume there can be 1 to arbitrary n rows in the table that holds our CFPs
    # Assume the table will always have 2 columns, one for the paper, one for the due date
    heading = soup.find('strong', string = 'Call for Papers')
    cfp_table = heading.find_next('table') if heading is not None else None
    cfp_rows = cfp_table.find_all('tr') if cfp_table is not None else []

    if not cfp_rows:
        logging.debug('ISJ returned no CFPs')
        return l

    for row in cfp_rows:
        cells = row.find_all('td')
        link = row.td.a if row.td is not None else None
        if link is None or len(cells) < 2:
            # Rows without a link or without a second cell (e.g. a header row)
            # do not describe a CFP; skip them instead of crashing the run.
            logging.debug('ISJ skipped a table row without a link and a due date cell')
            continue
        d = {}
        d['Journal'] = 'Information Systems Journal'
        d['URL'] = link.get('href')
        d['Title'] = row.td.text
        d['Due Date'] = dateparser.parse(cells[1].text)
        l.append(d)
    return l

In [6]:
# Add the ISJ records to the shared list. Re-running this cell appends them again.
data.extend(get_cfp_isj())

## Information Systems Research (ISR)
* https://pubsonline.informs.org/journal/isre
* https://pubsonline.informs.org/page/isre/calls-for-papers

### Notes
* There doesn't seem to be a convention for doing a "due date". The authors just do whatever they want

In [7]:
def get_cfp_isr():
    """Collect the calls for papers of Information Systems Research (ISR).

    Fetches the journal's calls-for-papers page, treats every h2 after the h1
    (except the last two) as one CFP, and follows each CFP's '/doi' link to
    look up the submission deadline on the detail page.

    Returns:
        A list with one dict per CFP, using the keys 'Journal', 'Title',
        'Authors', 'URL' and 'Due Date'. 'Authors', 'URL' and 'Due Date' are
        None when the corresponding element is not found on the page. The list
        is empty when no CFP heading is found.

    Raises:
        requests.exceptions.RequestException: Propagated from ``scraper`` when
            a page cannot be downloaded.
    """
    l = []
    base_url = 'https://pubsonline.informs.org'
    cfp_announcement_url = base_url +'/page/isre/calls-for-papers'

    soup = scraper(cfp_announcement_url)

    # Only h1 is "Calls for Papers"
    page_anchor = soup.find('h1')

    # Last 2 elements are unrelated - assume these elements are always going to be there
    cfp_list = page_anchor.find_all_next('h2')[:-2] if page_anchor is not None else []

    if not cfp_list:
        logging.debug('ISR returned no CFPs')
        return l

    # Due/Submission date seems to be unstructured (no specific way of wording it)
    # The assumption is the words 'Submission' AND 'Due' or 'Deadline' will be there. Otherwise N/A
    due_label = re.compile('(Submission.*(Due|Deadline))|((Due|Deadline).*Submission)')

    for cfp in cfp_list:
        d = {}
        d['Journal'] = 'Information Systems Research'
        d['Title'] = cfp.text.strip()

        # The editors are read from the second sibling after the
        # 'Special Issue Editors' heading, one editor per line.
        editors_heading = cfp.find_next('h4', string = 'Special Issue Editors')
        editors_block = editors_heading.next_sibling if editors_heading is not None else None
        editors_block = editors_block.next_sibling if editors_block is not None else None
        if editors_block is not None:
            # Assume researchers don't want verbosity: keep only the part of
            # each line before the first ' (' (drops the parenthesised suffix).
            d['Authors'] = '; '.join([researcher.split(' (')[0] for researcher in editors_block.text.split('\n')])
        else:
            d['Authors'] = None

        link = cfp.find_next('a', href = re.compile('^/doi'))
        if link is None:
            # Without a detail page there is nowhere to look up the deadline.
            d['URL'] = None
            d['Due Date'] = None
        else:
            d['URL'] = base_url + link['href']
            due_cell = scraper(d['URL']).find('td', string = due_label)
            due_value = due_cell.next_sibling if due_cell is not None else None
            # No matching label on the detail page -> unknown due date (N/A).
            d['Due Date'] = dateparser.parse(due_value.text) if due_value is not None else None
        l.append(d)
    return l

In [8]:
# Add the ISR records to the shared list. Re-running this cell appends them again.
data.extend(get_cfp_isr())

## Journal of the Association for Information Systems (JAIS)
* https://aisel.aisnet.org/jais/

In [9]:
def get_cfp_jais():
    """Collect the calls for papers listed on the JAIS home page.

    Every text node containing 'Special Issue Call for Papers' is located and
    the node immediately following it is used as the CFP link.

    Returns:
        A list with one dict per CFP, using the keys 'Journal', 'URL' and
        'Title'. The list is empty (and a debug message is logged) when no such
        text node is found.
    """
    page = scraper('https://aisel.aisnet.org/jais/')

    marker_pattern = re.compile('Special Issue Call for Papers')
    links = [marker.next_sibling for marker in page.find_all(string = marker_pattern)]

    if not links:
        logging.debug('JAIS returned no CFPs')
        return []

    return [
        {
            'Journal': 'Journal of the Association for Information Systems',
            'URL': link['href'],
            'Title': link.text.strip(),
        }
        for link in links
    ]

In [10]:
# Add the JAIS records to the shared list. Re-running this cell appends them again.
data.extend(get_cfp_jais())

## Journal of Information Technology (JIT)
* https://journals.sagepub.com/home/jina

In [11]:
def get_cfp_jit():
    """Collect the calls for papers of the Journal of Information Technology (JIT).

    Fetches the journal home page, finds the table following the
    'Call for Papers' h3 heading and treats every link in it as one CFP. The
    due date is taken from the text that directly follows the link.

    Returns:
        A list with one dict per CFP, using the keys 'URL', 'Journal', 'Title'
        and 'Due Date'. 'Due Date' is None when the text after the link does
        not match the expected 'First round submission deadline' wording. The
        list is empty (and a debug message is logged) when the heading, the
        table or the links are missing.

    Raises:
        requests.exceptions.RequestException: Propagated from ``scraper`` when
            the page cannot be downloaded.
    """
    l = []
    url = 'https://journals.sagepub.com/home/jina'
    soup = scraper(url)

    heading = soup.find('h3', string = 'Call for Papers')
    cfp_table = heading.find_next('table') if heading is not None else None
    cfp_list = cfp_table.find_all('a') if cfp_table is not None else []

    if not cfp_list:
        logging.debug('JIT returned no CFPs')
        return l

    # Group 1 is the date text; the final '.' of the pattern consumes the last
    # character of the sentence.
    deadline_pattern = re.compile('^ First round submission deadline: (.+).$')

    for cfp in cfp_list:
        d = {}
        d['URL'] = cfp.get('href')
        d['Journal'] = 'Journal of Information Technology'
        d['Title'] = cfp.text.strip(":“” ")
        # The sibling may be missing or may be another tag instead of text;
        # only try to match when it really is a string.
        trailing_text = cfp.next_sibling
        deadline = deadline_pattern.match(trailing_text) if isinstance(trailing_text, str) else None
        d['Due Date'] = dateparser.parse(deadline[1]) if deadline else None
        l.append(d)
    return l

In [12]:
# Add the JIT records to the shared list. Re-running this cell appends them again.
data.extend(get_cfp_jit())

## Journal of Strategic Information Systems (JSIS)
### Note: It's actually Journal of Management Information Systems (JMIS)
* https://jmis-web.org/issues

In [13]:
def get_cfp_jmis():
    """Collect the calls for papers announced on the JMIS issues page.

    Only 'alert-link' anchors that are directly preceded by the announcement
    text ' A new call for papers has been posted: ' are treated as CFPs.

    Returns:
        A list with one dict per CFP, using the keys 'Journal', 'URL' and
        'Title'. The list is empty when no such anchor exists.
    """
    page = scraper('https://jmis-web.org/issues')
    announcement_prefix = ' A new call for papers has been posted: '

    # First narrow the alert links down to CFP announcements...
    announcements = []
    for anchor in page.find_all('a', class_ = 'alert-link'):
        if anchor.previous_sibling == announcement_prefix:
            announcements.append(anchor)

    # ...then turn each announcement into a record.
    records = []
    for anchor in announcements:
        records.append({
            'Journal': 'Journal of Management Information Systems',
            'URL': 'https://jmis-web.org' + anchor['href'],
            'Title': anchor.text,
        })
    return records

In [14]:
# Add the JMIS records to the shared list. Re-running this cell appends them again.
data.extend(get_cfp_jmis())

## Management Information Systems Quarterly (MISQ)
* https://misq.org/

In [15]:
def get_cfp_misq():
    """Collect the calls for papers of MIS Quarterly (MISQ).

    Fetches the MISQ home page and treats every link whose text starts with
    'Call for Papers:' as one CFP. The due date is read from the second sibling
    after the link's parent element.

    Returns:
        A list with one dict per CFP, using the keys 'Journal', 'URL', 'Title'
        and 'Due Date'. 'Due Date' is None when the deadline sentence is missing
        or worded differently. The list is empty (and a debug message is
        logged) when the page has no such link.

    Raises:
        requests.exceptions.RequestException: Propagated from ``scraper`` when
            the page cannot be downloaded.
    """
    l = []

    url = 'https://misq.org'
    soup = scraper(url)

    cfp_list = soup.find_all('a', string = re.compile('^Call for Papers:'))

    # An empty result is a normal outcome, not an error: report it and stop.
    if not cfp_list:
        logging.debug('MISQ returned no CFPs')
        return l

    for cfp in cfp_list:
        d = {}
        d['Journal'] = 'Management Information Systems Quarterly'
        d['URL'] = url + cfp['href']

        # Accept any amount of whitespace after the prefix; fall back to the
        # whole link text if the prefix cannot be stripped.
        link_text = cfp.text.strip()
        title_match = re.match(r'^Call for Papers:\s*(.*)$', link_text)
        d['Title'] = title_match[1] if title_match else link_text

        deadline_node = cfp.parent.next_sibling if cfp.parent is not None else None
        deadline_node = deadline_node.next_sibling if deadline_node is not None else None
        deadline_match = None
        if deadline_node is not None:
            deadline_match = re.match('^The submission deadline for this special issue is (.*)$', deadline_node.text)
        d['Due Date'] = dateparser.parse(deadline_match[1]) if deadline_match else None
        l.append(d)
    return l

In [16]:
# Add the MISQ records to the shared list. Re-running this cell appends them again.
data.extend(get_cfp_misq())

# Building CSV

In [17]:
# Build the table in one go from the collected list of per-CFP dicts.
df = pd.DataFrame(data)

In [18]:
# https://stackoverflow.com/questions/36107094/pandas-apply-to-all-values-except-missing
# https://www.programiz.com/python-programming/datetime/strftime
# Render due dates as DD/MM/YYYY text; records without a parsed date become 'Not Known'.
# If no record carried a 'Due Date' key the column does not exist yet, so create it
# first instead of failing with KeyError.
if 'Due Date' not in df.columns:
    df['Due Date'] = None
df['Due Date'] = df['Due Date'].apply(lambda x: x.strftime('%d/%m/%Y') if pd.notnull(x) else 'Not Known')

In [19]:
# Map the internal record keys to the column headings used in the exported CSV.
rename_dict = {'Journal': 'Journal Name',
              'URL': 'Link to CFP details page',
              'Title': 'CFP title',
              'Authors': 'CFP authors',
              'Due Date': 'Due date',}
df = df.rename(columns = rename_dict)

In [20]:
# Columns to write to the CSV, in left-to-right order (uses the renamed headings).
output_order = ['Journal Name', 'CFP title', 'CFP authors', 'Due date', 'Link to CFP details page']

In [21]:
# Select the columns with reindex rather than to_csv(columns=...): a column that no
# record supplied in this run (e.g. 'CFP authors') is then written as N/A instead
# of raising KeyError and aborting the export.
df.reindex(columns = output_order).to_csv('out.csv', index = False, na_rep = 'N/A')