In [1]:
from bs4 import BeautifulSoup
import requests
import httpx
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
# Site root; hrefs scraped from the pages are appended to this prefix.
url = "https://www.aup-online.com"
# Browse page (page=previous-issues) from which the volume list is scraped.
base_url = 'https://www.aup-online.com/content/journals/00259454/browse?page=previous-issues'

In [3]:
def get_list_of_volumes(base_url):
    """Scrape the browse page and list the volumes linked from it.

    Parameters
    ----------
    base_url : str
        Address of the page that lists the volumes.

    Returns
    -------
    list of tuple(str, str)
        One ``(label, volume_url)`` pair per ``<li class="volume-item">``
        element that contains a link. ``label`` is the link text with
        whitespace stripped and carriage returns removed; ``volume_url`` is
        the module-level ``url`` prefix followed by the link's href, cut off
        before ``&showDates``.
    """
    r = httpx.get(base_url)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Items are located by class alone rather than through a deep,
    # layout-dependent CSS path, which would raise IndexError as soon as the
    # page structure changes.
    results = []
    for item in soup.find_all('li', class_='volume-item'):
        # Only anchors that actually carry an href can be turned into a URL.
        a_tag = item.find('a', href=True)
        if a_tag is None:
            continue
        raw_text = a_tag.get_text(strip=True).replace('\r', '')
        # Keep only the part of the href that precedes "&showDates".
        href = url + a_tag['href'].split('&showDates')[0]
        results.append((raw_text, href))
    return results

In [4]:
def get_list_of_issues(volume_items):
    """Fetch every volume page and list the issues found on it.

    Parameters
    ----------
    volume_items : iterable of (str, str)
        ``(volume, volume_url)`` pairs, e.g. the output of
        ``get_list_of_volumes``.

    Returns
    -------
    list of tuple
        ``(volume, volume_url, issue_url, issue_name, issue_mnth)`` for each
        ``<li>`` that holds a link, an ``issuenumber`` span and an
        ``issueyear`` span. ``issue_url`` is the module-level ``url`` prefix
        plus the link's href.
    """
    results = []
    for volume, volume_url in volume_items:
        r = httpx.get(volume_url)
        soup = BeautifulSoup(r.content, 'html.parser')

        for issue in soup.find_all('li'):
            link = issue.find('a', href=True)
            number_tag = issue.find('span', {'class': 'issuenumber'})
            year_tag = issue.find('span', {'class': 'issueyear'})
            # Every <li> of the page is visited, so entries that are not
            # issue rows (missing link or spans) are skipped instead of
            # crashing on a None lookup.
            if link is None or number_tag is None or year_tag is None:
                continue

            issue_url = url + link['href']
            issue_name = number_tag.get_text()
            # The month is the second newline-separated piece of the span text.
            issue_mnth = year_tag.get_text(strip=True).split('\n')[1]
            results.append((volume, volume_url, issue_url, issue_name, issue_mnth))

    return results

In [5]:
def extract_article_info(article):
    """Pull the metadata of one article out of its listing element.

    Parameters
    ----------
    article : element exposing ``find`` / ``find_all``
        The parsed HTML element of a single article entry.

    Returns
    -------
    tuple
        ``(title, eng_title, doi_url, authors, section, abstract)``.
        Every field except ``section`` is ``None`` when the corresponding
        markup is absent. ``section`` is ``'boekbespreking'`` when the title
        contains ``'Boekbespreking'`` and ``'algemeen'`` otherwise.
    """
    # DOI link: an anchor with an href whose text mentions doi.org.
    doi_tag = article.find('a', href=True, string=lambda s: 'doi.org' in s if s else False)
    doi_url = doi_tag['href'] if doi_tag is not None else None

    # Title, and the section derived from it.
    title = None
    section = 'algemeen'
    title_tag = article.find('h3')
    if title_tag is not None:
        title = title_tag.get_text(strip=True)
        if 'Boekbespreking' in title:
            section = 'boekbespreking'

    # Description block: the last paragraph is the abstract, any paragraphs
    # before it are joined into the English title.
    eng_title = None
    abstract = None
    info_tag = article.find('div', {'class': 'js-desc'})
    info = info_tag.find_all('p') if info_tag is not None else []
    # A description block without any <p> yields no abstract (indexing the
    # empty list would raise IndexError).
    if info:
        abstract = info[-1].get_text().strip().replace(' .', '.')
        if len(info) > 1:
            title_parts = [i.get_text(strip=True).strip().replace(' .', '') for i in info[:-1]]
            eng_title = ' -- '.join(title_parts) + '.'

    # Authors: comma-separated text of all author links.
    authors = None
    authors_tag = article.find_all('a', {'class': 'nonDisambigAuthorLink'})
    if authors_tag:
        authors = ', '.join(author.get_text().strip() for author in authors_tag)

    return (title, eng_title, doi_url, authors, section, abstract)

In [6]:
def get_article_data(issue_items):
    """Fetch every issue page and build one table row per listed article.

    Parameters
    ----------
    issue_items : iterable of sequences
        Entries indexed as ``[0]`` volume, ``[2]`` issue URL, ``[3]`` issue
        name and ``[4]`` month, e.g. the output of ``get_list_of_issues``.

    Returns
    -------
    pandas.DataFrame
        Columns ``volume, issue, month, title, eng_title, doi_url, authors,
        section, abstract``. The columns are present even when no article
        was found, so column lookups on the result keep working.
    """
    import warnings  # local import keeps this cell self-contained

    columns = ['volume', 'issue', 'month', 'title', 'eng_title',
               'doi_url', 'authors', 'section', 'abstract']
    # Loop-invariant: container that holds the article entries of an issue page.
    selector = "#main-content-container > div.issuecontents.row > div.col-xs-12.col-sm-9.issue-listing > div > div.panel-body > div.publistwrapper.contain > div > div"

    results = []
    for issue in issue_items:
        volume, issue_url, issue_name, issue_mnth = issue[0], issue[2], issue[3], issue[4]
        r = httpx.get(issue_url)
        soup = BeautifulSoup(r.content, 'html.parser')

        listing = soup.select_one(selector)
        if listing is None:
            # One page without the expected container should not abort the
            # whole crawl; report it and move on.
            warnings.warn(f"No article listing found at {issue_url}; skipping this issue.")
            continue

        article_items = listing.find_all('ul', {'class': 'list-unstyled', 'role': 'listitem'})
        for article in article_items:
            title, eng_title, doi_url, authors, section, abstract = extract_article_info(article)
            results.append({
                'volume': volume,
                'issue': issue_name,
                'month': issue_mnth,
                'title': title,
                'eng_title': eng_title,
                'doi_url': doi_url,
                'authors': authors,
                'section': section,
                'abstract': abstract,
            })

    return pd.DataFrame(results, columns=columns)



In [21]:
def add_pdf_url(df):
    """Add a ``pdf_url`` column by following each article's DOI link.

    For every row the page behind ``doi_url`` is downloaded and searched for
    the PDF download form; the form's ``action`` is joined onto the
    module-level ``url`` prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id`` and ``doi_url``.

    Returns
    -------
    pandas.DataFrame
        ``df`` merged with the collected links on ``id``. ``pdf_url`` is
        missing for rows without a DOI link or without a download form. An
        existing ``pdf_url`` column is replaced, so calling the function on
        its own output does not change the join key.
    """
    results = []
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        doi_url = row['doi_url']
        pdf_url = None

        # Rows without a DOI link (None/NaN) have nothing to request.
        if isinstance(doi_url, str) and doi_url:
            # httpx does not load full website
            r = requests.get(doi_url)
            soup = BeautifulSoup(r.content, 'html.parser')
            form = soup.find('form', {'class': 'ft-download-content__form ft-download-content__form--pdf js-ft-download-form'})

            action_url = form.get('action') if form else None
            if action_url:
                pdf_url = requests.compat.urljoin(url, action_url)

        results.append({'id': row['id'], 'pdf_url': pdf_url})

    # Explicit columns keep the merge valid for an empty input; the explicit
    # key stops pandas from also joining on a pre-existing ``pdf_url`` column.
    pdf_urls = pd.DataFrame(results, columns=['id', 'pdf_url'])
    base = df.drop(columns='pdf_url', errors='ignore')
    return base.merge(pdf_urls, how='inner', on='id')

In [22]:
# Run the scraping pipeline: volumes -> issues -> one table row per article.
# Each step performs HTTP requests, so this cell needs network access.
volume_items = get_list_of_volumes(base_url)
issue_items = get_list_of_issues(volume_items)
df = get_article_data(issue_items)

In [23]:
# Synthetic identifier: 'article-' followed by 90000 plus the row's index label.
df['id'] = 'article-' + (df.index.to_series() + 90000).astype(str)
# Constant marker written to every row.
df['database'] = 3

In [25]:
# Look up the PDF link of every article (one HTTP request per row that has a DOI link).
df = add_pdf_url(df)

  0%|          | 0/472 [00:00<?, ?it/s]

In [26]:
# Rebuild ``issue`` from the volume label and the scraped issue name,
# e.g. volume "Volume 99 (2024)" -> issue "Vol 99 No 3 (2024)".
# NOTE(review): only the LAST character of the scraped issue name is used, so
# a multi-digit issue number would be truncated - confirm this cannot occur.
# NOTE(review): the cell overwrites the column it reads; running it a second
# time takes the last character of the already rewritten label.
df['issue'] = (
    df['volume']
    .str.replace('ume', '')                  # "Volume ..." -> "Vol ..."
    .str.split(' ', expand=True)             # one column per space-separated token
    .dropna(axis=1)                          # discard token columns containing missing values
    .set_axis(['a', 'b', 'd'], axis=1)       # requires exactly three remaining columns
    .assign(c='No ' + df['issue'].str[-1])   # "No <last character of issue name>"
    .sort_index(axis=1)                      # a, b, c, d: puts "No ..." before the last token
    .agg(' '.join, axis=1)
)


In [27]:
# Persist the scraped table; the row index is not written to the file.
df.to_csv('scraped_data3.csv', index=False)

In [28]:
# Show the final table as the cell output.
df

Unnamed: 0,volume,issue,month,title,eng_title,doi_url,authors,section,abstract,id,database,pdf_url
0,Volume 99 (2024),Vol 99 No 3 (2024),Aug,"Studentenprotesten, polarisatie en de rol van ...",,https://doi.org/10.5117/MEM2024.3.001.KEES,Laura Keesman,algemeen,,article-90000,3,https://www.aup-online.com/deliver/fulltext/00...
1,Volume 99 (2024),Vol 99 No 3 (2024),Aug,Intergenerationele overdracht in adoptiegezinnen,Intergenerational transmission in adoptive fam...,https://doi.org/10.5117/MEM2024.3.002.LAKE,"Lixin Lakeman, Ruben van Gaalen",algemeen,Intercountry adoptees provide a unique opportu...,article-90001,3,
2,Volume 99 (2024),Vol 99 No 3 (2024),Aug,Trends in vormen van maatschappelijke betrokke...,,https://doi.org/10.5117/MEM2024.3.003.MEIJ,"Maikel Meijeren, Marcel Lubbers, Peer Scheepers",algemeen,,article-90002,3,
3,Volume 99 (2024),Vol 99 No 3 (2024),Aug,Herhaald slachtofferschap inschatten met behul...,,https://doi.org/10.5117/MEM2024.3.004.GEUR,Roos Geurts,algemeen,,article-90003,3,
4,Volume 99 (2024),Vol 99 No 3 (2024),Aug,Europese populistische partijen: Wat beïnvloed...,,https://doi.org/10.5117/MEM2024.3.005.MARO,Francesco Marolla,algemeen,,article-90004,3,
...,...,...,...,...,...,...,...,...,...,...,...,...
467,Volume 84 (2009),Vol 84 No 1 (2009),Mar,Xenofobie onder jongeren: de invloed van inter...,Xenophobia among youngsters: the influence of ...,https://doi.org/10.5117/MEM2009.1.BEKH,"Hidde Bekhuis, Stijn Ruiter, Marcel Coenders",algemeen,This study examines xenophobic attitudes of hi...,article-90467,3,https://www.aup-online.com/deliver/fulltext/18...
468,Volume 84 (2009),Vol 84 No 1 (2009),Mar,Reactie - De ‘O’ is voor … Een reactie op Kooi...,,https://doi.org/10.5117/MEM2009.1.DERS,Frans van der Slik,algemeen,,article-90468,3,https://www.aup-online.com/deliver/fulltext/18...
469,Volume 84 (2009),Vol 84 No 1 (2009),Mar,Reactie - Een Babylonische spraakverwarring?,,https://doi.org/10.5117/MEM2009.1.KOOI,"Jan-Paul Kooistra, Wout Ultee, Ben Pelzer",algemeen,,article-90469,3,https://www.aup-online.com/deliver/fulltext/18...
470,Volume 84 (2009),Vol 84 No 1 (2009),Mar,Boekbespreking - Jaarrapport Integratie 2008. ...,,https://doi.org/10.5117/MEM2009.1.SCHI,H. Schijf,boekbespreking,,article-90470,3,https://www.aup-online.com/deliver/fulltext/18...
