In [22]:
from bs4 import BeautifulSoup
import requests
import httpx
import pandas as pd
import numpy as np

In [2]:
class trialContextManager:
    """Context manager that suppresses every exception raised inside its body.

    Intended for "try it, ignore failures" snippets: ``with trial: ...``.
    Because ``__exit__`` always returns a truthy value, *any* exception
    (including ``KeyboardInterrupt``) raised in the ``with`` block is swallowed
    without a trace.
    """

    def __enter__(self):
        # Nothing to acquire; ``with trial as x`` binds ``x`` to None.
        return None

    def __exit__(self, *args):
        # A truthy return value tells Python to discard the active exception.
        return True


# Shared instance so call sites can simply write ``with trial:``.
trial = trialContextManager()

In [23]:
# Back-issues listing page; passed to get_article_urls() to collect the article links.
base_url = 'https://www.aup.nl/en/journal/mens-en-maatschappij/back-issues'
# Prefix that urls_to_df() prepends to each link's path to build the `pdf_url` column.
url = 'https://journal-archive.aup.nl'

In [24]:
def get_article_urls(base_url, timeout=30):
    """Fetch a listing page and return its links to journal downloads.

    Parameters
    ----------
    base_url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list
        The ``<a>`` elements (BeautifulSoup tags, not plain strings) whose
        ``href`` contains ``'journal-downloads'``, in document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    r = requests.get(base_url, timeout=timeout)
    # Fail loudly on an error page instead of silently scraping zero links.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'lxml')
    # href=True skips anchors without an href attribute; for those,
    # .get('href') is None and the substring test would raise TypeError.
    anchors = soup.find_all('a', href=True)
    return [a for a in anchors if 'journal-downloads' in a['href']]

In [27]:
def urls_to_df(article_urls, archive_url=None):
    """Turn scraped download links into a DataFrame of PDF URL, issue and title.

    Parameters
    ----------
    article_urls : iterable
        Link objects exposing ``.get('href')`` and ``.text`` (e.g. the tags
        returned by ``get_article_urls``). Each link text must look like
        ``"<issue> - <title>"``.
    archive_url : str, optional
        Prefix prepended to every link path. When omitted, the notebook-level
        ``url`` constant is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``pdf_url``, ``issue``, ``title``; one row per link. An empty
        input yields an empty frame that still has these three columns.

    Raises
    ------
    ValueError
        If a link text contains no ``' - '`` separator.
    """
    # Resolve the prefix lazily so the global is only needed when not overridden.
    prefix = url if archive_url is None else archive_url

    records = []
    for a in article_urls:
        href = prefix + a.get('href').replace('/journal-downloads', '')
        # Split on the first separator only: titles may themselves contain ' - '.
        issue, title = a.text.split(' - ', 1)
        records.append([href, issue.replace('no', 'No'), title])

    # Naming the columns in the constructor also works when `records` is empty,
    # where renaming three axes on a zero-column frame would raise.
    return pd.DataFrame(records, columns=['pdf_url', 'issue', 'title'])

In [28]:
def add_section(df):
    """Return a copy of ``df`` with a ``section`` column derived from the titles.

    Classification, later rules overriding earlier ones:

    1. every row starts as ``'artikel'``;
    2. titles containing ``'Boekbespreking'`` become ``'boekbespreking'``;
    3. rows become ``'algemeen'`` when they are the first row of their
       ``issue`` group, or the title contains ``'Rectificatie'``,
       ``'In memoriam'`` or ``'Ontvangen publicaties'``, or the title starts
       with ``'Van Doorns Indische lessen'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``issue`` and ``title`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the extra ``section`` column.
    """
    titles = df['title']

    # na=False treats a missing title as "no match" so the masks stay boolean.
    is_review = titles.str.contains('Boekbespreking', na=False)

    # The first entry listed under each issue is always filed as general matter.
    is_general = df.groupby('issue').cumcount() == 0
    for keyword in ('Rectificatie', 'In memoriam', 'Ontvangen publicaties'):
        is_general = is_general | titles.str.contains(keyword, na=False)
    # str.match anchors at the start of the title, unlike str.contains.
    is_general = is_general | titles.str.match('Van Doorns Indische lessen', na=False)

    section = pd.Series('artikel', index=df.index)
    # Order matters: 'algemeen' is applied last and therefore wins over a review.
    section = section.mask(is_review, 'boekbespreking').mask(is_general, 'algemeen')

    # assign() builds a new frame, so no helper columns leak into the caller's df.
    return df.assign(section=section)

In [29]:
def add_id_and_year(df, volume_year_offset=1925):
    """Return a copy of ``df`` with ``id`` and ``year`` added and ``issue`` suffixed.

    * ``id`` is ``'article-'`` + every digit found in ``issue`` concatenated
      + the 1-based, zero-padded (width 2) position of the row within its issue,
      e.g. ``'Vol 83 No 1'``, third row -> ``'article-83103'``.
    * ``year`` is the first number in ``issue`` plus ``volume_year_offset``.
    * ``issue`` gets ``' (<year>)'`` appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``issue`` column whose values contain at least one
        number. It is not modified, so calling the function again on the
        same input gives the same result.
    volume_year_offset : int, optional
        Value added to the first number in ``issue`` to obtain the year
        (default 1925).

    Returns
    -------
    pandas.DataFrame
        New frame with the columns described above.

    Raises
    ------
    ValueError
        If an ``issue`` value contains no number (the year cannot be cast to int).
    """
    issue = df['issue']

    # Dropping every non-digit leaves the digits of the issue label in order.
    digits = issue.str.replace(r'\D', '', regex=True)
    # Position of each row inside its issue, starting at 01.
    position = (df.groupby('issue').cumcount() + 1).astype(str).str.zfill(2)

    # expand=False yields a Series rather than a one-column DataFrame.
    year = issue.str.extract(r'(\d+)', expand=False).astype(int) + volume_year_offset

    return df.assign(
        id='article-' + digits + position,
        year=year,
        issue=issue + ' (' + year.astype(str) + ')',
    )

In [31]:
# Scrape the back-issues page, then build the article table step by step:
# links -> (pdf_url, issue, title) -> section -> id/year.
article_urls = get_article_urls(base_url)
df = (
    urls_to_df(article_urls)
    .pipe(add_section)
    .pipe(add_id_and_year)
    # Constant tag on every row; NOTE(review): presumably marks which source
    # database these rows came from - confirm against the consumer of the CSV.
    .assign(database=2)
)

In [32]:
# Persist the scraped table; index=False keeps the row index out of the file.
df.to_csv('scraped_data2.csv', index=False)

Unnamed: 0,pdf_url,issue,title,section,id,year,database
0,https://journal-archive.aup.nl/mens-en-maatsch...,Vol 83 No 1 (2008),Effectevaluaties wetenschappelijker,algemeen,article-83101,2008,2
1,https://journal-archive.aup.nl/mens-en-maatsch...,Vol 83 No 1 (2008),"Meningen over abortus in West-Europa, 1981-2000",artikel,article-83102,2008,2
2,https://journal-archive.aup.nl/mens-en-maatsch...,Vol 83 No 1 (2008),De electorale steun voor de Nationaal Socialis...,artikel,article-83103,2008,2
3,https://journal-archive.aup.nl/mens-en-maatsch...,Vol 83 No 1 (2008),Vrijheid versus veiligheid,artikel,article-83104,2008,2
4,https://journal-archive.aup.nl/mens-en-maatsch...,Vol 83 No 1 (2008),Hoe meet ik beroep,artikel,article-83105,2008,2
...,...,...,...,...,...,...,...
192,https://journal-archive.aup.nl/mens-en-maatsch...,Vol 77 No 3 (2002),Ontvangen publicaties Mens & Maatschappij 2002-3,algemeen,article-77303,2002,2
193,https://journal-archive.aup.nl/mens-en-maatsch...,Vol 77 No 3 (2002),Op welke leeftijd,artikel,article-77304,2002,2
194,https://journal-archive.aup.nl/mens-en-maatsch...,Vol 77 No 3 (2002),Van groeikern tot Deltametropool,artikel,article-77305,2002,2
195,https://journal-archive.aup.nl/mens-en-maatsch...,Vol 77 No 3 (2002),Mensbeeld en methode,artikel,article-77306,2002,2
