In [64]:
from bs4 import BeautifulSoup
import requests
import httpx
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [65]:
class trialContextManager:
    """Context manager that silently discards ordinary errors raised in its body.

    Used as ``with trial: <statement>`` to attempt a statement and carry on
    if it fails. Only ``Exception`` subclasses are suppressed; control-flow
    exceptions such as ``KeyboardInterrupt`` and ``SystemExit`` propagate so a
    long-running loop can still be interrupted.
    """

    def __enter__(self):
        # Nothing to set up; ``with trial as x`` binds x to None.
        return None

    def __exit__(self, *args):
        # args is (exc_type, exc_value, traceback); exc_type is None when the
        # body finished without raising.
        exc_type = args[0] if args else None
        # A truthy return value tells Python to swallow the exception.
        return exc_type is not None and issubclass(exc_type, Exception)


# Shared instance so call sites can simply write ``with trial:``.
trial = trialContextManager()

# Archive 1

In [70]:
def _clean_text(node):
    """Return the whitespace-stripped ``.string`` of *node*, or None if it has none."""
    text = getattr(node, 'string', None)
    return text.strip() if text is not None else None


def parse_items(items, section='algemeen'):
    """Convert the list entries of one issue section into a DataFrame.

    Parameters
    ----------
    items : sequence
        Parsed ``<li>`` elements. Only entries at even positions are read;
        entries at odd positions are skipped.
    section : str, default 'algemeen'
        Section label stored in the ``section`` column. Page ranges are only
        looked up when the section is not ``'algemeen'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id, title, authors, pages, url, section``. The authors text
        is split on tab characters, and each resulting piece becomes its own
        row (the other fields are repeated). An empty frame with these columns
        is returned when no entry is read.

    Notes
    -----
    Missing optional pieces (title text, authors, pages) are recorded as None
    for that entry only; a value from a previous entry is never reused.
    """
    columns = ['id', 'title', 'authors', 'pages', 'url']
    with_pages = section != 'algemeen'
    hold = []

    for i, item in enumerate(items):
        if i % 2:
            continue

        anchor = item.find_all("div", {"class": "title"})[0].find('a')
        article_id = anchor.get('id')
        url = anchor.get('href')
        # ``.string`` is None when the anchor has no single text child.
        title = _clean_text(anchor)

        author_divs = item.find_all('div', {'class': 'authors'})
        author_text = _clean_text(author_divs[0]) if author_divs else None
        # Keep a one-element list so the frame below always gets exactly one
        # row when the authors are missing.
        authors = author_text.split('\t') if author_text is not None else [None]

        # Reset per entry so a failed lookup cannot leak the previous value.
        pages = None
        if with_pages:
            page_divs = item.find_all('div', {'class': 'pages'})
            if page_divs:
                pages = _clean_text(page_divs[0])

        hold.append(pd.DataFrame({
            'id': article_id,
            'title': title,
            'authors': authors,
            'pages': pages,
            'url': url,
        }))

    if not hold:
        # pd.concat raises on an empty list; return an empty, typed-by-name frame.
        return pd.DataFrame(columns=columns).assign(section=section)

    return pd.concat(hold).assign(section=section)

def scrape_article(url, timeout=30):
    """Download one issue page and return its article listing.

    Parameters
    ----------
    url : str
        Address of the issue page.
    timeout : float, default 30
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` may wait indefinitely.

    Returns
    -------
    pandas.DataFrame
        Concatenation of ``parse_items`` results for every
        ``ul.cmp_article_list.articles`` found inside each ``div.section``;
        the lower-cased section heading is passed on as the section label.
        An empty frame is returned when the page contains no such lists.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, so that an error page is
        not mistaken for an issue without articles.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'lxml')

    hold = []
    for section in soup.find_all("div", {"class": "section"}):
        # First child of the section's <h2> is used as the section name.
        title = section.h2.contents[0].strip()
        for element in section.find_all('ul', {'class': "cmp_article_list articles"}):
            hold.append(parse_items(element.find_all('li'), section=title.lower()))

    if not hold:
        # pd.concat raises on an empty list.
        return pd.DataFrame()

    return pd.concat(hold)

In [71]:
def scrape_archive(url):
    """Read one archive listing page and return its issues.

    Fetches *url* with httpx, then collects every ``<a class="title">``
    element: the anchor's ``href`` and its first text node (stripped).

    Returns a DataFrame with the columns ``issue`` (label) and ``url`` (link).
    """
    response = httpx.get(url)
    page = BeautifulSoup(response.content, 'lxml')

    anchors = page.find_all("a", {"class": "title"})

    issue_links = []
    for anchor in anchors:
        issue_links.append(anchor.get('href'))

    issue_names = []
    for anchor in anchors:
        first_text = anchor.find(string=True)
        issue_names.append(first_text.strip())

    return pd.DataFrame({'issue': issue_names, 'url': issue_links})

In [72]:
# Walk archive listing pages 1 through 16 and gather every issue into `df`.
base_url = "https://ugp.rug.nl/MenM/issue/archive"
is_not_last_page = ''
hold = []
for i in range(1, 17):
    # Page 1 is the bare archive address; later pages append their number.
    url = base_url if i == 1 else f"{base_url}/{i}"

    # Last path component of the address that is about to be scraped.
    is_not_last_page = url.split("/")[-1]
    hold.append(scrape_archive(url))

df = pd.concat(hold).reset_index(drop=True)

In [73]:
# Visit every issue page listed in `df` and stack the article listings,
# labelling each article with the issue it came from.
hold = []
n_issues = len(df)
for i, row in tqdm(df.iterrows(), total=n_issues):
    issue_articles = scrape_article(row['url'])
    article = issue_articles.assign(issue=row['issue'])
    hold.append(article)

df2 = pd.concat(hold).reset_index(drop=True)

  0%|          | 0/387 [00:00<?, ?it/s]

In [74]:
def parse_abstract(soup):
    """Return the abstract text of an article page, or None if there is none.

    Looks up ``<section class="item abstract">`` and returns its text with
    every newline, every tab and every occurrence of the literal
    'Samenvatting' removed.
    """
    node = soup.find('section', {"class": "item abstract"})
    if not node:
        return None

    text = node.get_text()
    for unwanted in ('\n', '\t', 'Samenvatting'):
        text = text.replace(unwanted, '')
    return text

def parse_tags(soup):
    """Return the keyword text of an article page, or None if there is none.

    Looks up ``<section class="item keywords">`` and returns its text with
    every newline, every tab and every occurrence of the literal
    'Trefwoorden:' removed.
    """
    node = soup.find('section', {"class": "item keywords"})
    if not node:
        return None

    text = node.get_text()
    for unwanted in ('\n', '\t', 'Trefwoorden:'):
        text = text.replace(unwanted, '')
    return text

def parse_date(soup):
    """Return the publication date text of an article page, or None if absent.

    Looks up ``<div class="item published">`` and returns its text with
    every newline, every tab and every occurrence of the literal
    'Gepubliceerd' removed. The value is returned as text, not parsed.
    """
    node = soup.find('div', {"class": "item published"})
    if not node:
        return None

    text = node.get_text()
    for unwanted in ('\n', '\t', 'Gepubliceerd'):
        text = text.replace(unwanted, '')
    return text


def scrape_article_urls(self):
    """Fill in abstract, keywords and date for every row of a frame.

    For each row, the page at ``row['url']`` is fetched with httpx and parsed;
    the results of ``parse_abstract``, ``parse_tags`` and ``parse_date`` are
    written into the ``abstract``, ``tags`` and ``date`` columns of that row.

    The frame passed in is modified in place and also returned, so the
    function can be used with ``DataFrame.pipe``.
    """
    n_rows = self.shape[0]
    for idx, record in tqdm(self.iterrows(), total=n_rows):
        response = httpx.get(record['url'])
        page = BeautifulSoup(response.content, 'lxml')

        self.loc[idx, 'abstract'] = parse_abstract(page)
        self.loc[idx, 'tags'] = parse_tags(page)
        self.loc[idx, 'date'] = parse_date(page)

    return self


In [59]:
# Enrich every article row with abstract, keywords and publication date.
df2 = scrape_article_urls(df2)

  0%|          | 0/3281 [00:00<?, ?it/s]

In [75]:
def _fetch_soup(url):
    """Download *url* and return it parsed with the lxml parser.

    httpx is tried first; when its response status is not 200 the request is
    repeated with ``requests`` and that response is parsed instead.
    NOTE(review): presumably the fallback exists because the two clients treat
    redirects differently - confirm.
    """
    r = httpx.get(url)
    if r.status_code != 200:
        r = requests.get(url)
    return BeautifulSoup(r.content, 'lxml')


def get_pdf_url(row):
    """Resolve the download link of an article's PDF.

    Parameters
    ----------
    row : mapping
        Must provide ``row['url']``, the address of the article page.

    Returns
    -------
    str or None
        The ``href`` of the ``<a class="download">`` element on the viewer
        page that the article's ``<a class="obj_galley_link pdf">`` points to.
        None when the article page has no such galley link, the link has no
        ``href``, or the viewer page has no download anchor, so a single
        article without a PDF does not abort a loop over many rows.
    """
    article_page = _fetch_soup(row['url'])
    galley_link = article_page.find('a', {'class': 'obj_galley_link pdf'})
    if galley_link is None:
        return None

    view_url = galley_link.get('href')
    if not view_url:
        return None

    viewer_page = _fetch_soup(view_url)
    download_link = viewer_page.find('a', {'class': 'download'})
    if download_link is None:
        return None

    return download_link.get('href')

In [76]:
# Look up the PDF download link of every article and store it in 'pdf_url'.
for i, row in tqdm(df2.iterrows(), total=len(df2)):
    df2.loc[i, 'pdf_url'] = get_pdf_url(row)

  0%|          | 0/3281 [00:00<?, ?it/s]

In [12]:
# Add a constant 'database' column with value 1 to every row.
# NOTE(review): presumably identifies the source archive ("Archive 1") - confirm.
df2['database'] = 1
# Write the result to scraped_data.csv; index=False leaves the row index out of the file.
df2.to_csv('scraped_data.csv', index=False)
# Last expression of the cell: renders the frame as the cell output.
df2

Unnamed: 0,id,title,authors,pages,url,section,issue,abstract,tags,date,pdf_url
0,article-37039,Inspiratie en perspectief,"Dykstra,Pearl A.",266-267,https://ugp.rug.nl/MenM/article/view/37039,artikelen,Vol 76 Nr 4 (2001),,,2001-12-01,https://ugp.rug.nl/MenM/article/download/37039...
1,article-37046,Een expertonderzoek naar de waarde van beroepen,"Ruijter,Judith de",268-284,https://ugp.rug.nl/MenM/article/view/37046,artikelen,Vol 76 Nr 4 (2001),An expert research on occupational 'worth'. Su...,,2001-12-01,https://ugp.rug.nl/MenM/article/download/37046...
2,article-37047,De invloed van persoonlijkheidskenmerken op he...,"Eijck,Koen van, Graaf,Paul M. de",285-302,https://ugp.rug.nl/MenM/article/view/37047,artikelen,Vol 76 Nr 4 (2001),The impact of personality traits on educationa...,,2001-12-01,https://ugp.rug.nl/MenM/article/download/37047...
3,article-37048,De invloed van waardenoriëntaties op de vormge...,"Jansen,Miranda, Kalmijn,Matthijs",303-324,https://ugp.rug.nl/MenM/article/view/37048,artikelen,Vol 76 Nr 4 (2001),The impact of value orientations on the design...,,2001-12-01,https://ugp.rug.nl/MenM/article/download/37048...
4,article-37050,De invloed van religie en godsdienstige samens...,"Tubergen,Frank van, Grotenhuis,Manfred te, ...",325-341,https://ugp.rug.nl/MenM/article/view/37050,artikelen,Vol 76 Nr 4 (2001),The influence of denomination and religious co...,,2001-12-01,https://ugp.rug.nl/MenM/article/download/37050...
...,...,...,...,...,...,...,...,...,...,...,...
3276,article-15388,Referaten,,,https://ugp.rug.nl/MenM/article/view/15388,algemeen,Vol 1 Nr 1 (1925),,,1925-03-01,https://ugp.rug.nl/MenM/article/download/15388...
3277,article-15389,Varia,,,https://ugp.rug.nl/MenM/article/view/15389,algemeen,Vol 1 Nr 1 (1925),,,1925-03-01,https://ugp.rug.nl/MenM/article/download/15389...
3278,article-15387,Nederlandsch Nationaal Bureau voor Anthropologie,,,https://ugp.rug.nl/MenM/article/view/15387,algemeen,Vol 1 Nr 1 (1925),,,1925-03-01,https://ugp.rug.nl/MenM/article/download/15387...
3279,article-15386,Agenda,,,https://ugp.rug.nl/MenM/article/view/15386,algemeen,Vol 1 Nr 1 (1925),,,1925-03-01,https://ugp.rug.nl/MenM/article/download/15386...
