In [160]:
import os
import re
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from requests import get

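Some constants used throughout the notebook: the data directory, the CSS selectors and link patterns we filter out, and the selector for the main page content.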

In [161]:
# folder that holds the input csv and receives the scraped pages
DATA_DIR = "data"

# CSS selectors of the elements we throw away
# (each one is applied relative to the main content selector)
CLEAN = [
  'a[id="top"]',
  'a[class="mw-selflink selflink"]',
  'a[class="image"]',
  'a[class="internal"]',
  "sup",
  "div.reflist",  # citations don't count as content
]

# wiki namespaces whose pages we never want to link to
_NAMESPACES = (
  "File",
  "Special",
  "Template",
  "Category",
  "Portal",
  "Template_talk",
  "Help",
  "Wikipedia",
)

# href patterns of the links we throw away:
# one pattern per namespace above, plus in-page anchors ("#section")
REM_LINKS = [rf"(\/wiki\/{ns}:\w+)" for ns in _NAMESPACES] + [r"(^#\w+)"]

# selector of the element that wraps the main page content
CONT_SEL = "div#content"

Here, we define a function that cleans a page of unwanted links and elements. Although Wikipedia pages are fairly clean and nice to work with programmatically, there are still certain types of elements that we want to filter out. These include self links (links that point back to the page itself), image links, internal links, and links to files or template pages, among others.

In [162]:
def cleanup_page(html):
  """Strip unwanted elements from a parsed page and rewrite its links in place.

  Only content underneath CONT_SEL is touched:

  1. every element matched by one of the selectors in CLEAN is removed;
  2. every remaining <a> element is either
     - removed, when it has no href, when its href matches one of the
       REM_LINKS patterns, or when its href does not look like /wiki/<title>, or
     - replaced by the plain text "{text|target}", where text is the link
       text without , . : ! ? and target is the href without its leading /wiki/.

  Args:
    html: the parsed page; it must offer select(), and the selected elements
      must offer decompose(), get(), text and replace_with() (as the
      BeautifulSoup objects built by the scraping loop do).

  Returns:
    None. The tree passed in is modified directly.
  """
  # clean up unwanted elements from the page
  for selector in CLEAN:
    for el in html.select(f"{CONT_SEL} {selector}"):
      el.decompose()

  # format remaining links
  for link in html.select(f"{CONT_SEL} a"):
    # an <a> is not guaranteed to carry an href (named anchors, self links
    # with unexpected classes, ...); link['href'] would raise KeyError there
    href = link.get('href')

    # we drop: links without href, any of the links in REM_LINKS,
    # and any link that doesn't start with /wiki/
    # NOTE(review): \w+ also rejects titles whose first character is not a
    # word character (e.g. percent-encoded ones) - confirm this is intended
    unwanted = (
      href is None
      or any(re.match(regex, href) for regex in REM_LINKS)
      or not re.match(r"^\/wiki\/\w+", href)
    )
    if unwanted:
      link.decompose()
      continue

    # extract text of the link and remove punctuation
    text = re.sub(r"[\,\.\:\!\?]", "", link.text)

    # drop only the leading /wiki/ (guaranteed present by the check above);
    # a later "/wiki/" inside the title must survive
    href = href[len("/wiki/"):]

    # We want to keep the link targets but still work from a plain text file.
    # Extracting the text of the page would lose the hrefs, so the link is
    # swapped for a text node holding both its text AND its target; a later
    # text extraction then keeps the target for free.
    link.replace_with(f'{{{text}|{href}}}')

Here, we load the list of pages that we are interested in scraping (the most popular Wikipedia articles) and keep only the titles that do not contain a colon.

In [163]:
# read the csv listing the most popular wikipedia pages
df = pd.read_csv(f"{DATA_DIR}/top1000.csv")

# titles of all listed pages, as a numpy array of strings
all_titles = df['article'].to_numpy().astype(str)

# keep only the titles without a ':' in them
# (this may not cover everything, but it is good enough for testing)
has_no_colon = np.char.find(all_titles, ':') == -1
pages = all_titles[has_no_colon]

In [None]:
# in case you wanna skip ahead: position in `pages` to (re)start from
START_INDEX = 0
# seconds to wait between two requests
SLEEP_TIME_S = 2
# seconds after which a stalled request is abandoned; without a timeout
# `get` can block forever (a timeout raises and stops the run, which can
# then be resumed through START_INDEX)
REQUEST_TIMEOUT_S = 30

# output folders for the plain text and the cleaned html version of each page;
# create them up front so the first write cannot fail on a missing folder
TEXT_DIR = f"{DATA_DIR}/pages/text"
HTML_DIR = f"{DATA_DIR}/pages/html"
os.makedirs(TEXT_DIR, exist_ok=True)
os.makedirs(HTML_DIR, exist_ok=True)

n = pages.size

# compile this for efficiency
newline_regex = re.compile(r"\n{3,}")

for i, page in enumerate(pages):
  if i < START_INDEX:
    continue

  print(f"{i} of {n} ({round((i / n) * 100, 2)}%) - Now Scraping {page}")

  # percent-encode the title so that a character such as '?' stays part of
  # the path instead of starting a query string ('/' is left untouched)
  res = get(f'https://en.wikipedia.org/wiki/{quote(page)}', timeout=REQUEST_TIMEOUT_S)

  # check if we got banned :c
  if res.status_code != 200:
    print("We got got")
    break

  # load the page as html with BeautifulSoup
  html = BeautifulSoup(res.text, 'html.parser')

  # clean up html on the page
  cleanup_page(html)

  # create one parsed page and one clean html page
  # (runs of three or more newlines are collapsed into exactly two)
  parsed_page = newline_regex.sub("\n\n", html.get_text())
  html_page = html.prettify()

  # replace bad characters in titles with underscores
  title = re.sub(r"\/", "_", page)

  # save files; utf-8 is set explicitly because the platform default encoding
  # may not be able to represent every character found on a page, and `with`
  # closes the file even if the write fails
  with open(f"{TEXT_DIR}/{i + 1}-{title}.txt", "w", encoding="utf-8") as parsed_file:
    parsed_file.write(parsed_page)

  with open(f"{HTML_DIR}/{i + 1}-{title}.html", "w", encoding="utf-8") as html_file:
    html_file.write(html_page)

  # let's not overload wikipedia with requests here
  time.sleep(SLEEP_TIME_S)