# Crawl encyclopedia articles (eom.byu.edu) about the Church of Jesus Christ of Latter-day Saints

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
from urllib.parse import parse_qs, urljoin, urlparse

from bs4 import BeautifulSoup

from models.crawl_utils import get_page, save_page

In [None]:
# config
# Site root; relative hrefs found on index pages are resolved against it.
base = 'https://eom.byu.edu/'
# Index page listing all articles. Must not carry surrounding whitespace:
# a leading space can keep urljoin() from recognising the scheme.
host = 'https://eom.byu.edu/index.php?title=Special:AllPages'
# Directory that get_path() places the saved pages in.
base_dir = '../data/raw/encyclopedia'
bs_parser = 'html.parser'
# Passed to get_page() when downloading each article.
delay_seconds = 5

# Create the output directory (and parents) if missing; no-op when present.
os.makedirs(base_dir, exist_ok=True)

In [None]:
def extract_hrefs_from_mw_allpages_body(html):
    """Return absolute URLs for every link inside 'mw-allpages-body' divs.

    Args:
        html: Markup of an index page, parsed here with 'html.parser'.

    Returns:
        A list with one entry per ``<a>`` tag that has a non-empty ``href``,
        each resolved against the module-level ``host`` URL.
    """
    document = BeautifulSoup(html, 'html.parser')

    collected = []
    for container in document.find_all('div', class_='mw-allpages-body'):
        # Anchors without an href (or with an empty one) are skipped.
        targets = (link.get('href') for link in container.find_all('a'))
        collected.extend(urljoin(host, target) for target in targets if target)

    return collected

In [None]:
def get_path(url, directory=None):
    """Return the JSON file path under which the page at *url* is stored.

    The file name is derived from the page title. For query-style URLs
    (``/index.php?title=Some_Page``) the ``title`` parameter is used, because
    the path component is the same ``index.php`` for every page and would
    make all pages collide on a single file. For path-style URLs
    (``/index.php/Some_Page``) the last path component is used.

    Args:
        url: Absolute URL of the page.
        directory: Directory to place the file in. Defaults to the
            module-level ``base_dir``.

    Returns:
        The path ``<directory>/<name>.json``.
    """
    parsed = urlparse(url)
    titles = parse_qs(parsed.query).get('title')
    # Prefer the explicit title; fall back to the URL path otherwise.
    source = titles[0] if titles else parsed.path
    # Keep only the last '/'-separated component so the name never introduces
    # extra sub-directories.
    name = source.split('/')[-1]
    target_dir = base_dir if directory is None else directory
    return os.path.join(target_dir, f"{name}.json")
    

In [None]:
def extract_links(soup, base):
    """Return same-site article links found in the 'mw-allpages-body' div.

    Args:
        soup: Parsed index page (BeautifulSoup object).
        base: URL that relative hrefs are resolved against; only resolved
            links that start with this prefix are kept.

    Returns:
        A list of absolute URLs, in document order. Empty when the page has
        no 'mw-allpages-body' div.
    """
    container = soup.find('div', class_='mw-allpages-body')
    if not container:
        return []

    hrefs = []
    for anchor in container.find_all('a'):
        href = anchor.get('href')
        if not href:
            continue
        # Resolve first: hrefs on the page may be relative, and a relative
        # href would never pass the prefix check below.
        absolute_url = urljoin(base, href)
        if absolute_url.startswith(base):
            hrefs.append(absolute_url)

    return hrefs

In [None]:
def extract_hrefs_from_div(soup, base):
    """Collect absolute URLs of all links in the first 'mw-allpages-body' div.

    Args:
        soup: Parsed page (BeautifulSoup object).
        base: URL that each href is resolved against.

    Returns:
        A list of absolute URLs, one per ``<a>`` tag with a non-empty
        ``href``; empty when the div is absent.
    """
    container = soup.find('div', class_='mw-allpages-body')
    if not container:
        return []

    # Anchors lacking an href (or with an empty one) are dropped.
    raw_hrefs = (anchor.get('href') for anchor in container.find_all('a'))
    return [urljoin(base, raw) for raw in raw_hrefs if raw]

In [None]:
def extract_next_page_href(soup, base):
    """Return the absolute URL of the "Next page" link, or None.

    Looks inside the first div with class 'mw-allpages-nav' for an ``<a>``
    tag whose text starts with "Next page" and that has a non-empty href.

    Args:
        soup: Parsed page (BeautifulSoup object).
        base: URL that the href is resolved against.

    Returns:
        The resolved URL of the first matching link, or None when the div or
        a matching link is missing.
    """
    nav = soup.find('div', class_='mw-allpages-nav')
    if not nav:
        return None

    for link in nav.find_all('a'):
        if not link.text.startswith("Next page"):
            continue
        target = link.get('href')
        if target:
            return urljoin(base, target)

    return None

In [None]:

# Walk the paginated "All pages" index and collect every article link.

# Starting URL
start_url = 'https://eom.byu.edu/index.php?title=Special:AllPages'

# Absolute URLs of all articles found so far, in index order
all_hrefs = []

# Index pages already fetched; guards against a "Next page" link that points
# back to an earlier page, which would otherwise loop forever.
visited_pages = set()

while start_url:
    visited_pages.add(start_url)

    # Fetch the current index page. delay_seconds is passed exactly as in the
    # article download loop; presumably it throttles requests — confirm
    # against models.crawl_utils.get_page.
    status_code, html = get_page(start_url, delay_seconds)
    if status_code != 200:
        print(f"Failed to fetch {start_url}")
        break

    soup = BeautifulSoup(html, bs_parser)

    # Extract the article links and the URL of the next index page
    hrefs = extract_hrefs_from_div(soup, base)
    next_page_url = extract_next_page_href(soup, base)

    all_hrefs.extend(hrefs)

    # Stop on the last page (no next link) or when pagination cycles.
    if not next_page_url or next_page_url in visited_pages:
        break
    start_url = next_page_url



In [None]:
# Download every collected article that has not been saved yet.
for href in all_hrefs:
    print(href)
    path_file = get_path(href)
    # Pages already on disk are not fetched again.
    if not os.path.exists(path_file):
        status_code, html = get_page(href, delay_seconds)
        if status_code == 200:
            save_page(path_file, href, html)
        else:
            # Report the failure and move on to the next link.
            print("Error!", status_code, href)