# Crawl KnoWhy articles from knowhy.bookofmormoncentral.org (Church of Jesus Christ of Latter-day Saints)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from models.crawl_utils import get_page, save_page

In [3]:
# config — crawl settings shared by the cells below
# Listing URL; also the base that relative article hrefs are resolved against
host = 'https://knowhy.bookofmormoncentral.org/reference-knowhy'
# Directory where get_path() places the per-article .json files
base_dir = '../data/load/raw/knowhys'
# Parser backend name for BeautifulSoup
bs_parser = 'html.parser'
# Passed to get_page() on every request; presumably a politeness delay in
# seconds — confirm in models.crawl_utils
delay_seconds = 30

In [4]:
def extract_hrefs_from_views_field_title(html):
    """Collect the article links found on one listing page.

    Every ``<div class="views-field-title">`` in ``html`` is searched for
    ``<a>`` tags; each non-empty ``href`` is resolved against the
    notebook-level ``host`` URL so relative links come back absolute.

    Args:
        html: Markup of a listing page, in any form BeautifulSoup accepts.

    Returns:
        list: Absolute URLs in the order they were encountered. Duplicates
        are kept. Empty when no matching anchor carries a usable href.
    """
    # Honour the parser chosen in the config cell instead of keeping a
    # second hard-coded copy that could silently drift from it.
    soup = BeautifulSoup(html, bs_parser)

    return [
        urljoin(host, anchor.get('href'))
        for div in soup.find_all('div', class_='views-field-title')
        for anchor in div.find_all('a')
        # Skip anchors with a missing or empty href
        if anchor.get('href')
    ]

In [5]:
def get_path(url, directory=None):
    """Map an article URL to the JSON file it is stored in.

    The file name is the last non-empty segment of the URL path with
    ``.json`` appended; query string and fragment are ignored.

    Args:
        url: Article URL.
        directory: Directory the file lives in. When ``None`` (the default)
            the notebook-level ``base_dir`` is used.

    Returns:
        str: ``directory`` joined with ``<last-path-segment>.json``.
    """
    if directory is None:
        directory = base_dir

    # Strip trailing slashes before taking the last segment: for ".../slug/"
    # the final component would otherwise be empty and every such URL would
    # collide on the same ".json" file.
    slug = urlparse(url).path.rstrip('/').rsplit('/', 1)[-1]
    return os.path.join(directory, f"{slug}.json")
    

In [6]:
# Walk the paginated listing (?page=0, 1, 2, ...) and gather every article
# link, stopping at the first page that yields no links.
knowhy_links = []

page_counter = 0
max_attempts = 5      # consecutive non-200 responses tolerated for one page
failed_attempts = 0
while True:
    print(page_counter)
    # Build the URL from the configured listing address rather than a second
    # hard-coded copy of it.
    status_code, html = get_page(f"{host}?page={page_counter}", delay_seconds)
    if status_code != 200:
        print("Error!", status_code, page_counter)
        failed_attempts += 1
        # Retry the same page, but only a bounded number of times: a
        # persistent error would otherwise re-request this page forever.
        if failed_attempts >= max_attempts:
            raise RuntimeError(
                f"Giving up on listing page {page_counter} after "
                f"{failed_attempts} failed attempts (last status {status_code})"
            )
        continue
    failed_attempts = 0

    hrefs = extract_hrefs_from_views_field_title(html)
    if not hrefs:
        # An empty page marks the end of the listing
        break
    knowhy_links.extend(hrefs)
    page_counter += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


In [7]:
# Fetch each collected article unless its target file already exists, so the
# cell can be interrupted and re-run without fetching saved pages again.
for url in knowhy_links:
    path_file = get_path(url)
    print(path_file)
    if not os.path.exists(path_file):
        status_code, html = get_page(url, delay_seconds)
        if status_code == 200:
            save_page(path_file, url, html)
        else:
            # Report the failure and move on to the next link
            print("Error!", status_code, url)

print("End")

../data/load/raw/knowhys/what-does-the-new-testament-teach-about-the-great-apostasy.json
../data/load/raw/knowhys/why-does-paul-quote-from-an-early-christian-hymn.json
../data/load/raw/knowhys/how-often-do-the-articles-of-faith-track-sayings-of-paul.json
../data/load/raw/knowhys/what-is-an-evangelist.json
../data/load/raw/knowhys/why-is-grace-important-in-a-covenantal-community.json
../data/load/raw/knowhys/how-are-faithfulness-and-grace-connected-to-the-right-hand-of-fellowship.json
../data/load/raw/knowhys/what-did-early-christians-teach-about-the-three-degrees-of-glory.json
../data/load/raw/knowhys/how-does-paul-use-chiasmus-to-teach-about-jesuss-atonement.json
../data/load/raw/knowhys/why-are-people-baptized-for-the-dead.json
../data/load/raw/knowhys/why-are-we-invited-to-partake-of-the-sacrament-with-our-right-hand.json
../data/load/raw/knowhys/what-did-paul-really-teach-about-marriage.json
../data/load/raw/knowhys/how-did-paul-understand-faith.json
../data/load/raw/knowhys/what-d

KeyboardInterrupt: 