# Crawl General Conference talks from the Church of Jesus Christ of Latter-day Saints

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from models.crawl_utils import get_page, save_page

In [7]:
# config
# years = range(1980, 2024)
years = range(2023, 2026)
months = ['04', '10']
host = 'https://www.churchofjesuschrist.org'
base_dir = '../data/raw/conference'
bs_parser = 'html.parser'
delay_seconds = 10

if not os.path.exists(base_dir):
    os.makedirs(base_dir)

In [8]:
def _is_talk_url(url):
    path_components = urlparse(url).path.split('/')
    # must be 6 components (first component is empty) and last component must not end in -session
    return len(path_components) == 6 and not path_components[-1].endswith('-session')


def get_talk_urls(base_url, html):
    soup = BeautifulSoup(html, bs_parser)
    return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True) \
            if _is_talk_url(urljoin(base_url, a['href']))]


def get_talk_path(url):
    path_components = urlparse(url).path.split('/')
    year, month, title = path_components[3:6]
    return os.path.join(base_dir, f"{year}-{month}-{title}.json")

In [None]:
for year in years:
    for month in months:
        dir_url = f"{host}/study/general-conference/{year}/{month}?lang=eng"
        status_code, dir_html = get_page(dir_url, delay_seconds)
        if status_code != 200:
            print(f"Status code={status_code} url={dir_url}")
            continue
        talk_urls = get_talk_urls(dir_url, dir_html)
        print(dir_url, len(talk_urls))
        for talk_url in talk_urls:
            path = get_talk_path(talk_url)
            if os.path.exists(path):
                continue
            print("    ", path)
            status_code, talk_html = get_page(talk_url, delay_seconds)
            if status_code != 200:
                print(f"Status code={status_code} url={talk_url}")
                continue
            save_page(path, talk_url, talk_html)

https://www.churchofjesuschrist.org/study/general-conference/2023/04?lang=eng 70
https://www.churchofjesuschrist.org/study/general-conference/2023/10?lang=eng 66
https://www.churchofjesuschrist.org/study/general-conference/2024/04?lang=eng 68
https://www.churchofjesuschrist.org/study/general-conference/2024/10?lang=eng 70
https://www.churchofjesuschrist.org/study/general-conference/2025/04?lang=eng 68
     ../data/raw/conference/2025-04-11oaks.json
     ../data/raw/conference/2025-04-12larson.json
     ../data/raw/conference/2025-04-13holland.json
     ../data/raw/conference/2025-04-14johnson.json
     ../data/raw/conference/2025-04-15rasband.json
     ../data/raw/conference/2025-04-16cook.json
     ../data/raw/conference/2025-04-17gimenez.json
     ../data/raw/conference/2025-04-18eyring.json
