# Crawl General Conference talks from the Church of Jesus Christ of Latter-day Saints

In [14]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
import os
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from models.crawl_utils import get_page, save_page

In [16]:
# config
# years = range(1980, 2024)
years = range(2023, 2026)
months = ['04', '10']
host = 'https://www.churchofjesuschrist.org'
base_dir = '/home/public/iloveconference/load/raw'
bs_parser = 'html.parser'
delay_seconds = 10

if not os.path.exists(base_dir):
    os.makedirs(base_dir)

In [17]:
# Find the last crawled month
existing_files = os.listdir(base_dir)
last_crawled_year = None
last_crawled_month = None

if existing_files:
    # Extract year-month from filenames (format: YYYY-MM-title.json)
    dates = [(f.split('-')[0], f.split('-')[1]) for f in existing_files if f.endswith('.json')]
    if dates:
        last_date = max(dates)
        last_crawled_year = int(last_date[0])
        last_crawled_month = last_date[1]

print(f"Last crawled: {last_crawled_year}-{last_crawled_month}")


Last crawled: 2025-04


In [18]:
def _is_talk_url(url):
    path_components = urlparse(url).path.split('/')
    # must be 6 components (first component is empty) and last component must not end in -session
    return len(path_components) == 6 and not path_components[-1].endswith('-session')


def get_talk_urls(base_url, html):
    soup = BeautifulSoup(html, bs_parser)
    return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True) \
            if _is_talk_url(urljoin(base_url, a['href']))]


def get_talk_path(url):
    path_components = urlparse(url).path.split('/')
    year, month, title = path_components[3:6]
    return os.path.join(base_dir, f"{year}-{month}-{title}.json")

In [19]:
start_found = False if last_crawled_year and last_crawled_month else True
print(start_found)

False


In [20]:
 
for year in years:
    for month in months:
        # Skip until we reach the last crawled point
        if not start_found:
            if year == last_crawled_year and month == last_crawled_month:
                start_found = True
            continue   
        
        dir_url = f"{host}/study/general-conference/{year}/{month}?lang=eng"
        status_code, dir_html = get_page(dir_url, delay_seconds)
        if status_code != 200:
            print(f"Status code={status_code} url={dir_url}")
            continue
        talk_urls = get_talk_urls(dir_url, dir_html)
        print(dir_url, len(talk_urls))
        for talk_url in talk_urls:
            path = get_talk_path(talk_url)
            if os.path.exists(path):
                continue
            print("    ", path)
            status_code, talk_html = get_page(talk_url, delay_seconds)
            if status_code != 200:
                print(f"Status code={status_code} url={talk_url}")
                continue
            save_page(path, talk_url, talk_html)

Status code=404 url=https://www.churchofjesuschrist.org/study/general-conference/2025/10?lang=eng


In [21]:
import shutil

# Create new directory if it doesn't exist
new_dir = os.path.join(base_dir, 'new')
if not os.path.exists(new_dir):
    os.makedirs(new_dir)

# Get all files starting from October 2023
files_to_copy = [file for file in existing_files if file.startswith('2023-10') or 
                 (file.startswith('2024-') or file.startswith('2025-'))]

# Copy files
for file in files_to_copy:
    src = os.path.join(base_dir, file)
    dst = os.path.join(new_dir, file)
    shutil.copy2(src, dst)
    print(f"Copied {file}")

Copied 2023-10-28rasband.json
Copied 2024-10-17casillas.json
Copied 2024-10-57nelson.json
Copied 2024-04-46taylor.json
Copied 2024-04-12larson.json
Copied 2023-10-53cordon.json
Copied 2023-10-32koch.json
Copied 2023-10-41ballard.json
Copied 2023-10-52pingree.json
Copied 2023-10-55esplin.json
Copied 2023-10-33runia.json
Copied 2025-04-22lund.json
Copied 2024-04-55andersen.json
Copied 2024-04-27cook.json
Copied 2024-10-32yee.json
Copied 2024-04-17gerard.json
Copied 2024-04-56pace.json
Copied 2024-10-44stevenson.json
Copied 2024-04-57nelson.json
Copied 2023-10-23newman.json
Copied 2024-10-25buckner.json
Copied 2024-10-47eyring.json
Copied 2023-10-51nelson.json
Copied 2024-04-44pieper.json
Copied 2024-04-45kearon.json
Copied 2024-10-21christofferson.json
Copied 2023-10-43parrella.json
Copied 2023-10-27phillips.json
Copied 2024-10-11eyring.json
Copied 2024-10-23villar.json
Copied 2025-04-53whiting.json
Copied 2023-10-42freeman.json
Copied 2025-04-11oaks.json
Copied 2024-10-51uchtdorf.json
C

In [22]:
print(base_dir)

/home/public/iloveconference/load/raw


In [24]:
# List all files in the new directory
files_in_new = os.listdir(new_dir)
files_in_new.sort()  # Sort files alphabetically
for file in files_in_new:
    print(file)

2023-10-11bednar.json
2023-10-12wright.json
2023-10-13daines.json
2023-10-14godoy.json
2023-10-15christofferson.json
2023-10-16ardern.json
2023-10-17oaks.json
2023-10-21eyring.json
2023-10-22andersen.json
2023-10-23newman.json
2023-10-24costa.json
2023-10-25stevenson.json
2023-10-26choi.json
2023-10-27phillips.json
2023-10-28rasband.json
2023-10-31sabin.json
2023-10-32koch.json
2023-10-33runia.json
2023-10-34soares.json
2023-10-41ballard.json
2023-10-42freeman.json
2023-10-43parrella.json
2023-10-44cook.json
2023-10-45uchtdorf.json
2023-10-46waddell.json
2023-10-47eyring.json
2023-10-51nelson.json
2023-10-52pingree.json
2023-10-53cordon.json
2023-10-54gong.json
2023-10-55esplin.json
2023-10-56giraud-carrier.json
2023-10-57renlund.json
2024-04-11oaks.json
2024-04-12larson.json
2024-04-13holland.json
2024-04-14dennis.json
2024-04-15dushku.json
2024-04-16soares.json
2024-04-17gerard.json
2024-04-18eyring.json
2024-04-21bednar.json
2024-04-22de-feo.json
2024-04-23nielson.json
2024-04-24alo