# Crawl General Conference talks from the Church of Jesus Christ of Latter-day Saints

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from models.crawl_utils import get_page, save_page

In [5]:
# config
# years = range(1980, 2024)
years = range(2023, 2026)
months = ['04', '10']
host = 'https://www.churchofjesuschrist.org'
base_dir = '/home/public/iloveconference/load/raw'
bs_parser = 'html.parser'
delay_seconds = 10

if not os.path.exists(base_dir):
    os.makedirs(base_dir)

In [None]:
# Find the last crawled month
existing_files = os.listdir(base_dir)
last_crawled_year = None
last_crawled_month = None

if existing_files:
    # Extract year-month from filenames (format: YYYY-MM-title.json)
    dates = [(f.split('-')[0], f.split('-')[1]) for f in existing_files if f.endswith('.json')]
    if dates:
        last_date = max(dates)
        last_crawled_year = int(last_date[0])
        last_crawled_month = last_date[1]

print(f"Last crawled: {last_crawled_year}-{last_crawled_month}")


Last crawled: 2023-04


In [7]:
def _is_talk_url(url):
    path_components = urlparse(url).path.split('/')
    # must be 6 components (first component is empty) and last component must not end in -session
    return len(path_components) == 6 and not path_components[-1].endswith('-session')


def get_talk_urls(base_url, html):
    soup = BeautifulSoup(html, bs_parser)
    return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True) \
            if _is_talk_url(urljoin(base_url, a['href']))]


def get_talk_path(url):
    path_components = urlparse(url).path.split('/')
    year, month, title = path_components[3:6]
    return os.path.join(base_dir, f"{year}-{month}-{title}.json")

In [8]:
start_found = False if last_crawled_year and last_crawled_month else True
print(start_found)

False


In [9]:
 
for year in years:
    for month in months:
        # Skip until we reach the last crawled point
        if not start_found:
            if year == last_crawled_year and month == last_crawled_month:
                start_found = True
            continue   
        
        dir_url = f"{host}/study/general-conference/{year}/{month}?lang=eng"
        status_code, dir_html = get_page(dir_url, delay_seconds)
        if status_code != 200:
            print(f"Status code={status_code} url={dir_url}")
            continue
        talk_urls = get_talk_urls(dir_url, dir_html)
        print(dir_url, len(talk_urls))
        for talk_url in talk_urls:
            path = get_talk_path(talk_url)
            if os.path.exists(path):
                continue
            print("    ", path)
            status_code, talk_html = get_page(talk_url, delay_seconds)
            if status_code != 200:
                print(f"Status code={status_code} url={talk_url}")
                continue
            save_page(path, talk_url, talk_html)

https://www.churchofjesuschrist.org/study/general-conference/2023/10?lang=eng 66
     /home/public/iloveconference/load/raw/2023-10-11bednar.json
     /home/public/iloveconference/load/raw/2023-10-12wright.json
     /home/public/iloveconference/load/raw/2023-10-13daines.json
     /home/public/iloveconference/load/raw/2023-10-14godoy.json
     /home/public/iloveconference/load/raw/2023-10-15christofferson.json
     /home/public/iloveconference/load/raw/2023-10-16ardern.json
     /home/public/iloveconference/load/raw/2023-10-17oaks.json
     /home/public/iloveconference/load/raw/2023-10-21eyring.json
     /home/public/iloveconference/load/raw/2023-10-22andersen.json
     /home/public/iloveconference/load/raw/2023-10-23newman.json
     /home/public/iloveconference/load/raw/2023-10-24costa.json
     /home/public/iloveconference/load/raw/2023-10-25stevenson.json
     /home/public/iloveconference/load/raw/2023-10-26choi.json
     /home/public/iloveconference/load/raw/2023-10-27phillips.json
