In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# module source files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import json
import os
import time
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
import requests

In [None]:
# config
# Conference years to crawl: 1980 through 2022 inclusive (range end is exclusive).
years = range(1980, 2023)
# Month path segments used when building each conference index URL.
months = ['04', '10']
# Scheme and host that every index URL is built from.
host = 'https://www.churchofjesuschrist.org'
# Directory the per-talk JSON files are written to (relative to the notebook's working directory).
base_dir = '../data'
# Parser name handed to BeautifulSoup.
bs_parser = 'html.parser'
# Encoding forced onto each HTTP response before its text is read.
encoding = 'utf-8'
# Pause, in seconds, between successive page requests.
seconds_delay = 30

# Request headers sent with every page fetch; they mimic a desktop Chrome browser.
# NOTE(review): "br" is advertised in Accept-Encoding — presumably this needs a
# Brotli decoder installed for responses to be decoded correctly; confirm.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 
    "Accept-Encoding": "gzip, deflate, br", 
    "Accept-Language": "en-US,en;q=0.9", 
    "Sec-Ch-Ua": "\"Google Chrome\";v=\"105\", \"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"105\"", 
    "Sec-Ch-Ua-Mobile": "?0", 
    "Sec-Ch-Ua-Platform": "\"Linux\"", 
    "Sec-Fetch-Dest": "document", 
    "Sec-Fetch-Mode": "navigate", 
    "Sec-Fetch-Site": "cross-site", 
    "Sec-Fetch-User": "?1", 
    "Upgrade-Insecure-Requests": "1",     
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
}


In [None]:
class HrefConverter(MarkdownConverter):
    """
    MarkdownConverter variant that resolves each link target against a base URL.

    The base URL is supplied as the ``base_url`` keyword option and defaults
    to the empty string when omitted.
    """
    def __init__(self, *args, **kwargs):
        # All arguments, base_url included, are forwarded untouched to the parent.
        super().__init__(*args, **kwargs)
        self.base_url = kwargs.get('base_url', '')

    def convert_a(self, el, text, convert_as_inline):
        """Join the anchor's href with the base URL, then run the stock conversion."""
        has_target = 'href' in el.attrs
        if has_target:
            resolved = urljoin(self.base_url, el['href'])
            el['href'] = resolved
        return super().convert_a(el, text, convert_as_inline)


# Create shorthand method for custom conversion
def md(html, **options):
    """Convert ``html`` to Markdown with a HrefConverter built from ``options``."""
    converter = HrefConverter(**options)
    return converter.convert(html)


def get_page(url, headers, encoding, timeout=60):
    """
    Fetch ``url`` with an HTTP GET and return its status code and decoded body.

    Args:
        url: address to request.
        headers: dict of HTTP headers sent with the request.
        encoding: when truthy, overrides the encoding used to decode the body.
        timeout: seconds to wait on the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        A ``(status_code, text)`` tuple.

    Raises:
        requests.exceptions.RequestException: if the request cannot be
            completed, including when the timeout elapses.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if encoding:
        # Set before reading .text so the body is decoded with this encoding.
        response.encoding = encoding
    return response.status_code, response.text


def _is_talk_url(url):
    path_components = urlparse(url).path.split('/')
    # must be 5 components and last component must not end in -session
    return len(path_components) == 6 and not path_components[-1].endswith('-session')


def get_talk_urls(base_url, html):
    """
    Collect the absolute talk URLs linked from a page.

    Args:
        base_url: URL of the page ``html`` came from; relative hrefs are
            resolved against it.
        html: markup of the page to scan.

    Returns:
        List of absolute URLs accepted by ``_is_talk_url``, in the order they
        first appear in the page, with repeated links removed.
    """
    soup = BeautifulSoup(html, bs_parser)
    # Resolve each href once instead of once for the filter and once for the result.
    absolute_urls = (urljoin(base_url, a['href']) for a in soup.find_all('a', href=True))
    # dict.fromkeys drops repeats while keeping first-seen order.
    return list(dict.fromkeys(u for u in absolute_urls if _is_talk_url(u)))


def _clean(text):
    return text.replace(' ', ' ')


def get_talk_info(url, html):
    """
    Build the record for one talk from its URL and page markup.

    The year and month are taken from the URL path; title, author and author
    role come from the matching elements inside ``article``; the body block is
    converted to Markdown with links resolved against ``url``. Any element
    that is not found yields an empty string. The raw ``html`` is kept in the
    record as well.
    """
    year, month = urlparse(url).path.split('/')[3:5]
    soup = BeautifulSoup(html, bs_parser)

    def _text_of(selector):
        # Cleaned text of the first match, or '' when nothing matches.
        node = soup.select_one(selector)
        return _clean(node.text) if node else ''

    title_text = _text_of('article header h1')
    author_text = _text_of('article p.author-name')
    role_text = _text_of('article p.author-role')

    body = soup.select_one('article div.body-block')
    if body:
        content = _clean(md(str(body), base_url=url))
    else:
        content = ''

    talk_info = {
        'year': year,
        'month': month,
        'url': url,
        'title': title_text,
        'author': author_text,
        'author_role': role_text,
        'content': content,
        'html': html,
    }
    return talk_info


def get_talk_path(url):
    """
    Return the JSON file path for a talk URL.

    The file name is built from the year, month and title segments of the URL
    path and placed under ``base_dir``.
    """
    segments = urlparse(url).path.split('/')
    year, month, title = segments[3:6]
    filename = f"{year}-{month}-{title}.json"
    return os.path.join(base_dir, filename)


def write_talk_info(path, talk_info):
    """
    Serialize ``talk_info`` as indented UTF-8 JSON at ``path``.

    Args:
        path: destination file; its parent directory is created if missing.
        talk_info: JSON-serializable record to store.

    The data is written to a sibling ``.tmp`` file and then renamed over
    ``path``, so an interrupted write never leaves a truncated file at the
    final location.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(talk_info, f, ensure_ascii=False, indent=2)
    # os.replace overwrites an existing destination in a single rename step.
    os.replace(tmp_path, path)

In [None]:
# Crawl each conference index page, then each talk it links to. Talks whose
# JSON file already exists are skipped, so a stopped run can be restarted.
for year in years:
    for month in months:
        dir_url = f"{host}/study/general-conference/{year}/{month}?lang=eng"
        try:
            status_code, dir_html = get_page(dir_url, headers, encoding)
        except requests.RequestException as err:
            # A network error on one page should not abort the whole crawl.
            print(f"Request failed error={err!r} url={dir_url}")
            time.sleep(seconds_delay)
            continue
        if status_code != 200:
            print(f"Status code={status_code} url={dir_url}")
            # Pause after failed requests too, so an error streak does not
            # turn into back-to-back requests against the server.
            time.sleep(seconds_delay)
            continue
        talk_urls = get_talk_urls(dir_url, dir_html)
        print(dir_url, len(talk_urls))
        time.sleep(seconds_delay)
        for talk_url in talk_urls:
            path = get_talk_path(talk_url)
            if os.path.exists(path):
                # Already saved; no request made, so no delay needed.
                continue
            print("    ", path)
            try:
                status_code, talk_html = get_page(talk_url, headers, encoding)
            except requests.RequestException as err:
                print(f"Request failed error={err!r} url={talk_url}")
                time.sleep(seconds_delay)
                continue
            if status_code != 200:
                print(f"Status code={status_code} url={talk_url}")
                time.sleep(seconds_delay)
                continue
            talk_info = get_talk_info(talk_url, talk_html)
            if not talk_info['title'] or not talk_info['author'] or not talk_info['content']:
                # Reported but still written: the record keeps the raw html.
                print("Missing data", talk_url)
            write_talk_info(path, talk_info)
            time.sleep(seconds_delay)