# Crawl D&C podcasts from the Church of Jesus Christ of Latter-day Saints

In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports edited modules before
# each cell runs, so changes to local helpers are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from models.crawl_utils import get_page, save_page

In [3]:
# config
# Prefix that every harvested link must start with.
base = 'https://doctrineandcovenantscentral.org'
# Landing page from which the crawl starts.
host = 'https://doctrineandcovenantscentral.org/church-history-matters-podcast/'
# Directory where each downloaded page is stored as a JSON file.
base_dir = '../data/raw/dc_podcasts'
bs_parser = 'html.parser'
# Passed to get_page on every request (presumably a politeness delay; confirm in crawl_utils).
delay_seconds = 30

# exist_ok=True makes this idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(base_dir, exist_ok=True)
    

In [4]:
def extract_hrefs(html,base):
    """Collect the unique series-level links found in a page.

    Links are read from the ``data-column-clickable`` attribute of ``<div>``
    tags, not from ``<a href>`` anchors.

    Args:
        html: Raw HTML of the page. ``None`` or an empty value yields ``[]``.
        base: URL prefix a link must start with to be kept.

    Returns:
        A list of URLs in document order, without duplicates. Only URLs whose
        path splits on ``/`` into exactly three components are kept
        (e.g. ``/section/slug``).
    """
    # Guard against a missing/empty body so the caller gets an empty list
    # instead of handing None to the parser.
    if not html:
        return []

    soup = BeautifulSoup(html, 'html.parser')

    # Every div that carries a data-column-clickable attribute, whatever its value.
    link_tags = soup.find_all('div', {'data-column-clickable': True})

    hrefs = []
    seen = set()  # O(1) membership test; `hrefs` preserves first-seen order
    for link in link_tags:
        href = link.get('data-column-clickable')
        if not href or not href.startswith(base):
            continue
        # Keep only links at the expected depth.
        if len(urlparse(href).path.split('/')) != 3:
            continue
        if href not in seen:
            seen.add(href)
            hrefs.append(href)

    return hrefs

In [5]:
def extract_links(html,base):
    """Collect every link in a page that starts with ``base``.

    Links are read from the ``data-column-clickable`` attribute of ``<div>``
    tags, not from ``<a href>`` anchors. No path-depth filter is applied and
    duplicates are kept.

    Args:
        html: Raw HTML of the page. ``None`` or an empty value yields ``[]``.
        base: URL prefix a link must start with to be kept.

    Returns:
        A list of matching URLs in document order (may contain repeats).
    """
    # Guard against a missing/empty body so the caller gets an empty list
    # instead of handing None to the parser.
    if not html:
        return []

    soup = BeautifulSoup(html, 'html.parser')

    # Every div that carries a data-column-clickable attribute, whatever its value.
    link_tags = soup.find_all('div', {'data-column-clickable': True})

    hrefs = []
    for link in link_tags:
        href = link.get('data-column-clickable')
        if href and href.startswith(base):
            hrefs.append(href)

    return hrefs

In [6]:
def get_path(url):
    """Map a page URL to the JSON file path used to store it under ``base_dir``.

    The file name is the last segment of the URL path plus ``.json``.

    Args:
        url: Page URL; one or more trailing slashes are ignored.

    Returns:
        ``os.path.join(base_dir, "<last-segment>.json")``. A URL with an empty
        path still maps to ``.json`` because there is no segment to use.
    """
    # Strip trailing slashes first so '.../slug/' maps to 'slug.json' rather
    # than to an empty file name.
    slug = urlparse(url).path.rstrip('/').split('/')[-1]
    return os.path.join(base_dir, f"{slug}.json")

In [7]:
# Accumulates the episode URLs harvested from every series page.
dc_podcasts_pages = []

# Step 1: fetch the landing page and pull out the series-level links.
status_code, html = get_page(host, delay_seconds)
if status_code != 200:
    # Do not parse an error response as if it were the landing page.
    print("Error!", status_code, host)
    series_hrefs = []
else:
    series_hrefs = extract_hrefs(html,base)

# Step 2: visit each series page and gather its episode links.
for series_link in series_hrefs:
    status_code, html = get_page(series_link, delay_seconds)
    if status_code != 200:
        # Skip failed fetches, as the download loop does, instead of
        # scraping links out of an error page.
        print("Error!", status_code, series_link)
        continue
    episodes_hrefs = extract_links(html,base)
    dc_podcasts_pages.extend(episodes_hrefs)

print(dc_podcasts_pages)
len(dc_podcasts_pages)

['https://doctrineandcovenantscentral.org/podcast-episode/why-are-there-different-accounts-of-the-first-vision', 'https://doctrineandcovenantscentral.org/podcast-episode/whats-unique-about-joseph-smiths-1835-and-1838-accounts-of-his-first-vision', 'https://doctrineandcovenantscentral.org/podcast-episode/how-did-orson-pratt-influence-joseph-smiths-1842-first-vision-narrative', 'https://doctrineandcovenantscentral.org/podcast-episode/how-do-2nd-and-3rd-hand-accounts-add-to-our-understanding-of-the-first-vision', 'https://doctrineandcovenantscentral.org/podcast-episode/qr-what-about-others-who-claimed-similar-visions-in-joseph-smiths-day-etc', 'https://doctrineandcovenantscentral.org/podcast-episode/joseph-smiths-second-vision-and-the-coming-forth-of-the-book-of-mormon', 'https://doctrineandcovenantscentral.org/podcast-episode/joseph-smiths-creative-efforts-to-outsource-the-book-of-mormon-translation', 'https://doctrineandcovenantscentral.org/podcast-episode/translating-the-book-of-mormon

40

In [8]:
# Download every harvested episode page that is not already stored on disk.
for url in dc_podcasts_pages:
    # Drop a single trailing slash so the last path segment is the episode slug.
    url = url[:-1] if url.endswith('/') else url
    path_file = get_path(url)
    print(path_file)

    # Files already on disk are left untouched, so the cell can be re-run
    # after an interruption.
    if not os.path.exists(path_file):
        status_code, html = get_page(url, delay_seconds)
        if status_code == 200:
            save_page(path_file,url,html)
        else:
            print("Error!", status_code , url)

print("End")

../data/raw/dc_podcasts/why-are-there-different-accounts-of-the-first-vision.json


../data/raw/dc_podcasts/whats-unique-about-joseph-smiths-1835-and-1838-accounts-of-his-first-vision.json
../data/raw/dc_podcasts/how-did-orson-pratt-influence-joseph-smiths-1842-first-vision-narrative.json
../data/raw/dc_podcasts/how-do-2nd-and-3rd-hand-accounts-add-to-our-understanding-of-the-first-vision.json
../data/raw/dc_podcasts/qr-what-about-others-who-claimed-similar-visions-in-joseph-smiths-day-etc.json
../data/raw/dc_podcasts/joseph-smiths-second-vision-and-the-coming-forth-of-the-book-of-mormon.json


KeyboardInterrupt: 