In [122]:
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import time
import re
import json

In [28]:
# Index page of the October 2022 General Conference.
# NOTE(review): no cell visible in this export reads this global - confirm it is still needed.
url = "https://www.churchofjesuschrist.org/study/general-conference/2022/10?lang=eng"

In [29]:
def get_all_talk_links(general_conference_link):
    """Collect absolute talk URLs from a General Conference index page.

    Args:
        general_conference_link: URL of a conference index page.

    Returns:
        A list of absolute URLs, in page order, that contain
        "/study/general-conference/". They are built from the anchors whose
        href mentions neither "session" nor "lib"; the first such anchor is
        discarded.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(general_conference_link, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    # Anchors without an href yield None; drop them so the substring checks
    # below cannot raise TypeError.
    hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
    hrefs = [href for href in hrefs if href is not None]

    candidates = [href for href in hrefs if "session" not in href and "lib" not in href]
    # Skip the first remaining anchor; slicing, unlike pop(0), tolerates a
    # page that produced no candidates at all.
    candidates = candidates[1:]

    base = "https://www.churchofjesuschrist.org"
    # Only site-relative hrefs need the host prepended; an href that is
    # already absolute would otherwise end up with two hosts.
    absolute = [href if href.startswith("http") else base + href for href in candidates]
    return [link for link in absolute if "/study/general-conference/" in link]

In [30]:
def get_links(list_element):
    """Return ``(href, text)`` for the first anchor inside *list_element*.

    The element is converted to its string form and parsed again, so any
    object whose ``str()`` is HTML markup is accepted. When the markup holds
    no anchor, subscripting the ``None`` result raises TypeError.
    """
    first_anchor = BeautifulSoup(str(list_element), 'html.parser').find('a')
    return first_anchor['href'], first_anchor.text

In [31]:
def get_old_links(soup):
    """Gather the href and text of every ``<a class="scripture-ref">`` in *soup*.

    Returns:
        ``(links, texts)``: two parallel lists in document order.
    """
    pairs = [
        (anchor['href'], anchor.text)
        for anchor in soup.find_all('a', class_='scripture-ref')
    ]
    hrefs = [href for href, _ in pairs]
    labels = [label for _, label in pairs]
    return hrefs, labels

In [32]:
def get_title(html):
    """Extract a talk's title and author from its page source.

    Args:
        html: Page source to parse.

    Returns:
        ``(title, author)``. The title is the stripped text of
        ``<h1 id="title1">``; the author is the stripped text of the first
        ``<p>`` inside ``<div class="byline">`` with non-breaking spaces
        replaced by plain spaces. Each value falls back independently to
        "Unknown Title" / "Unknown Author" when its element is missing.
    """
    soup = BeautifulSoup(html, 'html.parser')
    title = "Unknown Title"
    author = "Unknown Author"

    # Look the two elements up separately so a missing byline no longer
    # discards a title that was found (and vice versa).
    title_elem = soup.find('h1', {'id': 'title1'})
    if title_elem is not None:
        title = title_elem.text.strip()

    byline = soup.find("div", {"class": "byline"})
    author_elem = byline.find("p") if byline is not None else None
    if author_elem is not None:
        author = author_elem.text.strip().replace('\xa0', ' ')

    return title, author

In [33]:

def get_refs(url):
    """Download a talk page and collect the links it cites.

    Every ``<li>`` whose id contains "note" contributes the href and text of
    its first anchor. If that yields nothing, the ``scripture-ref`` anchors
    found by ``get_old_links`` are used instead.

    Args:
        url: Address of the talk page.

    Returns:
        A dict with keys "title", "author", "url", "links" and "texts";
        "links" and "texts" are parallel lists.
    """
    # A timeout keeps one stalled connection from hanging the caller.
    response = requests.get(url, timeout=30)
    html = response.content
    soup = BeautifulSoup(html, 'html.parser')
    # find all elements with 'id' attribute containing the string 'note'
    notes = soup.find_all('li', id=lambda x: x and 'note' in x)
    links = []
    texts = []
    for note in notes:
        try:
            link, text = get_links(note)
        except (TypeError, KeyError):
            # The note holds no anchor, or its first anchor has no href.
            continue
        links.append(link)
        texts.append(text)

    if not links:
        # Fall back to scripture-ref anchors (a single call; the result of the
        # former duplicate call was thrown away).
        links, texts = get_old_links(soup)

    title, author = get_title(html)

    return {
        "title": title,
        "author": author,
        "url": url,
        "links": links,
        "texts": texts
    }



In [34]:
def gc_link(year, month):
    """Build the English index-page URL of one General Conference.

    Args:
        year: Conference year, as an int or a string.
        month: Conference month, as an int (4, 10) or a string ("04", "10").
            Values shorter than two characters are left-padded with zeros.

    Returns:
        The URL as a string.
    """
    # Coerce both parts to text; previously only the year was converted, so an
    # integer month raised TypeError.
    year = str(year)
    month = str(month).zfill(2)
    return f"https://www.churchofjesuschrist.org/study/general-conference/{year}/{month}?lang=eng"

def get_all_general_conference_links(start_year, end_year):
    """List the April and October index-page URLs for a span of years.

    Both *start_year* and *end_year* are included. URLs are ordered by year,
    with April ("04") before October ("10") within each year.
    """
    return [
        gc_link(year, month)
        for year in range(start_year, end_year + 1)
        for month in ("04", "10")
    ]



In [35]:
# Index-page URLs for the April and October conferences of every year from 1971 through 2022.
gcs = get_all_general_conference_links(1971, 2022)

In [36]:
# Scrape every conference index page and pool the talk links it yields.
# This issues one HTTP request per entry of gcs, back to back.
talks = []
for gc in gcs:
    talks += get_all_talk_links(gc)

In [37]:
# De-duplicate the scraped links, then drop every URL containing
# "10?lang=eng" or "04?lang=eng" (the conference index pages themselves).
unique_talks = list(set(talks))
for substring in ("10?lang=eng", "04?lang=eng"):
    unique_talks = [link for link in unique_talks if substring not in link]
print(len(unique_talks))
talks = unique_talks

4016


In [41]:
# Sort in place, descending by URL string. The URLs embed /YEAR/MONTH/, so
# the most recent conferences come first.
talks.sort(reverse=True)

In [43]:
# First 500 URLs of the sorted list.
# NOTE(review): not read by any later cell shown here (batches[0] is used instead) - confirm it is still needed.
last_five_hundred = talks[0:500]

In [20]:
def batchify_list(lst, batch_size):
    """Lazily yield consecutive slices of *lst*, each at most *batch_size* long.

    The final slice holds whatever remains and may be shorter.
    """
    starts = range(0, len(lst), batch_size)
    yield from (lst[start:start + batch_size] for start in starts)
# Split the talk URLs into consecutive batches of at most 500.
batches = list(batchify_list(talks, 500))

In [21]:
def get_page_source_from_links(links):
    """Download each URL in *links* and return ``{url: page source}``.

    The function sleeps one second before every request. A URL whose request
    raises a ``requests`` exception, or whose response status is not 200, is
    reported with ``print`` and left out of the result.

    Args:
        links: Iterable of URLs to fetch.

    Returns:
        Dict mapping each successfully fetched URL to ``response.text``.
    """
    page_sources = {}
    for link in links:
        time.sleep(1)  # throttle to one request per second
        try:
            response = requests.get(link, timeout=30)
        except requests.RequestException as exc:
            # Report and move on, so one network error does not throw away
            # every page already downloaded in this batch.
            print(f"Failed to retrieve page source for {link}. Error: {exc}")
            continue
        if response.status_code == 200:
            # NOTE(review): response.text is decoded with the encoding requests
            # infers. The 'â\x80\x93' sequences in this notebook's saved output
            # suggest that guess was not UTF-8; decode_str further down
            # compensates, so confirm both before changing either.
            page_sources[link] = response.text
        else:
            print(f"Failed to retrieve page source for {link}. Status code: {response.status_code}")
    return page_sources

In [22]:
# Download only the first batch (at most 500 talks). With the one-second pause
# before each request this takes at least several minutes.
page_sources1 = get_page_source_from_links(batches[0])

In [1]:
import pickle
# Cache file for the downloaded page sources, relative to the working directory.
file_path = "talks_page_source.pkl"

In [32]:
# Persist the downloaded batch so a later session can skip the download.
with open(file_path, 'wb') as f:
    pickle.dump(page_sources1, f)

In [2]:
# Reload the cached page sources. Unpickling can execute arbitrary code, so
# only load a cache file that this notebook wrote itself.
with open(file_path, 'rb') as f:
    page_sources = pickle.load(f)


In [4]:
# Spot-check: show the cached page source stored under one talk URL.
# NOTE(review): this displays the whole page and raises KeyError if the URL is not in the cache.
page_sources['https://www.churchofjesuschrist.org/study/general-conference/1971/04/life-is-eternal?lang=eng']



In [8]:
from bs4 import BeautifulSoup

In [None]:
# NOTE(review): never-executed scratch cell. It looks up the empty-string key,
# which presumably is not in the cache and would raise KeyError - consider removing.
page_sources['']

In [26]:
# Number of cached pages.
len(page_sources)

500

In [108]:
def get_notes(page_source):
    """Map each footnote id to the scripture links that footnote contains.

    Args:
        page_source: HTML of a talk page.

    Returns:
        ``None`` when the page has no ``<footer class="notes">``. Otherwise a
        dict ``{note id: [(href, link text), ...]}`` covering every ``<li>``
        under the footer's first ``<ol>`` that has at least one link whose
        href contains "/study/scriptures/". Hrefs starting with "/study/" are
        made absolute. The dict is empty when the footer has no ``<ol>``.
    """
    soup = BeautifulSoup(page_source, 'html.parser')

    footer_element = soup.find('footer', class_='notes')
    if footer_element is None:
        return None
    ol_element = footer_element.find('ol')
    if ol_element is None:
        # Footer present but without a list: nothing to map (this used to
        # raise AttributeError).
        return {}

    base = 'https://www.churchofjesuschrist.org'
    notes = {}
    for li_element in ol_element.find_all('li'):
        scripture_links = []
        # href=True skips anchors without an href, which used to raise KeyError.
        for anchor in li_element.find_all('a', href=True):
            href = anchor['href']
            if href.startswith('/study/'):
                href = base + href
            if "/study/scriptures/" in href:
                scripture_links.append((href, anchor.text.strip()))
        if scripture_links:
            note_id = str(li_element.get("id"))
            notes[note_id] = scripture_links

    return notes
    

In [109]:
# Spot-check the footnote extraction on one cached talk.
get_notes(page_sources['https://www.churchofjesuschrist.org/study/general-conference/2022/10/56morrison?lang=eng'])

{'note1': [('https://www.churchofjesuschrist.org/study/scriptures/nt/john/9.2-3?lang=eng#p2',
   'John 9:2â\x80\x933')],
 'note2': [('https://www.churchofjesuschrist.org/study/scriptures/the-family-a-proclamation-to-the-world/the-family-a-proclamation-to-the-world?lang=eng&verse=7#p7',
   'The Family: A Proclamation to the World')],
 'note3': [('https://www.churchofjesuschrist.org/study/scriptures/bofm/mosiah/24.14-15?lang=eng#p14',
   'Mosiah 24:14â\x80\x9315')],
 'note4': [('https://www.churchofjesuschrist.org/study/scriptures/nt/philip/4.13?lang=eng#p13',
   'Philippians 4:13')],
 'note7': [('https://www.churchofjesuschrist.org/study/scriptures/nt/john/16.33?lang=eng#p33',
   'John 16:33')],
 'note8': [('https://www.churchofjesuschrist.org/study/scriptures/ot/job/27.5?lang=eng#p5',
   'Job 27:5')],
 'note9': [('https://www.churchofjesuschrist.org/study/scriptures/dc-testament/dc/121.1?lang=eng#p1',
   'Doctrine and Covenants 121:1')],
 'note10': [('https://www.churchofjesuschrist.or

In [170]:
def get_elements(page_source, link):
    """Extract a talk's metadata and the scripture references in its body.

    A paragraph contributes one reference per ``scripture-ref`` anchor it
    contains, plus one per scripture link of every footnote it points to
    (anchors whose href ends in ``note<digits>``, resolved via ``get_notes``).

    Args:
        page_source: HTML of the talk page.
        link: URL of the talk; copied into the result unchanged.

    Returns:
        Dict with keys 'title', 'date', 'link', 'author', 'role', 'kicker'
        and 'references'. Metadata values are ``None`` when their element is
        missing. Each reference is a dict with 'link', 'text', 'content' (the
        paragraph's text) and 'html' (the paragraph's prettified markup).
        'references' is empty when the page has no body block.
    """
    soup = BeautifulSoup(page_source, 'html.parser')
    # Make site-relative /study/ links absolute.
    for a_tag in soup.find_all('a', href=True):
        if a_tag['href'].startswith('/study/'):
            a_tag['href'] = 'https://www.churchofjesuschrist.org' + a_tag['href']

    title_element = soup.find('h1', id='title1')
    author_element = soup.find('p', class_="author-name")
    role_element = soup.find('p', class_="author-role")
    kicker_element = soup.find('p', id='kicker1')
    kicker = kicker_element.text if kicker_element else None
    # NOTE(review): 'catalogTitle-KCT7_' looks like a generated class name and
    # may change between site builds - confirm.
    date_element = soup.find('div', class_='catalogTitle-KCT7_')
    notes = get_notes(page_source)

    references = []
    body = soup.find("div", class_="body-block")
    if body is None:
        body = soup.find("div", class_="body")
    # A page with neither container has no paragraphs to scan (this used to
    # raise AttributeError).
    paragraphs = body.find_all("p") if body is not None else []

    # Compiled once, not once per paragraph.
    note_pattern = re.compile(r"note(\d+)$")
    for p in paragraphs:
        # href=True skips scripture-ref anchors without an href (formerly KeyError).
        scripture_anchors = p.find_all('a', class_='scripture-ref', href=True)
        note_anchors = p.find_all('a', href=note_pattern)
        if not scripture_anchors and not note_anchors:
            continue

        # Paragraph text and markup are shared by all of its references.
        content = p.get_text(strip=True)
        html = p.prettify()

        for a in scripture_anchors:
            references.append({
                'link': a['href'],
                'text': a.text.strip(),
                'content': content,
                'html': html
            })
        for a in note_anchors:
            # e.g. "note12", the same form as the keys returned by get_notes
            number = note_pattern.search(a['href']).group()
            if notes is not None and number in notes:
                for refs in notes[number]:
                    references.append({
                        'link': refs[0],
                        'text': refs[1],
                        'content': content,
                        'html': html
                    })

    return {
        'title': title_element.text if title_element is not None else None,
        'date': date_element.text if date_element is not None else None,
        'link': link,
        'author': author_element.text if author_element is not None else None,
        'role': role_element.text if role_element is not None else None,
        'kicker': kicker,
        'references': references
    }

In [171]:
# Parse every cached page into its metadata and scripture references.
talk_references = [
    get_elements(source, talk_url)
    for talk_url, source in page_sources.items()
]

In [172]:
# From the extracting_cfm_and_institute_manual.ipynb
# URL path segment of each Book of Mormon book -> display name.
bom_abbreviations = {
    '1-ne' : '1 Nephi',
    '2-ne' : '2 Nephi',
    'jacob' : 'Jacob',
    'enos' : 'Enos',
    'jarom' : 'Jarom',
    'omni' : 'Omni',
    'w-of-m' : 'Words of Mormon',
    'mosiah' : 'Mosiah',
    'alma' : 'Alma',
    'hel' : 'Helaman',
    '3-ne' : '3 Nephi',
    '4-ne' : '4 Nephi',
    'morm' : 'Mormon',
    'ether' : 'Ether',
    'moro' : 'Moroni'
}

# Abbreviation of each standard work -> display name.
# NOTE(review): the scripture URLs in this notebook's output use 'bofm' as the
# path segment, not 'bom', and no cell shown here reads scripture_map -
# confirm the key before relying on it.
scripture_map = {
    'bom' : 'Book of Mormon',
    'ot' : 'Old Testament',
    'nt' : 'New Testament',
    'pgp' : 'Pearl of Great Price',
    'dc-testament' : 'Doctrine and Covenants'
}

# NOTE(review): this rebinds file_path, which earlier cells use for the pickle
# cache - re-running those cells after this one would target the JSON file.
file_path = 'scriptures-json/book-of-mormon.json'
bom_chapters = {}
# Read the JSON explicitly as UTF-8 so the result does not depend on the
# platform's default encoding.
with open(file_path, 'r', encoding='utf-8') as json_file:
    book_of_mormon_data = json.load(json_file)

# Book display name -> number of chapters in that book.
for book in book_of_mormon_data['books']:
    bom_chapters[book['book']] = len(book['chapters'])



In [173]:
def extract_scripture_info(link):
    """Split a Book of Mormon scripture URL into work, book, chapter and verses.

    Args:
        link: Absolute scripture URL such as
            "https://www.churchofjesuschrist.org/study/scriptures/bofm/mosiah/24.14-15?lang=eng#p14".

    Returns:
        ``(work, book, chapter, verses)`` where *verses* is a list of verse
        numbers as strings ("14-15" expands to ["14", "15"]; comma-separated
        parts are each handled this way). Returns
        ``(None, None, None, None)`` when the link has no book/chapter
        segments or the book is not a key of ``bom_abbreviations``.
    """
    # Drop the query string and any fragment ("?lang=eng", "#p14").
    path = link.split('?')[0].split('#')[0]

    # ['https:', '', host, 'study', 'scriptures', work, book, 'chapter.verses']
    link_parts = path.split('/')
    if len(link_parts) < 8:
        # Too short to name a book and chapter (this used to raise IndexError).
        return None, None, None, None

    work = link_parts[5]
    book = link_parts[6]
    if book not in bom_abbreviations:
        return None, None, None, None

    part = link_parts[7].split('.')
    chapter = part[0]
    if len(part) < 2:
        # NOTE(review): whole-chapter reference. bom_chapters holds the number
        # of chapters in the book, not the number of verses in this chapter,
        # so this range is a placeholder carried over unchanged. Replace it
        # with the real verse count once the JSON schema is confirmed.
        chapter_count = bom_chapters[bom_abbreviations[book]]
        return work, book, chapter, [str(verse) for verse in range(1, chapter_count)]

    individual_verses = []
    for verse_range in part[1].split(','):
        if '-' in verse_range:
            bounds = verse_range.split('-')
            start_verse = int(bounds[0])
            end_verse = int(bounds[1])
            if start_verse == 0 and end_verse == 0:
                # "0-0" contributes nothing.
                continue
            # A reversed range ("5-3") contributes only its first verse.
            last_verse = max(start_verse, end_verse)
            # Emit every verse as str. Mixing '14' with the int 15 produced
            # keys that collide once the map is written as JSON.
            individual_verses.extend(str(verse) for verse in range(start_verse, last_verse + 1))
        else:
            individual_verses.append(verse_range)

    return work, book, chapter, individual_verses

In [187]:
def decode_str(value):
    """Repair UTF-8 text that was mis-decoded as Latin-1 (mojibake).

    Args:
        value: A string, or ``None``.

    Returns:
        ``None`` for ``None``. Otherwise the string re-encoded as Latin-1 and
        decoded as UTF-8 (so 'â\\x80\\x93' becomes an en dash). Text that is
        not such mojibake is returned unchanged instead of raising.
    """
    if value is None:
        return None
    try:
        return value.encode('latin-1').decode('utf-8')
    except UnicodeError:
        # Either the text holds characters outside Latin-1 (already decoded
        # correctly) or its Latin-1 bytes are not valid UTF-8. In both cases
        # there is nothing to repair.
        return value

In [188]:
# Build work -> book -> chapter -> verse -> [talk excerpts citing that verse].
bom_talk_ref_map = {}
for talk in talk_references:
    for reference in talk['references']:
        work, book, chapter, verses = extract_scripture_info(reference['link'])
        if work is None:
            # Not a Book of Mormon reference that could be parsed.
            continue
        current_level = bom_talk_ref_map
        for level in (work, book, chapter):
            current_level = current_level.setdefault(level, {})
        for verse in verses:
            # Normalise verse keys to str so 15 and '15' can never become two
            # separate keys that collide when the map is written as JSON.
            current_level.setdefault(str(verse), []).append({
                'title': decode_str(talk['title']),
                'author': decode_str(talk['author']),
                'role': decode_str(talk['role']),
                'kicker': decode_str(talk['kicker']),
                'date': decode_str(talk['date']),
                'url': decode_str(talk['link']),

                'link': decode_str(reference['link']),
                'scripture': decode_str(reference['text']),
                'content': decode_str(reference['content']),
                'html': decode_str(reference['html']),
            })

In [191]:
# Spot-check: scripture label of the first excerpt recorded for 1 Nephi 1:1.
print(bom_talk_ref_map['bofm']['1-ne']['1']['1'][0]['scripture'])

1 Ne. 1:1


In [192]:
# Write the map as UTF-8 JSON; ensure_ascii=False keeps non-ASCII characters
# as-is, not as \uXXXX escapes.
with open("bom_talk_map.json", 'w', encoding='utf8') as f:
    json.dump(bom_talk_ref_map, f, ensure_ascii=False)

TypeError: load() missing 1 required positional argument: 'fp'