In [1]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

In [2]:
import os
import time
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin
from markdownify import markdownify as md

In [3]:
# Configuration
host = 'https://www.churchofjesuschrist.org/study/manual/general-handbook?lang=eng'
base_dir = '../handbook/'
bs_parser = 'html.parser'
markdown_dir = '../data/'
delay_seconds = 5
output_markdown_file = os.path.join(markdown_dir, 'merged_handbook.md')

if not os.path.exists(base_dir):
    os.makedirs(base_dir)
if not os.path.exists(markdown_dir):
    os.makedirs(markdown_dir)

In [4]:
def get_links(url):
    """Get all the links from the provided URL."""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, bs_parser)
    
    links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        full_url = urljoin(host, href)
        links.append(full_url)
    
    return links

In [5]:
def get_page_content(url):
    """Get the content of the provided URL."""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, bs_parser)
    
    content = {
        "url": url,
        "title": soup.title.string if soup.title else "No title",
        "content": soup.get_text(separator='\n')
    }
    
    return content

# Get all links from the base URL
links = get_links(host)

In [6]:
# Get all links from the base URL
links = get_links(host)
links

for link in links:
    try:
        page_content = get_page_content(link)
        filename_base = page_content['title'].replace(' ', '_')
        
        # Save as JSON
        json_filename = os.path.join(base_dir, f"{filename_base}.json")
        with open(json_filename, 'w') as f:
            json.dump(page_content, f, indent=4)
        print(f"Content saved to {json_filename}")
        
        # Wait for the specified delay
        time.sleep(delay_seconds)
        
    except Exception as e:
        print(f"Failed to retrieve content from {link}: {e}")

Content saved to ../handbook/Handbooks_and_Callings.json
Content saved to ../handbook/General_Handbook:_Serving_in_The_Church_of_Jesus_Christ_of_Latter-day_Saints.json
Content saved to ../handbook/General_Handbook:_Serving_in_The_Church_of_Jesus_Christ_of_Latter-day_Saints.json
Content saved to ../handbook/Summary_of_Recent_Updates.json
Content saved to ../handbook/16._Living_the_Gospel_of_Jesus_Christ.json
Content saved to ../handbook/General_Handbook:_Serving_in_The_Church_of_Jesus_Christ_of_Latter-day_Saints.json
Content saved to ../handbook/General_Handbook:_Serving_in_The_Church_of_Jesus_Christ_of_Latter-day_Saints.json
Content saved to ../handbook/Summary_of_Recent_Updates.json
Content saved to ../handbook/0._Introductory_Overview.json
Content saved to ../handbook/0._Introductory_Overview.json
Content saved to ../handbook/0._Introductory_Overview.json
Content saved to ../handbook/0._Introductory_Overview.json
Content saved to ../handbook/0._Introductory_Overview.json
Content save

In [None]:
# Merging JSON content into a single markdown file
def load_json_files(directory):
    """Load all JSON files from the specified directory."""
    json_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.json')]
    return json_files

def extract_content(json_files):
    """Extract 'title' and 'content' from each JSON file."""
    all_content = []
    for file in json_files:
        with open(file, 'r') as f:
            data = json.load(f)
            title = data.get('title', 'No title')
            content = data.get('content', '')
            all_content.append((title, content))
    return all_content

def format_to_markdown(contents):
    """Format the extracted content into a markdown string."""
    markdown_str = ''
    for title, content in contents:
        markdown_str += f"# {title}\n\n"
        markdown_str += f"{content}\n\n"
    return markdown_str

def save_markdown(content, file_path):
    """Save the markdown content to a file."""
    with open(file_path, 'w') as f:
        f.write(content)


In [None]:
# Load JSON files
json_files = load_json_files(base_dir)

In [None]:
# Extract content
contents = extract_content(json_files)


In [None]:
# Format to markdown
markdown_content = format_to_markdown(contents)

In [None]:
# Save to a single markdown file
save_markdown(markdown_content, output_markdown_file)