# Crawl Pearl of Great Price articles (scripture of the Church of Jesus Christ of Latter-day Saints) from pearlofgreatpricecentral.org

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from models.crawl_utils import get_page, save_page

In [None]:
# Crawl configuration.
# Category listing pages used as crawl entry points.
hosts = [
    'https://pearlofgreatpricecentral.org/category/book-of-abraham/',
    'https://pearlofgreatpricecentral.org/category/joseph-smith-history/',
]
# Directory under which downloaded pages are stored.
base_dir = '../data/raw/pearl_of_great_price'
# Name of the BeautifulSoup parser backend.
bs_parser = 'html.parser'
# Passed as the second argument to get_page when downloading pages.
delay_seconds = 5
# exist_ok=True creates the directory only when it is missing, avoiding the
# separate exists-then-create steps that can race with another process.
os.makedirs(base_dir, exist_ok=True)

In [None]:
def extract_hrefs_from_elementor_div(soup, base_url):
    """Collect the unique links inside the page's post-listing container.

    Args:
        soup: Parsed document; only its ``find`` method is used here.
        base_url: URL against which each ``href`` value is resolved.

    Returns:
        list[str]: Absolute URLs in first-seen order with duplicates removed.
        Empty when the document has no ``div`` with class
        ``elementor-posts-container``.
    """
    elementor_div = soup.find('div', class_='elementor-posts-container')
    if not elementor_div:
        return []
    # Dict keys keep insertion order and detect duplicates in O(1), instead
    # of scanning the result list once per link.
    unique_urls = dict.fromkeys(
        urljoin(base_url, a_tag['href'])
        for a_tag in elementor_div.find_all('a', href=True)
    )
    return list(unique_urls)

In [None]:
def extract_next_sibling_href(soup, base_url):
    """Return the absolute URL of the link that follows the current-page marker.

    The marker is the first ``span`` carrying both the ``page-numbers`` and
    ``current`` classes; the link is the first later sibling ``a`` element
    that has an ``href`` attribute.

    Args:
        soup: Parsed document; only its ``select_one`` method is used here.
        base_url: URL against which the ``href`` value is resolved.

    Returns:
        str | None: The resolved URL, or ``None`` when either the marker or
        a following link is missing.
    """
    # A CSS selector matches the two classes in any order and alongside
    # extra classes; find(class_='page-numbers current') only matches that
    # exact attribute string.
    current_marker = soup.select_one('span.page-numbers.current')
    if current_marker is None:
        return None
    next_anchor = current_marker.find_next_sibling('a', href=True)
    if next_anchor is None:
        return None
    return urljoin(base_url, next_anchor['href'])

In [None]:
def fetch_and_extract_hrefs(start_url, base_url, max_pages=10):
    """Walk a paginated listing and gather the links found on each page.

    Starting at ``start_url``, each page is fetched with ``get_page``, its
    links are collected with ``extract_hrefs_from_elementor_div`` and the
    next page is taken from ``extract_next_sibling_href``. The walk stops
    when no next page is found, when a fetch returns a status other than
    200, or after ``max_pages`` pages.

    Args:
        start_url: URL of the first listing page.
        base_url: URL against which extracted ``href`` values are resolved.
        max_pages: Upper bound on the number of pages fetched.

    Returns:
        list[str]: Links in first-seen order, de-duplicated across all
        visited pages. Links gathered before a failed fetch are still
        returned.
    """
    # Dict keys give ordered de-duplication across pages, matching the
    # per-page de-duplication done by the extractor.
    collected = {}
    page_url = start_url
    for _ in range(max_pages):
        # NOTE(review): no delay argument is passed to get_page here, unlike
        # the download loop that passes delay_seconds; confirm this is intended.
        status_code, html = get_page(page_url)
        if status_code != 200:
            print(f"Failed to fetch page: {page_url}")
            break
        # Use the configured parser rather than a second hard-coded literal.
        soup = BeautifulSoup(html, bs_parser)
        collected.update(
            dict.fromkeys(extract_hrefs_from_elementor_div(soup, base_url))
        )
        page_url = extract_next_sibling_href(soup, base_url)
        if not page_url:
            print("No next page found. Exiting loop.")
            break
    return list(collected)

In [None]:
# Gather the links from every configured listing, using each listing URL
# both as the starting page and as the base for resolving hrefs.
all_hrefs = [
    href
    for start_url in hosts
    for href in fetch_and_extract_hrefs(start_url, start_url)
]
print(all_hrefs)

In [None]:
def get_path(url, directory=None):
    """Build the JSON file path under which the page at ``url`` is stored.

    The file name is the last non-empty segment of the URL path, so
    ``/my-post/`` and ``/my-post`` both map to ``my-post.json``. Query
    string and fragment are ignored.

    Args:
        url: Page URL.
        directory: Target directory; defaults to the module-level
            ``base_dir`` when omitted.

    Returns:
        str: ``<directory>/<last-path-segment>.json``.
    """
    if directory is None:
        directory = base_dir
    # Drop trailing slashes first: indexing the second-to-last split element
    # only works when the URL ends with exactly one slash.
    slug = urlparse(url).path.rstrip('/').rpartition('/')[2]
    return os.path.join(directory, f"{slug}.json")

In [None]:
# Download each collected URL unless its output file already exists.
for url in all_hrefs:
    target_path = get_path(url)
    print(target_path)
    if not os.path.exists(target_path):
        status_code, html = get_page(url, delay_seconds)
        if status_code == 200:
            save_page(target_path, url, html)
        else:
            # Report the failure and move on to the next URL.
            print("Error!", status_code, url)
print("End")