# Crawl FAIR (fairlatterdaysaints.org)

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from models.crawl_utils import get_page, save_page

In [None]:
# config
# Index page of the wiki; not referenced by the other visible cells.
host = 'https://www.fairlatterdaysaints.org/answers/Special:AllPages'
# Directory that receives one JSON file per crawled page (see get_path).
base_dir = '../data/raw/fair'
# Parser name handed to BeautifulSoup.
bs_parser = 'html.parser'
# Value passed as the second argument of every get_page call.
delay_seconds = 5
# Seed pages: they are downloaded themselves and their list links are harvested.
approved_links = [
    'https://www.fairlatterdaysaints.org/answers/Criticism_of_Mormonism/Online_documents/Letter_to_a_CES_Director',
    'https://www.fairlatterdaysaints.org/answers/Sarah_Allen_CES_Response_Posts',
    'https://www.fairlatterdaysaints.org/answers/Sarah_Allen%27s_Response_to_Letter_For_My_Wife',
]

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(base_dir, exist_ok=True)

In [None]:
def get_path(url):
    """Return the JSON file path inside ``base_dir`` for *url*.

    The file is named after the last segment of the URL's path component;
    a single trailing slash on *url* is dropped before the segment is taken.
    """
    trimmed = url[:-1] if url.endswith('/') else url
    # Only the path part is considered, so query strings and fragments
    # never end up in the file name.
    last_segment = urlparse(trimmed).path.rsplit('/', 1)[-1]
    return os.path.join(base_dir, f"{last_segment}.json")
    

In [None]:
def get_links(soup, base_url):
    """Collect the distinct link targets found inside lists of *soup*.

    Every ``<a href=...>`` located in an ``<ol>`` or ``<ul>`` element is
    resolved against *base_url* and stripped of its ``#fragment``.  The
    number of lists and the number of anchors per list are printed as
    progress output.  The result is a list without duplicates, in no
    particular order.
    """
    containers = soup.find_all(['ol', 'ul'])
    print('list_tags', len(containers))
    found = set()
    for container in containers:
        anchors = container.find_all('a', href=True)
        print('a_tags', len(anchors))
        # Resolve relative hrefs and drop the fragment so that anchors
        # pointing into the same page collapse to one URL.
        found.update(
            urljoin(base_url, anchor['href']).split('#')[0]
            for anchor in anchors
        )
    return list(found)

In [None]:
def extract_urls_from_approved_links(approved_links, delay_seconds=5):
    """Harvest the list links of every page in *approved_links*.

    Each approved page is fetched with ``get_page``; pages answering with
    status 200 are parsed and their links gathered through ``get_links``.
    Pages with any other status are reported and skipped.

    Args:
        approved_links: iterable of seed page URLs.
        delay_seconds: forwarded unchanged to ``get_page``
            (presumably a politeness delay -- confirm in crawl_utils).

    Returns:
        A list of the distinct URLs found across all seed pages, in no
        particular order.
    """
    extracted_urls = set()

    for link in approved_links:
        status_code, html = get_page(link, delay_seconds)
        if status_code != 200:
            # Same report format as the download loop uses for failures.
            print("Error!", status_code, link)
            continue
        soup = BeautifulSoup(html, bs_parser)
        # No early exit: every approved page contributes its links.
        extracted_urls.update(get_links(soup, link))
    return list(extracted_urls)

In [None]:
# Gather candidate URLs from the approved seed pages.
extracted_links = extract_urls_from_approved_links(
    approved_links,
    delay_seconds=delay_seconds,
)

In [None]:
# Show every harvested URL, one per line (nothing is printed when empty).
if extracted_links:
    print('\n'.join(extracted_links))

In [None]:
def filtered_extracted_urls(extracted_links):
    """Keep only the crawlable FAIR article URLs from *extracted_links*.

    A link is kept when all of the following hold:

    * its host is ``fairlatterdaysaints.org`` or a subdomain of it,
    * its path starts with ``/answers`` or ``/blog``,
    * its path does not contain ``Category:``.

    Links without a host (``mailto:``, ``javascript:``, relative
    references) are dropped instead of raising.  Every kept link is
    printed.

    Args:
        extracted_links: iterable of URL strings.

    Returns:
        The kept links as a list, in input order.
    """
    domain = "fairlatterdaysaints.org"
    filtered_urls = []
    for link in extracted_links:
        url = urlparse(link)
        # hostname is None when the URL has no network location.
        hostname = url.hostname or ""
        # Require an exact match or a real subdomain so that look-alike
        # hosts such as "evilfairlatterdaysaints.org" are rejected.
        if hostname != domain and not hostname.endswith("." + domain):
            continue
        if not url.path.startswith(("/answers", "/blog")):
            continue
        if "Category:" in url.path:
            continue
        filtered_urls.append(link)
        print(link)
    return filtered_urls

In [None]:
# Reduce the harvested links to the FAIR pages that should be downloaded.
filtered_urls = filtered_extracted_urls(extracted_links=extracted_links)

In [None]:
# Show every URL that survived filtering, one per line.
if filtered_urls:
    print('\n'.join(filtered_urls))

In [None]:
# Download each seed page and each filtered link, skipping anything whose
# target file is already present in base_dir.
for href in approved_links + filtered_urls:
    path_file = get_path(href)
    print(href, path_file)
    if not os.path.exists(path_file):
        status_code, html = get_page(href, delay_seconds)
        if status_code == 200:
            save_page(path_file, href, html)
        else:
            # Report the failure and move on to the next URL.
            print("Error!", status_code, href)