# Crawl Magazines from the Church of Jesus Christ of Latter-day Saints

In [1]:
%load_ext autoreload
%autoreload 2

In [28]:
import os
import time
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from tqdm.auto import tqdm

from models.crawl_utils import get_page, save_page

In [52]:
# config
magazine_urls = [
    # 'https://www.churchofjesuschrist.org/study/magazines/liahona?lang=eng',
    # 'https://www.churchofjesuschrist.org/study/magazines/ya-weekly?lang=eng',
    # 'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth?lang=eng',
    'https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth/new-era-19712020?lang=eng',
    # 'https://www.churchofjesuschrist.org/study/magazines/friend?lang=eng',
    #'https://www.churchofjesuschrist.org/study/magazines/ensign-19712020?lang=eng'
]
base_dir = '../data/load/raw'
bs_parser = 'html.parser'
seconds_delay = 3

In [70]:
def _is_issue_link(url: str) -> bool:
    path_components = urlparse(url).path.split('/')
    # print('is_issue_link', url, path_components)
    if len(path_components) < 5:
        return False
    elif path_components[4] == 'new-era-19712020':
        # new-era issue links must have 6 path components
        return len(path_components) == 6
    else:
        # all other issue links must have 5 components (first component is empty)
        return len(path_components) == 5


def get_issue_links(base_url, html):
    soup = BeautifulSoup(html, bs_parser)
    return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True) \
            if _is_issue_link(urljoin(base_url, a['href']))]

def get_year_month_links(url, html):
    links = get_issue_links(url, html)
    year_month_links = []
    for link in tqdm(links):
        path_components = urlparse(link).path.split('/')
        # print('link and components', link, path_components)
        if len(path_components[-1]) == 2 or path_components[-1].endswith('-se'):
            # year-month link
            # print('year-month link', link)
            year_month_links.append(link)
        elif len(path_components[-1]) == 4:
            # year_only_link
            # print('year-only link', link)
            time.sleep(seconds_delay)            
            status_code, html = get_page(link)
            if status_code != 200:
                print(f"Status code={status_code} url={link}")
                continue
            new_links = get_issue_links(link, html)
            for new_link in new_links:
                # print('issue link', new_link)
                year_month_links.append(new_link)
        else:
            print('unexpected link', link, path_components[-1])    
        # TODO remove break
        break
    return year_month_links

def _is_article_link(url: str) -> bool:
    path_components = urlparse(url).path.split('/')
    # # must be 6 or 7 components (first component is empty)
    return (len(path_components) == 6 or len(path_components) == 7) and \
        path_components[-2] != 'new-era-19712020' and path_components[-1] != 'contents'


def get_article_links(base_url, html):
    soup = BeautifulSoup(html, bs_parser)
    return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True) \
            if _is_article_link(urljoin(base_url, a['href']))]


def get_article_path(url):
    path_components = urlparse(url).path.split('/')
    path = '_'.join(path_components[2:])
    return os.path.join(base_dir, f"{path}.json")

In [72]:
for url in tqdm(magazine_urls):
    time.sleep(seconds_delay)
    status_code, html = get_page(url)
    if status_code != 200:
        print(f"Status code={status_code} url={url}")
        continue
    year_month_links = get_year_month_links(url, html)
    print('year-month-links', url, len(year_month_links))
    for link in tqdm(year_month_links):
        print('year-month link', link)
        time.sleep(seconds_delay)
        status_code, html = get_page(link)
        if status_code != 200:
            print(f"Status code={status_code} url={url}")
            continue        
        article_links = get_article_links(link, html)
        for article_link in tqdm(article_links):
            path = get_article_path(article_link)
            # print('path', path, article_link)
            if os.path.exists(path):
                continue
            print("    ", path)
            time.sleep(seconds_delay)
            status_code, html = get_page(article_link)
            if status_code != 200:
                print(f"Status code={status_code} url={article_link}")
                continue
            save_page(path, article_link, html)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

year-month-links https://www.churchofjesuschrist.org/study/magazines/for-the-strength-of-youth/new-era-19712020?lang=eng 12


  0%|          | 0/12 [00:00<?, ?it/s]

!!! link https://www.churchofjesuschrist.org/study/new-era/2020/01?lang=eng


  0%|          | 0/44 [00:00<?, ?it/s]

!!! link https://www.churchofjesuschrist.org/study/new-era/2020/02?lang=eng


  0%|          | 0/44 [00:00<?, ?it/s]

!!! link https://www.churchofjesuschrist.org/study/new-era/2020/03?lang=eng


  0%|          | 0/36 [00:00<?, ?it/s]

!!! link https://www.churchofjesuschrist.org/study/new-era/2020/04?lang=eng


  0%|          | 0/44 [00:00<?, ?it/s]

     ../data/load/raw/new-era_2020_04_the-book-of-mormon-a-witness-of-the-resurrection.json
     ../data/load/raw/new-era_2020_04_he-is-the-light.json
     ../data/load/raw/new-era_2020_04_connect.json
     ../data/load/raw/new-era_2020_04_fun-stop.json
     ../data/load/raw/new-era_2020_04_q-a_what-do-you-say-when-your-friends-dont-believe-that-the-first-vision-could-happen.json
     ../data/load/raw/new-era_2020_04_q-a_how-did-joseph-smith-translate-the-book-of-mormon.json
     ../data/load/raw/new-era_2020_04_firm-foundations.json
     ../data/load/raw/new-era_2020_04_the-sacrament-and-taking-upon-us-the-name-of-jesus-christ.json
     ../data/load/raw/new-era_2020_04_king-benjamin.json
     ../data/load/raw/new-era_2020_04_share-the-message-of-the-ongoing-restoration.json


KeyboardInterrupt: 