In [1]:
import os
import re
import time
import requests
from bs4 import BeautifulSoup

# Google Drive integration
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

In [2]:
# Constants shared by the General Conference scraping cells.
# Root of the site; relative hrefs scraped from pages are joined onto this.
BASE_URL = "https://www.churchofjesuschrist.org"
# Landing page for the General Conference collection (not referenced by the
# other visible cells).
START_URL = f"{BASE_URL}/study/general-conference?lang=eng"
# Destination Google Drive folder for uploads. This is a placeholder; a later
# cell rebinds the name to a concrete folder ID.
GDRIVE_FOLDER_ID = "YOUR_DRIVE_FOLDER_ID"  # ← replace with your folder ID

# 1. Authenticate with Google Drive
# LocalWebserverAuth() runs an interactive OAuth flow (the saved cell output
# shows it opening a browser and redirecting to localhost:8080), so this cell
# needs a user at the keyboard.
gauth = GoogleAuth()
gauth.LocalWebserverAuth()
drive = GoogleDrive(gauth)

In [3]:
from datetime import date

# Persistent HTTP session shared by every request in the notebook, so that
# connections are pooled and cookies are kept between calls.
session = requests.Session()
# Identify the scraper to the server with a custom User-Agent.
session.headers.update({"User-Agent": "GC-Scraper/1.0"})

def get_conference_urls(start_year=1995, end_year=1999, months=(4, 10),
                        http_session=None, base_url=None):
    """
    Probe the site for General Conference landing pages and return those that exist.

    One HEAD request is sent per (year, month) candidate; a candidate is kept
    only when the server answers with status 200. Redirects are not followed
    (the default for HEAD), so a redirected page counts as missing.

    Parameters
    ----------
    start_year, end_year : int
        Inclusive range of years to probe. The defaults (1995-1999) are the
        range this notebook was last run with; pass e.g. ``start_year=1971,
        end_year=date.today().year`` to cover the whole archive.
    months : iterable of int
        Conference months to probe. General Conference is held in April and
        October, hence the default ``(4, 10)``.
    http_session : object with a ``head(url, timeout=...)`` method, optional
        Session used for the probes. Defaults to the notebook-level ``session``.
    base_url : str, optional
        Site root. Defaults to the notebook-level ``BASE_URL``.

    Returns
    -------
    list of str
        Existing conference URLs, in chronological order when ``months`` is
        ascending.

    Raises
    ------
    Any exception raised by the session's ``head`` call (connection errors,
    timeouts) propagates to the caller.
    """
    # Resolve the notebook-level defaults lazily so explicit arguments can
    # replace them (useful for testing or for probing a mirror).
    http = session if http_session is None else http_session
    root = BASE_URL if base_url is None else base_url

    today = date.today()
    urls = []
    for year in range(start_year, end_year + 1):
        for month in months:
            # Skip sessions that have not taken place yet.
            if (year, month) > (today.year, today.month):
                continue
            url = f"{root}/study/general-conference/{year}/{month:02d}?lang=eng"
            # Without a timeout a stalled connection would block the cell forever.
            resp = http.head(url, timeout=(5, 10))
            if resp.status_code == 200:
                urls.append(url)
    return urls

In [4]:
# Probe the site once and keep the list of existing conference pages.
urls = get_conference_urls()


In [5]:
# Display the discovered conference URLs.
urls

['https://www.churchofjesuschrist.org/study/general-conference/1995/04?lang=eng',
 'https://www.churchofjesuschrist.org/study/general-conference/1995/10?lang=eng',
 'https://www.churchofjesuschrist.org/study/general-conference/1996/04?lang=eng',
 'https://www.churchofjesuschrist.org/study/general-conference/1996/10?lang=eng',
 'https://www.churchofjesuschrist.org/study/general-conference/1997/04?lang=eng',
 'https://www.churchofjesuschrist.org/study/general-conference/1997/10?lang=eng',
 'https://www.churchofjesuschrist.org/study/general-conference/1998/04?lang=eng',
 'https://www.churchofjesuschrist.org/study/general-conference/1998/10?lang=eng',
 'https://www.churchofjesuschrist.org/study/general-conference/1999/04?lang=eng',
 'https://www.churchofjesuschrist.org/study/general-conference/1999/10?lang=eng']

In [6]:
import os
import re
import time
from bs4 import BeautifulSoup
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

# Concrete Drive folder that receives the uploads; this rebinds the
# placeholder value assigned in the earlier constants cell.
# NOTE(review): the folder ID is hard-coded in the notebook — consider reading
# it from an environment variable before sharing the file.
GDRIVE_FOLDER_ID = "1TYYeDa41R2FYXh5IC6YHONTaBIcp5kQY"

import time
from requests.exceptions import SSLError, ReadTimeout

def robust_get(url, *, retries=10, max_wait=120, http_session=None, **kwargs):
    """
    GET ``url`` with exponential-backoff retries on SSL errors and read timeouts.

    Parameters
    ----------
    url : str
        Address to fetch.
    retries : int, keyword-only
        Number of attempts that are retried after a failure. One further,
        final attempt is always made afterwards, and its exception (if any)
        propagates to the caller.
    max_wait : int or float, keyword-only
        Upper bound, in seconds, for a single back-off sleep. The delay is
        ``2 ** attempt`` seconds, which would otherwise reach 512 s on the
        tenth retry.
    http_session : object with a ``get(url, **kwargs)`` method, optional
        Session used for the request. Defaults to the notebook-level ``session``.
    **kwargs
        Forwarded to the session's ``get``. ``timeout`` defaults to
        ``(5, 10)`` (connect, read) but may be overridden by the caller.

    Returns
    -------
    The response object returned by the session's ``get``.

    Raises
    ------
    SSLError, ReadTimeout
        If the final attempt fails as well. Any other exception is not
        retried and propagates immediately.
    """
    http = session if http_session is None else http_session
    # setdefault instead of a hard-coded keyword: passing timeout= through
    # kwargs used to fail with "got multiple values for keyword argument".
    kwargs.setdefault("timeout", (5, 10))

    for attempt in range(retries):
        try:
            return http.get(url, **kwargs)
        except (SSLError, ReadTimeout) as e:
            wait = min(2 ** attempt, max_wait)
            print(f"Warning: {e.__class__.__name__} on {url}, retrying in {wait}s...")
            time.sleep(wait)
    # last attempt, let exception propagate
    return http.get(url, **kwargs)

# Optional: if SSL certs are causing you grief (not recommended for production)
# session.verify = False

# NOTE(review): this update is applied to the `session` object that already
# exists when the cell runs (so the cell fails with NameError on a kernel
# where no earlier cell created it). The name is rebound to a brand-new
# Session a few lines below, which discards this User-Agent. Confirm which
# User-Agent is wanted and remove the other assignment.
session.headers.update({
    "User-Agent": "Mozilla/5.0 (compatible; GC-Scraper/1.0)"
})



# Fresh HTTP session; from here on, requests made through `session` carry the
# plain "GC-Scraper/1.0" User-Agent.
session = requests.Session()
session.headers.update({"User-Agent": "GC-Scraper/1.0"})

# Authenticate to Google Drive
# NOTE(review): this repeats the interactive OAuth flow already performed in
# the earlier constants cell — presumably one of the two can go.
gauth = GoogleAuth()
gauth.LocalWebserverAuth()
drive = GoogleDrive(gauth)

def scrape_conference(conf_url, session):
    """
    Collect the talk links found on one conference landing page.

    Parameters
    ----------
    conf_url : str
        URL of the conference page to download.
    session
        Accepted for interface compatibility but not used: the download goes
        through ``robust_get``, which picks its own session.

    Returns
    -------
    list of str
        Absolute talk URLs, de-duplicated and sorted alphabetically.

    Raises
    ------
    Whatever ``robust_get`` or ``raise_for_status`` raise for a failed download.
    """
    response = robust_get(conf_url)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    # Only site-relative links of the form /study/general-conference/YYYY/MM/<slug>?lang=eng
    talk_href = re.compile(r"^/study/general-conference/\d{4}/\d{2}/.+\?lang=eng$")
    hrefs = (anchor["href"] for anchor in page.find_all("a", href=talk_href))

    # A set drops talks that are linked more than once on the page;
    # relative hrefs are made absolute on the way in.
    unique_links = {
        href if href.startswith("http") else BASE_URL + href
        for href in hrefs
    }
    return sorted(unique_links)

def scrape_and_upload():
    """
    Main workflow: scrape all talks and mirror them into Google Drive.

    For every conference page returned by ``get_conference_urls()`` the talk
    links are collected, each talk is downloaded, its text is written to
    ``gc_texts/<year>-<month>/<slug>.txt`` and that file is uploaded to the
    Drive folder identified by ``GDRIVE_FOLDER_ID``.

    Returns
    -------
    None

    Raises
    ------
    Any download error (``raise_for_status`` or the exceptions ``robust_get``
    lets through) aborts the run; Drive upload errors propagate as well.
    """
    os.makedirs("gc_texts", exist_ok=True)

    for conf_url in get_conference_urls():
        # Conference URLs end in /YYYY/MM?lang=eng; reuse both parts as the folder name.
        year, month = re.search(r"/(\d{4})/(\d{2})\?lang=eng$", conf_url).groups()
        conf_dir = os.path.join("gc_texts", f"{year}-{month}")
        os.makedirs(conf_dir, exist_ok=True)

        for talk_url in scrape_conference(conf_url, session):
            # Extract talk slug for filenames
            slug = talk_url.split("/")[-1].replace("?lang=eng", "")
            page = robust_get(talk_url)
            page.raise_for_status()

            # Extract the visible text from <article>, falling back to the
            # whole page when the tag is absent. Previously the whole page was
            # always used, so navigation and footer text ended up in the file.
            soup = BeautifulSoup(page.text, "html.parser")
            article = soup.find("article") or soup
            text = article.get_text(separator="\n", strip=True)

            file_path = os.path.join(conf_dir, f"{slug}.txt")
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(text)

            # Upload to Google Drive
            gfile = drive.CreateFile({'parents': [{'id': GDRIVE_FOLDER_ID}]})
            gfile.SetContentFile(file_path)
            gfile.Upload()
            # Pause between talks to keep the request rate low.
            time.sleep(2)

# Entry point: run the full scrape-and-upload workflow when this code is
# executed as the main module.
if __name__ == "__main__":
    scrape_and_upload()

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=722991589272-caqeipkkl4lseu15js5l3nfvl7pejiae.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


In [7]:
# Scrape Liahona articles

In [1]:
import os
import re
import time
from datetime import date
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

# --- Configuration ---
# First year whose issues are probed; used as the default of month_urls().
START_YEAR = 2008              # adjust based on the first Liahona issue you care about
# Google Drive folder that receives every uploaded text file.
# NOTE(review): hard-coded folder ID — consider an environment variable.
GDRIVE_FOLDER_ID = "1TYYeDa41R2FYXh5IC6YHONTaBIcp5kQY"
# Site root; relative article hrefs are joined onto this.
BASE_URL = "https://www.churchofjesuschrist.org"
# Query string appended to issue URLs and stripped from article slugs.
LANG_PARAM = "?lang=eng"

# --- 1. HTTP Session w/ Retries & Timeouts ---
# One shared session so connections are reused across requests.
session = requests.Session()
# Retry policy handed to the transport adapter: up to 5 retries with a
# back-off factor of 1, also retrying on the listed HTTP status codes, and
# only for the listed request methods.
# NOTE(review): `allowed_methods` needs urllib3 >= 1.26 (older releases call
# it `method_whitelist`) — confirm the installed version.
retry_strategy = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"]
)

# Install the retrying adapter for both URL schemes.
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
session.mount("http://", adapter)
# Identify the scraper to the server with a custom User-Agent.
session.headers.update({"User-Agent": "Liahona-Scraper/1.0"})

# --- 2. Google Drive Authentication ---
# Interactive OAuth flow via a local web server (the saved cell output shows a
# browser being opened and a redirect to localhost:8080).
gauth = GoogleAuth()
gauth.LocalWebserverAuth()
drive = GoogleDrive(gauth)

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=722991589272-caqeipkkl4lseu15js5l3nfvl7pejiae.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


In [2]:
def month_urls(start_year=START_YEAR, months=tuple(range(1, 13))):
    """
    Generate and return all existing Liahona issue URLs from start_year → today.

    One HEAD request is sent per (year, month) candidate; a candidate is kept
    only when the server answers with status 200.

    Parameters
    ----------
    start_year : int
        First year to probe; the current year is the last one.
    months : iterable of int
        Months to probe within each year. Defaults to all twelve, matching the
        promise to return every existing issue (the previous hard-coded list
        left out April and October). Pass a subset to resume a partial run.

    Returns
    -------
    list of (int, int, str)
        ``(year, month, url)`` for each issue page that exists.
    """
    today = date.today()
    urls = []
    for yr in range(start_year, today.year + 1):
        for mo in months:
            # Skip months of the current year that have not arrived yet.
            # `continue` rather than `break` so an unsorted `months` still works.
            if yr == today.year and mo > today.month:
                continue
            url = f"{BASE_URL}/study/liahona/{yr}/{mo:02d}{LANG_PARAM}"
            try:
                resp = session.head(url, timeout=(3, 7))
            except requests.RequestException as exc:
                # Best effort: keep going, but report the skipped issue instead
                # of dropping it silently.
                print(f"Warning: {exc.__class__.__name__} probing {url}; skipping")
                continue
            if resp.status_code == 200:
                urls.append((yr, mo, url))
    return urls

In [None]:
def scrape_month(yr, mo, url):
    """
    Collect the article links found on one Liahona issue page.

    Parameters
    ----------
    yr, mo : int
        Year and month of the issue; only hrefs under
        ``/study/liahona/<yr>/<mo>/`` are accepted.
    url : str
        Address of the issue page to download.

    Returns
    -------
    list of str
        Absolute article URLs, de-duplicated and sorted alphabetically.

    Raises
    ------
    Whatever ``session.get`` or ``raise_for_status`` raise for a failed download.
    """
    response = session.get(url, timeout=(5, 15))
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    # Site-relative links that belong to this issue and end with the language parameter.
    article_href = re.compile(
        rf"^/study/liahona/{yr}/{mo:02d}/.+{re.escape(LANG_PARAM)}$"
    )

    found = set()
    for anchor in page.find_all("a", href=article_href):
        href = anchor["href"]
        if not href.startswith("http"):
            # Relative link: make it absolute.
            href = BASE_URL + href
        found.add(href)
    return sorted(found)

def fetch_text(url):
    """
    Download an article page and return its visible text.

    The text is taken from the first ``<article>`` element when one is found,
    otherwise from the whole document. Text fragments are stripped and joined
    with newlines.

    Raises whatever ``session.get`` or ``raise_for_status`` raise for a failed
    download.
    """
    response = session.get(url, timeout=(5, 15))
    response.raise_for_status()

    page = BeautifulSoup(response.text, "html.parser")
    container = page.find("article")
    if not container:
        # No usable <article>: fall back to the full page.
        container = page
    return container.get_text(separator="\n", strip=True)

def main():
    """
    Mirror every Liahona article into local text files and Google Drive.

    For each issue returned by ``month_urls()`` the article links are
    collected, each article's text is saved as
    ``gc_texts/liahona/<year>-<month>/<slug>.txt`` and the file is uploaded to
    the Drive folder identified by ``GDRIVE_FOLDER_ID``. Progress is printed
    per issue and per uploaded file.
    """
    root_dir = "gc_texts/liahona"
    os.makedirs(root_dir, exist_ok=True)

    for yr, mo, month_url in month_urls():
        ym_folder = f"{yr}-{mo:02d}"
        issue_dir = os.path.join(root_dir, ym_folder)
        os.makedirs(issue_dir, exist_ok=True)
        print(f"Processing {ym_folder}")

        for art_url in scrape_month(yr, mo, month_url):
            # The last path segment, minus the language parameter, names the file.
            last_segment = art_url.rstrip("/").split("/")[-1]
            slug = last_segment.replace(LANG_PARAM, "")

            body = fetch_text(art_url)
            local_path = os.path.join(issue_dir, f"{slug}.txt")
            with open(local_path, "w", encoding="utf-8") as out:
                out.write(body)

            # Upload to Google Drive
            drive_file = drive.CreateFile({'parents':[{'id': GDRIVE_FOLDER_ID}]})
            drive_file.SetContentFile(local_path)
            drive_file.Upload()
            print(f" → Uploaded {slug}.txt")

            # Pause between articles to keep the request rate low.
            time.sleep(1)

# Entry point: run the Liahona scrape-and-upload workflow when this code is
# executed as the main module.
if __name__ == "__main__":
    main()

Processing 2008-01
 → Uploaded a-missionary-in-the-making-john-kay-of-glenrothes-fife-scotland.txt
