# Download bibles from Bible.com

In [None]:
import pandas as pd
import time

import copy
import json
import requests
from bs4 import BeautifulSoup
import os

In [None]:
# Root folder of the project; every path below is derived from it
project_path = './'

# Folder holding the XML files that describe the structure of the data
struct = f'{project_path}00_structures/'
lang_list_fn = f'{struct}languages_versions.xml'  # available translations
book_list_fn = f'{struct}book_list.xml'           # books and their chapter counts
titles_list_fn = f'{struct}titles.xml'            # section titles
pars_list_fn = f'{struct}paragraphs.xml'          # paragraph markers

# Output folders (the downloaded books are saved as csv files under csv_path)
csv_path = f'{project_path}01_input/'
tex_path = f'{project_path}02_outputs/'

In [None]:
# Read the structural XML files into the data frames used by the rest of the notebook
def _make_refs(frame):
    # Build a 'book.chap.verse' reference string for every row of the frame
    return frame['book'] + '.' + frame['chap'].astype(str) + '.' + frame['verse'].astype(str)


print('Loading structural data:')

print('  - List of books in the Bible with numbers of chapters')
books = pd.read_xml(book_list_fn)

print('  - Available translations')
# Only the XML attributes of each element are read for the translations
trans = pd.read_xml(lang_list_fn, attrs_only=True)

print('  - Titles of sections')
# Reshape from wide to long: every column other than book/chap/verse becomes
# one row, with the former column name stored in 'lang' and its value in 'title'
titles = (
    pd.read_xml(titles_list_fn)
    .melt(id_vars=['book', 'chap', 'verse'])
    .rename(columns={"variable": "lang", "value": "title"})
)
titles['refs'] = _make_refs(titles)

print('  - Paragraphs before each verse')
pars = pd.read_xml(pars_list_fn)
pars['refs'] = _make_refs(pars)

print('Done...')

# Uncomment to inspect the loaded data
#display(books)
#display(trans)
#display(titles)
#display(pars)

In [None]:
# Optional: to download only a subset of the books loaded above (e.g. to resume an interrupted run), uncomment the slice below
#books = books.iloc[21:].copy()
#display(books)

In [None]:
def download_chapter(version, vnum, book, chapter, timeout=30):
    """Download one chapter from Bible.com and return its verses as a DataFrame.

    Args:
        version: Version code; it is the last component of the URL and is also
            stored in the 'version' column of the result.
        vnum: Numeric identifier of the version, used in the URL path.
        book: Book abbreviation used in the URL (e.g. 'PSA').
        chapter: Chapter number used in the URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The DataFrame produced by ``extract_verses`` for the downloaded page.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Create download link
    download_link = f'https://www.bible.com/bible/{vnum}/{book}.{chapter}.{version}'
    # Download the text
    html_doc = requests.get(download_link, timeout=timeout)
    # Fail here with a clear HTTP error instead of handing an error page to the parser
    html_doc.raise_for_status()
    # Extract into Pandas df
    return extract_verses(html_doc, version)

def extract_verses(html_doc, version, silent=False):
    """Parse one downloaded chapter page into a DataFrame with one row per verse.

    Args:
        html_doc: Object exposing the page HTML through a ``text`` attribute
            (e.g. the response returned by ``requests.get``).
        version: Version code, copied as-is into the 'version' column.
        silent: If True, do not print the progress line for the chapter.

    Returns:
        A DataFrame with the columns lang, version, book_name, book_abbr,
        chapter_num, verse_num and verse_text. Text fragments sharing the same
        verse number are joined with a single space. The frame is empty (but
        has all columns) when the chapter contains no verse elements.

    Raises:
        ValueError: If the page contains no ``<div class="chapter">`` element.
    """
    columns = ['lang', 'version', 'book_name', 'book_abbr',
               'chapter_num', 'verse_num', 'verse_text']
    soup = BeautifulSoup(html_doc.text, 'html.parser')
    # Find the div element with class 'chapter'
    chapter = soup.find('div', {'class': 'chapter'})
    if chapter is None:
        raise ValueError('No <div class="chapter"> element found in the page')
    # The book abbreviation is the first dot-separated part of data-usfm
    book_abbr = chapter['data-usfm'].split('.')[0]
    chapter_label = chapter.find('div', {'class': 'label'}).text
    try:
        chapter_num = int(chapter_label)  # Most bibles use numbers for chapters
    except ValueError:
        chapter_num = chapter_label  # Except Hebrew
    # Find the book name: third breadcrumb entry of the JSON-LD block, with the
    # last space-separated word removed (presumably the chapter number)
    info = soup.find('script', {'type': 'application/ld+json'}).string
    code = json.loads(info)
    book_name = code['itemListElement'][2]['item']['name'].rsplit(' ', 1)[0]
    # Language; stays None when the page has no version div, instead of leaving
    # the variable undefined
    version_div = soup.find('div', {'class': 'version'})
    language = version_div.get('data-iso6393') if version_div is not None else None
    # Status
    if not silent:
        print('      -', book_abbr, chapter_num, '('+ book_name +')')
    verse_data = []
    # Iterate over each verse element
    for verse in chapter.find_all('span', {'class': 'verse'}):
        label = verse.find('span', {'class': 'label'})
        if label is None:
            # NOTE(review): spans without a label keep an empty verse number and
            # are therefore all merged into one '' row by the groupby below.
            # Presumably these continue the preceding verse - TODO confirm.
            verse_num = ''
        else:
            label_text = label.text.strip()
            try:
                verse_num = int(label_text)
            except ValueError:
                # In some cases, multiple verses are referenced as text
                verse_num = label_text
        # Replace every divine name with small capitals
        # (English uses class nd, German uses class sc)
        for css_class in ('nd', 'sc'):
            for span in verse.find_all('span', {'class': css_class}):
                divine_name = span.string
                # .string is None when the span has no single text child
                if divine_name is not None:
                    divine_name.replace_with('\\textsc{' + divine_name + '}')
        # Keep only relevant text data
        verse_text = ''.join(
            match.get_text()
            for match in verse.find_all('span', {'class': 'content'})
        ).strip()
        # Append verse data to list
        verse_data.append({'lang': language,
                           'version': version,
                           'book_name': book_name,
                           'book_abbr': book_abbr,
                           'chapter_num': chapter_num,
                           'verse_num': verse_num,
                           'verse_text': verse_text})
    if not verse_data:
        # Grouping a frame without columns would raise a KeyError
        return pd.DataFrame(columns=columns)
    # Create a pandas dataframe from the list of verse data
    df = pd.DataFrame(verse_data)
    # Merge the fragments of each verse (removes extra line breaks etc).
    # dropna=False keeps the rows when the language could not be determined.
    df = (
        df.groupby(columns[:-1], dropna=False)['verse_text']
        .apply(' '.join)
        .reset_index()
    )
    return df

In [None]:
def download_book(current_version, current_book, delay=5):
    """Download every chapter of one book in one version and save it as a csv file.

    The file is written to ``<csv_path>/<lcode>_<version>/<order>_<book>.csv``,
    where lcode comes from ``trans`` and the zero-padded order from ``books``.
    The target folder is created if needed.

    Args:
        current_version: Version code, looked up in the 'version' column of ``trans``.
        current_book: Book abbreviation, looked up in the 'book' column of ``books``.
        delay: Seconds to pause after each chapter download.

    Raises:
        IndexError: If the version is not listed in ``trans`` or the book is
            not listed in ``books``.
    """
    # Build each row selector once instead of once per looked-up column
    is_version = trans['version'] == current_version
    is_book = books['book'] == current_book
    current_lang = trans.loc[is_version, 'lang'].values[0]
    current_vnum = trans.loc[is_version, 'vnum'].values[0]
    current_lcode = trans.loc[is_version, 'lcode'].values[0]
    current_order = books.loc[is_book, 'order'].values[0]
    current_nb_chaps = books.loc[is_book, 'chapters'].values[0]
    print('  - ' + current_lang + ' - ' + current_version)
    print('    - ' + current_book)
    chapter_df = []
    for chap in range(1, current_nb_chaps + 1):
        chapter_df.append(download_chapter(current_version, current_vnum, current_book, chap))
        # This website will not let us download quickly (protection mechanism).
        # Pause between downloads; pausing after the last chapter too also
        # spaces out the first request of the next book.
        time.sleep(delay)
    print('      - Concatenate book df...')
    book_df = pd.concat(chapter_df)
    print('      - Save to csv file: ', end = '')
    book_csv_fn = '{:02d}'.format(current_order) + '_' + current_book + '.csv'
    folder_path = os.path.join(csv_path, current_lcode + '_' + current_version)
    book_csv_path = os.path.join(folder_path, book_csv_fn)
    # Print exactly the path that is written
    print(book_csv_path)
    # exist_ok avoids the race between checking for and creating the folder
    os.makedirs(folder_path, exist_ok=True)
    book_df.to_csv(book_csv_path, index=False)
    print('Done...')

def download_version(current_version):
    """Download, one after the other, every book listed in ``books`` for the given version.

    Args:
        current_version: Version code passed on to ``download_book``.
    """
    print('Downloading:')
    # Only the 'book' column is needed, so walk it directly
    for book_abbr in books['book']:
        download_book(current_version, book_abbr)

def download_all():
    """Download every book of every version listed in ``trans``."""
    # Only the 'version' column is needed, so walk it directly
    for version_code in trans['version']:
        download_version(version_code)

In [None]:
# Download everything
#download_all()

In [None]:
# Download every book of one specific version, identified by its 'version' code.
# The downloader pauses after each chapter, so a complete version takes a while.
download_version('ZUL20')

In [None]:
# Download a specific book
#download_book('NIV', 'PSA')