# Download bibles from Bible.com

https://www.bible.com/

In [1]:
import pandas as pd
import time

import copy
import json
import requests
from bs4 import BeautifulSoup
import os

In [2]:
# Root folder of the project; every path below is derived from it.
project_path = './'

# Folder holding the structural XML files, and the individual files inside it.
struct = f'{project_path}00_structures/'
lang_list_fn = f'{struct}languages_versions.xml'
book_list_fn = f'{struct}book_list.xml'
titles_list_fn = f'{struct}titles.xml'
pars_list_fn = f'{struct}paragraphs.xml'

# Output folders (both keep their trailing slash).
csv_path = f'{project_path}01_input/'
tex_path = f'{project_path}02_outputs/'

In [4]:
# Read the structural XML files into DataFrames.
print('Loading structural data:')

print('  - List of books in the Bible with numbers of chapters')
books = pd.read_xml(book_list_fn)

print('  - Available translations')
# Only the XML attributes are read for the translation list.
trans = pd.read_xml(lang_list_fn, attrs_only=True)

print('  - Titles of sections')
# Wide -> long: every column other than book/chap/verse becomes one row whose
# column name goes to 'lang' and whose cell goes to 'title'.
titles = (
    pd.read_xml(titles_list_fn)
    .melt(id_vars=['book', 'chap', 'verse'])
    .rename(columns={"variable": "lang", "value": "title"})
)
# 'refs' is the 'BOOK.chapter.verse' key used to look verses up later.
titles = titles.assign(
    refs=titles['book'] + '.' + titles['chap'].astype(str) + '.' + titles['verse'].astype(str)
)

print('  - Paragraphs before each verse')
pars = pd.read_xml(pars_list_fn)
pars = pars.assign(
    refs=pars['book'] + '.' + pars['chap'].astype(str) + '.' + pars['verse'].astype(str)
)

print('Done...')

# Show the data (titles and pars can be inspected the same way if needed).
display(books, trans)

Loading structural data:
  - List of books in the Bible with numbers of chapters
  - Available translations
  - Titles of sections
  - Paragraphs before each verse
Done...


Unnamed: 0,order,chapters,book
0,1,50,GEN
1,2,40,EXO
2,3,27,LEV
3,4,36,NUM
4,5,34,DEU
...,...,...,...
61,62,5,1JN
62,63,1,2JN
63,64,1,3JN
64,65,1,JUD


Unnamed: 0,lcode,version,vnum,lang
0,eng,NASB2020,2692,English
1,heb,תנ״ך ומודרני,2221,עברית


In [5]:
# Optional: to process only a subset of the books loaded above, uncomment and adjust the slice below.
#books = books.iloc[23:].copy()

In [6]:
def download_chapter(version, vnum, book, chapter, timeout=30):
    """Download one chapter from bible.com and return its verses as a DataFrame.

    Parameters
    ----------
    version : str
        Version abbreviation, used as the last component of the URL
        (e.g. 'NASB2020').
    vnum : int or str
        Numeric version id used in the URL (e.g. 2692).
    book : str
        Book abbreviation used in the URL (e.g. 'GEN').
    chapter : int or str
        Chapter number.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the whole download loop indefinitely.

    Returns
    -------
    pandas.DataFrame
        The frame built by ``extract_verses`` from the downloaded page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx), e.g. when the
        site blocks the request. Failing here gives a clear message instead of
        an obscure parsing error further down.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    # URL pattern: https://www.bible.com/bible/<vnum>/<BOOK>.<chapter>.<VERSION>
    download_link = f'https://www.bible.com/bible/{vnum}/{book}.{chapter}.{version}'
    response = requests.get(download_link, timeout=timeout)
    response.raise_for_status()
    # Extract into a pandas DataFrame
    return extract_verses(response, version)

def _wrap_divine_names(verse):
    """Wrap every divine-name span inside ``verse`` in ``\\textsc{...}``, in place."""
    # 'nd' is the class used by English texts, 'sc' the one used by German texts.
    for css_class in ('nd', 'sc'):
        for span in verse.find_all('span', {'class': css_class}):
            name = span.string
            # .string is None when the span holds more than one text node;
            # such spans are left untouched. Already-wrapped text is skipped
            # so nested spans are never wrapped twice.
            if name is not None and not name.startswith('\\textsc{'):
                name.replace_with('\\textsc{' + name + '}')


def extract_verses(html_doc, version, silent=False):
    """Parse a downloaded chapter page into one DataFrame row per verse.

    Parameters
    ----------
    html_doc : object with a ``.text`` attribute
        The downloaded page; ``.text`` must hold the HTML source (this is what
        ``download_chapter`` passes in).
    version : str
        Version label copied into the 'version' column of every row.
    silent : bool, optional
        When False (default) a one-line status with book and chapter is printed.

    Returns
    -------
    pandas.DataFrame
        Columns: lang, version, book_name, book_abbr, chapter_num, verse_num,
        verse_text. Text pieces that belong to the same verse number are joined
        with a single space. The frame is empty (same columns) when the chapter
        contains no usable verse.

    Raises
    ------
    ValueError
        If the page has no ``<div class="chapter">`` element.

    Notes
    -----
    Other structural surprises (missing chapter label, missing or differently
    shaped JSON-LD block) still propagate as the underlying exception.
    """
    columns = ['lang', 'version', 'book_name', 'book_abbr',
               'chapter_num', 'verse_num', 'verse_text']

    soup = BeautifulSoup(html_doc.text, 'html.parser')

    # Chapter container: carries the book abbreviation and the chapter number
    chapter = soup.find("div", {"class": "chapter"})
    if chapter is None:
        raise ValueError('No <div class="chapter"> element found in the downloaded page')
    book_abbr = chapter['data-usfm'].split(".")[0]
    chapter_num = int(chapter.find('div', {'class': 'label'}).text)

    # Book name: third entry of the JSON-LD item list, with its last word removed
    # (presumably the chapter number -- TODO confirm against a live page)
    info = soup.find("script", {"type": "application/ld+json"}).string
    code = json.loads(info)
    book_name = code['itemListElement'][2]['item']['name'].rsplit(' ', 1)[0]

    # Language code; stays None when the page does not provide it
    version_div = soup.find("div", {"class": "version"})
    language = version_div.get("data-iso6393") if version_div is not None else None

    # Status
    if not silent:
        print('      -', book_abbr, chapter_num, '('+ book_name +')')

    verse_data = []
    verse_num = None
    for verse in chapter.find_all('span', {'class': 'verse'}):
        # A verse can be split over several spans; only some carry a numeric
        # label. Spans without one continue the most recent verse number.
        label = verse.find('span', {'class': 'label'})
        if label is not None:
            try:
                verse_num = int(label.text)
            except ValueError:
                pass  # non-numeric label: keep the previous verse number
        if verse_num is None:
            # Nothing to attach this text to yet (no numbered verse seen so far)
            continue

        # Replace the divine name with small capitals
        _wrap_divine_names(verse)

        # Keep only relevant text data
        verse_text = ''.join(
            piece.get_text() for piece in verse.find_all('span', {'class': 'content'})
        ).strip()

        verse_data.append({'lang': language,
                           'version': version,
                           'book_name': book_name,
                           'book_abbr': book_abbr,
                           'chapter_num': chapter_num,
                           'verse_num': verse_num,
                           'verse_text': verse_text})

    if not verse_data:
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(verse_data, columns=columns)
    # Merge the pieces of each verse into one row. dropna=False keeps the rows
    # even when the language code is missing (NaN keys are dropped by default).
    df = (df.groupby(columns[:-1], dropna=False)['verse_text']
            .agg(' '.join)
            .reset_index())
    return df

def pd_to_latex(df, print_titles=True, titles_df=None, pars_df=None):
    """Render a verse DataFrame as a LaTeX string.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per verse with the columns lang, book_name, book_abbr,
        chapter_num, verse_num and verse_text.
    print_titles : bool, optional
        When true, section titles are inserted and a chapter that starts in
        the middle of a paragraph gets a small 'chapter.verse' number instead
        of a large chapter number. When false, no titles are printed and every
        chapter starts on a new paragraph with a large chapter number.
    titles_df : pandas.DataFrame, optional
        Section titles with the columns refs, lang and title. Defaults to the
        notebook-level ``titles`` frame.
    pars_df : pandas.DataFrame, optional
        Paragraph starts with a refs column. Defaults to the notebook-level
        ``pars`` frame.

    Returns
    -------
    str
        LaTeX source using the macros \\nonumchapter, \\nonumsection, \\bibchap
        and \\bibverse (expected to be defined by the LaTeX document). Runs of
        spaces are collapsed to a single space; line breaks are kept.
    """
    if titles_df is None:
        titles_df = titles
    if pars_df is None:
        pars_df = pars

    # Build the lookups once instead of rescanning the frames for every verse.
    paragraph_refs = set(pars_df['refs'])
    first_title = {}
    if print_titles:
        for ref, lang, title in zip(titles_df['refs'], titles_df['lang'], titles_df['title']):
            # Only the first title per (reference, language) is used
            first_title.setdefault((ref, lang), title)

    parts = []
    for _, row in df.iterrows():
        chapter_num = row['chapter_num']
        verse_num = row['verse_num']
        current_verse_ref = row['book_abbr'] + '.' + str(chapter_num) + '.' + str(verse_num)
        starts_paragraph = current_verse_ref in paragraph_refs
        plain_verse = '\\bibverse{' + str(verse_num) + '}' + str(row['verse_text']) + ' '

        # Add a paragraph (if necessary)
        if starts_paragraph:
            parts.append('\n\n')

        # Add name of the book
        if chapter_num == 1 and verse_num == 1:
            parts.append('\\nonumchapter{' + str(row['book_name']) + '}\n\n')

        # Add a section title (if necessary)
        if print_titles:
            title = first_title.get((current_verse_ref, row['lang']))
            # notna covers both None and NaN, i.e. titles not translated yet
            if pd.notna(title):
                parts.append('\n\n\\nonumsection{' + str(title) + '}\n\n')

        # Add the chapter and verse numbers
        if verse_num == 1:
            if not print_titles:
                # Without titles the chapter number is always a large number
                parts.append('\n\n\\bibchap{' + str(chapter_num) + '}' + plain_verse)
            elif starts_paragraph:
                # Large chapter number only at the beginning of a paragraph
                parts.append('\\bibchap{' + str(chapter_num) + '}' + plain_verse)
            else:
                # Mid-paragraph chapter start: small 'chapter.verse' number
                parts.append('\\bibverse{' + str(chapter_num) + '.' + str(verse_num) + '}'
                             + str(row['verse_text']) + ' ')
        elif verse_num > 1:
            # All other cases, just add the verse number
            parts.append(plain_verse)

    latex_str = ''.join(parts)
    # Replace double spaces (repeat until no run of spaces is left)
    while '  ' in latex_str:
        latex_str = latex_str.replace('  ', ' ')

    return latex_str

def latex_to_file(latex_str, fn):
    """Write ``latex_str`` to the file ``fn`` as UTF-8, replacing any existing content."""
    with open(fn, mode='w', encoding='utf-8') as tex_file:
        tex_file.write(latex_str)

In [None]:
# Cycle through translations, books and chapters to download them.
# One CSV file is written per book. Books whose CSV file already exists are
# skipped, so an interrupted run can simply be started again; delete a file to
# force a fresh download of that book.

# Pause between downloads: the website will not let us download quickly
# (protection mechanism).
DOWNLOAD_PAUSE_S = 5

print('Downloading:')
for i, tr in trans.iterrows():
    current_lang = tr['lang']
    current_vers = tr['version']
    current_vnum = tr['vnum']
    print('  - ' + current_lang + ' - ' + current_vers)
    folder_name = tr['lcode'] + '_' + tr['version']
    folder_path = os.path.join(csv_path, folder_name)
    for index, book in books.iterrows():
        current_book = book['book']
        print('    - ' + current_book)
        book_csv_fn = '{:02d}'.format(book['order']) + '_' + book['book'] + '.csv'
        book_csv_path = os.path.join(folder_path, book_csv_fn)
        if os.path.exists(book_csv_path):
            print('      - Already downloaded, skipping: ' + book_csv_path)
            continue

        chapter_df = []
        for chap in range(1, book['chapters'] + 1):
            chapter_df.append(download_chapter(current_vers, current_vnum, current_book, chap))
            time.sleep(DOWNLOAD_PAUSE_S)

        print('      - Concatenate book df...')
        book_df = pd.concat(chapter_df)
        print('      - Save to csv file: ', end = '')
        print(book_csv_path)
        # exist_ok avoids the separate existence check before creating the folder
        os.makedirs(folder_path, exist_ok=True)
        book_df.to_csv(book_csv_path, index=False)
print('Done...')

Downloading:
  - English - NASB2020
    - GEN
      - GEN 1 (Genesis)
      - GEN 2 (Genesis)
      - GEN 3 (Genesis)
      - GEN 4 (Genesis)
      - GEN 5 (Genesis)
      - GEN 6 (Genesis)
      - GEN 7 (Genesis)
      - GEN 8 (Genesis)
      - GEN 9 (Genesis)
      - GEN 10 (Genesis)
      - GEN 11 (Genesis)
      - GEN 12 (Genesis)
      - GEN 13 (Genesis)
      - GEN 14 (Genesis)
      - GEN 15 (Genesis)
      - GEN 16 (Genesis)
      - GEN 17 (Genesis)
      - GEN 18 (Genesis)
      - GEN 19 (Genesis)
      - GEN 20 (Genesis)
      - GEN 21 (Genesis)
      - GEN 22 (Genesis)
      - GEN 23 (Genesis)
      - GEN 24 (Genesis)
      - GEN 25 (Genesis)
      - GEN 26 (Genesis)
      - GEN 27 (Genesis)
      - GEN 28 (Genesis)
      - GEN 29 (Genesis)
      - GEN 30 (Genesis)
      - GEN 31 (Genesis)
      - GEN 32 (Genesis)
      - GEN 33 (Genesis)
      - GEN 34 (Genesis)
      - GEN 35 (Genesis)
      - GEN 36 (Genesis)
      - GEN 37 (Genesis)
      - GEN 38 (Genesis)
      - GEN 3