In [None]:
# Laura Burdick (lburdick@umich.edu)
# Determine which Bible translations contain at least 75% of the verses in the English KJV Bible

In [1]:
from tqdm import tqdm_notebook
from collections import Counter
import pickle

In [8]:
# USER CONFIGURATION -- edit these values before running the notebook

# Root directory of the Bible corpus.
# It is expected to hold book_mappings.txt and versenames.txt, plus a
# text/ subfolder containing all the Bible translation texts.
# Keep the trailing slash: other cells build file paths by appending
# names directly to this string.
nicolai_path = (
    '/Users/laura/embedding-spaces/embedding_datasets/'
    'multilingual/bible/nicolai/'
)

In [9]:
# Get list of all Bibles (one translation file name per line)
with open('all_bible_files.txt','r') as textFile:
    # Strip only the newline terminator: slicing off the last character
    # would truncate the final name when the file has no trailing newline.
    # Blank lines are skipped so they never turn into empty file names.
    fileNames = [name for name in (line.rstrip('\n') for line in textFile) if name]

In [10]:
# Read in book mappings
# Each non-blank line is parsed as "<book index> <book name ...>".
book_mappings = {} #key: book index, values: book names
with open(nicolai_path+'book_mappings.txt','r') as mappingsFile:
    for line in mappingsFile:
        # split() already discards the newline, so the last line keeps its
        # final character even when the file has no trailing newline.
        fields = line.split()
        if not fields:
            continue # tolerate blank lines instead of failing on fields[0]
        book_mappings[int(fields[0])] = ' '.join(fields[1:])

In [5]:
# Read in versenames
# Each line is a verse ID read as: 2 characters of book number,
# 3 characters of chapter number, and the remainder as the verse number.
versenames_list = [] #(book,chapter,verse), one entry per line of the file
with open(nicolai_path+'versenames.txt','r') as verseFile:
    for line in verseFile:
        # Strip only the newline terminator. Dropping the last character
        # unconditionally would cut a digit off the final verse number when
        # the file does not end with a newline.
        verse_id = line.rstrip('\n')
        versenames_list.append((int(verse_id[:2]),int(verse_id[2:5]),int(verse_id[5:])))

#key = line number
#value = (book,chapter,verse)
#remove all values with chapter <= 0
#remove all values with verse <=0
#remove all books that are not in protestant Bible
# Line numbers are kept from the full list (not renumbered after filtering)
# because later cells look verses up by their line position in each Bible file.
versenames = {lineNum:(book,chapter,verse)
              for lineNum,(book,chapter,verse) in enumerate(versenames_list)
              if chapter>0 and book!=0 and book<=66 and verse>0}

In [6]:
# Only take verses that are in the KJV Bible
# Line N of the KJV file is matched against line N of versenames.txt
# through the keys of versenames.
all_verses_list = [] # List of (book,chapter,verse) tuples in KJV
with open(nicolai_path+'text/eng-x-bible-kingjames-v1.txt','r') as bibleFile:
    for lineNum, line in enumerate(bibleFile):
        # Strip only the newline so a final line without one is not shortened.
        # Empty lines are skipped (presumably verses this translation lacks).
        if not line.rstrip('\n'):
            continue
        if lineNum in versenames: #we want to count this verse
            all_verses_list.append(versenames[lineNum])

In [7]:
# Turn into dictionary
# key: line number of verse, values: (book,chapter,verse) tuples
# Membership is tested against a set: scanning the list once per entry
# makes this cell quadratic in the number of verses.
kjv_verses = set(all_verses_list)
versenames = {lineNum:verseRef for (lineNum,verseRef) in versenames.items() if \
             verseRef in kjv_verses}
all_verses = len(versenames) # Number of verses in KJV
print(all_verses)

31101


In [None]:
# Calculate what percentage of the Bible each file contains
# all_percentages[k] is the fraction (verses found / all_verses) for
# fileNames[k]; entries are appended in the same order as fileNames.
# NOTE(review): files are opened with the platform default encoding -- confirm
# this is suitable for every translation on the machine running the notebook.
all_percentages = []
for fileName in tqdm_notebook(fileNames):
    with open(nicolai_path+'text/'+fileName,'r') as bibleFile:
        # Count the non-empty lines whose line number is a tracked verse.
        # Only the newline is stripped, so a final line without a trailing
        # newline is not shortened before the emptiness check.
        totalVerses = sum(1 for lineNum, line in enumerate(bibleFile)
                          if lineNum in versenames and line.rstrip('\n'))
    all_percentages.append(totalVerses/all_verses)

In [14]:
# Translations whose coverage fraction is exactly 1, i.e. 100% complete
complete_bibles = [
    name
    for position, name in enumerate(fileNames)
    if all_percentages[position] == 1
]
len(complete_bibles)

46

In [17]:
# Show the file names of the translations collected in complete_bibles
print(complete_bibles)

['aln-x-bible-aln-v1.txt', 'arb-x-bible-arb-v1.txt', 'azb-x-bible-azb-v1.txt', 'cac-x-bible-ixtatan-v1.txt', 'ceb-x-bible-bugna2009-v1.txt', 'ceb-x-bible-pinadayag-v1.txt', 'ces-x-bible-kralicka-v1.txt', 'dan-x-bible-1931-v1.txt', 'deu-x-bible-elberfelder1871-v1.txt', 'deu-x-bible-elberfelder1905-v1.txt', 'deu-x-bible-luther1545-v1.txt', 'deu-x-bible-luther1912-v1.txt', 'ell-x-bible-modern2009-v1.txt', 'eng-x-bible-kingjames-v1.txt', 'eng-x-bible-literal-v1.txt', 'eng-x-bible-newsimplified-v1.txt', 'epo-x-bible-epo-v1.txt', 'fin-x-bible-1766-v1.txt', 'fin-x-bible-1992-v1.txt', 'fra-x-bible-bonnet-v1.txt', 'fra-x-bible-darby-v1.txt', 'fra-x-bible-louissegond-v1.txt', 'fra-x-bible-pirotclamer-v1.txt', 'gur-x-bible-frafra-v1.txt', 'hun-x-bible-karoli-v1.txt', 'ita-x-bible-diodati-v1.txt', 'ita-x-bible-nuovadiodati1991-v1.txt', 'ita-x-bible-riveduta-v1.txt', 'kek-x-bible-1988-v1.txt', 'may-ZLMAVB.txt', 'mri-x-bible-mri-v1.txt', 'nld-x-bible-nld-v1.txt', 'por-x-bible-almeidaatualizada-v1.tx

In [15]:
# Bibles that are at least 75% complete
# ">=" (not ">") so that a translation at exactly 75% is included,
# matching the "at least" wording above.
almost_complete_bibles = [
    name
    for position, name in enumerate(fileNames)
    if all_percentages[position] >= .75
]
len(almost_complete_bibles)

162

In [18]:
# Show the file names of the translations collected in almost_complete_bibles
print(almost_complete_bibles)

['afr-x-bible-1953-v1.txt', 'aln-x-bible-aln-v1.txt', 'arb-ARBIBS.txt', 'arb-x-bible-arb-v1.txt', 'arz-x-bible-arz-v1.txt', 'ayr-AYMBSB.txt', 'ayr-x-bible-1997-v1.txt', 'ayr-x-bible-2011-v1.txt', 'azb-x-bible-azb-v1.txt', 'azj-AZEBSA.txt', 'bba-BBABSB.txt', 'bba-x-bible-bba-v1.txt', 'ben-x-bible-common-v1.txt', 'ben-x-bible-mussolmani-v1.txt', 'bqc-BQCSIM.txt', 'bqc-x-bible-bqc-v1.txt', 'bul-x-bible-bul-v1.txt', 'bul-x-bible-veren-v1.txt', 'cac-x-bible-ixtatan-v1.txt', 'cak-x-bible-central2003-v1.txt', 'ceb-x-bible-bugna2009-v1.txt', 'ceb-x-bible-bugna-v1.txt', 'ceb-x-bible-godsword-v1.txt', 'ceb-x-bible-pinadayag-v1.txt', 'ces-x-bible-ekumenicky-v1.txt', 'ces-x-bible-kralicka-v1.txt', 'che-CHEIBT.txt', 'cme-CNHBSM.txt', 'cmn-x-bible-sf_ncv-zefania-v1.txt', 'cnh-x-bible-cnh-v1.txt', 'crh-CRHIBT.txt', 'cym-x-bible-colloquial2013-v1.txt', 'cym-x-bible-morgan1804-v1.txt', 'dan-x-bible-1931-v1.txt', 'deu-x-bible-elberfelder1871-v1.txt', 'deu-x-bible-elberfelder1905-v1.txt', 'deu-x-bible-fr

In [16]:
# Number of verse entries currently held in versenames
tracked_verse_count = len(versenames)
print(tracked_verse_count)
# 75% of that count, i.e. the verse count corresponding to the 75% mark
print(0.75*tracked_verse_count)

31101
23325.75
