# Bible CSV Generator:

A Python script for converting a folder with .txt files for each Bible chapter into a single .csv file

By Kenneth Burchfiel

Script released under the MIT license; original .txt files and resulting .csv file are both in the public domain

In [1]:
import time
start_time = time.time()
from datetime import datetime, timezone
import pandas as pd
import numpy as np
import os

Note: The original source of the files in the Bible_Chapters folder was https://ebible.org/Scriptures/details.php?id=eng-web-c; this link contains various downloadable versions of the World English Bible (Catholic Edition). I downloaded the 'Plain text canon only chapter files' as a .zip file and then extracted the contents. 

I noticed that Genesis was missing, so I downloaded an additional version of the World English Bible from https://ebible.org/find/show.php?id=eng-web and then copied the Genesis chapters into the Catholic World English Bible chapters folder. These files start with 'eng-web_' instead of 'eng-web-c_', so they sort after the other files by name (which is why the first file name shown below belongs to Exodus rather than Genesis).

In [2]:
# Collecting the names of all chapter files in alphabetical order, then
# previewing the first one:
bible_chapters_folder = sorted(
    os.listdir('eng-web-c_readaloud_with_additions/Bible_Chapters'))
bible_chapters_folder[0]

'eng-web-c_003_EXO_01_read.txt'

In [3]:
def bible_chapter_txt_file_to_df(
        i, chapters_dir='eng-web-c_readaloud_with_additions/Bible_Chapters',
        file_names=None):
    '''Convert one Bible chapter .txt file into a DataFrame of its verses.

    The file is expected to contain two header lines followed by one verse
    per line. Its name is expected to contain at least four
    underscore-separated parts, with the book order, book name, and chapter
    in the 2nd, 3rd, and 4th positions
    (e.g. 'eng-web-c_003_EXO_01_read.txt').

    i: the index of the file to open within file_names.
    chapters_dir: the folder that contains the chapter files.
    file_names: the list of chapter file names. When None (the default), the
    module-level bible_chapters_folder list is used.

    Returns a DataFrame with the columns Book_Order, Book_Name,
    Chapter_Name (still a string at this point), Verse_#, and Verse.'''

    if file_names is None:
        file_names = bible_chapters_folder
    file_name = file_names[i]

    # The file names contain book, book order, and chapter information, so
    # we'll extract that data here for use within our .csv file:
    name_parts = file_name.split('_')
    book_order = int(name_parts[1])
    book_name = name_parts[2]
    chapter_name = name_parts[3]

    # The encoding is specified explicitly because the verses contain
    # non-ASCII characters (such as curly quotes) that would be garbled on
    # systems whose default encoding isn't UTF-8.
    with open(os.path.join(chapters_dir, file_name), encoding='utf-8') as file:
        # [2:] excludes the 1st two lines (as these don't include actual
        # verses); strip() removes extra spaces and linebreaks:
        chapter = [verse.strip() for verse in file.readlines()[2:]]

    # Creating a DataFrame that contains both the chapter's verses and
    # information about the book and chapter.
    # I'm assuming that each line in the original text files represents its
    # own verse, and some quick spot checks indicated that this was the case.
    # However, these may not actually be the real verse numbers of each
    # chapter.
    # I also subtract 1 from the book order values because they appeared
    # to be 1 higher than the actual values. (For instance, Genesis had an
    # order of 2 rather than 1.)
    df_chapter = pd.DataFrame(data={
        'Book_Order': book_order - 1,
        'Book_Name': book_name,
        'Chapter_Name': chapter_name,
        'Verse_#': np.arange(1, len(chapter) + 1),
        'Verse': chapter})
    return df_chapter

The following cell calls bible_chapter_txt_file_to_df() for each file within the Bible Chapters folder, then concatenates the resulting DataFrames into a single DataFrame.

In [4]:
# Building one DataFrame per chapter file, then stacking them into a single
# DataFrame. (The index is not reset here, so each chapter's rows keep
# their own 0-based index values for now.)
chapter_dfs = [bible_chapter_txt_file_to_df(file_index)
               for file_index, _ in enumerate(bible_chapters_folder)]
df_Bible = pd.concat(chapter_dfs)
df_Bible

Unnamed: 0,Book_Order,Book_Name,Chapter_Name,Verse_#,Verse
0,2,EXO,01,1,"Now these are the names of the sons of Israel,..."
1,2,EXO,01,2,"Reuben, Simeon, Levi, and Judah,"
2,2,EXO,01,3,"Issachar, Zebulun, and Benjamin,"
3,2,EXO,01,4,"Dan and Naphtali, Gad and Asher."
4,2,EXO,01,5,All the souls who came out of Jacob’s body wer...
...,...,...,...,...,...
21,1,GEN,50,22,"Joseph lived in Egypt, he, and his father’s ho..."
22,1,GEN,50,23,Joseph saw Ephraim’s children to the third gen...
23,1,GEN,50,24,"Joseph said to his brothers, “I am dying, but ..."
24,1,GEN,50,25,Joseph took an oath from the children of Israe...


In [5]:
# The chapter values were parsed from the file names as strings; converting
# them to integers lets them sort numerically:
df_Bible['Chapter_Name'] = df_Bible['Chapter_Name'].astype('int')
# Now that chapter names are in integer form, we can use Book_Order,
# Chapter_Name, and Verse_# values to sort the DataFrame. (Verse_# is
# included so that the verse order within each chapter doesn't depend on
# how the sort treats ties.)
df_Bible.sort_values(['Book_Order', 'Chapter_Name', 'Verse_#'], inplace = True)
df_Bible

Unnamed: 0,Book_Order,Book_Name,Chapter_Name,Verse_#,Verse
0,1,GEN,1,1,"In the beginning, God created the heavens and ..."
1,1,GEN,1,2,The earth was formless and empty. Darkness was...
2,1,GEN,1,3,"God said, “Let there be light,” and there was ..."
3,1,GEN,1,4,"God saw the light, and saw that it was good. G..."
4,1,GEN,1,5,"God called the light “day”, and the darkness h..."
...,...,...,...,...,...
16,95,REV,22,17,"The Spirit and the bride say, “Come!” He who h..."
17,95,REV,22,18,I testify to everyone who hears the words of t...
18,95,REV,22,19,If anyone takes away from the words of the boo...
19,95,REV,22,20,"He who testifies these things says, “Yes, I am..."


In [6]:
# A column that stores both book and chapter information will prove 
# useful when visualizing data by chapter.
# The membership check allows this cell to be re-run: DataFrame.insert()
# raises a ValueError if the column already exists.
if 'Book_and_Chapter' not in df_Bible.columns:
    df_Bible.insert(3, 'Book_and_Chapter', df_Bible['Book_Name'] + ' ' + df_Bible['Chapter_Name'].astype('str'))

df_Bible

Unnamed: 0,Book_Order,Book_Name,Chapter_Name,Book_and_Chapter,Verse_#,Verse
0,1,GEN,1,GEN 1,1,"In the beginning, God created the heavens and ..."
1,1,GEN,1,GEN 1,2,The earth was formless and empty. Darkness was...
2,1,GEN,1,GEN 1,3,"God said, “Let there be light,” and there was ..."
3,1,GEN,1,GEN 1,4,"God saw the light, and saw that it was good. G..."
4,1,GEN,1,GEN 1,5,"God called the light “day”, and the darkness h..."
...,...,...,...,...,...,...
16,95,REV,22,REV 22,17,"The Spirit and the bride say, “Come!” He who h..."
17,95,REV,22,REV 22,18,I testify to everyone who hears the words of t...
18,95,REV,22,REV 22,19,If anyone takes away from the words of the boo...
19,95,REV,22,REV 22,20,"He who testifies these things says, “Yes, I am..."


## Adding in the position of each chapter relative to all Bible chapters:

In [7]:
# We can calculate these chapter positions via pd.factorize(), which numbers
# each distinct Book_and_Chapter value in order of first appearance. This
# relies on df_Bible already being sorted by book and chapter. Adding 1
# makes the positions start at 1 rather than 0.
# The membership check allows this cell to be re-run: DataFrame.insert()
# raises a ValueError if the column already exists.
if 'Chapter_Order' not in df_Bible.columns:
    df_Bible.insert(4, 'Chapter_Order', pd.factorize(df_Bible['Book_and_Chapter'])[0]+1)
df_Bible

Unnamed: 0,Book_Order,Book_Name,Chapter_Name,Book_and_Chapter,Chapter_Order,Verse_#,Verse
0,1,GEN,1,GEN 1,1,1,"In the beginning, God created the heavens and ..."
1,1,GEN,1,GEN 1,1,2,The earth was formless and empty. Darkness was...
2,1,GEN,1,GEN 1,1,3,"God said, “Let there be light,” and there was ..."
3,1,GEN,1,GEN 1,1,4,"God saw the light, and saw that it was good. G..."
4,1,GEN,1,GEN 1,1,5,"God called the light “day”, and the darkness h..."
...,...,...,...,...,...,...,...
16,95,REV,22,REV 22,1328,17,"The Spirit and the bride say, “Come!” He who h..."
17,95,REV,22,REV 22,1328,18,I testify to everyone who hears the words of t...
18,95,REV,22,REV 22,1328,19,If anyone takes away from the words of the boo...
19,95,REV,22,REV 22,1328,20,"He who testifies these things says, “Yes, I am..."



## Adding in the position of each verse relative to all Bible verses:

This idea came from a World English Bible .csv file that I downloaded from [Bible Supersearch](https://www.biblesupersearch.com/bible-downloads/). I ultimately opted not to use that .csv file, however, because I also wanted to include the books found in the WEB's Catholic edition.


In [8]:
# Putting every verse in book/chapter/verse order and replacing the repeated
# per-chapter index values with a single 0-based index:
verse_sort_keys = ['Book_Order', 'Chapter_Name', 'Verse_#']
df_Bible = df_Bible.sort_values(verse_sort_keys).reset_index(drop=True)
# Verse_Order numbers the verses from 1 through the total verse count. The
# membership check keeps this cell re-runnable, since insert() would fail
# if the column already existed.
if 'Verse_Order' not in df_Bible:
    df_Bible.insert(6, 'Verse_Order', np.arange(1, len(df_Bible) + 1))
df_Bible

Unnamed: 0,Book_Order,Book_Name,Chapter_Name,Book_and_Chapter,Chapter_Order,Verse_#,Verse_Order,Verse
0,1,GEN,1,GEN 1,1,1,1,"In the beginning, God created the heavens and ..."
1,1,GEN,1,GEN 1,1,2,2,The earth was formless and empty. Darkness was...
2,1,GEN,1,GEN 1,1,3,3,"God said, “Let there be light,” and there was ..."
3,1,GEN,1,GEN 1,1,4,4,"God saw the light, and saw that it was good. G..."
4,1,GEN,1,GEN 1,1,5,5,"God called the light “day”, and the darkness h..."
...,...,...,...,...,...,...,...,...
35374,95,REV,22,REV 22,1328,17,35375,"The Spirit and the bride say, “Come!” He who h..."
35375,95,REV,22,REV 22,1328,18,35376,I testify to everyone who hears the words of t...
35376,95,REV,22,REV 22,1328,19,35377,If anyone takes away from the words of the boo...
35377,95,REV,22,REV 22,1328,20,35378,"He who testifies these things says, “Yes, I am..."


In [9]:
# Replacing curly quotes with straight quotes. A translation table handles
# all four characters in a single pass over the column:
curly_to_straight_quotes = str.maketrans(
    {'“': '"', '”': '"', '‘': "'", '’': "'"})
df_Bible['Verse'] = df_Bible['Verse'].str.translate(curly_to_straight_quotes)

Confirming that the replace operation was successful (at least for instances of '’'):

In [10]:
# Displaying the text of the second row's verse:
df_Bible['Verse'].iloc[1]

"The earth was formless and empty. Darkness was on the surface of the deep and God's Spirit was hovering over the surface of the waters."

Calculating the number of characters in each verse:

In [11]:
# Series.str.len() computes every verse's length in one vectorized call,
# which is much faster than the following list comprehension:
# [len(df_Bible.iloc[i]['Verse']) for i in range(len(df_Bible))]
verse_lengths = df_Bible['Verse'].str.len()
df_Bible['Characters'] = verse_lengths

df_Bible

Unnamed: 0,Book_Order,Book_Name,Chapter_Name,Book_and_Chapter,Chapter_Order,Verse_#,Verse_Order,Verse,Characters
0,1,GEN,1,GEN 1,1,1,1,"In the beginning, God created the heavens and ...",56
1,1,GEN,1,GEN 1,1,2,2,The earth was formless and empty. Darkness was...,135
2,1,GEN,1,GEN 1,1,3,3,"God said, ""Let there be light,"" and there was ...",52
3,1,GEN,1,GEN 1,1,4,4,"God saw the light, and saw that it was good. G...",85
4,1,GEN,1,GEN 1,1,5,5,"God called the light ""day"", and the darkness h...",119
...,...,...,...,...,...,...,...,...,...
35374,95,REV,22,REV 22,1328,17,35375,"The Spirit and the bride say, ""Come!"" He who h...",160
35375,95,REV,22,REV 22,1328,18,35376,I testify to everyone who hears the words of t...,159
35376,95,REV,22,REV 22,1328,19,35377,If anyone takes away from the words of the boo...,174
35377,95,REV,22,REV 22,1328,20,35378,"He who testifies these things says, ""Yes, I am...",89


In [12]:
# Adding up the per-verse character counts to get the total number of
# characters in this version of the Bible:
total_characters = df_Bible['Characters'].sum()
total_characters

4507727

Exporting this DataFrame to a .csv file that can be used within the main Type the Bible program:

In [13]:
# index=False leaves the DataFrame's row index out of the exported file:
bible_csv_path = 'WEB_Catholic_Version.csv'
df_Bible.to_csv(bible_csv_path, index=False)

Creating a copy of this DataFrame that will be useful for our 'Type the Bible' game:

In [14]:
# Working on a copy so that the columns added for the game don't end up in
# df_Bible itself:
df_Bible_for_game = df_Bible.copy()
# The counter columns start at 0 for every verse, and Fastest_WPM starts
# out empty.
df_Bible_for_game['Typed'] = 0
df_Bible_for_game['Tests'] = 0
# np.nan is used rather than np.NaN because the np.NaN alias was removed
# in NumPy 2.0.
df_Bible_for_game['Fastest_WPM'] = np.nan
df_Bible_for_game['Characters_Typed'] = 0
df_Bible_for_game['Total_Characters_Typed'] = 0
df_Bible_for_game


Unnamed: 0,Book_Order,Book_Name,Chapter_Name,Book_and_Chapter,Chapter_Order,Verse_#,Verse_Order,Verse,Characters,Typed,Tests,Fastest_WPM,Characters_Typed,Total_Characters_Typed
0,1,GEN,1,GEN 1,1,1,1,"In the beginning, God created the heavens and ...",56,0,0,,0,0
1,1,GEN,1,GEN 1,1,2,2,The earth was formless and empty. Darkness was...,135,0,0,,0,0
2,1,GEN,1,GEN 1,1,3,3,"God said, ""Let there be light,"" and there was ...",52,0,0,,0,0
3,1,GEN,1,GEN 1,1,4,4,"God saw the light, and saw that it was good. G...",85,0,0,,0,0
4,1,GEN,1,GEN 1,1,5,5,"God called the light ""day"", and the darkness h...",119,0,0,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35374,95,REV,22,REV 22,1328,17,35375,"The Spirit and the bride say, ""Come!"" He who h...",160,0,0,,0,0
35375,95,REV,22,REV 22,1328,18,35376,I testify to everyone who hears the words of t...,159,0,0,,0,0
35376,95,REV,22,REV 22,1328,19,35377,If anyone takes away from the words of the boo...,174,0,0,,0,0
35377,95,REV,22,REV 22,1328,20,35378,"He who testifies these things says, ""Yes, I am...",89,0,0,,0,0


In [15]:
# The player's actual game data is stored in a separate, manually created
# copy of this file called 'WEB_Catholic_Version_for_game_updated.csv'.
# Because this script only writes to the file named below, running it won't
# overwrite any progress the player has already made within that copy.
game_csv_path = 'WEB_Catholic_Version_for_game.csv'
df_Bible_for_game.to_csv(game_csv_path, index=False)

In [16]:
# Measuring the time elapsed since start_time was set in the first code cell:
end_time = time.time()
runtime = end_time - start_time
# Printing the completion time as a timezone-aware UTC timestamp:
print(datetime.now(timezone.utc).isoformat())
# To show the time in your local timezone instead, you can use:
# print(datetime.now().astimezone().isoformat())
print(f"Finished running script in {round(runtime, 3)} seconds.")

2023-10-28T22:59:24.552474+00:00
Finished running script in 1.831 seconds.


Updating pre-existing WEB_Catholic_Version_for_game_updated.csv file with new version of df_Bible:

In [19]:
# df_Bible_for_game_updated = pd.read_csv('WEB_Catholic_version_for_game_updated.csv')

# df_Bible_for_game_updated = df_Bible.merge(df_Bible_for_game_updated[['Verse_Order', 'Typed', 'Tests', 'Fastest_WPM', 'Characters_Typed', 'Total_Characters_Typed']], on = 'Verse_Order')
# # df_Bible_for_game_updated.merge(df_Bible_for_game[[]], 
# df_Bible_for_game_updated.to_csv('WEB_Catholic_version_for_game_updated.csv', index = False)
# df_Bible_for_game_updated