In [1]:
import difflib

In [2]:
# Directory (relative path) that holds the plain-text input files.
doc_path = "../data/bibles/"

# Files to compare, in order: index 0 is the 1971 file, index 1 the 1977 file.
docs = [
    "NASB-1971-1Timothy.txt",
    "NASB-1977-1Timothy.txt",
]

In [3]:
# Read every file named in `docs` into memory, one string per file,
# in the same order as `docs`.
texts = []
for doc in docs:
    # Decode explicitly instead of relying on the platform default encoding:
    # the text contains non-ASCII punctuation (e.g. the curly apostrophe in
    # "God’s"), which a non-UTF-8 default would garble or reject.
    # NOTE(review): assumes the files are stored as UTF-8 — confirm.
    with open(doc_path + doc, encoding="utf-8") as f:
        content = f.read()
    texts.append(content)

In [4]:
# Number of characters read from the first file (docs[0]).
len(texts[0])

16090

In [5]:
# Number of characters read from the second file (docs[1]).
len(texts[1])

14232

In [6]:
# Report file name and character count for both files side by side.
print(
    "Book A ({}) has {} characters,\nBook B ({}) has {} characters".format(
        docs[0], len(texts[0]),
        docs[1], len(texts[1]),
    )
)

Book A (NASB-1971-1Timothy.txt) has 16090 characters,
Book B (NASB-1977-1Timothy.txt) has 14232 characters


In [7]:
# Display the raw contents of the first file as one string (newlines shown as \n).
texts[0]

'THE FIRST EPISTLE OF PAUL TO TIMOTHY\n\n\nSalutation. Charge Respecting Misuse of the Law. Personal Thanksgiving.\n\n1 Paul, an apostle of Christ Jesus according to the commandment of God our Savior, and of Christ Jesus, who is our hope;\n2 to Timothy, my true child in the faith: Grace, mercy and peace from God the Father and Christ Jesus our Lord.\n3 As I urged you (1) upon my departure for Macedonia, (2) remain on at Ephesus, in order that you may instruct certain men not to teach strange doctrines,\n4 nor to (1) pay attention to myths and endless genealogies, which give rise to mere speculation rather than furthering (2) God’s provision which is by faith.\n5 But the goal of our (5) instruction is love from a pure heart and a good conscience and a sincere faith.\n6 For some men, straying from these things, have turned aside to fruitless discussion,\n7 wanting to be teachers of the Law, even though they do not understand either what they are saying or the matters about which they mak

In [8]:
# Break each document into a list of lines.
# `texts` is rebound here from whole-file strings to lists of lines, so running
# this cell a second time used to fail with AttributeError (a list has no
# .split). Entries that are already split are passed through unchanged, which
# keeps the cell safe to re-run while later cells still read `texts`.
texts = [
    text.split('\n') if isinstance(text, str) else text
    for text in texts
]
print(len(texts))
texts

2


[['THE FIRST EPISTLE OF PAUL TO TIMOTHY',
  '',
  '',
  'Salutation. Charge Respecting Misuse of the Law. Personal Thanksgiving.',
  '',
  '1 Paul, an apostle of Christ Jesus according to the commandment of God our Savior, and of Christ Jesus, who is our hope;',
  '2 to Timothy, my true child in the faith: Grace, mercy and peace from God the Father and Christ Jesus our Lord.',
  '3 As I urged you (1) upon my departure for Macedonia, (2) remain on at Ephesus, in order that you may instruct certain men not to teach strange doctrines,',
  '4 nor to (1) pay attention to myths and endless genealogies, which give rise to mere speculation rather than furthering (2) God’s provision which is by faith.',
  '5 But the goal of our (5) instruction is love from a pure heart and a good conscience and a sincere faith.',
  '6 For some men, straying from these things, have turned aside to fruitless discussion,',
  '7 wanting to be teachers of the Law, even though they do not understand either what they 

In [9]:
# Number of lines in each document, one count per output line.
print(len(texts[0]), len(texts[1]), sep='\n')

224
142


In [10]:
# Filter so only numbered verses remain

import re

# A verse line begins with its verse number followed by a space. The number
# needs at least one digit (`+`): with `*` the digits were optional, so any
# line that merely started with a space also passed the filter.
# (.match() already anchors at the start of the line, so no '^' is needed.)
VERSE_LINE = re.compile(r'[0-9]+ ')

# One list of verse lines per document, in the same order as `texts`.
verses = [
    [line for line in text if VERSE_LINE.match(line)]
    for text in texts
]

verses

[['1 Paul, an apostle of Christ Jesus according to the commandment of God our Savior, and of Christ Jesus, who is our hope;',
  '2 to Timothy, my true child in the faith: Grace, mercy and peace from God the Father and Christ Jesus our Lord.',
  '3 As I urged you (1) upon my departure for Macedonia, (2) remain on at Ephesus, in order that you may instruct certain men not to teach strange doctrines,',
  '4 nor to (1) pay attention to myths and endless genealogies, which give rise to mere speculation rather than furthering (2) God’s provision which is by faith.',
  '5 But the goal of our (5) instruction is love from a pure heart and a good conscience and a sincere faith.',
  '6 For some men, straying from these things, have turned aside to fruitless discussion,',
  '7 wanting to be teachers of the Law, even though they do not understand either what they are saying or the matters about which they make confident assertions.',
  '8 But we know that the Law is good, if one uses it lawfully,',

In [11]:
# Standard-library differ that compares two sequences of text lines.
d = difflib.Differ()

In [12]:
# Diff the verse lines of the first document against those of the second and
# materialise the generator so the result can be inspected more than once.
result = [*d.compare(verses[0], verses[1])]

In [13]:
# pprint wraps the long diff strings so each entry is readable.
from pprint import pprint

# Show the raw diff between the two verse lists.
pprint(result)

['- 1 Paul, an apostle of Christ Jesus according to the commandment of God our '
 'Savior, and of Christ Jesus, who is our hope;',
 '?    ^^^\n',
 '+ 1 PAUL, an apostle of Christ Jesus according to the commandment of God our '
 'Savior, and of Christ Jesus, who is our hope;',
 '?    ^^^\n',
 '  2 to Timothy, my true child in the faith: Grace, mercy and peace from God '
 'the Father and Christ Jesus our Lord.',
 '- 3 As I urged you (1) upon my departure for Macedonia, (2) remain on at '
 'Ephesus, in order that you may instruct certain men not to teach strange '
 'doctrines,',
 '?                 ----                                 ----\n',
 '+ 3 As I urged you upon my departure for Macedonia, remain on at Ephesus, in '
 'order that you may instruct certain men not to teach strange doctrines,',
 '- 4 nor to (1) pay attention to myths and endless genealogies, which give '
 'rise to mere speculation rather than furthering (2) God’s provision which is '
 'by faith.',
 '?         '
 '---- 

In [21]:
# What if we remove all the foot note keys?

def clean_verse(line):
    """Normalise one verse line so that only wording differences remain.

    Removes numeric footnote keys such as "(1)", drops every character that is
    not an ASCII letter, digit or space, lower-cases the text and collapses
    whitespace.

    Args:
        line: one verse line as a string.

    Returns:
        The cleaned line, with single spaces between words and no leading or
        trailing whitespace.
    """
    # Raw string: '\(' inside a plain literal is an invalid escape sequence.
    line = re.sub(r'\([0-9]*\)', '', line)
    # Also removes newlines, tabs and punctuation (anything but [A-Za-z0-9 ]).
    line = re.sub('[^A-Za-z0-9 ]+', '', line)
    line = line.lower()
    # Collapse whitespace LAST. Replacing only '  ' handled pairs of spaces but
    # not longer runs, and doing it before punctuation was stripped let
    # removed punctuation leave new double or leading spaces behind.
    return ' '.join(line.split())


# NOTE(review): the diff output still shows tokens such as 'dtaught'; these may
# be lettered footnote markers that the numeric pattern above does not cover —
# confirm against the source files.
cleaned_verses = [[clean_verse(line) for line in text] for text in verses]

cleaned_verses

[['1 paul an apostle of christ jesus according to the commandment of god our savior and of christ jesus who is our hope',
  '2 to timothy my true child in the faith grace mercy and peace from god the father and christ jesus our lord',
  '3 as i urged you upon my departure for macedonia remain on at ephesus in order that you may instruct certain men not to teach strange doctrines',
  '4 nor to pay attention to myths and endless genealogies which give rise to mere speculation rather than furthering gods provision which is by faith',
  '5 but the goal of our instruction is love from a pure heart and a good conscience and a sincere faith',
  '6 for some men straying from these things have turned aside to fruitless discussion',
  '7 wanting to be teachers of the law even though they do not understand either what they are saying or the matters about which they make confident assertions',
  '8 but we know that the law is good if one uses it lawfully',
  '9 realizing the fact that law is not m

In [22]:
# Fresh differ for the cleaned text.
d2 = difflib.Differ()

# Diff the cleaned verses of the first document against the second.
# This rebinds `result`, replacing the diff of the uncleaned verses.
result = [*d2.compare(cleaned_verses[0], cleaned_verses[1])]

In [23]:
# Show the diff of the cleaned verses.
pprint(result)

['  1 paul an apostle of christ jesus according to the commandment of god our '
 'savior and of christ jesus who is our hope',
 '  2 to timothy my true child in the faith grace mercy and peace from god the '
 'father and christ jesus our lord',
 '  3 as i urged you upon my departure for macedonia remain on at ephesus in '
 'order that you may instruct certain men not to teach strange doctrines',
 '- 4 nor to pay attention to myths and endless genealogies which give rise to '
 'mere speculation rather than furthering gods provision which is by faith',
 '?                                                                                                                       '
 '-----------\n',
 '+ 4 nor to pay attention to myths and endless genealogies which give rise to '
 'mere speculation rather than furthering the administration of god which is '
 'by faith',
 '?                                                                                                                    '
 '+++++

In [24]:
# Keep only the diff entries that are not marked as unchanged. Differ prefixes
# lines common to both inputs with two spaces, so everything else stays
# (the '- ', '+ ' and '? ' entries seen in the output).
trouble_verses = [entry for entry in result if not entry.startswith('  ')]

In [25]:
# Show only the diff entries that differ between the two documents.
pprint(trouble_verses)

['- 4 nor to pay attention to myths and endless genealogies which give rise to '
 'mere speculation rather than furthering gods provision which is by faith',
 '?                                                                                                                       '
 '-----------\n',
 '+ 4 nor to pay attention to myths and endless genealogies which give rise to '
 'mere speculation rather than furthering the administration of god which is '
 'by faith',
 '?                                                                                                                    '
 '++++++++++++++++++++++\n',
 '- 18 this command i entrust to you timothy my son in',
 '+ 18 this command i entrust to you timothy my son in accordance with the '
 'prophecies previously made concerning you that by them you may fight the '
 'good fight',
 '- 20 among these are hymenaeus and alexander whom i have delivered over to '
 'satan so that they may be dtaught not to blaspheme',
 '?              