# REL560-VersionedBibles

## Cosine Similarity

In [1]:
import os
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
### Load texts into a dataframe (written as list, shorter version)

# Create function that changes the text

def preprocessing_function(content):
#   lines of code here that change the text
    content = content.lower()
    lines = content.split('\n')
    
    verses = []
    for line in lines:
        if re.match('^[0-9]* ', line):
            verses.append(line)
        else:
            pass
    for line in verses:
        line = re.sub('\([0-9]*\)', '', line)
        line = re.sub('  ', ' ', line)
        line = re.sub('\n', '', line)
        line = line.strip()
        line = re.sub('[^A-Za-z0-9 ]+', '', line)
        
    
    content = " ".join(verses)
    
    
    return content


source_dir = "../data/analysis_data/bibles-txt-ft-cleaned/"

books = ['1Timothy', 'Ephesians']

texts = []


for book in books:
    for filename in os.listdir(source_dir):
        if filename.endswith(book+".txt"):
            with open(os.path.join(source_dir, filename), 'r') as obit:
                content = obit.read()
#               Apply cleaned content code
                cleaned_content = preprocessing_function(content)
            texts.append(
                {
                    "doc_id": filename,
                    "book": book,
                    "text": cleaned_content
                }
            )
        else:
            pass

texts_df = pd.DataFrame(texts)
texts_df.head(10)

Unnamed: 0,doc_id,book,text
0,NASB-2020-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
1,NASB-1977-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
2,NASB-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
3,ERV-1881-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
4,NASB-1995-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
5,RSV-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
6,ASV-1900-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
7,RSV-1946-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
8,NRSV-1989-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by the comm..."
9,ERV-1881-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus through the..."


## Cosine Similarity - 1Timothy

In [3]:
### Set book to 1Timothy

timothy_df = texts_df[texts_df['book'] == "1Timothy"]
texts_df[texts_df['book'] == "1Timothy"]

Unnamed: 0,doc_id,book,text
0,NASB-2020-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
1,NASB-1977-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
2,NASB-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
3,ERV-1881-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
4,NASB-1995-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
5,RSV-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
6,ASV-1900-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
7,RSV-1946-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
8,NRSV-1989-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by the comm..."


In [4]:
timothy_df.reset_index(inplace=True)
timothy_df

Unnamed: 0,index,doc_id,book,text
0,0,NASB-2020-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
1,1,NASB-1977-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
2,2,NASB-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
3,3,ERV-1881-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
4,4,NASB-1995-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
5,5,RSV-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
6,6,ASV-1900-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
7,7,RSV-1946-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
8,8,NRSV-1989-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by the comm..."


In [5]:
# Create Tf-IDF Vector Representaiton

vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer.fit_transform(timothy_df['text'])

print (tfidf_matrix.shape)

(9, 1297)


In [6]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.98382916 0.98369742 0.89293084 0.98905153 0.96017234
  0.89595162 0.95821667 0.96348024]
 [0.98382916 1.         0.99974682 0.90272342 0.99573744 0.96772054
  0.90555847 0.96612344 0.96308238]
 [0.98369742 0.99974682 1.         0.90246857 0.99547039 0.9675837
  0.90528307 0.96610771 0.9629358 ]
 [0.89293084 0.90272342 0.90246857 1.         0.8994457  0.91252901
  0.9965877  0.90884892 0.90383236]
 [0.98905153 0.99573744 0.99547039 0.8994457  1.         0.96618106
  0.90242304 0.96438296 0.96343146]
 [0.96017234 0.96772054 0.9675837  0.91252901 0.96618106 1.
  0.91496463 0.99685117 0.97798604]
 [0.89595162 0.90555847 0.90528307 0.9965877  0.90242304 0.91496463
  1.         0.91136581 0.90689588]
 [0.95821667 0.96612344 0.96610771 0.90884892 0.96438296 0.99685117
  0.91136581 1.         0.9778599 ]
 [0.96348024 0.96308238 0.9629358  0.90383236 0.96343146 0.97798604
  0.90689588 0.9778599  1.        ]]


In [7]:
corr_df = pd.DataFrame(cosine_sim)
corr_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.983829,0.983697,0.892931,0.989052,0.960172,0.895952,0.958217,0.96348
1,0.983829,1.0,0.999747,0.902723,0.995737,0.967721,0.905558,0.966123,0.963082
2,0.983697,0.999747,1.0,0.902469,0.99547,0.967584,0.905283,0.966108,0.962936
3,0.892931,0.902723,0.902469,1.0,0.899446,0.912529,0.996588,0.908849,0.903832
4,0.989052,0.995737,0.99547,0.899446,1.0,0.966181,0.902423,0.964383,0.963431
5,0.960172,0.967721,0.967584,0.912529,0.966181,1.0,0.914965,0.996851,0.977986
6,0.895952,0.905558,0.905283,0.996588,0.902423,0.914965,1.0,0.911366,0.906896
7,0.958217,0.966123,0.966108,0.908849,0.964383,0.996851,0.911366,1.0,0.97786
8,0.96348,0.963082,0.962936,0.903832,0.963431,0.977986,0.906896,0.97786,1.0


In [8]:
import matplotlib.pyplot as plt

corr_df.style.background_gradient(cmap ='viridis')\
        .set_properties(**{'font-size': '8px'})

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.983829,0.983697,0.892931,0.989052,0.960172,0.895952,0.958217,0.96348
1,0.983829,1.0,0.999747,0.902723,0.995737,0.967721,0.905558,0.966123,0.963082
2,0.983697,0.999747,1.0,0.902469,0.99547,0.967584,0.905283,0.966108,0.962936
3,0.892931,0.902723,0.902469,1.0,0.899446,0.912529,0.996588,0.908849,0.903832
4,0.989052,0.995737,0.99547,0.899446,1.0,0.966181,0.902423,0.964383,0.963431
5,0.960172,0.967721,0.967584,0.912529,0.966181,1.0,0.914965,0.996851,0.977986
6,0.895952,0.905558,0.905283,0.996588,0.902423,0.914965,1.0,0.911366,0.906896
7,0.958217,0.966123,0.966108,0.908849,0.964383,0.996851,0.911366,1.0,0.97786
8,0.96348,0.963082,0.962936,0.903832,0.963431,0.977986,0.906896,0.97786,1.0


In [9]:
# Move from matrix into tidy data

pairs = corr_df.unstack().reset_index()
# pairs _df.columns = ['doc_A', 'doc_B', 'similarity_measure']
pairs_df = pd.DataFrame(pairs)
pairs_df.columns = ['Doc_A', 'Doc_B', 'Similarity_Score']
    
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
0,0,0,1.000000
1,0,1,0.983829
2,0,2,0.983697
3,0,3,0.892931
4,0,4,0.989052
...,...,...,...
76,8,4,0.963431
77,8,5,0.977986
78,8,6,0.906896
79,8,7,0.977860


In [10]:
# Clean out rows where maching self

pairs_df = pairs_df[pairs_df['Doc_A'] != pairs_df['Doc_B']]
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.983829
2,0,2,0.983697
3,0,3,0.892931
4,0,4,0.989052
5,0,5,0.960172
...,...,...,...
75,8,3,0.903832
76,8,4,0.963431
77,8,5,0.977986
78,8,6,0.906896


In [11]:
# Get unique pairs (we don't want each thing twice; no duplicates)

unique_pairs = pairs_df.loc[pd.DataFrame(np.sort(pairs_df[['Doc_A', 'Doc_B']], 1), index=pairs_df.index).drop_duplicates
(keep='first').index]

unique_pairs

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.983829
2,0,2,0.983697
3,0,3,0.892931
4,0,4,0.989052
5,0,5,0.960172
6,0,6,0.895952
7,0,7,0.958217
8,0,8,0.96348
11,1,2,0.999747
12,1,3,0.902723


In [12]:
# Get most and least similar documents

def get_top_docs(sim_df, metadata, num_docs=10, rank='top'):
    sorted = sim_df.sort_values('Similarity_Score', ascending=True)
    if rank == 'top':
        sliced = sorted.tail(num_docs)
    elif rank == 'bottom':
        sliced = sorted.head(num_docs)
    else:
        return "Please use 'top' or 'bottom' for rank variable"
    
    sliced_named = sliced.merge(metadata, how="left", left_on = "Doc_A", right_on="index").merge(metadata, how='left',
    left_on="Doc_B", right_on="index")
    sliced_named.columns = ['Doc_A', 'Doc_B', 'Similarity_Score', 'Index', 'Doc_A_ID', "IndexB", "Doc_B_ID"]
    
    print(sliced_named)
    
    top_docs_df = sliced_named[['Similarity_Score', 'Doc_A_ID', 'Doc_B_ID']]
    
    return top_docs_df

### 1Timothy - Least Similar

In [13]:
## Get least similar (bottom)

get_top_docs(unique_pairs, timothy_df[['index', 'doc_id']], rank='bottom')

   Doc_A  Doc_B  Similarity_Score  Index                Doc_A_ID  IndexB  \
0      0      3          0.892931      0  NASB-2020-1Timothy.txt       3   
1      0      6          0.895952      0  NASB-2020-1Timothy.txt       6   
2      3      4          0.899446      3   ERV-1881-1Timothy.txt       4   
3      4      6          0.902423      4  NASB-1995-1Timothy.txt       6   
4      2      3          0.902469      2  NASB-1971-1Timothy.txt       3   
5      1      3          0.902723      1  NASB-1977-1Timothy.txt       3   
6      3      8          0.903832      3   ERV-1881-1Timothy.txt       8   
7      2      6          0.905283      2  NASB-1971-1Timothy.txt       6   
8      1      6          0.905558      1  NASB-1977-1Timothy.txt       6   
9      6      8          0.906896      6   ASV-1900-1Timothy.txt       8   

                 Doc_B_ID  
0   ERV-1881-1Timothy.txt  
1   ASV-1900-1Timothy.txt  
2  NASB-1995-1Timothy.txt  
3   ASV-1900-1Timothy.txt  
4   ERV-1881-1Timothy.t

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.892931,NASB-2020-1Timothy.txt,ERV-1881-1Timothy.txt
1,0.895952,NASB-2020-1Timothy.txt,ASV-1900-1Timothy.txt
2,0.899446,ERV-1881-1Timothy.txt,NASB-1995-1Timothy.txt
3,0.902423,NASB-1995-1Timothy.txt,ASV-1900-1Timothy.txt
4,0.902469,NASB-1971-1Timothy.txt,ERV-1881-1Timothy.txt
5,0.902723,NASB-1977-1Timothy.txt,ERV-1881-1Timothy.txt
6,0.903832,ERV-1881-1Timothy.txt,NRSV-1989-1Timothy.txt
7,0.905283,NASB-1971-1Timothy.txt,ASV-1900-1Timothy.txt
8,0.905558,NASB-1977-1Timothy.txt,ASV-1900-1Timothy.txt
9,0.906896,ASV-1900-1Timothy.txt,NRSV-1989-1Timothy.txt


### 1Timothy - Most Similar

In [14]:
## Get most similar (top)

get_top_docs(unique_pairs, timothy_df[['index', 'doc_id']], rank='top')

   Doc_A  Doc_B  Similarity_Score  Index                Doc_A_ID  IndexB  \
0      7      8          0.977860      7   RSV-1946-1Timothy.txt       8   
1      5      8          0.977986      5   RSV-1971-1Timothy.txt       8   
2      0      2          0.983697      0  NASB-2020-1Timothy.txt       2   
3      0      1          0.983829      0  NASB-2020-1Timothy.txt       1   
4      0      4          0.989052      0  NASB-2020-1Timothy.txt       4   
5      2      4          0.995470      2  NASB-1971-1Timothy.txt       4   
6      1      4          0.995737      1  NASB-1977-1Timothy.txt       4   
7      3      6          0.996588      3   ERV-1881-1Timothy.txt       6   
8      5      7          0.996851      5   RSV-1971-1Timothy.txt       7   
9      1      2          0.999747      1  NASB-1977-1Timothy.txt       2   

                 Doc_B_ID  
0  NRSV-1989-1Timothy.txt  
1  NRSV-1989-1Timothy.txt  
2  NASB-1971-1Timothy.txt  
3  NASB-1977-1Timothy.txt  
4  NASB-1995-1Timothy.t

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.97786,RSV-1946-1Timothy.txt,NRSV-1989-1Timothy.txt
1,0.977986,RSV-1971-1Timothy.txt,NRSV-1989-1Timothy.txt
2,0.983697,NASB-2020-1Timothy.txt,NASB-1971-1Timothy.txt
3,0.983829,NASB-2020-1Timothy.txt,NASB-1977-1Timothy.txt
4,0.989052,NASB-2020-1Timothy.txt,NASB-1995-1Timothy.txt
5,0.99547,NASB-1971-1Timothy.txt,NASB-1995-1Timothy.txt
6,0.995737,NASB-1977-1Timothy.txt,NASB-1995-1Timothy.txt
7,0.996588,ERV-1881-1Timothy.txt,ASV-1900-1Timothy.txt
8,0.996851,RSV-1971-1Timothy.txt,RSV-1946-1Timothy.txt
9,0.999747,NASB-1977-1Timothy.txt,NASB-1971-1Timothy.txt


In [15]:
# Get most similar to a particular title

def get_similar_docs(title, sim_mx, metadata):

    idx = metadata.index[metadata['doc_id'] == title].tolist()
    # print(idx)

    # Get similarity scores
    sim_scores = list(enumerate(sim_mx[idx[0]]))

    # sort them
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 10 (ignorning the 1 for self-matching)

    sim_scores = sim_scores[1:11]
    
    scores = pd.DataFrame(sim_scores)
    scores.columns = ['index', 'similarity_score']
    # print(scores)

    title_index = [i[0] for i in sim_scores]
    matches = pd.DataFrame(metadata['doc_id'].iloc[title_index]).reset_index()
    # print(matches)
    matches = matches.merge(scores, how="left", on="index")

    return matches

## Cosine Similarity - Ephesians

In [16]:
#### Reset book to Ephesians ####

ephesians_df = texts_df[texts_df['book'] == "Ephesians"]
texts_df[texts_df['book'] == "Ephesians"]

Unnamed: 0,doc_id,book,text
9,ERV-1881-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus through the..."
10,NASB-1995-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
11,NASB-2020-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
12,NASB-1971-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus (1) by the ..."
13,NRSV-1989-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
14,ASV-1900-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus through the..."
15,RSV-1946-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
16,NASB-1977-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
17,RSV-1971-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus i by the wi..."


In [17]:
ephesians_df.reset_index(inplace=True)
# jw - Added another "reset_index"
ephesians_df.reset_index(inplace=True)
ephesians_df

Unnamed: 0,level_0,index,doc_id,book,text
0,0,9,ERV-1881-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus through the..."
1,1,10,NASB-1995-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
2,2,11,NASB-2020-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
3,3,12,NASB-1971-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus (1) by the ..."
4,4,13,NRSV-1989-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
5,5,14,ASV-1900-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus through the..."
6,6,15,RSV-1946-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
7,7,16,NASB-1977-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
8,8,17,RSV-1971-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus i by the wi..."


In [18]:
# Create Tf-IDF Vector Representaiton

vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer.fit_transform(ephesians_df['text'])

print (tfidf_matrix.shape)

(9, 1126)


In [19]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.92376014 0.9210797  0.92523693 0.91189857 0.99879233
  0.92495947 0.92549473 0.92470542]
 [0.92376014 1.         0.9959448  0.99802952 0.98597172 0.92304704
  0.98794612 0.99818576 0.98782359]
 [0.9210797  0.9959448  1.         0.99407024 0.98584327 0.92043421
  0.98566804 0.99429886 0.98569083]
 [0.92523693 0.99802952 0.99407024 1.         0.9853741  0.92465466
  0.98916718 0.9998112  0.9895019 ]
 [0.91189857 0.98597172 0.98584327 0.9853741  1.         0.91204383
  0.98964085 0.98550663 0.98956946]
 [0.99879233 0.92304704 0.92043421 0.92465466 0.91204383 1.
  0.92478483 0.92496564 0.92467962]
 [0.92495947 0.98794612 0.98566804 0.98916718 0.98964085 0.92478483
  1.         0.98931014 0.99931977]
 [0.92549473 0.99818576 0.99429886 0.9998112  0.98550663 0.92496564
  0.98931014 1.         0.98963664]
 [0.92470542 0.98782359 0.98569083 0.9895019  0.98956946 0.92467962
  0.99931977 0.98963664 1.        ]]


In [20]:
corr_df = pd.DataFrame(cosine_sim)
corr_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.92376,0.92108,0.925237,0.911899,0.998792,0.924959,0.925495,0.924705
1,0.92376,1.0,0.995945,0.99803,0.985972,0.923047,0.987946,0.998186,0.987824
2,0.92108,0.995945,1.0,0.99407,0.985843,0.920434,0.985668,0.994299,0.985691
3,0.925237,0.99803,0.99407,1.0,0.985374,0.924655,0.989167,0.999811,0.989502
4,0.911899,0.985972,0.985843,0.985374,1.0,0.912044,0.989641,0.985507,0.989569
5,0.998792,0.923047,0.920434,0.924655,0.912044,1.0,0.924785,0.924966,0.92468
6,0.924959,0.987946,0.985668,0.989167,0.989641,0.924785,1.0,0.98931,0.99932
7,0.925495,0.998186,0.994299,0.999811,0.985507,0.924966,0.98931,1.0,0.989637
8,0.924705,0.987824,0.985691,0.989502,0.989569,0.92468,0.99932,0.989637,1.0


In [21]:
import matplotlib.pyplot as plt

corr_df.style.background_gradient(cmap ='viridis')\
        .set_properties(**{'font-size': '8px'})

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.92376,0.92108,0.925237,0.911899,0.998792,0.924959,0.925495,0.924705
1,0.92376,1.0,0.995945,0.99803,0.985972,0.923047,0.987946,0.998186,0.987824
2,0.92108,0.995945,1.0,0.99407,0.985843,0.920434,0.985668,0.994299,0.985691
3,0.925237,0.99803,0.99407,1.0,0.985374,0.924655,0.989167,0.999811,0.989502
4,0.911899,0.985972,0.985843,0.985374,1.0,0.912044,0.989641,0.985507,0.989569
5,0.998792,0.923047,0.920434,0.924655,0.912044,1.0,0.924785,0.924966,0.92468
6,0.924959,0.987946,0.985668,0.989167,0.989641,0.924785,1.0,0.98931,0.99932
7,0.925495,0.998186,0.994299,0.999811,0.985507,0.924966,0.98931,1.0,0.989637
8,0.924705,0.987824,0.985691,0.989502,0.989569,0.92468,0.99932,0.989637,1.0


In [22]:
# Move from matrix into tidy data

pairs = corr_df.unstack().reset_index()
# pairs _df.columns = ['doc_A', 'doc_B', 'similarity_measure']
pairs_df = pd.DataFrame(pairs)
pairs_df.columns = ['Doc_A', 'Doc_B', 'Similarity_Score']
    
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
0,0,0,1.000000
1,0,1,0.923760
2,0,2,0.921080
3,0,3,0.925237
4,0,4,0.911899
...,...,...,...
76,8,4,0.989569
77,8,5,0.924680
78,8,6,0.999320
79,8,7,0.989637


In [23]:
# Clean out rows where maching self

pairs_df = pairs_df[pairs_df['Doc_A'] != pairs_df['Doc_B']]
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.923760
2,0,2,0.921080
3,0,3,0.925237
4,0,4,0.911899
5,0,5,0.998792
...,...,...,...
75,8,3,0.989502
76,8,4,0.989569
77,8,5,0.924680
78,8,6,0.999320


In [24]:
# Get unique pairs (we don't want each thing twice; no duplicates)

unique_pairs = pairs_df.loc[pd.DataFrame(np.sort(pairs_df[['Doc_A', 'Doc_B']], 1), index=pairs_df.index).drop_duplicates
(keep='first').index]

unique_pairs

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.92376
2,0,2,0.92108
3,0,3,0.925237
4,0,4,0.911899
5,0,5,0.998792
6,0,6,0.924959
7,0,7,0.925495
8,0,8,0.924705
11,1,2,0.995945
12,1,3,0.99803


In [25]:
# Get most and least similar documents

def get_top_docs(sim_df, metadata, num_docs=10, rank='top'):
    sorted = sim_df.sort_values('Similarity_Score', ascending=True)
    if rank == 'top':
        sliced = sorted.tail(num_docs)
    elif rank == 'bottom':
        sliced = sorted.head(num_docs)
    else:
        return "Please use 'top' or 'bottom' for rank variable"
    
    sliced_named = sliced.merge(metadata, how="left", left_on = "Doc_A", right_on="level_0").merge(metadata, how='left',
    left_on="Doc_B", right_on="level_0") # Changed the 'right_on' condition to 'level_0'
    sliced_named.columns = ['Doc_A', 'Doc_B', 'Similarity_Score', 'Index', 'Doc_A_ID', "IndexB", "Doc_B_ID"]
    
    print(sliced_named)
    
    top_docs_df = sliced_named[['Similarity_Score', 'Doc_A_ID', 'Doc_B_ID']]
    
    return top_docs_df

### Ephesians - Least Similar

In [26]:
## Get least similar (bottom)

get_top_docs(unique_pairs, ephesians_df[['level_0', 'doc_id']], rank='bottom')

   Doc_A  Doc_B  Similarity_Score  Index                 Doc_A_ID  IndexB  \
0      0      4          0.911899      0   ERV-1881-Ephesians.txt       4   
1      4      5          0.912044      4  NRSV-1989-Ephesians.txt       5   
2      2      5          0.920434      2  NASB-2020-Ephesians.txt       5   
3      0      2          0.921080      0   ERV-1881-Ephesians.txt       2   
4      1      5          0.923047      1  NASB-1995-Ephesians.txt       5   
5      0      1          0.923760      0   ERV-1881-Ephesians.txt       1   
6      3      5          0.924655      3  NASB-1971-Ephesians.txt       5   
7      5      8          0.924680      5   ASV-1900-Ephesians.txt       8   
8      0      8          0.924705      0   ERV-1881-Ephesians.txt       8   
9      5      6          0.924785      5   ASV-1900-Ephesians.txt       6   

                  Doc_B_ID  
0  NRSV-1989-Ephesians.txt  
1   ASV-1900-Ephesians.txt  
2   ASV-1900-Ephesians.txt  
3  NASB-2020-Ephesians.txt  
4   ASV

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.911899,ERV-1881-Ephesians.txt,NRSV-1989-Ephesians.txt
1,0.912044,NRSV-1989-Ephesians.txt,ASV-1900-Ephesians.txt
2,0.920434,NASB-2020-Ephesians.txt,ASV-1900-Ephesians.txt
3,0.92108,ERV-1881-Ephesians.txt,NASB-2020-Ephesians.txt
4,0.923047,NASB-1995-Ephesians.txt,ASV-1900-Ephesians.txt
5,0.92376,ERV-1881-Ephesians.txt,NASB-1995-Ephesians.txt
6,0.924655,NASB-1971-Ephesians.txt,ASV-1900-Ephesians.txt
7,0.92468,ASV-1900-Ephesians.txt,RSV-1971-Ephesians.txt
8,0.924705,ERV-1881-Ephesians.txt,RSV-1971-Ephesians.txt
9,0.924785,ASV-1900-Ephesians.txt,RSV-1946-Ephesians.txt


### Ephesians - Most Similar

In [27]:
## Get most similar (top)

get_top_docs(unique_pairs, ephesians_df[['level_0', 'doc_id']], rank='top')

   Doc_A  Doc_B  Similarity_Score  Index                 Doc_A_ID  IndexB  \
0      7      8          0.989637      7  NASB-1977-Ephesians.txt       8   
1      4      6          0.989641      4  NRSV-1989-Ephesians.txt       6   
2      2      3          0.994070      2  NASB-2020-Ephesians.txt       3   
3      2      7          0.994299      2  NASB-2020-Ephesians.txt       7   
4      1      2          0.995945      1  NASB-1995-Ephesians.txt       2   
5      1      3          0.998030      1  NASB-1995-Ephesians.txt       3   
6      1      7          0.998186      1  NASB-1995-Ephesians.txt       7   
7      0      5          0.998792      0   ERV-1881-Ephesians.txt       5   
8      6      8          0.999320      6   RSV-1946-Ephesians.txt       8   
9      3      7          0.999811      3  NASB-1971-Ephesians.txt       7   

                  Doc_B_ID  
0   RSV-1971-Ephesians.txt  
1   RSV-1946-Ephesians.txt  
2  NASB-1971-Ephesians.txt  
3  NASB-1977-Ephesians.txt  
4  NASB

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.989637,NASB-1977-Ephesians.txt,RSV-1971-Ephesians.txt
1,0.989641,NRSV-1989-Ephesians.txt,RSV-1946-Ephesians.txt
2,0.99407,NASB-2020-Ephesians.txt,NASB-1971-Ephesians.txt
3,0.994299,NASB-2020-Ephesians.txt,NASB-1977-Ephesians.txt
4,0.995945,NASB-1995-Ephesians.txt,NASB-2020-Ephesians.txt
5,0.99803,NASB-1995-Ephesians.txt,NASB-1971-Ephesians.txt
6,0.998186,NASB-1995-Ephesians.txt,NASB-1977-Ephesians.txt
7,0.998792,ERV-1881-Ephesians.txt,ASV-1900-Ephesians.txt
8,0.99932,RSV-1946-Ephesians.txt,RSV-1971-Ephesians.txt
9,0.999811,NASB-1971-Ephesians.txt,NASB-1977-Ephesians.txt


## Measure Similarity and Difference

## 1Timothy

In [28]:
import difflib

In [29]:
doc_path = "../data/analysis_data/bibles-txt-ft-cleaned/"

docs = ["NASB-1971-1Timothy.txt", "NASB-1995-1Timothy.txt"]

In [30]:
texts = []
for doc in docs:
    with open(doc_path + doc) as f:
        content = f.read()
    texts.append(content)

In [31]:
len(texts[0])

16193

In [32]:
len(texts[1])

14129

In [33]:
texts = [text.split('\n') for text in texts]
print(len(texts))
texts

2


[['THE FIRST EPISTLE OF PAUL TO TIMOTHY',
  '',
  '',
  'Salutation. Charge Respecting Misuse of the Law. Personal Thanksgiving.',
  '',
  '1 Paul, an apostle of Christ Jesus according to the commandment of God our Savior, and of Christ Jesus, who is our hope;',
  '2 to Timothy, my true child in the faith: Grace, mercy and peace from God the Father and Christ Jesus our Lord.',
  '3 As I urged you (1) upon my departure for Macedonia, (2) remain on at Ephesus, in order that you may instruct certain men not to teach strange doctrines,',
  '4 nor to (1) pay attention to myths and endless genealogies, which give rise to mere speculation rather than furthering (2) God’s provision which is by faith.',
  '5 But the goal of our (5) instruction is love from a pure heart and a good conscience and a sincere faith.',
  '6 For some men, straying from these things, have turned aside to fruitless discussion,',
  '7 wanting to be teachers of the Law, even though they do not understand either what they 

In [34]:
# Filter so only numbered verses remain

import re

verses = []

for text in texts:
    by_verse = []
    for line in text:
        if re.match('^[0-9]* ', line):
            by_verse.append(line)
        else:
            pass
    verses.append(by_verse)
        
#verses

In [35]:
print(len(verses))

2


In [36]:
d = difflib.Differ()

In [37]:
result = list(d.compare(verses[0], verses[1]))

In [38]:
from pprint import pprint

pprint(result)

['- 1 Paul, an apostle of Christ Jesus according to the commandment of God our '
 'Savior, and of Christ Jesus, who is our hope;',
 '?                                                                                                                        '
 '^\n',
 '+ 1 Paul, an apostle of Christ Jesus according to the commandment of God our '
 'Savior, and of Christ Jesus, who is our hope,',
 '?                                                                                                                        '
 '^\n',
 '- 2 to Timothy, my true child in the faith: Grace, mercy and peace from God '
 'the Father and Christ Jesus our Lord.',
 '?   ^                                     ^\n',
 '+ 2 To Timothy, my true child in the faith; Grace, mercy and peace from God '
 'the Father and Christ Jesus our Lord.',
 '?   ^                                     ^\n',
 '- 3 As I urged you (1) upon my departure for Macedonia, (2) remain on at '
 'Ephesus, in order that you may instruct certain m

In [39]:
## Cleaned verses
# What if we remove all the footnote keys?

cleaned_verses = []

for text in verses:
    by_verse = []
    for line in text:
            #What do we want out of here?
            line = re.sub('\([0-9]*\)', '', line)
            line = re.sub('  ', ' ', line)
            line = re.sub('\n', '', line)
            line = line.strip()
            line = re.sub('[^A-Za-z0-9 ]+', '', line)
            line = line.lower()
            
            by_verse.append(line)
    cleaned_verses.append(by_verse)
        
cleaned_verses

[['1 paul an apostle of christ jesus according to the commandment of god our savior and of christ jesus who is our hope',
  '2 to timothy my true child in the faith grace mercy and peace from god the father and christ jesus our lord',
  '3 as i urged you upon my departure for macedonia remain on at ephesus in order that you may instruct certain men not to teach strange doctrines',
  '4 nor to pay attention to myths and endless genealogies which give rise to mere speculation rather than furthering gods provision which is by faith',
  '5 but the goal of our instruction is love from a pure heart and a good conscience and a sincere faith',
  '6 for some men straying from these things have turned aside to fruitless discussion',
  '7 wanting to be teachers of the law even though they do not understand either what they are saying or the matters about which they make confident assertions',
  '8 but we know that the law is good if one uses it lawfully',
  '9 realizing the fact that law is not m

In [40]:
d2 = difflib.Differ()

result = list(d2.compare(cleaned_verses[0], cleaned_verses[1]))

In [41]:
pprint(result)

['  1 paul an apostle of christ jesus according to the commandment of god our '
 'savior and of christ jesus who is our hope',
 '  2 to timothy my true child in the faith grace mercy and peace from god the '
 'father and christ jesus our lord',
 '- 3 as i urged you upon my departure for macedonia remain on at ephesus in '
 'order that you may instruct certain men not to teach strange doctrines',
 '?                                                                       ^^^ '
 '----\n',
 '+ 3 as i urged you upon my departure for macedonia remain on at ephesus so '
 'that you may instruct certain men not to teach strange doctrines',
 '?                                                                       ^\n',
 '- 4 nor to pay attention to myths and endless genealogies which give rise to '
 'mere speculation rather than furthering gods provision which is by faith',
 '?                                                                                                                       '


### Trouble_verses for "NASB-1971-1Timothy.txt", "NASB-1995-1Timothy.txt"

In [42]:
trouble_verses = []

for line in result:
    if re.match('^  ', line):
        pass
    else:
        trouble_verses.append(line)

In [43]:
pprint(trouble_verses)

['- 3 as i urged you upon my departure for macedonia remain on at ephesus in '
 'order that you may instruct certain men not to teach strange doctrines',
 '?                                                                       ^^^ '
 '----\n',
 '+ 3 as i urged you upon my departure for macedonia remain on at ephesus so '
 'that you may instruct certain men not to teach strange doctrines',
 '?                                                                       ^\n',
 '- 4 nor to pay attention to myths and endless genealogies which give rise to '
 'mere speculation rather than furthering gods provision which is by faith',
 '?                                                                                                                       '
 '-----------\n',
 '+ 4 nor to pay attention to myths and endless genealogies which give rise to '
 'mere speculation rather than furthering the administration of god which is '
 'by faith',
 '?                                                   

## Ephesians

In [44]:
doc_path = "../data/analysis_data/bibles-txt-ft-cleaned/"

docs = ["NASB-1971-Ephesians.txt", "NASB-2020-Ephesians.txt"]

In [45]:
texts = []
for doc in docs:
    with open(doc_path + doc) as f:
        content = f.read()
    texts.append(content)

In [46]:
len(texts[0])

20812

In [47]:
len(texts[1])

17981

In [48]:
texts = [text.split('\n') for text in texts]
print(len(texts))
#texts

2


In [49]:
# Filter so only numbered verses remain

import re

verses = []

for text in texts:
    by_verse = []
    for line in text:
        if re.match('^[0-9]* ', line):
            by_verse.append(line)
        else:
            pass
    verses.append(by_verse)
        
verses

[['1 Paul, an apostle of Christ Jesus (1) by the will of God, to the (2) saints who are (3) at Ephesus, and who are faithful in Christ Jesus:',
  '2 Grace to you and peace from God our Father and the Lord Jesus Christ.',
  '3 Blessed be the God and Father of our Lord Jesus Christ, who has blessed us with every spiritual blessing in the heavenly places in Christ,',
  '4 just as He chose us in Him before the foundation of the world, that we should be holy and blameless before (1) Him. In love',
  '5 (1) He predestined us to adoption as sons through Jesus Christ to Himself, according to the (2) kind intention of His will,',
  '6 to the praise of the glory of His grace, which He freely bestowed on us in the Beloved.',
  '7 In (1) Him we have redemption through His blood, the forgiveness of our trespasses, according to the riches of His grace,',
  '8 which He (1) lavished upon (2) us. In all wisdom and insight',
  '9 He (1) made known to us the mystery of His will, according to His (1) kind

In [50]:
print(len(verses))

2


In [51]:
d = difflib.Differ()

In [52]:
result = list(d.compare(verses[0], verses[1]))

In [53]:
from pprint import pprint

pprint(result)

['- 1 Paul, an apostle of Christ Jesus (1) by the will of God, to the (2) '
 'saints who are (3) at Ephesus, and who are faithful in Christ Jesus:',
 '?                                    ----                    ^     '
 '----                 ^            -    ----\n',
 '+ 1 Paul, an apostle of Christ Jesus by the will of God, To the saints who '
 'are (1) at Ephesus and are faithful in Christ Jesus:',
 '?                                                        '
 '^                      ^\n',
 '  2 Grace to you and peace from God our Father and the Lord Jesus Christ.',
 '  3 Blessed be the God and Father of our Lord Jesus Christ, who has blessed '
 'us with every spiritual blessing in the heavenly places in Christ,',
 '- 4 just as He chose us in Him before the foundation of the world, that we '
 'should be holy and blameless before (1) Him. In love',
 '?                                                                          '
 '^^                                   ^\n',
 '+ 4 just as

 '- 17 And He came and preached PEACE to you who were FAR AWAY, AND PEACE TO '
 'THOSE WHO WERE NEAR;',
 '+ 17 And He came and preached peace to you who were far away, and peace to '
 'those who were near;',
 '  18 for through Him we both have our access in one Spirit to the Father.',
 '- 19 So then you are no longer strangers and aliens, but you are fellow- '
 'citizens with the saints, and are of God’s household,',
 '?                                            ^^  ^                     -\n',
 '+ 19 So then you are no longer strangers and foreigners, but you are fellow '
 'citizens with the saints, and are of God’s household,',
 '?                                            ^^^^ ++ ^\n',
 '- 20 having been built upon the foundation of the apostles and prophets, '
 'Christ Jesus Himself being the cornerstone,',
 '?                      --\n',
 '+ 20 having been built on the foundation of the apostles and prophets, '
 'Christ Jesus Himself being the cornerstone,',
 '- 21 in whom the wh

 '+ 31 For this reason a man shall leave HIS FATHER AND HIS MOTHER AND BE '
 'JOINED TO HIS WIFE, AND THE TWO SHALL BECOME ONE FLESH.',
 '?             ^^  ^^                   ^^^ ^^^^^^ ^^^^^^^            ^^ ^^^^ '
 '^            ^\n',
 '  32 This mystery is great; but I am speaking with reference to Christ and '
 'the church.',
 '- 33 Nevertheless let each individual among you also love his own wife even '
 'as himself; and let the wife see to it that she respect her husband.',
 '?                 ^^^ ^^^^             ^^ ^ ^^^ ^^^                     ^ '
 '-           ^    ----\n',
 '+ 33 Nevertheless, as for you individually, each husband is to love his own '
 'wife the same as himself, and the wife must see to it that she respects her '
 'husband.',
 '?                + ^^ ^^^^^^^           +++ + ^^^^^^^^ ^ ^^ '
 '^                    ++ ^^^^            ^             '
 '+++++                           +\n',
 '  1 Children, obey your parents in the Lord, for this is right.',
 '- 2

In [54]:
## Cleaned Verses
# What if we remove all the footnote keys?

cleaned_verses = []

for text in verses:
    by_verse = []
    for line in text:
            #What do we want out of here?
            line = re.sub('\([0-9]*\)', '', line)
            line = re.sub('  ', ' ', line)
            line = re.sub('\n', '', line)
            line = line.strip()
            line = re.sub('[^A-Za-z0-9 ]+', '', line)
            line = line.lower()
            
            by_verse.append(line)
    cleaned_verses.append(by_verse)
        
cleaned_verses

[['1 paul an apostle of christ jesus by the will of god to the saints who are at ephesus and who are faithful in christ jesus',
  '2 grace to you and peace from god our father and the lord jesus christ',
  '3 blessed be the god and father of our lord jesus christ who has blessed us with every spiritual blessing in the heavenly places in christ',
  '4 just as he chose us in him before the foundation of the world that we should be holy and blameless before him in love',
  '5 he predestined us to adoption as sons through jesus christ to himself according to the kind intention of his will',
  '6 to the praise of the glory of his grace which he freely bestowed on us in the beloved',
  '7 in him we have redemption through his blood the forgiveness of our trespasses according to the riches of his grace',
  '8 which he lavished upon us in all wisdom and insight',
  '9 he made known to us the mystery of his will according to his kind intention which he purposed in him',
  '10 with a view to an 

In [55]:
d2 = difflib.Differ()

result = list(d2.compare(cleaned_verses[0], cleaned_verses[1]))

In [56]:
pprint(result)

['- 1 paul an apostle of christ jesus by the will of god to the saints who are '
 'at ephesus and who are faithful in christ jesus',
 '?                                                                                           '
 '----\n',
 '+ 1 paul an apostle of christ jesus by the will of god to the saints who are '
 'at ephesus and are faithful in christ jesus',
 '  2 grace to you and peace from god our father and the lord jesus christ',
 '  3 blessed be the god and father of our lord jesus christ who has blessed '
 'us with every spiritual blessing in the heavenly places in christ',
 '- 4 just as he chose us in him before the foundation of the world that we '
 'should be holy and blameless before him in love',
 '?                                                                         '
 '^^\n',
 '+ 4 just as he chose us in him before the foundation of the world that we '
 'would be holy and blameless before him in love',
 '?                                                        

 '^^^\n',
 '+ 16 that he would grant you according to the riches of his glory to be '
 'strengthened with power through his spirit in the inner self',
 '?                                                                                                                               '
 '^^^^\n',
 '  17 so that christ may dwell in your hearts through faith and that you '
 'being rooted and grounded in love',
 '- 18 may be able to comprehend with all the saints what is the breadth and '
 'length and height and depth',
 '?                                                              ^^^^\n',
 '+ 18 may be able to comprehend with all the saints what is the width and '
 'length and height and depth',
 '?                                                              ^^\n',
 '- 19 and to know the love of christ which surpasses knowledge that you may '
 'be filled up to all the fulness of god',
 '?                                                                                    '
 '---\n',
 '+ 1

### Trouble_Verses for "NASB-1971-Ephesians.txt", "NASB-2020-Ephesians.txt"

In [57]:
trouble_verses = []

for line in result:
    if re.match('^  ', line):
        pass
    else:
        trouble_verses.append(line)

In [58]:
pprint(trouble_verses)

['- 1 paul an apostle of christ jesus by the will of god to the saints who are '
 'at ephesus and who are faithful in christ jesus',
 '?                                                                                           '
 '----\n',
 '+ 1 paul an apostle of christ jesus by the will of god to the saints who are '
 'at ephesus and are faithful in christ jesus',
 '- 4 just as he chose us in him before the foundation of the world that we '
 'should be holy and blameless before him in love',
 '?                                                                         '
 '^^\n',
 '+ 4 just as he chose us in him before the foundation of the world that we '
 'would be holy and blameless before him in love',
 '?                                                                         '
 '^\n',
 '- 5 he predestined us to adoption as sons through jesus christ to himself '
 'according to the kind intention of his will',
 '?                                                                      