In [1]:
import os
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Dataframe

In [2]:
### Load texts into a dataframe (long version)


# source_dir = "../data/analysis_data/bibles-txt-ft-cleaned/"

# texts = []

# for filename in os.listdir(source_dir):
#     if filename.endswith('1Timothy.txt'):
#         with open(os.path.join(source_dir, filename), 'r') as obit:
#             content = obit.read()
#         texts.append(
#             {
#                 "doc_id": filename,
#                 "book": '1Timothy',
#                 "text": content
#             }
#         )
#     elif filename in os.listdir(source_dir):
#         if filename.endswith('Ephesians.txt'):
#             with open(os.path.join(source_dir, filename), 'r') as obit:
#                 content = obit.read()
#             texts.append(
#                 {
#                     "doc_id": filename,
#                     "book": 'Ephesians',
#                     "text": content
#                 }
#             )
        
#             else:
#                 pass

In [3]:
### Load texts into a dataframe (written as list, shorter version)

# Create function that changes the text

def preprocessing_function(content):
#   lines of code here that change the text
    content = content.lower()
    lines = content.split('\n')
    
    verses = []
    for line in lines:
        if re.match('^[0-9]* ', line):
            verses.append(line)
        else:
            pass
    for line in verses:
        line = re.sub('\([0-9]*\)', '', line)
        line = re.sub('  ', ' ', line)
        line = re.sub('\n', '', line)
        line = line.strip()
        line = re.sub('[^A-Za-z0-9 ]+', '', line)
        
    
    content = " ".join(verses)
    
    
    return content


source_dir = "../data/analysis_data/bibles-txt-ft-cleaned/"

books = ['1Timothy', 'Ephesians']

texts = []


for book in books:
    for filename in os.listdir(source_dir):
        if filename.endswith(book+".txt"):
            with open(os.path.join(source_dir, filename), 'r') as obit:
                content = obit.read()
#               Apply cleaned content code
                cleaned_content = preprocessing_function(content)
            texts.append(
                {
                    "doc_id": filename,
                    "book": book,
                    "text": cleaned_content
                }
            )
        else:
            pass

texts_df = pd.DataFrame(texts)
texts_df.head(10)

Unnamed: 0,doc_id,book,text
0,NASB-2020-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
1,NASB-1977-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
2,NASB-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
3,ERV-1881-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
4,NASB-1995-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
5,RSV-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
6,ASV-1900-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
7,RSV-1946-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
8,NRSV-1989-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by the comm..."
9,ERV-1881-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus through the..."


# 1Timothy

In [4]:
### Set book to 1Timothy

timothy_df = texts_df[texts_df['book'] == "1Timothy"]
texts_df[texts_df['book'] == "1Timothy"]

Unnamed: 0,doc_id,book,text
0,NASB-2020-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
1,NASB-1977-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
2,NASB-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
3,ERV-1881-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
4,NASB-1995-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
5,RSV-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
6,ASV-1900-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
7,RSV-1946-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
8,NRSV-1989-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by the comm..."


In [5]:
timothy_df.reset_index(inplace=True)
timothy_df

Unnamed: 0,index,doc_id,book,text
0,0,NASB-2020-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
1,1,NASB-1977-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
2,2,NASB-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
3,3,ERV-1881-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
4,4,NASB-1995-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
5,5,RSV-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
6,6,ASV-1900-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
7,7,RSV-1946-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
8,8,NRSV-1989-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by the comm..."


In [6]:
# Create Tf-IDF Vector Representaiton

vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer.fit_transform(timothy_df['text'])

print (tfidf_matrix.shape)

(9, 1298)


In [7]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.98382916 0.98369742 0.89293084 0.98905153 0.96017234
  0.89595162 0.95808813 0.96348024]
 [0.98382916 1.         0.99974682 0.90272342 0.99573744 0.96772054
  0.90555847 0.96599504 0.96308238]
 [0.98369742 0.99974682 1.         0.90246857 0.99547039 0.9675837
  0.90528307 0.96598185 0.9629358 ]
 [0.89293084 0.90272342 0.90246857 1.         0.8994457  0.91252901
  0.9965877  0.90897507 0.90383236]
 [0.98905153 0.99573744 0.99547039 0.8994457  1.         0.96618106
  0.90242304 0.96425933 0.96343146]
 [0.96017234 0.96772054 0.9675837  0.91252901 0.96618106 1.
  0.91496463 0.99676547 0.97798604]
 [0.89595162 0.90555847 0.90528307 0.9965877  0.90242304 0.91496463
  1.         0.91149102 0.90689588]
 [0.95808813 0.96599504 0.96598185 0.90897507 0.96425933 0.99676547
  0.91149102 1.         0.97776635]
 [0.96348024 0.96308238 0.9629358  0.90383236 0.96343146 0.97798604
  0.90689588 0.97776635 1.        ]]


In [8]:
corr_df = pd.DataFrame(cosine_sim)
corr_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.983829,0.983697,0.892931,0.989052,0.960172,0.895952,0.958088,0.96348
1,0.983829,1.0,0.999747,0.902723,0.995737,0.967721,0.905558,0.965995,0.963082
2,0.983697,0.999747,1.0,0.902469,0.99547,0.967584,0.905283,0.965982,0.962936
3,0.892931,0.902723,0.902469,1.0,0.899446,0.912529,0.996588,0.908975,0.903832
4,0.989052,0.995737,0.99547,0.899446,1.0,0.966181,0.902423,0.964259,0.963431
5,0.960172,0.967721,0.967584,0.912529,0.966181,1.0,0.914965,0.996765,0.977986
6,0.895952,0.905558,0.905283,0.996588,0.902423,0.914965,1.0,0.911491,0.906896
7,0.958088,0.965995,0.965982,0.908975,0.964259,0.996765,0.911491,1.0,0.977766
8,0.96348,0.963082,0.962936,0.903832,0.963431,0.977986,0.906896,0.977766,1.0


In [9]:
import matplotlib.pyplot as plt

corr_df.style.background_gradient(cmap ='viridis')\
        .set_properties(**{'font-size': '8px'})

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.983829,0.983697,0.892931,0.989052,0.960172,0.895952,0.958088,0.96348
1,0.983829,1.0,0.999747,0.902723,0.995737,0.967721,0.905558,0.965995,0.963082
2,0.983697,0.999747,1.0,0.902469,0.99547,0.967584,0.905283,0.965982,0.962936
3,0.892931,0.902723,0.902469,1.0,0.899446,0.912529,0.996588,0.908975,0.903832
4,0.989052,0.995737,0.99547,0.899446,1.0,0.966181,0.902423,0.964259,0.963431
5,0.960172,0.967721,0.967584,0.912529,0.966181,1.0,0.914965,0.996765,0.977986
6,0.895952,0.905558,0.905283,0.996588,0.902423,0.914965,1.0,0.911491,0.906896
7,0.958088,0.965995,0.965982,0.908975,0.964259,0.996765,0.911491,1.0,0.977766
8,0.96348,0.963082,0.962936,0.903832,0.963431,0.977986,0.906896,0.977766,1.0


In [10]:
# Move from matrix into tidy data

pairs = corr_df.unstack().reset_index()
# pairs _df.columns = ['doc_A', 'doc_B', 'similarity_measure']
pairs_df = pd.DataFrame(pairs)
pairs_df.columns = ['Doc_A', 'Doc_B', 'Similarity_Score']
    
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
0,0,0,1.000000
1,0,1,0.983829
2,0,2,0.983697
3,0,3,0.892931
4,0,4,0.989052
...,...,...,...
76,8,4,0.963431
77,8,5,0.977986
78,8,6,0.906896
79,8,7,0.977766


In [11]:
# Clean out rows where maching self

pairs_df = pairs_df[pairs_df['Doc_A'] != pairs_df['Doc_B']]
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.983829
2,0,2,0.983697
3,0,3,0.892931
4,0,4,0.989052
5,0,5,0.960172
...,...,...,...
75,8,3,0.903832
76,8,4,0.963431
77,8,5,0.977986
78,8,6,0.906896


In [12]:
# Get unique pairs (we don't want each thing twice; no duplicates)

unique_pairs = pairs_df.loc[pd.DataFrame(np.sort(pairs_df[['Doc_A', 'Doc_B']], 1), index=pairs_df.index).drop_duplicates
(keep='first').index]

unique_pairs

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.983829
2,0,2,0.983697
3,0,3,0.892931
4,0,4,0.989052
5,0,5,0.960172
6,0,6,0.895952
7,0,7,0.958088
8,0,8,0.96348
11,1,2,0.999747
12,1,3,0.902723


In [13]:
# Get most and least similar documents

def get_top_docs(sim_df, metadata, num_docs=10, rank='top'):
    sorted = sim_df.sort_values('Similarity_Score', ascending=True)
    if rank == 'top':
        sliced = sorted.tail(num_docs)
    elif rank == 'bottom':
        sliced = sorted.head(num_docs)
    else:
        return "Please use 'top' or 'bottom' for rank variable"
    
    sliced_named = sliced.merge(metadata, how="left", left_on = "Doc_A", right_on="index").merge(metadata, how='left',
    left_on="Doc_B", right_on="index")
    sliced_named.columns = ['Doc_A', 'Doc_B', 'Similarity_Score', 'Index', 'Doc_A_ID', "IndexB", "Doc_B_ID"]
    
    print(sliced_named)
    
    top_docs_df = sliced_named[['Similarity_Score', 'Doc_A_ID', 'Doc_B_ID']]
    
    return top_docs_df

# 1Timothy - Least Similar

In [14]:
## Get least similar (bottom)

get_top_docs(unique_pairs, timothy_df[['index', 'doc_id']], rank='bottom')

   Doc_A  Doc_B  Similarity_Score  Index                Doc_A_ID  IndexB  \
0      0      3          0.892931      0  NASB-2020-1Timothy.txt       3   
1      0      6          0.895952      0  NASB-2020-1Timothy.txt       6   
2      3      4          0.899446      3   ERV-1881-1Timothy.txt       4   
3      4      6          0.902423      4  NASB-1995-1Timothy.txt       6   
4      2      3          0.902469      2  NASB-1971-1Timothy.txt       3   
5      1      3          0.902723      1  NASB-1977-1Timothy.txt       3   
6      3      8          0.903832      3   ERV-1881-1Timothy.txt       8   
7      2      6          0.905283      2  NASB-1971-1Timothy.txt       6   
8      1      6          0.905558      1  NASB-1977-1Timothy.txt       6   
9      6      8          0.906896      6   ASV-1900-1Timothy.txt       8   

                 Doc_B_ID  
0   ERV-1881-1Timothy.txt  
1   ASV-1900-1Timothy.txt  
2  NASB-1995-1Timothy.txt  
3   ASV-1900-1Timothy.txt  
4   ERV-1881-1Timothy.t

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.892931,NASB-2020-1Timothy.txt,ERV-1881-1Timothy.txt
1,0.895952,NASB-2020-1Timothy.txt,ASV-1900-1Timothy.txt
2,0.899446,ERV-1881-1Timothy.txt,NASB-1995-1Timothy.txt
3,0.902423,NASB-1995-1Timothy.txt,ASV-1900-1Timothy.txt
4,0.902469,NASB-1971-1Timothy.txt,ERV-1881-1Timothy.txt
5,0.902723,NASB-1977-1Timothy.txt,ERV-1881-1Timothy.txt
6,0.903832,ERV-1881-1Timothy.txt,NRSV-1989-1Timothy.txt
7,0.905283,NASB-1971-1Timothy.txt,ASV-1900-1Timothy.txt
8,0.905558,NASB-1977-1Timothy.txt,ASV-1900-1Timothy.txt
9,0.906896,ASV-1900-1Timothy.txt,NRSV-1989-1Timothy.txt


# 1Timothy - Most Similar

In [15]:
## Get most similar (top)

get_top_docs(unique_pairs, timothy_df[['index', 'doc_id']], rank='top')

   Doc_A  Doc_B  Similarity_Score  Index                Doc_A_ID  IndexB  \
0      7      8          0.977766      7   RSV-1946-1Timothy.txt       8   
1      5      8          0.977986      5   RSV-1971-1Timothy.txt       8   
2      0      2          0.983697      0  NASB-2020-1Timothy.txt       2   
3      0      1          0.983829      0  NASB-2020-1Timothy.txt       1   
4      0      4          0.989052      0  NASB-2020-1Timothy.txt       4   
5      2      4          0.995470      2  NASB-1971-1Timothy.txt       4   
6      1      4          0.995737      1  NASB-1977-1Timothy.txt       4   
7      3      6          0.996588      3   ERV-1881-1Timothy.txt       6   
8      5      7          0.996765      5   RSV-1971-1Timothy.txt       7   
9      1      2          0.999747      1  NASB-1977-1Timothy.txt       2   

                 Doc_B_ID  
0  NRSV-1989-1Timothy.txt  
1  NRSV-1989-1Timothy.txt  
2  NASB-1971-1Timothy.txt  
3  NASB-1977-1Timothy.txt  
4  NASB-1995-1Timothy.t

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.977766,RSV-1946-1Timothy.txt,NRSV-1989-1Timothy.txt
1,0.977986,RSV-1971-1Timothy.txt,NRSV-1989-1Timothy.txt
2,0.983697,NASB-2020-1Timothy.txt,NASB-1971-1Timothy.txt
3,0.983829,NASB-2020-1Timothy.txt,NASB-1977-1Timothy.txt
4,0.989052,NASB-2020-1Timothy.txt,NASB-1995-1Timothy.txt
5,0.99547,NASB-1971-1Timothy.txt,NASB-1995-1Timothy.txt
6,0.995737,NASB-1977-1Timothy.txt,NASB-1995-1Timothy.txt
7,0.996588,ERV-1881-1Timothy.txt,ASV-1900-1Timothy.txt
8,0.996765,RSV-1971-1Timothy.txt,RSV-1946-1Timothy.txt
9,0.999747,NASB-1977-1Timothy.txt,NASB-1971-1Timothy.txt


In [16]:
# Get most similar to a particular title

def get_similar_docs(title, sim_mx, metadata):

    idx = metadata.index[metadata['doc_id'] == title].tolist()
    # print(idx)

    # Get similarity scores
    sim_scores = list(enumerate(sim_mx[idx[0]]))

    # sort them
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 10 (ignorning the 1 for self-matching)

    sim_scores = sim_scores[1:11]
    
    scores = pd.DataFrame(sim_scores)
    scores.columns = ['index', 'similarity_score']
    # print(scores)

    title_index = [i[0] for i in sim_scores]
    matches = pd.DataFrame(metadata['doc_id'].iloc[title_index]).reset_index()
    # print(matches)
    matches = matches.merge(scores, how="left", on="index")

    return matches

# 1Timothy - Get Similar Docs

In [17]:
get_similar_docs('ERV-1881-1Timothy.txt', corr_df, texts_df)

Unnamed: 0,index,doc_id,similarity_score
0,6,ASV-1900-1Timothy.txt,0.996588
1,5,RSV-1971-1Timothy.txt,0.912529
2,7,RSV-1946-1Timothy.txt,0.908975
3,8,NRSV-1989-1Timothy.txt,0.903832
4,1,NASB-1977-1Timothy.txt,0.902723
5,2,NASB-1971-1Timothy.txt,0.902469
6,4,NASB-1995-1Timothy.txt,0.899446
7,0,NASB-2020-1Timothy.txt,0.892931


In [18]:
get_similar_docs('RSV-1946-1Timothy.txt', corr_df, texts_df)

Unnamed: 0,index,doc_id,similarity_score
0,5,RSV-1971-1Timothy.txt,0.996765
1,8,NRSV-1989-1Timothy.txt,0.977766
2,1,NASB-1977-1Timothy.txt,0.965995
3,2,NASB-1971-1Timothy.txt,0.965982
4,4,NASB-1995-1Timothy.txt,0.964259
5,0,NASB-2020-1Timothy.txt,0.958088
6,6,ASV-1900-1Timothy.txt,0.911491
7,3,ERV-1881-1Timothy.txt,0.908975


In [19]:
get_similar_docs('ASV-1900-1Timothy.txt', corr_df, texts_df)

Unnamed: 0,index,doc_id,similarity_score
0,3,ERV-1881-1Timothy.txt,0.996588
1,5,RSV-1971-1Timothy.txt,0.914965
2,7,RSV-1946-1Timothy.txt,0.911491
3,8,NRSV-1989-1Timothy.txt,0.906896
4,1,NASB-1977-1Timothy.txt,0.905558
5,2,NASB-1971-1Timothy.txt,0.905283
6,4,NASB-1995-1Timothy.txt,0.902423
7,0,NASB-2020-1Timothy.txt,0.895952


In [20]:
get_similar_docs('NASB-1971-1Timothy.txt', corr_df, texts_df)

Unnamed: 0,index,doc_id,similarity_score
0,1,NASB-1977-1Timothy.txt,0.999747
1,4,NASB-1995-1Timothy.txt,0.99547
2,0,NASB-2020-1Timothy.txt,0.983697
3,5,RSV-1971-1Timothy.txt,0.967584
4,7,RSV-1946-1Timothy.txt,0.965982
5,8,NRSV-1989-1Timothy.txt,0.962936
6,6,ASV-1900-1Timothy.txt,0.905283
7,3,ERV-1881-1Timothy.txt,0.902469


# Ephesians

In [21]:
#### Reset book to Ephesians ####

ephesians_df = texts_df[texts_df['book'] == "Ephesians"]
texts_df[texts_df['book'] == "Ephesians"]

Unnamed: 0,doc_id,book,text
9,ERV-1881-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus through the..."
10,NASB-1995-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
11,NASB-2020-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
12,NASB-1971-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus (1) by the ..."
13,NRSV-1989-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
14,ASV-1900-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus through the..."
15,RSV-1946-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
16,NASB-1977-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
17,RSV-1971-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus i by the wi..."


In [22]:
ephesians_df.reset_index(inplace=True)
# jw - Added another "reset_index"
ephesians_df.reset_index(inplace=True)
ephesians_df

Unnamed: 0,level_0,index,doc_id,book,text
0,0,9,ERV-1881-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus through the..."
1,1,10,NASB-1995-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
2,2,11,NASB-2020-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
3,3,12,NASB-1971-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus (1) by the ..."
4,4,13,NRSV-1989-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
5,5,14,ASV-1900-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus through the..."
6,6,15,RSV-1946-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
7,7,16,NASB-1977-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
8,8,17,RSV-1971-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus i by the wi..."


In [23]:
# Create Tf-IDF Vector Representaiton

vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer.fit_transform(ephesians_df['text'])

print (tfidf_matrix.shape)

(9, 1131)


In [24]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.9237579  0.92116564 0.92517043 0.91189857 0.99879233
  0.92495947 0.92549252 0.92470542]
 [0.9237579  1.         0.99592487 0.99789529 0.98596933 0.9230448
  0.98794373 0.99818577 0.98782119]
 [0.92116564 0.99592487 1.         0.99395835 0.98578926 0.92050605
  0.98568018 0.99431799 0.98569093]
 [0.92517043 0.99789529 0.99395835 1.         0.98529439 0.92458874
  0.98906977 0.99967834 0.98940422]
 [0.91189857 0.98596933 0.98578926 0.98529439 1.         0.91204383
  0.98964085 0.98550428 0.98956946]
 [0.99879233 0.9230448  0.92050605 0.92458874 0.91204383 1.
  0.92478483 0.92496343 0.92467962]
 [0.92495947 0.98794373 0.98568018 0.98906977 0.98964085 0.92478483
  1.         0.98930777 0.99931977]
 [0.92549252 0.99818577 0.99431799 0.99967834 0.98550428 0.92496343
  0.98930777 1.         0.98963428]
 [0.92470542 0.98782119 0.98569093 0.98940422 0.98956946 0.92467962
  0.99931977 0.98963428 1.        ]]


In [25]:
corr_df = pd.DataFrame(cosine_sim)
corr_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.923758,0.921166,0.92517,0.911899,0.998792,0.924959,0.925493,0.924705
1,0.923758,1.0,0.995925,0.997895,0.985969,0.923045,0.987944,0.998186,0.987821
2,0.921166,0.995925,1.0,0.993958,0.985789,0.920506,0.98568,0.994318,0.985691
3,0.92517,0.997895,0.993958,1.0,0.985294,0.924589,0.98907,0.999678,0.989404
4,0.911899,0.985969,0.985789,0.985294,1.0,0.912044,0.989641,0.985504,0.989569
5,0.998792,0.923045,0.920506,0.924589,0.912044,1.0,0.924785,0.924963,0.92468
6,0.924959,0.987944,0.98568,0.98907,0.989641,0.924785,1.0,0.989308,0.99932
7,0.925493,0.998186,0.994318,0.999678,0.985504,0.924963,0.989308,1.0,0.989634
8,0.924705,0.987821,0.985691,0.989404,0.989569,0.92468,0.99932,0.989634,1.0


In [26]:
import matplotlib.pyplot as plt

corr_df.style.background_gradient(cmap ='viridis')\
        .set_properties(**{'font-size': '8px'})

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.923758,0.921166,0.92517,0.911899,0.998792,0.924959,0.925493,0.924705
1,0.923758,1.0,0.995925,0.997895,0.985969,0.923045,0.987944,0.998186,0.987821
2,0.921166,0.995925,1.0,0.993958,0.985789,0.920506,0.98568,0.994318,0.985691
3,0.92517,0.997895,0.993958,1.0,0.985294,0.924589,0.98907,0.999678,0.989404
4,0.911899,0.985969,0.985789,0.985294,1.0,0.912044,0.989641,0.985504,0.989569
5,0.998792,0.923045,0.920506,0.924589,0.912044,1.0,0.924785,0.924963,0.92468
6,0.924959,0.987944,0.98568,0.98907,0.989641,0.924785,1.0,0.989308,0.99932
7,0.925493,0.998186,0.994318,0.999678,0.985504,0.924963,0.989308,1.0,0.989634
8,0.924705,0.987821,0.985691,0.989404,0.989569,0.92468,0.99932,0.989634,1.0


In [27]:
# Move from matrix into tidy data

pairs = corr_df.unstack().reset_index()
# pairs _df.columns = ['doc_A', 'doc_B', 'similarity_measure']
pairs_df = pd.DataFrame(pairs)
pairs_df.columns = ['Doc_A', 'Doc_B', 'Similarity_Score']
    
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
0,0,0,1.000000
1,0,1,0.923758
2,0,2,0.921166
3,0,3,0.925170
4,0,4,0.911899
...,...,...,...
76,8,4,0.989569
77,8,5,0.924680
78,8,6,0.999320
79,8,7,0.989634


In [28]:
# Clean out rows where maching self

pairs_df = pairs_df[pairs_df['Doc_A'] != pairs_df['Doc_B']]
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.923758
2,0,2,0.921166
3,0,3,0.925170
4,0,4,0.911899
5,0,5,0.998792
...,...,...,...
75,8,3,0.989404
76,8,4,0.989569
77,8,5,0.924680
78,8,6,0.999320


In [29]:
# Get unique pairs (we don't want each thing twice; no duplicates)

unique_pairs = pairs_df.loc[pd.DataFrame(np.sort(pairs_df[['Doc_A', 'Doc_B']], 1), index=pairs_df.index).drop_duplicates
(keep='first').index]

unique_pairs

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.923758
2,0,2,0.921166
3,0,3,0.92517
4,0,4,0.911899
5,0,5,0.998792
6,0,6,0.924959
7,0,7,0.925493
8,0,8,0.924705
11,1,2,0.995925
12,1,3,0.997895


In [30]:
# Get most and least similar documents

def get_top_docs(sim_df, metadata, num_docs=10, rank='top'):
    sorted = sim_df.sort_values('Similarity_Score', ascending=True)
    if rank == 'top':
        sliced = sorted.tail(num_docs)
    elif rank == 'bottom':
        sliced = sorted.head(num_docs)
    else:
        return "Please use 'top' or 'bottom' for rank variable"
    
    sliced_named = sliced.merge(metadata, how="left", left_on = "Doc_A", right_on="level_0").merge(metadata, how='left',
    left_on="Doc_B", right_on="level_0") # Changed the 'right_on' condition to 'level_0'
    sliced_named.columns = ['Doc_A', 'Doc_B', 'Similarity_Score', 'Index', 'Doc_A_ID', "IndexB", "Doc_B_ID"]
    
    print(sliced_named)
    
    top_docs_df = sliced_named[['Similarity_Score', 'Doc_A_ID', 'Doc_B_ID']]
    
    return top_docs_df

# Ephesians - Least Similar

In [31]:
## Get least similar (bottom)

get_top_docs(unique_pairs, ephesians_df[['level_0', 'doc_id']], rank='bottom')

   Doc_A  Doc_B  Similarity_Score  Index                 Doc_A_ID  IndexB  \
0      0      4          0.911899      0   ERV-1881-Ephesians.txt       4   
1      4      5          0.912044      4  NRSV-1989-Ephesians.txt       5   
2      2      5          0.920506      2  NASB-2020-Ephesians.txt       5   
3      0      2          0.921166      0   ERV-1881-Ephesians.txt       2   
4      1      5          0.923045      1  NASB-1995-Ephesians.txt       5   
5      0      1          0.923758      0   ERV-1881-Ephesians.txt       1   
6      3      5          0.924589      3  NASB-1971-Ephesians.txt       5   
7      5      8          0.924680      5   ASV-1900-Ephesians.txt       8   
8      0      8          0.924705      0   ERV-1881-Ephesians.txt       8   
9      5      6          0.924785      5   ASV-1900-Ephesians.txt       6   

                  Doc_B_ID  
0  NRSV-1989-Ephesians.txt  
1   ASV-1900-Ephesians.txt  
2   ASV-1900-Ephesians.txt  
3  NASB-2020-Ephesians.txt  
4   ASV

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.911899,ERV-1881-Ephesians.txt,NRSV-1989-Ephesians.txt
1,0.912044,NRSV-1989-Ephesians.txt,ASV-1900-Ephesians.txt
2,0.920506,NASB-2020-Ephesians.txt,ASV-1900-Ephesians.txt
3,0.921166,ERV-1881-Ephesians.txt,NASB-2020-Ephesians.txt
4,0.923045,NASB-1995-Ephesians.txt,ASV-1900-Ephesians.txt
5,0.923758,ERV-1881-Ephesians.txt,NASB-1995-Ephesians.txt
6,0.924589,NASB-1971-Ephesians.txt,ASV-1900-Ephesians.txt
7,0.92468,ASV-1900-Ephesians.txt,RSV-1971-Ephesians.txt
8,0.924705,ERV-1881-Ephesians.txt,RSV-1971-Ephesians.txt
9,0.924785,ASV-1900-Ephesians.txt,RSV-1946-Ephesians.txt


# Ephesians - Most Similar

In [32]:
## Get most similar (top)

get_top_docs(unique_pairs, ephesians_df[['level_0', 'doc_id']], rank='top')

   Doc_A  Doc_B  Similarity_Score  Index                 Doc_A_ID  IndexB  \
0      7      8          0.989634      7  NASB-1977-Ephesians.txt       8   
1      4      6          0.989641      4  NRSV-1989-Ephesians.txt       6   
2      2      3          0.993958      2  NASB-2020-Ephesians.txt       3   
3      2      7          0.994318      2  NASB-2020-Ephesians.txt       7   
4      1      2          0.995925      1  NASB-1995-Ephesians.txt       2   
5      1      3          0.997895      1  NASB-1995-Ephesians.txt       3   
6      1      7          0.998186      1  NASB-1995-Ephesians.txt       7   
7      0      5          0.998792      0   ERV-1881-Ephesians.txt       5   
8      6      8          0.999320      6   RSV-1946-Ephesians.txt       8   
9      3      7          0.999678      3  NASB-1971-Ephesians.txt       7   

                  Doc_B_ID  
0   RSV-1971-Ephesians.txt  
1   RSV-1946-Ephesians.txt  
2  NASB-1971-Ephesians.txt  
3  NASB-1977-Ephesians.txt  
4  NASB

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.989634,NASB-1977-Ephesians.txt,RSV-1971-Ephesians.txt
1,0.989641,NRSV-1989-Ephesians.txt,RSV-1946-Ephesians.txt
2,0.993958,NASB-2020-Ephesians.txt,NASB-1971-Ephesians.txt
3,0.994318,NASB-2020-Ephesians.txt,NASB-1977-Ephesians.txt
4,0.995925,NASB-1995-Ephesians.txt,NASB-2020-Ephesians.txt
5,0.997895,NASB-1995-Ephesians.txt,NASB-1971-Ephesians.txt
6,0.998186,NASB-1995-Ephesians.txt,NASB-1977-Ephesians.txt
7,0.998792,ERV-1881-Ephesians.txt,ASV-1900-Ephesians.txt
8,0.99932,RSV-1946-Ephesians.txt,RSV-1971-Ephesians.txt
9,0.999678,NASB-1971-Ephesians.txt,NASB-1977-Ephesians.txt


In [33]:
# Get most similar to a particular title

def get_similar_docs(title, sim_mx, metadata):

    idx = metadata.index[metadata['doc_id'] == title].tolist()
    # print(idx)

    # Get similarity scores
    sim_scores = list(enumerate(sim_mx[idx[0]]))

    # sort them
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 10 (ignorning the 1 for self-matching)

    sim_scores = sim_scores[1:11]
    
    scores = pd.DataFrame(sim_scores)
    scores.columns = ['index', 'similarity_score']
    # print(scores)

    title_index = [i[0] for i in sim_scores]
    matches = pd.DataFrame(metadata['doc_id'].iloc[title_index]).reset_index()
    # print(matches)
    matches = matches.merge(scores, how="left", on="index")

    return matches

# Ephesians - Get Similar Docs

In [34]:
get_similar_docs('ERV-1881-Ephesians.txt', corr_df, ephesians_df) #changed this to the ephesians dataframe.

Unnamed: 0,index,doc_id,similarity_score
0,5,ASV-1900-Ephesians.txt,0.998792
1,7,NASB-1977-Ephesians.txt,0.925493
2,3,NASB-1971-Ephesians.txt,0.92517
3,6,RSV-1946-Ephesians.txt,0.924959
4,8,RSV-1971-Ephesians.txt,0.924705
5,1,NASB-1995-Ephesians.txt,0.923758
6,2,NASB-2020-Ephesians.txt,0.921166
7,4,NRSV-1989-Ephesians.txt,0.911899


In [35]:
get_similar_docs('ASV-1900-Ephesians.txt', corr_df, ephesians_df) #changed this to the ephesians dataframe.

Unnamed: 0,index,doc_id,similarity_score
0,0,ERV-1881-Ephesians.txt,0.998792
1,7,NASB-1977-Ephesians.txt,0.924963
2,6,RSV-1946-Ephesians.txt,0.924785
3,8,RSV-1971-Ephesians.txt,0.92468
4,3,NASB-1971-Ephesians.txt,0.924589
5,1,NASB-1995-Ephesians.txt,0.923045
6,2,NASB-2020-Ephesians.txt,0.920506
7,4,NRSV-1989-Ephesians.txt,0.912044


In [36]:
get_similar_docs('NASB-1971-Ephesians.txt', corr_df, ephesians_df) #changed this to the ephesians dataframe.

Unnamed: 0,index,doc_id,similarity_score
0,7,NASB-1977-Ephesians.txt,0.999678
1,1,NASB-1995-Ephesians.txt,0.997895
2,2,NASB-2020-Ephesians.txt,0.993958
3,8,RSV-1971-Ephesians.txt,0.989404
4,6,RSV-1946-Ephesians.txt,0.98907
5,4,NRSV-1989-Ephesians.txt,0.985294
6,0,ERV-1881-Ephesians.txt,0.92517
7,5,ASV-1900-Ephesians.txt,0.924589
