In [1]:
import os
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Dataframe

In [2]:
### Load texts into a dataframe (long version)


# source_dir = "../data/analysis_data/bibles-txt-ft-cleaned/"

# texts = []

# for filename in os.listdir(source_dir):
#     if filename.endswith('1Timothy.txt'):
#         with open(os.path.join(source_dir, filename), 'r') as obit:
#             content = obit.read()
#         texts.append(
#             {
#                 "doc_id": filename,
#                 "book": '1Timothy',
#                 "text": content
#             }
#         )
#     elif filename in os.listdir(source_dir):
#         if filename.endswith('Ephesians.txt'):
#             with open(os.path.join(source_dir, filename), 'r') as obit:
#                 content = obit.read()
#             texts.append(
#                 {
#                     "doc_id": filename,
#                     "book": 'Ephesians',
#                     "text": content
#                 }
#             )
        
#             else:
#                 pass

In [3]:
### Load texts into a dataframe (written as list, shorter version)

# Create function that changes the text

def preprocessing_function(content):
#   lines of code here that change the text
    content = content.lower()
    lines = content.split('\n')
    
    verses = []
    for line in lines:
        if re.match('^[0-9]* ', line):
            verses.append(line)
        else:
            pass
    for line in verses:
        line = re.sub('\([0-9]*\)', '', line)
        line = re.sub('  ', ' ', line)
        line = re.sub('\n', '', line)
        line = line.strip()
        line = re.sub('[^A-Za-z0-9 ]+', '', line)
        
    
    content = " ".join(verses)
    
    
    return content


source_dir = "../data/analysis_data/bibles-txt-ft-cleaned/"

books = ['1Timothy', 'Ephesians']

texts = []


for book in books:
    for filename in os.listdir(source_dir):
        if filename.endswith(book+".txt"):
            with open(os.path.join(source_dir, filename), 'r') as obit:
                content = obit.read()
#               Apply cleaned content code
                cleaned_content = preprocessing_function(content)
            texts.append(
                {
                    "doc_id": filename,
                    "book": book,
                    "text": cleaned_content
                }
            )
        else:
            pass

texts_df = pd.DataFrame(texts)
texts_df.head(10)

Unnamed: 0,doc_id,book,text
0,NASB-2020-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
1,NASB-1977-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
2,NASB-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
3,ERV-1881-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
4,NASB-1995-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
5,RSV-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
6,ASV-1900-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
7,RSV-1946-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
8,NRSV-1989-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by the comm..."
9,ERV-1881-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus through the..."


# 1Timothy

In [4]:
### Set book to 1Timothy

timothy_df = texts_df[texts_df['book'] == "1Timothy"]
texts_df[texts_df['book'] == "1Timothy"]

Unnamed: 0,doc_id,book,text
0,NASB-2020-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
1,NASB-1977-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
2,NASB-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
3,ERV-1881-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
4,NASB-1995-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
5,RSV-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
6,ASV-1900-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
7,RSV-1946-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
8,NRSV-1989-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by the comm..."


In [5]:
timothy_df.reset_index(inplace=True)
timothy_df

Unnamed: 0,index,doc_id,book,text
0,0,NASB-2020-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
1,1,NASB-1977-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
2,2,NASB-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
3,3,ERV-1881-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
4,4,NASB-1995-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
5,5,RSV-1971-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
6,6,ASV-1900-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus according t..."
7,7,RSV-1946-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by command ..."
8,8,NRSV-1989-1Timothy.txt,1Timothy,"1 paul, an apostle of christ jesus by the comm..."


In [6]:
# Create Tf-IDF Vector Representaiton

vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer.fit_transform(timothy_df['text'])

print (tfidf_matrix.shape)

(9, 1299)


In [7]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.98382643 0.98344624 0.89241412 0.98904856 0.96015763
  0.89551519 0.95807338 0.96345666]
 [0.98382643 1.         0.99948151 0.90227451 0.99573765 0.96771504
  0.90516765 0.96598944 0.96306994]
 [0.98344624 0.99948151 1.         0.90203921 0.99521013 0.96721431
  0.90492056 0.96551582 0.96281586]
 [0.89241412 0.90227451 0.90203921 1.         0.89899735 0.91211488
  0.99589632 0.90856182 0.90346311]
 [0.98904856 0.99573765 0.99521013 0.89899735 1.         0.96617579
  0.90197111 0.96425398 0.96341872]
 [0.96015763 0.96771504 0.96721431 0.91211488 0.96617579 1.
  0.91470463 0.99676577 0.97795922]
 [0.89551519 0.90516765 0.90492056 0.99589632 0.90197111 0.91470463
  1.         0.91121809 0.90649558]
 [0.95807338 0.96598944 0.96551582 0.90856182 0.96425398 0.99676577
  0.91121809 1.         0.97773892]
 [0.96345666 0.96306994 0.96281586 0.90346311 0.96341872 0.97795922
  0.90649558 0.97773892 1.        ]]


In [8]:
corr_df = pd.DataFrame(cosine_sim)
corr_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.983826,0.983446,0.892414,0.989049,0.960158,0.895515,0.958073,0.963457
1,0.983826,1.0,0.999482,0.902275,0.995738,0.967715,0.905168,0.965989,0.96307
2,0.983446,0.999482,1.0,0.902039,0.99521,0.967214,0.904921,0.965516,0.962816
3,0.892414,0.902275,0.902039,1.0,0.898997,0.912115,0.995896,0.908562,0.903463
4,0.989049,0.995738,0.99521,0.898997,1.0,0.966176,0.901971,0.964254,0.963419
5,0.960158,0.967715,0.967214,0.912115,0.966176,1.0,0.914705,0.996766,0.977959
6,0.895515,0.905168,0.904921,0.995896,0.901971,0.914705,1.0,0.911218,0.906496
7,0.958073,0.965989,0.965516,0.908562,0.964254,0.996766,0.911218,1.0,0.977739
8,0.963457,0.96307,0.962816,0.903463,0.963419,0.977959,0.906496,0.977739,1.0


In [9]:
import matplotlib.pyplot as plt

corr_df.style.background_gradient(cmap ='viridis')\
        .set_properties(**{'font-size': '8px'})

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.983826,0.983446,0.892414,0.989049,0.960158,0.895515,0.958073,0.963457
1,0.983826,1.0,0.999482,0.902275,0.995738,0.967715,0.905168,0.965989,0.96307
2,0.983446,0.999482,1.0,0.902039,0.99521,0.967214,0.904921,0.965516,0.962816
3,0.892414,0.902275,0.902039,1.0,0.898997,0.912115,0.995896,0.908562,0.903463
4,0.989049,0.995738,0.99521,0.898997,1.0,0.966176,0.901971,0.964254,0.963419
5,0.960158,0.967715,0.967214,0.912115,0.966176,1.0,0.914705,0.996766,0.977959
6,0.895515,0.905168,0.904921,0.995896,0.901971,0.914705,1.0,0.911218,0.906496
7,0.958073,0.965989,0.965516,0.908562,0.964254,0.996766,0.911218,1.0,0.977739
8,0.963457,0.96307,0.962816,0.903463,0.963419,0.977959,0.906496,0.977739,1.0


In [10]:
# Move from matrix into tidy data

pairs = corr_df.unstack().reset_index()
# pairs _df.columns = ['doc_A', 'doc_B', 'similarity_measure']
pairs_df = pd.DataFrame(pairs)
pairs_df.columns = ['Doc_A', 'Doc_B', 'Similarity_Score']
    
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
0,0,0,1.000000
1,0,1,0.983826
2,0,2,0.983446
3,0,3,0.892414
4,0,4,0.989049
...,...,...,...
76,8,4,0.963419
77,8,5,0.977959
78,8,6,0.906496
79,8,7,0.977739


In [11]:
# Clean out rows where maching self

pairs_df = pairs_df[pairs_df['Doc_A'] != pairs_df['Doc_B']]
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.983826
2,0,2,0.983446
3,0,3,0.892414
4,0,4,0.989049
5,0,5,0.960158
...,...,...,...
75,8,3,0.903463
76,8,4,0.963419
77,8,5,0.977959
78,8,6,0.906496


In [12]:
# Get unique pairs (we don't want each thing twice; no duplicates)

unique_pairs = pairs_df.loc[pd.DataFrame(np.sort(pairs_df[['Doc_A', 'Doc_B']], 1), index=pairs_df.index).drop_duplicates
(keep='first').index]

unique_pairs

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.983826
2,0,2,0.983446
3,0,3,0.892414
4,0,4,0.989049
5,0,5,0.960158
6,0,6,0.895515
7,0,7,0.958073
8,0,8,0.963457
11,1,2,0.999482
12,1,3,0.902275


In [13]:
# Get most and least similar documents

def get_top_docs(sim_df, metadata, num_docs=10, rank='top'):
    sorted = sim_df.sort_values('Similarity_Score', ascending=True)
    if rank == 'top':
        sliced = sorted.tail(num_docs)
    elif rank == 'bottom':
        sliced = sorted.head(num_docs)
    else:
        return "Please use 'top' or 'bottom' for rank variable"
    
    sliced_named = sliced.merge(metadata, how="left", left_on = "Doc_A", right_on="index").merge(metadata, how='left',
    left_on="Doc_B", right_on="index")
    sliced_named.columns = ['Doc_A', 'Doc_B', 'Similarity_Score', 'Index', 'Doc_A_ID', "IndexB", "Doc_B_ID"]
    
    print(sliced_named)
    
    top_docs_df = sliced_named[['Similarity_Score', 'Doc_A_ID', 'Doc_B_ID']]
    
    return top_docs_df

# 1Timothy - Least Similar

In [14]:
## Get least similar (bottom)

get_top_docs(unique_pairs, timothy_df[['index', 'doc_id']], rank='bottom')

   Doc_A  Doc_B  Similarity_Score  Index                Doc_A_ID  IndexB  \
0      0      3          0.892414      0  NASB-2020-1Timothy.txt       3   
1      0      6          0.895515      0  NASB-2020-1Timothy.txt       6   
2      3      4          0.898997      3   ERV-1881-1Timothy.txt       4   
3      4      6          0.901971      4  NASB-1995-1Timothy.txt       6   
4      2      3          0.902039      2  NASB-1971-1Timothy.txt       3   
5      1      3          0.902275      1  NASB-1977-1Timothy.txt       3   
6      3      8          0.903463      3   ERV-1881-1Timothy.txt       8   
7      2      6          0.904921      2  NASB-1971-1Timothy.txt       6   
8      1      6          0.905168      1  NASB-1977-1Timothy.txt       6   
9      6      8          0.906496      6   ASV-1900-1Timothy.txt       8   

                 Doc_B_ID  
0   ERV-1881-1Timothy.txt  
1   ASV-1900-1Timothy.txt  
2  NASB-1995-1Timothy.txt  
3   ASV-1900-1Timothy.txt  
4   ERV-1881-1Timothy.t

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.892414,NASB-2020-1Timothy.txt,ERV-1881-1Timothy.txt
1,0.895515,NASB-2020-1Timothy.txt,ASV-1900-1Timothy.txt
2,0.898997,ERV-1881-1Timothy.txt,NASB-1995-1Timothy.txt
3,0.901971,NASB-1995-1Timothy.txt,ASV-1900-1Timothy.txt
4,0.902039,NASB-1971-1Timothy.txt,ERV-1881-1Timothy.txt
5,0.902275,NASB-1977-1Timothy.txt,ERV-1881-1Timothy.txt
6,0.903463,ERV-1881-1Timothy.txt,NRSV-1989-1Timothy.txt
7,0.904921,NASB-1971-1Timothy.txt,ASV-1900-1Timothy.txt
8,0.905168,NASB-1977-1Timothy.txt,ASV-1900-1Timothy.txt
9,0.906496,ASV-1900-1Timothy.txt,NRSV-1989-1Timothy.txt


# 1Timothy - Most Similar

In [15]:
## Get most similar (top)

get_top_docs(unique_pairs, timothy_df[['index', 'doc_id']], rank='top')

   Doc_A  Doc_B  Similarity_Score  Index                Doc_A_ID  IndexB  \
0      7      8          0.977739      7   RSV-1946-1Timothy.txt       8   
1      5      8          0.977959      5   RSV-1971-1Timothy.txt       8   
2      0      2          0.983446      0  NASB-2020-1Timothy.txt       2   
3      0      1          0.983826      0  NASB-2020-1Timothy.txt       1   
4      0      4          0.989049      0  NASB-2020-1Timothy.txt       4   
5      2      4          0.995210      2  NASB-1971-1Timothy.txt       4   
6      1      4          0.995738      1  NASB-1977-1Timothy.txt       4   
7      3      6          0.995896      3   ERV-1881-1Timothy.txt       6   
8      5      7          0.996766      5   RSV-1971-1Timothy.txt       7   
9      1      2          0.999482      1  NASB-1977-1Timothy.txt       2   

                 Doc_B_ID  
0  NRSV-1989-1Timothy.txt  
1  NRSV-1989-1Timothy.txt  
2  NASB-1971-1Timothy.txt  
3  NASB-1977-1Timothy.txt  
4  NASB-1995-1Timothy.t

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.977739,RSV-1946-1Timothy.txt,NRSV-1989-1Timothy.txt
1,0.977959,RSV-1971-1Timothy.txt,NRSV-1989-1Timothy.txt
2,0.983446,NASB-2020-1Timothy.txt,NASB-1971-1Timothy.txt
3,0.983826,NASB-2020-1Timothy.txt,NASB-1977-1Timothy.txt
4,0.989049,NASB-2020-1Timothy.txt,NASB-1995-1Timothy.txt
5,0.99521,NASB-1971-1Timothy.txt,NASB-1995-1Timothy.txt
6,0.995738,NASB-1977-1Timothy.txt,NASB-1995-1Timothy.txt
7,0.995896,ERV-1881-1Timothy.txt,ASV-1900-1Timothy.txt
8,0.996766,RSV-1971-1Timothy.txt,RSV-1946-1Timothy.txt
9,0.999482,NASB-1977-1Timothy.txt,NASB-1971-1Timothy.txt


In [16]:
# Get most similar to a particular title

def get_similar_docs(title, sim_mx, metadata):

    idx = metadata.index[metadata['doc_id'] == title].tolist()
    # print(idx)

    # Get similarity scores
    sim_scores = list(enumerate(sim_mx[idx[0]]))

    # sort them
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 10 (ignorning the 1 for self-matching)

    sim_scores = sim_scores[1:11]
    
    scores = pd.DataFrame(sim_scores)
    scores.columns = ['index', 'similarity_score']
    # print(scores)

    title_index = [i[0] for i in sim_scores]
    matches = pd.DataFrame(metadata['doc_id'].iloc[title_index]).reset_index()
    # print(matches)
    matches = matches.merge(scores, how="left", on="index")

    return matches

# 1Timothy - Get Similar Docs

In [17]:
get_similar_docs('ERV-1881-1Timothy.txt', corr_df, texts_df)

Unnamed: 0,index,doc_id,similarity_score
0,6,ASV-1900-1Timothy.txt,0.995896
1,5,RSV-1971-1Timothy.txt,0.912115
2,7,RSV-1946-1Timothy.txt,0.908562
3,8,NRSV-1989-1Timothy.txt,0.903463
4,1,NASB-1977-1Timothy.txt,0.902275
5,2,NASB-1971-1Timothy.txt,0.902039
6,4,NASB-1995-1Timothy.txt,0.898997
7,0,NASB-2020-1Timothy.txt,0.892414


In [18]:
get_similar_docs('RSV-1946-1Timothy.txt', corr_df, texts_df)

Unnamed: 0,index,doc_id,similarity_score
0,5,RSV-1971-1Timothy.txt,0.996766
1,8,NRSV-1989-1Timothy.txt,0.977739
2,1,NASB-1977-1Timothy.txt,0.965989
3,2,NASB-1971-1Timothy.txt,0.965516
4,4,NASB-1995-1Timothy.txt,0.964254
5,0,NASB-2020-1Timothy.txt,0.958073
6,6,ASV-1900-1Timothy.txt,0.911218
7,3,ERV-1881-1Timothy.txt,0.908562


In [19]:
get_similar_docs('ASV-1900-1Timothy.txt', corr_df, texts_df)

Unnamed: 0,index,doc_id,similarity_score
0,3,ERV-1881-1Timothy.txt,0.995896
1,5,RSV-1971-1Timothy.txt,0.914705
2,7,RSV-1946-1Timothy.txt,0.911218
3,8,NRSV-1989-1Timothy.txt,0.906496
4,1,NASB-1977-1Timothy.txt,0.905168
5,2,NASB-1971-1Timothy.txt,0.904921
6,4,NASB-1995-1Timothy.txt,0.901971
7,0,NASB-2020-1Timothy.txt,0.895515


In [20]:
get_similar_docs('NASB-1971-1Timothy.txt', corr_df, texts_df)

Unnamed: 0,index,doc_id,similarity_score
0,1,NASB-1977-1Timothy.txt,0.999482
1,4,NASB-1995-1Timothy.txt,0.99521
2,0,NASB-2020-1Timothy.txt,0.983446
3,5,RSV-1971-1Timothy.txt,0.967214
4,7,RSV-1946-1Timothy.txt,0.965516
5,8,NRSV-1989-1Timothy.txt,0.962816
6,6,ASV-1900-1Timothy.txt,0.904921
7,3,ERV-1881-1Timothy.txt,0.902039


# Ephesians

In [21]:
#### Reset book to Ephesians ####

ephesians_df = texts_df[texts_df['book'] == "Ephesians"]
texts_df[texts_df['book'] == "Ephesians"]

Unnamed: 0,doc_id,book,text
9,ERV-1881-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus through the..."
10,NASB-1995-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
11,NASB-2020-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
12,NASB-1971-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus (1) by the ..."
13,NRSV-1989-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
14,ASV-1900-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus through the..."
15,RSV-1946-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
16,NASB-1977-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
17,RSV-1971-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus i by the wi..."


In [22]:
ephesians_df.reset_index(inplace=True)
# jw - Added another "reset_index"
ephesians_df.reset_index(inplace=True)
ephesians_df

Unnamed: 0,level_0,index,doc_id,book,text
0,0,9,ERV-1881-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus through the..."
1,1,10,NASB-1995-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
2,2,11,NASB-2020-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
3,3,12,NASB-1971-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus (1) by the ..."
4,4,13,NRSV-1989-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
5,5,14,ASV-1900-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus through the..."
6,6,15,RSV-1946-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
7,7,16,NASB-1977-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus by the will..."
8,8,17,RSV-1971-Ephesians.txt,Ephesians,"1 paul, an apostle of christ jesus i by the wi..."


In [23]:
# Create Tf-IDF Vector Representaiton

vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer.fit_transform(ephesians_df['text'])

print (tfidf_matrix.shape)

(9, 1138)


In [24]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.92375141 0.92115548 0.92515994 0.91207454 0.99879794
  0.92499161 0.92566992 0.92463389]
 [0.92375141 1.         0.99593201 0.99790234 0.98591375 0.92303316
  0.98791041 0.99820845 0.98766399]
 [0.92115548 0.99593201 1.         0.9939586  0.98575413 0.92049072
  0.98564164 0.99425981 0.98556348]
 [0.92515994 0.99790234 0.9939586  1.         0.9851796  0.92457306
  0.98902396 0.99962457 0.98924379]
 [0.91207454 0.98591375 0.98575413 0.9851796  1.         0.91219888
  0.98958994 0.98527866 0.98927755]
 [0.99879794 0.92303316 0.92049072 0.92457306 0.91219888 1.
  0.9248119  0.92513607 0.92456096]
 [0.92499161 0.98791041 0.98564164 0.98902396 0.98958994 0.9248119
  1.         0.98919951 0.99905676]
 [0.92566992 0.99820845 0.99425981 0.99962457 0.98527866 0.92513607
  0.98919951 1.         0.98941383]
 [0.92463389 0.98766399 0.98556348 0.98924379 0.98927755 0.92456096
  0.99905676 0.98941383 1.        ]]


In [25]:
corr_df = pd.DataFrame(cosine_sim)
corr_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.923751,0.921155,0.92516,0.912075,0.998798,0.924992,0.92567,0.924634
1,0.923751,1.0,0.995932,0.997902,0.985914,0.923033,0.98791,0.998208,0.987664
2,0.921155,0.995932,1.0,0.993959,0.985754,0.920491,0.985642,0.99426,0.985563
3,0.92516,0.997902,0.993959,1.0,0.98518,0.924573,0.989024,0.999625,0.989244
4,0.912075,0.985914,0.985754,0.98518,1.0,0.912199,0.98959,0.985279,0.989278
5,0.998798,0.923033,0.920491,0.924573,0.912199,1.0,0.924812,0.925136,0.924561
6,0.924992,0.98791,0.985642,0.989024,0.98959,0.924812,1.0,0.9892,0.999057
7,0.92567,0.998208,0.99426,0.999625,0.985279,0.925136,0.9892,1.0,0.989414
8,0.924634,0.987664,0.985563,0.989244,0.989278,0.924561,0.999057,0.989414,1.0


In [26]:
import matplotlib.pyplot as plt

corr_df.style.background_gradient(cmap ='viridis')\
        .set_properties(**{'font-size': '8px'})

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.923751,0.921155,0.92516,0.912075,0.998798,0.924992,0.92567,0.924634
1,0.923751,1.0,0.995932,0.997902,0.985914,0.923033,0.98791,0.998208,0.987664
2,0.921155,0.995932,1.0,0.993959,0.985754,0.920491,0.985642,0.99426,0.985563
3,0.92516,0.997902,0.993959,1.0,0.98518,0.924573,0.989024,0.999625,0.989244
4,0.912075,0.985914,0.985754,0.98518,1.0,0.912199,0.98959,0.985279,0.989278
5,0.998798,0.923033,0.920491,0.924573,0.912199,1.0,0.924812,0.925136,0.924561
6,0.924992,0.98791,0.985642,0.989024,0.98959,0.924812,1.0,0.9892,0.999057
7,0.92567,0.998208,0.99426,0.999625,0.985279,0.925136,0.9892,1.0,0.989414
8,0.924634,0.987664,0.985563,0.989244,0.989278,0.924561,0.999057,0.989414,1.0


In [27]:
# Move from matrix into tidy data

pairs = corr_df.unstack().reset_index()
# pairs _df.columns = ['doc_A', 'doc_B', 'similarity_measure']
pairs_df = pd.DataFrame(pairs)
pairs_df.columns = ['Doc_A', 'Doc_B', 'Similarity_Score']
    
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
0,0,0,1.000000
1,0,1,0.923751
2,0,2,0.921155
3,0,3,0.925160
4,0,4,0.912075
...,...,...,...
76,8,4,0.989278
77,8,5,0.924561
78,8,6,0.999057
79,8,7,0.989414


In [28]:
# Clean out rows where maching self

pairs_df = pairs_df[pairs_df['Doc_A'] != pairs_df['Doc_B']]
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.923751
2,0,2,0.921155
3,0,3,0.925160
4,0,4,0.912075
5,0,5,0.998798
...,...,...,...
75,8,3,0.989244
76,8,4,0.989278
77,8,5,0.924561
78,8,6,0.999057


In [29]:
# Get unique pairs (we don't want each thing twice; no duplicates)

unique_pairs = pairs_df.loc[pd.DataFrame(np.sort(pairs_df[['Doc_A', 'Doc_B']], 1), index=pairs_df.index).drop_duplicates
(keep='first').index]

unique_pairs

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.923751
2,0,2,0.921155
3,0,3,0.92516
4,0,4,0.912075
5,0,5,0.998798
6,0,6,0.924992
7,0,7,0.92567
8,0,8,0.924634
11,1,2,0.995932
12,1,3,0.997902


In [30]:
# Get most and least similar documents

def get_top_docs(sim_df, metadata, num_docs=10, rank='top'):
    sorted = sim_df.sort_values('Similarity_Score', ascending=True)
    if rank == 'top':
        sliced = sorted.tail(num_docs)
    elif rank == 'bottom':
        sliced = sorted.head(num_docs)
    else:
        return "Please use 'top' or 'bottom' for rank variable"
    
    sliced_named = sliced.merge(metadata, how="left", left_on = "Doc_A", right_on="level_0").merge(metadata, how='left',
    left_on="Doc_B", right_on="level_0") # Changed the 'right_on' condition to 'level_0'
    sliced_named.columns = ['Doc_A', 'Doc_B', 'Similarity_Score', 'Index', 'Doc_A_ID', "IndexB", "Doc_B_ID"]
    
    print(sliced_named)
    
    top_docs_df = sliced_named[['Similarity_Score', 'Doc_A_ID', 'Doc_B_ID']]
    
    return top_docs_df

# Ephesians - Least Similar

In [31]:
## Get least similar (bottom)

get_top_docs(unique_pairs, ephesians_df[['level_0', 'doc_id']], rank='bottom')

   Doc_A  Doc_B  Similarity_Score  Index                 Doc_A_ID  IndexB  \
0      0      4          0.912075      0   ERV-1881-Ephesians.txt       4   
1      4      5          0.912199      4  NRSV-1989-Ephesians.txt       5   
2      2      5          0.920491      2  NASB-2020-Ephesians.txt       5   
3      0      2          0.921155      0   ERV-1881-Ephesians.txt       2   
4      1      5          0.923033      1  NASB-1995-Ephesians.txt       5   
5      0      1          0.923751      0   ERV-1881-Ephesians.txt       1   
6      5      8          0.924561      5   ASV-1900-Ephesians.txt       8   
7      3      5          0.924573      3  NASB-1971-Ephesians.txt       5   
8      0      8          0.924634      0   ERV-1881-Ephesians.txt       8   
9      5      6          0.924812      5   ASV-1900-Ephesians.txt       6   

                  Doc_B_ID  
0  NRSV-1989-Ephesians.txt  
1   ASV-1900-Ephesians.txt  
2   ASV-1900-Ephesians.txt  
3  NASB-2020-Ephesians.txt  
4   ASV

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.912075,ERV-1881-Ephesians.txt,NRSV-1989-Ephesians.txt
1,0.912199,NRSV-1989-Ephesians.txt,ASV-1900-Ephesians.txt
2,0.920491,NASB-2020-Ephesians.txt,ASV-1900-Ephesians.txt
3,0.921155,ERV-1881-Ephesians.txt,NASB-2020-Ephesians.txt
4,0.923033,NASB-1995-Ephesians.txt,ASV-1900-Ephesians.txt
5,0.923751,ERV-1881-Ephesians.txt,NASB-1995-Ephesians.txt
6,0.924561,ASV-1900-Ephesians.txt,RSV-1971-Ephesians.txt
7,0.924573,NASB-1971-Ephesians.txt,ASV-1900-Ephesians.txt
8,0.924634,ERV-1881-Ephesians.txt,RSV-1971-Ephesians.txt
9,0.924812,ASV-1900-Ephesians.txt,RSV-1946-Ephesians.txt


# Ephesians - Most Similar

In [32]:
## Get most similar (top)

get_top_docs(unique_pairs, ephesians_df[['level_0', 'doc_id']], rank='top')

   Doc_A  Doc_B  Similarity_Score  Index                 Doc_A_ID  IndexB  \
0      7      8          0.989414      7  NASB-1977-Ephesians.txt       8   
1      4      6          0.989590      4  NRSV-1989-Ephesians.txt       6   
2      2      3          0.993959      2  NASB-2020-Ephesians.txt       3   
3      2      7          0.994260      2  NASB-2020-Ephesians.txt       7   
4      1      2          0.995932      1  NASB-1995-Ephesians.txt       2   
5      1      3          0.997902      1  NASB-1995-Ephesians.txt       3   
6      1      7          0.998208      1  NASB-1995-Ephesians.txt       7   
7      0      5          0.998798      0   ERV-1881-Ephesians.txt       5   
8      6      8          0.999057      6   RSV-1946-Ephesians.txt       8   
9      3      7          0.999625      3  NASB-1971-Ephesians.txt       7   

                  Doc_B_ID  
0   RSV-1971-Ephesians.txt  
1   RSV-1946-Ephesians.txt  
2  NASB-1971-Ephesians.txt  
3  NASB-1977-Ephesians.txt  
4  NASB

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.989414,NASB-1977-Ephesians.txt,RSV-1971-Ephesians.txt
1,0.98959,NRSV-1989-Ephesians.txt,RSV-1946-Ephesians.txt
2,0.993959,NASB-2020-Ephesians.txt,NASB-1971-Ephesians.txt
3,0.99426,NASB-2020-Ephesians.txt,NASB-1977-Ephesians.txt
4,0.995932,NASB-1995-Ephesians.txt,NASB-2020-Ephesians.txt
5,0.997902,NASB-1995-Ephesians.txt,NASB-1971-Ephesians.txt
6,0.998208,NASB-1995-Ephesians.txt,NASB-1977-Ephesians.txt
7,0.998798,ERV-1881-Ephesians.txt,ASV-1900-Ephesians.txt
8,0.999057,RSV-1946-Ephesians.txt,RSV-1971-Ephesians.txt
9,0.999625,NASB-1971-Ephesians.txt,NASB-1977-Ephesians.txt


In [33]:
# Get most similar to a particular title

def get_similar_docs(title, sim_mx, metadata):

    idx = metadata.index[metadata['doc_id'] == title].tolist()
    # print(idx)

    # Get similarity scores
    sim_scores = list(enumerate(sim_mx[idx[0]]))

    # sort them
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 10 (ignorning the 1 for self-matching)

    sim_scores = sim_scores[1:11]
    
    scores = pd.DataFrame(sim_scores)
    scores.columns = ['index', 'similarity_score']
    # print(scores)

    title_index = [i[0] for i in sim_scores]
    matches = pd.DataFrame(metadata['doc_id'].iloc[title_index]).reset_index()
    # print(matches)
    matches = matches.merge(scores, how="left", on="index")

    return matches

# Ephesians - Get Similar Docs

In [34]:
get_similar_docs('ERV-1881-Ephesians.txt', corr_df, ephesians_df) #changed this to the ephesians dataframe.

Unnamed: 0,index,doc_id,similarity_score
0,5,ASV-1900-Ephesians.txt,0.998798
1,7,NASB-1977-Ephesians.txt,0.92567
2,3,NASB-1971-Ephesians.txt,0.92516
3,6,RSV-1946-Ephesians.txt,0.924992
4,8,RSV-1971-Ephesians.txt,0.924634
5,1,NASB-1995-Ephesians.txt,0.923751
6,2,NASB-2020-Ephesians.txt,0.921155
7,4,NRSV-1989-Ephesians.txt,0.912075


In [35]:
get_similar_docs('ASV-1900-Ephesians.txt', corr_df, ephesians_df) #changed this to the ephesians dataframe.

Unnamed: 0,index,doc_id,similarity_score
0,0,ERV-1881-Ephesians.txt,0.998798
1,7,NASB-1977-Ephesians.txt,0.925136
2,6,RSV-1946-Ephesians.txt,0.924812
3,3,NASB-1971-Ephesians.txt,0.924573
4,8,RSV-1971-Ephesians.txt,0.924561
5,1,NASB-1995-Ephesians.txt,0.923033
6,2,NASB-2020-Ephesians.txt,0.920491
7,4,NRSV-1989-Ephesians.txt,0.912199


In [36]:
get_similar_docs('NASB-1971-Ephesians.txt', corr_df, ephesians_df) #changed this to the ephesians dataframe.

Unnamed: 0,index,doc_id,similarity_score
0,7,NASB-1977-Ephesians.txt,0.999625
1,1,NASB-1995-Ephesians.txt,0.997902
2,2,NASB-2020-Ephesians.txt,0.993959
3,8,RSV-1971-Ephesians.txt,0.989244
4,6,RSV-1946-Ephesians.txt,0.989024
5,4,NRSV-1989-Ephesians.txt,0.98518
6,0,ERV-1881-Ephesians.txt,0.92516
7,5,ASV-1900-Ephesians.txt,0.924573
