In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Dataframe

In [2]:
### Load texts into a dataframe (long version)


# source_dir = "../data/analysis_data/bibles-txt-ft-cleaned/"

# texts = []

# for filename in os.listdir(source_dir):
#     if filename.endswith('1Timothy.txt'):
#         with open(os.path.join(source_dir, filename), 'r') as obit:
#             content = obit.read()
#         texts.append(
#             {
#                 "doc_id": filename,
#                 "book": '1Timothy',
#                 "text": content
#             }
#         )
#     elif filename in os.listdir(source_dir):
#         if filename.endswith('Ephesians.txt'):
#             with open(os.path.join(source_dir, filename), 'r') as obit:
#                 content = obit.read()
#             texts.append(
#                 {
#                     "doc_id": filename,
#                     "book": 'Ephesians',
#                     "text": content
#                 }
#             )
        
#             else:
#                 pass

In [3]:
### Load texts into a dataframe (written as list, shorter version)

# Create function that changes the text

def preprocessing_function(content):
#   lines of code here that change the text

    content = content.lower()
    
    return content


source_dir = "../data/analysis_data/bibles-txt-ft-cleaned/"

books = ['1Timothy', 'Ephesians']

texts = []


for book in books:
    for filename in os.listdir(source_dir):
        if filename.endswith(book+".txt"):
            with open(os.path.join(source_dir, filename), 'r') as obit:
                content = obit.read()
#               Apply cleaned content code
                cleaned_content = preprocessing_function(content)
            texts.append(
                {
                    "doc_id": filename,
                    "book": book,
                    "text": cleaned_content
                }
            )
        else:
            pass

texts_df = pd.DataFrame(texts)
texts_df.head(10)

Unnamed: 0,doc_id,book,text
0,NASB-2020-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n\ncor...
1,NASB-1977-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\n1 pa...
2,NASB-1971-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\nsalu...
3,ERV-1881-1Timothy.txt,1Timothy,the first epistle of paul the apostle to timot...
4,NASB-1995-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n\nmis...
5,RSV-1971-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n1 pau...
6,ASV-1900-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\nsalu...
7,RSV-1946-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n1 pau...
8,NRSV-1989-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\ngrati...
9,ERV-1881-Ephesians.txt,Ephesians,the epistle of paul the apostle to the ephesia...


# 1Timothy

In [4]:
### Set book to 1Timothy

timothy_df = texts_df[texts_df['book'] == "1Timothy"]
texts_df[texts_df['book'] == "1Timothy"]

Unnamed: 0,doc_id,book,text
0,NASB-2020-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n\ncor...
1,NASB-1977-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\n1 pa...
2,NASB-1971-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\nsalu...
3,ERV-1881-1Timothy.txt,1Timothy,the first epistle of paul the apostle to timot...
4,NASB-1995-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n\nmis...
5,RSV-1971-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n1 pau...
6,ASV-1900-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\nsalu...
7,RSV-1946-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n1 pau...
8,NRSV-1989-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\ngrati...


In [5]:
timothy_df.reset_index(inplace=True)
timothy_df

Unnamed: 0,index,doc_id,book,text
0,0,NASB-2020-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n\ncor...
1,1,NASB-1977-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\n1 pa...
2,2,NASB-1971-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\nsalu...
3,3,ERV-1881-1Timothy.txt,1Timothy,the first epistle of paul the apostle to timot...
4,4,NASB-1995-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n\nmis...
5,5,RSV-1971-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n1 pau...
6,6,ASV-1900-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\nsalu...
7,7,RSV-1946-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n1 pau...
8,8,NRSV-1989-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\ngrati...


In [6]:
# Create Tf-IDF Vector Representaiton

vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer.fit_transform(timothy_df['text'])

print (tfidf_matrix.shape)

(9, 1399)


In [7]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.98314014 0.93073159 0.88169319 0.98929375 0.95910957
  0.88356863 0.95743374 0.95174055]
 [0.98314014 1.         0.94450041 0.89137894 0.99449446 0.96598171
  0.8931242  0.96473007 0.94972951]
 [0.93073159 0.94450041 1.         0.84781809 0.94044515 0.91674429
  0.85022391 0.91561579 0.90534922]
 [0.88169319 0.89137894 0.84781809 1.         0.88780983 0.904696
  0.99399193 0.90190626 0.89014865]
 [0.98929375 0.99449446 0.94044515 0.88780983 1.         0.96473018
  0.8897256  0.96324413 0.95066475]
 [0.95910957 0.96598171 0.91674429 0.904696   0.96473018 1.
  0.90616372 0.99636235 0.96990109]
 [0.88356863 0.8931242  0.85022391 0.99399193 0.8897256  0.90616372
  1.         0.90358608 0.89158219]
 [0.95743374 0.96473007 0.91561579 0.90190626 0.96324413 0.99636235
  0.90358608 1.         0.96757991]
 [0.95174055 0.94972951 0.90534922 0.89014865 0.95066475 0.96990109
  0.89158219 0.96757991 1.        ]]


In [8]:
corr_df = pd.DataFrame(cosine_sim)
corr_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.98314,0.930732,0.881693,0.989294,0.95911,0.883569,0.957434,0.951741
1,0.98314,1.0,0.9445,0.891379,0.994494,0.965982,0.893124,0.96473,0.94973
2,0.930732,0.9445,1.0,0.847818,0.940445,0.916744,0.850224,0.915616,0.905349
3,0.881693,0.891379,0.847818,1.0,0.88781,0.904696,0.993992,0.901906,0.890149
4,0.989294,0.994494,0.940445,0.88781,1.0,0.96473,0.889726,0.963244,0.950665
5,0.95911,0.965982,0.916744,0.904696,0.96473,1.0,0.906164,0.996362,0.969901
6,0.883569,0.893124,0.850224,0.993992,0.889726,0.906164,1.0,0.903586,0.891582
7,0.957434,0.96473,0.915616,0.901906,0.963244,0.996362,0.903586,1.0,0.96758
8,0.951741,0.94973,0.905349,0.890149,0.950665,0.969901,0.891582,0.96758,1.0


In [9]:
import matplotlib.pyplot as plt

corr_df.style.background_gradient(cmap ='viridis')\
        .set_properties(**{'font-size': '8px'})

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.98314,0.930732,0.881693,0.989294,0.95911,0.883569,0.957434,0.951741
1,0.98314,1.0,0.9445,0.891379,0.994494,0.965982,0.893124,0.96473,0.94973
2,0.930732,0.9445,1.0,0.847818,0.940445,0.916744,0.850224,0.915616,0.905349
3,0.881693,0.891379,0.847818,1.0,0.88781,0.904696,0.993992,0.901906,0.890149
4,0.989294,0.994494,0.940445,0.88781,1.0,0.96473,0.889726,0.963244,0.950665
5,0.95911,0.965982,0.916744,0.904696,0.96473,1.0,0.906164,0.996362,0.969901
6,0.883569,0.893124,0.850224,0.993992,0.889726,0.906164,1.0,0.903586,0.891582
7,0.957434,0.96473,0.915616,0.901906,0.963244,0.996362,0.903586,1.0,0.96758
8,0.951741,0.94973,0.905349,0.890149,0.950665,0.969901,0.891582,0.96758,1.0


In [10]:
# Move from matrix into tidy data

pairs = corr_df.unstack().reset_index()
# pairs _df.columns = ['doc_A', 'doc_B', 'similarity_measure']
pairs_df = pd.DataFrame(pairs)
pairs_df.columns = ['Doc_A', 'Doc_B', 'Similarity_Score']
    
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
0,0,0,1.000000
1,0,1,0.983140
2,0,2,0.930732
3,0,3,0.881693
4,0,4,0.989294
...,...,...,...
76,8,4,0.950665
77,8,5,0.969901
78,8,6,0.891582
79,8,7,0.967580


In [11]:
# Clean out rows where maching self

pairs_df = pairs_df[pairs_df['Doc_A'] != pairs_df['Doc_B']]
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.983140
2,0,2,0.930732
3,0,3,0.881693
4,0,4,0.989294
5,0,5,0.959110
...,...,...,...
75,8,3,0.890149
76,8,4,0.950665
77,8,5,0.969901
78,8,6,0.891582


In [12]:
# Get unique pairs (we don't want each thing twice; no duplicates)

unique_pairs = pairs_df.loc[pd.DataFrame(np.sort(pairs_df[['Doc_A', 'Doc_B']], 1), index=pairs_df.index).drop_duplicates
(keep='first').index]

unique_pairs

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.98314
2,0,2,0.930732
3,0,3,0.881693
4,0,4,0.989294
5,0,5,0.95911
6,0,6,0.883569
7,0,7,0.957434
8,0,8,0.951741
11,1,2,0.9445
12,1,3,0.891379


In [13]:
# Get most and least similar documents

def get_top_docs(sim_df, metadata, num_docs=10, rank='top'):
    sorted = sim_df.sort_values('Similarity_Score', ascending=True)
    if rank == 'top':
        sliced = sorted.tail(num_docs)
    elif rank == 'bottom':
        sliced = sorted.head(num_docs)
    else:
        return "Please use 'top' or 'bottom' for rank variable"
    
    sliced_named = sliced.merge(metadata, how="left", left_on = "Doc_A", right_on="index").merge(metadata, how='left',
    left_on="Doc_B", right_on="index")
    sliced_named.columns = ['Doc_A', 'Doc_B', 'Similarity_Score', 'Index', 'Doc_A_ID', "IndexB", "Doc_B_ID"]
    
    print(sliced_named)
    
    top_docs_df = sliced_named[['Similarity_Score', 'Doc_A_ID', 'Doc_B_ID']]
    
    return top_docs_df

# 1Timothy - Least Similar

In [14]:
## Get least similar (bottom)

get_top_docs(unique_pairs, timothy_df[['index', 'doc_id']], rank='bottom')

   Doc_A  Doc_B  Similarity_Score  Index                Doc_A_ID  IndexB  \
0      2      3          0.847818      2  NASB-1971-1Timothy.txt       3   
1      2      6          0.850224      2  NASB-1971-1Timothy.txt       6   
2      0      3          0.881693      0  NASB-2020-1Timothy.txt       3   
3      0      6          0.883569      0  NASB-2020-1Timothy.txt       6   
4      3      4          0.887810      3   ERV-1881-1Timothy.txt       4   
5      4      6          0.889726      4  NASB-1995-1Timothy.txt       6   
6      3      8          0.890149      3   ERV-1881-1Timothy.txt       8   
7      1      3          0.891379      1  NASB-1977-1Timothy.txt       3   
8      6      8          0.891582      6   ASV-1900-1Timothy.txt       8   
9      1      6          0.893124      1  NASB-1977-1Timothy.txt       6   

                 Doc_B_ID  
0   ERV-1881-1Timothy.txt  
1   ASV-1900-1Timothy.txt  
2   ERV-1881-1Timothy.txt  
3   ASV-1900-1Timothy.txt  
4  NASB-1995-1Timothy.t

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.847818,NASB-1971-1Timothy.txt,ERV-1881-1Timothy.txt
1,0.850224,NASB-1971-1Timothy.txt,ASV-1900-1Timothy.txt
2,0.881693,NASB-2020-1Timothy.txt,ERV-1881-1Timothy.txt
3,0.883569,NASB-2020-1Timothy.txt,ASV-1900-1Timothy.txt
4,0.88781,ERV-1881-1Timothy.txt,NASB-1995-1Timothy.txt
5,0.889726,NASB-1995-1Timothy.txt,ASV-1900-1Timothy.txt
6,0.890149,ERV-1881-1Timothy.txt,NRSV-1989-1Timothy.txt
7,0.891379,NASB-1977-1Timothy.txt,ERV-1881-1Timothy.txt
8,0.891582,ASV-1900-1Timothy.txt,NRSV-1989-1Timothy.txt
9,0.893124,NASB-1977-1Timothy.txt,ASV-1900-1Timothy.txt


# 1Timothy - Most Similar

In [15]:
## Get most similar (top)

get_top_docs(unique_pairs, timothy_df[['index', 'doc_id']], rank='top')

   Doc_A  Doc_B  Similarity_Score  Index                Doc_A_ID  IndexB  \
0      1      7          0.964730      1  NASB-1977-1Timothy.txt       7   
1      4      5          0.964730      4  NASB-1995-1Timothy.txt       5   
2      1      5          0.965982      1  NASB-1977-1Timothy.txt       5   
3      7      8          0.967580      7   RSV-1946-1Timothy.txt       8   
4      5      8          0.969901      5   RSV-1971-1Timothy.txt       8   
5      0      1          0.983140      0  NASB-2020-1Timothy.txt       1   
6      0      4          0.989294      0  NASB-2020-1Timothy.txt       4   
7      3      6          0.993992      3   ERV-1881-1Timothy.txt       6   
8      1      4          0.994494      1  NASB-1977-1Timothy.txt       4   
9      5      7          0.996362      5   RSV-1971-1Timothy.txt       7   

                 Doc_B_ID  
0   RSV-1946-1Timothy.txt  
1   RSV-1971-1Timothy.txt  
2   RSV-1971-1Timothy.txt  
3  NRSV-1989-1Timothy.txt  
4  NRSV-1989-1Timothy.t

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.96473,NASB-1977-1Timothy.txt,RSV-1946-1Timothy.txt
1,0.96473,NASB-1995-1Timothy.txt,RSV-1971-1Timothy.txt
2,0.965982,NASB-1977-1Timothy.txt,RSV-1971-1Timothy.txt
3,0.96758,RSV-1946-1Timothy.txt,NRSV-1989-1Timothy.txt
4,0.969901,RSV-1971-1Timothy.txt,NRSV-1989-1Timothy.txt
5,0.98314,NASB-2020-1Timothy.txt,NASB-1977-1Timothy.txt
6,0.989294,NASB-2020-1Timothy.txt,NASB-1995-1Timothy.txt
7,0.993992,ERV-1881-1Timothy.txt,ASV-1900-1Timothy.txt
8,0.994494,NASB-1977-1Timothy.txt,NASB-1995-1Timothy.txt
9,0.996362,RSV-1971-1Timothy.txt,RSV-1946-1Timothy.txt


In [16]:
# Get most similar to a particular title

def get_similar_docs(title, sim_mx, metadata):

    idx = metadata.index[metadata['doc_id'] == title].tolist()
    # print(idx)

    # Get similarity scores
    sim_scores = list(enumerate(sim_mx[idx[0]]))

    # sort them
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 10 (ignorning the 1 for self-matching)

    sim_scores = sim_scores[1:11]
    
    scores = pd.DataFrame(sim_scores)
    scores.columns = ['index', 'similarity_score']
    # print(scores)

    title_index = [i[0] for i in sim_scores]
    matches = pd.DataFrame(metadata['doc_id'].iloc[title_index]).reset_index()
    # print(matches)
    matches = matches.merge(scores, how="left", on="index")

    return matches

# 1Timothy - Get Similar Docs

In [17]:
get_similar_docs('ERV-1881-1Timothy.txt', corr_df, texts_df)

Unnamed: 0,index,doc_id,similarity_score
0,6,ASV-1900-1Timothy.txt,0.993992
1,5,RSV-1971-1Timothy.txt,0.904696
2,7,RSV-1946-1Timothy.txt,0.901906
3,1,NASB-1977-1Timothy.txt,0.891379
4,8,NRSV-1989-1Timothy.txt,0.890149
5,4,NASB-1995-1Timothy.txt,0.88781
6,0,NASB-2020-1Timothy.txt,0.881693
7,2,NASB-1971-1Timothy.txt,0.847818


In [18]:
get_similar_docs('RSV-1946-1Timothy.txt', corr_df, texts_df)

Unnamed: 0,index,doc_id,similarity_score
0,5,RSV-1971-1Timothy.txt,0.996362
1,8,NRSV-1989-1Timothy.txt,0.96758
2,1,NASB-1977-1Timothy.txt,0.96473
3,4,NASB-1995-1Timothy.txt,0.963244
4,0,NASB-2020-1Timothy.txt,0.957434
5,2,NASB-1971-1Timothy.txt,0.915616
6,6,ASV-1900-1Timothy.txt,0.903586
7,3,ERV-1881-1Timothy.txt,0.901906


In [19]:
get_similar_docs('ASV-1900-1Timothy.txt', corr_df, texts_df)

Unnamed: 0,index,doc_id,similarity_score
0,3,ERV-1881-1Timothy.txt,0.993992
1,5,RSV-1971-1Timothy.txt,0.906164
2,7,RSV-1946-1Timothy.txt,0.903586
3,1,NASB-1977-1Timothy.txt,0.893124
4,8,NRSV-1989-1Timothy.txt,0.891582
5,4,NASB-1995-1Timothy.txt,0.889726
6,0,NASB-2020-1Timothy.txt,0.883569
7,2,NASB-1971-1Timothy.txt,0.850224


In [20]:
get_similar_docs('NASB-1971-1Timothy.txt', corr_df, texts_df)

Unnamed: 0,index,doc_id,similarity_score
0,1,NASB-1977-1Timothy.txt,0.9445
1,4,NASB-1995-1Timothy.txt,0.940445
2,0,NASB-2020-1Timothy.txt,0.930732
3,5,RSV-1971-1Timothy.txt,0.916744
4,7,RSV-1946-1Timothy.txt,0.915616
5,8,NRSV-1989-1Timothy.txt,0.905349
6,6,ASV-1900-1Timothy.txt,0.850224
7,3,ERV-1881-1Timothy.txt,0.847818


# Ephesians

In [21]:
#### Reset book to Ephesians ####

ephesians_df = texts_df[texts_df['book'] == "Ephesians"]
texts_df[texts_df['book'] == "Ephesians"]

Unnamed: 0,doc_id,book,text
9,ERV-1881-Ephesians.txt,Ephesians,the epistle of paul the apostle to the ephesia...
10,NASB-1995-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\n\nthe...
11,NASB-2020-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\n\nthe...
12,NASB-1971-Ephesians.txt,Ephesians,the epistle of paul to the ephesians\n\n\nsalu...
13,NRSV-1989-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\nspiri...
14,ASV-1900-Ephesians.txt,Ephesians,the epistle of paul to the ephesians\n\n\nsalu...
15,RSV-1946-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\n1 pau...
16,NASB-1977-Ephesians.txt,Ephesians,the epistle of paul to the ephesians\n\n1 paul...
17,RSV-1971-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\n1 pau...


In [22]:
ephesians_df.reset_index(inplace=True)
# jw - Added another "reset_index"
ephesians_df.reset_index(inplace=True)
ephesians_df

Unnamed: 0,level_0,index,doc_id,book,text
0,0,9,ERV-1881-Ephesians.txt,Ephesians,the epistle of paul the apostle to the ephesia...
1,1,10,NASB-1995-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\n\nthe...
2,2,11,NASB-2020-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\n\nthe...
3,3,12,NASB-1971-Ephesians.txt,Ephesians,the epistle of paul to the ephesians\n\n\nsalu...
4,4,13,NRSV-1989-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\nspiri...
5,5,14,ASV-1900-Ephesians.txt,Ephesians,the epistle of paul to the ephesians\n\n\nsalu...
6,6,15,RSV-1946-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\n1 pau...
7,7,16,NASB-1977-Ephesians.txt,Ephesians,the epistle of paul to the ephesians\n\n1 paul...
8,8,17,RSV-1971-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\n1 pau...


In [23]:
# Create Tf-IDF Vector Representaiton

vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer.fit_transform(ephesians_df['text'])

print (tfidf_matrix.shape)

(9, 1236)


In [24]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.9222168  0.91963113 0.86782705 0.91050808 0.99756462
  0.9236831  0.92401696 0.92249382]
 [0.9222168  1.         0.99556158 0.93143752 0.98223641 0.92117801
  0.98728721 0.99708096 0.9870255 ]
 [0.91963113 0.99556158 1.         0.92797692 0.9819419  0.91872365
  0.98446632 0.99279049 0.9843556 ]
 [0.86782705 0.93143752 0.92797692 1.         0.91894563 0.86706852
  0.92430836 0.93401867 0.92425051]
 [0.91050808 0.98223641 0.9819419  0.91894563 1.         0.9100537
  0.98566172 0.98216593 0.98580626]
 [0.99756462 0.92117801 0.91872365 0.86706852 0.9100537  1.
  0.92328946 0.92305153 0.92211747]
 [0.9236831  0.98728721 0.98446632 0.92430836 0.98566172 0.92328946
  1.         0.9885749  0.99883528]
 [0.92401696 0.99708096 0.99279049 0.93401867 0.98216593 0.92305153
  0.9885749  1.         0.98868797]
 [0.92249382 0.9870255  0.9843556  0.92425051 0.98580626 0.92211747
  0.99883528 0.98868797 1.        ]]


In [25]:
corr_df = pd.DataFrame(cosine_sim)
corr_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.922217,0.919631,0.867827,0.910508,0.997565,0.923683,0.924017,0.922494
1,0.922217,1.0,0.995562,0.931438,0.982236,0.921178,0.987287,0.997081,0.987026
2,0.919631,0.995562,1.0,0.927977,0.981942,0.918724,0.984466,0.99279,0.984356
3,0.867827,0.931438,0.927977,1.0,0.918946,0.867069,0.924308,0.934019,0.924251
4,0.910508,0.982236,0.981942,0.918946,1.0,0.910054,0.985662,0.982166,0.985806
5,0.997565,0.921178,0.918724,0.867069,0.910054,1.0,0.923289,0.923052,0.922117
6,0.923683,0.987287,0.984466,0.924308,0.985662,0.923289,1.0,0.988575,0.998835
7,0.924017,0.997081,0.99279,0.934019,0.982166,0.923052,0.988575,1.0,0.988688
8,0.922494,0.987026,0.984356,0.924251,0.985806,0.922117,0.998835,0.988688,1.0


In [26]:
import matplotlib.pyplot as plt

corr_df.style.background_gradient(cmap ='viridis')\
        .set_properties(**{'font-size': '8px'})

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.922217,0.919631,0.867827,0.910508,0.997565,0.923683,0.924017,0.922494
1,0.922217,1.0,0.995562,0.931438,0.982236,0.921178,0.987287,0.997081,0.987026
2,0.919631,0.995562,1.0,0.927977,0.981942,0.918724,0.984466,0.99279,0.984356
3,0.867827,0.931438,0.927977,1.0,0.918946,0.867069,0.924308,0.934019,0.924251
4,0.910508,0.982236,0.981942,0.918946,1.0,0.910054,0.985662,0.982166,0.985806
5,0.997565,0.921178,0.918724,0.867069,0.910054,1.0,0.923289,0.923052,0.922117
6,0.923683,0.987287,0.984466,0.924308,0.985662,0.923289,1.0,0.988575,0.998835
7,0.924017,0.997081,0.99279,0.934019,0.982166,0.923052,0.988575,1.0,0.988688
8,0.922494,0.987026,0.984356,0.924251,0.985806,0.922117,0.998835,0.988688,1.0


In [27]:
# Move from matrix into tidy data

pairs = corr_df.unstack().reset_index()
# pairs _df.columns = ['doc_A', 'doc_B', 'similarity_measure']
pairs_df = pd.DataFrame(pairs)
pairs_df.columns = ['Doc_A', 'Doc_B', 'Similarity_Score']
    
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
0,0,0,1.000000
1,0,1,0.922217
2,0,2,0.919631
3,0,3,0.867827
4,0,4,0.910508
...,...,...,...
76,8,4,0.985806
77,8,5,0.922117
78,8,6,0.998835
79,8,7,0.988688


In [28]:
# Clean out rows where maching self

pairs_df = pairs_df[pairs_df['Doc_A'] != pairs_df['Doc_B']]
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.922217
2,0,2,0.919631
3,0,3,0.867827
4,0,4,0.910508
5,0,5,0.997565
...,...,...,...
75,8,3,0.924251
76,8,4,0.985806
77,8,5,0.922117
78,8,6,0.998835


In [29]:
# Get unique pairs (we don't want each thing twice; no duplicates)

unique_pairs = pairs_df.loc[pd.DataFrame(np.sort(pairs_df[['Doc_A', 'Doc_B']], 1), index=pairs_df.index).drop_duplicates
(keep='first').index]

unique_pairs

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.922217
2,0,2,0.919631
3,0,3,0.867827
4,0,4,0.910508
5,0,5,0.997565
6,0,6,0.923683
7,0,7,0.924017
8,0,8,0.922494
11,1,2,0.995562
12,1,3,0.931438


In [30]:
# Get most and least similar documents

def get_top_docs(sim_df, metadata, num_docs=10, rank='top'):
    sorted = sim_df.sort_values('Similarity_Score', ascending=True)
    if rank == 'top':
        sliced = sorted.tail(num_docs)
    elif rank == 'bottom':
        sliced = sorted.head(num_docs)
    else:
        return "Please use 'top' or 'bottom' for rank variable"
    
    sliced_named = sliced.merge(metadata, how="left", left_on = "Doc_A", right_on="level_0").merge(metadata, how='left',
    left_on="Doc_B", right_on="level_0") # Changed the 'right_on' condition to 'level_0'
    sliced_named.columns = ['Doc_A', 'Doc_B', 'Similarity_Score', 'Index', 'Doc_A_ID', "IndexB", "Doc_B_ID"]
    
    print(sliced_named)
    
    top_docs_df = sliced_named[['Similarity_Score', 'Doc_A_ID', 'Doc_B_ID']]
    
    return top_docs_df

# Ephesians - Least Similar

In [31]:
## Get least similar (bottom)

get_top_docs(unique_pairs, ephesians_df[['level_0', 'doc_id']], rank='bottom')

   Doc_A  Doc_B  Similarity_Score  Index                 Doc_A_ID  IndexB  \
0      3      5          0.867069      3  NASB-1971-Ephesians.txt       5   
1      0      3          0.867827      0   ERV-1881-Ephesians.txt       3   
2      4      5          0.910054      4  NRSV-1989-Ephesians.txt       5   
3      0      4          0.910508      0   ERV-1881-Ephesians.txt       4   
4      2      5          0.918724      2  NASB-2020-Ephesians.txt       5   
5      3      4          0.918946      3  NASB-1971-Ephesians.txt       4   
6      0      2          0.919631      0   ERV-1881-Ephesians.txt       2   
7      1      5          0.921178      1  NASB-1995-Ephesians.txt       5   
8      5      8          0.922117      5   ASV-1900-Ephesians.txt       8   
9      0      1          0.922217      0   ERV-1881-Ephesians.txt       1   

                  Doc_B_ID  
0   ASV-1900-Ephesians.txt  
1  NASB-1971-Ephesians.txt  
2   ASV-1900-Ephesians.txt  
3  NRSV-1989-Ephesians.txt  
4   ASV

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.867069,NASB-1971-Ephesians.txt,ASV-1900-Ephesians.txt
1,0.867827,ERV-1881-Ephesians.txt,NASB-1971-Ephesians.txt
2,0.910054,NRSV-1989-Ephesians.txt,ASV-1900-Ephesians.txt
3,0.910508,ERV-1881-Ephesians.txt,NRSV-1989-Ephesians.txt
4,0.918724,NASB-2020-Ephesians.txt,ASV-1900-Ephesians.txt
5,0.918946,NASB-1971-Ephesians.txt,NRSV-1989-Ephesians.txt
6,0.919631,ERV-1881-Ephesians.txt,NASB-2020-Ephesians.txt
7,0.921178,NASB-1995-Ephesians.txt,ASV-1900-Ephesians.txt
8,0.922117,ASV-1900-Ephesians.txt,RSV-1971-Ephesians.txt
9,0.922217,ERV-1881-Ephesians.txt,NASB-1995-Ephesians.txt


# Ephesians - Most Similar

In [32]:
## Get most similar (top)

get_top_docs(unique_pairs, ephesians_df[['level_0', 'doc_id']], rank='top')

   Doc_A  Doc_B  Similarity_Score  Index                 Doc_A_ID  IndexB  \
0      4      8          0.985806      4  NRSV-1989-Ephesians.txt       8   
1      1      8          0.987026      1  NASB-1995-Ephesians.txt       8   
2      1      6          0.987287      1  NASB-1995-Ephesians.txt       6   
3      6      7          0.988575      6   RSV-1946-Ephesians.txt       7   
4      7      8          0.988688      7  NASB-1977-Ephesians.txt       8   
5      2      7          0.992790      2  NASB-2020-Ephesians.txt       7   
6      1      2          0.995562      1  NASB-1995-Ephesians.txt       2   
7      1      7          0.997081      1  NASB-1995-Ephesians.txt       7   
8      0      5          0.997565      0   ERV-1881-Ephesians.txt       5   
9      6      8          0.998835      6   RSV-1946-Ephesians.txt       8   

                  Doc_B_ID  
0   RSV-1971-Ephesians.txt  
1   RSV-1971-Ephesians.txt  
2   RSV-1946-Ephesians.txt  
3  NASB-1977-Ephesians.txt  
4   RSV

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.985806,NRSV-1989-Ephesians.txt,RSV-1971-Ephesians.txt
1,0.987026,NASB-1995-Ephesians.txt,RSV-1971-Ephesians.txt
2,0.987287,NASB-1995-Ephesians.txt,RSV-1946-Ephesians.txt
3,0.988575,RSV-1946-Ephesians.txt,NASB-1977-Ephesians.txt
4,0.988688,NASB-1977-Ephesians.txt,RSV-1971-Ephesians.txt
5,0.99279,NASB-2020-Ephesians.txt,NASB-1977-Ephesians.txt
6,0.995562,NASB-1995-Ephesians.txt,NASB-2020-Ephesians.txt
7,0.997081,NASB-1995-Ephesians.txt,NASB-1977-Ephesians.txt
8,0.997565,ERV-1881-Ephesians.txt,ASV-1900-Ephesians.txt
9,0.998835,RSV-1946-Ephesians.txt,RSV-1971-Ephesians.txt


In [33]:
# Get most similar to a particular title

def get_similar_docs(title, sim_mx, metadata):

    idx = metadata.index[metadata['doc_id'] == title].tolist()
    # print(idx)

    # Get similarity scores
    sim_scores = list(enumerate(sim_mx[idx[0]]))

    # sort them
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 10 (ignorning the 1 for self-matching)

    sim_scores = sim_scores[1:11]
    
    scores = pd.DataFrame(sim_scores)
    scores.columns = ['index', 'similarity_score']
    # print(scores)

    title_index = [i[0] for i in sim_scores]
    matches = pd.DataFrame(metadata['doc_id'].iloc[title_index]).reset_index()
    # print(matches)
    matches = matches.merge(scores, how="left", on="index")

    return matches

# Ephesians - Get Similar Docs

In [34]:
get_similar_docs('ERV-1881-Ephesians.txt', corr_df, ephesians_df) #changed this to the ephesians dataframe.

Unnamed: 0,index,doc_id,similarity_score
0,5,ASV-1900-Ephesians.txt,0.997565
1,7,NASB-1977-Ephesians.txt,0.924017
2,6,RSV-1946-Ephesians.txt,0.923683
3,8,RSV-1971-Ephesians.txt,0.922494
4,1,NASB-1995-Ephesians.txt,0.922217
5,2,NASB-2020-Ephesians.txt,0.919631
6,4,NRSV-1989-Ephesians.txt,0.910508
7,3,NASB-1971-Ephesians.txt,0.867827


In [35]:
get_similar_docs('ASV-1900-Ephesians.txt', corr_df, ephesians_df) #changed this to the ephesians dataframe.

Unnamed: 0,index,doc_id,similarity_score
0,0,ERV-1881-Ephesians.txt,0.997565
1,6,RSV-1946-Ephesians.txt,0.923289
2,7,NASB-1977-Ephesians.txt,0.923052
3,8,RSV-1971-Ephesians.txt,0.922117
4,1,NASB-1995-Ephesians.txt,0.921178
5,2,NASB-2020-Ephesians.txt,0.918724
6,4,NRSV-1989-Ephesians.txt,0.910054
7,3,NASB-1971-Ephesians.txt,0.867069


In [36]:
get_similar_docs('NASB-1971-Ephesians.txt', corr_df, ephesians_df) #changed this to the ephesians dataframe.

Unnamed: 0,index,doc_id,similarity_score
0,7,NASB-1977-Ephesians.txt,0.934019
1,1,NASB-1995-Ephesians.txt,0.931438
2,2,NASB-2020-Ephesians.txt,0.927977
3,6,RSV-1946-Ephesians.txt,0.924308
4,8,RSV-1971-Ephesians.txt,0.924251
5,4,NRSV-1989-Ephesians.txt,0.918946
6,0,ERV-1881-Ephesians.txt,0.867827
7,5,ASV-1900-Ephesians.txt,0.867069
