In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
### Load texts into a dataframe (long version)


# source_dir = "../data/analysis_data/bibles-txt-ft/"

# texts = []

# for filename in os.listdir(source_dir):
#     if filename.endswith('1Timothy.txt'):
#         with open(os.path.join(source_dir, filename), 'r') as obit:
#             content = obit.read()
#         texts.append(
#             {
#                 "doc_id": filename,
#                 "book": '1Timothy',
#                 "text": content
#             }
#         )
#     elif filename in os.listdir(source_dir):
#         if filename.endswith('Ephesians.txt'):
#             with open(os.path.join(source_dir, filename), 'r') as obit:
#                 content = obit.read()
#             texts.append(
#                 {
#                     "doc_id": filename,
#                     "book": 'Ephesians',
#                     "text": content
#                 }
#             )
        
#             else:
#                 pass

In [3]:
### Load texts into a dataframe (written as list, shorter version)

# Create function that changes the text

def preprocessing_function(content):
#   lines of code here that change the text
    content = content.lower()
    
    return content


source_dir = "../data/analysis_data/bibles-txt-ft/"

books = ['1Timothy', 'Ephesians']

texts = []


for book in books:
    for filename in os.listdir(source_dir):
        if filename.endswith(book+".txt"):
            with open(os.path.join(source_dir, filename), 'r') as obit:
                content = obit.read()
#               Apply cleaned content code
                cleaned_content = preprocessing_function(content)
            texts.append(
                {
                    "doc_id": filename,
                    "book": book,
                    "text": cleaned_content
                }
            )
        else:
            pass

texts_df = pd.DataFrame(texts)
texts_df.head(10)

Unnamed: 0,doc_id,book,text
0,NASB-2020-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n\ncor...
1,NASB-1977-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\n1 pa...
2,NASB-1971-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\nsalu...
3,ERV-1881-1Timothy.txt,1Timothy,the first epistle of paul the apostle to timot...
4,NASB-1995-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n\nmis...
5,RSV-1971-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n1 pau...
6,ASV-1900-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\nsalu...
7,RSV-1946-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n1 pau...
8,NRSV-1989-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\ngrati...
9,ERV-1881-Ephesians.txt,Ephesians,the epistle of paul the apostle to the ephesia...


In [4]:
### Set book to 1Timothy

timothy_df = texts_df[texts_df['book'] == "1Timothy"]
texts_df[texts_df['book'] == "1Timothy"]

Unnamed: 0,doc_id,book,text
0,NASB-2020-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n\ncor...
1,NASB-1977-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\n1 pa...
2,NASB-1971-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\nsalu...
3,ERV-1881-1Timothy.txt,1Timothy,the first epistle of paul the apostle to timot...
4,NASB-1995-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n\nmis...
5,RSV-1971-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n1 pau...
6,ASV-1900-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\nsalu...
7,RSV-1946-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n1 pau...
8,NRSV-1989-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\ngrati...


In [5]:
timothy_df.reset_index(inplace=True)
timothy_df

Unnamed: 0,index,doc_id,book,text
0,0,NASB-2020-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n\ncor...
1,1,NASB-1977-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\n1 pa...
2,2,NASB-1971-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\nsalu...
3,3,ERV-1881-1Timothy.txt,1Timothy,the first epistle of paul the apostle to timot...
4,4,NASB-1995-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n\nmis...
5,5,RSV-1971-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n1 pau...
6,6,ASV-1900-1Timothy.txt,1Timothy,the first epistle of paul to timothy\n\n\nsalu...
7,7,RSV-1946-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\n1 pau...
8,8,NRSV-1989-1Timothy.txt,1Timothy,the first letter of paul to timothy\n\n\ngrati...


In [6]:
# Create Tf-IDF Vector Representaiton

vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer.fit_transform(timothy_df['text'])

print (tfidf_matrix.shape)

(9, 1410)


In [7]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.98287292 0.93024916 0.88166837 0.9891282  0.95843405
  0.88353897 0.95697244 0.95171007]
 [0.98287292 1.         0.94418793 0.89133853 0.99449503 0.96516871
  0.89309519 0.96427296 0.94971422]
 [0.93024916 0.94418793 1.         0.84761276 0.94015519 0.91645149
  0.85005914 0.91539267 0.90510746]
 [0.88166837 0.89133853 0.84761276 1.         0.88776873 0.90436306
  0.99398531 0.90161233 0.89013045]
 [0.9891282  0.99449503 0.94015519 0.88776873 1.         0.96400904
  0.88969633 0.96280236 0.95064903]
 [0.95843405 0.96516871 0.91645149 0.90436306 0.96400904 1.
  0.90589049 0.99552997 0.96969133]
 [0.88353897 0.89309519 0.85005914 0.99398531 0.88969633 0.90589049
  1.         0.90336567 0.89157442]
 [0.95697244 0.96427296 0.91539267 0.90161233 0.96280236 0.99552997
  0.90336567 1.         0.96742115]
 [0.95171007 0.94971422 0.90510746 0.89013045 0.95064903 0.96969133
  0.89157442 0.96742115 1.        ]]


In [8]:
corr_df = pd.DataFrame(cosine_sim)
corr_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.982873,0.930249,0.881668,0.989128,0.958434,0.883539,0.956972,0.95171
1,0.982873,1.0,0.944188,0.891339,0.994495,0.965169,0.893095,0.964273,0.949714
2,0.930249,0.944188,1.0,0.847613,0.940155,0.916451,0.850059,0.915393,0.905107
3,0.881668,0.891339,0.847613,1.0,0.887769,0.904363,0.993985,0.901612,0.89013
4,0.989128,0.994495,0.940155,0.887769,1.0,0.964009,0.889696,0.962802,0.950649
5,0.958434,0.965169,0.916451,0.904363,0.964009,1.0,0.90589,0.99553,0.969691
6,0.883539,0.893095,0.850059,0.993985,0.889696,0.90589,1.0,0.903366,0.891574
7,0.956972,0.964273,0.915393,0.901612,0.962802,0.99553,0.903366,1.0,0.967421
8,0.95171,0.949714,0.905107,0.89013,0.950649,0.969691,0.891574,0.967421,1.0


In [9]:
import matplotlib.pyplot as plt

corr_df.style.background_gradient(cmap ='viridis')\
        .set_properties(**{'font-size': '8px'})

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.982873,0.930249,0.881668,0.989128,0.958434,0.883539,0.956972,0.95171
1,0.982873,1.0,0.944188,0.891339,0.994495,0.965169,0.893095,0.964273,0.949714
2,0.930249,0.944188,1.0,0.847613,0.940155,0.916451,0.850059,0.915393,0.905107
3,0.881668,0.891339,0.847613,1.0,0.887769,0.904363,0.993985,0.901612,0.89013
4,0.989128,0.994495,0.940155,0.887769,1.0,0.964009,0.889696,0.962802,0.950649
5,0.958434,0.965169,0.916451,0.904363,0.964009,1.0,0.90589,0.99553,0.969691
6,0.883539,0.893095,0.850059,0.993985,0.889696,0.90589,1.0,0.903366,0.891574
7,0.956972,0.964273,0.915393,0.901612,0.962802,0.99553,0.903366,1.0,0.967421
8,0.95171,0.949714,0.905107,0.89013,0.950649,0.969691,0.891574,0.967421,1.0


In [10]:
# Move from matrix into tidy data

pairs = corr_df.unstack().reset_index()
# pairs _df.columns = ['doc_A', 'doc_B', 'similarity_measure']
pairs_df = pd.DataFrame(pairs)
pairs_df.columns = ['Doc_A', 'Doc_B', 'Similarity_Score']
    
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
0,0,0,1.000000
1,0,1,0.982873
2,0,2,0.930249
3,0,3,0.881668
4,0,4,0.989128
...,...,...,...
76,8,4,0.950649
77,8,5,0.969691
78,8,6,0.891574
79,8,7,0.967421


In [11]:
# Clean out rows where maching self

pairs_df = pairs_df[pairs_df['Doc_A'] != pairs_df['Doc_B']]
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.982873
2,0,2,0.930249
3,0,3,0.881668
4,0,4,0.989128
5,0,5,0.958434
...,...,...,...
75,8,3,0.890130
76,8,4,0.950649
77,8,5,0.969691
78,8,6,0.891574


In [12]:
# Get unique pairs (we don't want each thing twice; no duplicates)

unique_pairs = pairs_df.loc[pd.DataFrame(np.sort(pairs_df[['Doc_A', 'Doc_B']], 1), index=pairs_df.index).drop_duplicates
(keep='first').index]

unique_pairs

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.982873
2,0,2,0.930249
3,0,3,0.881668
4,0,4,0.989128
5,0,5,0.958434
6,0,6,0.883539
7,0,7,0.956972
8,0,8,0.95171
11,1,2,0.944188
12,1,3,0.891339


In [13]:
# Get most and least similar documents

def get_top_docs(sim_df, metadata, num_docs=10, rank='top'):
    sorted = sim_df.sort_values('Similarity_Score', ascending=True)
    if rank == 'top':
        sliced = sorted.tail(num_docs)
    elif rank == 'bottom':
        sliced = sorted.head(num_docs)
    else:
        return "Please use 'top' or 'bottom' for rank variable"
    
    sliced_named = sliced.merge(metadata, how="left", left_on = "Doc_A", right_on="index").merge(metadata, how='left',
    left_on="Doc_B", right_on="index")
    sliced_named.columns = ['Doc_A', 'Doc_B', 'Similarity_Score', 'Index', 'Doc_A_ID', "IndexB", "Doc_B_ID"]
    
    print(sliced_named)
    
    top_docs_df = sliced_named[['Similarity_Score', 'Doc_A_ID', 'Doc_B_ID']]
    
    return top_docs_df

In [14]:
## Get least similar (bottom)

get_top_docs(unique_pairs, timothy_df[['index', 'doc_id']], rank='bottom')

   Doc_A  Doc_B  Similarity_Score  Index                Doc_A_ID  IndexB  \
0      2      3          0.847613      2  NASB-1971-1Timothy.txt       3   
1      2      6          0.850059      2  NASB-1971-1Timothy.txt       6   
2      0      3          0.881668      0  NASB-2020-1Timothy.txt       3   
3      0      6          0.883539      0  NASB-2020-1Timothy.txt       6   
4      3      4          0.887769      3   ERV-1881-1Timothy.txt       4   
5      4      6          0.889696      4  NASB-1995-1Timothy.txt       6   
6      3      8          0.890130      3   ERV-1881-1Timothy.txt       8   
7      1      3          0.891339      1  NASB-1977-1Timothy.txt       3   
8      6      8          0.891574      6   ASV-1900-1Timothy.txt       8   
9      1      6          0.893095      1  NASB-1977-1Timothy.txt       6   

                 Doc_B_ID  
0   ERV-1881-1Timothy.txt  
1   ASV-1900-1Timothy.txt  
2   ERV-1881-1Timothy.txt  
3   ASV-1900-1Timothy.txt  
4  NASB-1995-1Timothy.t

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.847613,NASB-1971-1Timothy.txt,ERV-1881-1Timothy.txt
1,0.850059,NASB-1971-1Timothy.txt,ASV-1900-1Timothy.txt
2,0.881668,NASB-2020-1Timothy.txt,ERV-1881-1Timothy.txt
3,0.883539,NASB-2020-1Timothy.txt,ASV-1900-1Timothy.txt
4,0.887769,ERV-1881-1Timothy.txt,NASB-1995-1Timothy.txt
5,0.889696,NASB-1995-1Timothy.txt,ASV-1900-1Timothy.txt
6,0.89013,ERV-1881-1Timothy.txt,NRSV-1989-1Timothy.txt
7,0.891339,NASB-1977-1Timothy.txt,ERV-1881-1Timothy.txt
8,0.891574,ASV-1900-1Timothy.txt,NRSV-1989-1Timothy.txt
9,0.893095,NASB-1977-1Timothy.txt,ASV-1900-1Timothy.txt


In [15]:
## Get most similar (top)

get_top_docs(unique_pairs, timothy_df[['index', 'doc_id']], rank='top')

   Doc_A  Doc_B  Similarity_Score  Index                Doc_A_ID  IndexB  \
0      4      5          0.964009      4  NASB-1995-1Timothy.txt       5   
1      1      7          0.964273      1  NASB-1977-1Timothy.txt       7   
2      1      5          0.965169      1  NASB-1977-1Timothy.txt       5   
3      7      8          0.967421      7   RSV-1946-1Timothy.txt       8   
4      5      8          0.969691      5   RSV-1971-1Timothy.txt       8   
5      0      1          0.982873      0  NASB-2020-1Timothy.txt       1   
6      0      4          0.989128      0  NASB-2020-1Timothy.txt       4   
7      3      6          0.993985      3   ERV-1881-1Timothy.txt       6   
8      1      4          0.994495      1  NASB-1977-1Timothy.txt       4   
9      5      7          0.995530      5   RSV-1971-1Timothy.txt       7   

                 Doc_B_ID  
0   RSV-1971-1Timothy.txt  
1   RSV-1946-1Timothy.txt  
2   RSV-1971-1Timothy.txt  
3  NRSV-1989-1Timothy.txt  
4  NRSV-1989-1Timothy.t

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.964009,NASB-1995-1Timothy.txt,RSV-1971-1Timothy.txt
1,0.964273,NASB-1977-1Timothy.txt,RSV-1946-1Timothy.txt
2,0.965169,NASB-1977-1Timothy.txt,RSV-1971-1Timothy.txt
3,0.967421,RSV-1946-1Timothy.txt,NRSV-1989-1Timothy.txt
4,0.969691,RSV-1971-1Timothy.txt,NRSV-1989-1Timothy.txt
5,0.982873,NASB-2020-1Timothy.txt,NASB-1977-1Timothy.txt
6,0.989128,NASB-2020-1Timothy.txt,NASB-1995-1Timothy.txt
7,0.993985,ERV-1881-1Timothy.txt,ASV-1900-1Timothy.txt
8,0.994495,NASB-1977-1Timothy.txt,NASB-1995-1Timothy.txt
9,0.99553,RSV-1971-1Timothy.txt,RSV-1946-1Timothy.txt


In [16]:
# Get most similar to a particular title

def get_similar_docs(title, sim_mx, metadata):

    idx = metadata.index[metadata['doc_id'] == title].tolist()
    # print(idx)

    # Get similarity scores
    sim_scores = list(enumerate(sim_mx[idx[0]]))

    # sort them
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 10 (ignorning the 1 for self-matching)

    sim_scores = sim_scores[1:11]
    
    scores = pd.DataFrame(sim_scores)
    scores.columns = ['index', 'similarity_score']
    # print(scores)

    title_index = [i[0] for i in sim_scores]
    matches = pd.DataFrame(metadata['doc_id'].iloc[title_index]).reset_index()
    # print(matches)
    matches = matches.merge(scores, how="left", on="index")

    return matches

In [17]:
get_similar_docs('ERV-1881-1Timothy.txt', corr_df, texts_df)

Unnamed: 0,index,doc_id,similarity_score
0,6,ASV-1900-1Timothy.txt,0.993985
1,5,RSV-1971-1Timothy.txt,0.904363
2,7,RSV-1946-1Timothy.txt,0.901612
3,1,NASB-1977-1Timothy.txt,0.891339
4,8,NRSV-1989-1Timothy.txt,0.89013
5,4,NASB-1995-1Timothy.txt,0.887769
6,0,NASB-2020-1Timothy.txt,0.881668
7,2,NASB-1971-1Timothy.txt,0.847613


In [18]:
#### Reset book to Ephesians ####

ephesians_df = texts_df[texts_df['book'] == "Ephesians"]
texts_df[texts_df['book'] == "Ephesians"]

Unnamed: 0,doc_id,book,text
9,ERV-1881-Ephesians.txt,Ephesians,the epistle of paul the apostle to the ephesia...
10,NASB-1995-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\n\nthe...
11,NASB-2020-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\n\nthe...
12,NASB-1971-Ephesians.txt,Ephesians,the epistle of paul to the ephesians\n\n\nsalu...
13,NRSV-1989-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\nspiri...
14,ASV-1900-Ephesians.txt,Ephesians,the epistle of paul to the ephesians\n\n\nsalu...
15,RSV-1946-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\n1 pau...
16,NASB-1977-Ephesians.txt,Ephesians,the epistle of paul to the ephesians\n\n1 paul...
17,RSV-1971-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\n4 pau...


In [19]:
ephesians_df.reset_index(inplace=True)
ephesians_df

Unnamed: 0,index,doc_id,book,text
0,9,ERV-1881-Ephesians.txt,Ephesians,the epistle of paul the apostle to the ephesia...
1,10,NASB-1995-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\n\nthe...
2,11,NASB-2020-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\n\nthe...
3,12,NASB-1971-Ephesians.txt,Ephesians,the epistle of paul to the ephesians\n\n\nsalu...
4,13,NRSV-1989-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\nspiri...
5,14,ASV-1900-Ephesians.txt,Ephesians,the epistle of paul to the ephesians\n\n\nsalu...
6,15,RSV-1946-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\n1 pau...
7,16,NASB-1977-Ephesians.txt,Ephesians,the epistle of paul to the ephesians\n\n1 paul...
8,17,RSV-1971-Ephesians.txt,Ephesians,the letter of paul to the ephesians\n\n\n4 pau...


In [20]:
# Create Tf-IDF Vector Representaiton

vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer.fit_transform(ephesians_df['text'])

print (tfidf_matrix.shape)

(9, 1238)


In [21]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.9222168  0.91963391 0.86782705 0.91049586 0.99756462
  0.9236831  0.92401696 0.92249291]
 [0.9222168  1.         0.99553071 0.93143752 0.98224001 0.92117801
  0.98728721 0.99708096 0.98702505]
 [0.91963391 0.99553071 1.         0.92792659 0.98185584 0.91872221
  0.98438667 0.9927649  0.98427914]
 [0.86782705 0.93143752 0.92792659 1.         0.91893427 0.86706852
  0.92430836 0.93401867 0.92423088]
 [0.91049586 0.98224001 0.98185584 0.91893427 1.         0.91002938
  0.98557368 0.98213411 0.98572728]
 [0.99756462 0.92117801 0.91872221 0.86706852 0.91002938 1.
  0.92328946 0.92305153 0.92211762]
 [0.9236831  0.98728721 0.98438667 0.92430836 0.98557368 0.92328946
  1.         0.9885749  0.9988322 ]
 [0.92401696 0.99708096 0.9927649  0.93401867 0.98213411 0.92305153
  0.9885749  1.         0.98868762]
 [0.92249291 0.98702505 0.98427914 0.92423088 0.98572728 0.92211762
  0.9988322  0.98868762 1.        ]]


In [22]:
corr_df = pd.DataFrame(cosine_sim)
corr_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.922217,0.919634,0.867827,0.910496,0.997565,0.923683,0.924017,0.922493
1,0.922217,1.0,0.995531,0.931438,0.98224,0.921178,0.987287,0.997081,0.987025
2,0.919634,0.995531,1.0,0.927927,0.981856,0.918722,0.984387,0.992765,0.984279
3,0.867827,0.931438,0.927927,1.0,0.918934,0.867069,0.924308,0.934019,0.924231
4,0.910496,0.98224,0.981856,0.918934,1.0,0.910029,0.985574,0.982134,0.985727
5,0.997565,0.921178,0.918722,0.867069,0.910029,1.0,0.923289,0.923052,0.922118
6,0.923683,0.987287,0.984387,0.924308,0.985574,0.923289,1.0,0.988575,0.998832
7,0.924017,0.997081,0.992765,0.934019,0.982134,0.923052,0.988575,1.0,0.988688
8,0.922493,0.987025,0.984279,0.924231,0.985727,0.922118,0.998832,0.988688,1.0


In [23]:
import matplotlib.pyplot as plt

corr_df.style.background_gradient(cmap ='viridis')\
        .set_properties(**{'font-size': '8px'})

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.922217,0.919634,0.867827,0.910496,0.997565,0.923683,0.924017,0.922493
1,0.922217,1.0,0.995531,0.931438,0.98224,0.921178,0.987287,0.997081,0.987025
2,0.919634,0.995531,1.0,0.927927,0.981856,0.918722,0.984387,0.992765,0.984279
3,0.867827,0.931438,0.927927,1.0,0.918934,0.867069,0.924308,0.934019,0.924231
4,0.910496,0.98224,0.981856,0.918934,1.0,0.910029,0.985574,0.982134,0.985727
5,0.997565,0.921178,0.918722,0.867069,0.910029,1.0,0.923289,0.923052,0.922118
6,0.923683,0.987287,0.984387,0.924308,0.985574,0.923289,1.0,0.988575,0.998832
7,0.924017,0.997081,0.992765,0.934019,0.982134,0.923052,0.988575,1.0,0.988688
8,0.922493,0.987025,0.984279,0.924231,0.985727,0.922118,0.998832,0.988688,1.0


In [24]:
# Move from matrix into tidy data

pairs = corr_df.unstack().reset_index()
# pairs _df.columns = ['doc_A', 'doc_B', 'similarity_measure']
pairs_df = pd.DataFrame(pairs)
pairs_df.columns = ['Doc_A', 'Doc_B', 'Similarity_Score']
    
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
0,0,0,1.000000
1,0,1,0.922217
2,0,2,0.919634
3,0,3,0.867827
4,0,4,0.910496
...,...,...,...
76,8,4,0.985727
77,8,5,0.922118
78,8,6,0.998832
79,8,7,0.988688


In [25]:
# Clean out rows where maching self

pairs_df = pairs_df[pairs_df['Doc_A'] != pairs_df['Doc_B']]
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.922217
2,0,2,0.919634
3,0,3,0.867827
4,0,4,0.910496
5,0,5,0.997565
...,...,...,...
75,8,3,0.924231
76,8,4,0.985727
77,8,5,0.922118
78,8,6,0.998832


In [26]:
# Get unique pairs (we don't want each thing twice; no duplicates)

unique_pairs = pairs_df.loc[pd.DataFrame(np.sort(pairs_df[['Doc_A', 'Doc_B']], 1), index=pairs_df.index).drop_duplicates
(keep='first').index]

unique_pairs

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.922217
2,0,2,0.919634
3,0,3,0.867827
4,0,4,0.910496
5,0,5,0.997565
6,0,6,0.923683
7,0,7,0.924017
8,0,8,0.922493
11,1,2,0.995531
12,1,3,0.931438


In [27]:
# Get most and least similar documents

def get_top_docs(sim_df, metadata, num_docs=10, rank='top'):
    sorted = sim_df.sort_values('Similarity_Score', ascending=True)
    if rank == 'top':
        sliced = sorted.tail(num_docs)
    elif rank == 'bottom':
        sliced = sorted.head(num_docs)
    else:
        return "Please use 'top' or 'bottom' for rank variable"
    
    sliced_named = sliced.merge(metadata, how="left", left_on = "Doc_A", right_on="index").merge(metadata, how='left',
    left_on="Doc_B", right_on="index")
    sliced_named.columns = ['Doc_A', 'Doc_B', 'Similarity_Score', 'Index', 'Doc_A_ID', "IndexB", "Doc_B_ID"]
    
    print(sliced_named)
    
    top_docs_df = sliced_named[['Similarity_Score', 'Doc_A_ID', 'Doc_B_ID']]
    
    return top_docs_df

In [28]:
## Get least similar (bottom)

get_top_docs(unique_pairs, ephesians_df[['index', 'doc_id']], rank='bottom')

   Doc_A  Doc_B  Similarity_Score  Index Doc_A_ID  IndexB Doc_B_ID
0      3      5          0.867069    NaN      NaN     NaN      NaN
1      0      3          0.867827    NaN      NaN     NaN      NaN
2      4      5          0.910029    NaN      NaN     NaN      NaN
3      0      4          0.910496    NaN      NaN     NaN      NaN
4      2      5          0.918722    NaN      NaN     NaN      NaN
5      3      4          0.918934    NaN      NaN     NaN      NaN
6      0      2          0.919634    NaN      NaN     NaN      NaN
7      1      5          0.921178    NaN      NaN     NaN      NaN
8      5      8          0.922118    NaN      NaN     NaN      NaN
9      0      1          0.922217    NaN      NaN     NaN      NaN


Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.867069,,
1,0.867827,,
2,0.910029,,
3,0.910496,,
4,0.918722,,
5,0.918934,,
6,0.919634,,
7,0.921178,,
8,0.922118,,
9,0.922217,,


In [29]:
# Get most similar to a particular title

def get_similar_docs(title, sim_mx, metadata):

    idx = metadata.index[metadata['doc_id'] == title].tolist()
    # print(idx)

    # Get similarity scores
    sim_scores = list(enumerate(sim_mx[idx[0]]))

    # sort them
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 10 (ignorning the 1 for self-matching)

    sim_scores = sim_scores[1:11]
    
    scores = pd.DataFrame(sim_scores)
    scores.columns = ['index', 'similarity_score']
    # print(scores)

    title_index = [i[0] for i in sim_scores]
    matches = pd.DataFrame(metadata['doc_id'].iloc[title_index]).reset_index()
    # print(matches)
    matches = matches.merge(scores, how="left", on="index")

    return matches

In [30]:
get_similar_docs('ERV-1881-Ephesians.txt', corr_df, texts_df)

KeyError: 9