In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load texts into a dataframe

source_dir = "../data/analysis_data/bibles-txt-sw/"

texts = []

for filename in os.listdir(source_dir):
    if filename.startswith('.'):
        pass
    else:
        with open(os.path.join(source_dir, filename), 'r') as obit:
            content = obit.read()
    texts.append(
        {
            "doc_id": filename,
            "text": content
        }
    )
    
texts_df = pd.DataFrame(texts)

texts_df.head()

Unnamed: 0,doc_id,text
0,RSV-1952-Genesis.txt,﻿THE FIRST BOOK OF MOSES\nCOMMONLY CALLED\nGEN...
1,NRSV-1989-1Corinthians.txt,﻿THE FIRST LETTER OF PAUL To THE\nC O R I N T ...
2,.DS_Store,﻿THE FIRST LETTER OF PAUL To THE\nC O R I N T ...
3,NASB-2020-1Timothy.txt,THE FIRST LETTER OF PAUL TO TIMOTHY\n\n\n\nCOR...
4,NRSV-1989-Genesis.txt,﻿GENESIS\n1 In the beginning when God created“...


In [3]:
texts_df.reset_index(inplace=True)
texts_df

Unnamed: 0,index,doc_id,text
0,0,RSV-1952-Genesis.txt,﻿THE FIRST BOOK OF MOSES\nCOMMONLY CALLED\nGEN...
1,1,NRSV-1989-1Corinthians.txt,﻿THE FIRST LETTER OF PAUL To THE\nC O R I N T ...
2,2,.DS_Store,﻿THE FIRST LETTER OF PAUL To THE\nC O R I N T ...
3,3,NASB-2020-1Timothy.txt,THE FIRST LETTER OF PAUL TO TIMOTHY\n\n\n\nCOR...
4,4,NRSV-1989-Genesis.txt,﻿GENESIS\n1 In the beginning when God created“...
5,5,NASB-1995-Romans.txt,﻿THE LETTER OF PAUL TO THE ROMANS\n1‘nt'r;t»p¢...
6,6,NASB-1971-Romans.txt,﻿‘[12 110510; qsuqg snssf q3n01q4 pof) .{u1 >[...
7,7,RSV-1946-Romans.txt,"﻿THE\nLETTER OF PAUL TO THE ROMANS\n1 Paul, a ..."
8,8,NASB-1977-1Timothy.txt,THE FIRST EPISTLE OF PAUL TO TIMOTHY\n\n\n1 PA...
9,9,NASB-1995-Revelation.txt,﻿189 REVELATION 2\n18 that they were saying to...


In [4]:
# Create Tf-IDF Vector Representaiton

vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer.fit_transform(texts_df['text'])

print (tfidf_matrix.shape)

(54, 58227)


In [5]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.668815   0.668815   ... 0.6060212  0.65341713 0.6936091 ]
 [0.668815   1.         1.         ... 0.81627726 0.62403131 0.57314607]
 [0.668815   1.         1.         ... 0.81627726 0.62403131 0.57314607]
 ...
 [0.6060212  0.81627726 0.81627726 ... 1.         0.62463931 0.49361134]
 [0.65341713 0.62403131 0.62403131 ... 0.62463931 1.         0.5271941 ]
 [0.6936091  0.57314607 0.57314607 ... 0.49361134 0.5271941  1.        ]]


In [6]:
corr_df = pd.DataFrame(cosine_sim)
corr_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
0,1.0,0.668815,0.668815,0.598568,0.767573,0.568877,0.571855,0.683253,0.612258,0.817383,...,0.609963,0.705529,0.753694,0.662066,0.556944,0.638253,0.760574,0.606021,0.653417,0.693609
1,0.668815,1.0,1.0,0.798561,0.7586,0.553337,0.619406,0.725502,0.79054,0.71433,...,0.797909,0.635536,0.639284,0.779728,0.609405,0.840114,0.563984,0.816277,0.624031,0.573146
2,0.668815,1.0,1.0,0.798561,0.7586,0.553337,0.619406,0.725502,0.79054,0.71433,...,0.797909,0.635536,0.639284,0.779728,0.609405,0.840114,0.563984,0.816277,0.624031,0.573146
3,0.598568,0.798561,0.798561,1.0,0.760812,0.453195,0.548732,0.634306,0.975853,0.714143,...,0.929797,0.58305,0.611155,0.665991,0.589503,0.849994,0.464003,0.932438,0.616726,0.494896
4,0.767573,0.7586,0.7586,0.760812,1.0,0.450557,0.547883,0.601046,0.774994,0.820584,...,0.77712,0.670187,0.70052,0.631105,0.657096,0.824873,0.551957,0.776126,0.75612,0.564675
5,0.568877,0.553337,0.553337,0.453195,0.450557,1.0,0.695079,0.681789,0.451414,0.578552,...,0.453771,0.527482,0.476126,0.73931,0.480141,0.466376,0.691253,0.45297,0.599329,0.479374
6,0.571855,0.619406,0.619406,0.548732,0.547883,0.695079,1.0,0.749445,0.547721,0.59665,...,0.551453,0.55541,0.506446,0.706618,0.480227,0.572003,0.587024,0.550478,0.576743,0.490477
7,0.683253,0.725502,0.725502,0.634306,0.601046,0.681789,0.749445,1.0,0.631004,0.647847,...,0.638287,0.584313,0.574381,0.74757,0.522893,0.66001,0.639214,0.638978,0.590961,0.59887
8,0.612258,0.79054,0.79054,0.975853,0.774994,0.451414,0.547721,0.631004,1.0,0.730998,...,0.936539,0.58827,0.620631,0.660751,0.593235,0.857731,0.469771,0.930255,0.629258,0.497852
9,0.817383,0.71433,0.71433,0.714143,0.820584,0.578552,0.59665,0.647847,0.730998,1.0,...,0.726912,0.713203,0.744196,0.679476,0.610796,0.748311,0.74361,0.720477,0.724091,0.601789


In [7]:
import matplotlib.pyplot as plt

corr_df.style.background_gradient(cmap ='viridis')\
        .set_properties(**{'font-size': '8px'})

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53
0,1.0,0.668815,0.668815,0.598568,0.767573,0.568877,0.571855,0.683253,0.612258,0.817383,0.676296,0.565834,0.764392,0.505765,0.635207,0.632479,0.714261,0.671825,0.583515,0.576583,0.611285,0.784925,0.592454,0.628623,0.843099,0.765433,0.769536,0.567601,0.640202,0.502851,0.647046,0.641074,0.861718,0.547126,0.752559,0.636582,0.561832,0.757056,0.856008,0.606764,0.740952,0.614349,0.604287,0.562325,0.609963,0.705529,0.753694,0.662066,0.556944,0.638253,0.760574,0.606021,0.653417,0.693609
1,0.668815,1.0,1.0,0.798561,0.7586,0.553337,0.619406,0.725502,0.79054,0.71433,0.703249,0.729554,0.645501,0.659184,0.836601,0.836072,0.556132,0.75016,0.677435,0.766117,0.803277,0.663987,0.770574,0.844757,0.624998,0.589314,0.603709,0.728799,0.838559,0.710857,0.87795,0.656278,0.603879,0.576715,0.70763,0.835012,0.731007,0.550319,0.578701,0.795469,0.535775,0.804133,0.918456,0.730778,0.797909,0.635536,0.639284,0.779728,0.609405,0.840114,0.563984,0.816277,0.624031,0.573146
2,0.668815,1.0,1.0,0.798561,0.7586,0.553337,0.619406,0.725502,0.79054,0.71433,0.703249,0.729554,0.645501,0.659184,0.836601,0.836072,0.556132,0.75016,0.677435,0.766117,0.803277,0.663987,0.770574,0.844757,0.624998,0.589314,0.603709,0.728799,0.838559,0.710857,0.87795,0.656278,0.603879,0.576715,0.70763,0.835012,0.731007,0.550319,0.578701,0.795469,0.535775,0.804133,0.918456,0.730778,0.797909,0.635536,0.639284,0.779728,0.609405,0.840114,0.563984,0.816277,0.624031,0.573146
3,0.598568,0.798561,0.798561,1.0,0.760812,0.453195,0.548732,0.634306,0.975853,0.714143,0.729161,0.740578,0.577614,0.601044,0.848331,0.846894,0.476189,0.783876,0.677252,0.681125,0.814479,0.658215,0.945818,0.845346,0.559843,0.529976,0.526797,0.741411,0.850494,0.621539,0.822668,0.648173,0.521674,0.583612,0.756093,0.847373,0.841443,0.475068,0.487997,0.982847,0.452944,0.932622,0.848457,0.843651,0.929797,0.58305,0.611155,0.665991,0.589503,0.849994,0.464003,0.932438,0.616726,0.494896
4,0.767573,0.7586,0.7586,0.760812,1.0,0.450557,0.547883,0.601046,0.774994,0.820584,0.812597,0.738379,0.703026,0.571524,0.823907,0.821451,0.577362,0.877984,0.743002,0.644477,0.790905,0.753206,0.747592,0.814328,0.661608,0.626262,0.633732,0.740492,0.82721,0.578043,0.763139,0.724644,0.673207,0.653021,0.921271,0.825172,0.72202,0.570375,0.595608,0.769697,0.5473,0.783514,0.782096,0.723028,0.77712,0.670187,0.70052,0.631105,0.657096,0.824873,0.551957,0.776126,0.75612,0.564675
5,0.568877,0.553337,0.553337,0.453195,0.450557,1.0,0.695079,0.681789,0.451414,0.578552,0.527229,0.406239,0.568587,0.642099,0.465438,0.464239,0.562976,0.417837,0.361413,0.633844,0.451135,0.569674,0.445376,0.465372,0.58069,0.480108,0.660024,0.407492,0.466401,0.597095,0.55122,0.551642,0.641272,0.481677,0.446333,0.463195,0.415279,0.557593,0.588009,0.453899,0.533537,0.455993,0.481317,0.413938,0.453771,0.527482,0.476126,0.73931,0.480141,0.466376,0.691253,0.45297,0.599329,0.479374
6,0.571855,0.619406,0.619406,0.548732,0.547883,0.695079,1.0,0.749445,0.547721,0.59665,0.566807,0.503217,0.640982,0.662841,0.569868,0.568218,0.481701,0.511492,0.443227,0.79743,0.558361,0.611454,0.543008,0.567042,0.582957,0.4737,0.715227,0.504796,0.57185,0.619202,0.636023,0.564475,0.573907,0.493308,0.549519,0.569308,0.5128,0.529547,0.542181,0.549416,0.479435,0.555105,0.581667,0.511637,0.551453,0.55541,0.506446,0.706618,0.480227,0.572003,0.587024,0.550478,0.576743,0.490477
7,0.683253,0.725502,0.725502,0.634306,0.601046,0.681789,0.749445,1.0,0.631004,0.647847,0.607694,0.571215,0.627993,0.658312,0.655532,0.65445,0.567597,0.574142,0.510551,0.758433,0.635187,0.647137,0.620851,0.660552,0.701414,0.570785,0.667114,0.572394,0.659567,0.626173,0.743303,0.600781,0.642125,0.523447,0.586626,0.654172,0.580028,0.582635,0.65367,0.635229,0.557036,0.641687,0.682165,0.58042,0.638287,0.584313,0.574381,0.74757,0.522893,0.66001,0.639214,0.638978,0.590961,0.59887
8,0.612258,0.79054,0.79054,0.975853,0.774994,0.451414,0.547721,0.631004,1.0,0.730998,0.741661,0.749539,0.588765,0.59365,0.8537,0.848317,0.481872,0.796234,0.679294,0.680713,0.823845,0.676879,0.965904,0.849224,0.573803,0.537654,0.535739,0.750797,0.858545,0.613665,0.814522,0.656967,0.532856,0.589854,0.776045,0.857154,0.852226,0.482407,0.499953,0.99338,0.457838,0.938779,0.843952,0.854316,0.936539,0.58827,0.620631,0.660751,0.593235,0.857731,0.469771,0.930255,0.629258,0.497852
9,0.817383,0.71433,0.71433,0.714143,0.820584,0.578552,0.59665,0.647847,0.730998,1.0,0.873511,0.672606,0.761917,0.571315,0.748223,0.744667,0.688159,0.870972,0.636902,0.632529,0.72154,0.888214,0.707876,0.736611,0.83457,0.727896,0.755401,0.675186,0.751633,0.571326,0.708616,0.75731,0.798621,0.652037,0.821952,0.74975,0.66522,0.712052,0.769316,0.725508,0.699763,0.731436,0.690316,0.666473,0.726912,0.713203,0.744196,0.679476,0.610796,0.748311,0.74361,0.720477,0.724091,0.601789


In [8]:
# Move from matrix into tidy data

pairs = corr_df.unstack().reset_index()
# pairs _df.columns = ['doc_A', 'doc_B', 'similarity_measure']
pairs_df = pd.DataFrame(pairs)
pairs_df.columns = ['Doc_A', 'Doc_B', 'Similarity_Score']
    
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
0,0,0,1.000000
1,0,1,0.668815
2,0,2,0.668815
3,0,3,0.598568
4,0,4,0.767573
...,...,...,...
2911,53,49,0.514428
2912,53,50,0.588304
2913,53,51,0.493611
2914,53,52,0.527194


In [9]:
# Clean out rows where maching self

pairs_df = pairs_df[pairs_df['Doc_A'] != pairs_df['Doc_B']]
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.668815
2,0,2,0.668815
3,0,3,0.598568
4,0,4,0.767573
5,0,5,0.568877
...,...,...,...
2910,53,48,0.585544
2911,53,49,0.514428
2912,53,50,0.588304
2913,53,51,0.493611


In [10]:
# Get unique pairs (we don't want each thing twice; no duplicates)

unique_pairs = pairs_df.loc[pd.DataFrame(np.sort(pairs_df[['Doc_A', 'Doc_B']], 1), index=pairs_df.index).drop_duplicates
(keep='first').index]

unique_pairs

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.668815
2,0,2,0.668815
3,0,3,0.598568
4,0,4,0.767573
5,0,5,0.568877
...,...,...,...
2752,50,52,0.602491
2753,50,53,0.588304
2806,51,52,0.624639
2807,51,53,0.493611


In [11]:
# Get most and least similar documents

def get_top_docs(sim_df, metadata, num_docs=10, rank='top'):
    sorted = sim_df.sort_values('Similarity_Score', ascending=True)
    if rank == 'top':
        sliced = sorted.tail(num_docs)
    elif rank == 'bottom':
        sliced = sorted.head(num_docs)
    else:
        return "Please use 'top' or 'bottom' for rank variable"
    
    sliced_named = sliced.merge(metadata, how="left", left_on = "Doc_A", right_on="index").merge(metadata, how='left',
    left_on="Doc_B", right_on="index")
    sliced_named.columns = ['Doc_A', 'Doc_B', 'Similarity_Score', 'Index', 'Doc_A_ID', "IndexB", "Doc_B_ID"]
    
    print(sliced_named)
    
    top_docs_df = sliced_named[['Similarity_Score', 'Doc_A_ID', 'Doc_B_ID']]
    
    return top_docs_df

In [12]:
## Get least similar (bottom)

get_top_docs(unique_pairs, texts_df[['index', 'doc_id']], rank='bottom')

   Doc_A  Doc_B  Similarity_Score  Index                    Doc_A_ID  IndexB  \
0      5     18          0.361413      5        NASB-1995-Romans.txt      18   
1     29     40          0.398739     29  NASB-2020-1Corinthians.txt      40   
2      5     11          0.406239      5        NASB-1995-Romans.txt      11   
3      5     27          0.407492      5        NASB-1995-Romans.txt      27   
4     25     29          0.408788     25         RSV-1952-Esther.txt      29   
5      5     43          0.413938      5        NASB-1995-Romans.txt      43   
6     29     37          0.414563     29  NASB-2020-1Corinthians.txt      37   
7      5     36          0.415279      5        NASB-1995-Romans.txt      36   
8     13     40          0.415930     13        NASB-2020-Romans.txt      40   
9     13     25          0.417099     13        NASB-2020-Romans.txt      25   

                     Doc_B_ID  
0          NRSV-1989-Ruth.txt  
1        NASB-1995-Esther.txt  
2      ERV-1881-Ephesia

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.361413,NASB-1995-Romans.txt,NRSV-1989-Ruth.txt
1,0.398739,NASB-2020-1Corinthians.txt,NASB-1995-Esther.txt
2,0.406239,NASB-1995-Romans.txt,ERV-1881-Ephesians.txt
3,0.407492,NASB-1995-Romans.txt,ASV-1900-Ephesians.txt
4,0.408788,RSV-1952-Esther.txt,NASB-2020-1Corinthians.txt
5,0.413938,NASB-1995-Romans.txt,ASV-1900-1Timothy.txt
6,0.414563,NASB-2020-1Corinthians.txt,NASB-1971-Esther.txt
7,0.415279,NASB-1995-Romans.txt,ERV-1881-1Timothy.txt
8,0.41593,NASB-2020-Romans.txt,NASB-1995-Esther.txt
9,0.417099,NASB-2020-Romans.txt,RSV-1952-Esther.txt


In [13]:
## Get most similar (top)

get_top_docs(unique_pairs, texts_df[['index', 'doc_id']], rank='top')

   Doc_A  Doc_B  Similarity_Score  Index                    Doc_A_ID  IndexB  \
0      3     39          0.982847      3      NASB-2020-1Timothy.txt      39   
1     36     43          0.988327     36       ERV-1881-1Timothy.txt      43   
2     15     35          0.989762     15     NASB-2020-Ephesians.txt      35   
3     41     44          0.993023     41       RSV-1971-1Timothy.txt      44   
4     14     15          0.993164     14     NASB-1995-Ephesians.txt      15   
5      8     39          0.993380      8      NASB-1977-1Timothy.txt      39   
6     11     27          0.995983     11      ERV-1881-Ephesians.txt      27   
7     14     35          0.996179     14     NASB-1995-Ephesians.txt      35   
8     28     49          0.997524     28      RSV-1946-Ephesians.txt      49   
9      1      2          1.000000      1  NRSV-1989-1Corinthians.txt       2   

                  Doc_B_ID  
0   NASB-1995-1Timothy.txt  
1    ASV-1900-1Timothy.txt  
2  NASB-1977-Ephesians.txt  
3  

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,0.982847,NASB-2020-1Timothy.txt,NASB-1995-1Timothy.txt
1,0.988327,ERV-1881-1Timothy.txt,ASV-1900-1Timothy.txt
2,0.989762,NASB-2020-Ephesians.txt,NASB-1977-Ephesians.txt
3,0.993023,RSV-1971-1Timothy.txt,RSV-1946-1Timothy.txt
4,0.993164,NASB-1995-Ephesians.txt,NASB-2020-Ephesians.txt
5,0.99338,NASB-1977-1Timothy.txt,NASB-1995-1Timothy.txt
6,0.995983,ERV-1881-Ephesians.txt,ASV-1900-Ephesians.txt
7,0.996179,NASB-1995-Ephesians.txt,NASB-1977-Ephesians.txt
8,0.997524,RSV-1946-Ephesians.txt,RSV-1971-Ephesians.txt
9,1.0,NRSV-1989-1Corinthians.txt,.DS_Store


In [14]:
# Get most similar to a particular title

def get_similar_docs(title, sim_mx, metadata):

    idx = metadata.index[metadata['doc_id'] == title].tolist()
    # print(idx)

    # Get similarity scores
    sim_scores = list(enumerate(sim_mx[idx[0]]))

    # sort them
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 10 (ignorning the 1 for self-matching)

    sim_scores = sim_scores[1:11]
    
    scores = pd.DataFrame(sim_scores)
    scores.columns = ['index', 'similarity_score']
    # print(scores)

    title_index = [i[0] for i in sim_scores]
    matches = pd.DataFrame(metadata['doc_id'].iloc[title_index]).reset_index()
    # print(matches)
    matches = matches.merge(scores, how="left", on="index")

    return matches

In [15]:
get_similar_docs('ERV-1881-1Timothy.txt', corr_df, texts_df)

Unnamed: 0,index,doc_id,similarity_score
0,43,ASV-1900-1Timothy.txt,0.988327
1,41,RSV-1971-1Timothy.txt,0.865637
2,44,RSV-1946-1Timothy.txt,0.861998
3,51,NRSV-1989-1Timothy.txt,0.853565
4,8,NASB-1977-1Timothy.txt,0.852226
5,39,NASB-1995-1Timothy.txt,0.849309
6,3,NASB-2020-1Timothy.txt,0.841443
7,22,NASB-1971-1Timothy.txt,0.8308
8,28,RSV-1946-Ephesians.txt,0.814557
9,49,RSV-1971-Ephesians.txt,0.812635
