In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
source_dir = "../Data/Periodical-text-files/"

texts = []

for filename in os.listdir(source_dir):
    with open(os.path.join(source_dir, filename), 'r') as periodical:
        content = periodical.read()
    texts.append(
        {
            "doc_id": filename,
            "text": content
        }
    )

texts_df = pd.DataFrame(texts)

texts_df.head()

Unnamed: 0,doc_id,text
0,AmSn18940802-V09-31-page-5.txt,"AUGUST 2, 1894. AIWIlEati CAN S1INTriTAINIHA1...."
1,AmSn18990824-V14-33-page-15.txt,AMERICAN SENTINEL. 527 Two Important Sabbath. ...
2,SOL19001101-V15-43-page-6.txt,678 THE SENTINEL OF LIBERTY. THE EVIL OF MISUS...
3,LibM19091001-V04-04-page-26.txt,"LIBERTY express it."" Now, with a nation, as a ..."
4,SOL19030205-V18-06-page-2.txt,"Ł:"":ŁŁ:"":"":"":"":-:"":"":"":ŁŁ:ŁŁ:ﬂ:ŁŁ:ŁŁ:-:"":-:-:Ł..."


In [3]:
texts_df.reset_index(inplace=True)
texts_df

Unnamed: 0,index,doc_id,text
0,0,AmSn18940802-V09-31-page-5.txt,"AUGUST 2, 1894. AIWIlEati CAN S1INTriTAINIHA1...."
1,1,AmSn18990824-V14-33-page-15.txt,AMERICAN SENTINEL. 527 Two Important Sabbath. ...
2,2,SOL19001101-V15-43-page-6.txt,678 THE SENTINEL OF LIBERTY. THE EVIL OF MISUS...
3,3,LibM19091001-V04-04-page-26.txt,"LIBERTY express it."" Now, with a nation, as a ..."
4,4,SOL19030205-V18-06-page-2.txt,"Ł:"":ŁŁ:"":"":"":"":-:"":"":"":ŁŁ:ŁŁ:ﬂ:ŁŁ:ŁŁ:-:"":-:-:Ł..."
...,...,...,...
11467,11467,LibM19060401-V01-01-page-7.txt,LIBERTY 5 for mutual counsel and harmonious a...
11468,11468,AmSn18980922-V13-37-page-11.txt,AMERICAN SENTIN EL. 591 the universe that coul...
11469,11469,AmSn19000830-V15-34-page-12.txt,"540 'THE SENTINEL OF LIE3ERTY, News, Notes . ...."
11470,11470,AmSn19000802-V15-30-page-6.txt,470 THE SENTINEL OF LIBERTY. purpose of the ma...


In [4]:
# Create Tf-IDF Vector Representation

vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer.fit_transform(texts_df['text'])

print(tfidf_matrix.shape)

(11472, 120743)


In [5]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.43343484 0.6010081  ... 0.57209737 0.57159926 0.50459709]
 [0.43343484 1.         0.37856121 ... 0.34124843 0.36021856 0.32261786]
 [0.6010081  0.37856121 1.         ... 0.50028508 0.50469716 0.44110747]
 ...
 [0.57209737 0.34124843 0.50028508 ... 1.         0.48533293 0.41154737]
 [0.57159926 0.36021856 0.50469716 ... 0.48533293 1.         0.46664933]
 [0.50459709 0.32261786 0.44110747 ... 0.41154737 0.46664933 1.        ]]


In [9]:
corr_df = pd.DataFrame(cosine_sim)
corr_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11462,11463,11464,11465,11466,11467,11468,11469,11470,11471
0,1.000000,0.433435,0.601008,0.654178,0.221686,0.607493,0.604011,0.647261,0.525173,0.581390,...,0.549765,0.486499,0.488906,0.683733,0.585686,0.633031,0.481638,0.572097,0.571599,0.504597
1,0.433435,1.000000,0.378561,0.376239,0.202671,0.388398,0.376193,0.390456,0.329926,0.367629,...,0.340245,0.279019,0.348325,0.410714,0.358484,0.386198,0.313028,0.341248,0.360219,0.322618
2,0.601008,0.378561,1.000000,0.581928,0.190424,0.549927,0.523190,0.555440,0.470097,0.558356,...,0.481530,0.436338,0.467495,0.600137,0.525884,0.586207,0.471084,0.500285,0.504697,0.441107
3,0.654178,0.376239,0.581928,1.000000,0.204099,0.562280,0.575277,0.567028,0.496355,0.556394,...,0.513287,0.448605,0.438739,0.627091,0.578330,0.598010,0.432980,0.528422,0.532595,0.442638
4,0.221686,0.202671,0.190424,0.204099,1.000000,0.205227,0.196261,0.201753,0.174475,0.184492,...,0.174107,0.155890,0.189496,0.219975,0.193262,0.205079,0.160804,0.186664,0.183701,0.158032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11467,0.633031,0.386198,0.586207,0.598010,0.205079,0.565486,0.565727,0.581102,0.496070,0.543955,...,0.508324,0.463973,0.477806,0.633479,0.560114,1.000000,0.473010,0.522866,0.531156,0.468854
11468,0.481638,0.313028,0.471084,0.432980,0.160804,0.444261,0.430373,0.429693,0.384146,0.433696,...,0.401864,0.338986,0.378143,0.492237,0.447541,0.473010,1.000000,0.400101,0.419717,0.381602
11469,0.572097,0.341248,0.500285,0.528422,0.186664,0.509728,0.513235,0.506929,0.446294,0.456627,...,0.473387,0.408936,0.394851,0.562854,0.484864,0.522866,0.400101,1.000000,0.485333,0.411547
11470,0.571599,0.360219,0.504697,0.532595,0.183701,0.525809,0.556223,0.504016,0.515169,0.491414,...,0.460702,0.476854,0.411840,0.608983,0.493942,0.531156,0.419717,0.485333,1.000000,0.466649


In [11]:
#The collection is too big to run this step so I commented it out

# import seaborn as sns
# import matplotlib.pyplot as plt

#corr_df.style.background_gradient(cmap ='viridis')\
         #.set_properties(**{'font-size': '8px'})

In [10]:
pairs = corr_df.unstack().reset_index()
# pairs_df.columns = ['doc_A', 'doc_B', 'similarity_measure']
pairs_df = pd.DataFrame(pairs)
pairs_df.columns = ['Doc_A', 'Doc_B', 'Similarity_Score']

pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
0,0,0,1.000000
1,0,1,0.433435
2,0,2,0.601008
3,0,3,0.654178
4,0,4,0.221686
...,...,...,...
131606779,11471,11467,0.468854
131606780,11471,11468,0.381602
131606781,11471,11469,0.411547
131606782,11471,11470,0.466649


In [12]:
pairs_df = pairs_df[pairs_df['Doc_A'] != pairs_df['Doc_B']]
pairs_df

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.433435
2,0,2,0.601008
3,0,3,0.654178
4,0,4,0.221686
5,0,5,0.607493
...,...,...,...
131606778,11471,11466,0.421182
131606779,11471,11467,0.468854
131606780,11471,11468,0.381602
131606781,11471,11469,0.411547


In [13]:
unique_pairs = pairs_df.loc[pd.DataFrame(np.sort(pairs_df[['Doc_A', 'Doc_B']], 1), index=pairs_df.index).drop_duplicates(keep='first').index]

unique_pairs

Unnamed: 0,Doc_A,Doc_B,Similarity_Score
1,0,1,0.433435
2,0,2,0.601008
3,0,3,0.654178
4,0,4,0.221686
5,0,5,0.607493
...,...,...,...
131572366,11468,11470,0.419717
131572367,11468,11471,0.381602
131583838,11469,11470,0.485333
131583839,11469,11471,0.411547


In [15]:
def get_top_docs(sim_df, metadata, num_docs=10, rank='top'):
    sorted = sim_df.sort_values('Similarity_Score', ascending=True)
    if rank == 'top':
        sliced = sorted.tail(num_docs)
    elif rank == 'bottom':
        sliced = sorted.head(num_docs)
    else:
        return "Please use 'top' or 'bottom' for rank variable"
    
    sliced_named = sliced.merge(metadata, how="left", left_on = "Doc_A", right_on="index").merge(metadata, how="left", left_on="Doc_B", right_on="index")
    sliced_named.columns = ['Doc_A', 'Doc_B', 'Similarity_Score', 'Index', 'Doc_A_ID', 'IndexB', 'Doc_B_ID']

    # print(sliced_named)

    top_docs_df = sliced_named[['Similarity_Score', 'Doc_A_ID', 'Doc_B_ID']]

    return top_docs_df

In [16]:
get_top_docs(unique_pairs, texts_df[['index', 'doc_id']], rank='top')

Unnamed: 0,Similarity_Score,Doc_A_ID,Doc_B_ID
0,1.0,AmSn19000531-V15-21-page-11.txt,SOL19000531-V15-21-page-11.txt
1,1.0,AmSn19000628-V15-25-page-5.txt,SOL19000628-V15-25-page-5.txt
2,1.0,AmSn19000726-V15-29-page-4.txt,SOL19000726-V15-29-page-4.txt
3,1.0,LibM19070701-V02-03-page-2.txt,LibM19060701-V01-02-page-2.txt
4,1.0,SOL19000531-V15-21-page-3.txt,AmSn19000531-V15-21-page-3.txt
5,1.0,AmSn19000531-V15-21-page-13.txt,SOL19000531-V15-21-page-13.txt
6,1.0,SOL19000614-V15-23-page-11.txt,AmSn19000614-V15-23-page-11.txt
7,1.0,LibM19070701-V02-03-page-2.txt,LibM19131001-V08-04-page-6.txt
8,1.0,LibM19060701-V01-02-page-2.txt,LibM19131001-V08-04-page-6.txt
9,1.0,SOL19000510-V15-17-page-3.txt,AmSn19000510-V15-18-page-3.txt


In [17]:
def get_similar_docs(title, sim_mx, metadata):

    idx = metadata.index[metadata['doc_id'] == title].tolist()
    # print(idx)

    # Get similarity scores
    sim_scores = list(enumerate(sim_mx[idx[0]]))

    # sort them
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 10 (ignorning the 1 for self-matching)

    sim_scores = sim_scores[1:11]
    
    scores = pd.DataFrame(sim_scores)
    scores.columns = ['index', 'similarity_score']
    # print(scores)

    title_index = [i[0] for i in sim_scores]
    matches = pd.DataFrame(metadata['doc_id'].iloc[title_index]).reset_index()
    # print(matches)
    matches = matches.merge(scores, how="left", on="index")

    return matches

In [19]:
get_similar_docs('AmSn19000531-V15-21-page-11.txt', corr_df, texts_df)

Unnamed: 0,index,doc_id,similarity_score
0,9522,SOL19000531-V15-21-page-11.txt,1.0
1,6806,AmSn18910604-V06-23-page-2.txt,0.652711
2,4111,AmSn18931109-V08-44-page-2.txt,0.647804
3,7332,AmSn18890918-V04-34-page-4.txt,0.646494
4,8814,AmSn18950103-V10-01-page-2.txt,0.646084
5,6234,AmSn18900410-V05-15-page-8.txt,0.644835
6,5015,AmSn18911112-V06-44-page-7.txt,0.644066
7,4859,AmSn18941220-V09-50-page-4.txt,0.642677
8,8729,AmSn18910409-V06-15-page-4.txt,0.639841
9,1725,AmSn18890911-V04-33-page-5.txt,0.639624
