# Measuring Similarity Between Documents

We are going to measure similarity between documents by representing each one as a vector of its most significant words and then measuring the distance between those vectors.

So, we are going to represent our documents in a format called [TF-IDF using a library called Scikit Learn](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html?highlight=tfidf#sklearn.feature_extraction.text.TfidfVectorizer).


Code examples from 
- https://goodboychan.github.io/python/datacamp/natural_language_processing/2020/07/17/04-TF-IDF-and-similarity-scores.html

Install the following packages in the REL 560 environment:
- pandas
- NumPy
- scikit-learn
- Matplotlib

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load texts into a dataframe: one row per file, with the file name as doc_id

source_dir = "../data/Example_texts/history/NYT-Obituaries/"

texts = []

# os.listdir returns entries in arbitrary order. Sorting keeps every document
# at the same row position (and therefore the same numeric id) on every run.
for filename in sorted(os.listdir(source_dir)):
    path = os.path.join(source_dir, filename)
    # Skip sub-directories; open() would fail on them.
    if not os.path.isfile(path):
        continue
    with open(path, 'r') as obit:
        content = obit.read()
    texts.append(
        {
            "doc_id": filename,
            "text": content
        }
    )

texts_df = pd.DataFrame(texts)

texts_df.head()

In [None]:
# Move the row labels into a regular 'index' column so every document
# carries a numeric id that can be joined against later.
texts_df = texts_df.reset_index()
texts_df

In [None]:
# Create the TF-IDF vector representation of every document

corpus = texts_df['text']
vectorizer = TfidfVectorizer()

# Learn the vocabulary and produce the matrix of word vectors in one step
tfidf_matrix = vectorizer.fit_transform(corpus)

print(tfidf_matrix.shape)

In [None]:
# With a single argument, cosine_similarity compares every row of the matrix
# against every other row of the same matrix.
cosine_sim = cosine_similarity(tfidf_matrix)
print(cosine_sim)

In [None]:
# Wrap the similarity matrix in a DataFrame; rows and columns keep their
# default integer labels.
corr_df = pd.DataFrame(data=cosine_sim)
corr_df

In [None]:
# The Styler's background_gradient relies on matplotlib for its colour maps
import matplotlib.pyplot as plt

# Render the similarity matrix as a colour-coded table with a small font
(
    corr_df.style
    .background_gradient(cmap='viridis')
    .set_properties(**{'font-size': '8px'})
)


Now that we have the pairwise similarity scores, we can do a couple of things:
- Get most similar documents
- Get least similar documents
- Get documents most similar to a particular document

In [None]:
# Reshape the square similarity matrix into long form: unstack() yields one
# entry per (column label, row label) pair, and reset_index() turns those two
# labels plus the score into three ordinary columns.
pairs = corr_df.unstack().reset_index()
pairs_df = pd.DataFrame(pairs)
# Doc_A / Doc_B hold the integer document positions; Similarity_Score is the cosine similarity
pairs_df.columns = ['Doc_A', 'Doc_B', 'Similarity_Score']

pairs_df

In [None]:
# Drop the rows where a document is compared with itself

is_self_match = pairs_df['Doc_A'] == pairs_df['Doc_B']
pairs_df = pairs_df[~is_self_match]
pairs_df

In [None]:
# Remove mirror pairs, i.e. keep only one of (A, B) and (B, A).
# Approach from https://stackoverflow.com/questions/48549637/pandas-removing-mirror-pairs-from-dataframe

# Sort the two ids within each row so that mirror pairs become identical rows
ordered_ids = pd.DataFrame(
    np.sort(pairs_df[['Doc_A', 'Doc_B']], axis=1),
    index=pairs_df.index,
)

# Keep the first occurrence of each pair and select those rows from pairs_df
first_occurrences = ordered_ids.drop_duplicates(keep='first').index
unique_pairs = pairs_df.loc[first_occurrences]

unique_pairs

In [None]:
# Get most (or least) similar document pairs

def get_top_docs(sim_df, metadata, num_docs=10, rank='top'):
    """Return the most or least similar document pairs, labelled by doc_id.

    Args:
        sim_df: DataFrame with columns 'Doc_A', 'Doc_B' (numeric document
            ids) and 'Similarity_Score'.
        metadata: DataFrame with at least the columns 'index' (numeric
            document id) and 'doc_id' (document name). Any other columns
            are ignored.
        num_docs: Number of pairs to return.
        rank: 'top' for the highest scores, 'bottom' for the lowest.

    Returns:
        A DataFrame with columns 'Similarity_Score', 'Doc_A_ID' and
        'Doc_B_ID'. Rows are ordered by ascending score in both modes, so
        for rank='top' the most similar pair is the last row. If rank is
        neither 'top' nor 'bottom', a message string is returned instead.
    """
    if rank not in ('top', 'bottom'):
        return "Please use 'top' or 'bottom' for rank variable"

    ranked = sim_df.sort_values('Similarity_Score', ascending=True)
    sliced = ranked.tail(num_docs) if rank == 'top' else ranked.head(num_docs)

    # Restrict metadata to the lookup columns and rename them up front, so
    # the merges cannot collide on column names or break when metadata
    # carries extra columns (e.g. the full text).
    names = metadata[['index', 'doc_id']]
    doc_a_names = names.rename(columns={'index': 'Doc_A', 'doc_id': 'Doc_A_ID'})
    doc_b_names = names.rename(columns={'index': 'Doc_B', 'doc_id': 'Doc_B_ID'})

    sliced_named = (
        sliced
        .merge(doc_a_names, how="left", on="Doc_A")
        .merge(doc_b_names, how="left", on="Doc_B")
    )

    top_docs_df = sliced_named[['Similarity_Score', 'Doc_A_ID', 'Doc_B_ID']]

    return top_docs_df

In [None]:
# Ten most similar pairs, labelled with their file names
get_top_docs(
    sim_df=unique_pairs,
    metadata=texts_df[['index', 'doc_id']],
    rank='top',
)

In [None]:
def get_similar_docs(title, sim_mx, metadata, num_docs=10):
    """Return the documents most similar to the document named *title*.

    Args:
        title: The 'doc_id' value of the document to compare against.
        sim_mx: Square similarity matrix (DataFrame or 2-D array) whose
            rows/columns follow the row order of *metadata*.
        metadata: DataFrame with a 'doc_id' column. Its index labels are
            assumed to equal the row positions (the default integer index).
        num_docs: Maximum number of similar documents to return.

    Returns:
        A DataFrame with columns 'index', 'doc_id' and 'similarity_score',
        ordered from most to least similar. The queried document itself is
        never included.

    Raises:
        IndexError: If no row of *metadata* has doc_id equal to *title*.
    """
    idx = metadata.index[metadata['doc_id'] == title].tolist()
    if not idx:
        raise IndexError(f"No document with doc_id {title!r} found in metadata")
    self_idx = idx[0]

    # Drop the queried document explicitly. It is not guaranteed to sort
    # first: a duplicate document ties with it at 1.0, and an empty document
    # has a self-similarity of 0.
    sim_scores = [
        (position, score)
        for position, score in enumerate(sim_mx[self_idx])
        if position != self_idx
    ]

    # Highest similarity first, then keep the requested number of matches
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    sim_scores = sim_scores[:num_docs]

    scores = pd.DataFrame(sim_scores, columns=['index', 'similarity_score'])

    # Look up the file names of the matches and attach their scores
    title_index = [position for position, _ in sim_scores]
    matches = pd.DataFrame(metadata['doc_id'].iloc[title_index]).reset_index()
    matches = matches.merge(scores, how="left", on="index")

    return matches

In [None]:
# Documents most similar to the Robert E. Lee obituary
get_similar_docs(
    title='1870-Robert-E-Lee.txt',
    sim_mx=corr_df,
    metadata=texts_df,
)