# Document Similarity Finder

This notebook is for exploratory data analysis of the document dataset and for experimenting with preprocessing techniques for the Document Similarity Finder tool.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from src.similarity import compute_cosine_similarity
from src.visualization import plot_heatmap, display_top_pairs

# Read the raw documents table from CSV (relative path, resolved against the
# current working directory).
raw_csv_path = '../data/raw/documents.csv'
documents = pd.read_csv(raw_csv_path)

# Leave the preview as the last expression so the notebook renders it.
documents.head()

In [None]:
# Vectorize the documents with TF-IDF.
# A CSV-loaded column can hold NaN for empty cells, and TfidfVectorizer raises
# on NaN input. Missing texts are replaced with empty strings rather than
# dropped, so the matrix rows stay aligned with documents['title'].
# NOTE(review): an empty text yields an all-zero TF-IDF row — confirm that
# compute_cosine_similarity handles zero vectors as intended.
texts = documents['text'].fillna('').astype(str)
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

# Compute the similarity matrix with the project helper
similarity_matrix = compute_cosine_similarity(tfidf_matrix)

# Display the heatmap of the similarity matrix
# (titles are passed alongside the matrix, presumably as axis labels)
plt.figure(figsize=(10, 8))
plot_heatmap(similarity_matrix, documents['title'])

# Display the top 3 most similar document pairs; the bare name on the last
# line makes the notebook render whatever display_top_pairs returned.
top_pairs = display_top_pairs(similarity_matrix, documents['title'], top_n=3)
top_pairs