# In [2]:
import re
import string
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd


class DocumentSimilarityAnalyzer:
    """Rank uploaded text documents against a search document.

    Documents are read from files whose path is ``directory + doc_name``,
    split into lower-cased runs of word characters, and turned into
    vectors with one component per word of the shared ``dictionary``.

    Attributes:
        directory: Path prefix that is string-concatenated with a document
            name to locate the file (so it normally ends with a separator).
        dictionary: Set of every word seen in the uploaded documents.
        corpus: Raw text of each uploaded document.
        name_list: File name of each uploaded document; entry ``i`` names
            ``corpus[i]``.
        doc_vectors: Vectors of the corpus documents, rebuilt on every
            ``compute_similarity`` call.
    """

    def __init__(self, directory):
        """Create an analyzer with an empty corpus rooted at *directory*."""
        self.directory = directory
        self.dictionary = set()
        self.corpus = []
        self.name_list = []
        # Initialised here so the *_similarity methods return an empty
        # result instead of raising AttributeError when they are called
        # before compute_similarity() has built any vectors.
        self.doc_vectors = []

    def clear_corpus(self):
        """Remove all document texts.

        ``name_list`` and ``dictionary`` are left untouched; call
        ``clear_name_list`` as well to keep names and texts aligned.
        """
        self.corpus.clear()

    def clear_name_list(self):
        """Remove all document names (``corpus`` is left untouched)."""
        self.name_list.clear()

    def _read_document(self, doc_name):
        """Return the text of ``directory + doc_name``, or None on failure.

        Failures are reported on stdout rather than raised.
        """
        file_to_open = Path(self.directory + doc_name)
        try:
            with open(file_to_open, "r") as f:
                return f.read()
        except FileNotFoundError as e:
            print(f"Error: File not found - {file_to_open}: {e}")
        except IOError as e:
            print(f"Error opening or reading file {file_to_open}: {e}")
        except Exception as e:
            # Best-effort boundary: e.g. decoding errors are reported, not raised.
            print(f"An unexpected error occurred: {e}")
        return None

    def load_search_doc(self, doc_name):
        """Read the search document *doc_name*.

        Returns:
            The file's text, or None if it could not be read (the error is
            printed).
        """
        return self._read_document(doc_name)

    def update_corpus(self, doc_name):
        """Add the document *doc_name* to the corpus and extend the dictionary.

        A name that is already in ``name_list`` is rejected with a printed
        message. If the file cannot be read, nothing is added, so
        ``name_list`` and ``corpus`` stay aligned and the upload can be
        retried later.

        Returns:
            Always None.
        """
        # Explicit check instead of `assert`, which is stripped under -O.
        if doc_name in self.name_list:
            print(f"Error: Document {doc_name} already uploaded, please choose another file.")
            return None

        text = self._read_document(doc_name)
        if text is None:
            return None

        # Register name and text together, only after a successful read.
        self.name_list.append(doc_name)
        self.corpus.append(text)
        self.dictionary.update(self.create_dictionary())
        return None

    def process_text(self, text):
        """Return the lower-cased word tokens of *text*, in order of appearance."""
        return re.findall(r'\w+', text.lower())

    def create_dictionary(self):
        """Return the set of all words occurring in the current corpus."""
        new_words = set()

        for document in self.corpus:
            new_words.update(self.process_text(document))

        return new_words

    def document_to_vector(self, document):
        """Return a 0/1 vector marking which dictionary words occur in *document*.

        Components follow the iteration order of ``self.dictionary``, so
        vectors are only comparable while the dictionary is not modified.
        """
        # A set makes each membership test O(1) instead of scanning a list.
        present = set(self.process_text(document))
        return np.array([1 if word in present else 0 for word in self.dictionary])

    def freq_vector(self, document):
        """Return a vector counting occurrences of each dictionary word in *document*.

        Components follow the iteration order of ``self.dictionary``.
        """
        # Counter yields 0 for words that are absent from the document.
        word_counts = Counter(self.process_text(document))
        return np.array([word_counts[word] for word in self.dictionary])

    def _score_documents(self, search_vector, score):
        """Map each document name to ``score(doc_vector, search_vector)``."""
        return {
            self.name_list[i]: score(doc_vector, search_vector)
            for i, doc_vector in enumerate(self.doc_vectors)
        }

    @staticmethod
    def _jaccard(doc_vector, search_vector):
        """Return |A & B| / |A | B| over the non-zero components of two vectors."""
        doc_present = np.asarray(doc_vector) > 0
        search_present = np.asarray(search_vector) > 0
        union = np.count_nonzero(doc_present | search_present)
        if union == 0:
            # Neither side contains a dictionary word; avoid dividing by zero.
            return 0.0
        return np.count_nonzero(doc_present & search_present) / union

    def dot_similarity(self, search_vector):
        """Return ``{document name: dot product with search_vector}``."""
        return self._score_documents(search_vector, np.dot)

    def jac_similarity(self, search_vector):
        """Return ``{document name: Jaccard index with search_vector}``.

        The index is the number of dictionary words present in both
        vectors divided by the number present in at least one of them
        (0.0 when both are empty). Words that are not in the dictionary
        have no vector component and therefore do not contribute.
        """
        return self._score_documents(search_vector, self._jaccard)

    def euc_similarity(self, search_vector):
        """Return ``{document name: Euclidean distance to search_vector}``.

        Note that this is a distance: smaller values mean more similar.
        """
        return self._score_documents(
            search_vector,
            lambda doc_vector, query: np.linalg.norm(doc_vector - query),
        )

    def compute_similarity(self, search_doc, method):
        """Rank the corpus documents against the file *search_doc*.

        Args:
            search_doc: File name of the search document, relative to
                ``directory``.
            method: ``"Dot Product"``, ``"Jaccard Index"`` or
                ``"Euclidean Distance"``.

        Returns:
            A DataFrame with columns ``Document`` and ``Similarity`` ordered
            from most to least similar (descending score, or ascending
            distance for ``"Euclidean Distance"``). The frame is also
            printed. Returns None for an unknown method or when the search
            document cannot be read.
        """
        scorers = {
            "Dot Product": self.dot_similarity,
            "Jaccard Index": self.jac_similarity,
            "Euclidean Distance": self.euc_similarity,
        }
        scorer = scorers.get(method)
        if scorer is None:
            print("Unknown method")
            return None

        search_text = self.load_search_doc(search_doc)
        if search_text is None:
            # load_search_doc already reported the problem.
            return None

        search_vector = self.document_to_vector(search_text)
        self.doc_vectors = np.array([self.document_to_vector(document) for document in self.corpus])

        similarities = scorer(search_vector)

        similarities_df = pd.DataFrame(list(similarities.items()), columns=['Document', 'Similarity'])
        # A distance ranks best-first when ascending; the scores when descending.
        similarities_df.sort_values(
            ["Similarity"], ascending=(method == "Euclidean Distance"), inplace=True
        )
        print(f"{method}: \n", similarities_df)
        return similarities_df




