### Main Program

In [1]:
import re
import string
import numpy as np
import pandas as pd
from pathlib import Path


class DocumentSimilarityAnalyzer:
    def __init__(self, directory):
        '''
        Initializing the Analyer Class. Variabes are defined in class to be usable across functions. 
        In case of directory, an if statement assesses if the directory exists in the users drive or not.
        '''
        self.dictionary = set()
        self.corpus = [] #list of strings -> appended in the "update corpus" function
        self.name_list = [] #list of strings -> appended in the "update name" function -> used to get document names
        #path is only assigned as directory if it is a valid path existising on the user's system
        if Path(directory).is_dir() == True:
            self.directory = directory
        else:
            print(f"Inputted path ", {directory}," is not valid. Please check your input.")
        
    def print_dictionary(self): 
        print(self.dictionary)
    
    def print_corpus(self):
        print(self.corpus)
    
    def print_name(self): 
        print(self.name_list)
    
    def clear_corpus(self):
        self.corpus.clear()

    def clear_name_list(self):
        self.name_list.clear()

    def load_search_doc(self, doc_name): 
        '''
        Function to load the search document from drive by taking the document as input.
        Error handling is included.
        '''
        #open file by using path() -> to make it operateable on windows and mac systems
        assert Path(self.directory+doc_name).is_file(), "The inputted file does not exist at that directory"
        file = Path(self.directory + doc_name)
        
        #open and read search file
        #throw errors in case try fails
        try:
            with open(file, "r") as f:
                search_doc = f.read()
            return search_doc
        except FileNotFoundError as e:
            print(f"Error: File not found - {file}: {e}")
        except IOError as e:
            print(f"Error opening or reading file {file}: {e}")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
        return None

    def create_corpus(self, corpus_folder):
        '''
        Function to create the corpus. Takes corpus folder name as argument, iterates over all files in the corpus folder and creates a new corpus of all .txt documents.
        Triggers the create dictionary function.
        '''
        assert Path(self.directory+corpus_folder).is_dir(), "The inputted file does no exist in the directory."
        files = Path(self.directory+corpus_folder).glob('*')
        self.clear_corpus() #needs to be cleared, otherwise there will be dulpicates
        self.clear_name_list()
        for file in files:
            if file.suffixes == [".txt"]:
                try:
                    self.name_list.append(file.name)
                    with open(file, "r") as f:
                        self.corpus.append(f.read())
                except FileNotFoundError as e:
                    print(f"Error: File not found - {file}: {e}")
                except IOError as e:
                    print(f"Error opening or reading file {file}: {e}")
                except Exception as e:
                    print(f"An unexpected error occurred: {e}")
                
            else: print(f"The following file could not be uploaded as it is no in .txt format: {file.name}")
        self.dictionary = self.create_dictionary()

    def update_corpus(self,corpus_folder):
        '''
        Triggers the create_corpus function to creat a new corpus. 
        '''
        assert Path(self.directory+corpus_folder).is_dir(), "The input directory does not exist. Please input an existing corpus folder to upadate the corpus."
        self.create_corpus(corpus_folder)
        print("The corpus was successfully updated.")

    def process_text(self, text):
        '''
        Function to extract words from string and return in lower cases
        '''
        return re.findall(r'\w+', text.lower())

    def create_dictionary(self): 
        '''
        Function to create a dictionary from the corpus documents by iterating through every document and creating sets of words contained and union them with the dictionary.
        '''
        #iterate through all documents from the corpus
        for document in self.corpus:
            #extract all words in lower case from the string
            #save words as a set and add words to the dictionary by building the union
            words = self.process_text(document)
            new_words = set(words)
            self.dictionary = self.dictionary.union(new_words)
        return self.dictionary

    def document_to_vector(self, document): 
        '''
        Function to convert a document (list of document words) into a binary vector.
        '''
        # uses process_text function to convert document into a list of words
        word_list = self.process_text(document)
        # iterates through the dictionary  
        # appends 1 when word is in the list of words from the document  
        # appends 0 when word is not in the list of words from the document  
        doc_vector = np.array([1 if word in word_list else 0 for word in self.dictionary])
        return doc_vector

    def freq_vector(self, document): 
        '''
        Function to convert a document (list of document words) into a frequency vector.
        '''
        #Convert a document into a list of words
        word_list = self.process_text(document)
        #Count occurrences of each word in the document
        word_counts = {word: word_list.count(word) for word in set(word_list)}
        #Create the frequency vector
        freq_vector = np.array([word_counts.get(word, 0) for word in self.dictionary])
        return freq_vector

    def dot_similarity(self, search_vector): 
        '''
        Function to compute similarities by using dot product. Returining a dictionary containing the document name and the similarities of each document.
        '''
        similarity_dic = {}
        #use "for i, doc_vector" to also get index of the iteration -> used to get the document from the corpus
        for i, doc_vector in enumerate(self.doc_vectors):
            similarity = np.dot(doc_vector, search_vector)
            doc_name = self.name_list[i]
            similarity_dic.update({doc_name: similarity})
        
        #convert dictionary into data frame for formatted output and possibility to easily order results
        similarities = pd.DataFrame(similarity_dic.items(), columns=['Document', 'Similarity'])
        similarities.sort_values(["Similarity"], ascending=False, inplace=True)
        
        #convert dictionary into data frame for formatted output and possibility to easily order results
        print("Dot Product: The higher the dot product the higher the similarity.\n", similarities)

    def jac_similarity(self, search_doc, search_vector): 
        '''
        Function to compute similarities by using Jaccard Index. 
        search_len is the number of words in the search document.
        cor_doc_nr is the number of words in the document of the corpus.
        The Jaccard Index is computed by divinding the dot product of the search document and corpus document by the number of union of word sets of the search documents and corpus documents.
        Returining a dictionary containing the document name and the similarities of each document. 
        '''
        similarity_dic = {}
        #create set of all words in the search document
        
        search_nr = len(set(self.process_text(search_doc)))
        
        for i, doc_vector in enumerate(self.doc_vectors):
            #create set of all words for each document in the corpus
            cor_doc_nr = len(set(self.process_text(self.corpus[i])))
            #divide the dot product by the number of words in the union of search_doc and corpus docs
            similarity = np.dot(doc_vector, search_vector) / (search_nr + cor_doc_nr)
            
            #get name from name_list by indexing from the name_list
            doc_name = self.name_list[i]
            similarity_dic.update({doc_name: similarity})
            
        similarities = pd.DataFrame(similarity_dic.items(), columns=['Document', 'Similarity'])
        similarities.sort_values(["Similarity"], ascending=False, inplace=True)
        
        print("Jaccard Index: The higher the Jaccard Index the higher the similarity.\n", similarities)

    def euc_similarity(self, search_vector): 
        '''
        Function to compute similarities by calculating the euclidean distance  between the search document and each document of the corpus.
        Returining a dictionary containing the document name and the similarities of each document. 
        '''
        similarity_dic = {}
        for i, doc_vector in enumerate(self.doc_vectors):
            similarity = np.linalg.norm(doc_vector - search_vector)
            doc_name = self.name_list[i]
            similarity_dic.update({doc_name: similarity})

        similarities = pd.DataFrame(similarity_dic.items(), columns=['Document', 'Similarity'])
        similarities.sort_values(["Similarity"], ascending=True, inplace=True)
        
        print("Euclidean Distance: The lower the Euclidean Distance the higher the similarity.\n", similarities)

    def cosine_similarity(self, search_vector): 
        '''
        Function to compute similarities by using the cosine similarity.
        Returining a dictionary containing the document name and the similarities of each document. 
        '''
        similarity_dic = {}
        for i, doc_vector in enumerate(self.doc_vectors):
            #dot product of the seach vector and document vector is divided by the product of the lengths of the vectors
            similarity = np.dot(doc_vector, search_vector) / (np.linalg.norm(doc_vector) * np.linalg.norm(search_vector))
            doc_name = self.name_list[i]
            similarity_dic.update({doc_name: similarity})

        similarities = pd.DataFrame(similarity_dic.items(), columns=['Document', 'Similarity'])
        similarities.sort_values(["Similarity"], ascending=False, inplace=True)
        
        print("Cosine Similarity: The lower the Cosine Similarity the higher the similarity.\n", similarities)
        

    def compute_similarity(self, search_doc): 
        '''
        Function which is triggered by user to compute similarity.
        User input required, asking which method the user wants to be computed
        If statements are used to assess which computational method the user wants to use and is triggered. 
        Error message will be shown, when no known method is inputted by the user.
        '''

        method = input("Choose the comparing method you want to use: Insert the index number of the respective Methods:\n 1: Dot Product, 2: Jaccard Index,  3: Euclidean Similarity, 4: Cosine Similarity, 5: All Measures")
        #sends search document string to load search document function and gets the content of the file as string
        search_doc = self.load_search_doc(search_doc)
        #create binary vector of search document
        search_vector = self.document_to_vector(search_doc)
        #create array of binary vectors of corpus documents
        self.doc_vectors = np.array([self.document_to_vector(document) for document in self.corpus])

        #if statements to assess which method is chosen
        #sends vectors and corpus to computing function
        if method == "1": #Dot Product
            similarities = self.dot_similarity(search_vector)
        elif method == "2": #Jaccard Index
            similarities = self.jac_similarity(search_doc, search_vector)
        elif method == "3": #Euclidean Distance
            similarities = self.euc_similarity(search_vector)
        elif method == "4": #Cosine Similarity
            similarities = self.cosine_similarity(search_vector)
        elif method == "5": 
            similarities_dot = self.dot_similarity(search_vector)
            similarities_jac = self.jac_similarity(search_doc,search_vector)
            similarities_euc = self.euc_similarity(search_vector)
            similarities_cos = self.cosine_similarity(search_vector)

        else:
            print(f"Unknown method: '{method}'. Please input one of the following Methods: \n", "1: Dot Product, ", "2: Jaccard Index, ", "3: Euclidean Distance, ", "4: Cosine Similarity, ", "5: All Measures")
            return None

### Initialise the program

In [2]:
#requires directory path as input
directory = input("Please enter the path to the folder containing the files for which you want to compute the stext similiarities: ")
analyzer = DocumentSimilarityAnalyzer(directory)

In [3]:
#upload documents for corpus
analyzer.create_corpus("Corpus_Docs")
analyzer.print_corpus()
analyzer.print_name()
analyzer.print_dictionary()

The following file could not be uploaded as it is no in .txt format: Error_Test.docx
['Frankenstein.txt', 'The Picture of Dorian Gray.txt', 'Pride and Prejudice.txt', 'Romeo and Juliet.txt']


In [4]:
#update corpus
analyzer.update_corpus("Corpus_Docs")
analyzer.print_corpus()
analyzer.print_name()
analyzer.print_dictionary()

The following file could not be uploaded as it is no in .txt format: Error_Test.docx
The corpus was successfully updated.
['Frankenstein.txt', 'The Picture of Dorian Gray.txt', 'Pride and Prejudice.txt', 'Romeo and Juliet.txt']


In [5]:
#compute similarities

analyzer.compute_similarity("search_doc.txt")

analyzer.compute_similarity("search_doc.txt")

analyzer.compute_similarity("search_doc.txt")

analyzer.compute_similarity("search_doc.txt")

analyzer.compute_similarity("search_doc.txt")

Dot Product: The higher the dot product the higher the similarity.
                          Document  Similarity
0                Frankenstein.txt         993
1  The Picture of Dorian Gray.txt         660
2         Pride and Prejudice.txt         559
3            Romeo and Juliet.txt         331
Jaccard Index: The higher the Jaccard Index the higher the similarity.
                          Document  Similarity
0                Frankenstein.txt    0.351753
1  The Picture of Dorian Gray.txt    0.173593
2         Pride and Prejudice.txt    0.156102
3            Romeo and Juliet.txt    0.127357
Euclidean Distance: The lower the Euclidean Distance the higher the similarity.
                          Document  Similarity
0                Frankenstein.txt   19.544820
3            Romeo and Juliet.txt   38.496753
2         Pride and Prejudice.txt   44.810713
1  The Picture of Dorian Gray.txt   45.022217
Cosine Similarity: The lower the Cosine Similarity the higher the similarity.
           

### Error Triggering

In [6]:
#create corpus with not valid folder name
analyzer.create_corpus("Corpus_Dcs")

AssertionError: The inputted file does no exist in the directory.

In [7]:
#update corpus with not valid folder name
analyzer.update_corpus("Corus_Docs")

AssertionError: The input directory does not exist. Please input an existing corpus folder to upadate the corpus.

In [9]:
#input not valid search documens
analyzer.compute_similarity("search_docs.txt")

AssertionError: The inputted file does not exist at that directory

In [10]:
#input 7 as method
analyzer.compute_similarity("search_doc.txt")

Unknown method: '7'. Please input one of the following Methods: 
 1: Dot Product,  2: Jaccard Index,  3: Euclidean Distance,  4: Cosine Similarity,  5: All Measures


In [11]:
#input wrong directory
analyzer = DocumentSimilarityAnalyzer("adf3a/asdaf")

Inputted path {'adf3a/asdaf'}  is not valid. Please check your input.
