# Recursive Inverted Index (5 Points)

Copyright [Big Data Analytics Group](https://bigdata.uni-saarland.de/), [CC-BY-SA](https://creativecommons.org/licenses/by-sa/4.0/legalcode)

In this exercise, you will implement an [inverted index](https://en.wikipedia.org/wiki/Inverted_index). An inverted index is a mapping from the content (e.g. words) of
a document to the document itself. In addition, in this exercise we radix-partition the words, i.e. we _recursively_ partition the words characterwise similar to the notebook [Indexing by Recursive External Partitioning.ipynb](https://github.com/BigDataAnalyticsGroup/bigdataengineering/blob/master/Indexing%20by%20Recursive%20External%20Partitioning.ipynb). This structure allows us to easily search for documents that contain a word that exactly matches a given search word, but also words that start with the given search word (prefix search). Implement the following functions of the class `RecursiveInvertedIndex` in the attached notebook:
* `__init__`: This is the constructor of the class, which takes a list of documents names and creates the
    recursive, inverted index based on these documents. Your words must be radix-partitioned. You can use the helper
        function`get_words_from_document(document_name)`, which takes a document name as input and then
        returns a set of words contained in the document. Note, that all words are converted to lowercase.
* `search(word)`: This function uses the inverted index to return a list of all matching documents given the key `word`. Matching here means that the search word either exactly matches one of the words in a document, or at least one of the words in a document starts with the search word. If no documents exist for a search word, an empty set is returned. Otherwise, the set of all matching documents is returned. Your implementation should be case-insensitive.

Your implementation must pass all provided unit tests without hardcoding!

In [3]:
def get_words_from_document(document_name):
    with open(document_name) as f:
        text = f.read().replace('.', '')
        word_list = text.split(" ")
        words = set([word.lower() for word in word_list])
        return words

In [102]:
# Upload the contents of this cell to our CMS as a text file

# A class implementing a recursive, inverted index
class RecursiveInvertedIndex:
    out = {}
    def create_key_values(self, file_list):
        output_dict = {}
        test_words = []

        for file in file_list:
            test_words.append(get_words_from_document(file))

        # Iterate through each set in the list
        for index, word_set in enumerate(test_words):
            # Iterate through each word in the set
            for word in word_set:
                # If the word is not already in the dictionary, add it with the current index
                if word not in output_dict:
                    output_dict[word] = set()
                # Append the current index (1-based) to the list of indices for the word
                output_dict[word].add(index + 1)

        # Print the resulting dictionary
        return output_dict
                
    # a partitioning function returning the <position>-th character from <inputString>
    def radixPartitioningFunction(self, inputString, position):
        if len(inputString) > position:
            return inputString[position]
        else:
            return inputString[-1]

    def partitioningStep(self, file_list, recursionLevel=0):
        # initialize a dictionary for the output partitions and files created in this partitioning step
        outputFiles = {}
        
        output_dictionary = self.create_key_values(file_list)

        for key, value in output_dictionary.items():
            # compute the partition of the key:
            partition = self.radixPartitioningFunction(key.lower(), recursionLevel)

            # check if the output file already exists, create it if not:
            if partition not in outputFiles:
                outputFiles[partition] = set()
                outputFiles[partition].update(value)

        return outputFiles, recursionLevel+1

    def __init__(self, document_names):
        self.out, _ = self.partitioningStep(document_names)

    def search(self, word): 
        documents = set()
        for key, value in self.out.items():
            if word == key:
                documents.update(value)
        return documents

        # Add your code here!
        # ...
        # This function should return the set of all matching documents. A matching document either contains exactly the search word or
        # a word that starts with the given search word (or both).

In [111]:
test = RecursiveInvertedIndex(["data/indexes/test_file_1.txt", "data/indexes/test_file_2.txt", "data/indexes/test_file_3.txt"])


b = test.create_key_values(["data/indexes/test_file_1.txt", "data/indexes/test_file_2.txt", "data/indexes/test_file_3.txt"])
a, _ = test.partitioningStep(["data/indexes/test_file_1.txt", "data/indexes/test_file_2.txt", "data/indexes/test_file_3.txt"])

print(a)
print(b)

#
#for key, value in a.items():
#    print(key + " -> " + ", ".join(map(str, value)))
#

{'l': {1}, 'i': {1, 2}, 'a': {1, 3}, 't': {1, 2}, 'c': {1, 2}, 'b': {1, 3}, 'm': {2}, 'd': {2, 3}, 'e': {2}, 'g': {3}, 'o': {3}, 'w': {3}}
{'lot': {1}, 'in': {1, 2}, 'a': {1, 3}, 'the': {1, 2}, 'course': {1, 2}, 'learned': {1, 2}, 'bde': {1, 3}, 'i': {1, 2}, 'more': {2}, 'dbsys': {2, 3}, 'even': {2}, 'idea': {3}, 'good': {3}, 'overall': {3}, 'was': {3}, 'before': {3}, 'to': {3}, 'it': {3}, 'take': {3}, 'taking': {3}}


In [112]:
word = "bde"
documents = set()
for key, value in b.items():
    if word == key:
        documents.update(value)
print(documents)

{1, 3}


In [113]:
print(test.search("bde"))

set()


### Unit tests

Note that test cases are by no means exhaustive!

In [None]:
import unittest

class RecursiveInvertedIndexTest(unittest.TestCase):
    
    def setUp(self):
        self.documents = ["data/indexes/test_file_1.txt", "data/indexes/test_file_2.txt", "data/indexes/test_file_3.txt"]
        self.rec_inverted_index = RecursiveInvertedIndex(self.documents)
    
    def test_no_result(self):
        search_result = self.rec_inverted_index.search("Im")
        self.assertEqual(len(search_result), 0)
        search_result = self.rec_inverted_index.search("BDER")
        self.assertEqual(len(search_result), 0)
        search_result = self.rec_inverted_index.search("lotin")
        self.assertEqual(len(search_result), 0)
        search_result = self.rec_inverted_index.search("")
        self.assertEqual(len(search_result), 0)
        search_result = self.rec_inverted_index.search(" ")
        self.assertEqual(len(search_result), 0)
        search_result = self.rec_inverted_index.search("ar")
        self.assertEqual(len(search_result), 0)
    
    def test_with_result(self):
        search_result = self.rec_inverted_index.search("bde")
        self.assertEqual(len(search_result), 2)
        self.assertTrue("data/indexes/test_file_1.txt" in search_result and "data/indexes/test_file_3.txt" in search_result)
        search_result = self.rec_inverted_index.search("DBS")
        self.assertEqual(len(search_result), 2)
        self.assertTrue("data/indexes/test_file_2.txt" in search_result and "data/indexes/test_file_3.txt" in search_result)
        search_result = self.rec_inverted_index.search("I")
        self.assertEqual(len(search_result), 3)
        for document in self.documents:
            self.assertTrue(document in search_result)
        search_result = self.rec_inverted_index.search("course")
        self.assertEqual(len(search_result), 2)
        self.assertTrue("data/indexes/test_file_1.txt" in search_result and "data/indexes/test_file_2.txt" in search_result)
        search_result = self.rec_inverted_index.search("o")
        self.assertEqual(len(search_result), 1)
        self.assertTrue("data/indexes/test_file_3.txt" in search_result)

In [None]:
# Run the unit test without shutting down the jupyter kernel
unittest.main(argv=['ignored', '-v'], verbosity=2, exit=False)