In [1]:
# 2.2 - simple search of documents in a corpus
#
# code for the Intuitive Text Mining book
# Tariq Rashid, 2018

In [2]:
# collections provides defaultdict
import collections

# glob module for finding files that match a pattern
import glob

In [3]:
# function to clean and simplify text for indexing and also querying

def clean_text(text):
    """Simplify text so documents and queries are indexed the same way.

    The text is lowercased, then every character that is neither
    alphanumeric nor whitespace is removed. Punctuation is deleted rather
    than replaced, so "(Salsa" becomes "salsa" and "don't" becomes "dont".

    All whitespace (spaces, new lines, tabs, ...) is kept, so that words
    separated by a tab are not fused into one word when the result is
    later split into words.

    Note that str.isalnum() accepts any Unicode letter or digit, not only
    a-z and 0-9, so accented letters are kept too.

    Parameters:
        text: the string to clean.

    Returns:
        The cleaned, lowercase string.
    """

    # make lowercase
    cleaned_text = text.lower()

    # keep letters, numbers and whitespace, remove punctuation and symbols
    cleaned_text = "".join(
        char for char in cleaned_text if char.isalnum() or char.isspace()
    )

    return cleaned_text

In [4]:
# read text documents and build word index

# corpus location and text filename pattern
corpus_directory = 'data_sets/recipes/'
text_filename_pattern = 'txt/??.txt'

# every file in the corpus directory that matches the pattern
list_of_text_files = glob.glob(corpus_directory + text_filename_pattern)

# the index maps each word to the documents it appears in;
# a document is listed once for every occurrence of the word
index_dict = collections.defaultdict(list)

for document_name in list_of_text_files:
    print("reading from ...", document_name)

    # the file only needs to be open while its content is read
    with open(document_name, "r") as f:
        text_content = f.read()

    # simplify the text, then break it into words
    cleaned_text = clean_text(text_content)
    word_list = cleaned_text.split()

    # record this document against every word occurrence
    for word in word_list:
        index_dict[word].append(document_name)

reading from ... data_sets/recipes/txt/15.txt
reading from ... data_sets/recipes/txt/01.txt
reading from ... data_sets/recipes/txt/00.txt
reading from ... data_sets/recipes/txt/14.txt
reading from ... data_sets/recipes/txt/02.txt
reading from ... data_sets/recipes/txt/16.txt
reading from ... data_sets/recipes/txt/17.txt
reading from ... data_sets/recipes/txt/03.txt
reading from ... data_sets/recipes/txt/07.txt
reading from ... data_sets/recipes/txt/13.txt
reading from ... data_sets/recipes/txt/12.txt
reading from ... data_sets/recipes/txt/06.txt
reading from ... data_sets/recipes/txt/10.txt
reading from ... data_sets/recipes/txt/04.txt
reading from ... data_sets/recipes/txt/05.txt
reading from ... data_sets/recipes/txt/11.txt
reading from ... data_sets/recipes/txt/08.txt
reading from ... data_sets/recipes/txt/20.txt
reading from ... data_sets/recipes/txt/21.txt
reading from ... data_sets/recipes/txt/09.txt
reading from ... data_sets/recipes/txt/19.txt
reading from ... data_sets/recipes/txt/18.txt

In [5]:
# peek at the first five (word, documents) entries of the index

[(word, documents) for word, documents in index_dict.items()][:5]

[('fried',
  ['data_sets/recipes/txt/15.txt',
   'data_sets/recipes/txt/14.txt',
   'data_sets/recipes/txt/07.txt']),
 ('eggplants',
  ['data_sets/recipes/txt/15.txt',
   'data_sets/recipes/txt/15.txt',
   'data_sets/recipes/txt/16.txt',
   'data_sets/recipes/txt/16.txt']),
 ('melanzane',
  ['data_sets/recipes/txt/15.txt', 'data_sets/recipes/txt/16.txt']),
 ('fritte', ['data_sets/recipes/txt/15.txt']),
 ('eggplant', ['data_sets/recipes/txt/15.txt'])]

In [6]:
# look up "milk" in the index directly
# .get() is used instead of index_dict["milk"] because subscripting a
# defaultdict adds an empty entry to the index when the word is missing

index_dict.get("milk", [])

['data_sets/recipes/txt/17.txt',
 'data_sets/recipes/txt/03.txt',
 'data_sets/recipes/txt/12.txt',
 'data_sets/recipes/txt/06.txt',
 'data_sets/recipes/txt/08.txt',
 'data_sets/recipes/txt/08.txt',
 'data_sets/recipes/txt/18.txt']

In [7]:
# look up "water" in the index directly
# .get() is used instead of index_dict["water"] because subscripting a
# defaultdict adds an empty entry to the index when the word is missing

index_dict.get("water", [])

['data_sets/recipes/txt/01.txt',
 'data_sets/recipes/txt/14.txt',
 'data_sets/recipes/txt/16.txt',
 'data_sets/recipes/txt/17.txt',
 'data_sets/recipes/txt/17.txt',
 'data_sets/recipes/txt/03.txt',
 'data_sets/recipes/txt/03.txt',
 'data_sets/recipes/txt/03.txt',
 'data_sets/recipes/txt/07.txt',
 'data_sets/recipes/txt/07.txt',
 'data_sets/recipes/txt/13.txt',
 'data_sets/recipes/txt/13.txt',
 'data_sets/recipes/txt/13.txt',
 'data_sets/recipes/txt/12.txt',
 'data_sets/recipes/txt/10.txt',
 'data_sets/recipes/txt/10.txt',
 'data_sets/recipes/txt/10.txt',
 'data_sets/recipes/txt/04.txt',
 'data_sets/recipes/txt/11.txt',
 'data_sets/recipes/txt/11.txt',
 'data_sets/recipes/txt/08.txt',
 'data_sets/recipes/txt/20.txt',
 'data_sets/recipes/txt/21.txt',
 'data_sets/recipes/txt/21.txt',
 'data_sets/recipes/txt/21.txt',
 'data_sets/recipes/txt/21.txt',
 'data_sets/recipes/txt/18.txt',
 'data_sets/recipes/txt/18.txt']

In [8]:
# doc contains "(Salsa", query with "Salsa"
# the query goes through clean_text() so it is simplified like the indexed text;
# .get() avoids adding an empty entry to the index if the word is missing

index_dict.get(clean_text("Salsa"), [])

['data_sets/recipes/txt/10.txt',
 'data_sets/recipes/txt/04.txt',
 'data_sets/recipes/txt/11.txt',
 'data_sets/recipes/txt/09.txt']

In [9]:
# doc contains "(Salsa", query with "salsa"
# the query goes through clean_text() so it is simplified like the indexed text;
# .get() avoids adding an empty entry to the index if the word is missing

index_dict.get(clean_text("salsa"), [])

['data_sets/recipes/txt/10.txt',
 'data_sets/recipes/txt/04.txt',
 'data_sets/recipes/txt/11.txt',
 'data_sets/recipes/txt/09.txt']