# Step 1: Tokenizing Documents

The first step in creating an index is tokenization. You must convert a document into a stream of tokens suitable for indexing.
Your tokenizer should follow these steps:
1. Accept a directory name as a command line argument, and process all files found in that directory.
2. Extract the document text with an HTML parsing library, ignoring the headers at the beginning of the file and all HTML tags.
3. Split the text into tokens (You can use some library for regular expression matching. To learn about regular expressions go to this link http://www.rexegg.com/regex-quickstart.html).
4. Convert all tokens to lowercase (this is not always ideal, but indexing intelligently in a case-sensitive manner is tricky).
5. Apply stop-wording to the document by ignoring any tokens found in this list <font color=blue>(\\Cactus \xeon\Maryam Bashir\Information Retrieval\stoplist)</font>.
6. Apply stemming to the document using any standard algorithm – <font color=green> Porter, Snowball, and KStem stemmers </font> are appropriate. You should use a stemming library for this step.
7. Your tokenizer will write two files:
- docids.txt – A file mapping a document's filename (without path) to a unique integer, its DOCID. Each line should be formatted with a DOCID and filename separated by a tab, as follows:<font color=blue> 1234\t32435</font>
- termids.txt – A file mapping a token found during tokenization to a unique integer, its TERMID. Each line should be formatted with a TERMID and token separated by a tab, as follows: <font color=blue> 567\tapple </font>

<font color=blue>  </font>

In [1]:
from bs4 import BeautifulSoup
import re
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
import random
import os
import operator
import numpy as np
#from sets import Set
#from html.parser import HTMLParser

In [2]:
def get_directory_path(mode):
    """
    Return the hard-coded folder path that belongs to the given mode.
    Only a folder path is returned, never a file name.

    Argument:
    mode -- "input" selects the corpus folder holding the documents,
        "output" selects the folder the generated files are written to.

    Returns:
    The folder path as a str, ending with a trailing "/".

    Raises:
    ValueError -- if mode is neither "input" nor "output".
    """
    if mode == "input":
        return "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/corpus/corpus/"
    if mode == "output":
        return "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/out/"
    raise ValueError('Unspecified mode.')

In [3]:
# Function : read_text_in_list_form
def read_text_in_list_form(file_path):
    """
    Read a text file and return its lines as a list of strings.
    It is used to load the stop-word list, one word per line.

    Argument:
    file_path -- path should be like: path + file name.extension
        "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt".

    Returns:
    lst -- list with one entry per line of the file; only the trailing
        newline is removed, so blank lines come back as empty strings.
    """
    # The context manager closes the handle even if reading fails.
    with open(file_path) as fin:
        lst = [line.rstrip('\n') for line in fin]
    return lst

In [4]:
# Function = remove_html_headers
def remove_html_headers(file_path):
    """
    Open a document and return its text content, lower-cased, with the
    HTML markup removed by the HTML parser.

    The file is read as UTF-8 and bytes that cannot be decoded are ignored.
    The markup is parsed, pretty-printed and parsed a second time before
    the text is extracted.

    NOTE(review): nothing here explicitly skips non-HTML header lines at the
    top of a file; whatever the parser treats as text is kept -- confirm
    against the corpus format.

    Argument:
    file_path -- concatenated: folder path + file name

    Returns:
    soup -- the plain, lower-cased text of str type.
    """
    with open(file_path, encoding='utf-8', errors='ignore') as fin:
        pretty_markup = BeautifulSoup(fin, 'html.parser').prettify()

    # Name the parser explicitly on the second pass too: without it
    # BeautifulSoup picks whichever parser is installed (and warns), so the
    # extracted text could differ from one machine to another.
    soup = BeautifulSoup(pretty_markup, 'html.parser').text
    return soup.lower()

In [5]:
# FUNCTION: tokenize_the_text
def tokenize_the_text(raw_text):
    """
    Convert raw text into its unique alphabetic tokens.

    A token is a maximal run of the letters a-z taken from the lower-cased
    text; digits, punctuation and every other character act as separators.
    Only tokens of four or more characters are kept.

    Argument:
    raw_text -- class 'str' type.

    Returns:
    splitText -- list of unique tokens extracted from raw_text, in
        arbitrary order.
    len(splitText) -- number of unique tokens in splitText.
    """
    # Lower-case here instead of trusting the caller: otherwise capital
    # letters act as separators and "Hello" would be cut down to "ello".
    lower_text = raw_text.lower()

    # Keep only the tokens that are longer than 3 characters.
    long_tokens = [token for token in re.findall(r"[a-z]+", lower_text) if len(token) > 3]

    # Going through a set keeps a single copy of every token.
    splitText = list(set(long_tokens))
    return splitText, len(splitText)

In [6]:
# Function : remove_stop_words
def remove_stop_words(document_words, stop_words):
    """
    Drop every stop word from the tokens of a document.
    Stop words are the words which occur in abundance in text.

    Argument:
    document_words -- list of all the tokens/words extracted from document.
    stop_words -- list of all the stop words extracted from file.

    Returns:
    list of the distinct document tokens that are not stop words, in
    arbitrary order (duplicates are collapsed by the set operation).
    """
    unique_tokens = set(document_words)
    stop_set = set(stop_words)
    return list(unique_tokens.difference(stop_set))

In [7]:
# Function : stem_words
def stem_words(tokenized_words_without_stop_words):
    """
    Reduce every word to its stem with nltk's PorterStemmer.

    Argument:
    tokenized_words_without_stop_words -- list of words from which the
        stop words have already been removed.

    Returns:
    stemmed_words -- sorted list holding one stem per input word; two
        different words that share a stem produce that stem twice.
    """
    stemmer = PorterStemmer()
    return sorted(stemmer.stem(word) for word in tokenized_words_without_stop_words)

In [8]:
# Function : docid_write_to_file
def docid_write_to_file(output_file_name, doc_id, doc_name):
    """
    Append one "DOCID<TAB>document name" line to the docid output file.

    The file is opened in append mode, so lines written earlier (including
    those of a previous run) are kept.

    Argument:
    output_file_name -- directory path concatenated to output file name.
    doc_id -- the id assigned to the document; written via str(doc_id).
    doc_name -- the name of single text file to write in output file.

    Returns:
    Nothing
    """
    # The context manager closes the handle even when the write fails.
    with open(output_file_name, "a") as fh:
        fh.write(str(doc_id) + "\t" + doc_name + "\n")

In [9]:
# Function : termid_write_to_file
def termid_write_to_file(output_file_name, all_collection_of_words, all_collection_of_word_ids):
    """
    Append one "TERMID<TAB>term" line per term to the termid output file.

    The file is opened in append mode, so lines written earlier (including
    those of a previous run) are kept.

    Argument:
    output_file_name -- directory path concatenated to output file name.
    all_collection_of_words -- list of the terms (str) to write.
    all_collection_of_word_ids -- list of ids, where the id at position i
        belongs to the term at position i; ids beyond the number of terms
        are ignored.

    Returns:
    Nothing.

    Raises:
    IndexError -- if there are fewer ids than terms. The check happens
        before the file is opened, so no partial output is written.
    """
    if len(all_collection_of_word_ids) < len(all_collection_of_words):
        raise IndexError("fewer term ids than terms")

    with open(output_file_name, "a") as fh:
        fh.writelines(
            str(term_id) + "\t" + word + "\n"
            for word, term_id in zip(all_collection_of_words, all_collection_of_word_ids)
        )

In [10]:
# Function : clear_collection_from_repetition
def clear_collection_from_repetition(collection_of_all_words, stemmed_words):
    """
    Return the collection words that are missing from stemmed_words, and
    append every stemmed word to the collection.

    The difference is computed first, on the collection as it was passed
    in; the collection is extended afterwards.

    Argument:
    collection_of_all_words -- list of words gathered so far. It is
        modified in place: all of stemmed_words is appended to it,
        duplicates included.
    stemmed_words -- list of stemmed words of one document.

    Returns:
    list of the distinct words of the original collection that do not
    occur in stemmed_words, in arbitrary order.
    """
    only_in_collection = set(collection_of_all_words) - set(stemmed_words)
    collection_of_all_words.extend(stemmed_words)
    return list(only_in_collection)

In [11]:
# Function : tokenize_the_directory_path
def tokenize_the_directory_path(path):
    """
    Split a directory path into its components at every "/".

    Argument:
    path -- class 'str' type of path of file.

    Returns:
    list of the pieces between the slashes; a leading or trailing "/"
    produces an empty string at that end of the list.
    """
    slash = re.compile("[/]")
    return slash.split(path)

In [89]:
def find_all_files_in_directory(directory):
    """
    Collect the names of the files below a directory whose name does not
    contain ".txt".

    The directory is walked recursively with os.walk, but only the bare
    file name is stored, so the sub-folder a file was found in is not part
    of the result.

    Argument:
    directory -- path of the folder to scan.

    Returns:
    list of file names (without path), in the order os.walk yields them.
    """
    names = []
    for _root, _subdirectories, file_names in os.walk(directory):
        names.extend(name for name in file_names if '.txt' not in name)
    return names

# Step 2: Inverted Index

<font color=blue> term_index.txt - </font> An inverted index containing the file position for each occurrence of each term in the collection. Each line should contain the complete inverted list for a single term. Each line should contain a list of DOCID,POSITION values. Each line of this file should contain a TERMID followed by a space-separated list of properties as follows:
<font color=green> 347 1542 567 432,43 456,33 456,41 </font>
- 347: TERMID
- 1542: Total number of occurrences of the term in the entire corpus 
- 567: Total number of documents in which the term appears
- 432: Document Id in which term appears
- 43: Position of term in document 432

In order to support more efficient compression you must apply delta encoding to the inverted list. The first DOCID for a term and the first POSITION for a document will be stored normally. Subsequent values should be stored as the offset from the prior value.

Instead of encoding an inverted list like this: <font color=green>347 1542 567 432,43 456,33 456,41 </font>
- You should encode it like this:
<font color=green>347 1542 567 432,43 24,33 0,8  </font>
- <font color=red> Note that </font> in order to do this, your DOCIDs and POSITIONs must be sorted in ascending order.



In [90]:
def term_index_write_to_file(output_file_name, inverted_index_dictionary):
    """
    Append one line per term of the inverted index to the output file.

    Each line holds three tab-separated fields: the term id, the number of
    entries in its posting list, and the posting list itself written with
    str(), i.e. in Python list notation. No delta encoding is applied here.
    The file is opened in append mode, so existing lines are kept.

    Argument:
    output_file_name -- directory path concatenated to output file name.
    inverted_index_dictionary -- dict mapping a term id to its postings.

    Returns:
    Nothing.
    """
    # The context manager closes the handle even when a write fails.
    with open(output_file_name, "a") as fh:
        for term_id, postings in inverted_index_dictionary.items():
            fh.write(str(term_id) + "\t" + str(len(postings)) + "\t" + str(postings) + "\n")

In [91]:
def create_postings_for_unhashed_ii(list_of_doc_and_terms):
    """
    Group (doc_id, term_id) pairs into one posting list per term.

    Argument:
    list_of_doc_and_terms -- iterable of (doc_id, term_id) pairs; the same
        pair may occur more than once.

    Returns:
    dict mapping each term_id to the ascending list of the distinct
    doc_ids it was paired with. Terms appear in the order in which they
    are first met in the input.
    """
    doc_ids_by_term = {}
    for doc_id, term_id in list_of_doc_and_terms:
        # Accumulate under the pair's own term id; looking the history up
        # before the term id is known would drop every earlier document.
        doc_ids_by_term.setdefault(term_id, set()).add(doc_id)

    # Sorted output gives the ascending DOCID order an inverted list needs.
    return {term_id: sorted(doc_ids) for term_id, doc_ids in doc_ids_by_term.items()}

# Hashed Inverted Index

In [94]:
#build version 2.0
def create_hashed_inverted_index(sample_divisor=500):
    """
    Build an inverted index with dictionary (hash) lookups and write it out.

    Every processed document is lower-cased, tokenized, stripped of stop
    words and stemmed; each distinct stem becomes a term. Three files are
    appended to in the output folder:
    docid_hashed.txt -- "DOCID<TAB>file name", one line per document.
    termid_hashed.txt -- "TERMID<TAB>term", one line per term.
    term_index_hashed.txt -- one line per term with its posting list.

    DOCIDs and TERMIDs are handed out sequentially starting at 1, which
    guarantees that no two documents or terms share an id.

    NOTE: the writer functions open their files in append mode, so running
    this twice adds the new lines after the old ones.

    Argument:
    sample_divisor -- only the first int(len(files) / sample_divisor) files
        are indexed. The default of 500 keeps the original sampling; pass 1
        to index every file.

    Returns:
    Nothing. The path of each processed file, "Happy Ending" and the number
    of distinct terms are printed.

    Raises:
    ValueError -- if sample_divisor is not positive.
    """
    if sample_divisor <= 0:
        raise ValueError('sample_divisor must be positive.')

    # Get directory where all the document files are present.
    directory = get_directory_path("input")
    output_directory = get_directory_path("output")

    extension = ".txt"
    document_output_file_name = output_directory + "docid_hashed" + extension
    term_output_file_name = output_directory + "termid_hashed" + extension
    term_index_dot_txt = output_directory + "term_index_hashed" + extension

    # Load all stop words into a stop_list.
    stop_list_path = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt"
    stop_list = read_text_in_list_form(stop_list_path)

    files = find_all_files_in_directory(directory)
    files_to_index = files[:int(len(files) / sample_divisor)]

    term_id_by_word = dict()  # (key: stemmed term, value: its TERMID)
    hashed_ii = dict()  # (key: TERMID, value: ascending list of DOCIDs)

    for doc_id, file_name in enumerate(files_to_index, start=1):
        file_path = directory + file_name  # corpus file names carry no extension
        print(file_path)
        all_text_without_html_headers = remove_html_headers(file_path)  # str
        tokenized_words, _ = tokenize_the_text(all_text_without_html_headers)  # list
        tokenized_words_without_stop_words = remove_stop_words(tokenized_words, stop_list)  # list
        stemmed_tokens_of_words = stem_words(tokenized_words_without_stop_words)  # list

        for current_word in stemmed_tokens_of_words:
            term_id = term_id_by_word.get(current_word)
            if term_id is None:
                # First time this term is seen anywhere: give it the next id.
                term_id = len(term_id_by_word) + 1
                term_id_by_word[current_word] = term_id
                hashed_ii[term_id] = [doc_id]
            elif hashed_ii[term_id][-1] != doc_id:
                # DOCIDs only grow, so checking the last posting is enough
                # to keep the list free of duplicates and in ascending order.
                hashed_ii[term_id].append(doc_id)

        docid_write_to_file(document_output_file_name, doc_id, file_name)

    termid_write_to_file(term_output_file_name, list(term_id_by_word.keys()), list(term_id_by_word.values()))
    term_index_write_to_file(term_index_dot_txt, hashed_ii)

    print("Happy Ending")
    print(len(term_id_by_word))

To create the hashed inverted index, execute the following function.

In [93]:
# Run the hashed pipeline: it prints the path of every processed file and
# writes the docid/termid/term_index "*_hashed.txt" files to the output folder.
create_hashed_inverted_index()

/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/corpus/corpus/clueweb12-1202wb-26-10513
/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/corpus/corpus/clueweb12-1102wb-73-18046
/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/corpus/corpus/clueweb12-1505wb-68-30103
/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/corpus/corpus/clueweb12-0303wb-53-27200
/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/corpus/corpus/clueweb12-1905wb-44-08158
/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/corpus/corpus/clueweb12-1012wb-63-19337
Happy Ending
1738


# Unhashed Inverted Index

In [68]:
def create_unhashed_inverted_index():
    """
    Build a sort-based inverted index over every corpus file and write it out.

    Every document is lower-cased, tokenized, stripped of stop words and
    stemmed. One (doc_id, term_id) pair is recorded per stem and document;
    the pairs are sorted by term id and handed to
    create_postings_for_unhashed_ii to be grouped into posting lists.
    Three files are appended to in the output folder:
    docid_unhashed.txt -- "DOCID<TAB>file name", one line per document.
    termid_unhashed.txt -- "TERMID<TAB>term", one line per term.
    term_index_unhashed.txt -- one line per term with its posting list.

    DOCIDs and TERMIDs are handed out sequentially starting at 1, which
    guarantees that no two documents or terms share an id.

    NOTE: the writer functions open their files in append mode, so running
    this twice adds the new lines after the old ones.

    Returns:
    Nothing. "Happy Ending" and the number of distinct terms are printed.
    """
    # Get directory where all the document files are present.
    directory = get_directory_path("input")
    output_directory = get_directory_path("output")

    extension = ".txt"
    document_output_file_name = output_directory + "docid_unhashed" + extension
    term_output_file_name = output_directory + "termid_unhashed" + extension
    term_index_dot_txt = output_directory + "term_index_unhashed" + extension

    # Load all stop words into a stop_list.
    stop_list_path = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt"
    stop_list = read_text_in_list_form(stop_list_path)

    files = find_all_files_in_directory(directory)

    term_id_by_word = dict()  # (key: stemmed term, value: its TERMID)
    list_of_doc_and_terms = list()  # (doc_id, term_id) pairs

    for doc_id, file_name in enumerate(files, start=1):
        file_path = directory + file_name  # corpus file names carry no extension
        all_text_without_html_headers = remove_html_headers(file_path)  # str
        tokenized_words, _ = tokenize_the_text(all_text_without_html_headers)  # list
        tokenized_words_without_stop_words = remove_stop_words(tokenized_words, stop_list)  # list
        stemmed_tokens_of_words = stem_words(tokenized_words_without_stop_words)  # list

        for current_word in stemmed_tokens_of_words:
            # A term seen for the first time gets the next free id; a known
            # term keeps the id it was given before.
            term_id = term_id_by_word.setdefault(current_word, len(term_id_by_word) + 1)
            list_of_doc_and_terms.append((doc_id, term_id))

        docid_write_to_file(document_output_file_name, doc_id, file_name)

    # Sort on the basis of term ids so that the pairs of one term are adjacent.
    list_of_doc_and_terms.sort(key=operator.itemgetter(1))

    dict_for_termid_and_postings = create_postings_for_unhashed_ii(list_of_doc_and_terms)
    termid_write_to_file(term_output_file_name, list(term_id_by_word.keys()), list(term_id_by_word.values()))
    term_index_write_to_file(term_index_dot_txt, dict_for_termid_and_postings)

    print("Happy Ending")
    print(len(term_id_by_word))

To create the unhashed inverted index, execute the following function.

In [69]:
# Run the unhashed pipeline: it writes the docid/termid/term_index
# "*_unhashed.txt" files to the output folder.
create_unhashed_inverted_index()

Happy Ending
8781
