# Step 1: Tokenizing Documents

The first step in creating an index is tokenization. You must convert a document into a stream of tokens suitable for indexing.
Your tokenizer should follow these steps:
1. Accept a directory name as a command line argument, and process all files found in that directory.
2. Extract the document text with an HTML parsing library, ignoring the headers at the beginning of the file and all HTML tags.
3. Split the text into tokens. (You can use a regular-expression library for this; for an introduction to regular expressions, see http://www.rexegg.com/regex-quickstart.html.)
4. Convert all tokens to lowercase (this is not always ideal, but indexing intelligently in a case-sensitive manner is tricky).
5. Apply stop-wording to the document by ignoring any tokens found in this list <font color=blue>(\\Cactus \xeon\Maryam Bashir\Information Retrieval\stoplist)</font>.
6. Apply stemming to the document using any standard algorithm – <font color=green> Porter, Snowball, and KStem stemmers </font> are appropriate. You should use a stemming library for this step.
7. Your tokenizer will write two files:
- docids.txt – A file mapping a document's filename (without path) to a unique integer, its DOCID. Each line should be formatted with a DOCID and filename separated by a tab, as follows:<font color=blue> 1234\t32435</font>
- termids.txt – A file mapping a token found during tokenization to a unique integer, its TERMID. Each line should be formatted with a TERMID and token separated by a tab, as follows: <font color=blue> 567\tapple </font>

<font color=blue>  </font>

In [1]:
from bs4 import BeautifulSoup
import re
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
import random
import os
import operator
import numpy as np
#from sets import Set
#from html.parser import HTMLParser

In [2]:
def get_directory_path(mode):
    """
    Return the hard-coded folder used for reading or writing.
    It only returns a folder path (with trailing slash), never a file name.
    
    Argument:
    mode -- "input" for the folder which contains all the text files,
            "output" for the folder where the generated files are written.
    
    Returns:
    dp -- directory path for the requested mode.
    
    Raises:
    ValueError -- if mode is neither "input" nor "output".
    """
    if mode == "input":
        dp = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/corpus/corpus/"
    elif mode == "output":
        dp = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/hw1/out/"
    else:
        # Fail loudly on a typo instead of handing back an unusable path.
        raise ValueError('Unspecified mode.')

    return dp

In [3]:
# Function : read_text_in_list_form (used to load the stop list)
def read_text_in_list_form(file_path):
    """
    This function reads a text file and returns its lines as a list,
    one entry per line (used to load the stop words file).
    
    Argument:
    file_path -- path should be like: path + file name.extension
        "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt".
    
    Returns:
    lst -- list of the lines of the file with the trailing newline removed.
    """
    # Use a context manager so the handle is closed even if reading fails.
    with open(file_path) as fin:
        lst = [line.rstrip('\n') for line in fin]
    return lst

In [4]:
# Function = remove_html_headers
def remove_html_headers(file_path):
    """
    This function takes in path of file + file name and opens it.
    Then it strips the HTML markup and returns the lowercased plain text.
    
    Argument:
    file_path -- concatenated: path folder + file name
    
    Returns:
    text -- the lowercased text content of the document, of str type.
    """
    # Undecodable bytes are dropped (errors='ignore') rather than aborting the run.
    with open(file_path, encoding='utf-8', errors='ignore') as fin:
        prettified = BeautifulSoup(fin, 'html.parser').prettify()

    # The prettified markup is parsed a second time before extracting the text
    # (presumably so text from adjacent tags lands on separate lines -- confirm).
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and warns about the guess, so the
    # extracted text could differ between machines.
    text = BeautifulSoup(prettified, 'html.parser').text
    return text.lower()

In [5]:
# version 2.0: removed #splitText = list(set(splitText))
# FUNCTION: tokenize_the_text
def tokenize_the_text(raw_text):
    """
    This function converts raw text into a list of tokens.
    
    A token is a maximal run of the letters a-z; every other character
    (digits, punctuation, whitespace, uppercase letters) acts as a separator.
    The text is not lowercased here, so pass already-lowercased text.
    Tokens are kept in order of appearance and duplicates are preserved.
    
    Argument:
    raw_text -- class 'str' type.
    
    Returns:
    splitText -- list of tokens with more than 3 characters.
    len(splitText) -- number of tokens in splitText.
    """
    # Filter the regex matches directly; the matches only contain a-z, so
    # stringifying the list and re-splitting it on \W+ added nothing.
    splitText = [word for word in re.findall(r"[a-z]+", raw_text) if len(word) > 3]
    return splitText, len(splitText)

In [6]:
# Function : remove_stop_words
def remove_stop_words(document_words, stop_words):
    """
    This function removes the stop_list words from the tokens of a document.
    Stop_words are those words which occur in abundance in text.
    
    The remaining tokens keep their original order and every repeated
    occurrence is preserved. A set difference would collapse duplicates and
    return the words in arbitrary order, losing the occurrence counts and
    positions that the index is built from.
    
    Argument:
    document_words -- list of all the tokens/words extracted from document.
    stop_words -- list of all the stop_list words extracted from file.
    
    Returns:
    cleaned -- list of all document tokens which are not stop words.
    """
    # Build the lookup set once so each membership test is O(1).
    stop_set = set(stop_words)
    cleaned_tokens_from_stop_words = [word for word in document_words if word not in stop_set]
    return cleaned_tokens_from_stop_words

In [7]:
# Function : stem_words
def stem_words(tokenized_words_without_stop_words):
    """
    Stem every token with NLTK's PorterStemmer and return the stems sorted.
    
    Argument:
    tokenized_words_without_stop_words -- list of tokens from which the
        stop words have already been removed.
    
    Returns:
    list of the stemmed tokens, one per input token, in ascending sorted order.
    """
    stemmer = PorterStemmer()
    # One stem per input token; the result is ordered alphabetically,
    # not by the tokens' position in the input.
    return sorted(stemmer.stem(token) for token in tokenized_words_without_stop_words)

In [8]:
# Function : docid_write_to_file
def docid_write_to_file(output_file_name, doc_id, doc_name, doc_length):
    """
    This function takes in name of output file along with it's directory
    and appends one line describing a document to it, formatted as
    doc_id <TAB> doc_name <TAB> doc_length.
    
    Argument:
    output_file_name -- directory path concatenated to output file name.
    doc_id -- identifier of the document, supplied by the caller.
    doc_name -- the name of single text file to write in output file.
    doc_length -- length recorded for the document.
    
    Returns:
    Nothing
    """
    # The id comes from the caller; no random number is generated here.
    line = "{}\t{}\t{}\n".format(doc_id, doc_name, doc_length)
    # Append mode: each call adds one line; the context manager guarantees
    # the handle is closed even if the write fails.
    with open(output_file_name, "a+") as fh:
        fh.write(line)

In [9]:
# Function : termid_write_to_file
def termid_write_to_file(output_file_name, all_collection_of_words, all_collection_of_word_ids):
    """
    This function takes in name of output file along with it's directory,
    opens it in append mode and writes one line per word, formatted as
    term_id <TAB> word.
    
    Argument:
    output_file_name -- directory path concatenated to output file name.
    all_collection_of_words -- list of all the words to write.
    all_collection_of_word_ids -- list of ids, parallel to the words list
        (the id at index i belongs to the word at index i).
    
    Returns:
    Nothing.
    
    Raises:
    IndexError -- if there are fewer ids than words.
    """
    # The context manager closes the file even if a lookup or write raises.
    with open(output_file_name, "a+") as fh:
        for position, word in enumerate(all_collection_of_words):
            fh.write("{}\t{}\n".format(all_collection_of_word_ids[position], word))

In [10]:
# Function : tokenize_the_directory_path
def tokenize_the_directory_path(path):
    """
    Break a path string into its components at every forward slash.
    
    Argument:
    path -- class 'str' type, the path to split.
    
    Returns:
    list of the pieces between slashes; a leading, trailing or doubled
    slash yields an empty string at that place.
    """
    components = path.split("/")
    return components

In [11]:
def find_all_files_in_directory(directory):
    """
    Collect the names of all files found under a directory tree.
    
    Argument:
    directory -- folder to walk; sub-folders are visited as well.
    
    Returns:
    list of bare file names (no folder part) in the order os.walk yields
    them. Files named ".DS_Store" are left out; a newline is printed for
    each one encountered.
    """
    collected = []
    for _root, _subdirs, names in os.walk(directory):
        for name in names:
            if name == ".DS_Store":
                # macOS metadata file: not part of the corpus.
                print("\n")
                continue
            collected.append(name)
    return collected

# Step 2: Inverted Index

<font color=blue> term_index.txt - </font> An inverted index containing the file position for each occurrence of each term in the collection. Each line should contain the complete inverted list for a single term. Each line should contain a list of DOCID,POSITION values. Each line of this file should contain a TERMID followed by a space-separated list of properties as follows:
<font color=green> 347 1542 567 432,43 456,33 456,41 </font>
- 347: TERMID
- 1542: Total number of occurrences of the term in the entire corpus 
- 567: Total number of documents in which the term appears
- 432: Document Id in which term appears
- 43: Position of term in document 432

In order to support more efficient compression you must apply delta encoding to the inverted list. The first DOCID for a term and the first POSITION for a document will be stored normally. Subsequent values should be stored as the offset from the prior value.

Instead of encoding an inverted list like this: <font color=green>347 1542 567 432,43 456,33 456,41 </font>
- You should encode it like this:
<font color=green>347 1542 567 432,43 24,33 0,8  </font>
- <font color=red> Note that </font> in order to do this, your DOCIDs and POSITIONs must be sorted in ascending order.



In [12]:
def term_index_write_to_file(output_file_name, inverted_index_dictionary):
    """
    Append the inverted index to a text file.
    
    For every term the term id is written once, followed by one line per
    document formatted as <TAB> doc_id <TAB> count <TAB> positions, where
    count is len(positions) and positions is written with str(). Only the
    first document line of a term therefore starts with the term id; the
    following ones start with a tab.
    
    Argument:
    output_file_name -- directory path concatenated to output file name.
    inverted_index_dictionary -- dict of the form {term_id: {doc_id: positions}}.
    
    Returns:
    Nothing.
    """
    # The context manager closes the file even if a write raises.
    with open(output_file_name, "a+") as fh:
        for term_id, postings in inverted_index_dictionary.items():
            fh.write(str(term_id))
            for doc_id, positions in postings.items():
                             # doc_id  # count  # positions
                fh.write("\t{}\t{}\t{}\n".format(doc_id, len(positions), positions))

In [13]:
def create_postings_for_unhashed_ii(zip_of_doc_and_terms):
    """
    Build a dictionary mapping each term_id to a list of doc_ids.

    Argument:
    zip_of_doc_and_terms -- iterable of tuples; the code reads tup[1] as the
        term_id and treats the elements at odd 1-based positions as doc_ids,
        i.e. it presumably expects (doc_id, term_id) pairs -- confirm with caller.

    Returns:
    my_dict -- dict of the form {term_id: list of doc_ids}.

    NOTE(review): for (doc_id, term_id) pairs the even-position branch stores
    `temp` under the term_id, replacing the de-duplicated list that was stored
    just before it. The final list therefore keeps repeated doc_ids in order
    of arrival. Confirm whether unique doc_ids were intended.
    """
    my_dict = dict()
    # One element is a tuple of doc_id and term_id.
    for tup in zip_of_doc_and_terms:
        temp = list()
        term_id = tup[1]
        # 1-based position of `curr` inside the tuple.
        i = 1
        # Walk through the elements of this tuple.
        for curr in tup:
            # Odd position: curr is handled as a doc_id and added to the
            # list kept for term_id.
            if(np.mod(i,2) != 0): # thus here we will encounter doc_id
                # First time this term_id is seen: start a new list.
                #if (my_dict.get(key,0) == 0):
                if term_id not in my_dict:
                    temp = list()
                    temp.append(curr)
                    my_dict[term_id] = list(set(temp))
                else:
                    # Copy the doc_ids stored so far, then add the new one.
                    history = my_dict[term_id]
                    for e in range(0, len(history)):
                        temp.append(history[e])
                    temp.append(curr)
                    my_dict[term_id] = list(set(temp))
            # Even position: curr is handled as a term_id. It becomes the
            # key, and `temp` (not the de-duplicated copy) is stored under it.
            elif(np.mod(i,2) == 0): # key = term_id
                term_id = curr
                my_dict[term_id] = temp
            i+=1
            
    return my_dict

In [14]:
# Utility function: for document_postings purpose
#
# " output" = term_id "\t" len(document_postings) "\t" document_postings
#
def create_doc_length_txt(output_file_name, hashed_ii2):
    """
    Append one line per key of a dictionary to a text file, formatted as
    key <TAB> len(value) <TAB> str(value).
    
    Argument:
    output_file_name -- directory path concatenated to output file name.
    hashed_ii2 -- dict whose values support len(), e.g. {term_id: doc_ids}.
    
    Returns:
    Nothing.
    """
    # The context manager closes the file even if a write raises.
    with open(output_file_name, "a+") as fh:
        for key, postings in hashed_ii2.items():
            fh.write("{}\t{}\t{}\n".format(key, len(postings), postings))

In [15]:
def delta_encoding(output_file_name, hashed_ii):
    """
    Append the inverted index to a text file with delta-encoded positions.
    
    For every term the term id is written once, followed by one line per
    document formatted as <TAB> doc_id <TAB> count <TAB> deltas. The first
    position of a document is stored as-is and every later one as the
    difference from the position before it. Only positions are encoded this
    way; doc_ids are written unchanged.
    
    Argument:
    output_file_name -- directory path concatenated to output file name.
    hashed_ii -- dict of the form {term_id: {doc_id: positions}}; positions
        must be convertible with int(). They are expected to be in ascending
        order, otherwise the stored differences are negative.
    
    Returns:
    Nothing.
    """
    # The context manager closes the file even if a write raises.
    with open(output_file_name, "a+") as fh:
        for term_id, doc_dicti in hashed_ii.items():
            fh.write(str(term_id))
            for doc_id, positions in doc_dicti.items():
                values = [int(position) for position in positions]
                # Keep the first value, then store each gap to its predecessor.
                deltas = values[:1] + [cur - prev for prev, cur in zip(values, values[1:])]
                fh.write("\t{}\t{}\t{}\n".format(doc_id, len(positions), deltas))

# Hashed Inverted Index

In [16]:
# # build version 2.0 #list with dict #WOrking
# # Only puts documents in which that term_id exists
# def create_hashed_inverted_index():
#     # Get directory where all the .txt files are present.
#     directory = get_directory_path("input")
#     output_directory = get_directory_path("output")
    
#     extension = ".txt"
#     document_output_file_name = output_directory + "docid_hashed" + extension
#     term_output_file_name = output_directory + "termid_hashed" + extension
    
#     term_index_dot_txt = get_directory_path("output") + "term_index_hashed" + extension
    
#     # Load all stop words into a stop_list.
#     stop_list_path = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt"
#     stop_list = read_text_in_list_form(stop_list_path)
    
#     files = find_all_files_in_directory(directory)
#     hashed_ii = dict() ## (key:terms_id, )
#     all_collection_of_words = list()
#     all_collection_of_word_ids = list()
#     dict_for_docs = dict() ## (key: doc_id, value:terms_found_in_this_doc)
    
#     ## One doc file is opened.
#     for i in range(0,int(len(files))):
#         file_path = directory + files[i] #+ extension # no need for extension in both cases of file
#         #print(file_path)
#         all_text_without_html_headers = remove_html_headers(file_path) #str
#         tokenized_words, count_of_words = tokenize_the_text(all_text_without_html_headers) #list
#         tokenized_words_without_stop_words = remove_stop_words(tokenized_words, stop_list) #list
#         stemmed_tokens_of_words = stem_words(tokenized_words_without_stop_words) #list 
        
#         doc_id = random.randint(1,8000)
        
#         word_id_postings_list = list()
#         ## Append all words of a doc to list
#         for w in range(0,len(stemmed_tokens_of_words)):     
#             current_word = stemmed_tokens_of_words[w]
            
#             term_id = random.randint(1,1000000)
#             while term_id in hashed_ii.keys():
#                 term_id = random.randint(1,1000000)
                
#             ## If that word already present
#             if (all_collection_of_words.count(current_word)) != 0:
#                 first_occurance = all_collection_of_words.index(current_word)
#                 term_id = all_collection_of_word_ids[first_occurance]
                
#                 lst = list()
#                 history = hashed_ii[term_id] # hashed_ii[str(term_id)]
                
#                 for e in range(0, len(history)):
#                     lst.append(history[e])
#                 lst.append(doc_id)
                
#                 lst = list(set(lst))
#                 hashed_ii.update({term_id :lst})
#             else:
#                 ## already not present that word
#                 ## new place created in dictionary
#                 lst = list()
#                 lst.append((doc_id))
#                 hashed_ii[str(term_id)] = lst
                
#                 all_collection_of_words.append(current_word)
#                 all_collection_of_word_ids.append(str(term_id))
#         docid_write_to_file(document_output_file_name,doc_id,files[i])
    
#     termid_write_to_file(term_output_file_name, all_collection_of_words, all_collection_of_word_ids)
#     term_index_write_to_file(term_index_dot_txt, hashed_ii)
    
#     print("Happy Ending")
#     print(len(all_collection_of_words))

In [17]:
# #build version 2.1 #dictionary within dictionary
# def create_hashed_inverted_index():
#     # Get directory where all the .txt files are present.
#     directory = get_directory_path("input")
#     output_directory = get_directory_path("output")
    
#     extension = ".txt"
#     document_output_file_name = output_directory + "docid_hashed" + extension
#     term_output_file_name = output_directory + "termid_hashed" + extension
    
#     term_index_dot_txt = get_directory_path("output") + "term_index_hashed" + extension
    
#     # Load all stop words into a stop_list.
#     stop_list_path = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt"
#     stop_list = read_text_in_list_form(stop_list_path)
    
#     files = find_all_files_in_directory(directory)
#     hashed_ii = dict() ## (key:terms_id, )
#     all_collection_of_words = list()
#     all_collection_of_word_ids = list()
#     dict_for_docs = dict() ## (key: doc_id, value:terms_found_in_this_doc)
#     history = dict()
    
#     ## One doc file is opened.
#     for i in range(0,int(len(files)/500)):
#         file_path = directory + files[i] #+ extension # no need for extension in both cases of file
#         #print(file_path)
#         all_text_without_html_headers = remove_html_headers(file_path) #str
#         tokenized_words, count_of_words = tokenize_the_text(all_text_without_html_headers) #list
#         tokenized_words_without_stop_words = remove_stop_words(tokenized_words, stop_list) #list
#         stemmed_tokens_of_words = stem_words(tokenized_words_without_stop_words) #list 
        
#         doc_id = random.randint(1,8000)
        
#         word_id_postings_list = list()
#         doc_and_position_container = dict()
        
#         ## Append all words of a doc to list
#         for w in range(0,len(stemmed_tokens_of_words)):     
#             current_word = stemmed_tokens_of_words[w]
            
#             term_id = random.randint(1,100000)
#             while term_id in hashed_ii.keys():
#                 term_id = random.randint(1,100000)
                
#             ## If that word already present
#             if (all_collection_of_words.count(current_word)) != 0:
#                 first_occurance = all_collection_of_words.index(current_word)
#                 term_id = all_collection_of_word_ids[first_occurance]
                
#                 # Now history will be of type = dict()
#                 #if str(doc_id) in 
#                 history = hashed_ii[str(term_id)]
#                 lst = list()
#                 print("history = ",history)
#                 already_existed_positions = (history[str(doc_id)])
#                 #print("type = ",type(already_existed_positions))
#                 print("already_existed_positions = ", already_existed_positions)
                
#                 for e in range(0, len(already_existed_positions)):
#                         lst.append(already_existed_positions[e])
#                 #lst.append(already_existed_positions)
#                 lst.append(stemmed_tokens_of_words.index(current_word))
#                 print("lst = ", lst)
#                 history.update({str(doc_id) : lst})
#                 hashed_ii.update({str(term_id) : history})
#             else:
#                 ## already not present that word
#                 ## new place created in dictionary
#                 # key -> doc_id  par positions as values rakh di
#                 #lst = list()
#                 #lst.append(stemmed_tokens_of_words.index(current_word))
#                 hashed_ii[str(term_id)] = dict()
#                 lsr = list()
                
#                 if str(doc_id) in doc_and_position_container:
#                     temp = doc_and_position_container[str(doc_id)]
#                     for e in range(0, len(doc_and_position_container)):
#                         lsr.append(temp[e])
#                 lsr.append(stemmed_tokens_of_words.index(current_word))
#                 doc_and_position_container[str(doc_id)] = list() #stemmed_tokens_of_words.index(current_word) 
#                 doc_and_position_container[str(doc_id)] = lsr

#                 #print("doc_and_position_container = ", doc_and_position_container)
                
#                 hashed_ii[str(term_id)] = doc_and_position_container
#                 #print("hashed_ii = ", hashed_ii)
                
#                 all_collection_of_words.append(current_word)
#                 all_collection_of_word_ids.append(str(term_id))
#         docid_write_to_file(document_output_file_name,doc_id,files[i])
    
#     termid_write_to_file(term_output_file_name, all_collection_of_words, all_collection_of_word_ids)
#     term_index_write_to_file(term_index_dot_txt, hashed_ii)
    
#     print("Happy Ending")
#     print(len(all_collection_of_words))

In [18]:
# #build version 2.2 #dictionary within dictionary #WORKING
# def create_hashed_inverted_index():
#     # Get directory where all the .txt files are present.
#     directory = get_directory_path("input")
#     output_directory = get_directory_path("output")
    
#     extension = ".txt"
#     document_output_file_name = output_directory + "docid_hashed" + extension
#     term_output_file_name = output_directory + "termid_hashed" + extension
#     term_index_dot_txt = get_directory_path("output") + "term_index_hashed_v4.0" + extension
    
#     # Load all stop words into a stop_list.
#     stop_list_path = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt"
#     stop_list = read_text_in_list_form(stop_list_path)
    
#     files = find_all_files_in_directory(directory)
#     hashed_ii = dict() ## (key:terms_id, )
#     all_collection_of_words = list()
#     all_collection_of_word_ids = list()
#     dict_for_docs = dict() ## (key: doc_id, value:terms_found_in_this_doc)
#     history = dict()
    
#     ## One doc file is opened.
#     for i in range(0,int(len(files)/200)):
#         #print(files[i])
#         #print("\n", i)
#         file_path = directory + files[i] #+ extension # no need for extension in both cases of file
#         #print(file_path)
#         all_text_without_html_headers = remove_html_headers(file_path) #str
#         tokenized_words, count_of_words = tokenize_the_text(all_text_without_html_headers) #list
#         tokenized_words_without_stop_words = remove_stop_words(tokenized_words, stop_list) #list
#         stemmed_tokens_of_words = stem_words(tokenized_words_without_stop_words) #list 
        
#         doc_id = random.randint(1,8000)
#         doc_and_position_container = dict()

#         ## Append all words of a doc to list
        
#         for w in range(0,len(stemmed_tokens_of_words)):     
#             current_word = stemmed_tokens_of_words[w]
#             term_id = random.randint(1,1000000)
#             while term_id in hashed_ii.keys():
#                 term_id = random.randint(1,100000)
                
#             ## If that word already present
#             if (all_collection_of_words.count(current_word)) != 0:
#                 first_occurance = all_collection_of_words.index(current_word)
#                 term_id = all_collection_of_word_ids[first_occurance]
#                 history = dict()
#                 history = hashed_ii[str(term_id)] 
                
#                 for d_id, positions_list in history.items():
#                     #if d_id == str(doc_id):
#                         already_existed_positions = list()
#                         already_existed_positions = positions_list #(history[d_id])
#                     #print("positions_list ", positions_list)
#                     #lst = list()
#                     #for e in range(0, len(already_existed_positions)):
#                     #    lst.append(already_existed_positions[e])
#                         already_existed_positions.append(stemmed_tokens_of_words.index(current_word,w,len(stemmed_tokens_of_words)))
#                         #print("already_existed = ",d_id, " ", already_existed_positions)
#                         #print(d_id,term_id, current_word)
#                     #lst.append(stemmed_tokens_of_words.index(current_word,w,len(stemmed_tokens_of_words)))
#                     #history[kys] = lst
#                         #history.update({d_id : already_existed_positions})
#                         history[str(d_id)] = already_existed_positions
#                     #lst = list(set(lst))
#                 #hashed_ii[str(term_id)] = history
#                 #history.update({(doc_id) : lst})
#                         hashed_ii[str(term_id)] = history
#                         #hashed_ii.update({str(term_id) : history})  # <--- ye line dekh zara
#                         #hashed_ii.update(history)
#                 #print("nai idher, if mein")
#             ## If already not present that word
#             else:
#                 all_collection_of_words.append(current_word)
#                 all_collection_of_word_ids.append(str(term_id))
#                 lsr = list()
#                 position = stemmed_tokens_of_words.index(current_word) 
#                 #print("positions = " , position)
#                 lsr.append(position)
#                 doc_and_position_container = dict()
#                 doc_and_position_container[str(doc_id)] = lsr
#                 hashed_ii[str(term_id)] = dict()
#                 #hashed_ii[str(term_id)] = doc_and_position_container
#                 hashed_ii.update({str(term_id) : doc_and_position_container})
#                 #print("\nKia Ider")
#                 #print("hashed_ii = ", hashed_ii)
#         docid_write_to_file(document_output_file_name,doc_id,files[i])
#     print("hashed_ii = ", hashed_ii)
#     termid_write_to_file(term_output_file_name, all_collection_of_words, all_collection_of_word_ids)
#     term_index_write_to_file(term_index_dot_txt, hashed_ii)
    
#     print("Happy Ending")
#     print(len(all_collection_of_words))

In [19]:
# #build version 2.3 #dictionary within dictionary #Correct Output
# def create_hashed_inverted_index():
#     # Get directory where all the .txt files are present.
#     directory = get_directory_path("input")
#     output_directory = get_directory_path("output")
    
#     extension = ".txt"
#     document_output_file_name = output_directory + "docid_hashed" + extension
#     term_output_file_name = output_directory + "termid_hashed" + extension
#     term_index_dot_txt = get_directory_path("output") + "term_index_hashed_v5.0" + extension
#     document_frequency_file = get_directory_path("output") + "document_postings" + extension
#     #document_length = get_directory_path("output") + "document_length" + extension
    
#     # Load all stop words into a stop_list.
#     stop_list_path = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt"
#     stop_list = read_text_in_list_form(stop_list_path)
    
#     files = find_all_files_in_directory(directory)
#     hashed_ii = dict() ## (key:{terms_id: {doc_id: positions}} )
#     hashed_ii2 = dict()
#     all_collection_of_words = list()
#     all_collection_of_word_ids = list()
#     dict_for_docs = dict() ## (key: term_id, value:docs_list)
#     history = dict()
    
#     ## One doc file is opened.
#     for i in range(0,int(len(files)/200)):
#         #print(files[i])
#         #print("\n", i)
#         file_path = directory + files[i] #+ extension # no need for extension in both cases of file
#         #print(file_path)
#         all_text_without_html_headers = remove_html_headers(file_path) #str
#         tokenized_words, count_of_words = tokenize_the_text(all_text_without_html_headers) #list
#         tokenized_words_without_stop_words = remove_stop_words(tokenized_words, stop_list) #list
#         stemmed_tokens_of_words = stem_words(tokenized_words_without_stop_words) #list 
        
#         doc_id = random.randint(1,8000)
#         doc_and_position_container = dict()

#         ## Append all words of a doc to list
        
#         for w in range(0,len(stemmed_tokens_of_words)):     
#             current_word = stemmed_tokens_of_words[w]
            
#             term_id = random.randint(1,1000000)
#             while term_id in hashed_ii.keys():
#                 term_id = random.randint(1,1000000)
#             doc_and_position_container = dict()
#             ## If that word already present
#             if (all_collection_of_words.count(current_word)) != 0:
#                 first_occurance = all_collection_of_words.index(current_word)
#                 term_id = all_collection_of_word_ids[first_occurance]
                
#                 ####### This part for assign 2: for term and it's documents in which ot exists. ######
#                 # In short, later used for doc_length
#                 lst2 = list()
#                 history2 = hashed_ii2[str(term_id)]
#                 for e in range(0, len(history2)):
#                     lst2.append(history2[e])
#                 lst2.append(doc_id)
                
#                 lst2 = list(set(lst2))
#                 hashed_ii2.update({term_id :lst2})
#                 ########################################################################
                
#                 #- new dict bna k uske ander new entry kron new doc_id ki
#                 #- yahan per ye bhe dekha jae k kia wo doc_id mein pehli mil gya hai (word repeat zror 
#                 #magar new document mein )
#                 #- aur agar usi same document mein hai tou, matlab doc_id as key already mujood hai.
#                 history = dict()
#                 history = hashed_ii[str(term_id)]
                
#                 ### -> Word repeat tou lazmi huva hai.
                
#                 # Case: Agar usi document mein dobara word mil gaya hai, tou wo doc_id tou pehle se mujood hoga
#                 if str(doc_id) in history:
#                     for d_id, positions in history.items():
#                         if d_id == str(doc_id):
#                             positions_list = history[str(d_id)]
#                             already_existed_positions = list()
#                             already_existed_positions = positions_list #(history[d_id])
#                             already_existed_positions.append(stemmed_tokens_of_words.index(current_word,w,len(stemmed_tokens_of_words)))
#                             #print("Before updating HISTORY", history)
#                             history.update({(d_id) :  already_existed_positions})
#                             #print("After updating HISTORY", history)
#                             #print("history = ", history)

#                             #print("Before updating hashed_ii", hashed_ii)
#                             hashed_ii.update({str(term_id): history})
#                             #print("After updating hashed_ii", hashed_ii)
#                 else: #agar naya doc aya hai tou, new doc ki key bnani paregi.
#                     # naya doc hai, tou new key bnani paregi doc_id ki history ki dict mein
#                     first_time_position_found_in_new_doc = list()
#                     first_time_position_found_in_new_doc.append(stemmed_tokens_of_words.index(current_word,w,len(stemmed_tokens_of_words)))
#                     history[str(doc_id)] = first_time_position_found_in_new_doc
#                     hashed_ii.update({str(term_id): history})
#                     #history.update({(d_id) :  first_time_position_found_in_new_doc})
#             ## If already NOT present that word
#             else:
#                 all_collection_of_words.append(current_word)
#                 all_collection_of_word_ids.append(str(term_id))
                
#                 ####### This part: for term and it's documents in which ot exists. ####
#                 lst2 = list()
#                 lst2.append((doc_id))
#                 hashed_ii2[str(term_id)] = lst2
#                 ####################################################################
                
#                 lsr = list()
#                 position = stemmed_tokens_of_words.index(current_word) 
#                 #print("positions = " , position)
#                 lsr.append(position)
#                 doc_and_position_container = dict()
#                 # usi same doc_id k ander pos change ki ja raha ha
#                 doc_and_position_container[str(doc_id)] = lsr
#                 #print("doc_and_position_container = ", doc_and_position_container)
#                 #hashed_ii[str(term_id)] = dict()
#                 #print("Before update", hashed_ii)
#                 hashed_ii.update({str(term_id) : doc_and_position_container})
#                 #print("\nAfter update", hashed_ii)
#         docid_write_to_file(document_output_file_name, doc_id, files[i], len(stemmed_tokens_of_words))
#     #print("hashed_ii = ", hashed_ii)
#     termid_write_to_file(term_output_file_name, all_collection_of_words, all_collection_of_word_ids)
#     create_doc_length_txt(document_frequency_file, hashed_ii2)
#     term_index_write_to_file(term_index_dot_txt, hashed_ii)
    
#     print("Happy Ending")
#     print(len(all_collection_of_words))

In [27]:
#build version 2.4 #dictionary within dictionary #Correct Output
# build info: added delta encoding
# 
def create_hashed_inverted_index():
    """
    Build the hashed (dictionary based) positional inverted index and write it to disk.

    Every file found in the input directory is passed through
    remove_html_headers -> tokenize_the_text -> remove_stop_words -> stem_words.
    The stemmed tokens are folded into two dictionaries:

    hashed_ii  : {term_id (str): {doc_id (str): [positions]}}
                 where a position is the index of the token inside the
                 stemmed token list of that document.
    hashed_ii2 : {term_id (str): [doc_id (int), ...]}
                 each document is listed once, in the order it was first seen.

    Output is produced through the helper writers defined elsewhere in this
    notebook (docid_write_to_file, termid_write_to_file, create_doc_length_txt,
    term_index_write_to_file, delta_encoding).

    It takes no arguments and returns nothing.
    """
    # Get directory where all the .txt files are present.
    directory = get_directory_path("input")
    output_directory = get_directory_path("output")

    extension = ".txt"
    document_output_file_name = output_directory + "docid_hashed" + extension
    term_output_file_name = output_directory + "termid_hashed" + extension
    term_index_dot_txt = get_directory_path("output") + "term_index_hashed_v5.0" + extension
    document_postings_file = get_directory_path("output") + "document_postings" + extension
    delta_encoding_file = get_directory_path("output") + "delta_encoding" + extension

    # Load all stop words into a stop_list.
    stop_list_path = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt"
    stop_list = read_text_in_list_form(stop_list_path)

    files = find_all_files_in_directory(directory)

    hashed_ii = dict()   ## {term_id: {doc_id: positions}}
    hashed_ii2 = dict()  ## {term_id: doc_ids}
    all_collection_of_words = list()     # unique words, in first-seen order
    all_collection_of_word_ids = list()  # their term ids (str), same order
    term_id_of_word = dict()             # word -> term id (str), O(1) vocabulary lookup

    # Draw all DOCIDs up front without replacement so that two documents can
    # never share an id.  The range grows past 8000 only when there are more
    # files than that.
    doc_ids = random.sample(range(1, max(8000, len(files)) + 1), len(files))

    ## One doc file is opened per iteration.
    for i, file_name in enumerate(files):
        print(i, " = ", file_name)
        file_path = directory + file_name  # no need for extension in both cases of file
        all_text_without_html_headers = remove_html_headers(file_path)  # str
        tokenized_words, _ = tokenize_the_text(all_text_without_html_headers)  # list
        tokenized_words_without_stop_words = remove_stop_words(tokenized_words, stop_list)  # list
        stemmed_tokens_of_words = stem_words(tokenized_words_without_stop_words)  # list

        doc_id = doc_ids[i]
        doc_key = str(doc_id)

        for position, current_word in enumerate(stemmed_tokens_of_words):
            term_id = term_id_of_word.get(current_word)

            ## Word seen for the first time in the whole collection.
            if term_id is None:
                # Keys of hashed_ii are strings, so the candidate has to be
                # compared as a string for the collision check to work.
                candidate = random.randint(1, 1000000)
                while str(candidate) in hashed_ii:
                    candidate = random.randint(1, 1000000)
                term_id = str(candidate)

                term_id_of_word[current_word] = term_id
                all_collection_of_words.append(current_word)
                all_collection_of_word_ids.append(term_id)
                hashed_ii[term_id] = dict()
                hashed_ii2[term_id] = list()

            ####### Term -> documents it occurs in (later used for doc_length). ######
            # Documents are processed one after another, so the current doc_id
            # can only already be present as the last element of the list.
            docs_with_term = hashed_ii2[term_id]
            if not docs_with_term or docs_with_term[-1] != doc_id:
                docs_with_term.append(doc_id)

            ####### Term -> {document: positions}. ######
            # Creates the per-document position list on first sight of the
            # term in this document, otherwise extends the existing one.
            hashed_ii[term_id].setdefault(doc_key, list()).append(position)

        docid_write_to_file(document_output_file_name, doc_id, file_name, len(stemmed_tokens_of_words))

    termid_write_to_file(term_output_file_name, all_collection_of_words, all_collection_of_word_ids)
    create_doc_length_txt(document_postings_file, hashed_ii2)
    term_index_write_to_file(term_index_dot_txt, hashed_ii)
    delta_encoding(delta_encoding_file, hashed_ii)

    print("Happy Ending")
    print(len(all_collection_of_words))

To create the hashed inverted index, execute the following function.

In [28]:
# Run the hashed index build; it prints one "<i>  =  <file name>" line per file processed.
create_hashed_inverted_index()



0  =  clueweb12-1202wb-26-10513
1  =  clueweb12-1102wb-73-18046
2  =  clueweb12-1505wb-68-30103
3  =  clueweb12-0303wb-53-27200
4  =  clueweb12-1905wb-44-08158
5  =  clueweb12-1012wb-63-19337
6  =  clueweb12-1118wb-77-23080
7  =  clueweb12-0800tw-39-05237
8  =  clueweb12-0211wb-75-04122
9  =  clueweb12-0001wb-96-10862
10  =  clueweb12-0200wb-25-11228
11  =  clueweb12-1905wb-14-19033
12  =  clueweb12-1302wb-14-07756
13  =  clueweb12-1101wb-78-26737
14  =  clueweb12-1705wb-99-01272
15  =  clueweb12-1504wb-72-14179
16  =  clueweb12-1002wb-18-28831
17  =  clueweb12-0005wb-98-11303
18  =  clueweb12-1007wb-24-34046
19  =  clueweb12-0003wb-22-29862
20  =  clueweb12-0715wb-83-27742
21  =  clueweb12-0001wb-67-37051
22  =  clueweb12-1710wb-89-11471
23  =  clueweb12-0207wb-46-06884
24  =  clueweb12-1202wb-83-23274
25  =  clueweb12-0511wb-25-03910
26  =  clueweb12-0302wb-96-19667
27  =  clueweb12-1103wb-44-28062
28  =  clueweb12-1609wb-30-05777
29  =  clueweb12-1901wb-46-09870
30  =  clueweb12-0

244  =  clueweb12-0207wb-92-34457
245  =  clueweb12-1807wb-57-22320
246  =  clueweb12-1101tw-01-13761
247  =  clueweb12-0110wb-60-26411
248  =  clueweb12-1112wb-71-02492
249  =  clueweb12-0605wb-56-11894
250  =  clueweb12-1020wb-43-24428
251  =  clueweb12-1505wb-30-08089
252  =  clueweb12-1104wb-79-08217
253  =  clueweb12-1412wb-17-24994
254  =  clueweb12-0006wb-33-06695
255  =  clueweb12-1102wb-73-18049
256  =  clueweb12-0206wb-92-02625
257  =  clueweb12-0012wb-11-29675
258  =  clueweb12-0100wb-43-25360
259  =  clueweb12-0011wb-04-02431
260  =  clueweb12-0701wb-49-05609
261  =  clueweb12-1503wb-08-06602
262  =  clueweb12-0000wb-17-09638.txt
263  =  clueweb12-0818wb-73-17457
264  =  clueweb12-0110wb-67-18508
265  =  clueweb12-1414wb-72-03951
266  =  clueweb12-0901wb-49-18208
267  =  clueweb12-1003wb-26-06940
268  =  clueweb12-0409wb-70-14674
269  =  clueweb12-0412wb-34-25025
270  =  clueweb12-0001wb-67-37050
271  =  clueweb12-0010wb-59-00779
272  =  clueweb12-1709wb-91-25436
273  =  cl

485  =  clueweb12-1500tw-28-00767
486  =  clueweb12-1813wb-28-03870
487  =  clueweb12-0402wb-99-20324
488  =  clueweb12-0405wb-50-23725
489  =  clueweb12-1410wb-47-17974
490  =  clueweb12-0407wb-79-12486
491  =  clueweb12-0903wb-18-12490
492  =  clueweb12-0700tw-06-08493
493  =  clueweb12-0405wb-89-04395
494  =  clueweb12-1200tw-58-16162
495  =  clueweb12-0411wb-92-10614
496  =  clueweb12-0010wb-17-14863
497  =  clueweb12-0102wb-70-25965
498  =  clueweb12-1703wb-04-27857
499  =  clueweb12-0402wb-69-17675
500  =  clueweb12-1302wb-81-10066
501  =  clueweb12-1413wb-37-09761
502  =  clueweb12-1500tw-94-09765
503  =  clueweb12-1501wb-23-30099
504  =  clueweb12-1001wb-45-26799
505  =  clueweb12-0207wb-21-05464
506  =  clueweb12-0506wb-62-22811
507  =  clueweb12-1902wb-49-30749
508  =  clueweb12-0410wb-88-19112
509  =  clueweb12-1503wb-13-14814
510  =  clueweb12-1708wb-91-00140
511  =  clueweb12-0300tw-84-16440
512  =  clueweb12-1608wb-99-04594
513  =  clueweb12-0311wb-31-09390
514  =  cluewe

726  =  clueweb12-0301tw-10-13712
727  =  clueweb12-0000wb-98-25992
728  =  clueweb12-0013wb-65-19605
729  =  clueweb12-0001wb-97-24625
730  =  clueweb12-0400tw-77-05817
731  =  clueweb12-0207wb-68-24357
732  =  clueweb12-1411wb-58-08134
733  =  clueweb12-0611wb-67-31849
734  =  clueweb12-0308wb-17-13370
735  =  clueweb12-0510wb-95-32597
736  =  clueweb12-1906wb-87-37928
737  =  clueweb12-0402wb-24-02466
738  =  clueweb12-1018wb-51-19926
739  =  clueweb12-1714wb-27-16467
740  =  clueweb12-1500tw-07-09849
741  =  clueweb12-0307wb-54-03330
742  =  clueweb12-0002wb-13-17389
743  =  clueweb12-1602wb-36-11840
744  =  clueweb12-1602wb-81-10221
745  =  clueweb12-0402wb-69-05743
746  =  clueweb12-1400tw-57-14839
747  =  clueweb12-0200tw-70-07646
748  =  clueweb12-0403wb-79-04141
749  =  clueweb12-1606wb-68-27571
750  =  clueweb12-0602wb-63-25231
751  =  clueweb12-1107wb-75-21249
752  =  clueweb12-1510wb-29-22867
753  =  clueweb12-1702wb-52-18834
754  =  clueweb12-0200tw-20-20283
755  =  cluewe

967  =  clueweb12-1200tw-39-17654
968  =  clueweb12-1908wb-15-03967
969  =  clueweb12-1811wb-27-08062
970  =  clueweb12-1714wb-21-23241
971  =  clueweb12-0101wb-56-05508
972  =  clueweb12-0600wb-96-08509
973  =  clueweb12-1803wb-54-12715
974  =  clueweb12-1704wb-53-08649
975  =  clueweb12-0412wb-17-24220
976  =  clueweb12-1210wb-85-30030
977  =  clueweb12-0911wb-86-05875
978  =  clueweb12-1007wb-42-29087
979  =  clueweb12-0608wb-90-09879
980  =  clueweb12-0010wb-85-27475
981  =  clueweb12-0008wb-27-11074
982  =  clueweb12-0809wb-92-11210
983  =  clueweb12-1415wb-64-10843
984  =  clueweb12-0108wb-94-13837
985  =  clueweb12-0701wb-14-06076
986  =  clueweb12-0206wb-36-00656
987  =  clueweb12-0210wb-63-06188
988  =  clueweb12-1202wb-97-15939
989  =  clueweb12-0814wb-79-16461
990  =  clueweb12-0200wb-25-30603
991  =  clueweb12-1805wb-09-24816
992  =  clueweb12-0104wb-71-09949
993  =  clueweb12-0200tw-26-12674
994  =  clueweb12-1606wb-94-02140
995  =  clueweb12-0818wb-54-18647
996  =  cluewe

1202  =  clueweb12-1213wb-81-07573
1203  =  clueweb12-1803wb-70-25590
1204  =  clueweb12-0813wb-88-21848
1205  =  clueweb12-1708wb-69-13560
1206  =  clueweb12-0700wb-41-28465
1207  =  clueweb12-1208wb-09-17212
1208  =  clueweb12-1310wb-78-29402
1209  =  clueweb12-0000wb-36-05465.txt
1210  =  clueweb12-0700tw-32-08628
1211  =  clueweb12-0209wb-37-19515
1212  =  clueweb12-0503wb-13-07699
1213  =  clueweb12-0211wb-42-32270
1214  =  clueweb12-1412wb-23-27027
1215  =  clueweb12-1013wb-53-32077
1216  =  clueweb12-1202wb-27-16820
1217  =  clueweb12-0601wb-95-19856
1218  =  clueweb12-0002wb-74-22367
1219  =  clueweb12-0412wb-17-24219
1220  =  clueweb12-0907wb-41-26032
1221  =  clueweb12-0410wb-08-25519
1222  =  clueweb12-0910wb-13-05134
1223  =  clueweb12-0102wb-37-07417
1224  =  clueweb12-1000tw-65-02132
1225  =  clueweb12-1804wb-06-24686
1226  =  clueweb12-1601wb-52-27100
1227  =  clueweb12-0402wb-69-17666
1228  =  clueweb12-1911wb-45-10089
1229  =  clueweb12-0800tw-34-20390
1230  =  clueweb

1436  =  clueweb12-0908wb-40-21398
1437  =  clueweb12-0007wb-77-16069
1438  =  clueweb12-1708wb-02-14339
1439  =  clueweb12-0808wb-79-06928
1440  =  clueweb12-1018wb-64-13216
1441  =  clueweb12-0103wb-36-29626
1442  =  clueweb12-1205wb-61-29293
1443  =  clueweb12-0302wb-05-04478
1444  =  clueweb12-0800tw-61-07202
1445  =  clueweb12-1615wb-46-31509.txt
1446  =  clueweb12-0209wb-62-29857
1447  =  clueweb12-0817wb-97-03585
1448  =  clueweb12-1313wb-19-02347
1449  =  clueweb12-0705wb-95-19855
1450  =  clueweb12-1907wb-04-23627
1451  =  clueweb12-0602wb-45-15703
1452  =  clueweb12-0902wb-22-17317
1453  =  clueweb12-1601wb-41-23587
1454  =  clueweb12-1404wb-58-10064
1455  =  clueweb12-0206wb-45-09895
1456  =  clueweb12-1000tw-06-04549
1457  =  clueweb12-0809wb-15-16523
1458  =  clueweb12-1515wb-31-06377
1459  =  clueweb12-0111wb-91-33474
1460  =  clueweb12-1210wb-72-29192
1461  =  clueweb12-1608wb-29-10367
1462  =  clueweb12-1512wb-14-10563
1463  =  clueweb12-1008wb-66-19209
1464  =  clueweb

1670  =  clueweb12-1304wb-13-16346
1671  =  clueweb12-0000wb-11-24195
1672  =  clueweb12-0000wb-27-14207.txt
1673  =  clueweb12-1511wb-29-29458
1674  =  clueweb12-1600wb-64-12849
1675  =  clueweb12-0503wb-26-03483
1676  =  clueweb12-0007wb-51-14954
1677  =  clueweb12-1410wb-51-07307
1678  =  clueweb12-1809wb-26-26787
1679  =  clueweb12-0713wb-23-25058
1680  =  clueweb12-0311wb-88-27661
1681  =  clueweb12-1200wb-96-09416
1682  =  clueweb12-1513wb-06-28243
1683  =  clueweb12-1705wb-99-01257
1684  =  clueweb12-0500wb-28-26763
1685  =  clueweb12-0009wb-20-04575
1686  =  clueweb12-0815wb-56-19270
1687  =  clueweb12-0506wb-65-00004
1688  =  clueweb12-0200tw-71-18458
1689  =  clueweb12-1200wb-45-12169
1690  =  clueweb12-0304wb-02-08999
1691  =  clueweb12-1304wb-13-16384
1692  =  clueweb12-0406wb-57-31748
1693  =  clueweb12-1415wb-04-30009
1694  =  clueweb12-0200wb-93-11027
1695  =  clueweb12-1305wb-51-21851
1696  =  clueweb12-0602wb-88-12051
1697  =  clueweb12-1910wb-28-14207
1698  =  clueweb

1904  =  clueweb12-0201wb-52-05240
1905  =  clueweb12-0807wb-30-05635
1906  =  clueweb12-1600wb-09-16841
1907  =  clueweb12-0402wb-91-33148
1908  =  clueweb12-1705wb-05-04305
1909  =  clueweb12-0701wb-76-01669
1910  =  clueweb12-0607wb-55-14514
1911  =  clueweb12-1702wb-53-17299
1912  =  clueweb12-1304wb-13-16367
1913  =  clueweb12-0306wb-50-22339
1914  =  clueweb12-1606wb-92-16309
1915  =  clueweb12-0102wb-78-31914
1916  =  clueweb12-1505wb-37-11329
1917  =  clueweb12-0210wb-22-00204
1918  =  clueweb12-0512wb-38-04097
1919  =  clueweb12-1301wb-87-26314
1920  =  clueweb12-0412wb-09-03723
1921  =  clueweb12-0304wb-36-19053
1922  =  clueweb12-1606wb-37-31264
1923  =  clueweb12-0412wb-23-03565
1924  =  clueweb12-0102wb-48-19454
1925  =  clueweb12-0412wb-19-05873
1926  =  clueweb12-1909wb-14-17814
1927  =  clueweb12-0210wb-97-18779
1928  =  clueweb12-1210wb-42-10910
1929  =  clueweb12-1511wb-66-02968
1930  =  clueweb12-0303wb-45-24796
1931  =  clueweb12-1415wb-64-04772
1932  =  clueweb12-0

2139  =  clueweb12-1713wb-52-07758
2140  =  clueweb12-1008wb-60-06325
2141  =  clueweb12-0409wb-70-14683
2142  =  clueweb12-0003wb-11-26230
2143  =  clueweb12-0400tw-53-02323
2144  =  clueweb12-0401wb-05-29199
2145  =  clueweb12-0305wb-01-31241
2146  =  clueweb12-0500wb-76-05998
2147  =  clueweb12-1201tw-23-06297
2148  =  clueweb12-1501wb-43-30072
2149  =  clueweb12-0202wb-03-09581
2150  =  clueweb12-0303wb-04-08428
2151  =  clueweb12-0410wb-98-25407
2152  =  clueweb12-0301wb-63-26904
2153  =  clueweb12-1413wb-42-32942
2154  =  clueweb12-0100tw-83-07477
2155  =  clueweb12-1204wb-04-00793
2156  =  clueweb12-1205wb-18-29328
2157  =  clueweb12-0101wb-14-06809
2158  =  clueweb12-1509wb-48-07515
2159  =  clueweb12-0609wb-47-23142
2160  =  clueweb12-0103wb-10-16813
2161  =  clueweb12-0500wb-39-06448
2162  =  clueweb12-0500wb-82-26713
2163  =  clueweb12-1404wb-11-30850
2164  =  clueweb12-1116wb-59-24329
2165  =  clueweb12-1216wb-41-05666
2166  =  clueweb12-0108wb-05-29114
2167  =  clueweb12-0

2373  =  clueweb12-0006wb-57-04635
2374  =  clueweb12-0807wb-82-24366
2375  =  clueweb12-0403wb-92-03826
2376  =  clueweb12-0011wb-37-04896
2377  =  clueweb12-0311wb-46-23587
2378  =  clueweb12-0311wb-31-09393
2379  =  clueweb12-0003wb-75-32101
2380  =  clueweb12-0402wb-69-17671
2381  =  clueweb12-1701wb-86-25211
2382  =  clueweb12-0800wb-29-03800
2383  =  clueweb12-0201wb-57-07048
2384  =  clueweb12-0205wb-52-01308
2385  =  clueweb12-0709wb-58-16187
2386  =  clueweb12-1204wb-12-08967
2387  =  clueweb12-1009wb-56-14683
2388  =  clueweb12-0209wb-21-14900
2389  =  clueweb12-1804wb-12-17154
2390  =  clueweb12-1807wb-11-13673
2391  =  clueweb12-1811wb-19-05006
2392  =  clueweb12-0407wb-10-18823
2393  =  clueweb12-1810wb-51-18514
2394  =  clueweb12-0805wb-15-16555
2395  =  clueweb12-1017wb-33-07476
2396  =  clueweb12-0604wb-08-17596
2397  =  clueweb12-0900tw-39-08786
2398  =  clueweb12-1913wb-33-16047
2399  =  clueweb12-1613wb-25-12962
2400  =  clueweb12-0817wb-28-00882
2401  =  clueweb12-0

2607  =  clueweb12-1114wb-72-16114
2608  =  clueweb12-0200tw-27-20400
2609  =  clueweb12-0208wb-14-27366
2610  =  clueweb12-1000wb-32-10498
2611  =  clueweb12-0711wb-88-02111
2612  =  clueweb12-1313wb-13-10284
2613  =  clueweb12-0400wb-45-19923
2614  =  clueweb12-0304wb-60-11014
2615  =  clueweb12-0601wb-24-33119
2616  =  clueweb12-1113wb-64-04980
2617  =  clueweb12-0013wb-47-32508
2618  =  clueweb12-1203wb-90-09474
2619  =  clueweb12-0816wb-22-26746
2620  =  clueweb12-1604wb-08-07908
2621  =  clueweb12-0907wb-50-13279
2622  =  clueweb12-0402wb-14-27021
2623  =  clueweb12-1700tw-26-19458
2624  =  clueweb12-0807wb-11-14857
2625  =  clueweb12-1201wb-78-15546
2626  =  clueweb12-0012wb-23-03355
2627  =  clueweb12-0512wb-32-17619
2628  =  clueweb12-0102wb-02-09029
2629  =  clueweb12-0205wb-60-27330
2630  =  clueweb12-0100wb-91-16022
2631  =  clueweb12-1602wb-93-04258
2632  =  clueweb12-1900wb-66-02508
2633  =  clueweb12-0203wb-23-02468
2634  =  clueweb12-1007wb-24-01638
2635  =  clueweb12-1

2841  =  clueweb12-0311wb-31-09389
2842  =  clueweb12-0704wb-45-22618
2843  =  clueweb12-0013wb-58-07328
2844  =  clueweb12-1600wb-34-12183
2845  =  clueweb12-1215wb-33-08369
2846  =  clueweb12-1200tw-31-03155
2847  =  clueweb12-1309wb-19-15124
2848  =  clueweb12-0300wb-81-23778
2849  =  clueweb12-0105wb-87-18729
2850  =  clueweb12-0105wb-54-01196
2851  =  clueweb12-0101wb-56-05503
2852  =  clueweb12-0203wb-61-21667
2853  =  clueweb12-0700tw-61-14704
2854  =  clueweb12-0212wb-01-12140
2855  =  clueweb12-0311wb-46-23551
2856  =  clueweb12-0705wb-49-22528
2857  =  clueweb12-1602wb-93-04257
2858  =  clueweb12-0102wb-02-09026
2859  =  clueweb12-0606wb-91-07838
2860  =  clueweb12-1118wb-13-20917
2861  =  clueweb12-0000wb-42-34021.txt
2862  =  clueweb12-0401tw-12-19807
2863  =  clueweb12-0408wb-45-28307
2864  =  clueweb12-0704wb-34-05892
2865  =  clueweb12-1704wb-44-15346
2866  =  clueweb12-0511wb-83-03657
2867  =  clueweb12-0713wb-31-04852
2868  =  clueweb12-0901wb-20-07075
2869  =  clueweb

3075  =  clueweb12-0102wb-05-32106
3076  =  clueweb12-1304wb-13-16381
3077  =  clueweb12-0300tw-48-13629
3078  =  clueweb12-1606wb-98-13993
3079  =  clueweb12-0607wb-48-18484
3080  =  clueweb12-1105wb-30-04997
3081  =  clueweb12-0208wb-80-07362
3082  =  clueweb12-1502wb-65-10621
3083  =  clueweb12-1617wb-18-04963
3084  =  clueweb12-0800tw-61-07206
3085  =  clueweb12-0508wb-52-15052
3086  =  clueweb12-0504wb-29-22694
3087  =  clueweb12-0716wb-55-21961
3088  =  clueweb12-0105wb-80-30821
3089  =  clueweb12-1709wb-27-06213
3090  =  clueweb12-0700tw-14-05689
3091  =  clueweb12-1910wb-70-20127
3092  =  clueweb12-1505wb-37-11332
3093  =  clueweb12-0917wb-00-29101
3094  =  clueweb12-0211wb-67-18399
3095  =  clueweb12-1913wb-33-30020
3096  =  clueweb12-0006wb-92-03162
3097  =  clueweb12-1500tw-91-10901
3098  =  clueweb12-1606wb-92-16312
3099  =  clueweb12-0103wb-96-09775
3100  =  clueweb12-0609wb-34-31497
3101  =  clueweb12-0713wb-23-25054
3102  =  clueweb12-0712wb-98-02605
3103  =  clueweb12-0

3309  =  clueweb12-0606wb-02-03232
3310  =  clueweb12-0402wb-26-09985
3311  =  clueweb12-1616wb-78-03336
3312  =  clueweb12-0203wb-70-20735
3313  =  clueweb12-1800wb-56-25056
3314  =  clueweb12-1607wb-17-04560
3315  =  clueweb12-0712wb-98-02604
3316  =  clueweb12-1305wb-06-26685
3317  =  clueweb12-0001wb-56-33209
3318  =  clueweb12-1602wb-41-12621
3319  =  clueweb12-0713wb-23-25055
3320  =  clueweb12-1200wb-78-24430
3321  =  clueweb12-1011wb-55-32387
3322  =  clueweb12-1304wb-17-02291
3323  =  clueweb12-0700tw-32-19474
3324  =  clueweb12-1218wb-04-00345
3325  =  clueweb12-1608wb-18-27557
3326  =  clueweb12-0409wb-05-26641
3327  =  clueweb12-1200wb-96-09412
3328  =  clueweb12-1609wb-26-16093
3329  =  clueweb12-0302wb-18-33324
3330  =  clueweb12-0100tw-68-10731
3331  =  clueweb12-1207wb-36-06779
3332  =  clueweb12-1304wb-07-11214
3333  =  clueweb12-0100wb-44-30245
3334  =  clueweb12-0919wb-14-02646
3335  =  clueweb12-0305wb-34-10672
3336  =  clueweb12-0105wb-44-30399
3337  =  clueweb12-0

# Unhashed Inverted Index

In [39]:
def unhashed_term_index_write_to_file(term_index_dot_txt, dict_for_termid_and_postings):
    """
    Append the unhashed term index to a text file, one term per line.

    Each line is formatted as: term_id <TAB> number of postings <TAB> postings,
    where the postings are written using their str() representation.

    term_index_dot_txt           : path of the file to append to (created if missing).
    dict_for_termid_and_postings : mapping of term_id -> postings container
                                   (anything that supports len()).
    """
    # "a+" keeps the original append semantics; the with-block guarantees the
    # handle is closed even when a write fails.
    with open(term_index_dot_txt, "a+") as fh:
        for term_id, d_id in dict_for_termid_and_postings.items():
            lne = str(term_id) + "\t" + str(len(d_id)) + "\t" + str(d_id)
            fh.write(lne)
            fh.write('\n')

In [45]:
def create_unhashed_inverted_index():
    """
    Build the unhashed (sort based) inverted index and write it to disk.

    Documents are run through remove_html_headers -> tokenize_the_text ->
    remove_stop_words -> stem_words.  For every stemmed token a
    (doc_id, term_id) pair is recorded; the pairs are then sorted by term_id
    and handed to create_postings_for_unhashed_ii to obtain the postings.

    Output is produced through docid_write_to_file, termid_write_to_file and
    unhashed_term_index_write_to_file.

    It takes no arguments and returns nothing.
    """
    # Get directory where all the .txt files are present.
    directory = get_directory_path("input")
    output_directory = get_directory_path("output")

    extension = ".txt"
    document_output_file_name = output_directory + "docid_unhashed" + extension
    term_output_file_name = output_directory + "termid_unhashed" + extension

    term_index_dot_txt = get_directory_path("output") + "term_index_unhashed" + extension

    # Load all stop words into a stop_list.
    stop_list_path = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt"
    stop_list = read_text_in_list_form(stop_list_path)

    files = find_all_files_in_directory(directory)

    list_of_docs = list()  # doc id of every token occurrence
    list_of_term_ids = list()  # term id of every token occurrence (parallel to list_of_docs)
    all_collection_of_words = list()  # all the unique words, first-seen order
    all_collection_of_word_ids = list()  # their term ids (int), same order
    term_id_of_word = dict()  # word -> term id, O(1) vocabulary lookup
    used_term_ids = set()  # O(1) collision check for new term ids

    # NOTE(review): only the first len(files)/200 files are indexed here,
    # whereas the hashed build walks every file -- confirm this is intended.
    number_of_docs = int(len(files) / 200)

    # Draw all DOCIDs up front without replacement so that two documents can
    # never share an id.
    doc_ids = random.sample(range(1, max(8000, number_of_docs) + 1), number_of_docs)

    for i in range(number_of_docs):
        file_path = directory + files[i]  # no need for extension in both cases of file
        all_text_without_html_headers = remove_html_headers(file_path)  # str
        tokenized_words, _ = tokenize_the_text(all_text_without_html_headers)  # list
        tokenized_words_without_stop_words = remove_stop_words(tokenized_words, stop_list)  # list
        stemmed_tokens_of_words = stem_words(tokenized_words_without_stop_words)  # list

        doc_id = doc_ids[i]

        for current_word in stemmed_tokens_of_words:
            term_id = term_id_of_word.get(current_word)

            # New word: give it a fresh, unused random term id.
            if term_id is None:
                term_id = random.randint(1, 1000000)
                while term_id in used_term_ids:
                    term_id = random.randint(1, 1000000)
                used_term_ids.add(term_id)
                term_id_of_word[current_word] = term_id
                all_collection_of_words.append(current_word)
                all_collection_of_word_ids.append(term_id)

            # Every occurrence (new or repeated word) contributes one pair.
            list_of_docs.append(doc_id)
            list_of_term_ids.append(term_id)

        docid_write_to_file(document_output_file_name, doc_id, files[i], len(stemmed_tokens_of_words))

    # Sort on the basis of term_ids
    zip_of_doc_and_terms = list(zip(list_of_docs, list_of_term_ids))
    zip_of_doc_and_terms.sort(key=operator.itemgetter(1))
    dict_for_termid_and_postings = create_postings_for_unhashed_ii(zip_of_doc_and_terms)
    termid_write_to_file(term_output_file_name, all_collection_of_words, all_collection_of_word_ids)
    unhashed_term_index_write_to_file(term_index_dot_txt, dict_for_termid_and_postings)

    print("Happy Ending")
    print(len(all_collection_of_words))
#    print(dict_for_termid_and_postings)

To create the unhashed inverted index, execute the following function.

In [46]:
# Run the unhashed index build; it prints "Happy Ending" and the number of unique words.
create_unhashed_inverted_index()



Happy Ending
3776


In [1]:
# Scratch experiment: start with an empty dictionary D.
D = dict()

In [2]:
# Fill D with three int -> str entries.
D[1] = 'A'
D[2] = 'B'
D[3] = 'C'

In [3]:
# Second empty dictionary; it is nested inside D further down.
F = dict()

In [4]:
# F mixes value types: key 11 maps to a list, keys 12 and 13 map to strings.
F[11] = ['G', 'L', 'M']
F[12] = 'H'
F[13] = 'I'

In [5]:
# Overwrite D[1] with the dict F (a dictionary within a dictionary),
# then read the list stored under key 11 of F and print it.
D[1] = F
T = F[11]
print((T))

['G', 'L', 'M']


In [6]:
# Show the nested structure: D[1] now holds the whole of F.
print(D)

{1: {11: ['G', 'L', 'M'], 12: 'H', 13: 'I'}, 2: 'B', 3: 'C'}


In [9]:
# D[1][11] is the list ['G', 'L', 'M'], so this asks whether the int 11 is an
# element of that list; it is not, hence nothing is printed.
# NOTE(review): a check for the key 11 would be `11 in D[1]` -- confirm intent.
if 11 in D[1][11]:
    print("yes")

In [107]:
# `his` refers to the same dict object that is stored in D[1]; no copy is made.
his = D[1]

In [108]:
# Inspect the inner dictionary.
print(his)

{11: ['G', 'L', 'M'], 12: 'H', 13: 'I'}


In [109]:
# Fetch the list stored under key 11 and append to it; append mutates the
# list object itself, so the dictionary already sees the new element.
lst = his[11]
lst.append('N')
print(lst)
#his.update({11: 'Q'})


['G', 'L', 'M', 'N']


In [110]:
# Store the (already mutated) list back under key 11; the value is the same list object.
his.update({11: lst})

In [111]:
# Key 11 now shows the appended element 'N'.
print(his)

{11: ['G', 'L', 'M', 'N'], 12: 'H', 13: 'I'}


In [None]:
# Third empty dictionary, used as a replacement value below.
Y = dict()

In [None]:
# Two int -> str entries for Y.
Y[20] = 'LMO'
Y[21] = 'PQR'

In [None]:
# Replace the value under key 11 with the dict Y.
his.update({11:Y})

In [None]:
# Inspect `his` after the replacement.
print(his)

In [123]:
# Another scratch dictionary.
FooDiy = dict()

In [125]:
# Rebind FooDiy to a new dict literal whose key '1' maps to an empty list.
empty_list = list()
FooDiy = {'1' : empty_list}

In [127]:
# Read back the value stored under key '1' (the empty list object).
x = FooDiy['1']

<class 'list'>
