# Step 1: Tokenizing Documents

The first step in creating an index is tokenization. You must convert a document into a stream of tokens suitable for indexing.
Your tokenizer should follow these steps:
1. Accept a directory name as a command line argument, and process all files found in that directory.
2. Extract the document text with an HTML parsing library, ignoring the headers at the beginning of the file and all HTML tags.
3. Split the text into tokens (you can use a regular-expression library for this; for an introduction to regular expressions, see http://www.rexegg.com/regex-quickstart.html).
4. Convert all tokens to lowercase (this is not always ideal, but indexing intelligently in a case-sensitive manner is tricky).
5. Apply stop-wording to the document by ignoring any tokens found in this list <font color=blue>(\\Cactus \xeon\Maryam Bashir\Information Retrieval\stoplist)</font>.
6. Apply stemming to the document using any standard algorithm – <font color=green> Porter, Snowball, and KStem stemmers </font> are appropriate. You should use a stemming library for this step.
7. Your tokenizer will write two files:
- docids.txt – A file mapping a document's filename (without path) to a unique integer, its DOCID. Each line should be formatted with a DOCID and filename separated by a tab, as follows:<font color=blue> 1234\t32435</font>
- termids.txt – A file mapping a token found during tokenization to a unique integer, its TERMID. Each line should be formatted with a TERMID and token separated by a tab, as follows: <font color=blue> 567\tapple </font>

<font color=blue>  </font>

In [1]:
from bs4 import BeautifulSoup
import re
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
import random
import os
import operator
import numpy as np
#from sets import Set
#from html.parser import HTMLParser

In [2]:
def get_directory_path(mode):
    """
    Return the hard-coded folder used for reading the corpus or writing output.

    Only a folder path is returned (ending with a slash), never a file name.

    Argument:
    mode -- "input" for the folder that contains all the corpus text files,
            "output" for the folder where the generated files are written.

    Returns:
    dp -- directory path (str) for the requested mode.

    Raises:
    ValueError -- if mode is neither "input" nor "output".
    """
    if mode == "input":
        return "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/corpus/corpus/"
    if mode == "output":
        return "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/out/"

    # Any other mode is a caller error; there is no sensible default folder.
    raise ValueError('Unspecified mode.')

In [3]:
# Function : read_text_in_list_form (used to read the stop list)
def read_text_in_list_form(file_path):
    """
    Read a text file and return its lines as a list of strings.

    The caller in this file uses it to load the stop-word list, one word
    per line, but it works for any line-oriented text file.

    Argument:
    file_path -- path should be like: path + file name.extension
        "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt".

    Returns:
    lst -- list with one entry per line of the file. Only the trailing
           newline is removed; any other whitespace is kept as-is.
    """
    # The context manager closes the file even if reading fails part-way.
    with open(file_path) as fin:
        return [line.rstrip('\n') for line in fin]

In [4]:
# Function = remove_html_headers
def remove_html_headers(file_path):
    """
    Open an HTML file and return its text content in lowercase.

    The file is parsed, pretty-printed, and parsed a second time before the
    text is extracted, so all markup is dropped. Undecodable bytes in the
    file are ignored rather than raising.

    Argument:
    file_path -- concatenated: folder path + file name

    Returns:
    soup -- the plain, lowercased text of str type.
    """
    with open(file_path, encoding='utf-8', errors='ignore') as fin:
        pretty_markup = BeautifulSoup(fin, 'html.parser').prettify()

    # Name the parser for the second pass as well: without it BeautifulSoup
    # picks whichever parser happens to be installed (and warns about it),
    # which makes the extracted text machine-dependent.
    plain_text = BeautifulSoup(pretty_markup, 'html.parser').text
    return plain_text.lower()

In [5]:
# Function : tokenize_the_text
def tokenize_the_text(raw_text):
    """
    Convert raw text into a list of unique tokens.

    A token is a run of the letters a-z that is longer than 3 characters.
    The text is NOT lowercased here; the caller is expected to pass text
    that is already lowercase (uppercase letters act as separators).

    Argument:
    raw_text -- class 'str' type.

    Returns:
    splitText -- list of unique words extracted from raw_text, in order of
                 first appearance.
    len(splitText) -- number of words in splitText.
    """
    # Every maximal run of lowercase ASCII letters is a candidate token;
    # digits, punctuation, underscores and whitespace all act as separators.
    candidates = re.findall(r"[a-z]+", raw_text)

    # Keep only words longer than 3 characters. dict.fromkeys removes
    # duplicates while keeping first-occurrence order, so the result is the
    # same on every run (a plain set would give an arbitrary order).
    unique_tokens = list(dict.fromkeys(w for w in candidates if len(w) > 3))

    return unique_tokens, len(unique_tokens)

In [6]:
# Function : remove_stop_words
def remove_stop_words(document_words, stop_words):
    """
    Drop every stop word from a document's tokens.

    Both inputs are treated as sets, so duplicates in document_words are
    collapsed and the order of the returned list is not defined.

    Argument:
    document_words -- list of all the tokens/words extracted from document.
    stop_words -- list of all the stop words read from the stop-list file.

    Returns:
    cleaned -- list of the distinct document words that are not stop words.
    """
    distinct_tokens = set(document_words)
    blocked = set(stop_words)
    surviving = distinct_tokens - blocked
    return list(surviving)

In [7]:
# Function : stem_words
def stem_words(tokenized_words_without_stop_words):
    """
    Stem every given word with nltk's PorterStemmer and sort the result.

    Duplicates are kept: two different words that share a stem both
    contribute an entry to the output.

    Argument:
    tokenized_words_without_stop_words -- list of words which do not
        contain stop words.

    Returns:
    stemmed_words -- sorted list with the stem of each input word.
    """
    stemmer = PorterStemmer()
    return sorted([stemmer.stem(word) for word in tokenized_words_without_stop_words])

In [8]:
# Function : docid_write_to_file
def docid_write_to_file(output_file_name, doc_id, doc_name, doc_length):
    """
    Append one document record to the doc-id output file.

    The record is a single line:
        doc_id <TAB> doc_name <TAB> doc_length
    The file is opened in append mode, so repeated calls add lines and the
    file is created if it does not exist yet.

    Argument:
    output_file_name -- directory path concatenated to output file name.
    doc_id -- identifier of the document; written via str().
    doc_name -- the name of single text file to write in output file (str).
    doc_length -- length recorded for the document; written via str().

    Returns:
    Nothing
    """
    line = str(doc_id) + "\t" + doc_name + "\t" + str(doc_length) + "\n"

    # The context manager closes the file even if the write fails.
    with open(output_file_name, "a+") as fh:
        fh.write(line)

In [9]:
# Function : termid_write_to_file
def termid_write_to_file(output_file_name, all_collection_of_words, all_collection_of_word_ids):
    """
    Append one "term_id <TAB> word" line per collected word to the output file.

    The two lists are parallel: the id at index i belongs to the word at
    index i. The file is opened in append mode and created if missing.

    Argument:
    output_file_name -- directory path concatenated to output file name.
    all_collection_of_words -- list of the words (str) to write.
    all_collection_of_word_ids -- list of ids, one per word, in the same order;
        each id is written via str().

    Returns:
    Nothing.

    Raises:
    IndexError -- if there are fewer ids than words.
    """
    # The context manager closes the file even if a write or lookup fails.
    with open(output_file_name, "a+") as fh:
        for position, word in enumerate(all_collection_of_words):
            fh.write(str(all_collection_of_word_ids[position]) + "\t" + word + "\n")

In [10]:
# Function : clear_collection_from_repetition
def clear_collection_from_repetition(collection_of_all_words, stemmed_words):
    """
    Return the collection words that are absent from stemmed_words, then
    append all of stemmed_words to the collection.

    Note that collection_of_all_words is modified in place: every entry of
    stemmed_words (duplicates included) is added to its end. The returned
    list is computed from the collection as it was BEFORE that append.
    NOTE(review): the name suggests de-duplicating the collection, but the
    code does not do that -- confirm the intended behaviour with callers.

    Argument:
    collection_of_all_words -- list of words gathered so far (mutated).
    stemmed_words -- list of stemmed words of the current document.

    Returns:
    cleaned -- list (in no defined order) of the distinct words that were
               in the collection but are not in stemmed_words.
    """
    only_in_collection = set(collection_of_all_words).difference(set(stemmed_words))
    remaining = list(only_in_collection)

    # Grow the caller's list with the current document's stems.
    collection_of_all_words.extend(stemmed_words)

    return remaining

In [11]:
# Function : tokenize_the_directory_path
def tokenize_the_directory_path(path):
    """
    Split a path string into its components at every forward slash.

    A leading or trailing slash produces an empty string at that end of
    the result.

    Argument:
    path -- class 'str' type of path of file.

    Returns:
    split_text -- list of the pieces of path found between "/" characters.
    """
    slash = re.compile("[/]")
    return slash.split(path)

In [12]:
def find_all_files_in_directory(directory):
    """
    Collect the names of all files under a directory, except ".DS_Store".

    The directory is walked recursively. Files are kept whatever their
    extension; only the ".DS_Store" metadata file is skipped.

    Argument:
    directory -- path of the folder to scan.

    Returns:
    files -- list of bare file names (no folder part), in the order
             os.walk yields them.
             NOTE(review): names of files in sub-folders are returned without
             their sub-folder, so joining them onto `directory` only gives a
             valid path for files directly inside it.
    """
    files = []
    # os.walk yields (root, directories, file names); only the names are used.
    for _root, _dirs, names in os.walk(directory):
        files.extend(name for name in names if name != ".DS_Store")
    return files

# Step 2: Inverted Index

<font color=blue> term_index.txt - </font> An inverted index containing the file position for each occurrence of each term in the collection. Each line should contain the complete inverted list for a single term. Each line should contain a list of DOCID,POSITION values. Each line of this file should contain a TERMID followed by a space-separated list of properties as follows:
<font color=green> 347 1542 567 432,43 456,33 456,41 </font>
- 347: TERMID
- 1542: Total number of occurrences of the term in the entire corpus 
- 567: Total number of documents in which the term appears
- 432: Document Id in which term appears
- 43: Position of term in document 432

In order to support more efficient compression you must apply delta encoding to the inverted list. The first DOCID for a term and the first POSITION for a document will be stored normally. Subsequent values should be stored as the offset from the prior value.

Instead of encoding an inverted list like this: <font color=green>347 1542 567 432,43 456,33 456,41 </font>
- You should encode it like this:
<font color=green>347 1542 567 432,43 24,33 0,8  </font>
- <font color=red> Note that </font> in order to do this, your DOCIDs and POSITIONs must be sorted in ascending order.



In [13]:
def term_index_write_to_file(output_file_name, inverted_index_dictionary):
    """
    Append the inverted index to the output file.

    For each term the term id is written once, followed by one line per
    document of the form:
        <TAB> doc_id <TAB> number of positions <TAB> list of positions
    The file is opened in append mode and created if missing.

    NOTE(review): only the first document line of a term starts with the
    term id; the following lines for the same term start with a tab, and a
    term with no documents gets no newline. This differs from the
    single-line, delta-encoded format the assignment text describes --
    confirm what the readers of this file expect before changing it.

    Argument:
    output_file_name -- directory path concatenated to output file name.
    inverted_index_dictionary -- dict mapping term_id to a dict that maps
        doc_id to the list of positions of the term in that document.

    Returns:
    Nothing.
    """
    # The context manager closes the file even if a write fails.
    with open(output_file_name, "a+") as fh:
        for term_id, postings in inverted_index_dictionary.items():
            fh.write(str(term_id))
            for d_id, poss in postings.items():
                #        doc_id              count                    positions
                fh.write("\t" + str(d_id) + "\t" + str(len(poss)) + "\t" + str(poss))
                fh.write('\n')

In [14]:
def create_postings_for_unhashed_ii(list_of_doc_and_terms):
    """
    Build a term_id -> list of doc_ids mapping from (doc_id, term_id) tuples.

    Each tuple is read as alternating values: a doc_id followed by the
    term_id it belongs to. Doc ids seen for the same term in different
    tuples are merged into one list without duplicates. (Previously the
    existing postings were looked up before the tuple's term_id was known,
    so every tuple replaced the term's earlier postings instead of
    extending them.)

    Argument:
    list_of_doc_and_terms -- iterable of tuples (doc_id, term_id). Longer
        tuples are read as consecutive (doc_id, term_id) pairs; a trailing
        value without a partner is ignored. Ids must be hashable.

    Returns:
    my_dict -- dict mapping each term_id to the list of distinct doc_ids
               it occurs in, in order of first appearance.
    """
    # An inner dict is used as an insertion-ordered set of doc_ids.
    postings = dict()
    for tup in list_of_doc_and_terms:
        elements = iter(tup)
        # Zipping an iterator with itself yields consecutive pairs:
        # (1st, 2nd), (3rd, 4th), ... i.e. (doc_id, term_id).
        for doc_id, term_id in zip(elements, elements):
            postings.setdefault(term_id, dict())[doc_id] = None

    return {term_id: list(doc_ids) for term_id, doc_ids in postings.items()}

In [56]:
# Utility function: used to write the document-length file.

#
# Output: term_id : list_of_documents
#
def create_doc_length_txt(output_file_name, hashed_ii2):
    """
    Append one line per dictionary entry to the output file.

    Each line has the form:
        key <TAB> number of items in the value <TAB> the value itself
    The file is opened in append mode and created if missing.

    Argument:
    output_file_name -- directory path concatenated to output file name.
    hashed_ii2 -- dict whose values support len(); presumably it maps a
        term_id to its list of documents -- verify against the caller.

    Returns:
    Nothing.
    """
    # The context manager closes the file even if a write fails.
    with open(output_file_name, "a+") as fh:
        for key, values in hashed_ii2.items():
            fh.write(str(key) + "\t" + str(len(values)) + "\t" + str(values))
            fh.write('\n')

# Hashed Inverted Index

In [16]:
# # build version 2.0 #list with dict #WOrking
# # Only puts documents in which that term_id exists
# def create_hashed_inverted_index():
#     # Get directory where all the .txt files are present.
#     directory = get_directory_path("input")
#     output_directory = get_directory_path("output")
    
#     extension = ".txt"
#     document_output_file_name = output_directory + "docid_hashed" + extension
#     term_output_file_name = output_directory + "termid_hashed" + extension
    
#     term_index_dot_txt = get_directory_path("output") + "term_index_hashed" + extension
    
#     # Load all stop words into a stop_list.
#     stop_list_path = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt"
#     stop_list = read_text_in_list_form(stop_list_path)
    
#     files = find_all_files_in_directory(directory)
#     hashed_ii = dict() ## (key:terms_id, )
#     all_collection_of_words = list()
#     all_collection_of_word_ids = list()
#     dict_for_docs = dict() ## (key: doc_id, value:terms_found_in_this_doc)
    
#     ## One doc file is opened.
#     for i in range(0,int(len(files))):
#         file_path = directory + files[i] #+ extension # no need for extension in both cases of file
#         #print(file_path)
#         all_text_without_html_headers = remove_html_headers(file_path) #str
#         tokenized_words, count_of_words = tokenize_the_text(all_text_without_html_headers) #list
#         tokenized_words_without_stop_words = remove_stop_words(tokenized_words, stop_list) #list
#         stemmed_tokens_of_words = stem_words(tokenized_words_without_stop_words) #list 
        
#         doc_id = random.randint(1,8000)
        
#         word_id_postings_list = list()
#         ## Append all words of a doc to list
#         for w in range(0,len(stemmed_tokens_of_words)):     
#             current_word = stemmed_tokens_of_words[w]
            
#             term_id = random.randint(1,1000000)
#             while term_id in hashed_ii.keys():
#                 term_id = random.randint(1,1000000)
                
#             ## If that word already present
#             if (all_collection_of_words.count(current_word)) != 0:
#                 first_occurance = all_collection_of_words.index(current_word)
#                 term_id = all_collection_of_word_ids[first_occurance]
                
#                 lst = list()
#                 history = hashed_ii[term_id] # hashed_ii[str(term_id)]
                
#                 for e in range(0, len(history)):
#                     lst.append(history[e])
#                 lst.append(doc_id)
                
#                 lst = list(set(lst))
#                 hashed_ii.update({term_id :lst})
#             else:
#                 ## already not present that word
#                 ## new place created in dictionary
#                 lst = list()
#                 lst.append((doc_id))
#                 hashed_ii[str(term_id)] = lst
                
#                 all_collection_of_words.append(current_word)
#                 all_collection_of_word_ids.append(str(term_id))
#         docid_write_to_file(document_output_file_name,doc_id,files[i])
    
#     termid_write_to_file(term_output_file_name, all_collection_of_words, all_collection_of_word_ids)
#     term_index_write_to_file(term_index_dot_txt, hashed_ii)
    
#     print("Happy Ending")
#     print(len(all_collection_of_words))

In [17]:
# #build version 3.0 #dictionary within dictionary
# def create_hashed_inverted_index():
#     # Get directory where all the .txt files are present.
#     directory = get_directory_path("input")
#     output_directory = get_directory_path("output")
    
#     extension = ".txt"
#     document_output_file_name = output_directory + "docid_hashed" + extension
#     term_output_file_name = output_directory + "termid_hashed" + extension
    
#     term_index_dot_txt = get_directory_path("output") + "term_index_hashed" + extension
    
#     # Load all stop words into a stop_list.
#     stop_list_path = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt"
#     stop_list = read_text_in_list_form(stop_list_path)
    
#     files = find_all_files_in_directory(directory)
#     hashed_ii = dict() ## (key:terms_id, )
#     all_collection_of_words = list()
#     all_collection_of_word_ids = list()
#     dict_for_docs = dict() ## (key: doc_id, value:terms_found_in_this_doc)
#     history = dict()
    
#     ## One doc file is opened.
#     for i in range(0,int(len(files)/500)):
#         file_path = directory + files[i] #+ extension # no need for extension in both cases of file
#         #print(file_path)
#         all_text_without_html_headers = remove_html_headers(file_path) #str
#         tokenized_words, count_of_words = tokenize_the_text(all_text_without_html_headers) #list
#         tokenized_words_without_stop_words = remove_stop_words(tokenized_words, stop_list) #list
#         stemmed_tokens_of_words = stem_words(tokenized_words_without_stop_words) #list 
        
#         doc_id = random.randint(1,8000)
        
#         word_id_postings_list = list()
#         doc_and_position_container = dict()
        
#         ## Append all words of a doc to list
#         for w in range(0,len(stemmed_tokens_of_words)):     
#             current_word = stemmed_tokens_of_words[w]
            
#             term_id = random.randint(1,100000)
#             while term_id in hashed_ii.keys():
#                 term_id = random.randint(1,100000)
                
#             ## If that word already present
#             if (all_collection_of_words.count(current_word)) != 0:
#                 first_occurance = all_collection_of_words.index(current_word)
#                 term_id = all_collection_of_word_ids[first_occurance]
                
#                 # Now history will be of type = dict()
#                 #if str(doc_id) in 
#                 history = hashed_ii[str(term_id)]
#                 lst = list()
#                 print("history = ",history)
#                 already_existed_positions = (history[str(doc_id)])
#                 #print("type = ",type(already_existed_positions))
#                 print("already_existed_positions = ", already_existed_positions)
                
#                 for e in range(0, len(already_existed_positions)):
#                         lst.append(already_existed_positions[e])
#                 #lst.append(already_existed_positions)
#                 lst.append(stemmed_tokens_of_words.index(current_word))
#                 print("lst = ", lst)
#                 history.update({str(doc_id) : lst})
#                 hashed_ii.update({str(term_id) : history})
#             else:
#                 ## already not present that word
#                 ## new place created in dictionary
#                 # key -> doc_id  par positions as values rakh di
#                 #lst = list()
#                 #lst.append(stemmed_tokens_of_words.index(current_word))
#                 hashed_ii[str(term_id)] = dict()
#                 lsr = list()
                
#                 if str(doc_id) in doc_and_position_container:
#                     temp = doc_and_position_container[str(doc_id)]
#                     for e in range(0, len(doc_and_position_container)):
#                         lsr.append(temp[e])
#                 lsr.append(stemmed_tokens_of_words.index(current_word))
#                 doc_and_position_container[str(doc_id)] = list() #stemmed_tokens_of_words.index(current_word) 
#                 doc_and_position_container[str(doc_id)] = lsr

#                 #print("doc_and_position_container = ", doc_and_position_container)
                
#                 hashed_ii[str(term_id)] = doc_and_position_container
#                 #print("hashed_ii = ", hashed_ii)
                
#                 all_collection_of_words.append(current_word)
#                 all_collection_of_word_ids.append(str(term_id))
#         docid_write_to_file(document_output_file_name,doc_id,files[i])
    
#     termid_write_to_file(term_output_file_name, all_collection_of_words, all_collection_of_word_ids)
#     term_index_write_to_file(term_index_dot_txt, hashed_ii)
    
#     print("Happy Ending")
#     print(len(all_collection_of_words))

In [18]:
# def doc_lengths():
#     # Get directory where all the .txt files are present.
#     directory = get_directory_path("input")
#     output_directory = get_directory_path("output")
    
#     extension = ".txt"
#     document_output_file_name = output_directory + "docid_hashed" + extension
#     term_output_file_name = output_directory + "termid_hashed" + extension
#     term_index_dot_txt = get_directory_path("output") + "term_index_hashed" + extension
    
#     # Load all stop words into a stop_list.
#     stop_list_path = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt"
#     stop_list = read_text_in_list_form(stop_list_path)
    
#     files = find_all_files_in_directory(directory)
#     hashed_ii = dict() ## (key:terms_id, )
#     all_collection_of_words = list()
#     all_collection_of_word_ids = list()
#     dict_for_docs = dict() ## (key: doc_id, value:terms_found_in_this_doc)
#     history = dict()
    
#     output_doc_length_file = output_directory + "doc_lengths" + extension
    
#     fh = open(output_doc_length_file, "a+")

#     ## One doc file is opened.
#     for i in range(0,int(len(files)/200)):
#         file_path = directory + files[i] #+ extension # no need for extension in both cases of file
#         #print(file_path)
#         all_text_without_html_headers = remove_html_headers(file_path) #str
#         tokenized_words, count_of_words = tokenize_the_text(all_text_without_html_headers) #list
#         tokenized_words_without_stop_words = remove_stop_words(tokenized_words, stop_list) #list
#         stemmed_tokens_of_words = stem_words(tokenized_words_without_stop_words) #list 
#         l = len(stemmed_tokens_of_words)
#         new_line = str(l)?
#         fh.write(new_line)
#         fh.write('\n')
#     fh.close()

In [19]:
# doc_lengths()

In [20]:
# #build version 4.0 #dictionary within dictionary #WORKING
# def create_hashed_inverted_index():
#     # Get directory where all the .txt files are present.
#     directory = get_directory_path("input")
#     output_directory = get_directory_path("output")
    
#     extension = ".txt"
#     document_output_file_name = output_directory + "docid_hashed" + extension
#     term_output_file_name = output_directory + "termid_hashed" + extension
#     term_index_dot_txt = get_directory_path("output") + "term_index_hashed" + extension
    
#     # Load all stop words into a stop_list.
#     stop_list_path = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt"
#     stop_list = read_text_in_list_form(stop_list_path)
    
#     files = find_all_files_in_directory(directory)
#     hashed_ii = dict() ## (key:terms_id, )
#     all_collection_of_words = list()
#     all_collection_of_word_ids = list()
#     dict_for_docs = dict() ## (key: doc_id, value:terms_found_in_this_doc)
#     history = dict()
    
#     ## One doc file is opened.
#     for i in range(0,int(len(files)/200)):
#         #print(files[i])
#         #print("\n", i)
#         file_path = directory + files[i] #+ extension # no need for extension in both cases of file
#         #print(file_path)
#         all_text_without_html_headers = remove_html_headers(file_path) #str
#         tokenized_words, count_of_words = tokenize_the_text(all_text_without_html_headers) #list
#         tokenized_words_without_stop_words = remove_stop_words(tokenized_words, stop_list) #list
#         stemmed_tokens_of_words = stem_words(tokenized_words_without_stop_words) #list 
        
#         doc_id = random.randint(1,8000)
#         doc_and_position_container = dict()

#         ## Append all words of a doc to list
        
#         for w in range(0,len(stemmed_tokens_of_words)):     
#             current_word = stemmed_tokens_of_words[w]
#             term_id = random.randint(1,1000000)
#             while term_id in hashed_ii.keys():
#                 term_id = random.randint(1,100000)
                
#             ## If that word already present
#             if (all_collection_of_words.count(current_word)) != 0:
#                 first_occurance = all_collection_of_words.index(current_word)
#                 term_id = all_collection_of_word_ids[first_occurance]
#                 history = dict()
#                 history = hashed_ii[str(term_id)] 
                
#                 for d_id, positions_list in history.items():
#                     #if d_id == str(doc_id):
#                         already_existed_positions = list()
#                         already_existed_positions = positions_list #(history[d_id])
#                     #print("positions_list ", positions_list)
#                     #lst = list()
#                     #for e in range(0, len(already_existed_positions)):
#                     #    lst.append(already_existed_positions[e])
#                         already_existed_positions.append(stemmed_tokens_of_words.index(current_word,w,len(stemmed_tokens_of_words)))
#                         #print("already_existed = ",d_id, " ", already_existed_positions)
#                         #print(d_id,term_id, current_word)
#                     #lst.append(stemmed_tokens_of_words.index(current_word,w,len(stemmed_tokens_of_words)))
#                     #history[kys] = lst
#                         #history.update({d_id : already_existed_positions})
#                         history[str(d_id)] = already_existed_positions
#                     #lst = list(set(lst))
#                 #hashed_ii[str(term_id)] = history
#                 #history.update({(doc_id) : lst})
#                         hashed_ii[str(term_id)] = history
#                         #hashed_ii.update({str(term_id) : history})  # <--- ye line dekh zara
#                         #hashed_ii.update(history)
#                 #print("nai idher, if mein")
#             ## If already not present that word
#             else:
#                 all_collection_of_words.append(current_word)
#                 all_collection_of_word_ids.append(str(term_id))
#                 lsr = list()
#                 position = stemmed_tokens_of_words.index(current_word) 
#                 #print("positions = " , position)
#                 lsr.append(position)
#                 doc_and_position_container = dict()
#                 doc_and_position_container[str(doc_id)] = lsr
#                 hashed_ii[str(term_id)] = dict()
#                 #hashed_ii[str(term_id)] = doc_and_position_container
#                 hashed_ii.update({str(term_id) : doc_and_position_container})
#                 #print("\nKia Ider")
#                 #print("hashed_ii = ", hashed_ii)
#         docid_write_to_file(document_output_file_name,doc_id,files[i])
#     print("hashed_ii = ", hashed_ii)
#     termid_write_to_file(term_output_file_name, all_collection_of_words, all_collection_of_word_ids)
#     term_index_write_to_file(term_index_dot_txt, hashed_ii)
    
#     print("Happy Ending")
#     print(len(all_collection_of_words))

In [133]:
#build version 5.0 #dictionary within dictionary #Correct Output
def create_hashed_inverted_index():
    """Build a positional inverted index over the input documents and write it to disk.

    Every processed file is stripped of its HTML headers, tokenized,
    stop-worded and stemmed with the helpers defined earlier in this
    notebook. Each distinct stemmed token gets a random, unique term id and
    each document gets a random, unique doc id.

    Two structures are built in memory:

    * ``hashed_ii``  -- ``{str(term_id): {str(doc_id): [positions]}}`` where a
      position is the index of the token in the document's stemmed token list.
    * ``hashed_ii2`` -- ``{str(term_id): [doc_id, ...]}``, the documents in
      which each term occurs.

    Output is produced through ``docid_write_to_file``,
    ``termid_write_to_file``, ``create_doc_length_txt`` and
    ``term_index_write_to_file``; the function itself returns ``None``.
    """
    # Get directory where all the .txt files are present.
    directory = get_directory_path("input")
    output_directory = get_directory_path("output")

    extension = ".txt"
    document_output_file_name = output_directory + "docid_hashed" + extension
    term_output_file_name = output_directory + "termid_hashed" + extension
    term_index_dot_txt = output_directory + "term_index_hashed" + extension
    document_frequency_file = output_directory + "document_frequency" + extension

    # Load all stop words into a stop_list.
    stop_list_path = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt"
    stop_list = read_text_in_list_form(stop_list_path)

    files = find_all_files_in_directory(directory)
    hashed_ii = dict()   # {str(term_id): {str(doc_id): [positions]}}
    hashed_ii2 = dict()  # {str(term_id): [doc_id, ...]}
    all_collection_of_words = list()
    all_collection_of_word_ids = list()
    # word -> str(term_id); gives O(1) "have we seen this word?" lookups
    # instead of scanning all_collection_of_words for every token.
    term_id_of_word = dict()

    # NOTE(review): only the first 1/200th of the files is processed --
    # presumably to keep experimental runs short; confirm before a real run.
    number_of_docs = int(len(files) / 200)

    # Draw all doc ids up front without replacement so two documents can
    # never share an id. The 1..8000 range is kept, and widened only when
    # there are more documents than ids.
    doc_ids = random.sample(range(1, max(8000, number_of_docs) + 1), number_of_docs)

    ## One doc file is opened.
    for i in range(number_of_docs):
        file_path = directory + files[i]  # no need for extension in both cases of file
        all_text_without_html_headers = remove_html_headers(file_path)  # str
        tokenized_words, _ = tokenize_the_text(all_text_without_html_headers)  # list
        tokenized_words_without_stop_words = remove_stop_words(tokenized_words, stop_list)  # list
        stemmed_tokens_of_words = stem_words(tokenized_words_without_stop_words)  # list

        doc_id = doc_ids[i]
        doc_key = str(doc_id)

        for w, current_word in enumerate(stemmed_tokens_of_words):
            term_id = term_id_of_word.get(current_word)

            ## If that word already present
            if term_id is not None:
                print("\nFound for term_id = ", term_id," doc_id = ", doc_id)

                # Term -> documents containing it. Documents are processed
                # one after another and doc ids are unique, so the current
                # doc is already recorded iff it is the last entry.
                docs_containing_term = hashed_ii2[term_id]
                if docs_containing_term[-1] != doc_id:
                    docs_containing_term.append(doc_id)

                # The word has repeated. If it repeated inside the same
                # document the doc key already exists and the position is
                # appended; otherwise a new doc key is created first.
                hashed_ii[term_id].setdefault(doc_key, []).append(w)

            ## If already NOT present that word
            else:
                # Keys of hashed_ii are strings, so uniqueness must be
                # checked against the string form of the candidate id.
                term_id = str(random.randint(1, 1000000))
                while term_id in hashed_ii:
                    term_id = str(random.randint(1, 1000000))

                term_id_of_word[current_word] = term_id
                all_collection_of_words.append(current_word)
                all_collection_of_word_ids.append(term_id)

                hashed_ii2[term_id] = [doc_id]
                # First sighting of the word anywhere, so w is also its
                # first position inside this document.
                hashed_ii[term_id] = {doc_key: [w]}

        docid_write_to_file(document_output_file_name, doc_id, files[i], len(stemmed_tokens_of_words))

    print("hashed_ii = ", hashed_ii)
    termid_write_to_file(term_output_file_name, all_collection_of_words, all_collection_of_word_ids)
    # hashed_ii2 maps each term to the documents containing it, and it is
    # written to the document_frequency file.
    create_doc_length_txt(document_frequency_file, hashed_ii2)
    term_index_write_to_file(term_index_dot_txt, hashed_ii)

    print("Happy Ending")
    print(len(all_collection_of_words))

To create the hashed inverted index, execute the following function.

In [134]:
# Run the hashed (positional) inverted-index build; it prints progress and writes its output files.
create_hashed_inverted_index()




Found for term_id =  715980  doc_id =  6616

Found for term_id =  502717  doc_id =  6616

Found for term_id =  951404  doc_id =  6616

Found for term_id =  951404  doc_id =  6616

Found for term_id =  868797  doc_id =  6616

Found for term_id =  340298  doc_id =  6616

Found for term_id =  941926  doc_id =  6616

Found for term_id =  716541  doc_id =  6616

Found for term_id =  679164  doc_id =  6616

Found for term_id =  172532  doc_id =  6616

Found for term_id =  846502  doc_id =  6616

Found for term_id =  960992  doc_id =  4935

Found for term_id =  960992  doc_id =  4935

Found for term_id =  243013  doc_id =  4935

Found for term_id =  243013  doc_id =  4935

Found for term_id =  964280  doc_id =  4935

Found for term_id =  133999  doc_id =  4935

Found for term_id =  801954  doc_id =  4935

Found for term_id =  236802  doc_id =  4935

Found for term_id =  489397  doc_id =  4935

Found for term_id =  300910  doc_id =  4935

Found for term_id =  158256  doc_id =  4935

Found f


Found for term_id =  383262  doc_id =  5963

Found for term_id =  964280  doc_id =  5963

Found for term_id =  720123  doc_id =  5963

Found for term_id =  459513  doc_id =  5963

Found for term_id =  801954  doc_id =  5963

Found for term_id =  236802  doc_id =  5963

Found for term_id =  489397  doc_id =  5963

Found for term_id =  14910  doc_id =  5963

Found for term_id =  640048  doc_id =  5963

Found for term_id =  38735  doc_id =  5963

Found for term_id =  117907  doc_id =  5963

Found for term_id =  526846  doc_id =  5963

Found for term_id =  137547  doc_id =  5963

Found for term_id =  922343  doc_id =  5963

Found for term_id =  922343  doc_id =  5963

Found for term_id =  779819  doc_id =  5963

Found for term_id =  534445  doc_id =  5963

Found for term_id =  534445  doc_id =  5963

Found for term_id =  576804  doc_id =  5963

Found for term_id =  463074  doc_id =  5963

Found for term_id =  857554  doc_id =  5963

Found for term_id =  345675  doc_id =  5963

Found for t


Found for term_id =  960992  doc_id =  1799

Found for term_id =  512585  doc_id =  1799

Found for term_id =  964280  doc_id =  1799

Found for term_id =  368660  doc_id =  1799

Found for term_id =  236802  doc_id =  1799

Found for term_id =  489397  doc_id =  1799

Found for term_id =  837713  doc_id =  1799

Found for term_id =  310069  doc_id =  1799

Found for term_id =  16569  doc_id =  1799

Found for term_id =  978679  doc_id =  1799

Found for term_id =  47814  doc_id =  1799

Found for term_id =  918302  doc_id =  1799

Found for term_id =  889318  doc_id =  1799

Found for term_id =  779819  doc_id =  1799

Found for term_id =  929518  doc_id =  1799

Found for term_id =  158256  doc_id =  1799

Found for term_id =  463074  doc_id =  1799

Found for term_id =  857554  doc_id =  1799

Found for term_id =  345675  doc_id =  1799

Found for term_id =  345675  doc_id =  1799

Found for term_id =  607228  doc_id =  1799

Found for term_id =  706154  doc_id =  1799

Found for t


Found for term_id =  383262  doc_id =  2790

Found for term_id =  960992  doc_id =  2790

Found for term_id =  960992  doc_id =  2790

Found for term_id =  960992  doc_id =  2790

Found for term_id =  960992  doc_id =  2790

Found for term_id =  960992  doc_id =  2790

Found for term_id =  583925  doc_id =  2790

Found for term_id =  881768  doc_id =  2790

Found for term_id =  512585  doc_id =  2790

Found for term_id =  675585  doc_id =  2790

Found for term_id =  243013  doc_id =  2790

Found for term_id =  243013  doc_id =  2790

Found for term_id =  964280  doc_id =  2790

Found for term_id =  964280  doc_id =  2790

Found for term_id =  150734  doc_id =  2790

Found for term_id =  103205  doc_id =  2790

Found for term_id =  784426  doc_id =  2790

Found for term_id =  784426  doc_id =  2790

Found for term_id =  699864  doc_id =  2790

Found for term_id =  133999  doc_id =  2790

Found for term_id =  904436  doc_id =  2790

Found for term_id =  765157  doc_id =  2790

Found for

Found for term_id =  951312  doc_id =  2790

Found for term_id =  951312  doc_id =  2790

Found for term_id =  154767  doc_id =  2790

Found for term_id =  296127  doc_id =  2790

Found for term_id =  92128  doc_id =  2790

Found for term_id =  870556  doc_id =  2790

Found for term_id =  873657  doc_id =  2790

Found for term_id =  683074  doc_id =  2790

Found for term_id =  425902  doc_id =  2790

Found for term_id =  173630  doc_id =  2790

Found for term_id =  811447  doc_id =  2790

Found for term_id =  997803  doc_id =  2790

Found for term_id =  2267  doc_id =  2790

Found for term_id =  2267  doc_id =  2790

Found for term_id =  510008  doc_id =  2790

Found for term_id =  361944  doc_id =  2790

Found for term_id =  422738  doc_id =  2790

Found for term_id =  422738  doc_id =  2790

Found for term_id =  735963  doc_id =  2790

Found for term_id =  735963  doc_id =  2790

Found for term_id =  561972  doc_id =  2790

Found for term_id =  921721  doc_id =  2790

Found for term_


Found for term_id =  383262  doc_id =  5256

Found for term_id =  675585  doc_id =  5256

Found for term_id =  243013  doc_id =  5256

Found for term_id =  964280  doc_id =  5256

Found for term_id =  784426  doc_id =  5256

Found for term_id =  420044  doc_id =  5256

Found for term_id =  404392  doc_id =  5256

Found for term_id =  236802  doc_id =  5256

Found for term_id =  489397  doc_id =  5256

Found for term_id =  27356  doc_id =  5256

Found for term_id =  351347  doc_id =  5256

Found for term_id =  175201  doc_id =  5256

Found for term_id =  25952  doc_id =  5256

Found for term_id =  133917  doc_id =  5256

Found for term_id =  863987  doc_id =  5256

Found for term_id =  978679  doc_id =  5256

Found for term_id =  47814  doc_id =  5256

Found for term_id =  47814  doc_id =  5256

Found for term_id =  580255  doc_id =  5256

Found for term_id =  804503  doc_id =  5256

Found for term_id =  889318  doc_id =  5256

Found for term_id =  794198  doc_id =  5256

Found for ter


Found for term_id =  583925  doc_id =  1156

Found for term_id =  881768  doc_id =  1156

Found for term_id =  129884  doc_id =  1156

Found for term_id =  152145  doc_id =  1156

Found for term_id =  512585  doc_id =  1156

Found for term_id =  423064  doc_id =  1156

Found for term_id =  423064  doc_id =  1156

Found for term_id =  885966  doc_id =  1156

Found for term_id =  243013  doc_id =  1156

Found for term_id =  964280  doc_id =  1156

Found for term_id =  966129  doc_id =  1156

Found for term_id =  784426  doc_id =  1156

Found for term_id =  551085  doc_id =  1156

Found for term_id =  460569  doc_id =  1156

Found for term_id =  368660  doc_id =  1156

Found for term_id =  592690  doc_id =  1156

Found for term_id =  367820  doc_id =  1156

Found for term_id =  812235  doc_id =  1156

Found for term_id =  489397  doc_id =  1156

Found for term_id =  607851  doc_id =  1156

Found for term_id =  606671  doc_id =  1156

Found for term_id =  310069  doc_id =  1156

Found for


Found for term_id =  881768  doc_id =  719

Found for term_id =  881768  doc_id =  719

Found for term_id =  512585  doc_id =  719

Found for term_id =  675585  doc_id =  719

Found for term_id =  675585  doc_id =  719

Found for term_id =  484069  doc_id =  719

Found for term_id =  653264  doc_id =  719

Found for term_id =  243013  doc_id =  719

Found for term_id =  964280  doc_id =  719

Found for term_id =  150734  doc_id =  719

Found for term_id =  984367  doc_id =  719

Found for term_id =  984367  doc_id =  719

Found for term_id =  455339  doc_id =  719

Found for term_id =  455339  doc_id =  719

Found for term_id =  92510  doc_id =  719

Found for term_id =  615390  doc_id =  719

Found for term_id =  459513  doc_id =  719

Found for term_id =  253865  doc_id =  719

Found for term_id =  489397  doc_id =  719

Found for term_id =  95154  doc_id =  719

Found for term_id =  534036  doc_id =  719

Found for term_id =  27356  doc_id =  719

Found for term_id =  837713  doc_i

Found for term_id =  369232  doc_id =  719

Found for term_id =  240792  doc_id =  719

Found for term_id =  986380  doc_id =  719

Found for term_id =  986380  doc_id =  719

Found for term_id =  380016  doc_id =  719

Found for term_id =  104810  doc_id =  719

Found for term_id =  831755  doc_id =  719

Found for term_id =  848245  doc_id =  719

Found for term_id =  848245  doc_id =  719

Found for term_id =  94699  doc_id =  719

Found for term_id =  759349  doc_id =  719

Found for term_id =  191232  doc_id =  719

Found for term_id =  191232  doc_id =  719

Found for term_id =  403666  doc_id =  719

Found for term_id =  941906  doc_id =  719

Found for term_id =  941906  doc_id =  719

Found for term_id =  792765  doc_id =  719

Found for term_id =  985718  doc_id =  719

Found for term_id =  985718  doc_id =  719

Found for term_id =  985718  doc_id =  719

Found for term_id =  985718  doc_id =  719

Found for term_id =  874400  doc_id =  719

Found for term_id =  653283  doc_


Found for term_id =  960992  doc_id =  7640

Found for term_id =  964280  doc_id =  7640

Found for term_id =  745188  doc_id =  7640

Found for term_id =  534556  doc_id =  7640

Found for term_id =  750401  doc_id =  7640

Found for term_id =  489397  doc_id =  7640

Found for term_id =  534036  doc_id =  7640

Found for term_id =  16569  doc_id =  7640

Found for term_id =  38735  doc_id =  7640

Found for term_id =  54425  doc_id =  7640

Found for term_id =  576804  doc_id =  7640

Found for term_id =  576804  doc_id =  7640

Found for term_id =  577374  doc_id =  7640

Found for term_id =  589584  doc_id =  7640

Found for term_id =  706154  doc_id =  7640

Found for term_id =  807601  doc_id =  7640

Found for term_id =  438323  doc_id =  7640

Found for term_id =  699230  doc_id =  7640

Found for term_id =  978324  doc_id =  7640

Found for term_id =  806041  doc_id =  7640

Found for term_id =  451014  doc_id =  7640

Found for term_id =  798680  doc_id =  7640

Found for te

Found for term_id =  451014  doc_id =  4900

Found for term_id =  852425  doc_id =  4900

Found for term_id =  107714  doc_id =  4900

Found for term_id =  100105  doc_id =  4900

Found for term_id =  579591  doc_id =  4900

Found for term_id =  695339  doc_id =  4900

Found for term_id =  499926  doc_id =  4900

Found for term_id =  502717  doc_id =  4900

Found for term_id =  430292  doc_id =  4900

Found for term_id =  108133  doc_id =  4900

Found for term_id =  187294  doc_id =  4900

Found for term_id =  188285  doc_id =  4900

Found for term_id =  348342  doc_id =  4900

Found for term_id =  117128  doc_id =  4900

Found for term_id =  340298  doc_id =  4900

Found for term_id =  325447  doc_id =  4900

Found for term_id =  948041  doc_id =  4900

Found for term_id =  921412  doc_id =  4900

Found for term_id =  43086  doc_id =  4900

Found for term_id =  674519  doc_id =  4900

Found for term_id =  92128  doc_id =  4900

Found for term_id =  997803  doc_id =  4900

Found for te

Found for term_id =  870556  doc_id =  7572

Found for term_id =  999683  doc_id =  7572

Found for term_id =  95595  doc_id =  7572

Found for term_id =  56650  doc_id =  7572

Found for term_id =  997803  doc_id =  7572

Found for term_id =  498676  doc_id =  7572

Found for term_id =  755303  doc_id =  7572

Found for term_id =  755303  doc_id =  7572

Found for term_id =  511418  doc_id =  7572

Found for term_id =  2267  doc_id =  7572

Found for term_id =  448983  doc_id =  7572

Found for term_id =  90011  doc_id =  7572

Found for term_id =  398631  doc_id =  7572

Found for term_id =  821511  doc_id =  7572

Found for term_id =  393522  doc_id =  7572

Found for term_id =  544216  doc_id =  7572

Found for term_id =  544216  doc_id =  7572

Found for term_id =  292146  doc_id =  7572

Found for term_id =  846502  doc_id =  7572

Found for term_id =  901002  doc_id =  7572

Found for term_id =  405479  doc_id =  7572

Found for term_id =  468020  doc_id =  7572

Found for term_


Found for term_id =  960992  doc_id =  267

Found for term_id =  964280  doc_id =  267

Found for term_id =  984367  doc_id =  267

Found for term_id =  404392  doc_id =  267

Found for term_id =  236802  doc_id =  267

Found for term_id =  489397  doc_id =  267

Found for term_id =  95154  doc_id =  267

Found for term_id =  27356  doc_id =  267

Found for term_id =  837713  doc_id =  267

Found for term_id =  591563  doc_id =  267

Found for term_id =  16569  doc_id =  267

Found for term_id =  175201  doc_id =  267

Found for term_id =  300910  doc_id =  267

Found for term_id =  580255  doc_id =  267

Found for term_id =  801876  doc_id =  267

Found for term_id =  749029  doc_id =  267

Found for term_id =  24822  doc_id =  267

Found for term_id =  807601  doc_id =  267

Found for term_id =  438323  doc_id =  267

Found for term_id =  825820  doc_id =  267

Found for term_id =  825820  doc_id =  267

Found for term_id =  825820  doc_id =  267

Found for term_id =  451014  doc_id

# Unhashed Inverted Index

In [None]:
def unhashed_term_index_write_to_file(term_index_dot_txt, dict_for_termid_and_postings):
    """Append one line per term to the unhashed term-index file.

    Each line holds three tab-separated fields: the term id, the number of
    entries in that term's postings, and ``str()`` of the postings object.

    Parameters:
        term_index_dot_txt: path of the file to append to (created if missing).
        dict_for_termid_and_postings: mapping of term id -> postings; each
            postings value must support ``len()``.
    """
    # The context manager closes the handle even if a write raises.
    with open(term_index_dot_txt, "a+") as fh:
        for term_id, postings in dict_for_termid_and_postings.items():
            fields = (str(term_id), str(len(postings)), str(postings))
            fh.write("\t".join(fields))
            fh.write('\n')

In [None]:
def create_unhashed_inverted_index():
    """Build the sort-based ("unhashed") inverted index and write it to disk.

    Every processed file is stripped of its HTML headers, tokenized,
    stop-worded and stemmed with the helpers defined earlier in this
    notebook. One ``(doc_id, term_id)`` pair is recorded per token; the pairs
    are then sorted by term id and handed to
    ``create_postings_for_unhashed_ii`` to produce the postings.

    Output is produced through ``docid_write_to_file``,
    ``termid_write_to_file`` and ``unhashed_term_index_write_to_file``; the
    function itself returns ``None``.
    """
    # Get directory where all the .txt files are present.
    directory = get_directory_path("input")
    output_directory = get_directory_path("output")

    extension = ".txt"
    document_output_file_name = output_directory + "docid_unhashed" + extension
    term_output_file_name = output_directory + "termid_unhashed" + extension
    term_index_dot_txt = output_directory + "term_index_unhashed" + extension

    # Load all stop words into a stop_list.
    stop_list_path = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt"
    stop_list = read_text_in_list_form(stop_list_path)

    files = find_all_files_in_directory(directory)

    list_of_docs = list()  # doc id of every token, in reading order
    list_of_term_ids = list()  # term id of every token, parallel to list_of_docs
    all_collection_of_words = list()  # unique words, in first-seen order
    all_collection_of_word_ids = list()  # their term ids, parallel to the list above
    # word -> term id and the set of ids already handed out; both give O(1)
    # lookups instead of scanning the lists above for every token.
    term_id_of_word = dict()
    used_term_ids = set()

    # NOTE(review): only the first 1/200th of the files is processed --
    # presumably to keep experimental runs short; confirm before a real run.
    number_of_docs = int(len(files) / 200)

    # Draw all doc ids up front without replacement so two documents can
    # never share an id. The 1..8000 range is kept, and widened only when
    # there are more documents than ids.
    doc_ids = random.sample(range(1, max(8000, number_of_docs) + 1), number_of_docs)

    for i in range(number_of_docs):
        file_path = directory + files[i]  # no need for extension in both cases of file
        all_text_without_html_headers = remove_html_headers(file_path)  # str
        tokenized_words, _ = tokenize_the_text(all_text_without_html_headers)  # list
        tokenized_words_without_stop_words = remove_stop_words(tokenized_words, stop_list)  # list
        stemmed_tokens_of_words = stem_words(tokenized_words_without_stop_words)  # list

        doc_id = doc_ids[i]

        for current_word in stemmed_tokens_of_words:
            term_id = term_id_of_word.get(current_word)
            if term_id is None:
                # New word: draw random ids until an unused one turns up.
                term_id = random.randint(1, 100000)
                while term_id in used_term_ids:
                    term_id = random.randint(1, 100000)
                used_term_ids.add(term_id)
                term_id_of_word[current_word] = term_id
                all_collection_of_words.append(current_word)
                all_collection_of_word_ids.append(term_id)

            # Known or new, every token contributes one (doc, term) pair.
            list_of_docs.append(doc_id)
            list_of_term_ids.append(term_id)

        # Pass the document length as well, matching the call made by
        # create_hashed_inverted_index.
        docid_write_to_file(document_output_file_name, doc_id, files[i], len(stemmed_tokens_of_words))

    # Sort the pairs on the basis of term ids (stable, so reading order is
    # preserved within a term).
    list_of_doc_and_terms = sorted(
        zip(list_of_docs, list_of_term_ids), key=operator.itemgetter(1)
    )

    dict_for_termid_and_postings = create_postings_for_unhashed_ii(list_of_doc_and_terms)
    termid_write_to_file(term_output_file_name, all_collection_of_words, all_collection_of_word_ids)

    unhashed_term_index_write_to_file(term_index_dot_txt, dict_for_termid_and_postings)

    print("Happy Ending")
    print(len(all_collection_of_words))

To create the unhashed inverted index, execute the following function.

In [None]:
# Run the unhashed (sort-based) inverted-index build; it writes its output files.
create_unhashed_inverted_index()

In [101]:
# Scratch experiment: outer dictionary used to try out nested-dict behaviour.
D = {}

In [102]:
# Seed the outer dictionary with three plain string values.
D.update({1: 'A', 2: 'B', 3: 'C'})

In [103]:
# Scratch experiment: inner dictionary that will be nested inside D.
F = {}

In [104]:
# One list value and two string values, to compare how each behaves when nested.
F.update({11: ['G', 'L', 'M'], 12: 'H', 13: 'I'})

In [105]:
# Pull the list out of the inner dict, nest the inner dict under key 1 of D,
# then show the list.
T = F[11]
D[1] = F
print(T)

['G', 'L', 'M']


In [106]:
# Show D after F has been stored under key 1.
print(D)

{1: {11: ['G', 'L', 'M'], 12: 'H', 13: 'I'}, 2: 'B', 3: 'C'}


In [107]:
# Fetch the nested dictionary back out of D (same object, not a copy).
his = D[1]

In [108]:
# Show the nested dictionary retrieved from D.
print(his)

{11: ['G', 'L', 'M'], 12: 'H', 13: 'I'}


In [109]:
# Grab the list stored under key 11 and append to it in place; lst is the
# same list object that his[11] refers to.
lst = his[11]
lst.append('N')
print(lst)
#his.update({11: 'Q'})


['G', 'L', 'M', 'N']


In [110]:
# Store the (already mutated) list back under key 11.
his[11] = lst

In [111]:
# Show the nested dictionary after the list under key 11 gained an element.
print(his)

{11: ['G', 'L', 'M', 'N'], 12: 'H', 13: 'I'}


In [None]:
# Scratch experiment: a third dictionary, to be nested one level deeper.
Y = {}

In [None]:
# Two string entries for the innermost dictionary.
Y.update({20: 'LMO', 21: 'PQR'})

In [None]:
# Replace the list under key 11 with the dictionary Y.
his[11] = Y

In [None]:
# Show the nested dictionary after key 11 was pointed at Y.
print(his)

In [123]:
# Scratch experiment: dictionary used to check the type of a stored empty list.
FooDiy = {}

In [125]:
# Rebind FooDiy to a one-entry dict whose value is a fresh empty list.
empty_list = []
FooDiy = dict([('1', empty_list)])

In [127]:
# Read the value stored under the string key '1'.
x = FooDiy['1']

In [130]:
# Show the type of the value read back from the dictionary.
print(type(x))

<class 'list'>
