In [1]:
# NOTE: To get files for this assignment # 2, run inverted index 5.0 and get files from there.

# Overview

In this assignment, you will use the index you created in Assignment 1 to rank documents 
and create a search engine. You will implement two different scoring functions and compare 
their results against a baseline ranking produced by expert analysts.

# Running Queries


For this assignment, you will need the following two files:

<font color=red>  </font> <font color=blue> topics.xml (\\sandata\xeon\Maryam Bashir\Information Retrieval\topics.xml) </font> contains the queries you will be testing. 

You should run the queries using the text stored in the <font color=green> query </font> elements. The <font color=green> description </font> elements are only there to clarify the information need which the query is trying to express</font> .


<font color=red>  </font> <font color=blue> corpus.qrel (\\sandata\xeon\Maryam Bashir\Information Retrieval\corpus.qrel)</font> contains the relevance grades from expert assessors. While these grades are not necessarily entirely correct (and defining correctness unambiguously is quite difficult), they are fairly reliable and we will treat them as being correct here. 

The format here is:
<font color=green> topic </font> <font color=green> 0 </font> <font color=green> docid </font> <font color=green> grade </font>

<font color=red> o </font> <font color=green> topic </font> is the ID of the query for which the document was assessed.

<font color=red> o </font> <font color=green> 0 </font> is part of the format and can be ignored.

<font color=red> o </font> <font color=green> docid </font> is the name of one of the documents which you have indexed.

<font color=red> o </font> <font color=green> grade </font> is a value in the set <font color=blue> {-2, 0, 1, 2, 3, 4} </font>, where a higher value means that the document is more relevant to the query. 
The value -2 indicates a spam document, and 0 indicates a non-spam document which is completely non- relevant. 
Most queries do not have any document with a grade of 4, and many queries do not have any document with a grade of 3.
This is a consequence of the specific meaning assigned to these grades here and the manner in which the documents were collected.

This <font color=green> QREL </font> does not have assessments for every  <font color=blue>(query, document) </font> pair. If an assessment is missing, we assume the correct grade for the pair is 0 (non-relevant).

You will write a program which takes the name of a scoring function as a command line argument and which prints a ranked list of documents for all queries found in topics.xml using that scoring function. 

For example:

<font color=red> $ </font>  <font color=green> ./query.py --score TF-IDF </font> 

<font color=blue> 202 clueweb12-0000tw-13-04988 1 0.73 run1 </font> 

<font color=blue> 202 clueweb12-0000tw-13-04901 2 0.33 run1 </font>  

<font color=blue> 202 clueweb12-0000tw-13-04932 3 0.32 run1 </font>  ...

<font color=blue> 214 clueweb12-0000tw-13-05088 1 0.73 run1 </font> 

<font color=blue> 214 clueweb12-0000tw-13-05001 2 0.33 run1 </font> 

<font color=blue> 214 clueweb12-0000tw-13-05032 3 0.32 run1 </font> ...

<font color=blue> 250 clueweb12-0000tw-13-05032 500 0.002 run1 </font>


The output should have one row for each document which your program ranks for each query it runs. 
These lines should have the format:

<font color=green> topic </font> <font color=green> docid </font> <font color=green> rank </font> <font color=green> score </font> <font color=green> run </font>

<font color=red>  </font> <font color=green> topic </font> is the ID of the query for which the document was ranked.

<font color=red>  </font> <font color=green> docid </font> is the document identifier.

<font color=red>  </font> <font color=green> rank </font> is the order in which to present the document to the user. The document with the highest score will be assigned a rank of 1, the second highest a rank of 2, and so on.

<font color=red>  </font> <font color=green> score </font> is the actual score the document obtained for that query.

<font color=red>  </font> <font color=green> run </font> is the name of the run. You can use any value here. It is meant to allow research teams to submit multiple runs for evaluation in competitions such as TREC.

In [2]:
from bs4 import BeautifulSoup
import re
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
import random
import os
import operator
import xml.dom.minidom
import numpy as np
import math
import sys
#from sets import Set
#from html.parser import HTMLParser

In [3]:
def get_directory_path(mode):
    """
    It takes only path of folder, no file name.
    It only returns the folder which contain all the text file.
    
    Argument:
    mode -- string specifying input or output for directory
    
    Returns:
    dp -- directory path which contains all the txt files.
    """
    if (mode == "input"):   
        dp = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/hw2/input/"
    elif (mode == "output"):
        dp = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/hw2/out/"
    else:
        raise ValueError('Unspecified mode for I/O.')
        dp = None

    return dp

In [4]:
# Function : read_stop_list
def read_text_in_list_form(file_path):
    """
    This function takes the path of stop words file and reads it and returns a list of words.
    
    Argument:
    stop_file_path -- path should be like: "(dir) + file_Name.extension"
        "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/stoplist.txt".
    
    Returns:
    lst -- list of words containg all the stop_words.
    """
    
    lst = [line.rstrip('\n') for line in open(file_path)]
    return lst

In [5]:
# stop_word_file = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/hw2/input/stoplist.txt"
# stop_words = read_text_in_list_form(stop_word_file)

In [6]:
## Version 2.0 using dictionary
def xml_parser(file_name):
    """
    It takes file name of xml file, which will contain the
    queries with their unique topics-ids.
    It returns the dictionary which contain topic-ids as keys and query of words values.
    
    Argument:
    file_name -- this will be "(dir)+topics.xml"
    
    Returns:
    queries -- directory with topic-ids as keys and query of words values.
    """
    doc = xml.dom.minidom.parse(file_name)
    qrys = doc.getElementsByTagName('query')
    tpcs = doc.getElementsByTagName('topic')
    queries = dict()
    i = 0
    for elem in qrys:
        queries[tpcs[i].attributes['number'].value] = elem.firstChild.data
        i = i + 1
    return queries

In [7]:
# file_name = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/hw2/input/topics.xml"
# qrys = xml_parser(file_name)
# #print(qrys[str(202)])
# print(qrys)

# Query Processing

Before running any scoring function, you should process the text of the query in exactly the same way that you processed the text of a document. That is:
1. Split the query into tokens (it is most correct to use the regular expression, but for these queries it suffices to split on whitespace)
2. Convert all tokens to lowercase
3. Apply stop-wording to the query using the same list you used in assignment 1
4. Apply the same stemming algorithm to the query which you used in your indexer

In [8]:
def stem_words(tokenized_words_without_stop_words):
    """
    This function takes in list of words which do not contain stop_words.
    It uses the PorterStemmer() to reduce the words to their root word.
    
    Argument:
    removed_all_stop_words -- list of all words which do not have stop_words.
    
    Returns:
    stemmed_words -- list of words which are reduced to their origin word.
    """
    ps = PorterStemmer()
    stemmed_words = list()
    for w in tokenized_words_without_stop_words:
        stemmed_words.append(ps.stem(w))
    stemmed_words.sort()
    return stemmed_words

In [9]:
def query_processing(query_string):
    """
    This function takes in a query string and does the pre-processing on it.
    It will first load stop words from directory, then split the query into
    single-single terms. Then remove stop words from it.
    
    Argument:
    query_string -- a string of query.
    
    Returns:
    stemmed_tokens -- tokens of query after being stemmed.
    """
    path_to_stop_words = get_directory_path("input") +"stoplist.txt"
    stop_words = read_text_in_list_form(path_to_stop_words)
    #splited_query = list(re.split(query))
    splited_query = list(query_string.split())
    #splited_query.lower()
    cleaned_tokens_from_stop_words =  list(set(splited_query) - set(stop_words))
    stemmed_tokens = stem_words(cleaned_tokens_from_stop_words)
    return stemmed_tokens

In [10]:
# #print(type(qrys[str(202)]))
# x = query_processing(qrys[str(202)])
# print((x))
# print(len(x))

# Scoring Function 1: Okapi BM25

Implement BM25 scores. This should use the following scoring function for document d and query q:
    
Where k1,k2, and b are constants. For start, you can use the values suggested in the lecture on BM25 (k1 = 1.2, k2 varies from 0 to 1000, b = 0.75). Feel free to experiment with different values for these
constants to learn their effect and try to improve performance.

In [11]:
def get_all_documents_length(docid_hashed_file):
    """
    It takes file name of a file which will contain doc-ids and
    adjacent to it will be document name and length of each document.
    
    It only returns the dictionary which will have unique DOC_IDS as KEYS
    and documents length as values.
    
    Argument:
    docid_hashed_file -- this will be "(dir)+docid_hashed.txt"
    
    Returns:
    doc_lengths -- is a dictionary as {doc-id : length_of_doc}.
    """
    doc_lengths = dict()
    file = open(docid_hashed_file, 'r' , encoding = "utf-8")
    for each_line in file:
        x = each_line.split()
        doc_lengths[x[0]] = x[2]
    return doc_lengths 

In [12]:
# docid_hashed_file = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/hw2/input/docid_hashed.txt"
# all_doc_lengths = get_all_documents_length(docid_hashed_file)
# #c = int(all_doc_lengths[str(3058)])
# #print(int(all_doc_lengths[str(3058)]))
# print((all_doc_lengths))

In [13]:
def get_all_documents_name(docid_hashed_file):
    """
    It takes file name of a file which will contain doc-ids and
    adjacent to it will be document name and length of each document.
    
    It only returns the dictionary which will have unique DOC_IDS as KEYS
    and document names as values.
    
    Argument:
    docid_hashed_file -- this will be "(dir)+docid_hashed.txt"
    
    Returns:
    doc_name-- it is a dictionary as {doc-id : doc-name}.
    """
    doc_names = dict()
    i = 0
    file = open(docid_hashed_file,'r',encoding = "utf-8")
    
    for each_line in file:
        x = each_line.split()
        doc_names[x[0]] = x[1]
    return doc_names

In [14]:
# docid_hashed_file = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/hw2/input/docid_hashed.txt"
# all_doc_names = get_all_documents_name(docid_hashed_file)
# print((all_doc_names))

In [15]:
def get_all_vocablury(file_name):
    """
    It takes file name of term-ids which will contain termids and
    adjacent to termids will be terms.
    It only returns the dictionary which will have unique TERMS as KEYS
    and TERM_IDS will be the VALUES..
    
    Argument:
    file_name -- this will be "(dir)+termid_hashed.txt"
    
    Returns:
    vocablury -- it is a dictionary as {terms: term-ids}.
    """
    vocablury = dict()
    file = open(file_name,'r',encoding = "utf-8")
    for each_line in file:
        x = each_line.split()
        #Interesting: making term as key and term_id as value (;
        vocablury[x[1]] = x[0]
    return vocablury

In [16]:
# voc_file = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/hw2/input/termid_hashed.txt"
# vocablury = get_all_vocablury(voc_file)
# #print(vocablury)

In [17]:
#### version 2.0
def get_document_postings(document_postings_file):
    """
    It takes file name of a file which will contain termids and
    adjacent to it will be list of documents in which this termid appears.
    
    It only returns the dictionary which will have unique TERMS_IDS as KEYS
    and document list as values.
    
    Argument:
    document_postings_file -- this will be "(dir)+document_postings.txt"
    
    Returns:
    termid_with_doc_postings -- it is a dictionary as {terms-ids: list(documents)}.
    """
    
    termid_with_doc_postings = dict()
    file = open(document_postings_file,'r',encoding = "utf-8")
    for each_line in file:
        x = (re.split("\n",each_line))
        #print("x = " , (x), " len of x = " ,len(x))
        y = (re.split("\t", x[0]))
        #print("y = " , (y), " len of y = " ,len(y))
        
        temp_str = y[(len(y)-1)]
        temp_str = temp_str.strip("[]")
        temp_str = temp_str.replace(" ", "")
        
        lst = list()
        lst = temp_str.split(",")
        
        current_term_id = y[0]
        # create a key in dict_with_docid_and_its_positions
        termid_with_doc_postings[current_term_id] = lst
    return termid_with_doc_postings

In [18]:
# doc_postings_file = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/hw2/input/document_postings.txt"
# dict_termid_with_docs_postings = get_document_postings(doc_postings_file)
# #print(dict_termid_with_docs_postings)

In [19]:
# version 2.0
def get_inverted_index(term_index_hashed_file):
    """
    It takes file name of a inverted index file which will contain
    term-ids and all of it's documents with all of it's positions.
    
    It returns the Nested dictionary which will have unique term-id
    as key to outer dictionary and on a single term-id there may 
    have multiple documents, these documents will be used a keys to
    inner dictionary, and on a single-document-id there may have
    multiple positions on which that term appeared.
    
    Argument:
    term_index_hashed_file -- this will be file name as "(dir)+docid_hashed.txt"
    
    Returns:
    nested_dict_with_termid_and_its_docs_and_occurance -- it is a dictionary as {term-id : { doc-id : positions }}.
    """
    nested_dict_with_termid_and_its_docs_and_occurance = dict()
    doc_id_with_positions = dict()
    file = open(term_index_hashed_file, 'r' , encoding = "utf-8")
    length = 0
    current_term_id = 0
    for each_line in file:
        x = (re.split("\n",each_line))
        y = (re.split("\t", x[0]))
        
        if y[0] == '':
            y = y[1:]
        
        # It means a New term_id aya hai, tou dict ki new key bnani
        if (len(y) == 4):
            # pick up last element, which will be list of positions in one document.
            # and filter it with strip(), replace() methods.
            temp_str = y[(len(y)-1)]
            temp_str = temp_str.strip("[]")
            temp_str = temp_str.replace(" ", "")
            lst = list()
            lst = temp_str.split(",")
            
            # reset length variable
            doc_id_with_positions = dict()
            length = 0
            # new term_id now found
            current_term_id = y[0]
            current_document_id = y[1]
            # create a key in dict_with_docid_and_its_positions
            doc_id_with_positions[current_document_id] = lst
            
            #length = len(lst) #len((y[(len(y)-1)]))
            nested_dict_with_termid_and_its_docs_and_occurance[current_term_id] = doc_id_with_positions
            
        elif (len(y) == 3):
            # pick up last element, which will be list of positions in one document.
            # and filter it with strip(), replace() methods.
            temp_str = y[(len(y)-1)]
            temp_str = temp_str.strip("[]")
            temp_str = temp_str.replace(" ", "")
            lst = list()
            lst = temp_str.split(",")
            current_document_id = y[0]
            doc_id_with_positions[current_document_id] = lst

            nested_dict_with_termid_and_its_docs_and_occurance.update({current_term_id:doc_id_with_positions})
        else:
            print("\nI Don't know what to do.\n")   
    return nested_dict_with_termid_and_its_docs_and_occurance

In [20]:
# hashed_term_id = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/hw2/input/term_index_hashed.txt"
# dict_term_id_with_frequencies = get_term_frequency(hashed_term_id)

# print((dict_term_id_with_frequencies[str(583007)]))
# #print((dict_term_id_with_frequencies[str(583007)][str(5256)]))
# #print(len(dict_term_id_with_frequencies[str(583007)][str(5256)]))
# #print(dict_term_id_with_frequencies)

In [21]:
### build version 2.3 : Final version
# Changes Made: 
# finally removed unnecessary comments & print statements, and added proper comments &
# replaced doc_id with doc_names as key of docs_score_for_each_query()
def calculate_okapi_bm25(parameters):
    k_1 = parameters['k1']
    k_2 = parameters['k2']
    b = parameters['b']
    D = parameters['D']
    
    input_directory = get_directory_path("input")
    output_directory = get_directory_path("output")
    txt_extension = ".txt"
    xml_extension = ".xml"
    
    queries_file_name = input_directory + "topics" + xml_extension
    queries_dict = xml_parser(queries_file_name) 
    
    doc_info_file = input_directory+ "docid_hashed" + txt_extension
    all_doc_lengths = get_all_documents_length(doc_info_file)
    
    docid_hashed_file = input_directory + "docid_hashed" + txt_extension
    all_doc_names = get_all_documents_name(docid_hashed_file)
    
    voc_file = input_directory + "termid_hashed" + txt_extension
    vocablury = get_all_vocablury(voc_file)
    
    doc_postings_file = input_directory + "document_postings" + txt_extension
    dict_termid_with_docs_postings = get_document_postings(doc_postings_file)
    
    inverted_index_file = input_directory + "term_index_hashed" + txt_extension
    hashed_ii = get_inverted_index(inverted_index_file)
    #hashed_ii
    
    avg_length = 0
    for doc_id, length in all_doc_lengths.items():
        avg_length += int(length)
    avg_length /= len(all_doc_lengths) 
       
    scores_dictionary = dict()
    # Will run for number of times of queries in topics.xml
    for query_id, query in queries_dict.items(): # run for 10 times
        
        # Split one query in terms
        splitted_query = query_processing(query)
        score_of_each_term_for_single_doc = 0
        # Reset : docs_score_for_each_query.
        docs_score_for_each_query = dict()
        #scores_dictionary[query_id] = dict()    
        # For loop for each term in single queries,
        # i.e. if there is 3 word query, it will run for 3 times.
        for i in range(0,len(splitted_query)): 
#             score_of_each_term_for_single_doc = 0
            # Check If this splitted term (from query) exists in my vocablury.
            if splitted_query[i] in vocablury: # if that term exists
                # If YES, then get term_id of this splitted term (from query).
                # Now, get value = term_id by passing term as key.
                term_id = vocablury[splitted_query[i]]
                
                # Get list of documents in which this term exists
                list_of_all_docs_in_which_term_exists = dict_termid_with_docs_postings[term_id]
                
                # Get it's document_frequency, i.e. In how many docs it is present.
                df_i = len(list_of_all_docs_in_which_term_exists)
                
                # Now, run this loop for all docs in which it is present.
                # i.e if it is present in 3 docs, it will run for 3 
                for j in range(0,len(list_of_all_docs_in_which_term_exists)):
                    # Now, pick one by one doc_id, and compute score.
                    doc_id = list_of_all_docs_in_which_term_exists[j]
                    doc_name = all_doc_names[doc_id]
                    # Check IF that doc_id is present in my doc_postings file
                    if doc_id in all_doc_lengths:
                        # Get this term's frquency in this document
                        tf_d_i = len(hashed_ii[str(term_id)][doc_id])
                        tf_q_i = 1
                        length_of_doc_id = int(all_doc_lengths[doc_id])
                        capital_K = k_1 * ((1-b) + (b * (length_of_doc_id/avg_length)))
                        a = float(D + 0.5)
                        b = float(df_i + 0.5)
                        c = float(math.log(a/b))
                        d = float((1+k_1) * tf_d_i)
                        e = float(capital_K + tf_d_i)
                        f = float((1+k_2) * tf_q_i)
                        g = float(k_2+tf_q_i)
                        score_of_each_term_for_single_doc = c * (d/e) * (f/g)
                        
                        # Check IF already a term might have calculated score for this document (for same query).
                        # Or we can say, multiple term words of single query might present in same document.
                        # If YES: Else NO
                        if doc_name in docs_score_for_each_query:
                            prev_score = docs_score_for_each_query[doc_name]
                            new_score = prev_score + score_of_each_term_for_single_doc
                            docs_score_for_each_query[doc_name] = new_score
                            sorted_docs_score_for_each_query = sorted(docs_score_for_each_query.items(), key=operator.itemgetter(1), reverse=True)
                            scores_dictionary[query_id] = sorted_docs_score_for_each_query
                        # Or maybe we found new document. Let's create a new key of doc_id on same query_id
                        else:
                            docs_score_for_each_query[doc_name] = dict()
                            docs_score_for_each_query[doc_name] = score_of_each_term_for_single_doc
                            sorted_docs_score_for_each_query = sorted(docs_score_for_each_query.items(), key=operator.itemgetter(1),reverse=True)
                            scores_dictionary[query_id] = sorted_docs_score_for_each_query
                            #scores_dictionary[query_id] = dict()
                            scores_dictionary[query_id] = sorted_docs_score_for_each_query
                    # Or maybe there is a document which is not in my possession.
                    else:
                        docs_score_for_each_query[doc_name] = 0
                        scores_dictionary[query_id] = docs_score_for_each_query
            # If this term is not in my Vocablury
            else: 
                print("Terms of Queries which are not in my collection = ", splitted_query[i])
    
    return scores_dictionary

In [22]:
parameters = dict()
parameters['k1'] = 1.2 
parameters['k2'] = 500
parameters['b'] = 0.75
parameters['D'] = 17

okapi_bmi25_score = calculate_okapi_bm25(parameters)
#print(okapi_bmi25_score[str(214)])
#a = (okapi_bmi25_score[str(214)])[0]
#print(a[1])
#print((okapi_bmi25_score[str(214)])[1])

Terms of Queries which are not in my collection =  uss
Terms of Queries which are not in my collection =  2008
Terms of Queries which are not in my collection =  world'


In [119]:
## version v3.0
def dirichlet_smoothing():
    
    input_directory = get_directory_path("input")
    output_directory = get_directory_path("output")
    txt_extension = ".txt"
    xml_extension = ".xml"
    
    queries_file_name = input_directory + "topics" + xml_extension
    queries_dict = xml_parser(queries_file_name) 
    
    doc_info_file = input_directory+ "docid_hashed" + txt_extension
    all_doc_lengths = get_all_documents_length(doc_info_file)
    
    docid_hashed_file = input_directory + "docid_hashed" + txt_extension
    all_doc_names = get_all_documents_name(docid_hashed_file)
    
    voc_file = input_directory + "termid_hashed" + txt_extension
    vocablury = get_all_vocablury(voc_file)
    
    doc_postings_file = input_directory + "document_postings" + txt_extension
    dict_termid_with_docs_postings = get_document_postings(doc_postings_file)
    
    inverted_index_file = input_directory + "term_index_hashed" + txt_extension
    hashed_ii = get_inverted_index(inverted_index_file)
    
    mu = 0
    total_length_of_collection = 0
    for doc_id, length in all_doc_lengths.items():
        total_length_of_collection += int(length)
    mu = total_length_of_collection/len(all_doc_lengths) 
    
    scores_dictionary = dict()
    # Will run for number of times of queries in topics.xml
    for query_id, query in queries_dict.items(): # run for 10 times
        # Split one query in terms
        splitted_query = query_processing(query)
        score_of_each_term_for_single_doc = 0
        # Reset : docs_score_for_each_query.
        docs_score_for_each_query = dict()
        
        # For loop for each term in single queries,
        # i.e. if there is 3 word query, it will run for 3 times.
        for i in range(0,len(splitted_query)):            
            # Check If this splitted term (from query) exists in my vocablury.
            if splitted_query[i] in vocablury: # if that term exists
                # If YES, then get term_id of this splitted term (from query).
                # Now, get value = term_id by passing term as key.
                term_id = vocablury[splitted_query[i]]
                
                # Get list of documents in which this term exists.
                list_of_all_docs_in_which_term_exists = dict_termid_with_docs_postings[(term_id)]
                
                # Count the number of times each word occurs in Corpora, divide by total length.
                # Basically add-up all lengths of documents.
                sum_of_term_in_whole_corpora = 0
                for d_idx, positinos in hashed_ii[term_id].items():
                    sum_of_term_in_whole_corpora += len(positinos) # len(hashed_ii[term_id][positinos]) #
                prob_of_term_occuring_in_whole_corpora = sum_of_term_in_whole_corpora/total_length_of_collection
                
                # Now, run this loop for all docs in which it is present.
                # i.e if it is present in 3 docs, it will run for 3.
                for j in range(0,len(list_of_all_docs_in_which_term_exists)):
                    # Now, pick one by one doc_id, and compute score.
                    doc_id = list_of_all_docs_in_which_term_exists[j]
                    doc_name = all_doc_names[doc_id]
                    # Check IF that doc_id is present in my doc_postings file.
                    if doc_id in all_doc_lengths:
                        N = int(all_doc_lengths[(doc_id)]) # doc length
                        lamdba = N/(N+mu)
                        one_minus_lamdba = 1 - lamdba
                        
                        # Count the number of times word occurs in document, divide by document length.
                        count_of_term_in_single_doc = len(hashed_ii[term_id][doc_id])
                        prob_occuring_in_single_doc = count_of_term_in_single_doc / N
                        
                        score_of_each_term_for_single_doc = (lamdba * prob_occuring_in_single_doc) + (one_minus_lamdba * prob_of_term_occuring_in_whole_corpora)
                                                
                        # Check IF already a term might have calculated score for this document (for same query).
                        # Or we can say, multiple term words of single query might present in same document.
                        # If YES: Else NO
                        if doc_name in docs_score_for_each_query:
                            prev_score = docs_score_for_each_query[doc_name]
                            new_score = prev_score + score_of_each_term_for_single_doc
                            docs_score_for_each_query.update({doc_name : new_score})
                            sorted_docs_score_for_each_query = sorted(docs_score_for_each_query.items(), key=operator.itemgetter(1), reverse=True)
                            scores_dictionary[query_id] = sorted_docs_score_for_each_query
                        # Or maybe we found new document. Let's create a new key of doc_id on same query_id.
                        else:
                            docs_score_for_each_query[doc_name] = dict()
                            docs_score_for_each_query[doc_name] = score_of_each_term_for_single_doc
                            sorted_docs_score_for_each_query = sorted(docs_score_for_each_query.items(), key=operator.itemgetter(1),reverse=True)
                            scores_dictionary[query_id] = sorted_docs_score_for_each_query
                            #scores_dictionary[query_id] = dict()
                            scores_dictionary[query_id] = sorted_docs_score_for_each_query
                    # Or maybe there is a document which is not in my possession.
                    else:
                        docs_score_for_each_query[doc_name] = 0
                        scores_dictionary[query_id] = docs_score_for_each_query
            # If this term is not in my Vocablury.
            else: 
                print("Terms of Queries which are not in my collection = ", splitted_query[i])

    return scores_dictionary

In [120]:
dirichlet_score = dirichlet_smoothing()
#print(dirichlet_score)

Terms of Queries which are not in my collection =  uss
Terms of Queries which are not in my collection =  vinson
Terms of Queries which are not in my collection =  gain
Terms of Queries which are not in my collection =  2008
Terms of Queries which are not in my collection =  dog
Terms of Queries which are not in my collection =  world'
Terms of Queries which are not in my collection =  war


In [33]:
def save_scores(output_file_name, scores_dictionary):
    fh = open(output_file_name, "a+")
    for query_id , doc_score_list in scores_dictionary.items():
        # term_id 
        q_id = str(query_id)
        i = 1
        for j in range(0, len(doc_score_list)):
            doc_name = doc_score_list[j][0]
            score = doc_score_list[j][1]
            lne = q_id  + "\t"+ doc_name + "\t" + str(i)+"\t"+ str(score) + (" run 1")

            fh.write(lne)
            fh.write('\n')
            i+=1
    fh.close()

In [None]:
okapi_output_file = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/hw2/out/okapi_scores.txt"
### okapi_bmi25_score -> getting from okapi_bmi25_score = calculate_okapi_bm25(parameters)
save_scores(output_file, okapi_bmi25_score) 


#dirichlet_output_file = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/hw2/out/dirichlet_scores.txt"
#### dirichlet_score -> getting from dirichlet_score = dirichlet_smoothing()
#save_scores(output_file, dirichlet_score) 

In [35]:
def print_socre(scores_dictionary):
    for query_id , doc_score_list in scores_dictionary.items():
                # term_id 
        q_id = str(query_id)
        i = 1
        for j in range(0, len(doc_score_list)):
            doc_name = doc_score_list[j][0]
            score = doc_score_list[j][1]
            lne = q_id  + "\t"+ doc_name + "\t" + str(i)+"\t"+ str(score) + (" run 1")
            print(lne)
            i+=1

In [36]:
print("\nOkapi BM-25 Score :\n")
print_socre(okapi_bmi25_score)


#print("\nDirichlet Score :\n")
#print_socre(dirichlet_score)


Okapi BM-25 Score :

202	clueweb12-0204wb-17-00480	1	8.126698979794838 run 1
202	clueweb12-1415wb-76-20214	2	4.670830536077839 run 1
202	clueweb12-1203wb-58-13078	3	0.9337631738906944 run 1
202	clueweb12-0406wb-30-19250	4	0.8612996517533233 run 1
202	clueweb12-0412wb-03-03995	5	0.6205382964103825 run 1
202	clueweb12-1810wb-56-08993	6	0.6036646938714822 run 1
202	clueweb12-1802wb-29-21108	7	0.5725286974766479 run 1
202	clueweb12-0500tw-06-07919	8	0.5444473136066619 run 1
202	clueweb12-0209wb-09-11391	9	0.4372246791503997 run 1
202	clueweb12-0700tw-75-05625	10	0.42065626177402704 run 1
202	clueweb12-0805wb-29-00101	11	0.3777164354007483 run 1
202	clueweb12-1703wb-12-05247	12	0.3640580200574394 run 1
202	clueweb12-1304wb-56-00880	13	0.3601921004502982 run 1
202	clueweb12-1019wb-55-07377	14	0.3427313469400397 run 1
202	clueweb12-1118wb-26-26616	15	0.31817313540208736 run 1
202	clueweb12-1006wb-17-21364	16	0.30651100069531034 run 1
202	clueweb12-0611wb-11-24228	17	0.3009232230752728 run 1


221	clueweb12-0311wb-40-25169	152	0.10227421691898303 run 1
221	clueweb12-0815wb-87-20872	153	0.1019724240509754 run 1
221	clueweb12-0700tw-75-05625	154	0.09942422250252939 run 1
221	clueweb12-1914wb-04-28610	155	0.09880154211672762 run 1
221	clueweb12-0405wb-70-11124	156	0.09812767810565348 run 1
221	clueweb12-0006wb-55-32231	157	0.097601183713935 run 1
221	clueweb12-0510wb-07-22753	158	0.09755453251747857 run 1
221	clueweb12-0102wb-06-30154	159	0.09737086926260723 run 1
221	clueweb12-0208wb-79-12562	160	0.09680715566046152 run 1
221	clueweb12-0102wb-02-09029	161	0.09640806033648526 run 1
221	clueweb12-0400wb-12-12146	162	0.09547630897195473 run 1
221	clueweb12-0013wb-43-12760	163	0.09504314182029651 run 1
221	clueweb12-1400tw-39-04004	164	0.09449311561439504 run 1
221	clueweb12-0403wb-44-21305	165	0.0941166913976921 run 1
221	clueweb12-0210wb-37-21495	166	0.09321683249486444 run 1
221	clueweb12-1803wb-49-14258	167	0.09108217049331194 run 1
221	clueweb12-0200wb-93-11027	168	0.09013580

230	clueweb12-0601wb-22-26137	448	-0.17143875783914375 run 1
230	clueweb12-1100tw-02-21309	449	-0.1776485974770977 run 1
230	clueweb12-1100tw-02-21308	450	-0.1793862561832037 run 1
230	clueweb12-1714wb-21-23241	451	-0.1793862561832037 run 1
230	clueweb12-1100tw-02-21307	452	-0.18115824599097585 run 1
230	clueweb12-0711wb-94-01978	453	-0.18296559452833427 run 1
230	clueweb12-1705wb-05-04305	454	-0.1831331998977908 run 1
230	clueweb12-0210wb-69-31955	455	-0.18480937085180954 run 1
230	clueweb12-0716wb-29-01979	456	-0.18669068755564336 run 1
230	clueweb12-1412wb-81-22868	457	-0.19461526327227252 run 1
230	clueweb12-1200tw-37-10025	458	-0.20267835865656791 run 1
230	clueweb12-1300tw-82-00951	459	-0.21023211525921537 run 1
230	clueweb12-1304wb-64-01725	460	-0.24056790471295328 run 1
230	clueweb12-1310wb-92-05169	461	-0.25389011467445005 run 1
230	clueweb12-1104wb-63-08767	462	-0.28551304246822473 run 1
230	clueweb12-1713wb-61-31957	463	-0.29475234286412194 run 1
230	clueweb12-0917wb-00-2910

246	clueweb12-0502wb-40-05005	139	0.1350034272841182 run 1
246	clueweb12-0609wb-72-06631	140	0.13437036528824867 run 1
246	clueweb12-1913wb-77-14480	141	0.1340620992273794 run 1
246	clueweb12-0611wb-11-24228	142	0.1337516441162454 run 1
246	clueweb12-1017wb-33-07476	143	0.13365189618926415 run 1
246	clueweb12-0609wb-34-31489	144	0.13279526478744536 run 1
246	clueweb12-1415wb-97-03335	145	0.13065446230928174 run 1
246	clueweb12-0207wb-02-00245	146	0.130582982905954 run 1
246	clueweb12-1416wb-03-28994	147	0.13004997847072858 run 1
246	clueweb12-1415wb-97-03327	148	0.12945106229056147 run 1
246	clueweb12-0608wb-06-27359	149	0.1287123991513566 run 1
246	clueweb12-0311wb-88-27660	150	0.1287123991513566 run 1
246	clueweb12-0402wb-62-27154	151	0.12806843495460532 run 1
246	clueweb12-1415wb-58-20068	152	0.1276869609574076 run 1
246	clueweb12-0208wb-14-27366	153	0.1266964278852824 run 1
246	clueweb12-0609wb-34-31491	154	0.1254430004511518 run 1
246	clueweb12-0608wb-48-08482	155	0.12511475541853

250	clueweb12-0002wb-14-02885	826	-0.059649571487837706 run 1
250	clueweb12-1500tw-08-02990	827	-0.06134905141299965 run 1
250	clueweb12-0311wb-40-25200	828	-0.06154093461704889 run 1
250	clueweb12-0311wb-60-09175	829	-0.06167871376033636 run 1
250	clueweb12-0500wb-45-06174	830	-0.06181711123606788 run 1
250	clueweb12-0406wb-98-34015	831	-0.06194582962196406 run 1
250	clueweb12-0103wb-45-23812	832	-0.06206690533883254 run 1
250	clueweb12-0306wb-74-10457	833	-0.06270216584552359 run 1
250	clueweb12-0712wb-98-02604	834	-0.06285169609879931 run 1
250	clueweb12-1714wb-21-23241	835	-0.06317490840379891 run 1
250	clueweb12-1118wb-10-11735	836	-0.06349515978221955 run 1
250	clueweb12-1300tw-13-03461	837	-0.06415445443933496 run 1
250	clueweb12-0900tw-05-12646	838	-0.0656933820637276 run 1
250	clueweb12-1412wb-81-22868	839	-0.06608404748267477 run 1
250	clueweb12-0607wb-55-14514	840	-0.0668080010626082 run 1
250	clueweb12-0402wb-26-09985	841	-0.06902221340992644 run 1
250	clueweb12-1118wb-60-0

# Evaluation


To evaluate your results, we will write a program that computes mean average precision of the 
rank list of documents for different queries. The input to program will be the <font color=blue> qrel file 
(relevance judgments) </font> and scoring file that has rank list of documents. 

The output should be following measures: 
    
<font color=red>  </font> <font color=green> P@5  </font>

<font color=red>  </font> <font color=green> P@10 </font>

<font color=red>  </font> <font color=green> P@20 </font>

<font color=red>  </font> <font color=green> P@30 </font>

<font color=red>  </font> <font color=green> MAP </font>

These measures should be computed for each query. Average for all queries should also be computed.


In [37]:
def qrel_reader_and_parser(qrel_file):
    nested_dict_with_topics_and_its_docs_and_grades = dict()
    doc_id_with_grades = dict()
    file = open(qrel_file, 'r' , encoding = "utf-8")
    length = 0

    current_topic_id = 0
    for each_line in file:
        x = (re.split("\n",each_line))
        y = (re.split(" ", x[0]))
        #print(y)
        
        current_topic_id = y[0]
        # If current_topic already is already in my dict
        if current_topic_id in nested_dict_with_topics_and_its_docs_and_grades: 
            current_doc_name = y[2]
            doc_id_with_grades[current_doc_name] = int()
            
            val = 0
            if int(y[3]) > 0:
                val = 1
            
            # name_of_doc                        # grade
            doc_id_with_grades[current_doc_name] = val
            nested_dict_with_topics_and_its_docs_and_grades.update({current_topic_id : doc_id_with_grades})
        
        # Current_topic pehle se mujood naaii hai
        else:
            # Now new topic found, thus reset.
            nested_dict_with_topics_and_its_docs_and_grades[current_topic_id] = dict()
            doc_id_with_grades = dict()

            current_doc_name = y[2]
            doc_id_with_grades[current_doc_name] = int()
            
            val = 0
            if int(y[3]) > 0:
                val = 1
            
            # name_of_doc                       # grade
            doc_id_with_grades[current_doc_name] = val
            nested_dict_with_topics_and_its_docs_and_grades.update({current_topic_id : doc_id_with_grades})
            
    return nested_dict_with_topics_and_its_docs_and_grades

In [38]:
qrel_file_path = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/hw2/input/relevance judgements.qrel"
dict_for_qrel = qrel_reader_and_parser(qrel_file_path)
#print(dict_for_qrel)

In [44]:
def calculate_precison(qrel_file_path, precision_mode, score_algorithm):
    
    dict_for_qrel = qrel_reader_and_parser(qrel_file_path)
    
    if (score_algorithm == "dirichlet"):
        dirichlet_score = dirichlet_smoothing()
    elif (score_algorithm == "bm25"):
        parameters = dict()
        parameters['k1'] = 1.2 
        parameters['k2'] = 500
        parameters['b'] = 0.75
        parameters['D'] = 17
        # okapi_bmi25_score = calculate_okapi_bm25(parameters)
        # Naming it as dirichlet_score, because then I would have to change 
        # dictionary name dirichley_socre everywhere in code.
        dirichlet_score = calculate_okapi_bm25(parameters)
    else: 
        raise ValueError('Unspecified Scoring algorithm.')
        sys.exit(0)
        
    if precision_mode == "p5":
        dict_for_precision = dict()
        precision = 0
        relevant = 0
        #retrieved2 = 5
        for query_id, zipi in dirichlet_score.items():
            count = 0
            retrieved = len(dirichlet_score[query_id])
            for i in range(0,len(zipi)):
                #print(zipi[i][0]) = Doc_name
                #print(zipi[i][0]) = calculated_score
                doc_name = zipi[i][0]
                if(count == 5):
                    break
                # IF THAT FOLDER EXISTS IN QREL AND HAS SCORE 1
                if doc_name in dict_for_qrel[query_id] and dict_for_qrel[query_id][doc_name] > 0:
                    relevant+=1
                count+=1
                #retrieved2+=1
                precision = float(relevant/retrieved)
            dict_for_precision[query_id] = precision
                
    elif precision_mode == "p10":
        dict_for_precision = dict()
        precision = 0
        relevant = 0
        #retrieved2 = 10
        for query_id, zipi in dirichlet_score.items():
            count = 0
            retrieved = len(dirichlet_score[query_id])
            for i in range(0,len(zipi)):
                #print(zipi[i][0]) = Doc_name
                #print(zipi[i][0]) = calculated_score
                doc_name = zipi[i][0]
                if(count == 10):
                    break
                # IF THAT FOLDER EXISTS IN QREL AND HAS SCORE 1
                if doc_name in dict_for_qrel[query_id] and dict_for_qrel[query_id][doc_name] > 0:
                    relevant+=1
                count+=1
                #retrieved2+=1
                precision = float(relevant/retrieved)
            dict_for_precision[query_id] = precision
        
    elif precision_mode == "p20":
        dict_for_precision = dict()
        precision = 0
        relevant = 0
        #retrieved2 = 20
        for query_id, zipi in dirichlet_score.items():
            count = 0
            retrieved = len(dirichlet_score[query_id])
            for i in range(0,len(zipi)):
                #print(zipi[i][0]) = Doc_name
                #print(zipi[i][0]) = calculated_score
                doc_name = zipi[i][0]
                if(count == 20):
                    break
                # IF THAT FOLDER EXISTS IN QREL AND HAS SCORE 1
                if doc_name in dict_for_qrel[query_id] and dict_for_qrel[query_id][doc_name] > 0:
                    relevant+=1
                count+=1
                #retrieved2+=1
                precision = float(relevant/retrieved)
            dict_for_precision[query_id] = precision
    
    elif precision_mode == "p30":
        dict_for_precision = dict()
        precision = 0
        relevant = 0
        #retrieved2 = 30
        for query_id, zipi in dirichlet_score.items():
            count = 0
            retrieved = len(dirichlet_score[query_id])
            for i in range(0,len(zipi)):
                #print(zipi[i][0]) = Doc_name
                #print(zipi[i][0]) = calculated_score
                doc_name = zipi[i][0]
                if(count == 30):
                    break
                # IF THAT FOLDER EXISTS IN QREL AND HAS SCORE 1
                if doc_name in dict_for_qrel[query_id] and dict_for_qrel[query_id][doc_name] > 0:
                    relevant+=1
                count+=1
                #retrieved2+=1
                precision = float(relevant/retrieved)
            dict_for_precision[query_id] = precision
        
    elif precision_mode == "map":
        print("TODO")
    
    else:
        raise ValueError('Unspecified mode for Precision.')
        sys.exit(0)
        
    print(dict_for_precision)

In [45]:
qrel_file_path = "/Users/imbilalbutt/Documents/Semesters/Semester 9/Information Retrieval/Assignment/hw2/input/relevance judgements.qrel"
i_mode = "p30"

#score_algorithm = "dirichlet"
score_algorithm = "bm25"

calculate_precison(qrel_file_path, i_mode, score_algorithm)

Terms of Queries which are not in my collection =  uss
Terms of Queries which are not in my collection =  2008
Terms of Queries which are not in my collection =  world'
{'202': 0.0, '214': 0.013123359580052493, '216': 0.04107648725212465, '221': 0.03612167300380228, '227': 0.06144393241167435, '230': 0.08847736625514403, '234': 0.04482758620689655, '243': 0.08144796380090498, '246': 0.04344512195121951, '250': 0.05930470347648262}


In [128]:
D = dict()

In [129]:
ls1 = ['1','2','3']
ls2 = ['1','2','3']

In [130]:
s = list(zip(ls1,ls2))

In [131]:
len(s)

3

In [132]:
print(s[0][0])

1
