#  Relevance Assessments, IR Evaluation

In [156]:
from collections import defaultdict
from collections import OrderedDict
import math
import os.path  
from nltk.tokenize import word_tokenize 

##  Assessment graphical interface & Manual assessments

In [None]:
# IPython shell escape: runs main.py with Streamlit.
# NOTE(review): presumably main.py is the manual-assessment interface named in the heading above — confirm.
!streamlit run main.py

## Trec_eval Analysis

In [157]:
# Module-level store of the final relevant docs per query.
# key: query id
# value: a list of doc_ids that are considered relevant or strongly relevant,
#        i.e. whose average relevance score is >= 1
# It is filled in place by retrieve_relevance_assessment() and read by calculate_metrics().
rele_assess_dict = {}

### 1. Methods: parse the input

In [158]:
def retrieve_query_results(query_result_file):
    """Parse a ranked-list file into per-query document lists.

    Every non-blank line is split on whitespace; the first field is taken as
    the query id and the third as the doc id, any further fields are ignored.
    Doc ids are kept in the order they appear in the file, so the file should
    already be sorted by score before it is written.

    Parameters:
    - query_result_file (str): path of the ranked-list file to read.

    Returns:
    - OrderedDict: query id -> list of doc ids, in file order.

    Raises:
    - ValueError: if a non-blank line has fewer than three fields.
    """
    query_results = OrderedDict()
    with open(query_result_file, "r") as file:
        for line in file:
            fields = line.split()
            if not fields:
                # a blank (or whitespace-only) line carries no result => skip it
                continue
            query_id, _, doc_id, *_rest = fields  # only the query_id and doc_id are needed
            query_results.setdefault(query_id, []).append(doc_id)
    return query_results


In [159]:
def retrieve_relevance_assessment(qrel_file):
    """Read a qrel file and record the relevant documents of every query.

    Every non-blank line must hold four whitespace-separated fields: query id,
    an ignored field, doc id and an integer relevance grade. The same
    (query id, doc id) pair may appear on several lines; its grades are
    averaged and the document counts as relevant when the average is >= 1.

    The relevant doc ids are appended to the module-level ``rele_assess_dict``.
    The dict is not cleared here, so it has to be reset by the caller before a
    different qrel file is loaded.

    Parameters:
    - qrel_file (str): path of the qrel file to read.

    Returns:
    - dict: ``rele_assess_dict`` itself (query id -> list of relevant doc ids).

    Raises:
    - ValueError: if a non-blank line does not have exactly four fields or
      its relevance grade is not an integer.
    """
    # (query_id, doc_id) -> [sum of grades, number of grades]
    grade_totals = {}

    with open(qrel_file, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # blank line => nothing to record
                continue
            query_id, _, doc_id, relevance = fields
            totals = grade_totals.setdefault((query_id, doc_id), [0, 0])
            totals[0] += int(relevance)  # convert relevance to int for calculations
            totals[1] += 1

    # as there can be multiple scores for the same query_id and doc_id pair,
    # decide relevance on the average score
    for (query_id, doc_id), (grade_sum, grade_count) in grade_totals.items():
        if grade_sum / grade_count >= 1:
            rele_assess_dict.setdefault(query_id, []).append(doc_id)
    return rele_assess_dict


### 2. Methods: Printing Helper

In [160]:
def _format_mean(values):
    """Return the mean of values formatted with four decimals, or 'n/a' when there are none."""
    if not values:
        return "n/a"
    return "{:.4f}".format(math.fsum(values) / len(values))


def print_mean_vals(lst, desc, kVals=None, qid=""):
    """Print mean values in a structured and aligned format.

    Parameters:
    - lst: when kVals is empty, a flat list of numeric values whose mean is
      printed; otherwise a container indexed by each k in kVals (e.g. a dict
      keyed by cut-off) whose entries are lists of numeric values.
    - desc (str): A descriptive label printed before the values.
    - kVals (list): An optional list of keys/indices; one mean is printed per entry.
    - qid (str): An optional identifier to include in the printed output.

    A missing or empty group of values has no mean; it is printed as "n/a"
    instead of raising KeyError / IndexError / ZeroDivisionError.
    """
    print(desc)

    if not kVals:
        # no cut-offs given: print the mean of the entire list
        print("For all rel docs: {}".format(_format_mean(lst)))
        return

    # calculate and print the mean for each specified group
    for k in kVals:
        try:
            values = lst[k]
        except (KeyError, IndexError):
            values = ()  # nothing was recorded for this cut-off
        mean_text = _format_mean(values)
        # '{:5}' right-aligns the cut-off in a field of 5 chars
        if qid != "":
            print("  At {:5} docs for {}: {}".format(k, qid, mean_text))
        else:
            print("  At {:5} docs: {}".format(k, mean_text))

In [161]:
def print_metric(description, value, query_id):
    """Print one metric of one query as '<description> for <query_id>: <value>' with four decimals."""
    line = "{} for {}: {:.4f}".format(description, query_id, value)
    print(line)

### 3. Methods: Calculate Precisions & Other Metrics

In [162]:
def calculate_precision(relevant_number, rank):
    """Return the precision at a rank: relevant_number divided by rank.

    A falsy rank (e.g. 0) yields 0 so that no division by zero happens.
    """
    if not rank:
        return 0
    return relevant_number / rank

In [163]:
def calculate_recall(relevant_number, total_relevant):
    """Return the recall: the fraction of relevant documents retrieved by the system.

    Computed as relevant_number divided by total_relevant; a falsy
    total_relevant (e.g. 0) yields 0 instead of dividing by zero.
    """
    if not total_relevant:
        return 0
    return relevant_number / total_relevant

In [164]:
def calculate_f1(precision, recall):
    """Return the F1-measure, the harmonic mean of precision and recall.

    F = 2PR / (P + R); when P + R is zero the result is 0.
    """
    denominator = precision + recall
    if not denominator:
        return 0
    return (2 * precision * recall) / denominator

In [165]:
def calculate_nDCG(relevance_scores):
    """Return the normalized Discounted Cumulative Gain: nDCG = DCG / IDCG.

    DCG discounts the score at each 1-based rank by log(1 + rank). IDCG is the
    DCG of the same scores sorted in descending order, i.e. the ideal ordering
    is built only from the scores passed in. When IDCG is zero the result is 0.
    """
    def discounted_gain(scores):
        # natural log is used for both DCG and IDCG
        discounted = [
            gain / math.log(1.0 + position)
            for position, gain in enumerate(scores, start=1)
        ]
        return sum(discounted)

    actual = discounted_gain(relevance_scores)
    ideal = discounted_gain(sorted(relevance_scores, reverse=True))

    if not ideal:
        return 0
    return actual / ideal

In [166]:
def calculate_r_precision(rp, total_relevant):
    """Return R-Precision: rp divided by total_relevant.

    - rp: # of relevant docs counted for the query (supplied by the caller).
    - total_relevant: # of relevant docs that exist for the given query.
    A falsy total_relevant (e.g. 0) yields 0.
    """
    if not total_relevant:
        return 0
    return rp / total_relevant


# def calculate_r_precision(retrieved_relevant, total_relevant_docs):
#     return retrieved_relevant / total_relevant_docs if total_relevant_docs else 0

In [167]:
def update_metric(metric_dict, key, value):
    """Append value to the list stored under key in metric_dict, creating the list on first use."""
    metric_dict.setdefault(key, []).append(value)

In [168]:
def write_details(f, query_id, document, rank, is_relevant, precision, recall):
    """Write one result line for a (query id, doc id) pair to the open file f.

    The line holds query_id, document, rank, is_relevant, precision and recall
    separated by single spaces; precision and recall use four decimals.
    """
    line = "{} {} {} {} {:.4f} {:.4f}\n".format(
        query_id, document, rank, is_relevant, precision, recall
    )
    f.write(line)

In [169]:
def _record_cutoff(k, precision, recall, overall, per_query):
    """Append precision, recall and F1 at cut-off k to the overall and the per-query (P, R, F1) dicts."""
    values = (precision, recall, calculate_f1(precision, recall))
    for stores in (overall, per_query):
        for metric_dict, value in zip(stores, values):
            update_metric(metric_dict, k, value)


def calculate_metrics(query_results, option, details_path="details.txt"):
    """Calculate and print evaluation metrics for retrieved results.

    For every query that has at least one relevant document in the
    module-level ``rele_assess_dict`` this computes Average Precision (AP),
    R-Precision (RP) and nDCG, plus Precision, Recall and F1 at the rank
    cut-offs 5, 10, 20, 50 and 100. Queries without relevant documents are
    skipped. Rank, relevance flag, precision and recall of every retrieved
    document are written to ``details_path``.

    When a query retrieved fewer documents than a cut-off k, the values at k
    are still recorded (precision = relevant retrieved / k, recall = recall at
    the end of the list), so every evaluated query contributes to every
    cut-off and no cut-off is left without data.

    Parameters:
    - query_results (dict): query id -> list of doc ids in ranked order.
    - option (int): 1 additionally prints the metrics of each individual
      query; any other value prints only the summary.
    - details_path (str): file the per-document details are written to.
    """
    # initialize lists and dictionaries to store metrics
    k_vals = [5, 10, 20, 50, 100]
    AP, RP, NDCG = [], [], []  # Average Precision, R-Precision and nDCG, one entry per query
    P, R, F1 = {}, {}, {}      # Precision, Recall and F1 at each k value across all queries

    with open(details_path, "w") as f:

        for query_id, results in query_results.items():
            relevant_docs = rele_assess_dict.get(query_id, [])
            if not relevant_docs:
                continue

            # a set gives O(1) membership tests and ignores duplicated doc ids
            relevant_set = set(relevant_docs)
            total_relevant = len(relevant_set)

            # per-query accumulators
            relevance_scores = []
            p_temp, r_temp, f1_temp = {}, {}, {}
            p_sum, relevant_count, rp = 0, 0, 0

            # iterate over each document and its rank in the results
            for rank, document in enumerate(results, start=1):
                is_relevant = document in relevant_set
                relevant_count += is_relevant
                if rank <= total_relevant:
                    # R-Precision looks only at the first total_relevant ranks
                    rp = relevant_count
                precision = calculate_precision(relevant_count, rank)
                recall = calculate_recall(relevant_count, total_relevant)
                p_sum += precision * is_relevant  # only relevant ranks count towards AP

                # update metrics at predefined k values
                if rank in k_vals:
                    _record_cutoff(
                        rank, precision, recall, (P, R, F1), (p_temp, r_temp, f1_temp)
                    )

                # relevance flag for the nDCG calculation
                relevance_scores.append(is_relevant)
                # detailed results, later used for plotting
                write_details(
                    f, query_id, document, rank, is_relevant, precision, recall
                )

            # cut-offs beyond the end of the ranked list: nothing more was
            # retrieved, so the relevant count stays what it was at the end
            recall_at_end = calculate_recall(relevant_count, total_relevant)
            for k in k_vals:
                if k > len(results):
                    _record_cutoff(
                        k,
                        calculate_precision(relevant_count, k),
                        recall_at_end,
                        (P, R, F1),
                        (p_temp, r_temp, f1_temp),
                    )

            # calculate and store R-Precision, nDCG and Average Precision for the current query
            r_precision = calculate_r_precision(rp, total_relevant)
            RP.append(r_precision)

            # NOTE(review): calculate_nDCG builds its ideal ranking by sorting
            # the scores it is given, so relevant documents that were never
            # retrieved do not lower nDCG.
            ndcg = calculate_nDCG(relevance_scores)
            NDCG.append(ndcg)

            avg_precision = p_sum / total_relevant
            AP.append(avg_precision)

            # print the metrics detail for the current query if option 1 is selected
            if option == 1:
                print_metric("Average Precision", avg_precision, query_id)
                print_metric("R-Precision", r_precision, query_id)
                print_metric("nDCG", ndcg, query_id)
                print("\nPrecision@ Values")
                print_mean_vals(p_temp, "Mean Precision@", k_vals, query_id)
                print("\nRecall@ Values")
                print_mean_vals(r_temp, "Mean Recall@", k_vals, query_id)
                print("\nF1@ Values")
                print_mean_vals(f1_temp, "Mean F1@", k_vals, query_id)
                print("\n")

    # print summary for all queries
    print("========================================")
    print("============== Summary =================")
    print("========================================")
    if not AP:
        # no query had relevant documents, so there is nothing to average
        print("No queries with relevant documents were evaluated.")
        return
    print_mean_vals(AP, "\nAverage Precision:")
    print_mean_vals(RP, "\nR-Precision:")
    print_mean_vals(NDCG, "\nnDCG:")
    print_mean_vals(P, "\n========= Precision@k =========", k_vals)
    print_mean_vals(R, "\n=========== Recall@k ==========", k_vals)
    print_mean_vals(F1, "\n============ F1@k ============", k_vals)

### 4. Method: Main Run for Designed Trec-Eval

In [170]:
# Evaluation input files are looked up in the 'eval_files' folder under the
# current working directory.
cwd = os.getcwd() # get the current working directory
PATH_SCRIPT = os.path.abspath(cwd)  # absolute form of the working directory
PATH_DIR_EVAL = os.path.join(PATH_SCRIPT, 'eval_files')  # run_eval() joins the given file names onto this folder

In [171]:
def run_eval():
    """Prompt for a trec_eval-style command and run the evaluation.

    Expected input (both files are looked up in PATH_DIR_EVAL):

        trec_eval [-q] <qrel_file> <ranklist_file>

    With four tokens (the flag slot is filled) the metrics of every query are
    printed in addition to the summary; with three tokens only the summary is
    printed. Any other number of tokens prints a usage line and evaluates
    nothing.
    """
    # split() without an argument tolerates repeated, leading and trailing blanks
    cmd_params = input("Command: ").split()

    if len(cmd_params) == 4:
        option, file_names = 1, cmd_params[2:]
    elif len(cmd_params) == 3:
        option, file_names = 2, cmd_params[1:]
    else:
        print("Usage: trec_eval [-q] <qrel_file> <ranklist_file>")
        return

    qrel_name, ranklist_name = file_names
    PATH_QRELS_FILE = os.path.join(PATH_DIR_EVAL, qrel_name)
    PATH_QUERY_RESULT_FILE = os.path.join(PATH_DIR_EVAL, ranklist_name)

    # fills the module-level relevance store that calculate_metrics reads
    retrieve_relevance_assessment(PATH_QRELS_FILE)
    query_results = retrieve_query_results(PATH_QUERY_RESULT_FILE)
    calculate_metrics(query_results, option)

### 5. Call Trec Eval

#### a. For HW1 QREL file & Ranklist Files

In [172]:
# trec_eval qrels.adhoc.51-100.AP89.txt 1_query_result_es_builtin.txt
# trec_eval qrels.adhoc.51-100.AP89.txt 2_query_result_okapi_tf.txt
# trec_eval qrels.adhoc.51-100.AP89.txt 3_query_result_tfidf.txt
# trec_eval qrels.adhoc.51-100.AP89.txt 4_query_result_okapi_bm25.txt
# trec_eval qrels.adhoc.51-100.AP89.txt 5_query_result_lm_laplace.txt
# trec_eval qrels.adhoc.51-100.AP89.txt 6_query_result_lm_jm.txt

# trec_eval [-q] qrels.adhoc.51-100.AP89.txt 2_query_result_okapi_tf.txt

In [182]:
# Start from an empty relevance store: retrieve_relevance_assessment() appends to
# rele_assess_dict, so judgments loaded by an earlier run would otherwise carry over.
rele_assess_dict = {}
run_eval()

Command:  trec_eval qrels.adhoc.51-100.AP89.txt 2_query_result_okapi_tf.txt



Average Precision:
For all rel docs: 0.2316

R-Precision:
For all rel docs: 0.2573

nDCG:
For all rel docs: 0.6125

  At     5 docs: 0.4480
  At    10 docs: 0.3920
  At    20 docs: 0.3420
  At    50 docs: 0.2736
  At   100 docs: 0.1996

  At     5 docs: 0.0593
  At    10 docs: 0.0951
  At    20 docs: 0.1489
  At    50 docs: 0.2464
  At   100 docs: 0.3510

  At     5 docs: 0.0933
  At    10 docs: 0.1290
  At    20 docs: 0.1682
  At    50 docs: 0.2100
  At   100 docs: 0.2069


#### b. For QREL & Ranklist Files

##### Queries: Evaluation Rank list -- use elasticsearch built-in

In [91]:
# evaluation queries: query id -> query text
queries = {
    "152901": "West African Ebola epidemic",
    "152902": "H1N1 Swine Flu pandemic",
    "152903": "COVID 19",
}

# tokenize every query text: query id -> list of tokens
# (reads `queries`, the dict defined above; `QUERIES` is not defined in this notebook)
queries_map = {qid: word_tokenize(query) for qid, query in queries.items()}

In [92]:
import os

from elasticsearch import Elasticsearch

INDEX_NAME = 'crawler'

# Connection secrets are read from the environment so that they are not stored
# in the notebook.
# NOTE(review): a cloud id and password were previously committed in this cell;
# treat them as leaked and rotate them.
CLOUD_ID = os.environ.get("ES_CLOUD_ID")
ES_USERNAME = os.environ.get("ES_USERNAME", "elastic")
ES_PASSWORD = os.environ.get("ES_PASSWORD")
if not CLOUD_ID or not ES_PASSWORD:
    raise RuntimeError(
        "Set the ES_CLOUD_ID and ES_PASSWORD environment variables "
        "before connecting to Elasticsearch"
    )

es = Elasticsearch(request_timeout=10000,
                   cloud_id=CLOUD_ID,
                   http_auth=(ES_USERNAME, ES_PASSWORD))

# quick connectivity check
print(es.ping())

True


In [93]:
def es_search(q):
    """Run a match query on the 'content' field of INDEX_NAME and return the raw response.

    q is an iterable of string tokens; they are joined with single spaces to
    form the query text. Up to 1000 hits are requested.
    """
    query_text = ' '.join(q)
    match_clause = {'match': {'content': query_text}}
    request_body = {'query': match_clause, 'size': 1000}
    return es.search(index=INDEX_NAME, body=request_body)

In [174]:
"""
1. Use 'es_search' method to get the relavant document for each of the queries
2. Write the top 1000 documents - for es-model the docs are already in the sorted order based on the score 
"""
# ranked-list output file, written into the evaluation folder
PATH_OUTPUT_ES_BUILT_IN = os.path.join(PATH_DIR_EVAL, 'ranklist.txt')

# query id -> list of (doc id, score) tuples, in the order the hits were returned
es_result_dict = {}

with open(PATH_OUTPUT_ES_BUILT_IN, "w") as output_file:
    for q_num, q_tokens in queries_map.items():
        q_results = []
        output = es_search(q_tokens)
        hits = output['hits']['hits']
        # rank is the 1-based position of the hit in the response
        for rank, hit in enumerate(hits, start=1):
            docno = hit['_id']
            score = hit['_score']
            # one line per hit: "<query id> Q0 <doc id> <rank> <score> Exp"
            output_line = f"{q_num} Q0 {docno} {rank} {score} Exp"
            output_file.write(output_line + "\n")
            q_results.append((docno, score))  
        es_result_dict[q_num] = q_results

In [None]:
# trec_eval [-q] qrel.txt ranklist.txt
# trec_eval qrel.txt ranklist.txt

In [178]:
# Start from an empty relevance store: retrieve_relevance_assessment() appends to
# rele_assess_dict, so judgments loaded by an earlier run would otherwise carry over.
rele_assess_dict = {}
run_eval()

Command:  trec_eval qrel.txt ranklist.txt



Average Precision:
For all rel docs: 0.8966

R-Precision:
For all rel docs: 0.8765

nDCG:
For all rel docs: 0.9871

  At     5 docs: 1.0000
  At    10 docs: 1.0000
  At    20 docs: 1.0000
  At    50 docs: 0.9867
  At   100 docs: 0.9900

  At     5 docs: 0.0271
  At    10 docs: 0.0543
  At    20 docs: 0.1085
  At    50 docs: 0.2673
  At   100 docs: 0.5370

  At     5 docs: 0.0528
  At    10 docs: 0.1029
  At    20 docs: 0.1957
  At    50 docs: 0.4203
  At   100 docs: 0.6953


##  Precision-Recall Curves

In [179]:
def extract_precision_recall(details_file):
    """Read a details file and collect the (precision, recall) pair of every document.

    Only lines with exactly six whitespace-separated fields
    (query id, doc id, rank, relevance flag, precision, recall) are used;
    other lines are ignored.

    Returns a dict: query id -> {doc id: (precision, recall)} with both values as floats.
    """
    per_query = {}

    with open(details_file, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.split()
            if len(fields) != 6:
                continue

            query_id, doc_id, _rank, _is_relevant, precision_text, recall_text = fields
            point = (float(precision_text), float(recall_text))

            # doc_id is the key, so a repeated doc id keeps its last pair
            per_query.setdefault(query_id, {})[doc_id] = point

    return per_query

In [180]:
from matplotlib import pyplot as plt

def plot_curves(query_id, interpolated_prec_list, rec_list):
    """Plot precision against recall for one query and save it as '<query_id>.png'.

    rec_list supplies the x values and interpolated_prec_list the y values;
    both axes are fixed to the range 0..1. The figure is closed after saving.
    """
    axis_limits = [0, 1, 0, 1]  # x-min, x-max, y-min, y-max

    plt.figure()
    plt.plot(rec_list, interpolated_prec_list, marker='o', linewidth=0.1, markersize=3)
    plt.title(f"Precision-Recall Curve for Query {query_id}")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.grid(True)
    plt.axis(axis_limits)
    output_name = f"{query_id}.png"
    plt.savefig(output_name)
    plt.close()

def generate_plot_from_dict(query_id, query_dict):
    """Plot the precision-recall points stored for query_id.

    query_dict maps query id -> {doc id: (precision, recall)}. The points are
    sorted by recall (ties by precision) and handed to plot_curves. When the
    query id is not in query_dict a message is printed and nothing is plotted.
    """
    if query_id not in query_dict:
        print(f"No data found for query ID {query_id}")
        return

    # put recall first so that sorting orders the points by recall
    points = sorted(
        (recall, precision) for precision, recall in query_dict[query_id].values()
    )
    recalls, precisions = zip(*points)

    plot_curves(query_id, precisions, recalls)



In [181]:
# Load the per-document precision/recall values from details.txt (the file
# calculate_metrics() writes) and save one precision-recall plot per query id.
details_file = 'details.txt'
query_precision_recall_dict = extract_precision_recall(details_file)
generate_plot_from_dict('152901', query_precision_recall_dict)
generate_plot_from_dict('152902', query_precision_recall_dict)
generate_plot_from_dict('152903', query_precision_recall_dict)