# Assignment 3 - LM retrieval

Scoring documents using the Language Model (LM) approach on a single document field. In this example, we use Jelinek-Mercer (JM) smoothing.

In [None]:
import math
import urllib
import requests
import json

Indices.

In [None]:
# Base URL of the search API; the request wrappers (search, term_vectors, analyze)
# build their URLs by joining the index name and endpoint onto it.
API = "http://gustav1.ux.uis.no:5002"

In [None]:
BASIC_INDEX_NAME = "clueweb12b"
ANCHORS_INDEX_NAME = "clueweb12b_anchors"


def get_index_name(field):
    """
    Return the name of the index that serves the given document field.

    param field: name of the document field.
    return: the anchors index name for the "anchors" field, the basic index name for any other field.
    """
    if field == "anchors":
        return ANCHORS_INDEX_NAME
    return BASIC_INDEX_NAME

Queries.

In [None]:
# Path of the query file passed to load_queries (one "<query id> <query text>" entry per line).
QUERY_FILE = "data/queries.txt"

Some functions wrapping the API calls.

In [None]:
def search(indexname, query, field, size=10):
    """
    Send a search request to the API and return the decoded JSON response.

    param indexname: name of the index to search.
    param query: query string, sent as the `q` parameter.
    param field: field name, sent as the `df` parameter.
    param size: number of hits to request, sent as the `size` parameter.
    """
    endpoint = "/".join([API, indexname, "_search"])
    query_string = urllib.parse.urlencode({"q": query, "df": field, "size": size})
    body = requests.get(endpoint + "?" + query_string).text
    return json.loads(body)

In [None]:
def term_vectors(indexname, doc_id, term_statistics=False):
    """
    Request the term vectors of a document and return the decoded JSON response.

    param indexname: name of the index holding the document.
    param doc_id: ID of the document.
    param term_statistics: Boolean; True iff term_statistics are required.
    """
    # The flag is sent as the lowercased string form of the Python value ("true"/"false" for booleans)
    flag = str(term_statistics).lower()
    endpoint = "/".join([API, indexname, doc_id, "_termvectors"])
    query_string = urllib.parse.urlencode({"term_statistics": flag})
    body = requests.get(endpoint + "?" + query_string).text
    return json.loads(body)

In [None]:
def analyze(indexname, text):
    """
    Run text through the `_analyze` endpoint of an index and return the resulting tokens.

    param indexname: name of the index whose `_analyze` endpoint is called.
    param text: string to analyze.
    return: list of token strings, ordered by their reported position.
    """
    endpoint = "/".join([API, indexname, "_analyze"])
    body = requests.get(endpoint + "?" + urllib.parse.urlencode({"text": text})).text
    # A response without a "tokens" entry yields an empty term list
    found = json.loads(body).get("tokens", [])
    in_order = sorted(found, key=lambda tok: tok["position"])
    return [tok["token"] for tok in in_order]

Document fields used for scoring.

In [None]:
# Document fields for which CollectionLM precomputes the collection term probabilities.
FIELDS = ["title", "content", "anchors"]

Smoothing: we use Jelinek-Mercer smoothing here with the following lambda parameter. (I.e., the same smoothing parameter is used for all fields.)

In [None]:
# Jelinek-Mercer interpolation weight: in score_lm the collection model P(t|C) is weighted by
# LAMBDA and the document model P(t|d) by (1 - LAMBDA).
LAMBDA = 0.1

Load the queries from the file.

In [None]:
def load_queries(query_file):
    """
    Load the queries from a file.

    Each non-blank line must hold a query ID and the query text, separated by a single space.
    Blank (whitespace-only) lines, such as a trailing empty line, are skipped.

    param query_file: path of the query file.
    return: dict mapping query ID (string) to query text (string).
    raises ValueError: if a non-blank line has no space-separated query text after the ID.
    """
    queries = {}
    with open(query_file, "r") as fin:
        for line in fin:  # stream the file instead of loading every line up front
            line = line.strip()
            if not line:
                # an empty line cannot be unpacked into (qid, query); ignore it
                continue
            qid, query = line.split(" ", 1)
            queries[qid] = query
    return queries

In [None]:
# Load all queries once at module level; score_queries reads this `queries` dict.
queries = load_queries(QUERY_FILE)
print(len(queries))  # number of queries loaded

## LM scorer

### Collection Language Model class

In [None]:
class CollectionLM(object):
    """
    Collection language model holding P(t|C_i) for each field in FIELDS and each query term.

    All probabilities are fetched through the API when the object is constructed;
    `prob` afterwards only reads the cached values.
    """

    def __init__(self, qterms):
        """
        param qterms: list of analyzed query terms (may contain repeated terms).
        """
        self._probs = {}
        # A repeated query term has the same collection probability, and every lookup costs
        # API requests, so each distinct term is looked up only once (first-seen order kept).
        distinct_terms = list(dict.fromkeys(qterms))
        # computing P(t|C_i) for each field and for each query term
        for field in FIELDS:
            self._probs[field] = {t: self.__get_prob(field, t) for t in distinct_terms}

    def __get_prob(self, field, term):
        """
        Compute P(t|C) for one term in one field from the global term statistics.

        param field: name of the document field.
        param term: analyzed query term.
        return: ttf / sum_ttf for that field, or 0 if the search returns no document for the term.
        """
        # use a boolean query to find a document that contains the term
        index_name = get_index_name(field)
        hits = search(index_name, term, field, size=1).get("hits", {}).get("hits", {})
        doc_id = hits[0]["_id"] if len(hits) > 0 else None
        if doc_id is not None:
            # ask for global term statistics when requesting the term vector of that doc
            # (`term_statistics=True`)
            tv = term_vectors(index_name, doc_id, term_statistics=True)["term_vectors"][field]
            ttf = tv["terms"].get(term, {}).get("ttf", 0)  # total term count in the collection (in that field)
            sum_ttf = tv["field_statistics"]["sum_ttf"]
            return ttf / sum_ttf

        return 0  # this only happens if none of the documents contain that term

    def prob(self, field, term):
        """
        Return the cached P(t|C) for the given field and term.

        return: the stored probability, or 0 if the field or the term was not seen at construction.
        """
        return self._probs.get(field, {}).get(term, 0)

### Document scorer

In [None]:
def score_lm(clm, qterms, doc_id, field):
    """
    Score a document field against a query with a Jelinek-Mercer smoothed language model.

    param clm: collection language model; its `prob(field, term)` supplies P(t|C).
    param qterms: list of analyzed query terms.
    param doc_id: ID of the document to score.
    param field: name of the document field to score on.
    return: log P(q|d), i.e., the sum of log P(t|theta_d) over the query terms; terms whose
        smoothed probability is 0 contribute nothing to the sum.
    """
    # Getting term frequency statistics for the given document field from Elasticsearch
    # Note that global term statistics are not needed
    index_name = get_index_name(field)
    tv = term_vectors(index_name, doc_id).get("term_vectors", {})

    # Term statistics of the field; left empty when the response has no entry for that field
    field_terms = tv[field]["terms"] if field in tv else {}

    # compute field length $|d|$ as the sum of all term frequencies in the field
    len_d = sum(stats["term_freq"] for stats in field_terms.values())

    score = 0  # log P(q|d)
    for t in qterms:
        if len_d > 0:
            Pt_d = field_terms.get(t, {}).get("term_freq", 0) / len_d  # $P(t|d)$
        else:  # empty document field: avoid dividing by a zero length
            Pt_d = 0
        Pt_C = clm.prob(field, t)  # $P(t|C)$
        Pt_theta_d = (1 - LAMBDA) * Pt_d + LAMBDA * Pt_C  # $P(t|\theta_{d})$ with J-M smoothing
        # Pt_theta_d is 0 if t doesn't occur in any doc for that field, even with smoothing;
        # log(0) is undefined, so such a term is skipped
        if Pt_theta_d > 0:
            score += math.log(Pt_theta_d)

    return score

## Main

In [None]:
def score_queries(field):
    """
    Re-rank the baseline results of every loaded query with the LM scorer and write a run file.

    For each query in the module-level `queries` dict, the top 200 baseline hits are fetched,
    re-scored with `score_lm` on the given field, and the 20 best document IDs are written to
    "data/lm_jm_<field>.runfile.txt" as "QueryId,DocumentId" lines.

    param field: name of the document field to retrieve and score on.
    """
    index_name = get_index_name(field)
    max_rank = 20

    output_file = "data/lm_jm_{}.runfile.txt".format(field)
    print("Outputting to {}".format(output_file))
    with open(output_file, "w") as fout:
        fout.write("QueryId,DocumentId\n")  # header
        for qid, query in sorted(queries.items()):
            # get the top 200 docs of the baseline ranking
            print("\tGet baseline ranking for [%s] '%s'" % (qid, query))
            res = search(index_name, query, field, size=200).get('hits', {})

            # re-score docs using LM
            print("\tRe-scoring documents using LM")
            # get analyzed query
            qterms = analyze(index_name, query)
            # get collection LM
            # (this needs to be instantiated only once per query and can be used for scoring all documents)
            clm = CollectionLM(qterms)
            scores = {}
            for doc in res.get("hits", {}):
                doc_id = doc.get("_id")
                scores[doc_id] = score_lm(clm, qterms, doc_id, field)

            # write top 20 results to file, best score first; the slice already enforces
            # the max_rank cut-off, so no separate rank check is needed
            ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            for doc_id, _ in ranked[:max_rank]:
                fout.write(qid + "," + doc_id + "\n")

In [None]:
# Only the "content" field is scored here; loop over FIELDS instead to produce
# a run file for every field.
for field in ["content"]:
    score_queries(field)