# Vector space retrieval

This exercise is about scoring a (toy-sized) document collection against a query using various retrieval functions instantiated in the vector space model.

In [None]:
import math
from abc import ABC, abstractmethod
from typing import Dict, List, Tuple

import ipytest
import pytest

ipytest.autoconfig()

Term-document matrix.

In [None]:
# Type alias for a term-document matrix: maps each term to its list of
# per-document counts.
TD_MATRIX_TYPE = Dict[str, List[int]]
# Type alias for a ranking: list of (document ID, score) tuples.
DOCUMENT_SCORES_TYPE = List[Tuple[int, float]]
# Toy collection of five documents (IDs 0-4). Position i of each list is the
# number of occurrences of the term in document i.
TD_MATRIX = {
    "beijing": [0, 1, 0, 0, 1],
    "dish": [0, 1, 0, 0, 1],
    "duck": [3, 2, 2, 0, 1],
    "rabbit": [0, 0, 1, 1, 0],
    "recipe": [0, 0, 1, 1, 1],
}

## Scoring

The general scoring function is 

$$score(d,q) = \sum_{t \in q} w_{t,d} \times w_{t,q}$$

where $w_{t,d}$ is the term's weight in the document and $w_{t,q}$ is the term's weight in the query.

The `AbstractScorer` class below provides an abstract implementation of the above function. For a specific instantiation, you'll need to create a child class and implement `_get_query_term_weight()` and `_get_doc_term_weight()`.

For your convenience, the collection is provided in the form of a term-document matrix.

In [None]:
class AbstractScorer(ABC):
    """Abstract vector space scorer.

    Implements score(d, q) = sum over unique query terms t of
    w_{t,d} * w_{t,q}. Subclasses supply the two weights by implementing
    `_get_doc_term_weight()` and `_get_query_term_weight()`.
    """

    def __init__(self, td_matrix: TD_MATRIX_TYPE) -> None:
        """Initialize the scorer abstract class.

        Args:
            td_matrix: Term-document matrix mapping each term to a list of
                per-document counts; position i of every list refers to
                document i.
        """
        self._td_matrix = td_matrix
        # The number of documents is the length of a row (one count per
        # document), not the number of rows (that is the vocabulary size).
        self._num_docs = len(next(iter(td_matrix.values()), []))
        # Populated by _parse_query(); empty until a query has been parsed.
        self._query_terms = []

    def _parse_query(self, query: str) -> None:
        """Parse the query into vocabulary terms and store them.

        The query is split on whitespace; terms that do not occur in the
        term-document matrix are dropped. Duplicates are kept so that
        subclasses can compute query term frequencies.

        Args:
            query: Query string.
        """
        self._query_terms = [
            term for term in query.split() if term in self._td_matrix
        ]

    def score_documents(self, query: str) -> DOCUMENT_SCORES_TYPE:
        """Score all documents in the collection.

        Args:
            query: Query string.

        Returns:
            List of (document ID, score) tuples ordered by score descending,
            then by document ID ascending.
        """
        self._parse_query(query)
        scores = dict.fromkeys(range(self._num_docs), 0)

        # Each unique query term contributes once. dict.fromkeys() removes
        # duplicates while keeping query order, so the summation order (and
        # therefore floating point rounding) is reproducible across runs.
        for term in dict.fromkeys(self._query_terms):
            # The query weight does not depend on the document.
            query_weight = self._get_query_term_weight(term)
            for doc_id in range(self._num_docs):
                scores[doc_id] += (
                    self._get_doc_term_weight(doc_id, term) * query_weight
                )

        return sorted(scores.items(), key=lambda item: (-item[1], item[0]))

    @abstractmethod
    def _get_query_term_weight(self, term: str) -> float:
        """Return the weight w_{t,q} of a term in the current query.

        Args:
            term: Vocabulary term.

        Returns:
            Query term weight; this base implementation returns 1.
        """
        return 1

    @abstractmethod
    def _get_doc_term_weight(self, doc_id: int, term: str) -> float:
        """Return the weight w_{t,d} of a term in a document.

        Args:
            doc_id: Document ID (position in the term-document matrix rows).
            term: Vocabulary term.

        Returns:
            Document term weight; this base implementation returns 0.
        """
        return 0

## Task 1: Binary scorer

Set $w_{t,d}$ to $1$ if $t$ is present in the document, otherwise $0$.
Similarly, set $w_{t,q}$ to $1$ if $t$ is present in the query, otherwise $0$.

This method will then score documents based on the number of matching (unique) query terms.

In [None]:
class BinaryScorer(AbstractScorer):
    """Scorer with binary weights.

    A document's score is the number of unique query terms it contains.
    """

    def _get_query_term_weight(self, term: str) -> int:
        """Return 1 if the term is present in the parsed query, otherwise 0.

        Args:
            term: Vocabulary term.

        Returns:
            Binary query term weight.
        """
        # Fall back to an empty list if no query has been parsed yet.
        query_terms = self._query_terms or []
        return 1 if term in query_terms else 0

    def _get_doc_term_weight(self, doc_id: int, term: str) -> int:
        """Return 1 if the term occurs in the document, otherwise 0.

        Args:
            doc_id: Document ID.
            term: Vocabulary term.

        Returns:
            Binary document term weight.
        """
        return 1 if self._td_matrix[term][doc_id] > 0 else 0

Tests.

In [None]:
%%run_pytest[clean]

@pytest.mark.parametrize(
    "td_matrix,query,correct_values",
    [
        (
            TD_MATRIX,
            "beijing",
            [(1, 1), (4, 1), (0, 0), (2, 0), (3, 0)],
        ),
        (
            TD_MATRIX,
            "beijing duck recipe",
            [(4, 3), (1, 2), (2, 2), (0, 1), (3, 1)],
        ),
    ],
)
def test_binary_scorer(td_matrix: TD_MATRIX_TYPE, query: str, correct_values: DOCUMENT_SCORES_TYPE):
    """Check that BinaryScorer returns exactly the expected ranking."""
    ranking = BinaryScorer(td_matrix).score_documents(query)
    assert ranking == correct_values

## Task 2: TF scorer

Set $w_{t,d}=\frac{c_{t,d}}{|d|}$, that is, the relative frequency of the term in the document.

For $w_{t,q}$, use the frequency (count) of the term in the query.

In [None]:
class TFScorer(AbstractScorer):
    """Scorer using relative term frequency in the document and raw term
    count in the query.
    """

    def __init__(self, td_matrix: TD_MATRIX_TYPE) -> None:
        """Initialize TFScorer. Here, the lengths of documents are precomputed
        for more efficient scoring.

        Args:
            td_matrix: Term-document matrix mapping each term to a list of
                per-document counts.
        """
        super().__init__(td_matrix)
        # zip(*rows) transposes the matrix, so each tuple holds the counts of
        # all terms in one document; their sum is the document length.
        self._doc_len = {
            doc_id: sum(counts)
            for doc_id, counts in enumerate(zip(*td_matrix.values()))
        }

    def _get_query_term_weight(self, term: str) -> int:
        """Return the number of times the term occurs in the parsed query.

        Args:
            term: Vocabulary term.

        Returns:
            Count of the term in the query.
        """
        return self._query_terms.count(term)

    def _get_doc_term_weight(self, doc_id, term: str) -> float:
        """Return the relative frequency c_{t,d} / |d| of the term.

        Args:
            doc_id: Document ID.
            term: Vocabulary term.

        Returns:
            Relative term frequency; 0.0 for a document without any terms.
        """
        doc_len = self._doc_len[doc_id]
        if doc_len == 0:
            # An empty document cannot contain the term; avoid dividing by 0.
            return 0.0
        return self._td_matrix[term][doc_id] / doc_len

Tests.

In [None]:
%%run_pytest[clean]

@pytest.mark.parametrize("td_matrix,query,correct_values", [
    (TD_MATRIX, "beijing", [(1, 0.25), (4, 0.25), (0, 0), (2, 0), (3, 0)]),
    (TD_MATRIX, "duck duck", [(0, 2), (1, 1), (2, 1), (4, 0.5), (3, 0)]),
    (TD_MATRIX, "beijing duck recipe", [(0, 1.0), (1, 0.75), (2, 0.75), (4, 0.75), (3, 0.5)]),
])
def test_tf_scorer(td_matrix: TD_MATRIX_TYPE, query: str, correct_values: DOCUMENT_SCORES_TYPE):
    """Check the ranking and the scores produced by TFScorer."""
    scorer = TFScorer(td_matrix)
    ranking = scorer.score_documents(query)
    assert [x[0] for x in ranking] == [x[0] for x in correct_values]  # Checking ranking
    # Scores are floats, so compare with a tolerance instead of exact equality.
    assert [x[1] for x in ranking] == pytest.approx([x[1] for x in correct_values])  # Checking scores

## Task 3: TF-IDF scorer

Implement the scoring function 

$$score(d,q) = \sum_{t \in q} tf_{t,q} \times tf_{t,d} \times idf_t$$

Use normalized frequencies for TF weight, i.e., $tf_{t,d}=\frac{c_{t,d}}{|d|}$, where $c_{t,d}$ is the number of occurrences of term $t$ in document $d$ and $|d|$ is the document length (=total number of terms). (Analogously for the query.)

Compute IDF values using the following formula: $idf_{t}=\log \frac{N}{n_t}$, where $N$ is the total number of documents and $n_t$ is the number of documents that contain term $t$. Use the base-10 logarithm.

In [None]:
class TFIDFScorer(AbstractScorer):
    """Scorer computing sum over query terms of tf_{t,q} * tf_{t,d} * idf_t.

    Both TF components are length-normalized; the IDF factor is folded into
    the document term weight.
    """

    def __init__(self, td_matrix: TD_MATRIX_TYPE) -> None:
        """Initialize TFIDFScorer. Here, both document lengths and IDF values
        are precomputed.

        Args:
            td_matrix: Term-document matrix mapping each term to a list of
                per-document counts.
        """
        super().__init__(td_matrix)
        # zip(*rows) transposes the matrix, so each tuple holds the counts of
        # all terms in one document; their sum is the document length.
        self._doc_len = {
            doc_id: sum(counts)
            for doc_id, counts in enumerate(zip(*td_matrix.values()))
        }
        # idf_t = log10(N / n_t), with N the number of documents (one count
        # per document in each row) and n_t the number of documents that
        # contain the term.
        self._idf = {}
        for term, counts in td_matrix.items():
            doc_freq = sum(1 for count in counts if count > 0)
            # A term that occurs in no document cannot match anything; give
            # it a zero IDF rather than dividing by zero.
            self._idf[term] = (
                math.log10(len(counts) / doc_freq) if doc_freq else 0.0
            )

    def _get_query_term_weight(self, term: str) -> float:
        """Return the normalized frequency c_{t,q} / |q| of the term.

        The query length |q| is the number of parsed query terms, i.e., only
        terms that are part of the vocabulary are counted.

        Args:
            term: Vocabulary term.

        Returns:
            Normalized query term frequency; 0.0 for an empty query.
        """
        query_len = len(self._query_terms)
        if query_len == 0:
            return 0.0
        return self._query_terms.count(term) / query_len

    def _get_doc_term_weight(self, doc_id: int, term: str) -> float:
        """Return tf_{t,d} * idf_t for the term in the document.

        Args:
            doc_id: Document ID.
            term: Vocabulary term.

        Returns:
            Normalized term frequency multiplied by the term's IDF; 0.0 for a
            document without any terms.
        """
        doc_len = self._doc_len[doc_id]
        if doc_len == 0:
            # An empty document cannot contain the term; avoid dividing by 0.
            return 0.0
        return self._td_matrix[term][doc_id] / doc_len * self._idf[term]

Tests.

In [None]:
%%run_pytest[clean]

@pytest.mark.parametrize(
    "td_matrix,query,correct_values",
    [
        (
            TD_MATRIX,
            "beijing",
            [(1, 0.0995), (4, 0.0995), (0, 0), (2, 0), (3, 0)],
        ),
        (
            TD_MATRIX,
            "duck duck",
            [(0, 0.0969), (1, 0.0485), (2, 0.0485), (4, 0.0242), (3, 0)],
        ),
        (
            TD_MATRIX,
            "beijing duck recipe",
            [(4, 0.0597), (1, 0.0493), (3, 0.0369), (2, 0.0346), (0, 0.0323)],
        ),
    ],
)
def test_tfidf_scorer(td_matrix: TD_MATRIX_TYPE, query: str, correct_values: DOCUMENT_SCORES_TYPE):
    """Check the document order exactly and the scores within 1% relative
    tolerance.
    """
    ranking = TFIDFScorer(td_matrix).score_documents(query)

    # The order of document IDs must match exactly.
    expected_ids = [doc_id for doc_id, _ in correct_values]
    actual_ids = [doc_id for doc_id, _ in ranking]
    assert actual_ids == expected_ids

    # The expected scores are rounded, so compare them approximately.
    expected_scores = [score for _, score in correct_values]
    actual_scores = [score for _, score in ranking]
    assert actual_scores == pytest.approx(expected_scores, rel=1e-2)