In [10]:
from sys import modules

IN_COLAB = 'google.colab' in modules
if IN_COLAB:
    !pip install -q ir_axioms[examples] python-terrier

In [11]:
# Bring up PyTerrier exactly once per kernel session.
from pyterrier import init, started

pyterrier_running = started()
if not pyterrier_running:
    # Only initialise when PyTerrier has not been started yet.
    init(tqdm="auto")

In [12]:
from pyterrier.datasets import Dataset, get_dataset

# Identifier of the collection; the base dataset and its train/test splits
# are all resolved through PyTerrier's "irds:" dataset prefix.
dataset_name = "antique"
irds_id = f"irds:{dataset_name}"

dataset: Dataset = get_dataset(irds_id)
dataset_train: Dataset = get_dataset(f"{irds_id}/train")
dataset_test: Dataset = get_dataset(f"{irds_id}/test")

In [13]:
from pathlib import Path

# Root folder for everything this notebook caches on disk.
cache_dir = Path("cache/")

# Slashes in the dataset name are replaced so that the name maps to a single
# folder below "cache/indices".
index_folder_name = dataset_name.replace("/", "-")
index_dir = cache_dir.joinpath("indices", index_folder_name)

In [14]:
from pyterrier.index import IterDictIndexer

# Build the index only when its directory does not exist yet; an existing
# directory is reused without re-indexing.
index_missing = not index_dir.exists()
if index_missing:
    index_path = str(index_dir.absolute())
    indexer = IterDictIndexer(index_path)
    corpus_iter = dataset.get_corpus_iter()
    indexer.index(corpus_iter, fields=["text"])

In [15]:
from pyterrier.batchretrieve import BatchRetrieve

# BM25 baseline: retrieve up to 20 results per query from the local index.
index_path = str(index_dir.absolute())
pipeline_bm25 = BatchRetrieve(index_path, wmodel="BM25", num_results=20)

# Run the baseline on the test topics; as the last expression of the cell,
# the resulting ranking is displayed.
test_topics = dataset_test.get_topics()
pipeline_bm25(test_topics)

Unnamed: 0,qid,docid,docno,rank,score,query
0,3990512,102622,3077638_1,0,15.887435,how can we get concentration onsomething
1,3990512,30676,3931664_0,1,15.621619,how can we get concentration onsomething
2,3990512,173781,4366141_0,2,15.395085,how can we get concentration onsomething
3,3990512,179429,1011598_10,3,15.134176,how can we get concentration onsomething
4,3990512,194913,4222212_0,4,15.134176,how can we get concentration onsomething
...,...,...,...,...,...,...
3877,1340574,136249,2323025_6,17,25.933856,why do some people only go to church on easter...
3878,1340574,116651,2036141_2,18,25.540481,why do some people only go to church on easter...
3879,1340574,123602,1757874_0,19,25.338768,why do some people only go to church on easter...
3880,1971899,234068,1971899_6,0,28.234530,what is masturbat


In [16]:
from pathlib import Path

# Directory passed as `cache_dir` to the axiomatic re-ranker below.
# It is the same location as the cache directory set up earlier in the
# notebook; a plain string literal is used because nothing is interpolated.
cache_dir = Path("cache/")

In [17]:
from ir_axioms.axiom import (
    ArgUC, QTArg, QTPArg, aSL,
    PROX1, PROX2, PROX3, PROX4, PROX5,
    TFC1, TFC3,
    RS_TF, RS_TF_IDF, RS_BM25, RS_PL2, RS_QL,
    AND, LEN_AND, M_AND, LEN_M_AND, DIV, LEN_DIV,
    M_TDC, LEN_M_TDC,
    STMC1, STMC1_f, STMC2, STMC2_f,
    LNC1, TF_LNC, LB1,
    REG, ANTI_REG, REG_f, ANTI_REG_f, ASPECT_REG, ASPECT_REG_f,
)

# Axiom classes in the order in which they are handed to the re-ranker.
axiom_classes = (
    ArgUC, QTArg, QTPArg, aSL,
    LNC1, TF_LNC, LB1,
    PROX1, PROX2, PROX3, PROX4, PROX5,
    REG, REG_f, ANTI_REG, ANTI_REG_f, ASPECT_REG, ASPECT_REG_f,
    AND, LEN_AND, M_AND, LEN_M_AND, DIV, LEN_DIV,
    RS_TF, RS_TF_IDF, RS_BM25, RS_PL2, RS_QL,
    TFC1, TFC3, M_TDC, LEN_M_TDC,
    STMC1, STMC1_f, STMC2, STMC2_f,
)

# Every axiom is created with its default arguments and wrapped with `~`.
# NOTE(review): `~` is presumably ir_axioms' shorthand for a cached axiom;
# confirm against the library documentation.
axioms = [~axiom_class() for axiom_class in axiom_classes]

In [None]:
from sklearn.ensemble import RandomForestRegressor
from ir_axioms.backend.pyterrier.estimator import EstimatorKwikSortReranker

# A fixed seed makes the trained forest identical across notebook re-runs.
random_forest = RandomForestRegressor(n_estimators=400, random_state=42)

# Re-rank the BM25 results with a KwikSort re-ranker that uses the random
# forest as its estimator over the axioms defined above.
pipeline_random_forest = pipeline_bm25 >> EstimatorKwikSortReranker(
    axioms=axioms,
    index=index_dir,
    estimator=random_forest,
    dataset=dataset_name,
    cache_dir=cache_dir,
    verbose=True,
)

# Fit on the training split, using only the top 3 BM25 results per topic
# (`% 3` is PyTerrier's rank-cutoff operator) together with the training qrels.
pipeline_random_forest.fit(
    (pipeline_bm25 % 3).transform(dataset_train.get_topics()),
    dataset_train.get_qrels()
)

In [None]:
from pyterrier.pipelines import Experiment

# Systems under comparison and their display names, in matching order.
systems = [pipeline_bm25, pipeline_random_forest]
system_names = ["BM25 Baseline", "KwikSort Random Forest Axiom"]
measures = ["map"]

# Evaluate both systems on the test topics and qrels; the result of the
# call is the cell's displayed output.
Experiment(
    systems,
    dataset_test.get_topics(),
    dataset_test.get_qrels(),
    measures,
    names=system_names,
)