# Installation

In [None]:
import pyterrier as pt
#import ir_measures
from pyterrier.measures import *
import os

# Initialise PyTerrier only when it has not been started yet, so that
# re-running this cell does not call pt.init() a second time.
if not pt.started():
    pt.init()

In [None]:
# Base location of the corpus, index, topics and qrels used by the cells below.
# Cells prefix it directly onto file names, so keep the trailing slash.
working_dir = "./"

# Indexing

In [None]:
# Build the WT2G index on the first run; on later runs reuse the one on disk.
index_dir = os.path.join(working_dir, "wt2g_index")
index_properties = os.path.join(index_dir, "data.properties")

if os.path.isfile(index_properties):
    # data.properties is present, so the index is treated as already built.
    # Checking the file itself (rather than listing the directory) also works
    # when the index directory does not exist yet.
    index = pt.IndexFactory.of(index_properties)
else:
    # list of filenames to index (only scanned when a build is really needed)
    files = pt.io.find_files(os.path.join(working_dir, "WT2G/"))

    # build the index
    indexer = pt.TRECCollectionIndexer(index_dir, verbose=True, blocks=False)
    indexref = indexer.index(files)

    # load the freshly built index
    index = pt.IndexFactory.of(indexref)

# print the collection statistics of whichever index was loaded
print(index.getCollectionStatistics().toString())

In [None]:
# Disabled variant of the indexing cell, kept inside a string literal so that
# it is not executed. Compared with the active indexing cell, the text uses the
# "wt2g_index_withoutstemstop" directory and passes stemmer=None and
# stopwords=None to the indexer.
'''
# list of filenames to index
files = pt.io.find_files(working_dir + "WT2G/")

# build the index
indexer = pt.TRECCollectionIndexer(working_dir + "wt2g_index_withoutstemstop", verbose=True, blocks=False, stemmer=None, stopwords=None)
if "data.properties" not in os.listdir(working_dir + "wt2g_index_withoutstemstop"):
    indexref = indexer.index(files)

    # load the index, print the statistics
    index = pt.IndexFactory.of(indexref)
else:
    index = pt.IndexFactory.of(working_dir + "wt2g_index_withoutstemstop/data.properties")
    
print(index.getCollectionStatistics().toString())
'''


# Read Topics and Qrels

In [None]:
# Load the query topics and the relevance judgements (qrels) from working_dir.
topics = pt.io.read_topics(f"{working_dir}topics_401_450.txt")

# The gzip-compressed qrels file is the one read; the uncompressed alternative
# is kept here for reference:
#qrels = pt.io.read_qrels(f"{working_dir}qrels.trec8.small_web")
qrels = pt.io.read_qrels(f"{working_dir}qrels.trec8.small_web.gz")

# Retrieval Models

In [None]:
# One retriever per weighting model, all built over the same index. The
# weighting model is selected with the wmodel= keyword throughout, so every
# line reads the same way.
tfidf = pt.BatchRetrieve(index, wmodel="TF_IDF")
# NOTE(review): the "tf_idf.k_1" / "tf_idf.b" control names are passed through
# as given; confirm that the TF_IDF weighting model actually reads them,
# otherwise tfidf_new will score exactly like tfidf.
tfidf_new = pt.BatchRetrieve(index, wmodel="TF_IDF", controls={"tf_idf.k_1": 2, "tf_idf.b": 0.75})
lemurtfidf = pt.BatchRetrieve(index, wmodel="LemurTF_IDF")
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
pl2 = pt.BatchRetrieve(index, wmodel="PL2")
hiemstra = pt.BatchRetrieve(index, wmodel="Hiemstra_LM")
dirichlet = pt.BatchRetrieve(index, wmodel="DirichletLM")

In [None]:
# Two-stage pipeline in PyTerrier operator notation: bm25 cut off at 100
# results per query (%), whose output is then fed into dirichlet (>>).
rerank_1 = (bm25 % 100) >> dirichlet

# Retrieval

In [None]:
# Run every retriever and the re-ranking pipeline over the topics, keeping
# each set of results under its own name.
res_tfidf = tfidf.transform(topics)
res_tfidf_new = tfidf_new.transform(topics)
res_lemurtfidf = lemurtfidf.transform(topics)
res_bm25 = bm25.transform(topics)
res_pl2 = pl2.transform(topics)
res_hiemstra = hiemstra.transform(topics)
# Backward-compatible alias for the original, misspelled variable name.
res_hiemastra = res_hiemstra
res_dirichlet = dirichlet.transform(topics)
res_rerank_1 = rerank_1.transform(topics)

# Learning to Rank

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Feature pipeline for learning to rank: bm25 supplies the candidates and the
# (tfidf ** pl2) stage combines the two models' outputs as features.
pipeline = bm25 >> (tfidf ** pl2)
# A fixed seed makes the forest reproducible across runs, in line with the
# seeded XGBRanker (random_state=42) configured in this notebook.
rf = RandomForestRegressor(n_estimators=400, random_state=42)
rf_pipe = pipeline >> pt.ltr.apply_learned_model(rf)
# NOTE(review): the model is fitted on the same topics/qrels that it is
# evaluated on just below, so the reported MAP is a training-set score.
rf_pipe.fit(topics, qrels)
pt.Experiment(
    [bm25, rf_pipe],
    topics,
    qrels,
    ["map"],
    names=["BM25 Baseline", "LTR"],
)

In [None]:
import xgboost as xgb

# this configures XGBoost as LambdaMART
lmart_x = xgb.sklearn.XGBRanker(
    objective="rank:ndcg",
    learning_rate=0.1,
    gamma=1.0,
    min_child_weight=0.1,
    max_depth=10,
    # XGBoost's logging level is set with `verbosity`; `verbose` is not a
    # model parameter and was not taking effect.
    verbosity=2,
    random_state=42,
)

lmart_x_pipe = pipeline >> pt.ltr.apply_learned_model(lmart_x, form="ltr")
# NOTE(review): the second topics/qrels pair (presumably the validation set)
# is identical to the training pair — confirm this is intended.
lmart_x_pipe.fit(topics, qrels, topics, qrels)

# Experiment Results

In [None]:
# Compare the three TF-IDF variants. Explicit names keep tfidf and tfidf_new
# apart in the results table: both are TF_IDF retrievers that differ only in
# their controls, which the default labels are not expected to show.
pt.Experiment(
    [tfidf, tfidf_new, lemurtfidf],
    topics,
    qrels,
    eval_metrics=[R@10, P@10, P@200, MAP, MAP@10, MAP@100, NDCG@100, Rprec],
    names=["TF_IDF", "TF_IDF (k_1=2, b=0.75)", "LemurTF_IDF"],
)

In [None]:
# Evaluate BM25, PL2, Hiemstra LM and Dirichlet LM on the same topics and
# qrels, reporting the same set of measures as the other experiment cells.
pt.Experiment(
    retr_systems=[bm25, pl2, hiemstra, dirichlet],
    topics=topics,
    qrels=qrels,
    eval_metrics=[
        R@10,
        P@10,
        P@200,
        MAP,
        MAP@10,
        MAP@100,
        NDCG@100,
        Rprec,
    ],
)

In [None]:
# Compare the re-ranking pipeline with the two learned-to-rank pipelines.
# Short explicit names replace the default labels of the composed pipelines
# so that the rows of the results table are easy to tell apart.
pt.Experiment(
    [rerank_1, rf_pipe, lmart_x_pipe],
    topics,
    qrels,
    eval_metrics=[R@10, P@10, P@200, MAP, MAP@10, MAP@100, NDCG@100, Rprec],
    names=["BM25@100 >> DirichletLM", "LTR (Random Forest)", "LTR (LambdaMART)"],
)

In [None]:
# Evaluate every system in a single experiment and keep the table in `res`.
# The names follow the order of the system list; they keep the two TF_IDF
# retrievers and the composed pipelines distinguishable in the output.
res = pt.Experiment(
    [tfidf, tfidf_new, lemurtfidf, bm25, pl2, hiemstra, dirichlet, rerank_1, rf_pipe, lmart_x_pipe],
    topics,
    qrels,
    eval_metrics=[R@10, P@10, P@200, MAP, MAP@10, MAP@100, NDCG@100, Rprec],
    names=[
        "TF_IDF",
        "TF_IDF (k_1=2, b=0.75)",
        "LemurTF_IDF",
        "BM25",
        "PL2",
        "Hiemstra_LM",
        "DirichletLM",
        "BM25@100 >> DirichletLM",
        "LTR (Random Forest)",
        "LTR (LambdaMART)",
    ],
)

In [None]:
# Bare expression as the last statement of the cell: shows the combined
# experiment results stored in `res` when run in a notebook.
res