# Installation

In [8]:
import pyterrier as pt
from pyterrier.measures import *
import os

working_dir = os.path.dirname(os.getcwd())

In [7]:
if not pt.started():
    pt.init()

# Indexing

In [3]:
print(os.path.dirname(os.getcwd()))

/Users/zionn/Documents/github/Documents-Retrieval


In [4]:
WT2G_dir = os.path.join(os.path.dirname(os.getcwd()), "WT2G")
files = pt.io.find_files(WT2G_dir)
print(files)

['/Users/zionn/Documents/github/Documents-Retrieval/WT2G/Wt01/B01', '/Users/zionn/Documents/github/Documents-Retrieval/WT2G/Wt01/B02', '/Users/zionn/Documents/github/Documents-Retrieval/WT2G/Wt01/B03', '/Users/zionn/Documents/github/Documents-Retrieval/WT2G/Wt01/B04', '/Users/zionn/Documents/github/Documents-Retrieval/WT2G/Wt01/B05', '/Users/zionn/Documents/github/Documents-Retrieval/WT2G/Wt01/B06', '/Users/zionn/Documents/github/Documents-Retrieval/WT2G/Wt01/B07', '/Users/zionn/Documents/github/Documents-Retrieval/WT2G/Wt01/B08', '/Users/zionn/Documents/github/Documents-Retrieval/WT2G/Wt01/B09', '/Users/zionn/Documents/github/Documents-Retrieval/WT2G/Wt01/B10', '/Users/zionn/Documents/github/Documents-Retrieval/WT2G/Wt01/B11', '/Users/zionn/Documents/github/Documents-Retrieval/WT2G/Wt01/B12', '/Users/zionn/Documents/github/Documents-Retrieval/WT2G/Wt01/B13', '/Users/zionn/Documents/github/Documents-Retrieval/WT2G/Wt01/B14', '/Users/zionn/Documents/github/Documents-Retrieval/WT2G/Wt01/

In [6]:
# build the index
index_path = os.path.join(os.path.dirname(os.getcwd()), "wt2g_index")
indexer = pt.TRECCollectionIndexer(
    index_path,
    verbose=True,
    blocks=False,

)
index_ref = indexer.index(files)
# if "data.properties" not in os.listdir("wt2g_index"):
#     indexref = indexer.index(files)

#     # load the index, print the statistics
#     index = pt.IndexFactory.of(indexref)
# else:
#     index = pt.IndexFactory.of("wt2g_index/data.properties")

# print(index.getCollectionStatistics().toString())

1082files [06:16,  2.87files/s]                       


In [None]:
'''
# list of filenames to index
files = pt.io.find_files(working_dir + "WT2G/")

# build the index
indexer = pt.TRECCollectionIndexer(working_dir + "wt2g_index_withoutstemstop", verbose=True, blocks=False, stemmer=None, stopwords=None)
if "data.properties" not in os.listdir(working_dir + "wt2g_index_withoutstemstop"):
    indexref = indexer.index(files)

    # load the index, print the statistics
    index = pt.IndexFactory.of(indexref)
else:
    index = pt.IndexFactory.of(working_dir + "wt2g_index_withoutstemstop/data.properties")
    
print(index.getCollectionStatistics().toString())
'''


# Read Topics and Qrels

In [12]:
topics = pt.io.read_topics(working_dir + "/topics.401-450.txt")
qrels = pt.io.read_qrels(working_dir + "/qrels.trec8.small_web")
# qrels = pt.io.read_qrels(working_dir + "qrels.trec8.small_web.gz")

In [13]:
print(topics)

    qid                              query
0   401         foreign minorities germany
1   402                behavioral genetics
2   403                       osteoporosis
3   404                ireland peace talks
4   405                      cosmic events
5   406                parkinson s disease
6   407        poaching wildlife preserves
7   408                    tropical storms
8   409                   legal pan am 103
9   410                 schengen agreement
10  411       salvaging shipwreck treasure
11  412                   airport security
12  413                   steel production
13  414                 cuba sugar exports
14  415              drugs golden triangle
15  416               three gorges project
16  417                         creativity
17  418                      quilts income
18  419           recycle automobile tires
19  420          carbon monoxide poisoning
20  421          industrial waste disposal
21  422                  art stolen forged
22  423    

In [14]:
print(qrels)

       qid         docno  label
0      401  WT01-B04-284      0
1      401    WT01-B05-1      0
2      401   WT01-B05-19      0
3      401    WT01-B05-2      0
4      401    WT01-B05-3      0
...    ...           ...    ...
47501  450   WT27-B03-32      0
47502  450   WT27-B04-97      0
47503  450  WT27-B10-341      0
47504  450   WT27-B13-76      0
47505  450  WT27-B24-355      0

[47506 rows x 3 columns]


# Retrieval Models

In [16]:
tfidf = pt.BatchRetrieve(index_path, wmodel="TF_IDF")
tfidf_new = pt.BatchRetrieve(index_path, wmodel="TF_IDF", controls={"tf_idf.k_1":2, "tf_idf.b":0.75})
lemurtfidf = pt.BatchRetrieve(index_path, wmodel="LemurTF_IDF")
bm25 = pt.BatchRetrieve(index_path, controls={"wmodel": "BM25"})
pl2 = pt.BatchRetrieve(index_path, controls={"wmodel": "PL2"})
hiemstra = pt.BatchRetrieve(index_path, controls={"wmodel": "Hiemstra_LM"})
dirichlet = pt.BatchRetrieve(index_path, controls={"wmodel": "DirichletLM"})

In [17]:
rerank_1 = (bm25 % 100) >> dirichlet

# Retrieval

In [18]:
res_tfidf = tfidf.transform(topics)
res_tfidf_new = tfidf_new.transform(topics)
res_lemurtfidf = lemurtfidf.transform(topics)
res_bm25 = bm25.transform(topics)
res_pl2 = pl2.transform(topics)
res_hiemastra = hiemstra.transform(topics)
res_dirichlet = dirichlet.transform(topics)
res_rerank_1 = rerank_1.transform(topics)

# Learning to Rank

In [19]:
from sklearn.ensemble import RandomForestRegressor

pipeline = bm25 >> (tfidf ** pl2)
rf = RandomForestRegressor(n_estimators=400)
rf_pipe = pipeline >> pt.ltr.apply_learned_model(rf)
rf_pipe.fit(topics, qrels)
pt.Experiment(
    [bm25, rf_pipe], 
    topics, 
    qrels, 
    ["map"], 
    names=["BM25 Baseline", "LTR"]
    )

Unnamed: 0,name,map
0,BM25 Baseline,0.258559
1,LTR,0.751191


In [23]:
import xgboost as xgb
# this configures XGBoost as LambdaMART
lmart_x = xgb.sklearn.XGBRanker(
      objective='rank:ndcg',
      learning_rate=0.1,
      gamma=1.0,
      min_child_weight=0.1,
      max_depth=10,
      verbose=2,
      random_state=42
)

lmart_x_pipe = pipeline >> pt.ltr.apply_learned_model(lmart_x, form="ltr")
lmart_x_pipe.fit(topics, qrels, topics, qrels)

Parameters: { "verbose" } are not used.



# Experiment Result

In [None]:
pt.Experiment(
    [tfidf, tfidf_new, lemurtfidf],
    topics,
    qrels,
    eval_metrics=[R@10, P@10, P@200, MAP, MAP@10, MAP@100, NDCG@100, Rprec]
    )

In [None]:
pt.Experiment(
    [bm25, pl2, hiemstra, dirichlet],
    topics,
    qrels,
    eval_metrics=[R@10, P@10, P@200, MAP, MAP@10, MAP@100, NDCG@100, Rprec]
    )

In [21]:
pt.Experiment(
    [rerank_1, rf_pipe, lmart_x_pipe],
    topics,
    qrels,
    eval_metrics=[R@10, P@10, P@200, MAP, MAP@10, MAP@100, NDCG@100, Rprec]
    )

NameError: name 'lmart_x_pipe' is not defined

In [24]:
res = pt.Experiment(
    [tfidf, tfidf_new, lemurtfidf, bm25, pl2, hiemstra, dirichlet, rerank_1, rf_pipe, lmart_x_pipe],
    topics,
    qrels,
    eval_metrics=[R@10, P@10, P@200, MAP, MAP@10, MAP@100, NDCG@100, Rprec]
    )

In [52]:
res

Unnamed: 0,name,R@10,P@10,P@200,AP,AP@10,AP@100,nDCG@100,Rprec
0,BR(TF_IDF),0.151732,0.432,0.1205,0.259946,0.109068,0.22295,0.426088,0.314013
1,BR(TF_IDF),0.135986,0.41,0.1135,0.24177,0.098658,0.207276,0.408773,0.293293
2,BR(LemurTF_IDF),0.128323,0.388,0.1159,0.240003,0.08967,0.200419,0.39498,0.282482
3,BR(BM25),0.151732,0.432,0.1209,0.258559,0.108608,0.221041,0.422593,0.313925
4,BR(PL2),0.130725,0.402,0.1097,0.224116,0.093114,0.190089,0.385312,0.276776
5,BR(Hiemstra_LM),0.103248,0.302,0.1018,0.194431,0.073939,0.159929,0.33336,0.221769
6,BR(DirichletLM),0.151239,0.45,0.1341,0.303469,0.119195,0.259979,0.469072,0.310013
7,"Compose(RankCutoff(BR(BM25), 100), BR(Dirichle...",0.153752,0.46,0.0908,0.246076,0.120448,0.246076,0.444932,0.308065
8,"Compose(Compose(BR(BM25), FUnion(BR(TF_IDF), B...",0.354433,0.96,0.1746,0.751191,0.352715,0.746244,0.831291,0.751414
9,"Compose(Compose(BR(BM25), FUnion(BR(TF_IDF), B...",0.151528,0.448,0.1183,0.251718,0.103719,0.216402,0.424676,0.299643


In [51]:
def highlight_max(s):
    """
    Takes a Series s and returns a Series with
    the css property 'background-color: yellow'
    for the maximum value in each row.
    """
    is_max = s == s.max()
    is_min = s == s.min()
    return ['background-color: yellow' if v else "" for v in is_max]

res[[col for col in res if col != 'name']].style.apply(highlight_max)

Unnamed: 0,R@10,P@10,P@200,AP,AP@10,AP@100,nDCG@100,Rprec
0,0.151732,0.432,0.1205,0.259946,0.109068,0.22295,0.426088,0.314013
1,0.135986,0.41,0.1135,0.24177,0.098658,0.207276,0.408773,0.293293
2,0.128323,0.388,0.1159,0.240003,0.08967,0.200419,0.39498,0.282482
3,0.151732,0.432,0.1209,0.258559,0.108608,0.221041,0.422593,0.313925
4,0.130725,0.402,0.1097,0.224116,0.093114,0.190089,0.385312,0.276776
5,0.103248,0.302,0.1018,0.194431,0.073939,0.159929,0.33336,0.221769
6,0.151239,0.45,0.1341,0.303469,0.119195,0.259979,0.469072,0.310013
7,0.153752,0.46,0.0908,0.246076,0.120448,0.246076,0.444932,0.308065
8,0.354433,0.96,0.1746,0.751191,0.352715,0.746244,0.831291,0.751414
9,0.151528,0.448,0.1183,0.251718,0.103719,0.216402,0.424676,0.299643
