In [None]:
import pandas as pd
import nltk

nltk.download("punkt")
nltk.download('stopwords')

In [1]:
import pyterrier as pt

if not pt.started():
    pt.init(tqdm="notebook")

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


### Data

In [244]:
from pathlib import Path
# Using the split 100 training data to generate the training data for LTR
training_dataset = pt.get_dataset('irds:antique/train/split200-train')
validation_dataset = pt.get_dataset('irds:antique/train/split200-valid')

### BM25

In [147]:
result_limit = 500


# Setting up index
idx_path = Path("index").absolute()
if not (idx_path / "data.properties").is_file():
    pt.index.IterDictIndexer(
        str(idx_path),
        meta={
            "docno": 32,
            "text": 131072,
        },
    ).index(training_dataset.get_corpus_iter())


# Setting up models, bm25, tf-idf, and tf
bm25_pipeline = pt.BatchRetrieve(
    str(idx_path),
    wmodel="BM25",
    # metadata=["docno", "text"], # if we also want to return the query text
    # properties={"termpipelines": ""}, # do we want automatic stopword removal
    controls={"qe": "off", "bm25.b": 0.50}, # whether to run the Divergence from Randomness query expansion
    num_results=result_limit
)

In [150]:
import pyterrier as pt
from pyterrier.measures import *

# map: 0.153428 , ndcg = 0.244974 "bm25.b": 0.50

eval_metrics = ["map", nDCG@10]

pt.Experiment(
    [bm25_pipeline],
    validation_dataset.get_topics(),
    validation_dataset.get_qrels(),
    eval_metrics
)

Unnamed: 0,name,map,nDCG@10
0,BR(BM25),0.153428,0.244974


### TF-IDF

In [18]:
tf_idf_pipeline = pt.BatchRetrieve(
    str(idx_path),
    wmodel='TF_IDF',
    # metadata=["docno", "text"], # if we also want to return the query text
    # properties={"termpipelines": ""}, # do we want automatic stopword removal
    controls={"qe": "off"}, # whether to run the Divergence from Randomness query expansion
    num_results=result_limit
)

pt.Experiment(
    [tf_idf_pipeline],
    validation_dataset.get_topics(),
    validation_dataset.get_qrels(),
    eval_metrics
)

Unnamed: 0,name,map,nDCG@10
0,BR(TF_IDF),0.146731,0.228281


### TF

In [19]:

tf_pipeline = pt.BatchRetrieve(
    str(idx_path),
    wmodel='Tf',
    # metadata=["docno", "text"], # if we also want to return the query text
    # properties={"termpipelines": ""}, # do we want automatic stopword removal
    controls={"qe": "off"}, # whether to run the Divergence from Randomness query expansion
    num_results=result_limit
)

pt.Experiment(
    [tf_pipeline],
    validation_dataset.get_topics(),
    validation_dataset.get_qrels(),
    eval_metrics
)

Unnamed: 0,name,map,nDCG@10
0,BR(Tf),0.018991,0.03303


### Combine first 3 models

In [71]:
CUTOFF = 100
# 0.151085	0.244974
pipe = ((tf_pipeline % CUTOFF) | (tf_idf_pipeline % CUTOFF)) >> bm25_pipeline

pt.Experiment(
    [pipe],
    validation_dataset.get_topics(),
    validation_dataset.get_qrels(),
    eval_metrics
)

Unnamed: 0,name,map,nDCG@10
0,"Compose(Union(RankCutoff(BR(Tf), 100), RankCut...",0.151085,0.244974


### LTR

#### Prepare training data

In [61]:
# For each of the training queries, run it on bm25 pipeline and get the scores
bm25_train_results = bm25_pipeline(training_dataset.get_topics())
tf_idf_pipeline_train_results = tf_idf_pipeline(training_dataset.get_topics())
tf_pipeline_train_results = tf_pipeline(training_dataset.get_topics())

# Save the results to csv
bm25_train_results.to_csv("bm25_train_results_500.csv", index=False)
tf_idf_pipeline_train_results.to_csv("tf_idf_pipeline_train_results_500.csv", index=False)
tf_pipeline_train_results.to_csv("tf_pipeline_train_results_500.csv", index=False)

In [270]:
# Load the results from csv
folder = "data"

bm25_train_results = pd.read_csv(f"{folder}/bm25_train_results_500.csv")
tf_idf_pipeline_train_results = pd.read_csv(f"{folder}/tf_idf_pipeline_train_results_500.csv")
tf_pipeline_train_results = pd.read_csv(f"{folder}/tf_pipeline_train_results_500.csv")
w2w_0_2_train_results = pd.read_csv(f"{folder}/w2v_02_train_500.csv")
w2v_0_5_train_results = pd.read_csv(f"{folder}/w2v_05_train_500.csv")

print(bm25_train_results.shape)
print(tf_idf_pipeline_train_results.shape)
print(tf_pipeline_train_results.shape)
print(w2w_0_2_train_results.shape)
print(w2v_0_5_train_results.shape)
bm25_train_results.head()

(1066505, 6)
(1066505, 6)
(1066505, 6)
(40000, 6)
(40000, 6)


Unnamed: 0,qid,docid,docno,rank,score,query
0,3097310,321078,2606613_4,0,30.36635,what causes severe swelling and pain in the knees
1,3097310,275288,3241109_2,1,29.046531,what causes severe swelling and pain in the knees
2,3097310,291706,2105586_3,2,27.579889,what causes severe swelling and pain in the knees
3,3097310,42736,4204097_0,3,26.636697,what causes severe swelling and pain in the knees
4,3097310,160465,2818197_2,4,26.362626,what causes severe swelling and pain in the knees


In [155]:
tf_idf_pipeline_train_results.head()

Unnamed: 0,qid,docid,docno,rank,score,query
0,3097310,275288,3241109_2,0,17.295203,what causes severe swelling and pain in the knees
1,3097310,321078,2606613_4,1,15.393654,what causes severe swelling and pain in the knees
2,3097310,160465,2818197_2,2,14.827285,what causes severe swelling and pain in the knees
3,3097310,274745,3536038_5,3,14.711232,what causes severe swelling and pain in the knees
4,3097310,9768,773247_8,4,14.538702,what causes severe swelling and pain in the knees


In [156]:
tf_pipeline_train_results.head()

Unnamed: 0,qid,docid,docno,rank,score,query
0,3097310,287022,2933555_0,0,121.0,what causes severe swelling and pain in the knees
1,3097310,303344,4359345_0,1,24.0,what causes severe swelling and pain in the knees
2,3097310,227342,3926123_2,2,21.0,what causes severe swelling and pain in the knees
3,3097310,321144,2367043_3,3,20.0,what causes severe swelling and pain in the knees
4,3097310,142302,3900539_5,4,18.0,what causes severe swelling and pain in the knees


In [271]:
w2w_0_2_train_results.head()

Unnamed: 0,docno,text,rank,score,query,qid
0,1294030_3,The angina is the chest pain and it's the cons...,0,0.899816,what causes severe swelling and pain in the knees,3097310
1,773247_8,"It is bad for your body,and can cause severe s...",1,0.899293,what causes severe swelling and pain in the knees,3097310
2,513354_2,Actually it is prednisone. It will reduce her ...,2,0.895758,what causes severe swelling and pain in the knees,3097310
3,2606613_0,In a mild wrist sprain maybe slightly swollen ...,3,0.892028,what causes severe swelling and pain in the knees,3097310
4,2592562_4,It usually starts with flu-like symptoms like ...,4,0.890471,what causes severe swelling and pain in the knees,3097310


In [282]:
# labels of the training data
labels = training_dataset.get_qrels()
labels["qid"] = labels["qid"].astype(int)

# create a dataframe with the qid, docno and the scores from the three models joinin on the docno
train_data = pd.DataFrame()
train_data["qid"] = w2w_0_2_train_results["qid"]
train_data["docno"] = w2w_0_2_train_results["docno"]
train_data["query"] = w2w_0_2_train_results["query"]
train_data["score"] = w2w_0_2_train_results["score"]

# join the w2v_0_5_train_results
train_data = train_data.merge(w2v_0_5_train_results[["qid", "docno", "score"]], on=["qid", "docno"], how = "left", suffixes=("", "_w2v_05"))

# join the bm25_train_results
train_data = train_data.merge(bm25_train_results[["qid", "docid", "docno", "score"]], on=["qid", "docno"], how = "left", suffixes=("", "_bm25"))

# join the tf_idf_pipeline_train_results
train_data = train_data.merge(tf_idf_pipeline_train_results[["qid", "docid", "docno", "score"]], on=["qid", "docno"], how = "left", suffixes=("", "_tf_idf"))

# join the tf_pipeline_train_results
train_data = train_data.merge(tf_pipeline_train_results[["qid", "docid", "docno", "score"]], on=["qid", "docno"], how = "left", suffixes=("", "_tf"))

train_data.rename(columns={"score": "w2v_02_score", 
                           "score_w2v_05": "w2v_05_score",
                           "score_bm25": "bm25_score", 
                           "score_tf_idf": "tf_idf_score", 
                           "score_tf": "tf_score"}, 
                  inplace=True)

# replace the nan values with 0
train_data.fillna(0, inplace=True)

print(train_data.shape)
train_data.head()

(40000, 11)


Unnamed: 0,qid,docno,query,w2v_02_score,w2v_05_score,docid,bm25_score,docid_tf_idf,tf_idf_score,docid_tf,tf_score
0,3097310,1294030_3,what causes severe swelling and pain in the knees,0.899816,0.840419,0.0,0.0,0.0,0.0,0.0,0.0
1,3097310,773247_8,what causes severe swelling and pain in the knees,0.899293,0.857281,9768.0,24.033896,9768.0,14.538702,0.0,0.0
2,3097310,513354_2,what causes severe swelling and pain in the knees,0.895758,0.82692,45306.0,19.381763,45306.0,11.672008,0.0,0.0
3,3097310,2606613_0,what causes severe swelling and pain in the knees,0.892028,0.859853,321074.0,25.732907,321074.0,13.625401,321074.0,4.0
4,3097310,2592562_4,what causes severe swelling and pain in the knees,0.890471,0.841391,339046.0,16.334831,339046.0,8.543015,0.0,0.0


#### Train model for model weights

In [341]:
# train a linear model to give weights to the three models
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# join the labels
train_data_reg = train_data.merge(labels[["qid", "docno", "label" ]], on=["qid", "docno"])

X = train_data_reg[["w2v_02_score", "w2v_05_score", "bm25_score", "tf_idf_score", "tf_score",]]
y = train_data_reg["label"]

# scale the data
X_scaled = StandardScaler().fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# train the model
reg = LinearRegression().fit(X_train, Y_train)

# evaluate the model
print("training score", reg.score(X_train, Y_train))

reg.coef_

training score 0.11975801835439348


array([-0.04003533,  0.0331083 ,  0.75417444, -0.8290296 ,  0.02909848])

In [337]:
# try random forest regressor

from sklearn.ensemble import RandomForestRegressor

reg = RandomForestRegressor().fit(X_train, Y_train)

# evaluate the model
print("training score", reg.score(X_train, Y_train))

reg.feature_importances_

training score 0.8607719992070572


array([0.28866796, 0.14078773, 0.18649194, 0.27502595, 0.10902642])

## LTR re-ranking

In [245]:
bm25_test_results = bm25_pipeline(validation_dataset.get_topics())
tf_idf_test_results = tf_idf_pipeline(validation_dataset.get_topics())
tf_test_results = tf_pipeline(validation_dataset.get_topics())

# Save the results to csv
bm25_test_results.to_csv("bm25_test_results.csv", index=False)
tf_idf_test_results.to_csv("tf_idf_pipeline_test_results.csv", index=False)
tf_test_results.to_csv("tf_pipeline_test_results.csv", index=False)

In [342]:
folder = "data"

bm25_test_results = pd.read_csv(f"{folder}/bm25_test_results.csv")
tf_idf_test_results = pd.read_csv(f"{folder}/tf_idf_pipeline_test_results.csv")
tf_test_results = pd.read_csv(f"{folder}/tf_pipeline_test_results.csv")
w2v_02_test_results = pd.read_csv(f"{folder}/word2vec_02_test_results.csv")
w2v_05_test_results = pd.read_csv(f"{folder}/word2vec_05_test_results.csv")

print(bm25_test_results.shape)
print(tf_idf_test_results.shape)
print(tf_test_results.shape)
print(w2v_02_test_results.shape)
print(w2v_05_test_results.shape)

(95829, 6)
(95829, 6)
(95829, 6)
(100000, 6)
(100000, 6)


In [343]:
test_data = pd.DataFrame()
test_data["qid"] = bm25_test_results["qid"]
test_data["docid"] = bm25_test_results["docid"]
test_data["docno"] = bm25_test_results["docno"]
test_data["score"] = bm25_test_results["score"]
test_data["query"] = bm25_test_results["query"]

# join the w2v_0_2_test_results
test_data = test_data.merge(w2v_02_test_results[["qid", "docno", "score"]], on=["qid", "docno"], how = "left", suffixes=("", "_w2v_02"))

# join the w2v_0_5_test_results
test_data = test_data.merge(w2v_05_test_results[["qid", "docno", "score"]], on=["qid", "docno"], how = "left", suffixes=("", "_w2v_05"))

# join the tf_idf_pipeline_test_results
test_data = test_data.merge(tf_idf_test_results[["qid", "docid", "score"]], on=["qid", "docid"], how = "left", suffixes=("", "_tf_idf"))

# join the tf_pipeline_test_results - if no match score is 0
test_data = test_data.merge(tf_test_results[["qid", "docid", "score"]], on=["qid", "docid"], how="left", suffixes=("", "_tf"))

test_data.rename(columns={"score": "bm25_score", 
                          "score_w2v_02": "w2v_02_score",
                            "score_w2v_05": "w2v_05_score",
                          "score_tf_idf": "tf_idf_score", 
                          "score_tf": "tf_score"}, 
                 inplace=True)

# normalize bm25, tf_idf, and tf scores
test_data["bm25_score"] = (test_data["bm25_score"] - test_data["bm25_score"].min()) / (test_data["bm25_score"].max() - test_data["bm25_score"].min())
test_data["tf_idf_score"] = (test_data["tf_idf_score"] - test_data["tf_idf_score"].min()) / (test_data["tf_idf_score"].max() - test_data["tf_idf_score"].min())
test_data["tf_score"] = (test_data["tf_score"] - test_data["tf_score"].min()) / (test_data["tf_score"].max() - test_data["tf_score"].min())


# fill na with 0
test_data.fillna(0, inplace=True)

print(test_data.shape)
test_data.head()

(95829, 9)


Unnamed: 0,qid,docid,docno,bm25_score,query,w2v_02_score,w2v_05_score,tf_idf_score,tf_score
0,1907320,51089,2585402_1,0.271957,how do i get college money,0.0,0.0,0.296527,0.035398
1,1907320,364804,1167603_8,0.239651,how do i get college money,0.0,0.0,0.284784,0.0
2,1907320,19701,485361_1,0.236026,how do i get college money,0.0,0.0,0.255581,0.026549
3,1907320,276267,2275022_4,0.23433,how do i get college money,0.0,0.0,0.228452,0.106195
4,1907320,366040,3779911_1,0.230711,how do i get college money,0.0,0.0,0.25977,0.0


In [344]:
# multiply the scores by the weights

weights = abs(reg.coef_)

ltr_data = test_data.copy()

ltr_data["w2v_02_score"] = ltr_data["w2v_02_score"] * weights[0]
ltr_data["w2v_05_score"] = ltr_data["w2v_05_score"] * weights[1]
ltr_data["bm25_score"] = ltr_data["bm25_score"] * weights[2]
ltr_data["tf_idf_score"] = ltr_data["tf_idf_score"] * weights[3] 
ltr_data["tf_score"] = ltr_data["tf_score"] * weights[4]


ltr_data["score"] = ltr_data["w2v_02_score"] + ltr_data["w2v_05_score"] + ltr_data["bm25_score"] + ltr_data["tf_idf_score"] + ltr_data["tf_score"] 

print(ltr_data.shape)
ltr_data.head()

(95829, 10)


Unnamed: 0,qid,docid,docno,bm25_score,query,w2v_02_score,w2v_05_score,tf_idf_score,tf_score,score
0,1907320,51089,2585402_1,0.205103,how do i get college money,0.0,0.0,0.24583,0.00103,0.451963
1,1907320,364804,1167603_8,0.180739,how do i get college money,0.0,0.0,0.236094,0.0,0.416832
2,1907320,19701,485361_1,0.178004,how do i get college money,0.0,0.0,0.211884,0.000773,0.390661
3,1907320,276267,2275022_4,0.176726,how do i get college money,0.0,0.0,0.189393,0.00309,0.369209
4,1907320,366040,3779911_1,0.173996,how do i get college money,0.0,0.0,0.215357,0.0,0.389354


In [345]:
evaluate_ltr = ltr_data[['qid', 'docid', 'docno', 'score', 'query']].copy()

# for each qid, sort the docno by the score
evaluate_ltr = evaluate_ltr.sort_values(by=["qid", "score"], ascending=[False, False])

# rank each docno
evaluate_ltr["rank"] = evaluate_ltr.groupby("qid").cumcount()

# reset the index
evaluate_ltr.reset_index(drop=True, inplace=True)

# reorder cols : qid	docid	docno	rank	score	query
evaluate_ltr = evaluate_ltr[['qid', 'docid', 'docno', 'rank', 'score', 'query']]

# convert qid to str
evaluate_ltr["qid"] = evaluate_ltr["qid"].astype(str)

evaluate_ltr

Unnamed: 0,qid,docid,docno,rank,score,query
0,4450252,241652,904349_3,0,0.492518,what are the causes of college dropouts
1,4450252,223120,2334325_2,1,0.381359,what are the causes of college dropouts
2,4450252,223121,2334325_3,2,0.334761,what are the causes of college dropouts
3,4450252,294377,4469711_14,3,0.328038,what are the causes of college dropouts
4,4450252,257069,4450252_4,4,0.304891,what are the causes of college dropouts
...,...,...,...,...,...,...
95824,2704,240194,4314878_2,495,0.093573,why does it seem that no matter how hard you try
95825,2704,320423,68367_6,496,0.093573,why does it seem that no matter how hard you try
95826,2704,396515,2006197_5,497,0.093573,why does it seem that no matter how hard you try
95827,2704,319510,3907796_6,498,0.093363,why does it seem that no matter how hard you try


#### Evaluate

In [346]:
validation_dataset = pt.get_dataset('irds:antique/train/split200-valid')

eval_metrics = ["map", nDCG@10]

pt.Experiment(
    retr_systems=[evaluate_ltr],
    topics=validation_dataset.get_topics(),
    qrels=validation_dataset.get_qrels(),
    eval_metrics=eval_metrics
)

Unnamed: 0,name,map,nDCG@10
0,qid docid docno rank s...,0.150841,0.236082


In [330]:
pt.Experiment(
    retr_systems=[bm25_pipeline],
    topics=validation_dataset.get_topics(),
    qrels=validation_dataset.get_qrels(),
    eval_metrics=eval_metrics
)

Unnamed: 0,name,map,nDCG@10
0,BR(BM25),0.153428,0.244974
