In [2]:
%pip install --user rank_bm25 xgboost scikit-learn nltk





In [2]:
import sys
print(sys.executable)


C:\ProgramData\anaconda3\python.exe


In [3]:
!"{sys.executable}" -m pip install rank_bm25 xgboost scikit-learn nltk


Defaulting to user installation because normal site-packages is not writeable


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ndcg_score


In [4]:
data = [
    
    ("eco friendly travel", 
     "eco friendly travel focuses on reducing carbon footprint while exploring new places", 3),

    ("eco friendly travel", 
     "sustainable tourism practices for responsible travelers", 2),

    ("eco friendly travel", 
     "best travel destinations around the world", 1),

    ("eco friendly travel", 
     "latest smartphone models released this year", 0),

    ("local food experiences", 
     "exploring local street food and traditional cuisines while traveling", 3),

    ("local food experiences", 
     "how food reflects culture and heritage in different regions", 2),

    ("local food experiences", 
     "easy homemade recipes for beginners", 1),

    ("local food experiences", 
     "online food delivery app discounts", 0),

    ("slow living lifestyle", 
     "slow living promotes mindfulness balance and intentional daily habits", 3),

    ("slow living lifestyle", 
     "reducing stress by simplifying daily routines", 2),

    ("slow living lifestyle", 
     "morning routines of successful entrepreneurs", 1),

    ("slow living lifestyle", 
     "fast fashion trends in urban markets", 0)
]

df = pd.DataFrame(data, columns=["query", "document", "relevance"])
df


Unnamed: 0,query,document,relevance
0,eco friendly travel,eco friendly travel focuses on reducing carbon...,3
1,eco friendly travel,sustainable tourism practices for responsible ...,2
2,eco friendly travel,best travel destinations around the world,1
3,eco friendly travel,latest smartphone models released this year,0
4,local food experiences,exploring local street food and traditional cu...,3
5,local food experiences,how food reflects culture and heritage in diff...,2
6,local food experiences,easy homemade recipes for beginners,1
7,local food experiences,online food delivery app discounts,0
8,slow living lifestyle,slow living promotes mindfulness balance and i...,3
9,slow living lifestyle,reducing stress by simplifying daily routines,2


In [5]:
tfidf = TfidfVectorizer()

query_vec = tfidf.fit_transform(df["query"])
doc_vec = tfidf.transform(df["document"])

tfidf_score = (query_vec.multiply(doc_vec)).sum(axis=1)
df["tfidf_score"] = np.array(tfidf_score).flatten()


In [6]:
df["query_length"] = df["query"].apply(lambda x: len(x.split()))
df["doc_length"] = df["document"].apply(lambda x: len(x.split()))


In [9]:
X_pairwise = []
y_pairwise = []

feature_cols = ["tfidf_score", "query_length", "doc_length"]

for q in df["query"].unique():
    subset = df[df["query"] == q].reset_index(drop=True)

    for i in range(len(subset)):
        for j in range(len(subset)):
            if subset.loc[i, "relevance"] == subset.loc[j, "relevance"]:
                continue  # skip equal relevance

            diff = subset.loc[i, feature_cols].values - subset.loc[j, feature_cols].values

            if subset.loc[i, "relevance"] > subset.loc[j, "relevance"]:
                X_pairwise.append(diff)
                y_pairwise.append(1)
            else:
                X_pairwise.append(diff)
                y_pairwise.append(0)

X_pairwise = np.array(X_pairwise)
y_pairwise = np.array(y_pairwise)

np.unique(y_pairwise, return_counts=True)


(array([0, 1]), array([18, 18]))

In [10]:
rank_model = LogisticRegression()
rank_model.fit(X_pairwise, y_pairwise)


In [12]:
features = df[["tfidf_score", "query_length", "doc_length"]].values
df["predicted_score"] = rank_model.decision_function(features)


In [18]:
def evaluate_ndcg(df, k=3):
    ndcgs = []

    for q in df["query"].unique():
        subset = df[df["query"] == q]

        y_true = subset["relevance"].to_numpy().reshape(1, -1)
        y_score = subset["predicted_score"].to_numpy().reshape(1, -1)

        ndcgs.append(ndcg_score(y_true, y_score, k=k))

    return float(np.mean(ndcgs))


In [14]:
def precision_at_k(df, k=3):
    precisions = []
    for q in df["query"].unique():
        subset = df[df["query"] == q].sort_values("predicted_score", ascending=False)
        precisions.append(np.mean(subset["relevance"].head(k) > 0))
    return np.mean(precisions)

precision_at_k(df)


np.float64(0.7777777777777777)

In [15]:
df.sort_values(["query", "predicted_score"], ascending=[True, False])


Unnamed: 0,query,document,relevance,tfidf_score,query_length,doc_length,predicted_score
0,eco friendly travel,eco friendly travel focuses on reducing carbon...,3,1.0,3,12,11.136914
1,eco friendly travel,sustainable tourism practices for responsible ...,2,0.0,3,6,5.571994
3,eco friendly travel,latest smartphone models released this year,0,0.0,3,6,5.571994
2,eco friendly travel,best travel destinations around the world,1,0.57735,3,6,5.56791
5,local food experiences,how food reflects culture and heritage in diff...,2,0.57735,3,9,8.353907
4,local food experiences,exploring local street food and traditional cu...,3,0.816497,3,9,8.352215
6,local food experiences,easy homemade recipes for beginners,1,0.0,3,5,4.643328
7,local food experiences,online food delivery app discounts,0,0.57735,3,5,4.639244
8,slow living lifestyle,slow living promotes mindfulness balance and i...,3,0.816497,3,9,8.352215
9,slow living lifestyle,reducing stress by simplifying daily routines,2,0.0,3,6,5.571994


In [19]:
rank_model.fit(X_pairwise[:, [0]], y_pairwise)

# Predict using ONLY TF-IDF feature
df["score_tfidf_only"] = rank_model.decision_function(
    df[["tfidf_score"]].values
)

df_eval = df.copy()
df_eval["predicted_score"] = df_eval["score_tfidf_only"]

evaluate_ndcg(df_eval)


0.8675034925694373

In [20]:

df_full = df.copy()
df_full["predicted_score"] = rank_model.decision_function(
    df[["tfidf_score"]].values
)

ndcg_tfidf = evaluate_ndcg(df_eval)
ndcg_full = evaluate_ndcg(df_full)

pd.DataFrame({
    "Model": ["TF-IDF Only Ranker", "Full Feature Ranker"],
    "NDCG@3": [ndcg_tfidf, ndcg_full]
})


Unnamed: 0,Model,NDCG@3
0,TF-IDF Only Ranker,0.867503
1,Full Feature Ranker,0.867503


In [21]:
df.head()

Unnamed: 0,query,document,relevance,tfidf_score,query_length,doc_length,predicted_score,score_tfidf_only
0,eco friendly travel,eco friendly travel focuses on reducing carbon...,3,1.0,3,12,11.136914,1.644866
1,eco friendly travel,sustainable tourism practices for responsible ...,2,0.0,3,6,5.571994,2.166206e-16
2,eco friendly travel,best travel destinations around the world,1,0.57735,3,6,5.56791,0.9496639
3,eco friendly travel,latest smartphone models released this year,0,0.0,3,6,5.571994,2.166206e-16
4,local food experiences,exploring local street food and traditional cu...,3,0.816497,3,9,8.352215,1.343028


In [22]:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words="english")

query_vec = tfidf.fit_transform(df["query"])
doc_vec = tfidf.transform(df["document"])

df["tfidf_score"] = (query_vec.multiply(doc_vec)).sum(axis=1).A1

df["query_length"] = df["query"].apply(lambda x: len(x.split()))
df["doc_length"] = df["document"].apply(lambda x: len(x.split()))

df[["query", "document", "tfidf_score", "query_length", "doc_length"]].head()


Unnamed: 0,query,document,tfidf_score,query_length,doc_length
0,eco friendly travel,eco friendly travel focuses on reducing carbon...,1.0,3,12
1,eco friendly travel,sustainable tourism practices for responsible ...,0.0,3,6
2,eco friendly travel,best travel destinations around the world,0.57735,3,6
3,eco friendly travel,latest smartphone models released this year,0.0,3,6
4,local food experiences,exploring local street food and traditional cu...,0.816497,3,9


In [23]:
np.unique(y_pairwise, return_counts=True)


(array([0, 1]), array([18, 18]))

In [24]:
rank_model = LogisticRegression(max_iter=1000)
rank_model.fit(X_pairwise, y_pairwise)


In [25]:
evaluate_ndcg(df)


0.8858338322718243