In [1]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
from scipy import stats
from model.model import build_pairwise_model, build_pointwise_model
from model.data import read_dataset
from model.metrics import mean_ndcg_score, ndcg_scores
from model.metrics import *


def rank(df):
    df = df.sort_values(["listing_id", "score"], ascending=False)
    return df


In [3]:
# Read the dataset and check the if there are any difference between train and validation set
train, validation = read_dataset("../data/dataset_v2.csv")
print(mean_ndcg_score(train))
print(mean_ndcg_score(validation))

0.4135230767638214
0.4107749368671361


Now we have baseline scores for the problem, let's test the pointwise ranking and lambdarank solutions

### Pointwise approach

In [4]:
clf = build_pointwise_model().fit(train, train["clicked"])
train["score"] = clf.predict_proba(train)[:, 1]
validation["score"] = clf.predict_proba(validation)[:, 1]


score = mean_ndcg_score(rank(train))
print(f"Train mean NDCG score {score}")

score = mean_ndcg_score(rank(validation))
print(f"validation mean NDCG score {score}")

The total shape (4152229, 49)
The total shape (4152229, 49)
The total shape (732575, 49)
Train mean NDCG score 0.4345855038086928
validation mean NDCG score 0.40518779880992895


As we can see, the model overfits somehow, and mean nDCG value seems to be smaller if compared to the validation set

### Lambdarank approach


In [5]:
groups = train.groupby("listing_id")["n0"].count().values.reshape(-1)
clf = build_pairwise_model()
clf.fit(train, train["clicked"], lgbmranker__group=groups)

train["score"] = clf.predict(train)
validation["score"] = clf.predict(validation)

score = mean_ndcg_score(rank(train))
print(f"Train mean NDCG score {score}")

score = mean_ndcg_score(rank(validation))
print(f"validation mean NDCG score {score}")

The total shape (4152229, 49)
The total shape (4152229, 49)
The total shape (732575, 49)
Train mean NDCG score 0.43506777442703415
validation mean NDCG score 0.42202427610857207


Now let's perform a two-sided t-test for $H_0$ hypothesis of identical mean nDCG values.

In [6]:
model_scores = ndcg_scores(rank(validation))
original_scores = ndcg_scores(validation)

stats.ttest_ind(original_scores, model_scores, equal_var=False)

Ttest_indResult(statistic=-3.719166735921395, pvalue=0.00020024515192454114)

The p-value is small enough (less than 0.05) to reject the $H_0$. In other words the avarage new model gives larger mean nDCG and it's statistically significant.

How often the new model outperforms the old one?
Let `positives`-statistics is the fraction of documents where the original ranking model gives higher nDCG than the new model. If the original model has better performance, then this statistics `positives > 0.5` and `positives < 0.5` otherwise. Let's test the $H_0$ of `positives = 0.5` against the $H_1$ of `positives < 0.5` with binomial test:

In [7]:
positives = np.sum(original_scores > model_scores)
pvalue = stats.binom_test(positives, len(model_scores), alternative="less")
print(f"Number of positive answers {positives} out of {len(model_scores)}, p-value = {pvalue}")

Number of positive answers 6064 out of 14998, p-value = 2.0825562113768356e-122


We can reject the $H_0$ at 0.05 significance level, hence the new model doesn't not perform worse than the original one