<a href="https://colab.research.google.com/github/lexoz-bedra/rank_model_vk/blob/main/vk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import ndcg_score
from xgboost import XGBClassifier, XGBRanker

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from google.colab import drive

In [41]:
drive.mount('/content/drive/')

Mounted at /content/drive/


In [42]:
data = pd.read_csv('/content/drive/My Drive/vk/intern_task_1.csv')

In [43]:
data.head(3)

Unnamed: 0,rank,query_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_134,feature_135,feature_136,feature_137,feature_138,feature_139,feature_140,feature_141,feature_142,feature_143
0,0,10,1.0,0.0,1.0,3.0,3.0,0.333333,0.0,0.333333,...,0.0,0.0,0.454545,0.890238,8.655534,1.0,0.077778,0.002222,1.0,0.333333
1,1,10,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.773976,23.130514,0.0,0.027826,0.00043,44.0,14.666667
2,0,10,3.0,0.0,2.0,0.0,3.0,1.0,0.0,0.666667,...,0.0,0.0,0.0,0.918308,13.351339,0.0,0.014925,0.000104,22.0,7.333333


# EDA & preprocessing

In [44]:
only_vals = []

for col in data.drop(columns=['query_id', 'rank']).columns:
  if data[col].nunique() == 1:
    print(col, data[col].unique())
    only_vals.append(col)

feature_64 [0]
feature_65 [0]
feature_72 [1]
feature_100 [0]


In [5]:
data = data.drop(columns=only_vals)

In [6]:
print(f'Shape: {data.shape}')
print(f'Columns: {data.columns}')

Shape: (221999, 142)
Columns: Index(['rank', 'query_id', 'feature_0', 'feature_1', 'feature_2', 'feature_3',
       'feature_4', 'feature_5', 'feature_6', 'feature_7',
       ...
       'feature_134', 'feature_135', 'feature_136', 'feature_137',
       'feature_138', 'feature_139', 'feature_140', 'feature_141',
       'feature_142', 'feature_143'],
      dtype='object', length=142)


In [7]:
print(data.shape)
data.drop_duplicates(inplace=True)
print(data.shape)

(221999, 142)
(221999, 142)


In [8]:
data.isnull().sum().unique()

array([0, 1])

In [9]:
print(data.shape)
data.dropna(inplace=True)
print(data.shape)

(221999, 142)
(221998, 142)


In [10]:
data.dtypes.unique()

array([dtype('int64'), dtype('float64')], dtype=object)

In [11]:
data['rank'].unique()

array([0, 1, 2, 4, 3])

In [12]:
counts = data['query_id'].value_counts()

counts

query_id
22540    908
13930    773
22450    522
19945    444
9940     430
        ... 
11410      3
22780      2
14350      2
9265       1
20560      1
Name: count, Length: 1851, dtype: int64

In [13]:
corr = data.corr(method='pearson')

In [14]:
for col in corr.columns:
  for row in corr.iterrows():
    if corr[row[0]][col] > 0.9 and corr[row[0]][col] < 1:
      print(row[0], col, corr[row[0]][col])

feature_8 query_id 0.9991181867845841
feature_20 query_id 0.9991181867847642
feature_35 query_id 0.999118186784735
feature_4 feature_0 0.979259838143921
feature_101 feature_1 0.9029509304923932
feature_106 feature_1 0.9060398238054369
feature_23 feature_3 0.9596961337778669
feature_73 feature_3 0.9299891165503587
feature_108 feature_3 0.9186752621195532
feature_136 feature_3 0.9150628655387286
feature_0 feature_4 0.979259838143921
feature_137 feature_5 0.9024311630329874
feature_101 feature_6 0.9784095428915781
feature_102 feature_7 0.9604444534072172
query_id feature_8 0.9991181867845841
feature_20 feature_8 0.9999999999999991
feature_35 feature_8 0.9999999999999974
feature_14 feature_10 0.9973991673936746
feature_126 feature_13 0.9161563871922709
feature_10 feature_14 0.9973991673936746
feature_19 feature_15 0.9998861676955451
feature_17 feature_16 0.9917524995419423
feature_18 feature_16 0.9887493508203611
feature_16 feature_17 0.9917524995419423
feature_18 feature_17 0.984960095772

In [15]:
corr['rank'].drop('rank').sort_values(ascending=False)

feature_97     0.236579
feature_7      0.235747
feature_107    0.217172
feature_122    0.211948
feature_102    0.207539
                 ...   
feature_18    -0.122099
feature_17    -0.127172
feature_10    -0.127519
feature_14    -0.127711
feature_16    -0.128370
Name: rank, Length: 141, dtype: float64

In [16]:
corr['rank']['query_id']

0.06763902711136546

Мы увидели, что в датасете все признаки численные, пропусков нет, дубликатов нет, сильной корреляции с ранком ни у какого конкретного признака нет. Однако присутствует мультиколлинеарность (некоторые фичи очень сильно коррелируют друг с другом), поэтому в обучении модели будем использовать L1-регуляризацию.

Также важно отметить, что ранк не уникален для одного `query_id`, то есть в рамках одной сессии мы можем получить несколько одинаково релевантных результатов.
В данном случае эффективно будет использовать pointwise или pairwise подход для обучения ранжированию. Попробуем разные модели и решим, какая лучше, на основании результатов.

Train и test-датасеты сделаем из исходной выборки простым `train_test_split` из sklearn. Также сделаем валидационный датасет, так как у нас довольно много данных.

# Models development & training

Построим и обучим 2 модели: XGBClassifier и XGBRanker. Посмотрим, что сработает лучше для задачи ранжирования.

В качестве метрик будем использовать nDCG@5 и pFound.

In [17]:
X = data.drop(['query_id', 'rank'], axis=1)
y = data['rank']
query_id = data['query_id']

In [18]:
X_train, X_temp, y_train, y_temp, query_id_train, query_id_temp = train_test_split(
    X, y, query_id, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test, query_id_val, query_id_test = train_test_split(
    X_temp, y_temp, query_id_temp, test_size=0.5, random_state=42)

In [19]:
eval_set = [(X_val, y_val)]

## XGBoost Classifier

In [20]:
xgb_clf = XGBClassifier(n_estimators=1000, learning_rate=0.1,
                        max_depth=6,
                        objective='multi:softprob',
                        verbosity=0, use_label_encoder=False,
                        tree_method='hist', reg_alpha=0.005,
                        eval_set=eval_set, verbose=True, device='gpu')

In [21]:
xgb_clf.fit(X_train, y_train, verbose=True)

In [22]:
# подсчёт ndcg@5

y_pred_xgb_clf = xgb_clf.predict(X_test)

def avg_ndcg(y_true, y_score, query_id_score, cnt):
    ndcg_scores = []
    for qid in np.unique(query_id_score):
        indices = np.where(query_id_score == qid)[0]
        true_rel = y_true.iloc[indices].values
        pred_scores = y_score[indices]

        if len(true_rel) == 1:
            continue

        rel_sorted_idx = np.argsort(-pred_scores, axis=0)[:]
        sorted_pred_scores = pred_scores[rel_sorted_idx]

        k = min(cnt, len(true_rel))
        ndcg = ndcg_score([true_rel], [sorted_pred_scores], k=k)
        ndcg_scores.append(ndcg)

    average_ndcg = np.mean(ndcg_scores) if ndcg_scores else float('nan')

    return average_ndcg

In [23]:
print(f'Average NDCG@5: {avg_ndcg(y_test, y_pred_xgb_clf, query_id_test, 5)}')

Average NDCG@5: 0.33931625111099906


In [24]:
# подсчёт pFound

def pfound(y_true, y_score, query_ids, p_break=0.15):

    y_true = y_true.values

    p_found_values = []

    for qid in np.unique(query_ids):
        indices = np.where(query_ids == qid)[0]

        y_true_normalized = y_true[indices] / 4
        y_score_sorted = y_score[indices]
        order = np.argsort(-y_score_sorted)
        y_true_sorted = y_true_normalized[order]

        p_look = 1
        p_found = 0

        for rel in y_true_sorted:
            p_found += p_look * rel
            p_look *= (1 - rel) * (1 - p_break)

        p_found_values.append(p_found)

    return np.mean(p_found_values)

In [25]:
print(f'PFound: {pfound(y_test, y_pred_xgb_clf, query_id_test)}')

PFound: 0.5882059154946356


## XGBoost Ranker

In [26]:
xgb_rank = XGBRanker(n_estimators=1000, learning_rate=0.1,
                        max_depth=6,
                        objective='rank:pairwise',
                        verbosity=0, use_label_encoder=False,
                        tree_method='hist', reg_alpha=0.005, verbose=True,
                        eval_set=eval_set, device='gpu')

In [27]:
sorted_idx = query_id_train.argsort()
X_train_sorted = X_train.iloc[sorted_idx]
y_train_sorted = y_train.iloc[sorted_idx]
query_id_train_sorted = query_id_train.iloc[sorted_idx]

group_train = query_id_train_sorted.groupby(query_id_train_sorted).size().values

In [28]:
xgb_rank.fit(X_train_sorted, y_train_sorted, group=group_train)

In [29]:
y_pred_xgb_rank = xgb_rank.predict(X_test)

In [30]:
print(f'Average NDCG@5: {avg_ndcg(y_test, y_pred_xgb_rank, query_id_test, 5)}')

Average NDCG@5: 0.3435369176066209


In [31]:
print(f'PFound: {pfound(y_test, y_pred_xgb_rank, query_id_test)}')

PFound: 0.6274360695412513


Низкое значение NDCG@5 в обоих случаях может свидетельствовать о том, что в выборке немного достаточно релевантных результатов, а также о том, что данные модели или гиперпараметры плохо подходят для этой задачи или этого датасета.

Значение pFound - вероятности, что в каждом случае пользователь найдёт что-то достаточно релевантное, прежде чем остановит свой поиск - довольно высокое в обоих случаях и немного выше во втором, что может свидетельствовать о том, что вторая модель решает задачу ранжирования с данными условиями чуть лучше.