## В данном задании предлагается решить задачу ранжирования документов

Для начала подключим необходимые библиотеки для работы

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from google.colab import drive
drive.mount('/content/drive/')

sns.set(font_scale=1, style='darkgrid', palette='Set2')

Mounted at /content/drive/


Загрузим данные

In [32]:
data = pd.read_csv('/content/drive/MyDrive/intern_task.csv')
data.head(10)

Unnamed: 0,rank,query_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_134,feature_135,feature_136,feature_137,feature_138,feature_139,feature_140,feature_141,feature_142,feature_143
0,0,10,1.0,0.0,1.0,3.0,3.0,0.333333,0.0,0.333333,...,0.0,0.0,0.454545,0.890238,8.655534,1.0,0.077778,0.002222,1.0,0.333333
1,1,10,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.773976,23.130514,0.0,0.027826,0.00043,44.0,14.666667
2,0,10,3.0,0.0,2.0,0.0,3.0,1.0,0.0,0.666667,...,0.0,0.0,0.0,0.918308,13.351339,0.0,0.014925,0.000104,22.0,7.333333
3,1,10,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.975355,18.240926,0.0,0.05314,0.000255,8.0,2.666667
4,2,10,3.0,0.0,3.0,1.0,3.0,1.0,0.0,1.0,...,273.0,79.670665,0.2,0.990119,31.786048,0.333333,0.046512,0.000307,24.0,8.0
5,1,10,3.0,0.0,3.0,1.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.2,0.974819,22.936731,0.333333,0.033233,9.3e-05,28.0,9.333333
6,1,10,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,3.0,16.2,0.0,0.945281,18.240926,0.0,0.013008,2.3e-05,5.0,1.666667
7,0,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,10,2.0,0.0,1.0,0.0,2.0,0.666667,0.0,0.333333,...,218.0,55.069946,0.0,0.448807,4.695805,0.0,0.002153,2e-06,5.0,1.666667
9,0,10,3.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.975608,0.0,0.0,0.021583,4e-05,9.0,3.0


Проверим датасет на пропуски и удалим их

In [3]:
print(data.isna().sum())
print("----------------------------")
print((data == ' ').sum())

rank           0
query_id       0
feature_0      0
feature_1      0
feature_2      0
              ..
feature_139    0
feature_140    0
feature_141    0
feature_142    0
feature_143    0
Length: 146, dtype: int64
----------------------------
rank           0
query_id       0
feature_0      0
feature_1      0
feature_2      0
              ..
feature_139    0
feature_140    0
feature_141    0
feature_142    0
feature_143    0
Length: 146, dtype: int64


In [4]:
print(data.dtypes)

rank             int64
query_id         int64
feature_0      float64
feature_1      float64
feature_2      float64
                ...   
feature_139    float64
feature_140    float64
feature_141    float64
feature_142    float64
feature_143    float64
Length: 146, dtype: object


In [5]:
data = data.dropna()

Создадим список фичей

In [33]:
features = list(data.columns[2::])

In [7]:
data["rank"].max()

4

Будем решать задачу с помощью ```CatBoostRegressor```. Будем прогонять данные через модель, затем мы округлим полученные значения ранга документа и в рамках одого запроса будем сортировать документы по полученному рангу

Установим библиотеку ```catboost``` и импортируем ```ndcg_score```

In [8]:
!pip install catboost
from sklearn.metrics import ndcg_score
from catboost import CatBoostRegressor

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


Разделим данные на тестовую и обучающую выборки. Размер тестовой выборки возьмем равную 0.2 части от изначальных данных

In [49]:
X_train, X_test, y_train, y_test = train_test_split(data[features], data['rank'], test_size=0.2)
print(f'Размер X_train = {len(X_train)}')
print(f'Размер X_test = {len(X_test)}')
print(f'Размер y_train = {len(y_train)}')
print(f'Размер y_test = {len(y_test)}')

Размер X_train = 188206
Размер X_test = 47052
Размер y_train = 188206
Размер y_test = 47052


Объявим модель и обучим ее на обучающей выборке

In [50]:
model = CatBoostRegressor()
model.fit(X_train, y_train)

Learning rate set to 0.093664
0:	learn: 0.8198559	total: 122ms	remaining: 2m 2s
1:	learn: 0.8129377	total: 220ms	remaining: 1m 49s
2:	learn: 0.8070815	total: 350ms	remaining: 1m 56s
3:	learn: 0.8018270	total: 454ms	remaining: 1m 52s
4:	learn: 0.7974948	total: 561ms	remaining: 1m 51s
5:	learn: 0.7937888	total: 704ms	remaining: 1m 56s
6:	learn: 0.7904445	total: 838ms	remaining: 1m 58s
7:	learn: 0.7874827	total: 947ms	remaining: 1m 57s
8:	learn: 0.7848125	total: 1.07s	remaining: 1m 57s
9:	learn: 0.7824734	total: 1.19s	remaining: 1m 57s
10:	learn: 0.7804492	total: 1.32s	remaining: 1m 58s
11:	learn: 0.7785069	total: 1.43s	remaining: 1m 57s
12:	learn: 0.7769194	total: 1.53s	remaining: 1m 56s
13:	learn: 0.7752629	total: 1.64s	remaining: 1m 55s
14:	learn: 0.7738508	total: 1.74s	remaining: 1m 54s
15:	learn: 0.7725308	total: 1.84s	remaining: 1m 53s
16:	learn: 0.7711873	total: 1.96s	remaining: 1m 53s
17:	learn: 0.7701174	total: 2.06s	remaining: 1m 52s
18:	learn: 0.7692071	total: 2.18s	remaining: 

<catboost.core.CatBoostRegressor at 0x7ebe529d9f30>

Предскажем ранги документов и посчитаем метрику ```ndcg```

In [51]:
y_pred = model.predict(X_test)
Y_pred = np.round(y_pred)
ndcg_5 = ndcg_score([y_test], [y_pred], k=5)
print(f"NDCG_5 = {ndcg_5}")

NDCG_5 = 0.9210878868801213


Итого, в данной задаче необходимо было отранжировать документы из одного запроса. Задача решалась с помощью библиотеки ```catboost```.  Затем, округлялись полученные значения ранга документа. Можно видеть, что метрика ```ndcg_5``` показала значение $0.92$.