# **Профильное задание для стажировки VK на направление "Инженер машинного обучения"**

*   Реализованы две модели для сравнения: первая на основе LGBMRanker, вторая на основе CatBoostRanker
*   Посчитаны метрики NDCG_5, NDCG_10 и NDCG_50




In [41]:
!pip install lightgbm

In [None]:
!pip install catboost

In [None]:
!pip install optuna

In [4]:
import lightgbm as lgb
from catboost import CatBoostRanker
from catboost import Pool
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score
from sklearn.model_selection import RandomizedSearchCV
from copy import deepcopy

In [5]:
from google.colab import drive

In [6]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [7]:
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
file_path = '/content/drive/My Drive/intern_task.csv'
df = pd.read_csv(file_path)

## **Модель LightGBM**

In [11]:
df = df[df.groupby('query_id').query_id.transform(len) > 1]

В датасете нет пропусков в каких-либо ячейках, присутствуют только три строки с уникальными query_id

In [12]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [13]:
train_len = int(df.shape[0] * 0.6)
val_len = int(df.shape[0] * 0.8)

In [14]:
train_df = df[:train_len].copy()
val_df = df[train_len:val_len].copy()
test_df = df[val_len:].copy()

In [15]:
def get_data(data):
    qids = data.groupby("query_id")["query_id"].count().to_numpy()
    X = data.drop(["query_id", "rank"], axis=1)
    y = data["rank"]
    return qids, X, y

In [16]:
qids_train, X_train, y_train = get_data(train_df)
qids_val, X_val, y_val = get_data(val_df)
qids_test, X_test, y_test= get_data(test_df)

In [17]:
X_test.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_134,feature_135,feature_136,feature_137,feature_138,feature_139,feature_140,feature_141,feature_142,feature_143
188206,2.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,800.0017,1.0,...,0.0,0.0,0.0,0.994411,0.0,0.0,0.030891,1.3e-05,43.0,21.5
188207,2.0,0.0,2.0,1.0,2.0,1.0,0.0,1.0,800.005951,1.0,...,0.0,0.0,0.125,0.994877,11.108381,0.5,0.025641,8e-06,81.0,40.5
188208,2.0,0.0,2.0,2.0,2.0,1.0,0.0,1.0,800.010202,1.0,...,0.0,0.0,0.25,0.902742,11.108381,1.0,0.025862,0.000177,20.0,10.0
188209,2.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,800.014452,1.0,...,61.0,36.35,0.1,0.925872,11.108381,0.5,0.025306,0.000141,88.0,44.0
188210,2.0,0.0,2.0,2.0,2.0,1.0,0.0,1.0,800.018703,1.0,...,0.0,0.0,0.285714,0.971648,11.108381,1.0,0.069048,0.000306,25.0,12.5


In [18]:
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [20]:
lgb_model = lgb.LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    verbose = -1,
    learning_rate = 0.08,
    n_estimators = 100,
    num_leaves = 29,
    max_depth =-1
) #{'learning_rate': 0.08, 'n_estimators': 100, 'num_leaves': 25, 'max_depth': 99}

In [None]:
import optuna
def objective(trial):

    ranking_param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.06, 0.14, step=0.02),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500, step=100),
        'num_leaves': trial.suggest_int('num_leaves', 25, 35, 1),
        'max_depth': trial.suggest_int('max_depth', -1, 500, 50)
    }

    gbm = lgb.LGBMRanker(objective="lambdarank", metric="ndcg",n_jobs=24, **ranking_param_grid)
    gbm.fit(X_train, y_train, group=qids_train, eval_set=[(X_val, y_val)],
                    eval_metric='ndcg', eval_group=[qids_val], eval_at=[10, 50, 100])

    return list(gbm.best_score_['valid_0'].values())[2]

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

print(len(study.trials))
print(study.best_trial.params)

learning_rate = 0.08,
n_estimators = 100,
num_leaves = 29,
max_depth =-1 - лучшие параметры найденные Optuna

In [21]:
lgb_model.fit(
    X=X_train,
    y=y_train,
    group=qids_train,
    eval_set=[(X_val, y_val)],
    eval_group=[qids_val],
    eval_at=10,
)

In [40]:
def ndcg_calc(df, X, test_model, kk):
    df['temp_col'] = test_model.predict(X)
    scores = []
    for query_id, group in df.groupby('query_id'):
        rank_true= group['rank'].values.reshape(1, -1)
        rank_pred = group['temp_col'].values.reshape(1, -1)
        try:
            ndcg = ndcg_score(rank_true, rank_pred, k = kk)
            scores.append(ndcg)
        except ValueError:
            continue
    df.drop(columns = ['temp_col'], inplace = True)
    return np.mean(scores)

print(f"Average NDCG@5: {ndcg_calc(test_df, X_test, lgb_model, 5)}")
print(f"Average NDCG@10: {ndcg_calc(test_df, X_test, lgb_model, 10)}")
print(f"Average NDCG@50: {ndcg_calc(test_df, X_test, lgb_model, 50)}")

Average NDCG@5: 0.5337795509649562
Average NDCG@10: 0.5444772958680018
Average NDCG@50: 0.6329924174538855


## **Модель CatBoost**

In [23]:
def get_pool(data):
    qids = data["query_id"]
    X = scaler.transform(data.drop(["query_id", "rank"], axis=1))
    y = data["rank"]
    return Pool(X, label = y, group_id = qids)

In [24]:
P_train = get_pool(train_df)
P_val = get_pool(val_df)
P_test = get_pool(test_df)

In [26]:
default_parameters = {}

In [27]:
def fit_model(loss_function, additional_params=None, train_pool=P_train, test_pool=P_test):
    parameters = deepcopy(default_parameters)
    parameters['loss_function'] = loss_function
    parameters['train_dir'] = loss_function

    if additional_params is not None:
        parameters.update(additional_params)

    model = CatBoostRanker(**parameters)
    model.fit(train_pool, eval_set=test_pool, plot=True)

    return model

In [37]:
catboost_model = fit_model('YetiRank', {
    'custom_metric': ['NDCG:top=5', 'MAP:top=10', 'NDCG:top=10'],
    'iterations' : 10000,
    'task_type' : 'GPU',
    'metric_period' : 100
    })

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric MAP:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7143065	best: 0.7143065 (0)	total: 34.2ms	remaining: 5m 42s
100:	test: 0.7768248	best: 0.7768248 (100)	total: 1.05s	remaining: 1m 43s
200:	test: 0.7838938	best: 0.7838938 (200)	total: 1.93s	remaining: 1m 34s
300:	test: 0.7872023	best: 0.7872023 (300)	total: 2.8s	remaining: 1m 30s
400:	test: 0.7893885	best: 0.7893885 (400)	total: 3.63s	remaining: 1m 26s
500:	test: 0.7920726	best: 0.7920726 (500)	total: 4.5s	remaining: 1m 25s
600:	test: 0.7932348	best: 0.7932348 (600)	total: 5.35s	remaining: 1m 23s
700:	test: 0.7936101	best: 0.7936101 (700)	total: 6.66s	remaining: 1m 28s
800:	test: 0.7952112	best: 0.7952112 (800)	total: 9.44s	remaining: 1m 48s
900:	test: 0.7957301	best: 0.7957301 (900)	total: 10.3s	remaining: 1m 44s
1000:	test: 0.7962834	best: 0.7962834 (1000)	total: 11.2s	remaining: 1m 40s
1100:	test: 0.7968989	best: 0.7968989 (1100)	total: 12.1s	remaining: 1m 37s
1200:	test: 0.7972536	best: 0.7972536 (1200)	total: 12.9s	remaining: 1m 34s
1300:	test: 0.7978133	best: 0.7978133

In [39]:
print(f"NDCG@5: {ndcg_calc(test_df, X_test, catboost_model, 5)}")
print(f"NDCG@10: {ndcg_calc(test_df, X_test, catboost_model, 10)}")
print(f"NDCG@50: {ndcg_calc(test_df, X_test, catboost_model, 50)}")

NDCG@5: 0.5406855930718129
NDCG@10: 0.5491067934887015
NDCG@50: 0.6393799916577184


Метрики не сильно отличаются от метрик, полученных с помощью LightGBM, разница не более 1.5% в пользу CatBoost