# Профильное задание: Инженер машинного обучения

Оксана Коновалова

VK, апрель 2024 

In [1]:
# Install the libraries used in this notebook (uncomment the lines to run them)
# !pip install pandas
# !pip install numpy
# !pip install scikit-learn

In [2]:
import pandas as pd
from zipfile import ZipFile

## Load data

In [3]:
# Unpack only the task CSV from the archive into the data/ directory
with ZipFile("data/intern_task.zip", mode="r") as archive:
    archive.extract('intern_task.csv', path='data')

In [4]:
# Load the extracted CSV into a DataFrame and preview its first five rows
data = pd.read_csv(filepath_or_buffer='data/intern_task.csv')

data.head(n=5)

Unnamed: 0,rank,query_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_134,feature_135,feature_136,feature_137,feature_138,feature_139,feature_140,feature_141,feature_142,feature_143
0,0,10,1.0,0.0,1.0,3.0,3.0,0.333333,0.0,0.333333,...,0.0,0.0,0.454545,0.890238,8.655534,1.0,0.077778,0.002222,1.0,0.333333
1,1,10,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.773976,23.130514,0.0,0.027826,0.00043,44.0,14.666667
2,0,10,3.0,0.0,2.0,0.0,3.0,1.0,0.0,0.666667,...,0.0,0.0,0.0,0.918308,13.351339,0.0,0.014925,0.000104,22.0,7.333333
3,1,10,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.975355,18.240926,0.0,0.05314,0.000255,8.0,2.666667
4,2,10,3.0,0.0,3.0,1.0,3.0,1.0,0.0,1.0,...,273.0,79.670665,0.2,0.990119,31.786048,0.333333,0.046512,0.000307,24.0,8.0


## Data exploration

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
data.describe()

Unnamed: 0,rank,query_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_134,feature_135,feature_136,feature_137,feature_138,feature_139,feature_140,feature_141,feature_142,feature_143
count,235258.0,235258.0,235258.0,235258.0,235258.0,235258.0,235258.0,235258.0,235258.0,235258.0,...,235258.0,235258.0,235258.0,235258.0,235258.0,235258.0,235258.0,235258.0,235258.0,235258.0
mean,0.677869,14828.413401,1.91196,0.206233,1.189847,0.550272,1.960082,0.803964,0.097557,0.541174,...,476.3432,10466.98,0.070299,0.74071,11.067947,0.281747,0.027033,0.000269,22.45791,9.560379
std,0.830033,8193.94517,1.237374,0.579089,1.037233,0.790947,1.203534,0.339955,0.266035,0.419973,...,19548.4,2649584.0,0.100721,0.3528,15.336861,0.392089,0.033351,0.002088,63.708018,14.347378
min,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,8215.0,1.0,0.0,0.0,0.0,1.0,0.666667,0.0,0.0,...,0.0,0.0,0.0,0.620522,0.0,0.0,0.006703,0.0,4.0,2.0
50%,0.0,14935.0,2.0,0.0,1.0,0.0,2.0,1.0,0.0,0.5,...,0.0,0.0,0.0,0.926779,9.4489,0.0,0.017761,8e-06,12.0,5.5
75%,1.0,21580.0,3.0,0.0,2.0,1.0,3.0,1.0,0.0,1.0,...,1.0,0.4,0.142857,0.999613,16.146733,0.5,0.034954,0.000109,28.0,12.0
max,4.0,29995.0,31.0,18.0,27.0,9.0,31.0,1.0,1.0,1.0,...,1731553.0,800000000.0,0.818182,1.0,2506.799764,1.0,0.711261,0.25,15074.0,1552.0


In [6]:
# Total number of missing cells in the whole table (0 means there are no NULLs)
data.isna().sum().sum()

0

In [7]:
# Number of distinct 'query_id' values (NaN, if present, counts as a value)
data['query_id'].nunique(dropna=False)

2000

## Model

In [8]:
from sklearn.metrics import ndcg_score, mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

In [9]:
# Random 80/20 row-level split with a fixed seed. Rows are split without regard
# to 'query_id', so documents of one query can land in both parts; the per-query
# loops below look up each training query's rows in test_data.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

### Model: RandomForestRegressor

In [10]:
from sklearn.ensemble import RandomForestRegressor

In [11]:
# Train one RandomForestRegressor per query_id on that query's training rows
# and score it on the same query's held-out rows (NDCG@5 and MSE).
ndcg_scores_rfr = []
mse_scores_rfr = []
models_rfr = {}

# Split the test set by query once, instead of re-scanning all of test_data
# with a boolean mask on every iteration of the loop.
test_by_query = {qid: rows for qid, rows in test_data.groupby('query_id')}

for query_id, group in train_data.groupby('query_id'):
    X_train = group.drop(['rank', 'query_id'], axis=1)
    y_train = group['rank']

    # Training; the seed is fixed so the reported metrics are reproducible
    model_rfr = RandomForestRegressor(random_state=42)
    model_rfr.fit(X_train, y_train)

    # Keep the fitted model for this query
    models_rfr[query_id] = model_rfr

    test_group = test_by_query.get(query_id)
    # Skip the metrics when the query has fewer than two test documents
    if test_group is None or len(test_group) < 2:
        continue
    X_test = test_group.drop(['rank', 'query_id'], axis=1)
    y_test = test_group['rank']

    # Prediction and evaluation
    y_pred_rfr = model_rfr.predict(X_test)

    ndcg_scores_rfr.append(ndcg_score([y_test], [y_pred_rfr], k=5))
    mse_scores_rfr.append(mean_squared_error(y_test, y_pred_rfr))

In [12]:
# Average the per-query metrics collected for RandomForestRegressor
mean_ndcg_rfr = np.asarray(ndcg_scores_rfr).mean()
mean_mse_rfr = np.asarray(mse_scores_rfr).mean()

print("Mean NDCG для всех сессий для RandomForestRegressor:", mean_ndcg_rfr)
print("Mean MSE для всех сессий для RandomForestRegressor:", mean_mse_rfr)

Mean NDCG для всех сессий для RandomForestRegressor: 0.5557311191114679
Mean MSE для всех сессий для RandomForestRegressor: 0.5102783692887881


In [14]:
# Fitted RandomForestRegressor for session 10 ('query_id' == 10)
models_rfr[10]

### Model: GradientBoostingRegressor

In [15]:
from sklearn.ensemble import GradientBoostingRegressor

In [16]:
# Train one GradientBoostingRegressor per query_id on that query's training
# rows and score it on the same query's held-out rows (NDCG@5 and MSE).
ndcg_scores_gbr = []
mse_scores_gbr = []
models_gbr = {}

# Split the test set by query once, instead of re-scanning all of test_data
# with a boolean mask on every iteration of the loop.
test_by_query = {qid: rows for qid, rows in test_data.groupby('query_id')}

for query_id, group in train_data.groupby('query_id'):
    X_train = group.drop(['rank', 'query_id'], axis=1)
    y_train = group['rank']

    # Training; the seed is fixed so the reported metrics are reproducible
    model_gbr = GradientBoostingRegressor(random_state=42)
    model_gbr.fit(X_train, y_train)

    # Keep the fitted model for this query
    models_gbr[query_id] = model_gbr

    test_group = test_by_query.get(query_id)
    # Skip the metrics when the query has fewer than two test documents
    if test_group is None or len(test_group) < 2:
        continue
    X_test = test_group.drop(['rank', 'query_id'], axis=1)
    y_test = test_group['rank']

    # Prediction and evaluation
    y_pred_gbr = model_gbr.predict(X_test)

    ndcg_scores_gbr.append(ndcg_score([y_test], [y_pred_gbr], k=5))
    mse_scores_gbr.append(mean_squared_error(y_test, y_pred_gbr))

In [17]:
# Average the per-query metrics collected for GradientBoostingRegressor
mean_ndcg_gbr = np.asarray(ndcg_scores_gbr).mean()
mean_mse_gbr = np.asarray(mse_scores_gbr).mean()

print("Mean NDCG для всех сессий для GradientBoostingRegressor:", mean_ndcg_gbr)
print("Mean MSE для всех сессий для GradientBoostingRegressor:", mean_mse_gbr)

Mean NDCG для всех сессий для GradientBoostingRegressor: 0.5368197277934198
Mean MSE для всех сессий для GradientBoostingRegressor: 0.5796115045253156


In [18]:
# Fitted GradientBoostingRegressor for session 10 ('query_id' == 10)
models_gbr[10]

### Model: XGBoost

In [19]:
# !pip install xgboost

In [20]:
import xgboost as xgb

In [21]:
# Train one XGBoost pairwise ranker per query_id on that query's training rows
# and score it on the same query's held-out rows (NDCG@5 and MSE).
ndcg_scores_xgb = []
mse_scores_xgb = []
models_xgb = {}

# The booster parameters do not depend on the query, so build them once.
# NOTE(review): no `evals` are passed to xgb.train below, so 'eval_metric'
# presumably has no effect on training here -- confirm.
params = {'objective': 'rank:pairwise', 'eval_metric': 'ndcg@5', 'eta': 0.1, 'max_depth': 6}

# Split the test set by query once, instead of re-scanning all of test_data
# with a boolean mask on every iteration of the loop.
test_by_query = {qid: rows for qid, rows in test_data.groupby('query_id')}

for query_id, group in train_data.groupby('query_id'):
    X_train = group.drop(['rank', 'query_id'], axis=1)
    y_train = group['rank']

    # Training
    dtrain = xgb.DMatrix(X_train, label=y_train)
    model_xgb = xgb.train(params, dtrain)

    # Keep the fitted model for this query
    models_xgb[query_id] = model_xgb

    test_group = test_by_query.get(query_id)
    # Skip the metrics when the query has fewer than two test documents
    if test_group is None or len(test_group) < 2:
        continue
    X_test = test_group.drop(['rank', 'query_id'], axis=1)
    y_test = test_group['rank']

    # Prediction and evaluation
    dtest = xgb.DMatrix(X_test)
    y_pred_xgb = model_xgb.predict(dtest)

    ndcg_scores_xgb.append(ndcg_score([y_test], [y_pred_xgb], k=5))
    # NOTE(review): with the 'rank:pairwise' objective the predictions are
    # presumably ranking scores rather than estimates of 'rank', so this MSE is
    # likely not comparable with the MSE of the regressors -- confirm.
    mse_scores_xgb.append(mean_squared_error(y_test, y_pred_xgb))

In [22]:
# Average the per-query metrics collected for XGBoost
mean_ndcg_xgb = np.asarray(ndcg_scores_xgb).mean()
mean_mse_xgb = np.asarray(mse_scores_xgb).mean()

print("Mean NDCG для всех сессий для XGboost:", mean_ndcg_xgb)
print("Mean MSE для всех сессий для XGboost:", mean_mse_xgb)

Mean NDCG для всех сессий для XGboost: 0.5287765752062529
Mean MSE для всех сессий для XGboost: 1.3085384321845166


In [23]:
# Fitted XGBoost booster for session 10 ('query_id' == 10)
models_xgb[10]

<xgboost.core.Booster at 0x1f29e6dbb30>

Модель XGBoost обучилась быстрее всех, но её метрики хуже, чем у остальных моделей.

### Model: CatBoostRegressor

In [24]:
# !pip install catboost

In [25]:
from catboost import CatBoostRegressor, Pool

In [26]:
# Train one CatBoostRegressor per query_id on that query's training rows and
# score it on the same query's held-out rows (NDCG@5 and MSE).
ndcg_scores_cbr = []
mse_scores_cbr = []
models_cbr = {}

# Split the test set by query once, instead of re-scanning all of test_data
# with a boolean mask on every iteration of the loop.
test_by_query = {qid: rows for qid, rows in test_data.groupby('query_id')}

for query_id, group in train_data.groupby('query_id'):
    X_train = group.drop(['rank', 'query_id'], axis=1)
    y_train = group['rank']

    # The model cannot be trained when every target value is the same.
    # NOTE: such queries are skipped entirely (no model, no metrics), so the
    # averages of this loop can cover fewer queries than the loops for the
    # other models, which have no such check.
    if y_train.nunique(dropna=False) == 1:
        continue

    # Training
    train_pool = Pool(X_train, y_train)
    model_cbr = CatBoostRegressor(iterations=100, depth=6, learning_rate=0.1, loss_function='RMSE')
    model_cbr.fit(train_pool, verbose=False)

    # Keep the fitted model for this query
    models_cbr[query_id] = model_cbr

    test_group = test_by_query.get(query_id)
    # Skip the metrics when the query has fewer than two test documents
    if test_group is None or len(test_group) < 2:
        continue
    X_test = test_group.drop(['rank', 'query_id'], axis=1)
    y_test = test_group['rank']

    # Prediction and evaluation
    y_pred_cbr = model_cbr.predict(X_test)

    ndcg_scores_cbr.append(ndcg_score([y_test], [y_pred_cbr], k=5))
    mse_scores_cbr.append(mean_squared_error(y_test, y_pred_cbr))

In [27]:
# Average the per-query metrics collected for CatBoostRegressor
mean_ndcg_cbr = np.asarray(ndcg_scores_cbr).mean()
mean_mse_cbr = np.asarray(mse_scores_cbr).mean()

print("Mean NDCG для всех сессий для CatBoostRegressor:", mean_ndcg_cbr)
print("Mean MSE для всех сессий для CatBoostRegressor:", mean_mse_cbr)

Mean NDCG для всех сессий для CatBoostRegressor: 0.5737288934590689
Mean MSE для всех сессий для CatBoostRegressor: 0.5129707762004034


In [28]:
# Fitted CatBoostRegressor for session 10 ('query_id' == 10)
models_cbr[10]

<catboost.core.CatBoostRegressor at 0x1f20ffd4980>

Если сравнить метрики, лучший NDCG@5 у CatBoostRegressor (0.574 против 0.556 у RandomForestRegressor, 0.537 у GradientBoostingRegressor и 0.529 у XGBoost), а по MSE он практически не уступает RandomForestRegressor (0.513 против 0.510). Однако обучение CatBoostRegressor заняло больше всего времени. Если нужна более быстрая модель с небольшой потерей в качестве ранжирования, лучше воспользоваться RandomForestRegressor.