In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
from catboost import CatBoostRanker, Pool, CatBoostClassifier
from catboost.utils import eval_metric
from IPython import display as disp
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score

## NDCG

Так как имеется значительное количество групп, для которых нет релевантного результа, а единого решения, как считать ndcg, когда idcg = 0, я не нашел, то буду считать 2 варинта метрики

In [6]:
def dcg_score(y_true, y_score):
    """Discounted cumulative gain (DCG) at rank K.

    Parameters
    ----------
    y_true : array,
        Ground truth (true relevance labels).
    y_score : array,
        Predicted scores.
    
    Returns
    -------
    score : float
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order)

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)

In [7]:
def ndcg_score(ground_truth, predictions, idcg_zero_policy = 1):
    """Normalized discounted cumulative gain (NDCG) at rank K.

    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.

    Parameters
    ----------
    ground_truth : array, shape = [n_samples]
        Ground truth (true labels represended as integers).
    predictions : array, shape = [n_samples, n_classes]
        Predicted probabilities.
    idcg_zero_policy : [0,1,'ignore'] 
        ndcg if idcg = 0
    Returns
    -------
    score : float
    """
    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(ground_truth, predictions):
        ignore = False
        actual = dcg_score(y_true, y_score)
        best = dcg_score(y_true, y_true)
        if best < 1e-8:
            if idcg_zero_policy == 0:
                scores.append(0)
            elif idcg_zero_policy == 1:
                scores.append(1.)
            elif idcg_zero_policy == 'ignore':
                ignore = True
        else:
            if not ignore:
                score = float(actual) / float(best)
                scores.append(score)
                
    return np.mean(scores)

## Предобработка

In [8]:
train_df = pd.read_csv('../data/train_df.csv')
test_df = pd.read_csv('../data/test_df.csv')

In [9]:
display(train_df,test_df)

Unnamed: 0,search_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,target
0,758,9,0,0,1,20,3,40,0,3,...,0.204682,0.271755,0.055623,0,0,0,0.38648,0.0,0.0,0
1,758,9,0,0,1,20,3,40,0,3,...,0.195531,0.188787,0.036914,0,0,0,0.10982,0.0,0.0,0
2,758,9,0,0,1,20,3,40,0,3,...,0.148609,0.186517,0.027718,0,0,0,0.03674,0.0,0.0,0
3,758,9,0,0,1,20,3,40,0,3,...,0.223748,0.229039,0.051247,0,0,0,0.00000,0.0,0.0,0
4,758,9,0,0,1,20,3,40,0,3,...,0.170935,0.249031,0.042568,0,0,0,0.00000,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15076,494693,9,0,0,0,9,4,38,6,6,...,0.309672,0.921060,0.285226,0,0,0,0.98807,0.0,0.0,0
15077,494693,9,0,0,0,9,4,38,6,6,...,0.303805,0.995086,0.302312,0,0,0,0.87146,0.0,0.0,0
15078,494693,9,0,0,0,9,4,38,6,6,...,0.346538,0.993070,0.344137,0,0,0,0.49999,0.0,0.0,0
15079,494693,9,0,0,0,9,4,38,6,6,...,0.243154,0.994833,0.241898,0,0,0,0.67614,0.0,0.0,0


Unnamed: 0,search_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,target
0,10655,9,0,0,1,20,4,40,0,0,...,0.148830,0.196644,0.029267,0,0,0,0.03674,0.0,0.0,0
1,10655,9,0,0,1,20,4,40,0,0,...,0.119724,0.174199,0.020856,0,0,0,0.00000,0.0,0.0,0
2,10655,9,0,0,1,20,4,40,0,0,...,0.160606,0.198780,0.031925,0,0,0,0.00000,0.0,0.0,0
3,10655,9,0,0,1,20,4,40,0,0,...,0.180191,0.187882,0.033855,0,0,0,0.00000,0.0,0.0,0
4,10655,9,0,0,1,20,4,40,0,0,...,0.117308,0.153586,0.018017,0,0,0,0.00000,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1524,493078,9,0,0,0,9,4,35,0,0,...,0.341683,0.067348,0.023012,0,0,0,0.46108,0.0,0.0,0
1525,493078,9,0,0,0,9,4,35,0,0,...,0.270293,0.049000,0.013244,0,0,0,0.03674,0.0,0.0,0
1526,493078,9,0,0,0,9,4,35,0,0,...,0.372268,0.069882,0.026015,0,0,0,0.14540,0.0,0.0,1
1527,493078,9,0,0,0,9,4,35,0,0,...,0.355755,0.077469,0.027560,0,0,0,0.21288,0.0,0.0,1


In [10]:
train_df['search_id'].value_counts()

search_id
156182    20
8591      20
226704    20
227432    20
315998    20
          ..
155433     1
271166     1
178343     1
387764     1
303366     1
Name: count, Length: 1000, dtype: int64

In [11]:
test_df['search_id'].value_counts()

search_id
333637    20
381319    20
434883    20
356764    20
480809    20
          ..
412165     4
316202     4
399604     4
33453      2
312348     1
Name: count, Length: 100, dtype: int64

In [12]:
#есть ли пересечения групп в трейне и тесте
set(test_df['search_id'].unique()) & set(train_df['search_id'].unique())

set()

In [13]:
train_df['target'].value_counts()

target
0    14759
1      322
Name: count, dtype: int64

Смотрим, есть ли признаки с 0 дисперсией

In [14]:
cols_w_0_var_train = [col for col in train_df.columns if train_df.var()[col] == 0]
cols_w_0_var_train

['feature_0', 'feature_73', 'feature_74', 'feature_75']

In [15]:
cols_w_0_var_test = [col for col in test_df.columns if test_df.var()[col] == 0]
cols_w_0_var_test

['feature_0', 'feature_73', 'feature_74', 'feature_75']

In [16]:
train_df.loc[0,cols_w_0_var_train]

feature_0     9.0
feature_73    0.0
feature_74    0.0
feature_75    0.0
Name: 0, dtype: float64

In [17]:
test_df.loc[0,cols_w_0_var_test]

feature_0     9.0
feature_73    0.0
feature_74    0.0
feature_75    0.0
Name: 0, dtype: float64

значения и там и там одинаковые -> можно удалять

In [18]:
train_df = train_df.drop(cols_w_0_var_train,axis=1)
test_df = test_df.drop(cols_w_0_var_test,axis=1)

In [19]:
# plt.figure(figsize=(50,50))
# corr_matrix = train_df.corr()
# sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
# plt.title('Матрица корреляций')
# plt.savefig('../expirements/corr_person.png')

In [20]:
# plt.figure(figsize=(50,50))
# corr_matrix = train_df.corr('spearman')
# sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
# plt.title('Матрица корреляций')
# plt.savefig('../expirements/corr_spearman.png')

#### Удаление корреляций

In [21]:
X = train_df.drop(['search_id','target'],axis =1)
group_id = train_df['search_id']
y = train_df['target']

In [22]:
corr_f = X.corr()
upper = corr_f.where(np.triu(np.ones(corr_f.shape), k=1).astype(bool))
upper

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_76,feature_77,feature_78
feature_1,,-0.017054,-0.054672,-0.054672,0.038106,0.045054,0.514021,0.222314,-0.013512,0.036547,...,-0.024909,0.036379,0.024205,-0.036493,0.023376,0.076516,0.101744,0.022636,-0.025528,-0.023964
feature_2,,,0.066014,0.066014,-0.008419,-0.024244,-0.051710,-0.048600,-0.004476,-0.003273,...,-0.020644,-0.010678,0.008217,0.184478,-0.026022,0.000258,-0.023006,-0.028082,-0.005987,-0.001722
feature_3,,,,1.000000,-0.007873,0.017213,-0.057525,0.005045,0.005249,-0.016109,...,0.026314,0.018691,0.019376,0.035253,-0.050584,0.142552,0.077396,0.044092,0.022348,0.023254
feature_4,,,,,-0.007873,0.017213,-0.057525,0.005045,0.005249,-0.016109,...,0.026314,0.018691,0.019376,0.035253,-0.050584,0.142552,0.077396,0.044092,0.022348,0.023254
feature_5,,,,,,-0.086111,0.065607,0.049628,-0.010910,0.016359,...,0.001226,0.020756,0.031695,0.018836,0.037763,0.064221,0.067338,0.029528,-0.051923,-0.046336
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
feature_71,,,,,,,,,,,...,,,,,,,0.900260,0.433373,0.017872,0.041132
feature_72,,,,,,,,,,,...,,,,,,,,0.471807,0.035760,0.053233
feature_76,,,,,,,,,,,...,,,,,,,,,0.061974,0.076301
feature_77,,,,,,,,,,,...,,,,,,,,,,0.967949


In [23]:
to_drop = [column for column in upper.columns if any(upper[column] > 0.85)]
len(to_drop)

9

In [24]:
corr_features = X[to_drop]

In [25]:
X_1 = X.drop(to_drop,axis = 1)
X_1.shape

(15081, 66)

#### Сплит

In [26]:
group_id[round(len(group_id)*0.75)-5:round(len(group_id)*0.75)+5]

11306    138696
11307    138696
11308    138696
11309    139380
11310    139380
11311    139380
11312    139380
11313    139380
11314    139380
11315    139380
Name: search_id, dtype: int64

разбиваем по 11309

In [27]:
X_train = X_1[:11309]
X_val = X_1[11309:]
y_train = y[:11309]
y_val = y[11309:]
group_id_train = group_id[:11309]
group_id_val = group_id[11309:]

#### Нормализация

In [28]:
from sklearn.preprocessing import StandardScaler

In [29]:
normalizer = StandardScaler()
X_train_norm = normalizer.fit_transform(X_train)
X_val_norm = normalizer.transform(X_val)

In [30]:
X_train_norm = pd.DataFrame(X_train_norm,columns=X_train.columns)
X_val_norm = pd.DataFrame(X_val_norm,columns=X_val.columns)

In [31]:
train_pool = Pool(
    data=X_train_norm.values,
    label=y_train.values,
    group_id = group_id_train,
)

val_pool = Pool(
    data=X_val_norm.values,
    label=y_val.values,
    group_id = group_id_val,
)

In [32]:
group_interval_val = [0]
group_interval_val.extend(pd.DataFrame(group_id_val).groupby('search_id').agg({'search_id':'count'}).values.reshape(1,-1)[0].tolist())
group_interval_val[:5]

[0, 18, 14, 20, 14]

In [33]:
for i in range(1,len(group_interval_val)):
    group_interval_val[i] +=group_interval_val[i-1]
group_interval_val[:5]

[0, 18, 32, 52, 66]

---

## Модели

### CatBoost

#### Classifier

In [34]:
params = parameters = {
    'iterations': 2000,
    'verbose': True,
    'random_seed': 42,
    'eval_metric': 'AUC',
    'od_wait': 20,
    'learning_rate': 1e-2,
    'l2_leaf_reg': 1e-1,
    'max_depth': 12,
    'od_type' : 'Iter',
    'auto_class_weights': 'Balanced',
}

In [35]:
gbc = CatBoostClassifier(**params)

In [None]:
gbc.fit(train_pool, eval_set=val_pool, plot=False)

In [37]:
preds_gbc = gbc.predict(val_pool)
preds_gbc

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [39]:
pd.DataFrame(preds_gbc,columns=['preds']).value_counts(normalize=True)

preds
0        0.999205
1        0.000795
Name: proportion, dtype: float64

In [41]:
y_true = []
y_score = []
for i in range(1,len(group_interval_val)):
    y_true_i = y_val[group_interval_val[i-1]:group_interval_val[i]].values.tolist()
    y_true.append(y_true_i)
    y_score_i = preds_gbc[group_interval_val[i-1]:group_interval_val[i]].tolist()
    y_score.append(y_score_i)

In [42]:
print('Тестовый NDCG(0): {}'.format(ndcg_score(y_true,y_score,0)))
print('Тестовый NDCG(1): {}'.format(ndcg_score(y_true,y_score,1)))
print('Тестовый NDCG(ignore): {}'.format(ndcg_score(y_true,y_score,'ignore')))

Тестовый NDCG(0): 0.11034904014237581
Тестовый NDCG(1): 0.875781138907808
Тестовый NDCG(ignore): 0.4704353816596022


In [43]:
ndcg_score([y_val.values],[preds_gbc])

0.4715766796517793

#### YetiRank

In [114]:
parameters = {
    'iterations': 2000,
    'custom_metric': ['NDCG'],
    'verbose': True,
    'loss_function' : 'YetiRank',
    'random_seed': 42,
    'learning_rate': 3e-1,
    'l2_leaf_reg': 1e-1,
    'max_depth': 12,
    'od_wait': 20,
    'od_type' : 'Iter',
}

In [115]:
gbr = CatBoostRanker(**parameters)

In [None]:
gbr.fit(train_pool, eval_set=val_pool, plot=False)

In [117]:
gbr.shrink(35)

In [118]:
preds_gbr = gbr.predict(val_pool)

In [119]:
y_score_r = []
for i in range(1,len(group_interval_val)):
    y_score_i = preds_gbr[group_interval_val[i-1]:group_interval_val[i]].tolist()
    y_score_r.append(y_score_i)

In [120]:
eval_metric(y_val.values,preds_gbr,'NDCG',group_id=group_id_val.values)

[0.8843090888402785]

In [121]:
print('Тестовый NDCG(0): {}'.format(ndcg_score(y_true,y_score_r,0)))
print('Тестовый NDCG(1): {}'.format(ndcg_score(y_true,y_score_r,1))) #как стоковая метрика в катбусте
print('Тестовый NDCG(ignore): {}'.format(ndcg_score(y_true,y_score_r,'ignore')))

Тестовый NDCG(0): 0.11887699007484646
Тестовый NDCG(1): 0.8843090888402786
Тестовый NDCG(ignore): 0.5067913787401348


In [122]:
ndcg_score([y_val.values],[preds_gbr])

0.47048664643527544

---

In [125]:
f_imp = gbr.get_feature_importance(train_pool,type='FeatureImportance')

In [127]:
usless_f = list(X_train_norm.columns[f_imp == 0])
usless_f

['feature_1',
 'feature_2',
 'feature_5',
 'feature_9',
 'feature_12',
 'feature_14',
 'feature_15',
 'feature_18',
 'feature_23',
 'feature_36']

In [129]:
X_train_norm_ = X_train_norm.drop(usless_f,axis=1)
X_val_norm_ = X_val_norm.drop(usless_f,axis=1)

In [131]:
train_pool_ = Pool(
    data=X_train_norm_.values,
    label=y_train.values,
    group_id = group_id_train,
)

val_pool_ = Pool(
    data=X_val_norm_.values,
    label=y_val.values,
    group_id = group_id_val,
)

In [132]:
gbr = CatBoostRanker(**parameters)

In [None]:
gbr.fit(train_pool_, eval_set=val_pool_, plot=False)

In [134]:
gbr.shrink(25)

In [135]:
preds_gbr = gbr.predict(val_pool)

In [136]:
y_score_r = []
for i in range(1,len(group_interval_val)):
    y_score_i = preds_gbr[group_interval_val[i-1]:group_interval_val[i]].tolist()
    y_score_r.append(y_score_i)

In [137]:
print('Тестовый NDCG(0): {}'.format(ndcg_score(y_true,y_score_r,0)))
print('Тестовый NDCG(1): {}'.format(ndcg_score(y_true,y_score_r,1))) #как стоковая метрика в катбусте
print('Тестовый NDCG(ignore): {}'.format(ndcg_score(y_true,y_score_r,'ignore')))

Тестовый NDCG(0): 0.10667721200480482
Тестовый NDCG(1): 0.8721093107702369
Тестовый NDCG(ignore): 0.45478179854679945


In [138]:
ndcg_score([y_val.values],[preds_gbr])

0.465386880768941

стало хуже, так что удалять не будем

#### RandomForestClassifier

посчитать вот такой ndsg для дерева

In [215]:
import pickle
with open('../expirements/rfc.pkl','rb') as f:
    rfc = pickle.load(f)

In [216]:
preds_rfc = rfc.predict(X_val_norm)

In [217]:
y_score_t = []
for i in range(1,len(group_interval_val)):
    y_score_i = preds_rfc[group_interval_val[i-1]:group_interval_val[i]]
    y_score_t.append(y_score_i)

In [218]:
print('Тестовый NDCG(0): {}'.format(ndcg_score(y_true,y_score_t,0)))
print('Тестовый NDCG(1): {}'.format(ndcg_score(y_true,y_score_t,1)))
print('Тестовый NDCG(ignore): {}'.format(ndcg_score(y_true,y_score_t,'ignore')))

Тестовый NDCG(0): 0.14755352003028288
Тестовый NDCG(1): 0.912985618795715
Тестовый NDCG(ignore): 0.6290439538133114


In [219]:
ndcg_score([y_val.values],[preds_rfc])

0.6963087739918126

In [220]:
ndcg_score([y_val.values],[np.zeros_like(y_val.values)])

0.5031785974726114

In [221]:
y_zeros = []
for i in range(len(y_true)):
    group = [0 for j in range(len(y_true[i]))]
    y_zeros.append(group)

In [222]:
print('Тестовый NDCG(0): {}'.format(ndcg_score(y_true,y_zeros,0)))
print('Тестовый NDCG(1): {}'.format(ndcg_score(y_true,y_zeros,1)))
print('Тестовый NDCG(ignore): {}'.format(ndcg_score(y_true,y_zeros,'ignore')))

Тестовый NDCG(0): 0.11034904014237581
Тестовый NDCG(1): 0.875781138907808
Тестовый NDCG(ignore): 0.4704353816596022


### lgbm

In [139]:
from lightgbm import LGBMRanker

In [None]:
group_in_val = pd.DataFrame(group_id_val).groupby('search_id').agg({'search_id':'count'}).values.reshape(1,-1)[0].tolist()
group_in_train = pd.DataFrame(group_id_train).groupby('search_id').agg({'search_id':'count'}).values.reshape(1,-1)[0].tolist()

In [207]:
lgbm = LGBMRanker(class_weight = 'balanced',random_state=42,n_jobs = -1,n_estimators=150)

In [208]:
lgbm.fit(X_train_norm,y_train,group=group_in_train)

[LightGBM] [Info] Calculating query weights...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003862 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10717
[LightGBM] [Info] Number of data points in the train set: 11309, number of used features: 66


In [209]:
preds_lgbm = lgbm.predict(X_val_norm)

In [210]:
y_score_l = []
for i in range(1,len(group_interval_val)):
    y_score_i = preds_lgbm[group_interval_val[i-1]:group_interval_val[i]]
    y_score_l.append(y_score_i)

In [211]:
print('Тестовый NDCG(0): {}'.format(ndcg_score(y_true,y_score_l,0)))
print('Тестовый NDCG(1): {}'.format(ndcg_score(y_true,y_score_l,1)))
print('Тестовый NDCG(ignore): {}'.format(ndcg_score(y_true,y_score_l,'ignore')))

Тестовый NDCG(0): 0.11785787505170782
Тестовый NDCG(1): 0.8832899738171399
Тестовый NDCG(ignore): 0.5024467304835966


In [212]:
ndcg_score([y_val.values],[preds_lgbm])

0.4982023500464655

In [213]:
index_col = ['Constant_classifier','CatBoostClassifier','CatBoostRanker','LGBMRanker','RandomForestClassifier']

In [226]:
ndsg_0 = [ndcg_score(y_true,y_zeros,0),ndcg_score(y_true,y_score,0),ndcg_score(y_true,y_score_r,0),ndcg_score(y_true,y_score_l,0),ndcg_score(y_true,y_score_t,0)]
ndsg_1 = [ndcg_score(y_true,y_zeros,1),ndcg_score(y_true,y_score,1),ndcg_score(y_true,y_score_r,1),ndcg_score(y_true,y_score_l,1),ndcg_score(y_true,y_score_t,1)]
ndsg_ignore = [ndcg_score(y_true,y_zeros,'ignore'),ndcg_score(y_true,y_score,'ignore'),ndcg_score(y_true,y_score_r,'ignore'),ndcg_score(y_true,y_score_l,'ignore'),ndcg_score(y_true,y_score_t,'ignore')]
ndsg_all = [ndcg_score([y_val.values],[np.zeros_like(y_val.values)]),ndcg_score([y_val.values],[preds_gbc]),ndcg_score([y_val.values],[preds_gbr]),ndcg_score([y_val.values],[preds_lgbm]),ndcg_score([y_val.values],[preds_rfc])]

In [228]:
results = pd.DataFrame(np.array([ndsg_0,ndsg_1,ndsg_ignore,ndsg_all]).T,columns=['NDCG(0)','NDCG(1)','NDCG(ignore)','NDCG(all)'],index=index_col)
results.to_markdown('../expirements/val_res.md')

In [229]:
results

Unnamed: 0,NDCG(0),NDCG(1),NDCG(ignore),NDCG(all)
Constant_classifier,0.110349,0.875781,0.470435,0.503179
CatBoostClassifier,0.110349,0.875781,0.470435,0.471577
CatBoostRanker,0.106677,0.872109,0.454782,0.465387
LGBMRanker,0.117858,0.88329,0.502447,0.498202
RandomForestClassifier,0.147554,0.912986,0.629044,0.696309


### Таким образом лучший ранкер, который прям ранжирует, как и просят в задании - LGBMRanker, а лучшая модель по метрикам - RandomForestClassifier, обученный с использованием синтетических данных