In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.metrics import ndcg_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import lightgbm as lgb

In [2]:
# Final LightGBM ranker; it is fitted further down in this notebook on the
# candidates shortlisted by the pre-ranker.
# NOTE(review): the LightGBM parameter docs state that row subsampling is only
# active when subsample_freq > 0. It is left at its default here, so `subsample`
# may have no effect — confirm this is intended before changing it.
ranker_full_model = lgb.LGBMRanker(
    # boosting schedule / runtime
    n_estimators=200,
    learning_rate=0.1,
    random_state=33,
    n_jobs=8,
    # tree shape and split constraints
    num_leaves=256,
    max_depth=48,
    min_child_samples=1500,
    min_child_weight=0.00415,
    min_split_gain=0.0279,
    # column / row sampling
    colsample_bytree=0.844,
    subsample=0.2429,
    # L1 / L2 regularisation
    reg_alpha=0.3605,
    reg_lambda=0.4198,
)


# Feature columns passed to the pre-ranker's predict() and to the final
# ranker's fit(). The order is kept exactly as originally listed.
study_cols = [
    # *_by_item columns
    'cnt_users_by_item',
    'mean_time_by_item',
    'mean_good_by_item',
    'mean_abs_react_by_item',
    # pretarget_* columns
    'pretarget_time_sum_5m',
    'pretarget_time_sum_1m',
    'pretarget_good_sum_5m',
    'pretarget_good_sum_1m',
    'pretarget_prc',
    # count / time / good / reaction aggregates
    'cnt_items',
    'time_sum',
    'good_mean',
    'good_sum',
    'reaction_mean',
    'reaction_abs_mean',
    'reaction_abs_sum',
    # model scores and similarity
    'als_score',
    'emb_als_score',
    'emb_als_score_tune',
    'cosine',
    # source_* columns
    'source_good_mean',
    'source_good_sum',
]



In [4]:
import pickle

# Load the pickled pre-ranker model from the working directory.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load artifacts from a trusted source.
with open('pre_ranker_final.pickle', 'rb') as model_file:
    pre_ranker_model = pickle.load(model_file)

In [5]:
import gc

# Settings shared by both data sources (previously inline magic numbers).
SHARD_IDS = [2, 3, 4, 5, 6, 7, 8]   # values substituted into result_df_{i}.parquet.gzip
OLD_USER_ID_OFFSET = 3000000        # added to user_id in the "old" shards only
TIMESPENT_CAP = 10                  # upper bound applied to the 'timespent' column
TOP_K_PER_USER = 200                # rows kept per user after pre-ranking


def _shortlist_candidates(path, user_id_offset=0):
    """Load one parquet shard and keep each user's top pre-ranked rows.

    Steps, in order: read the shard, optionally shift ``user_id`` by
    ``user_id_offset``, sort by ``user_id``, cap ``timespent`` at
    ``TIMESPENT_CAP``, score every row with ``pre_ranker_model`` on
    ``study_cols`` (stored in a new ``rank`` column), and keep the
    ``TOP_K_PER_USER`` highest-scored rows of each user.

    Parameters
    ----------
    path : str
        Location of the parquet shard to read.
    user_id_offset : int, default 0
        Constant added to ``user_id`` before any other processing.
        Presumably used so that users from the "old" data cannot collide
        with ids in the "train" data — TODO confirm.

    Returns
    -------
    pandas.DataFrame
        The shortlisted rows, ordered by descending ``rank``; the original
        columns plus ``rank``.
    """
    candidates = pd.read_parquet(path)
    if user_id_offset:
        candidates['user_id'] = candidates['user_id'] + user_id_offset

    # Rows of one user must be contiguous so that the per-user sizes below
    # line up with the row order handed to the model.
    candidates = candidates.sort_values('user_id').reset_index(drop=True)
    candidates['timespent'] = candidates['timespent'].clip(upper=TIMESPENT_CAP)

    # groupby sorts its keys, so the sizes follow the same ascending user_id order.
    group_sizes = candidates.groupby('user_id').size().values
    # NOTE(review): `group` is forwarded unchanged from the original code;
    # verify that the loaded model's predict() actually uses it.
    candidates['rank'] = pre_ranker_model.predict(candidates[study_cols], group=group_sizes)

    return (candidates
            .sort_values(['rank'], ascending=False)
            .groupby('user_id')
            .head(TOP_K_PER_USER))


result_lst = []
for i in tqdm(SHARD_IDS):
    # Same order as before: the "old" shard first, then the "train" shard.
    result_lst.append(_shortlist_candidates(
        f'/srv/data/vk/old/result_df_{i}.parquet.gzip',
        user_id_offset=OLD_USER_ID_OFFSET))
    result_lst.append(_shortlist_candidates(
        f'/srv/data/vk/train/result_df_{i}.parquet.gzip'))

    # The full, unfiltered shard frames are unreferenced by now.
    gc.collect()

100%|██████████| 7/7 [05:46<00:00, 49.57s/it]


In [6]:
# Stack all shortlisted shard frames into one frame with a fresh 0..n-1 index.
result_df = pd.concat(result_lst, ignore_index=True)

In [7]:
# Make each user's rows contiguous, then renumber the index.
result_df = result_df.sort_values('user_id')
result_df = result_df.reset_index(drop=True)

# Number of rows per user, in ascending user_id order (groupby sorts its keys),
# which matches the row order produced by the sort above.
group_train = result_df.groupby('user_id').size().values

In [8]:
# The per-shard frames have already been concatenated into result_df, so the
# list is dropped and a collection pass is run — presumably to free memory
# before fitting. The integer returned by gc.collect() is this cell's output.
del result_lst
gc.collect()

0

In [9]:
%%time

ranker_full_model.fit(result_df[study_cols], 
          result_df['timespent'], 
          group=group_train
         )

CPU times: user 2h 8min 38s, sys: 9.41 s, total: 2h 8min 47s
Wall time: 16min 47s


LGBMRanker(colsample_bytree=0.844, max_depth=48, min_child_samples=1500,
           min_child_weight=0.00415, min_split_gain=0.0279, n_estimators=200,
           n_jobs=8, num_leaves=256, random_state=33, reg_alpha=0.3605,
           reg_lambda=0.4198, subsample=0.2429)

In [11]:

# Serialise the fitted final ranker to the working directory.
with open('ranker_final.pickle', 'wb') as model_file:
    pickle.dump(ranker_full_model, model_file)