In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.metrics import ndcg_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import lightgbm as lgb

In [6]:
# Test-user frame; its `user_id` column is what the final cell looks
# predictions up by, and the predictions are written back onto this frame.
test_user_df = pd.read_parquet(path='test.parquet.gzip')

In [2]:
import pickle


def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``."""
    # NOTE(review): unpickling can execute arbitrary code contained in the
    # file; only load model files that come from a trusted source.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Model used for the first scoring pass over each candidate chunk.
pre_ranker_model = _load_pickle('pre_ranker_final.pickle')
# Model used for the second scoring pass over the rows kept by the first one.
full_ranker_model = _load_pickle('ranker_final.pickle')

# Feature columns handed to both models' predict() calls.
study_cols = [
    'cnt_users_by_item',
    'mean_time_by_item',
    'mean_good_by_item',
    'mean_abs_react_by_item',
    'pretarget_time_sum_5m',
    'pretarget_time_sum_1m',
    'pretarget_good_sum_5m',
    'pretarget_good_sum_1m',
    'pretarget_prc',
    'cnt_items',
    'time_sum',
    'good_mean',
    'good_sum',
    'reaction_mean',
    'reaction_abs_mean',
    'reaction_abs_sum',
    'als_score',
    'emb_als_score',
    'emb_als_score_tune',
    'cosine',
    'source_good_mean',
    'source_good_sum',
]

In [4]:
import gc

# Collects, for every candidate chunk, the rows that survive both ranking
# passes; the list is concatenated into a single frame in a later cell.
full_result_df_lst = []
for i in tqdm(range(10)):
    chunk_df = pd.read_parquet(f'/srv/data/vk/test/result_df_{i}.parquet.gzip')

    # Pass 1: order rows by user so each user's rows are contiguous and match
    # the per-user group sizes (groupby yields them in sorted user_id order).
    chunk_df = chunk_df.sort_values('user_id').reset_index(drop=True)
    pre_group_sizes = chunk_df.groupby('user_id').size().values
    chunk_df['rank'] = pre_ranker_model.predict(chunk_df[study_cols], group=pre_group_sizes)
    # Keep the 200 highest-scored rows of every user.
    chunk_df = chunk_df.sort_values('rank', ascending=False)
    chunk_df = chunk_df.groupby('user_id').head(200)

    # Pass 2: same preparation on the trimmed rows, scored by the full ranker.
    chunk_df = chunk_df.sort_values('user_id').reset_index(drop=True)
    full_group_sizes = chunk_df.groupby('user_id').size().values
    chunk_df['full_rank'] = full_ranker_model.predict(chunk_df[study_cols], group=full_group_sizes)

    # Keep the 20 highest-scored rows of every user and only the columns
    # needed downstream.
    chunk_df = chunk_df.sort_values('full_rank', ascending=False)
    chunk_df = chunk_df.groupby('user_id').head(20)
    full_result_df_lst.append(chunk_df[['user_id', 'item_id', 'full_rank']])

100%|██████████| 10/10 [03:41<00:00, 22.14s/it]


In [10]:
# Stack the per-chunk results into one frame with a fresh 0..n-1 index.
result_df = pd.concat(full_result_df_lst, ignore_index=True)

# The per-chunk list is not needed any more: drop the reference and run a
# garbage-collection pass (its return value is this cell's displayed output).
# Note: because the list is deleted, this cell cannot be re-run on its own.
del full_result_df_lst
gc.collect()

1

In [11]:
import os

# For every user keep the 20 rows with the highest `full_rank` across all
# chunks, then collapse them into {user_id: [item_id, ...]} with the items
# ordered from highest to lowest score.
top_rows = (
    result_df
    .sort_values('full_rank', ascending=False)
    .groupby('user_id')
    .head(20)
)
user_predict_dct = top_rows.groupby('user_id')['item_id'].apply(list).to_dict()

# One prediction list per row of test_user_df, in the frame's own row order.
# A test user that is absent from result_df raises KeyError here instead of
# silently receiving an empty prediction list.
prediction_lst = [
    user_predict_dct[user_id] for user_id in tqdm(test_user_df.user_id.values)
]

test_user_df['predictions'] = prediction_lst

# to_parquet does not create missing parent directories, so make sure the
# output folder exists; otherwise the write fails on a fresh checkout after
# all of the ranking work has already been done.
output_path = 'result/clean_predict.parquet.gzip'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
test_user_df.to_parquet(output_path, compression='gzip')

100%|██████████| 200000/200000 [00:00<00:00, 2562478.96it/s]
