In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.metrics import ndcg_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import lightgbm as lgb

In [13]:
import pickle

# Feature columns passed to `pre_ranker_model.predict` in the pre-ranking cell.
pre_study_cols = [
    'cnt_users_by_item',
    'mean_time_by_item',
    'mean_good_by_item',
    'mean_abs_react_by_item',
    'pretarget_time_sum_5m',
    'pretarget_time_sum_1m',
    'pretarget_good_sum_5m',
    'pretarget_good_sum_1m',
    'pretarget_prc',
    'cnt_items',
    'time_sum',
    'good_mean',
    'good_sum',
    'reaction_mean',
    'reaction_abs_mean',
    'reaction_abs_sum',
    'als_score',
    'emb_als_score',
    'emb_als_score_tune',
    'cosine',
    'source_good_mean',
    'source_good_sum',
]

# Feature columns passed to `full_ranker_model.predict`. They are the same
# names in the same order as `pre_study_cols`; an independent copy is kept so
# the two lists can be edited separately.
study_cols = list(pre_study_cols)

# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so these two pickles must come from a trusted source.
with open('pre_ranker_final.pickle', 'rb') as model_file:
    pre_ranker_model = pickle.load(model_file)

with open('ranker_final.pickle', 'rb') as model_file:
    full_ranker_model = pickle.load(model_file)

In [14]:
import gc

# Collects, per candidate file, each user's 200 best rows by pre-ranker score.
result_lst = []
for i in tqdm([9]):
    # Each source is (parquet path, shift added to user_id or None).
    # Rows from the "old" file get their user_id moved up by 3,000,000;
    # rows from the "train" file keep their ids unchanged.
    candidate_sources = (
        (f'/srv/data/vk/old/result_df_{i}.parquet.gzip', 3000000),
        (f'/srv/data/vk/train/result_df_{i}.parquet.gzip', None),
    )
    for source_path, user_id_shift in candidate_sources:
        tmp_result_df = pd.read_parquet(source_path)
        if user_id_shift is not None:
            tmp_result_df['user_id'] = tmp_result_df['user_id'] + user_id_shift

        # Sort by user so the per-user row counts below line up with row order.
        # (A cap of `timespent` at 10 used to be applied here and is disabled.)
        tmp_result_df = tmp_result_df.sort_values('user_id').reset_index(drop=True)
        group_pretrain = tmp_result_df.groupby('user_id').size().values

        tmp_result_df['rank'] = pre_ranker_model.predict(
            tmp_result_df[pre_study_cols], group=group_pretrain
        )

        # Keep the 200 highest-scored rows of every user.
        tmp_result_df = (
            tmp_result_df.sort_values(['rank'], ascending = False)
            .groupby('user_id')
            .head(200)
        )
        result_lst.append(tmp_result_df)

    gc.collect()

100%|██████████| 1/1 [00:46<00:00, 46.85s/it]


In [15]:
# Stack the per-source candidate frames into one frame with a fresh 0..n-1 index.
result_df = pd.concat(result_lst, ignore_index=True)

In [16]:
# Order rows by user, then count rows per user. groupby returns the counts in
# ascending user_id order, which matches the row order of the sorted frame.
result_df = result_df.sort_values('user_id').reset_index(drop=True)
group_train = result_df.groupby('user_id').size().values

In [17]:
# The per-source frames were already concatenated into `result_df`, so drop the
# list holding them and trigger a garbage-collection pass.
del result_lst
gc.collect()

0

In [18]:
%%time

# Overwrite the pre-ranker score in 'rank' with the full ranker's prediction.
# `group_train` holds the per-user row counts of the user_id-sorted `result_df`.
result_df['rank'] = full_ranker_model.predict(result_df[study_cols], group=group_train)

CPU times: user 9min 36s, sys: 857 ms, total: 9min 36s
Wall time: 38.9 s


In [19]:
full_df = pd.read_parquet('train.parquet.gzip')

# Two consecutive 5,000,000-row windows taken from the end of the log:
# the last 5M rows are the current targets, the 5M rows before them the "old" targets.
target_old_df = full_df.iloc[-10000000:-5000000].reset_index(drop=True)
target_df = full_df.iloc[-5000000:].reset_index(drop=True)

# Apply the same +3,000,000 user_id shift that the "old" candidate file receives,
# presumably so the same user in the two windows is evaluated as two separate users.
target_old_df['user_id'] = target_old_df['user_id'] + 3000000

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the resulting frame is the same.
target_df = pd.concat([target_df, target_old_df]).reset_index(drop=True)

In [20]:
def calc_metric(result_df, target_df, rank_col = 'rank', top_k = 20):
    """Return the mean per-user NDCG@top_k of the ranked candidates.

    Parameters
    ----------
    result_df : DataFrame
        Candidate rows; must contain 'user_id', 'item_id' and `rank_col`.
        A higher `rank_col` value means the item is ranked earlier.
    target_df : DataFrame
        Ground-truth rows; must contain 'user_id', 'item_id' and 'timespent'.
        Only rows with timespent > 0 whose user appears in `result_df` are used,
        and 'timespent' is the relevance value.
    rank_col : str
        Name of the score column in `result_df`.
    top_k : int
        Number of top-ranked items kept per user and the NDCG cut-off.

    Returns
    -------
    float
        Mean NDCG over all evaluated users, or NaN when no user can be evaluated.
    """
    # Top-k predicted items per user, best first.
    user_predict_dct = (
        result_df.sort_values(rank_col, ascending = False)
        .groupby('user_id').head(top_k)
        .groupby('user_id')['item_id'].apply(list).to_dict()
    )

    valid_mask = target_df['user_id'].isin(result_df.user_id) & (target_df['timespent'] > 0)
    clean_valid_df = target_df[valid_mask].reset_index(drop=True)

    # Synthetic descending scores for the k predicted slots: top_k, top_k-1, ..., 1.
    top_k_pred_lst = list(range(top_k, 0, -1))
    ndcg_score_lst = []

    # One groupby pass instead of re-filtering the whole frame for every user;
    # sort=False keeps users in order of first appearance.
    for user_id, user_rows in clean_valid_df.groupby('user_id', sort = False):
        pred_lst = user_predict_dct[user_id]
        # item_id -> timespent; for a repeated item the last row wins.
        target_dct = dict(zip(user_rows['item_id'], user_rows['timespent']))

        # Relevance of each predicted item (0 when it is not a target),
        # padded with zeros when fewer than top_k items were predicted.
        user_target_lst = [target_dct.get(item_id, 0) for item_id in pred_lst]
        user_target_lst += [0] * (top_k - len(user_target_lst))

        # Targets the model missed are appended with score 0 so they still
        # contribute to the ideal DCG.
        missed_targets = [v for k, v in target_dct.items() if k not in pred_lst]
        user_pred_lst = top_k_pred_lst + [0] * len(missed_targets)
        user_target_lst += missed_targets

        ndcg_score_lst.append(ndcg_score([user_target_lst], [user_pred_lst], k = top_k))

    if not ndcg_score_lst:
        # No user to evaluate: return NaN explicitly instead of np.mean([]).
        return np.nan
    return np.mean(ndcg_score_lst)

In [21]:
# Score the full ranker's 'rank' column against the combined targets;
# the bare name on the last line displays the value as the cell output.
basic_valid_score = calc_metric(result_df, target_df, rank_col = 'rank')
basic_valid_score

0.18175859727921806