In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import joblib

import json
import tqdm

import glob

import numba
import dask
import xgboost
from dask.diagnostics import ProgressBar
import re
# Register a dask ProgressBar globally for the rest of the session.
ProgressBar().register()
# Load the two validation folds; further down they are passed to
# `train['seq_index'].isin(...)`, so they are presumably collections of
# seq_index values -- TODO confirm.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code; only load fold files that come from a trusted source.
fold1, fold2 = joblib.load("./valid/fold1.pkl.z"), joblib.load("./valid/fold2.pkl.z")

In [None]:
# Read the three input tables in one pass: the training parquet plus the melted
# train/test tables (one row per seq_index / event_info pair, judging by the
# columns selected from them further down).
train, train_melt, test_melt = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "./data/train.parquet",
        "./data/22c_train_melt_with_features.parquet",
        "./data/22c_test_melt_with_features.parquet",
    )
)

In [None]:
# Preview the first rows of the melted test table.
test_melt.head()

In [None]:
item_data = pd.read_parquet("./data/item_data.parquet")


def _item_lookup(items, column):
    """Build a plain ``{item_id: value}`` dict for one column of ``items``.

    Duplicate (item_id, value) pairs are dropped first. The column is then
    selected by name after indexing on ``item_id``, which always yields a
    Series; ``DataFrame.squeeze()`` would instead collapse a one-row table
    to a scalar that has no ``to_dict``.

    Parameters
    ----------
    items : pandas.DataFrame
        Table containing an ``item_id`` column and ``column``.
    column : str
        Name of the column holding the values of the mapping.

    Returns
    -------
    dict
        Mapping from item_id to the value in ``column``.
    """
    pairs = items[["item_id", column]].drop_duplicates()
    return pairs.set_index("item_id")[column].to_dict()


# item_id -> title / price / domain_id lookups.
item_title_map = _item_lookup(item_data, "title")
item_price_map = _item_lookup(item_data, "price")
item_domain_map = _item_lookup(item_data, "domain_id")

# Kept as the last expression so the notebook actually renders the preview.
item_data.head()

# Stack generation: merge the first-level model predictions into train/test frames

In [None]:
%%time
# Ten fallback item ids used to fill up short recommendation lists, together
# with the domain of each one (looked up in item_domain_map).
best_sellers = [
    1587422, 1803710, 10243, 548905, 1906937,
    716822, 1361154, 1716388, 725371, 859574,
]
best_sellers_domain = list(map(item_domain_map.__getitem__, best_sellers))

# log(1 + rank) for ranks 1..10.
log_pos = np.log1p(np.arange(1, 11))

def pad(lst, fill=None, size=10):
    """Pad a list of recommended item ids up to ``size`` entries.

    Parameters
    ----------
    lst : sequence
        Item ids already recommended (may be empty). It is never modified.
    fill : sequence, optional
        Ids used to fill the missing slots, taken from the front. Defaults to
        the module-level ``best_sellers`` list.
    size : int, optional
        Target length of the result (default 10).

    Returns
    -------
    numpy.ndarray
        ``lst`` followed by as many leading ``fill`` entries as needed to reach
        ``size``. Lists that already have ``size`` or more entries are returned
        unchanged (as an array). The result is always an ndarray, including
        for empty input.
    """
    if fill is None:
        fill = best_sellers
    # Work on a copy so the caller's list is not extended in place.
    items = list(lst)
    missing = size - len(items)
    if missing > 0:
        items.extend(fill[:missing])
    return np.array(items)

def pad_str(lst, fill=None, size=10):
    """Pad a list of recommended domains up to ``size`` entries.

    Parameters
    ----------
    lst : sequence
        Domains of the items already recommended (may be empty). It is never
        modified.
    fill : sequence, optional
        Domains used to fill the missing slots, taken from the front. Defaults
        to the module-level ``best_sellers_domain`` list.
    size : int, optional
        Target length of the result (default 10).

    Returns
    -------
    list
        A new list: ``lst`` followed by as many leading ``fill`` entries as
        needed to reach ``size``. The global fallback list itself is never
        handed out, so callers cannot corrupt it by mutating the result.
    """
    if fill is None:
        fill = best_sellers_domain
    domains = list(lst)
    missing = size - len(domains)
    if missing > 0:
        domains.extend(fill[:missing])
    return domains

def ndcg_vec(ytrue, ypred, ytrue_domain, ypred_domain):
    """Mean NDCG of ranked recommendations against the bought item.

    Relevance per slot:
      * 12 for the first slot that recommends the bought item exactly,
      * 1 for any other slot whose domain equals the bought item's domain
        (including repeats of the bought item),
      * 0 otherwise.

    An exact hit therefore scores 12, not 12 + 1, and is counted once. This
    matches the ideal list assumed by the normaliser (one 12 followed by 1s),
    so the score cannot exceed 1.

    Parameters
    ----------
    ytrue : array-like, shape (n,)
        Bought item id per session.
    ypred : array-like, shape (n, k)
        Recommended item ids per session, best first.
    ytrue_domain : array-like, shape (n,)
        Domain of the bought item per session.
    ypred_domain : array-like, shape (n, k)
        Domain of each recommended item.

    Returns
    -------
    float
        Average of DCG / iDCG over the n sessions.
    """
    ytrue = np.asarray(ytrue)
    ypred = np.asarray(ypred)
    ytrue_domain = np.asarray(ytrue_domain)
    ypred_domain = np.asarray(ypred_domain)

    n_slots = ypred.shape[1]
    # log(1 + rank) discount; computed here so the width follows ypred.
    discount = np.log1p(np.arange(1, n_slots + 1))

    item_hit = (ypred == ytrue[:, None]).astype(bool)
    domain_hit = (ypred_domain == ytrue_domain[:, None]).astype(bool)
    # Only the first exact hit in a row earns the item-level relevance.
    first_item_hit = item_hit & (np.cumsum(item_hit, axis=1) == 1)

    relevance = np.where(first_item_hit, 12.0, domain_hit.astype(float))
    dcg = (relevance / discount).sum(axis=1)

    ideal_relevance = np.ones(n_slots)
    ideal_relevance[0] = 12.0
    idcg = (ideal_relevance / discount).sum()

    return (dcg / idcg).mean()

In [None]:
%%time
def _stack_model_name(path):
    """Return the model name embedded in a stack file path.

    The name is the token that starts with a digit, directly after a ``/`` and
    before an underscore (e.g. ``./stack_2f/22c_train.parquet`` -> ``22c``).

    Raises
    ------
    ValueError
        If ``path`` does not contain such a token.
    """
    # Raw string: "\d" / "\w" in a normal literal are invalid escape sequences.
    match = re.search(r'/(\d[\d\w]+)_', path)
    if match is None:
        raise ValueError("cannot derive a model name from stack file {!r}".format(path))
    return match.group(1)


def _merge_stack_predictions(base, paths):
    """Join the ``p`` column of every file in ``paths`` onto ``base``.

    Each file is read, its ``p`` column is renamed to the model name taken
    from the file name, and the result is merged (pandas default: inner join)
    on ``seq_index`` / ``event_info``. The merged frame is returned sorted by
    ``seq_index``.
    """
    merged = base
    for path in paths:
        model_preds = pd.read_parquet(path).rename(columns={"p": _stack_model_name(path)})
        merged = pd.merge(merged, model_preds, on=['seq_index', 'event_info'])
    return merged.sort_values("seq_index")


# glob gives no ordering guarantee; sort so the merged column order is reproducible.
tr_list = sorted(glob.glob("./stack_2f/*_train.parquet"))
ts_list = sorted(glob.glob("./stack_2f/*_test.parquet"))

# NOTE: this replaces the `train` frame read from ./data/train.parquet earlier.
train = _merge_stack_predictions(
    train_melt[['seq_index','event_info','has_bought', 'item_domain', 'bought_domain', 'bought_id', 'y_rank']].copy(),
    tr_list,
)

test = _merge_stack_predictions(test_melt[['seq_index','event_info']].copy(), ts_list)

In [None]:
# Preview the merged training frame (base columns plus one column per stack file).
train.head()

In [None]:
# Preview the merged test frame.
test.head()

In [None]:
# List the columns of the merged training frame.
train.columns

In [None]:
from sklearn.model_selection import GroupKFold
from cuml.preprocessing import TargetEncoder


stack_p = []

# Stacker inputs and booster settings are the same for both folds.
features = ['22c', '26']
params = [0.1, 3, 1, 0.5, 1.]
learning_rate, max_depth, min_child_weight, subsample, colsample_bytree = params

# Two-fold validation: fit on one fold, score on the other, then swap roles.
for f1, f2 in ((fold1, fold2), (fold2, fold1)):
    Xtr = train[train['seq_index'].isin(f1)]
    Xval = train[train['seq_index'].isin(f2)]

    Xtrr, ytr = Xtr[features], Xtr['y_rank']
    Xvall = Xval[features]

    # Number of candidate rows per session, in seq_index order (the ranker's groups).
    groups = Xtr.groupby('seq_index').size().values

    mdl = xgboost.XGBRanker(
        objective='rank:pairwise',
        n_estimators=100,
        num_parallel_tree=5,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        tree_method='gpu_hist',
        gpu_id=0,
        seed=0,
    )
    mdl.fit(Xtrr, ytr, group=groups)

    p = mdl.predict(Xvall)

    # Attach the scores, put the best-scored rows first and keep a single row
    # per (session, item) pair.
    preds = Xval[['seq_index', 'has_bought', 'item_domain', 'bought_domain', 'event_info', 'bought_id']].assign(p=p)
    preds = preds.sort_values('p', ascending=False).drop_duplicates(subset=['seq_index', 'event_info'])

    by_session = preds.groupby("seq_index")

    # Target item and domain: first row of each session.
    ytrue = by_session['bought_id'].apply(lambda x: x.iloc[0]).values
    ytrue_domain = by_session['bought_domain'].apply(lambda x: x.iloc[0]).values

    # Top-10 items per session and their domains, padded to length 10.
    ypred = by_session['event_info'].apply(lambda x: pad(x.iloc[:10].tolist()))
    ypred = np.array(ypred.tolist())

    ypred_domain = by_session['item_domain'].apply(lambda x: pad_str(x.iloc[:10].tolist()))
    ypred_domain = np.array(ypred_domain.tolist())

    print(ndcg_vec(ytrue, ypred, ytrue_domain, ypred_domain))

# Test: refit on the full training set and predict the test sessions

In [None]:
# Refit the ranker on every training session, reusing `params` and `features`
# from the validation cell.
learning_rate, max_depth, min_child_weight, subsample, colsample_bytree = params
groups = train.groupby('seq_index').size().values
mdl = xgboost.XGBRanker(
    objective='rank:pairwise',
    n_estimators=100,
    num_parallel_tree=5,
    learning_rate=learning_rate,
    max_depth=max_depth,
    min_child_weight=min_child_weight,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    tree_method='gpu_hist',
    gpu_id=0,
    seed=0,
)
mdl.fit(train[features], train['y_rank'], group=groups)

In [None]:
# Preview the stacker input columns of the test frame.
test[features].head()

In [None]:
# Score every test row with the most recently fitted `mdl`.
p = mdl.predict(test[features])

In [None]:
# Attach the scores to the (session, item) keys, put the best-scored rows first
# and keep a single row per (session, item) pair.
preds = (
    test[['seq_index', 'event_info']]
    .assign(p=p)
    .sort_values('p', ascending=False)
    .drop_duplicates(subset=['seq_index', 'event_info'])
)

In [None]:
def pad(lst):
    """Pad a session's recommendations to 10 entries by repeating its first item.

    NOTE: an earlier cell of this notebook defines a different ``pad`` (it fills
    with best sellers); running this cell replaces that definition.

    Parameters
    ----------
    lst : list
        Recommended item ids, best first (may be empty). It is never modified.

    Returns
    -------
    numpy.ndarray
        The fallback candidates when ``lst`` is empty; otherwise ``lst``
        followed by copies of ``lst[0]`` until it has 10 entries. Lists with 10
        or more entries are returned unchanged (as an array).
    """
    pad_candidates = [1587422, 1803710,   10243,  548905, 1906937,  716822, 1361154, 1716388,  725371,  859574]
    if len(lst) == 0:
        # Same return type as the non-empty case.
        return np.array(pad_candidates)
    missing = max(10 - len(lst), 0)
    # Build a new list instead of extending the caller's list in place.
    return np.array(list(lst) + [lst[0]] * missing)
# Top-10 items per test session (preds is already ordered by descending score),
# padded to length 10, plus the session ids they belong to.
padded = preds.groupby("seq_index")['event_info'].apply(lambda x: pad(x.iloc[:10].tolist()))
seq_index = padded.index
ypred = np.array(padded.tolist())


In [None]:
# Number of rows in the submission (one per test session index).
N_TEST_SESSIONS = 177070
# Recommendations used for sessions that have no candidate rows at all.
fallback_items = np.array([1587422, 1803710,   10243,  548905, 1906937,  716822, 1361154, 1716388,  725371,  859574])

# Start every row from the fallback list so sessions without candidates do not
# end up as ten copies of item id 0; then overwrite the rows we have
# predictions for.
ypred_final = np.tile(fallback_items, (N_TEST_SESSIONS, 1))
ypred_final[seq_index, :] = ypred
# Session indices that received the fallback (kept for inspection).
no_views = np.setdiff1d(np.arange(N_TEST_SESSIONS), seq_index)
ypred_final = ypred_final.astype(int)

In [None]:
# Repeated products are allowed within a row.
# Write the submission: one row per session, ten item ids, no index or header.
pd.DataFrame(ypred_final).to_csv("./subs/27.csv", index=False, header=False)

In [None]:
# Largest session index in the test frame (compare with the 177070 rows
# allocated for the submission matrix).
test['seq_index'].max()

In [None]:
!wc -l ./subs/27.csv

In [None]:
!head ./subs/27.csv