In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import joblib

import json
import tqdm

import numba
import dask
import xgboost
from dask.diagnostics import ProgressBar
ProgressBar().register()

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import StandardScaler
# Pre-computed fold membership (collections of seq_index values) used for
# the two-fold cross-validation further down.
fold1 = joblib.load("./valid/fold1.pkl.z")
fold2 = joblib.load("./valid/fold2.pkl.z")

In [None]:
# Raw training table, plus the "melted" train/test feature tables that the
# rest of the notebook trains and predicts on.
train = pd.read_parquet("./data/train.parquet")
train_melt, test_melt = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "./data/22c_train_melt_with_features.parquet",
        "./data/22c_test_melt_with_features.parquet",
    )
)

In [None]:
item_data = pd.read_parquet("./data/item_data.parquet")

# item_id -> title lookup.
item_title_map = (
    item_data[['item_id', 'title']]
    .drop_duplicates()
    .set_index("item_id")
    .squeeze()
    .to_dict()
)

# item_id -> price lookup.
item_price_map = (
    item_data[['item_id', 'price']]
    .drop_duplicates()
    .set_index("item_id")
    .squeeze()
    .to_dict()
)

# item_id -> domain_id lookup (used by the NDCG evaluation helpers).
item_domain_map = (
    item_data[['item_id', 'domain_id']]
    .drop_duplicates()
    .set_index("item_id")
    .squeeze()
    .to_dict()
)

In [None]:
# Precomputed item embedding matrix. Rows are consumed positionally: row i is
# paired with the i-th row of item_data when item_emb_map is built.
# Presumably 1024 columns (downstream buffers/layers hard-code 1024) - TODO confirm.
embs_np = joblib.load("22a_embs_np.pkl.z")

In [None]:
# item_id -> embedding row; relies on embs_np rows being in the same order as
# the rows of item_data.
item_emb_map = {
    item_id: embs_np[row]
    for row, item_id in enumerate(item_data['item_id'].values)
}

In [None]:
# For every session (seq_index): the distinct event_info values among rows
# flagged viewed == 1.
was_viewed = train_melt['viewed'] == 1
bags = (
    train_melt.loc[was_viewed]
    .groupby("seq_index", dropna=False)['event_info']
    .apply(lambda items: items.unique())
)

In [None]:
# One 1024-wide row per seq_index holding the mean embedding of the items in
# that session's bag; sessions without a bag keep an all-zero row.
n_sequences = train_melt['seq_index'].max() + 1
bags_mean = np.zeros((n_sequences, 1024), dtype=np.float32)
for seq_ix in tqdm.tqdm(bags.index):
    if seq_ix not in bags:
        continue
    viewed_embs = [item_emb_map[item] for item in bags[seq_ix]]
    bags_mean[seq_ix, :] = np.mean(viewed_embs, axis=0)

In [None]:
class BagOfEmbeds(torch.utils.data.Dataset):
    """Dataset yielding the (truncated) stack of embeddings for one bag.

    Parameters
    ----------
    bags : container supporting ``in``, ``len`` and ``[]``
        Maps an index to the sequence of embedding rows for that index.
    max_seq : int, default 3
        Maximum number of bag entries returned per item.
    """

    def __init__(self, bags, max_seq=3):
        super().__init__()
        self.bags = bags
        self.max_seq = max_seq

    def __len__(self):
        """Number of bags."""
        return len(self.bags)

    def __getitem__(self, index):
        """Return the embeddings for ``index``, or a single zero row if absent."""
        if index not in self.bags:
            return torch.zeros((1, 1024))
        # NOTE(review): bag values are used directly as row positions into
        # embs_np, whereas the rest of the notebook goes through item_emb_map
        # - confirm the bag entries really are row indices.
        rows = self.bags[index][:self.max_seq]
        return torch.Tensor(embs_np[rows])

class Dataset(torch.utils.data.Dataset):
    """Row-wise dataset of (session embedding, candidate embedding, numeric features, target).

    Parameters
    ----------
    seq_ixs : pandas object with ``.values``
        Session index per row; used to pick a row of the global ``bags_mean``.
    candidates : pandas object with ``.values``
        Candidate item id per row; looked up in the global ``item_emb_map``.
    nums : indexable
        Numeric feature rows, indexed positionally.
    y : pandas object with ``.values``
        Target per row; returned as float32.
    """

    def __init__(self, seq_ixs, candidates, nums, y):
        super().__init__()
        self.seq_ixs = seq_ixs.values
        self.candidates = candidates.values
        self.nums = nums
        self.y = y.values

    def __len__(self):
        """Number of rows."""
        return len(self.seq_ixs)

    def __getitem__(self, index):
        """Return ``(x, c, n, y)`` for the row at position ``index``."""
        session_emb = bags_mean[self.seq_ixs[index]]
        candidate_emb = item_emb_map[self.candidates[index]]
        numeric = self.nums[index]
        target = self.y[index].astype(np.float32)
        return session_emb, candidate_emb, numeric, target

# Rank discount for positions 1..10: log(1 + position).
log_pos = np.log1p(np.arange(1, 11))

# Fallback item ids used to fill recommendation lists shorter than 10.
best_sellers = [
    1587422, 1803710, 10243, 548905, 1906937,
    716822, 1361154, 1716388, 725371, 859574,
]
# Domain of each fallback item, in the same order.
best_sellers_domain = [item_domain_map[item_id] for item_id in best_sellers]

def pad(lst, fill=None, size=10):
    """Pad a list of item ids up to ``size`` entries and return it as an array.

    Missing slots are filled with the leading entries of ``fill``. Lists that
    already have ``size`` or more entries are returned unchanged (not
    truncated). The input list is never modified, and the result is always a
    new ``numpy.ndarray`` (including for empty input).

    Parameters
    ----------
    lst : sequence
        Recommended item ids, best first.
    fill : sequence, optional
        Fallback ids used for padding. Defaults to the module-level
        ``best_sellers``.
    size : int, default 10
        Target length.

    Returns
    -------
    numpy.ndarray
    """
    if fill is None:
        fill = best_sellers
    # Copy so the caller's list (and the fallback list) are never mutated.
    items = list(lst)
    if len(items) < size:
        items.extend(fill[:size - len(items)])
    return np.array(items)

def pad_str(lst, fill=None, size=10):
    """Pad a list of domain labels up to ``size`` entries.

    Missing slots are filled with the leading entries of ``fill``. Lists that
    already have ``size`` or more entries are returned unchanged (not
    truncated). The input list is never modified, and the result is always a
    new list, so the fallback list itself is never handed out to callers.

    Parameters
    ----------
    lst : sequence
        Domain labels of the recommended items, best first.
    fill : sequence, optional
        Fallback labels used for padding. Defaults to the module-level
        ``best_sellers_domain``.
    size : int, default 10
        Target length.

    Returns
    -------
    list
    """
    if fill is None:
        fill = best_sellers_domain
    # Copy so neither the caller's list nor the fallback list is mutated.
    items = list(lst)
    if len(items) < size:
        items.extend(fill[:size - len(items)])
    return items

def ndcg_vec(ytrue, ypred, ytrue_domain, ypred_domain):
    """Mean NDCG@10 over sessions with graded relevance.

    Per recommended slot the gain is 12 when the slot holds the bought item
    (and its domain matches), 1 when only the domain matches the bought
    item's domain, and 0 otherwise. The ideal ranking is the bought item
    first followed by nine same-domain items, i.e. gains ``[12, 1, ..., 1]``,
    so a perfect list scores exactly 1.0.

    Parameters
    ----------
    ytrue : array, shape (n,)
        Bought item id per session.
    ypred : array, shape (n, >=10)
        Recommended item ids per session, best first; only the first 10
        columns are used.
    ytrue_domain : array, shape (n,)
        Domain of the bought item per session.
    ypred_domain : array, shape (n, >=10)
        Domain of each recommended item.

    Returns
    -------
    float
        Average of the per-session NDCG values.
    """
    top_k = 10
    # Position discount log(1 + rank) for ranks 1..top_k.
    discount = np.log1p(np.arange(1, top_k + 1))

    relevance = np.zeros((ypred.shape[0], top_k))
    for i in range(top_k):
        domain_hit = np.equal(ypred_domain[:, i], ytrue_domain)
        item_hit = np.equal(ypred[:, i], ytrue)
        # 1 for a domain match, raised to 12 (not 13) for an exact item match,
        # so the best achievable gain agrees with the ideal DCG below.
        relevance[:, i] = domain_hit * (item_hit * 11 + 1)
    dcg = (relevance / discount).sum(axis=1)

    i_relevance = np.ones(top_k)
    i_relevance[0] = 12.
    idcg = np.zeros(ypred.shape[0]) + (i_relevance / discount).sum()

    return (dcg / idcg).mean()

In [None]:
# Numeric columns fed to the model's numeric branch (order matters: it fixes
# the column order of the scaled feature matrix).
features = [
    'item_price',
    'seq_pos',
    'n_views',
    'n_views_this',
    'n_views_this_domain',
    'unique_items_viewed',
    'unique_domains_viewed',
    'n_searches',
    'n_unique_searches',
    'avg_search_seqpos',
    'avg_search_len',
    'avg_search_words',
    'n_views_this_ratio',
    'n_views_this_ratio_domain',
    'viewed',
]

In [None]:
class NN(nn.Module):
    """Three-branch MLP producing one score per (session, candidate) row.

    The session embedding, the candidate embedding and the numeric features
    each go through their own 512-unit ReLU layer; the three outputs are
    concatenated, passed through another 512-unit ReLU layer and projected to
    a single output.
    """

    def __init__(self):
        super().__init__()
        width = 512
        # Layers are created in a fixed order so seeded initialisation is stable.
        self.hid1_x = nn.Linear(1024, width)
        self.hid1_c = nn.Linear(1024, width)
        self.hid1_n = nn.Linear(len(features), width)

        self.hid1 = nn.Linear(3 * width, width)
        self.out = nn.Linear(width, 1)

    def forward(self, x, c, n):
        """Score a batch.

        ``x`` and ``c`` are 1024-wide inputs and ``n`` has ``len(features)``
        columns; the result has a trailing dimension of size 1.
        """
        branches = [
            F.relu(self.hid1_x(x)),
            F.relu(self.hid1_c(c)),
            F.relu(self.hid1_n(n)),
        ]
        merged = torch.cat(branches, dim=-1)
        hidden = F.relu(self.hid1(merged))
        return self.out(hidden)

In [None]:
# Two-fold cross-validation: train on one fold, predict the other, then swap.
# The out-of-fold predictions of both passes are collected in stack_p.
stack_p = list()
for f1, f2 in [(fold1, fold2), (fold2, fold1)]:
    # f1 selects the training sessions, f2 the held-out sessions.
    Xtr = train_melt[train_melt['seq_index'].isin(f1)]
    Xval = train_melt[train_melt['seq_index'].isin(f2)]

    #len(fold2), len(fold1)

    #Xtr.shape, Xval.shape

    # Standardise the numeric features using training-fold statistics only;
    # NaNs that remain after scaling are replaced by 0.
    scaler = StandardScaler()
    Xtr_nums = scaler.fit_transform(Xtr[features]).astype(np.float32)
    Xtr_nums[np.isnan(Xtr_nums)] = 0
    Xval_nums = scaler.transform(Xval[features]).astype(np.float32)
    Xval_nums[np.isnan(Xval_nums)] = 0

    # Re-seed before building the model so both folds start from the same seed.
    torch.manual_seed(0)
    mdl = NN().cuda()
    optimizer = Adam(mdl.parameters(), lr=1e-5)
    # Regression on y_rank with mean squared error.
    criterion = nn.MSELoss()

    #del tl, vl
    t = Dataset(Xtr['seq_index'], Xtr['event_info'], Xtr_nums, Xtr['y_rank'])
    tl = torch.utils.data.DataLoader(t, batch_size=128, num_workers=8, pin_memory=True, shuffle=True)

    # shuffle=False keeps batches in Xval row order, which the positional
    # assignment of predictions to preds['p'] below relies on.
    v = Dataset(Xval['seq_index'], Xval['event_info'], Xval_nums, Xval['y_rank'])
    vl = torch.utils.data.DataLoader(v, batch_size=1024, num_workers=4, pin_memory=True, shuffle=False)

    for epoch in range(5):
        # Running sum of batch losses and batch count for the epoch average.
        tr_loss = 0
        tr_cnt = 0
        #tl = torch.utils.data.DataLoader(t, batch_size=128, num_workers=8, pin_memory=True, shuffle=True)
        for x, c, n, y in tqdm.tqdm(tl):
            #print(x.shape, c.shape, y.shape)
            x = x.cuda()
            c = c.cuda()
            n = n.cuda()
            # Add a trailing dimension so the target matches the model's
            # single-column output.
            y = y.cuda().unsqueeze(-1)

            mdl.zero_grad()

            p = mdl(x,c,n)

            loss = criterion(p, y)

            loss.backward()
            optimizer.step()

            tr_loss += loss.item()
            tr_cnt += 1
            #if tr_cnt % 1000 == 0:
            #    print(tr_cnt, tr_loss/tr_cnt)
            #    break
        # Mean training loss per batch for this epoch.
        print(epoch, tr_loss/tr_cnt)



    # Score the held-out fold without tracking gradients.
    p = list()
    with torch.no_grad():
        for x, c, n, y in tqdm.tqdm(vl):
            #print(x.shape, c.shape, y.shape)
            x = x.cuda()
            c = c.cuda()
            n = n.cuda()
            y = y.cuda()
            p.append(mdl(x,c,n))

        p = torch.cat(p)

    preds = Xval[['seq_index', 'has_bought', 'item_domain', 'bought_domain', 'event_info', 'bought_id']].copy()
    preds['p'] = p.cpu().numpy()


    # Keep the raw out-of-fold scores for stacking (before de-duplication).
    stack_p.append(preds[['seq_index', 'event_info', 'p']])
    # Sort best-first, then keep only the highest-scoring row for each
    # (seq_index, event_info) pair.
    preds = preds.sort_values('p', ascending=False).drop_duplicates(subset=['seq_index', 'event_info'])

    # One ground-truth item/domain per session (first row of each group).
    ytrue = preds.groupby("seq_index")['bought_id'].apply(lambda x: x.iloc[0]).values
    ytrue_domain = preds.groupby("seq_index")['bought_domain'].apply(lambda x: x.iloc[0]).values

    # Top-10 candidates per session by score, padded to length 10.
    ypred = preds.groupby("seq_index")['event_info'].apply(lambda x: pad(x.iloc[:10].tolist()))
    ypred = np.array(ypred.tolist())

    ypred_domain = preds.groupby("seq_index")['item_domain'].apply(lambda x: pad_str(x.iloc[:10].tolist()))
    ypred_domain = np.array(ypred_domain.tolist())

    # `epoch` here is the last value left over from the training loop above.
    print(epoch, ndcg_vec(ytrue, ypred, ytrue_domain, ypred_domain))


8 0.28640934147437763

In [None]:
# Combine the out-of-fold predictions of both folds and persist them
# (columns at this point: seq_index, event_info, p).
stacked = pd.concat(stack_p)
stacked.to_parquet("./stack_2f/26_train.parquet", engine='fastparquet', compression=None)

In [None]:
# Attach the target for inspection. The assignment aligns on index labels,
# which stacked inherited from train_melt through the fold slices.
# Note: this column is added after the parquet file above was written.
stacked['y'] =  train_melt['y_rank']

In [None]:
# Spearman rank correlation between all columns of stacked (including p vs y).
stacked.corr(method='spearman')

In [None]:
# 0.593471 / 0.597317

# Test-set predictions: retrain on all of train_melt and score test_melt

In [None]:
# Final model: fit on the whole training table and predict the test table.
Xtr = train_melt
Xval = test_melt

# Standardise the numeric features using training statistics only; NaNs that
# remain after scaling are replaced by 0.
scaler = StandardScaler()
Xtr_nums = scaler.fit_transform(Xtr[features]).astype(np.float32)
Xtr_nums[np.isnan(Xtr_nums)] = 0
Xval_nums = scaler.transform(Xval[features]).astype(np.float32)
Xval_nums[np.isnan(Xval_nums)] = 0

# Same seed and optimiser settings as in the cross-validation loop.
torch.manual_seed(0)
mdl = NN().cuda()
optimizer = Adam(mdl.parameters(), lr=1e-5)
criterion = nn.MSELoss()

#del tl, vl
t = Dataset(Xtr['seq_index'], Xtr['event_info'], Xtr_nums, Xtr['y_rank'])
tl = torch.utils.data.DataLoader(t, batch_size=128, num_workers=8, pin_memory=True, shuffle=True)

# The test rows get an all-zero placeholder target (event_info * 0); it is
# never read during prediction. shuffle=False keeps batches in Xval row order,
# which the positional assignment to preds['p'] below relies on.
v = Dataset(Xval['seq_index'], Xval['event_info'], Xval_nums, Xval['event_info']*0)
vl = torch.utils.data.DataLoader(v, batch_size=1024, num_workers=4, pin_memory=True, shuffle=False)

for epoch in range(5):
    # Running sum of batch losses and batch count for the epoch average.
    tr_loss = 0
    tr_cnt = 0
    #tl = torch.utils.data.DataLoader(t, batch_size=128, num_workers=8, pin_memory=True, shuffle=True)
    for x, c, n, y in tqdm.tqdm(tl):
        #print(x.shape, c.shape, y.shape)
        x = x.cuda()
        c = c.cuda()
        n = n.cuda()
        # Add a trailing dimension so the target matches the model's
        # single-column output.
        y = y.cuda().unsqueeze(-1)

        mdl.zero_grad()

        p = mdl(x,c,n)

        loss = criterion(p, y)

        loss.backward()
        optimizer.step()

        tr_loss += loss.item()
        tr_cnt += 1
        #if tr_cnt % 1000 == 0:
        #    print(tr_cnt, tr_loss/tr_cnt)
        #    break
    # Mean training loss per batch for this epoch.
    print(epoch, tr_loss/tr_cnt)



# Score the test rows without tracking gradients.
p = list()
with torch.no_grad():
    for x, c, n, y in tqdm.tqdm(vl):
        #print(x.shape, c.shape, y.shape)
        x = x.cuda()
        c = c.cuda()
        n = n.cuda()
        p.append(mdl(x,c,n))

    p = torch.cat(p)

# One score per (seq_index, event_info) test row.
preds = Xval[['seq_index', 'event_info']].copy()
preds['p'] = p.cpu().numpy()



In [None]:
# Quick look at the first test predictions.
preds.head()

In [None]:
# Persist the test-set scores (seq_index, event_info, p) alongside the
# out-of-fold training scores.
preds.to_parquet("./stack_2f/26_test.parquet", engine='fastparquet', compression=None)