In [1]:
# ! pip install xgboost

In [34]:
import os
import sys
sys.path.append('../../src')
sys.path.append('../../src/metrics')
 
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
from datetime import datetime
from hydra import initialize, compose
import time
import pathlib
import wandb
from wandb.xgboost import WandbCallback


from sklearn import metrics, preprocessing
from sklearn.metrics import roc_auc_score, average_precision_score, recall_score, precision_score, ndcg_score
import torch
import xgboost as xgb

import data_process.neg_sample as ng_sample
from data_process.utils import mix_merge
from data_process.data_split import data_split_user
# from evaluate_ignite import CustomHR, CustomNDCG, CustomRoc, CustomRoctop, CustomRecall_top, CustomPrecision_top
from metrics.evaluate_ignite import CustomHR, CustomNDCG, CustomAuc_top, CustomAuc, CustomRecall_top, CustomPrecision_top
from metrics import ranking
from utils.constants import DEFAULT_USER_COL,DEFAULT_ITEM_COL,DEFAULT_RATING_COL, DEFAULT_TIMESTAMP_COL

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# import argparse
# torch.manual_seed(0)

In [35]:
with initialize(version_base=None, config_path="../conf"):
    cfg = compose(config_name="config", overrides=[])

In [36]:
if device.type =='cpu':
    BATCH_SIZE = cfg.params.batch_size_cpu
    EPOCHS  = cfg.params.epochs_cpu
else:
    BATCH_SIZE = cfg.params.batch_size_gpu
    EPOCHS  = cfg.params.epochs_gpu

In [37]:
if device.type == 'cpu':
    use_amp=False
#     df_train_pos  = ng_sample.read_feather(pathlib.Path(cfg.path.root, cfg.file.train_pos))
#     df_train_neg = pd.read_feather(pathlib.Path(cfg.path.root, cfg.file.train_neg))
#     df_test_ori = pd.read_feather(pathlib.Path(cfg.path.root, cfg.file.test)).iloc[:202,]
#     df_all_features = pd.read_csv(pathlib.Path(cfg.path.root, cfg.file.all_features))
#     df_train_pos = df_train_pos.sort_values(by=[DEFAULT_USER_COL]).iloc[:100,].reset_index(drop=True)
#     df_train_neg = df_train_neg.sort_values(by=[DEFAULT_USER_COL]).iloc[:100*cfg.params.neg_train,].reset_index(drop=True)
    df_train_pos  = ng_sample.read_feather(pathlib.Path(cfg.path.root, cfg.file.train_pos))
    df_train_neg = pd.read_feather(pathlib.Path(cfg.path.root, cfg.file.train_neg))
    df_test_ori = pd.read_feather(pathlib.Path(cfg.path.root, cfg.file.test))
    df_all_features = pd.read_csv(pathlib.Path(cfg.path.root, cfg.file.all_features))
    df_train_pos = df_train_pos.sort_values(by=[DEFAULT_USER_COL]).reset_index(drop=True)
    df_train_neg = df_train_neg.sort_values(by=[DEFAULT_USER_COL]).reset_index(drop=True)
    
else:
    use_amp=True
    df_train_pos  = ng_sample.read_feather(pathlib.Path(cfg.path.root, cfg.file.train_pos))
    df_train_neg = pd.read_feather(pathlib.Path(cfg.path.root, cfg.file.train_neg))
    df_test_ori = pd.read_feather(pathlib.Path(cfg.path.root, cfg.file.test))
    df_all_features = pd.read_csv(pathlib.Path(cfg.path.root, cfg.file.all_features))
    df_train_pos = df_train_pos.sort_values(by=[DEFAULT_USER_COL]).reset_index(drop=True)
    df_train_neg = df_train_neg.sort_values(by=[DEFAULT_USER_COL]).reset_index(drop=True)

In [38]:
df_train_pos[DEFAULT_RATING_COL] = 1

In [39]:
def concat_index(df1, df2):
    df2.index = df2.index//cfg.params.neg_train
    return pd.concat([df1, df2], axis=0).sort_index(kind='mregesort').reset_index(drop=True)

In [40]:
df_train_all = concat_index(df_train_pos, df_train_neg)

In [41]:
df_train_all['flag'] = 1
df_test_ori['flag'] = 0
df_all = pd.concat([df_train_all, df_test_ori], axis=0).reset_index(drop=True)

user features: 
       'WindowID_user', 'Split', 'City',
       'State', 'Country', 'Zip_user', 'DegreeType', 'Major', 'GraduationDate',
       'WorkHistoryCount', 'TotalYearsExperience', 'CurrentlyEmployed',
       'ManagedOthers', 'ManagedHowMany',
       
job features: 
       'WindowID_job', 'City_job',
       'State_job', 'Country_job', 'Zip_job', 'StartDate', 'EndDate',

### Choose the features and process data for the training

In [42]:
user_features = ['WindowID_user', 'Split', 'City', 'State', 'Country', 'Zip_user', 'DegreeType', 'Major', 'GraduationDate', 'WorkHistoryCount', 'TotalYearsExperience', 'CurrentlyEmployed', 'ManagedOthers', 'ManagedHowMany']
user_features_extend = [DEFAULT_USER_COL] + user_features

item_features = ['WindowID_job', 'City_job', 'State_job', 'Country_job', 'Zip_job', 'StartDate', 'EndDate']
item_features_extend =[DEFAULT_ITEM_COL] + item_features

base_features = [DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL]

In [43]:
df_mix_merge = mix_merge(df_all, df_all_features, user_features_extend, item_features_extend)

In [44]:
def _cat_encode(df_data, list_f, encoder):
    for f in list_f:
        df_data[f] = encoder.fit_transform(df_data[f].astype('category').cat.codes.values)
    return df_data

In [45]:
def _embedding_dimension(df_all_encode, features_to_train, max_dim=50):

    embedding_size = []
    features_to_em = [i for i in features_to_train if i !=DEFAULT_RATING_COL]
    for c in features_to_em:
        num_unique_values = int(df_all_encode[c].nunique())
        embed_dim = int(min(np.ceil(num_unique_values/2), max_dim))
        embedding_size.append([num_unique_values, embed_dim])  
    return embedding_size

In [46]:
def encode_data(df_mix_merge, features_to_code, features_to_train, max_dim=50):
    encoder = preprocessing.LabelEncoder()
#     features_to_code = df_mix_merge.columns
    df_all_encode = _cat_encode(df_mix_merge, features_to_code, encoder)
    df_train = df_all_encode[df_all.flag==1]
    df_test = df_all_encode[df_all.flag==0]
    df_train = df_train[features_to_train]
    df_test = df_test[features_to_train]
    embedding_size = _embedding_dimension(df_all_encode, features_to_train, max_dim)
    return df_train, df_test, embedding_size

In [47]:
num_feature=[]
features_to_code = df_mix_merge.columns
features_to_train = [DEFAULT_USER_COL, DEFAULT_ITEM_COL]+ user_features + item_features +[DEFAULT_RATING_COL]
df_train,  df_test, embedding_size = encode_data(df_mix_merge, features_to_code, features_to_train, max_dim=50)

print(f'The size of embedding layers:{embedding_size}')

The size of embedding layers:[[89946, 50], [139292, 50], [3, 2], [2, 1], [6296, 50], [55, 28], [1, 1], [12643, 50], [7, 4], [14075, 50], [606, 50], [20, 10], [64, 32], [3, 2], [2, 1], [251, 50], [4, 2], [5405, 50], [55, 28], [4, 2], [10465, 50], [139259, 50], [235, 50]]


In [48]:
df_train_split, df_val_split = data_split_user(df_train, val_size=0.2)

np_train = df_train_split.values
np_val = df_val_split.values
np_test = df_test.values

In [51]:
print(df_test.shape)
df_test.head()

(7483191, 24)


Unnamed: 0,userid,itemid,WindowID_user,Split,City,State,Country,Zip_user,DegreeType,Major,...,ManagedOthers,ManagedHowMany,WindowID_job,City_job,State_job,Country_job,Zip_job,StartDate,EndDate,rating
2736740,0,86875,0,1,4716,48,0,3129,2,10123,...,0,0,1,4071,49,3,8,2443,16,1
2736741,0,63735,0,1,4716,48,0,3129,2,10123,...,0,0,1,2597,36,3,1398,25482,61,0
2736742,0,67228,0,1,4716,48,0,3129,2,10123,...,0,0,2,2236,46,3,8200,50432,88,0
2736743,0,46363,0,1,4716,48,0,3129,2,10123,...,0,0,3,194,11,3,3504,120611,159,0
2736744,0,119351,0,1,4716,48,0,3129,2,10123,...,0,0,1,1744,15,3,6828,27061,65,0


In [52]:
df_train_split.head()

Unnamed: 0,userid,itemid,WindowID_user,Split,City,State,Country,Zip_user,DegreeType,Major,...,ManagedOthers,ManagedHowMany,WindowID_job,City_job,State_job,Country_job,Zip_job,StartDate,EndDate,rating
15,2,71412,1,1,4921,46,0,10296,4,6268,...,0,0,2,4232,46,3,8,93721,133,1
16,2,123290,1,1,4921,46,0,10296,4,6268,...,0,0,1,3593,10,3,3842,11153,40,0
17,2,110856,1,1,4921,46,0,10296,4,6268,...,0,0,3,3883,10,3,4334,124651,178,0
18,2,3777,1,1,4921,46,0,10296,4,6268,...,0,0,1,1007,37,3,8,69228,108,0
19,2,103326,1,1,4921,46,0,10296,4,6268,...,0,0,3,2988,45,3,8,78468,112,0


In [53]:
df_val_split.head()

Unnamed: 0,userid,itemid,WindowID_user,Split,City,State,Country,Zip_user,DegreeType,Major,...,ManagedOthers,ManagedHowMany,WindowID_job,City_job,State_job,Country_job,Zip_job,StartDate,EndDate,rating
0,0,38336,0,1,4716,48,0,3129,2,10123,...,0,0,1,4071,49,3,8,32089,66,1
1,0,63730,0,1,4716,48,0,3129,2,10123,...,0,0,1,44,36,3,1412,24702,62,0
2,0,67227,0,1,4716,48,0,3129,2,10123,...,0,0,2,1194,23,3,5862,41679,81,0
3,0,46359,0,1,4716,48,0,3129,2,10123,...,0,0,2,2236,46,3,8182,26675,65,0
4,0,119347,0,1,4716,48,0,3129,2,10123,...,0,0,2,1198,11,3,8,36361,76,0


In [54]:
X_train_split = df_train_split.iloc[:,:-1]
y_train_split = df_train_split.iloc[:,-1]

X_val_split = df_val_split.iloc[:,:-1]
y_val_split = df_val_split.iloc[:,-1]


X_test = df_test.iloc[:,:-1]
y_test = df_test.iloc[:,-1]



In [55]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'auc'
param['error@t']=0.4

In [56]:
train_dmatrix = xgb.DMatrix(data=X_train_split,label=y_train_split)
val_dmatrix = xgb.DMatrix(data=X_val_split,label=y_val_split)
test_dmatrix = xgb.DMatrix(data=X_test)#,label=y_test)

In [57]:
evallist = [(val_dmatrix, 'eval'), (train_dmatrix, 'train')]

In [58]:
# wandb.init(project="pytorch-jrs",
#            name='xgb',
#            tags=['jrs', 'xgb']
#           )

In [59]:
num_round = 400
bst = xgb.train(param, train_dmatrix, num_round, evallist, early_stopping_rounds=10)#, callbacks=[WandbCallback()])
# train(..., evals=evals, early_stopping_rounds=10)
# bst.save_model('test0001.model')



Parameters: { "error@t" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	eval-auc:0.56339	train-auc:0.56536
[1]	eval-auc:0.62647	train-auc:0.63007
[2]	eval-auc:0.65654	train-auc:0.66201
[3]	eval-auc:0.66103	train-auc:0.66641
[4]	eval-auc:0.77053	train-auc:0.77591
[5]	eval-auc:0.77171	train-auc:0.77744
[6]	eval-auc:0.78625	train-auc:0.79173
[7]	eval-auc:0.79330	train-auc:0.79837
[8]	eval-auc:0.80157	train-auc:0.80647
[9]	eval-auc:0.80616	train-auc:0.81104
[10]	eval-auc:0.80791	train-auc:0.81293
[11]	eval-auc:0.80876	train-auc:0.81380
[12]	eval-auc:0.83894	train-auc:0.84239
[13]	eval-auc:0.89631	train-auc:0.89712
[14]	eval-auc:0.89856	train-auc:0.89945
[15]	eval-auc:0.91680	train-auc:0.91759
[16]	eval-auc:0.92099	train-auc:0.92202
[17]	eval-auc:0.

In [26]:
# wandb.log({'test/hr':0.2})

In [27]:
# api = wandb.Api()
# run = api.run('/tiyuok2023/pytorch-jrs/runs/31dwxp5y')
# run.summary["tensor"] = np.random.random(1000)
# run.summary.update()

In [60]:
test_preds = bst.predict(test_dmatrix)


In [61]:
test_preds.shape

(7483191,)

In [62]:
y_test

2736740     1
2736741     0
2736742     0
2736743     0
2736744     0
           ..
10219926    0
10219927    0
10219928    0
10219929    0
10219930    0
Name: rating, Length: 7483191, dtype: int64

# precision 

In [64]:
df_y_test = pd.DataFrame({"userid":X_test.userid, "y_test": y_test, "y_pred" :test_preds })

topk = 10
hr_ls = []
ndcg_ls = []
auc_ls = []
recall_ls = []
precision_ls = []

hr_obj = ranking.HitRate(k=topk)
ndcg_obj = ranking.NDCG(k=topk)
auc_obj = ranking.AUC()
prec_obj = ranking.Precision(k=topk)
recall_obj = ranking.Recall(k=topk)



def calculate_metrics(df):
   
    df1 = df.reset_index()

    df1['ori_index'] = df1.index
    df1 = df1.sort_values(by="y_pred", ascending=False)
  

    gt_pos = df1['y_test']
    pd_rank = df1['ori_index']

    
    #precision
    precision_score= prec_obj.compute(gt_pos, pd_rank)
    precision_ls.append(precision_score)

    #recall
    recall_score= recall_obj.compute(gt_pos, pd_rank)
    recall_ls.append(recall_score)

    

    #hit rate
    hr_score = hr_obj.compute(gt_pos,pd_rank)
    hr_ls.append(hr_score)

    #ndcg
    ndcg_score= ndcg_obj.compute(gt_pos, pd_rank)
    ndcg_ls.append(ndcg_score)

    #auc
    df_topk = df1.iloc[:topk]
    gt_pos_k = df_topk['y_test']#.tolist()
    pd_scores_k = df_topk['y_pred']#.tolist()
    auc_score = auc_obj.compute(pd_scores_k, gt_pos_k)
    auc_ls.append(auc_score)




df_y_test.groupby("userid").apply(lambda df:calculate_metrics(df))

avg_hr = np.mean(hr_ls)
avg_ndcg = np.mean(ndcg_ls)
avg_auc = np.mean(auc_ls )
avg_precision = np.mean(precision_ls )
avg_recall = np.mean(recall_ls) 

 
print("avg_hr: ", avg_hr, "\navg_ndcg: ",avg_ndcg, "\navg_auc: ", avg_auc,  "\navg_recall: ", avg_recall, "\navg_precision: ", avg_precision)
 


avg_hr:  0.8821719237154311 
avg_ndcg:  0.3145534247437734 
avg_auc:  0.6828074784911647 
avg_recall:  0.3324290399643681 
avg_precision:  0.03324290399643681


In [None]:
avg_hr:  0.8821719237154311 
avg_ndcg:  0.3145534247437734 
avg_auc:  0.6828074784911647 
avg_recall:  0.03324290399643681 
avg_precision:  0.03324290399643681

In [32]:
class HitRate():
    def __init__(self, k=-1):
        self.k=k
    def compute(self, gt_pos,pd_rank, **kwargs):
        if self.k > 0:
            truncated_pd_rank = pd_rank[: self.k]

        else:
            truncated_pd_rank = pd_rank
        gt_pos_k = gt_pos[truncated_pd_rank]
        if sum(gt_pos_k)==0:
            return 0
        else:
            return 1

In [33]:
if self.k > 0:
    truncated_pd_rank = pd_rank[: self.k]
    print("truncated_pd_rank: ", truncated_pd_rank)
else:
    truncated_pd_rank = pd_rank
print("gt_pos: ", gt_pos)
pred = np.zeros_like(gt_pos)
print("pred zeros_like: ", pred)
pred[truncated_pd_rank] = 1
print("pred[truncated_pd_rank] = 1 ", pred)


tp = np.sum(pred * gt_pos)
tp_fn = np.sum(gt_pos)
tp_fp = np.sum(pred)

NameError: name 'self' is not defined

In [None]:
truncated_pd_rank = y['ori_index'][:10]
y['y_test'][truncated_pd_rank]

In [None]:
y

In [None]:
# from metrics import ranking

In [None]:
prec_obj = Precision(k=10)

gt_pos = y['y_test']
pd_rank = y['ori_index']

print(prec_obj.compute(gt_pos, pd_rank))

In [None]:
roc_auc_score(y_test, test_preds)

In [None]:
metric_names=['loss',"auc", 'hr', 'ndcg', 'roc_top', 'recall_top', 'precision_top'],

In [None]:
df_test['y_pred'] = test_preds

In [None]:
# device = 'cpu'
def hit(gt_item, pred_items):
    if gt_item in pred_items:
        return 1
    return 0


def ndcg(gt_item, pred_items):
    if gt_item in pred_items:
        index = pred_items.index(gt_item)
        return np.reciprocal(np.log2(index + 2))
    return 0


def roc(gt_item, pred_prob):
    try:
        return roc_auc_score(gt_item, pred_prob)
    except:
        return 0

def recall(gt, prob, th):
    y_pred = [1 if y >= th else 0 for y in prob]
    return recall_score(gt, y_pred, zero_division=0)

def precision(gt, prob, th):
    y_pred = [1 if y >= th else 0 for y in prob]
    return precision_score(gt, y_pred, zero_division=0)

In [None]:
df_test_result = df_test[['userid', 'rating','y_pred', 'itemid']]

In [None]:
df_test_result_sort = df_test_result.sort_values(['userid','y_pred'], ascending = False)

In [None]:
df_test_result_sort.head()

In [None]:
precision(y_test, test_preds, 0.5)

In [None]:
df_test_result_sort.iloc[:17,:][df_test_result_sort.iloc[:17,:].rating==1].itemid.tolist()[0]

In [None]:
topk = 10
hr_ls = []
ndcg_ls = []
roc_ls = []
recall_ls = []
precision_ls = []

def eval_scores(df, topk,  thres = 0.5):
    df_topk = df.iloc[:topk]
    gt_item =  df[df.rating==1].itemid.tolist()[0]
    pred_items = df_topk.itemid.tolist()
#     print("gt_item, pred_items ", gt_item, pred_items)
    
    hr_user = hit(gt_item, pred_items)

    y_true = [df.rating.tolist()]
    y_score = [df.y_pred.tolist()]
    
    ndcg_user = ndcg_score(y_true, y_score, k=topk)
    
    y_gt = df_topk.rating.tolist()
    y_pred =  df_topk.y_pred.tolist()
    
    roc_user = roc(y_gt, y_pred)
    recall_user = recall(y_gt, y_pred, thres)
    precision_user =  precision(y_gt, y_pred, thres)
    
    hr_ls.append(hr_user)
    ndcg_ls.append(ndcg_user)
    roc_ls.append(roc_user)
    recall_ls.append(recall_user)
    precision_ls.append(precision_user)

for  thres in [0.5, 0.7, 0.9, 0.97]: 
    df_test_result_sort.groupby('userid').apply(lambda df:eval_scores(df, topk, thres) )
    avg_hr = np.mean(hr_ls)
#     print("The prob threshold is: ", thres)
    
#     print(avg_hr)
#     print("-"*30) 
#     break
    avg_ndcg = np.mean(ndcg_ls)
    
    avg_roc = np.mean(roc_ls )
    avg_recall = np.mean(recall_ls) 
    avg_precision = np.mean(precision_ls )
    
    print("The prob threshold is: ", thres)
    print("avg_hr: ", avg_hr, "\navg_ndcg: ",avg_ndcg, "\navg_roc: ", avg_roc,  "\navg_recall: ", avg_recall, "\navg_precision: ", avg_precision)
    print("-"*30)

In [None]:
# wandb.log({'test/hr':0.2})