In [None]:
# ! pip install xgboost

In [1]:
import os
import sys
sys.path.append('../../src')
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
from datetime import datetime
from hydra import initialize, compose
import time
import pathlib
import wandb
from wandb.xgboost import WandbCallback


from sklearn import metrics, preprocessing
from sklearn.metrics import roc_auc_score, average_precision_score, recall_score, precision_score, ndcg_score
import torch
import xgboost as xgb

import data_process.neg_sample as ng_sample
from data_process.utils import mix_merge
from data_process.data_split import data_split_user
from evaluate_entity import CustomHR, CustomNDCG, CustomRoc, CustomRoctop, CustomRecall_top, CustomPrecision_top
from utils.constants import DEFAULT_USER_COL,DEFAULT_ITEM_COL,DEFAULT_RATING_COL, DEFAULT_TIMESTAMP_COL

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# import argparse
# torch.manual_seed(0)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with initialize(version_base=None, config_path="../conf"):
    cfg = compose(config_name="config", overrides=[])

In [3]:
if device.type =='cpu':
    BATCH_SIZE = cfg.params.batch_size_cpu
    EPOCHS  = cfg.params.epochs_cpu
else:
    BATCH_SIZE = cfg.params.batch_size_gpu
    EPOCHS  = cfg.params.epochs_gpu

In [4]:
if device.type == 'cpu':
    use_amp=False
    df_train_pos  = ng_sample.read_feather(pathlib.Path(cfg.path.root, cfg.file.train_pos))
    df_train_neg = pd.read_feather(pathlib.Path(cfg.path.root, cfg.file.train_neg))
    df_test_ori = pd.read_feather(pathlib.Path(cfg.path.root, cfg.file.test)).iloc[:202,]
    df_all_features = pd.read_csv(pathlib.Path(cfg.path.root, cfg.file.all_features))
    df_train_pos = df_train_pos.sort_values(by=[DEFAULT_USER_COL]).iloc[:100,].reset_index(drop=True)
    df_train_neg = df_train_neg.sort_values(by=[DEFAULT_USER_COL]).iloc[:100*cfg.params.neg_train,].reset_index(drop=True)
else:
    use_amp=True
    df_train_pos  = ng_sample.read_feather(pathlib.Path(cfg.path.root, cfg.file.train_pos))
    df_train_neg = pd.read_feather(pathlib.Path(cfg.path.root, cfg.file.train_neg))
    df_test_ori = pd.read_feather(pathlib.Path(cfg.path.root, cfg.file.test))
    df_all_features = pd.read_csv(pathlib.Path(cfg.path.root, cfg.file.all_features))
    df_train_pos = df_train_pos.sort_values(by=[DEFAULT_USER_COL]).reset_index(drop=True)
    df_train_neg = df_train_neg.sort_values(by=[DEFAULT_USER_COL]).reset_index(drop=True)

In [5]:
df_train_pos[DEFAULT_RATING_COL] = 1

In [6]:
def concat_index(df1, df2):
    df2.index = df2.index//cfg.params.neg_train
    return pd.concat([df1, df2], axis=0).sort_index(kind='mregesort').reset_index(drop=True)

In [7]:
df_train_all = concat_index(df_train_pos, df_train_neg)

In [8]:
df_train_all['flag'] = 1
df_test_ori['flag'] = 0
df_all = pd.concat([df_train_all, df_test_ori], axis=0).reset_index(drop=True)

user features: 
       'WindowID_user', 'Split', 'City',
       'State', 'Country', 'Zip_user', 'DegreeType', 'Major', 'GraduationDate',
       'WorkHistoryCount', 'TotalYearsExperience', 'CurrentlyEmployed',
       'ManagedOthers', 'ManagedHowMany',
       
job features: 
       'WindowID_job', 'City_job',
       'State_job', 'Country_job', 'Zip_job', 'StartDate', 'EndDate',

### Choose the features and process data for the training

In [9]:
user_features = ['WindowID_user', 'Split', 'City', 'State', 'Country', 'Zip_user', 'DegreeType', 'Major', 'GraduationDate', 'WorkHistoryCount', 'TotalYearsExperience', 'CurrentlyEmployed', 'ManagedOthers', 'ManagedHowMany']
user_features_extend = [DEFAULT_USER_COL] + user_features

item_features = ['WindowID_job', 'City_job', 'State_job', 'Country_job', 'Zip_job', 'StartDate', 'EndDate']
item_features_extend =[DEFAULT_ITEM_COL] + item_features

base_features = [DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL]

In [10]:
df_mix_merge = mix_merge(df_all, df_all_features, user_features_extend, item_features_extend)

In [11]:
def _cat_encode(df_data, list_f, encoder):
    for f in list_f:
        df_data[f] = encoder.fit_transform(df_data[f].astype('category').cat.codes.values)
    return df_data

In [12]:
def _embedding_dimension(df_all_encode, features_to_train, max_dim=50):

    embedding_size = []
    features_to_em = [i for i in features_to_train if i !=DEFAULT_RATING_COL]
    for c in features_to_em:
        num_unique_values = int(df_all_encode[c].nunique())
        embed_dim = int(min(np.ceil(num_unique_values/2), max_dim))
        embedding_size.append([num_unique_values, embed_dim])  
    return embedding_size

In [13]:
def encode_data(df_mix_merge, features_to_code, features_to_train, max_dim=50):
    encoder = preprocessing.LabelEncoder()
#     features_to_code = df_mix_merge.columns
    df_all_encode = _cat_encode(df_mix_merge, features_to_code, encoder)
    df_train = df_all_encode[df_all.flag==1]
    df_test = df_all_encode[df_all.flag==0]
    df_train = df_train[features_to_train]
    df_test = df_test[features_to_train]
    embedding_size = _embedding_dimension(df_all_encode, features_to_train, max_dim)
    return df_train, df_test, embedding_size

In [14]:
num_feature=[]
features_to_code = df_mix_merge.columns
features_to_train = [DEFAULT_USER_COL, DEFAULT_ITEM_COL]+ user_features + item_features +[DEFAULT_RATING_COL]
df_train,  df_test, embedding_size = encode_data(df_mix_merge, features_to_code, features_to_train, max_dim=50)

print(f'The size of embedding layers:{embedding_size}')

The size of embedding layers:[[36, 18], [413, 50], [3, 2], [1, 1], [35, 18], [19, 10], [1, 1], [36, 18], [6, 3], [26, 13], [22, 11], [9, 5], [17, 9], [3, 2], [2, 1], [4, 2], [3, 2], [230, 50], [39, 20], [1, 1], [245, 50], [413, 50], [105, 50]]


In [15]:
df_train_split, df_val_split = data_split_user(df_train, val_size=0.2)

np_train = df_train_split.values
np_val = df_val_split.values
np_test = df_test.values

In [17]:
df_test

Unnamed: 0,userid,itemid,WindowID_user,Split,City,State,Country,Zip_user,DegreeType,Major,...,ManagedOthers,ManagedHowMany,WindowID_job,City_job,State_job,Country_job,Zip_job,StartDate,EndDate,rating
500,0,241,0,0,26,17,0,4,2,19,...,0,0,0,167,34,0,0,4,2,1
501,0,168,0,0,26,17,0,4,2,19,...,0,0,0,110,26,0,24,69,24,0
502,0,177,0,0,26,17,0,4,2,19,...,0,0,1,92,33,0,189,144,39,0
503,0,118,0,0,26,17,0,4,2,19,...,0,0,2,9,9,0,79,345,74,0
504,0,357,0,0,26,17,0,4,2,19,...,0,0,0,75,12,0,160,75,27,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
697,1,378,1,0,14,16,0,26,2,19,...,0,0,0,118,15,0,0,164,47,0
698,1,5,1,0,14,16,0,26,2,19,...,0,0,2,59,18,0,145,386,90,0
699,1,289,1,0,14,16,0,26,2,19,...,0,0,0,229,16,0,0,123,36,0
700,1,285,1,0,14,16,0,26,2,19,...,0,0,2,5,13,0,0,350,77,0


In [18]:
df_train_split

Unnamed: 0,userid,itemid,WindowID_user,Split,City,State,Country,Zip_user,DegreeType,Major,...,ManagedOthers,ManagedHowMany,WindowID_job,City_job,State_job,Country_job,Zip_job,StartDate,EndDate,rating
5,1,290,1,0,14,16,0,26,2,19,...,0,0,1,92,33,0,190,238,56,1
6,1,126,1,0,14,16,0,26,2,19,...,0,0,0,115,22,0,67,48,15,0
7,1,174,1,0,14,16,0,26,2,19,...,0,0,2,73,8,0,0,300,63,0
8,1,171,1,0,14,16,0,26,2,19,...,0,0,2,198,17,0,0,380,90,0
9,1,36,1,0,14,16,0,26,2,19,...,0,0,0,67,34,0,48,34,12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,35,121,1,0,18,6,0,19,1,3,...,0,0,1,118,15,0,0,215,52,1
496,35,164,1,0,18,6,0,19,1,3,...,0,0,0,1,26,0,25,65,25,0
497,35,176,1,0,18,6,0,19,1,3,...,0,0,1,52,18,0,140,118,35,0
498,35,113,1,0,18,6,0,19,1,3,...,0,0,0,148,2,0,207,67,25,0


In [19]:
df_val_split

Unnamed: 0,userid,itemid,WindowID_user,Split,City,State,Country,Zip_user,DegreeType,Major,...,ManagedOthers,ManagedHowMany,WindowID_job,City_job,State_job,Country_job,Zip_job,StartDate,EndDate,rating
0,0,93,0,0,26,17,0,4,2,19,...,0,0,0,167,34,0,0,96,28,1
1,0,164,0,0,26,17,0,4,2,19,...,0,0,0,1,26,0,25,65,25,0
2,0,176,0,0,26,17,0,4,2,19,...,0,0,1,52,18,0,140,118,35,0
3,0,114,0,0,26,17,0,4,2,19,...,0,0,1,92,33,0,187,72,27,0
4,0,353,0,0,26,17,0,4,2,19,...,0,0,1,53,9,0,0,108,33,0
40,3,44,2,0,1,3,0,11,1,20,...,0,0,2,9,9,0,78,231,54,1
41,3,353,2,0,1,3,0,11,1,20,...,0,0,1,53,9,0,0,108,33,0
42,3,114,2,0,1,3,0,11,1,20,...,0,0,1,92,33,0,187,72,27,0
43,3,164,2,0,1,3,0,11,1,20,...,0,0,0,1,26,0,25,65,25,0
44,3,176,2,0,1,3,0,11,1,20,...,0,0,1,52,18,0,140,118,35,0


In [20]:
X_train_split = df_train_split.iloc[:,:-1]
y_train_split = df_train_split.iloc[:,-1]

X_val_split = df_val_split.iloc[:,:-1]
y_val_split = df_val_split.iloc[:,-1]


X_test = df_test.iloc[:,:-1]
y_test = df_test.iloc[:,-1]



In [35]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'auc'
param['error@t']=0.4

In [22]:
train_dmatrix = xgb.DMatrix(data=X_train_split,label=y_train_split)
val_dmatrix = xgb.DMatrix(data=X_val_split,label=y_val_split)
test_dmatrix = xgb.DMatrix(data=X_test)#,label=y_test)

In [23]:
evallist = [(val_dmatrix, 'eval'), (train_dmatrix, 'train')]

In [29]:
wandb.init(project="pytorch-jrs",
           name='xgb',
           tags=['jrs', 'xgb']
          )

0,1
best_iteration,▁
best_score,▁
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
eval-auc,▁▄▆▇▇▇███████████████████████████████
hr,▁
train-auc,▁▄▅▇▇▇▇▇█████████████████████████████

0,1
best_iteration,26.0
best_score,1.0
epoch,36.0
hr,0.1


In [30]:
num_round = 400
bst = xgb.train(param, train_dmatrix, num_round, evallist, early_stopping_rounds=10, callbacks=[WandbCallback()])
# train(..., evals=evals, early_stopping_rounds=10)
# bst.save_model('test0001.model')

[0]	eval-auc:0.50347	train-auc:0.63197
[1]	eval-auc:0.72483	train-auc:0.77688
[2]	eval-auc:0.81597	train-auc:0.86112
[3]	eval-auc:0.89149	train-auc:0.92665
[4]	eval-auc:0.90017	train-auc:0.93576
[5]	eval-auc:0.92014	train-auc:0.94796
[6]	eval-auc:0.96094	train-auc:0.96134
[7]	eval-auc:0.98090	train-auc:0.96546
[8]	eval-auc:0.97396	train-auc:0.97550
[9]	eval-auc:0.97396	train-auc:0.98241
[10]	eval-auc:0.97569	train-auc:0.98717
[11]	eval-auc:0.97743	train-auc:0.98802
[12]	eval-auc:0.97049	train-auc:0.99164
[13]	eval-auc:0.95139	train-auc:0.99299
[14]	eval-auc:0.96007	train-auc:0.99458
[15]	eval-auc:0.96875	train-auc:0.99632
[16]	eval-auc:0.97049	train-auc:0.99687
[17]	eval-auc:0.97396	train-auc:0.99732
[18]	eval-auc:0.96701	train-auc:0.99745
[19]	eval-auc:0.96528	train-auc:0.99777
[20]	eval-auc:0.96181	train-auc:0.99829
[21]	eval-auc:0.97049	train-auc:0.99884
[22]	eval-auc:0.97222	train-auc:0.99929
[23]	eval-auc:0.96354	train-auc:0.99968
[24]	eval-auc:0.95833	train-auc:0.99984
[25]	eval-

In [32]:
# wandb.log({'test/hr':0.2})

In [27]:
api = wandb.Api()
run = api.run('/tiyuok2023/pytorch-jrs/runs/31dwxp5y')
run.summary["tensor"] = np.random.random(1000)
run.summary.update()

[34m[1mwandb[0m: [32m[41mERROR[0m Storing tensors in summary requires h5py


In [33]:
test_preds = bst.predict(test_dmatrix)


In [34]:
test_preds

array([7.46058404e-01, 2.45384849e-03, 1.84179638e-02, 5.92743456e-02,
       5.62945716e-02, 3.67387780e-04, 1.21127800e-04, 2.60922150e-03,
       6.85525596e-01, 1.22194707e-01, 8.72454494e-02, 4.48581502e-02,
       2.25372729e-03, 3.80990244e-02, 7.97946006e-03, 3.87651213e-02,
       6.11840725e-01, 1.10332601e-01, 5.64728538e-03, 5.08497767e-02,
       3.39803815e-01, 6.60459250e-02, 5.94888069e-02, 3.78956869e-02,
       4.36046928e-01, 9.28485245e-02, 6.06266223e-02, 2.37977915e-02,
       1.44711360e-01, 2.60305405e-02, 1.09261402e-03, 2.03933865e-02,
       2.74209548e-02, 9.63237941e-01, 5.98057844e-02, 2.49987338e-02,
       1.82644784e-01, 1.12669347e-02, 1.66629687e-01, 4.58640084e-02,
       8.17838848e-01, 1.12406000e-01, 2.31590420e-02, 2.42760733e-01,
       9.78250980e-01, 2.44247273e-01, 5.45275100e-02, 6.65192902e-01,
       8.47830653e-01, 6.68903112e-01, 6.74387738e-02, 6.43256962e-01,
       1.53298164e-02, 1.49077997e-01, 5.70013933e-02, 1.11613430e-01,
      

In [43]:
roc_auc_score(y_test, test_preds)

0.8350000000000001

In [44]:
metric_names=['loss',"auc", 'hr', 'ndcg', 'roc_top', 'recall_top', 'precision_top'],

In [45]:
df_test['y_pred'] = test_preds

In [46]:
# device = 'cpu'
def hit(gt_item, pred_items):
    if gt_item in pred_items:
        return 1
    return 0


def ndcg(gt_item, pred_items):
    if gt_item in pred_items:
        index = pred_items.index(gt_item)
        return np.reciprocal(np.log2(index + 2))
    return 0


def roc(gt_item, pred_prob):
    try:
        return roc_auc_score(gt_item, pred_prob)
    except:
        return 0

def recall(gt, prob, th):
    y_pred = [1 if y >= th else 0 for y in prob]
    return recall_score(gt, y_pred, zero_division=0)

def precision(gt, prob, th):
    y_pred = [1 if y >= th else 0 for y in prob]
    return precision_score(gt, y_pred, zero_division=0)

In [108]:
df_test_result = df_test[['userid', 'rating','y_pred', 'itemid']]

In [117]:
df_test_result_sort = df_test_result.sort_values(['userid','y_pred'], ascending = False)

In [118]:
df_test_result_sort.head()

Unnamed: 0,userid,rating,y_pred,itemid
10219830,89945,1,0.987102,30752
10219906,89945,0,0.946819,109812
10219878,89945,0,0.668708,47142
10219847,89945,0,0.655196,89889
10219904,89945,0,0.623258,101971


In [130]:
precision(y_test, test_preds, 0.5)

0.21256247132958303

In [171]:
df_test_result_sort.iloc[:17,:][df_test_result_sort.iloc[:17,:].rating==1].itemid.tolist()[0]

30752

In [172]:
topk = 10
hr_ls = []
ndcg_ls = []
roc_ls = []
recall_ls = []
precision_ls = []

def eval_scores(df, topk,  thres = 0.5):
    df_topk = df.iloc[:topk]
    gt_item =  df[df.rating==1].itemid.tolist()[0]
    pred_items = df_topk.itemid.tolist()
#     print("gt_item, pred_items ", gt_item, pred_items)
    
    hr_user = hit(gt_item, pred_items)

    y_true = [df.rating.tolist()]
    y_score = [df.y_pred.tolist()]
    
    ndcg_user = ndcg_score(y_true, y_score, k=topk)
    
    y_gt = df_topk.rating.tolist()
    y_pred =  df_topk.y_pred.tolist()
    
    roc_user = roc(y_gt, y_pred)
    recall_user = recall(y_gt, y_pred, thres)
    precision_user =  precision(y_gt, y_pred, thres)
    
    hr_ls.append(hr_user)
    ndcg_ls.append(ndcg_user)
    roc_ls.append(roc_user)
    recall_ls.append(recall_user)
    precision_ls.append(precision_user)

for  thres in [0.5, 0.7, 0.9, 0.97]: 
    df_test_result_sort.groupby('userid').apply(lambda df:eval_scores(df, topk, thres) )
    avg_hr = np.mean(hr_ls)
#     print("The prob threshold is: ", thres)
    
#     print(avg_hr)
#     print("-"*30) 
#     break
    avg_ndcg = np.mean(ndcg_ls)
    
    avg_roc = np.mean(roc_ls )
    avg_recall = np.mean(recall_ls) 
    avg_precision = np.mean(precision_ls )
    
    print("The prob threshold is: ", thres)
    print("avg_hr: ", avg_hr, "\navg_ndcg: ",avg_ndcg, "\navg_roc: ", avg_roc,  "\navg_recall: ", avg_recall, "\navg_precision: ", avg_precision)
    print("-"*30)

The prob threshold is:  0.5
avg_hr:  0.9824135185110202 
avg_ndcg:  0.8794684331620457 
avg_roc:  0.9281131761392523 
avg_recall:  0.9498454603123186 
avg_precision:  0.2912561568967206
------------------------------
The prob threshold is:  0.7
avg_hr:  0.9824135185110202 
avg_ndcg:  0.8794684331620459 
avg_roc:  0.9281131761392523 
avg_recall:  0.9314154215761699 
avg_precision:  0.3662185262096183
------------------------------
The prob threshold is:  0.9
avg_hr:  0.9824135185110202 
avg_ndcg:  0.8794684331620459 
avg_roc:  0.9281131761392525 
avg_recall:  0.8738713204032879 
avg_precision:  0.4434103343100388
------------------------------
The prob threshold is:  0.97
avg_hr:  0.9824135185110202 
avg_ndcg:  0.879468433162046 
avg_roc:  0.9281131761392523 
avg_recall:  0.7616478384689098 
avg_precision:  0.4331653368990901
------------------------------


In [None]:
# wandb.log({'test/hr':0.2})