In [65]:
import os
import sys
sys.path.append('../../src')
from hydra import initialize, compose
import pathlib
import pandas as pd
import numpy as np
import pickle
from tqdm.auto import tqdm

# from datasets.jobdataset import generate_dataset, _hfd5_from_dataframe
from datasets.job_hdf5 import hdf5_from_dataframe, get_career
import data_process.neg_sample as ng_sample
from utils.constants import DEFAULT_USER_COL,DEFAULT_ITEM_COL,DEFAULT_RATING_COL, DEFAULT_PREDICTION_COL
# from implicit_eval import microsoft_eval,model_infer_df
# from implicit.als import AlternatingLeastSquares
# from implicit.bpr import BayesianPersonalizedRanking
# from implicit.lmf import LogisticMatrixFactorization
from implicit_build import bpr, als, lmf
from metrics import ranking
from metrics.evaluate_ignite import model_infer2

# Compose the Hydra configuration named "config" from the ../conf directory,
# with no overrides; `cfg` supplies every path and parameter used below.
with initialize(version_base=None, config_path="../conf"):
    cfg = compose(config_name="config", overrides=[])

# Create the data pipe

In [66]:
# Load the training frame from the feather file configured as cfg.file.train_pos
# (presumably the positive user-item interactions — confirm against the config).
df_train_pos  = ng_sample.read_feather(pathlib.Path(cfg.path.root, cfg.file.train_pos))

In [67]:
# Give every row of the training frame a constant rating of 1.
df_train_pos[DEFAULT_RATING_COL] = 1

In [68]:
# Store the user and item id columns as pandas categoricals.
for id_col in (DEFAULT_USER_COL, DEFAULT_ITEM_COL):
    df_train_pos[id_col] = df_train_pos[id_col].astype("category")

In [69]:
# df_train_pos[DEFAULT_USER_COL]

In [70]:
# pathlib.Path(cfg.path.root, cfg.file.hdf5)

In [71]:
# Hand the training frame to hdf5_from_dataframe together with the HDF5 path from
# the config; the same path is read back by get_career in the next section.
hdf5_from_dataframe(df_train_pos, pathlib.Path(cfg.path.root, cfg.file.hdf5))

## Read data & Train

In [72]:
# Read the three objects stored in the HDF5 file.
# NOTE(review): judging by the names these are item ids, user ids and the
# user-item interaction matrix — confirm against get_career.
jobsid, usersid, user_job_app = get_career(pathlib.Path(cfg.path.root, cfg.file.hdf5))
# Directory passed to the training helper in the next cell.
model_path = "./models"

In [73]:
# user_job_app

In [74]:
# Train the BPR model on the user-item matrix. The ALS and LMF alternatives are
# left commented out so the model family can be switched by hand.
# NOTE(review): bpr presumably saves its model under model_path (a later cell
# loads ./models/model_bpr.sav) — confirm in implicit_build.
# user_job_app = job_user_app.T.tocsr()
bpr(model_path, user_job_app)
# als(model_path, user_job_app)
# lmf(model_path, user_job_app)

100%|██████████| 100/100 [00:38<00:00,  2.62it/s, train_auc=93.11%, skipped=0.10%]


In [75]:
# df_train_pos.userid.unique()

In [76]:
def read_train_gd_csv(data_testgd_path, usecols):
    """Load the selected columns of a CSV file into a DataFrame.

    Args:
        data_testgd_path: Path (or file-like object) handed to ``pd.read_csv``.
        usecols: Columns to keep, forwarded as ``usecols``.

    Returns:
        The DataFrame produced by ``pd.read_csv``; dtypes are left as inferred.
    """
    return pd.read_csv(data_testgd_path, usecols=usecols)

In [77]:
# Test frame from the feather file configured as cfg.file.test; this is the
# frame the evaluation cells below group by user.
df_test_ori = pd.read_feather(pathlib.Path(cfg.path.root, cfg.file.test))

# Leave-one-out CSV splits, restricted to the user / item / rating columns.
interaction_cols = [DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL]
df_train = read_train_gd_csv('../../data/jobs/leave_one_train_neg.csv', usecols=interaction_cols)
df_test = read_train_gd_csv('../../data/jobs/leave_one_test.csv', usecols=interaction_cols)


In [78]:
# Load the pickled BPR model from the local models directory.
# NOTE: pickle.load can execute arbitrary code while deserialising — only open
# model files that were produced by this project.
with open('./models/model_bpr.sav','rb') as pickle_in:
    model = pickle.load(pickle_in)
#     microsoft_eval(model, user_job_app, test_gddf=df_test, usersid=usersid, jobsid=jobsid, k=10, logger=None)
    
#     model.recommend(test_user_indexes, df_train[test_user_indexes], N=10)

In [79]:
# df_test_ori[DEFAULT_USER_COL] = df_test_ori[DEFAULT_USER_COL].astype('str')
# df_test_ori[DEFAULT_ITEM_COL] = df_test_ori[DEFAULT_ITEM_COL].astype('str')

In [80]:
# Keep only the test rows of one user (id 1472090) for a quick sanity check.
single_user_mask = df_test_ori['userid'].isin([1472090])
apps_true = df_test_ori[single_user_mask]

In [12]:
# Display the sampled user's test rows.
apps_true

Unnamed: 0,userid,itemid,rating
7483090,1472090,254881,1
7483091,1472090,516136,0
7483092,1472090,544291,0
7483093,1472090,372900,0
7483094,1472090,954885,0
...,...,...,...
7483186,1472090,493793,0
7483187,1472090,1076061,0
7483188,1472090,807312,0
7483189,1472090,273190,0


In [13]:
# Run inference for the sampled user. The three returned arrays feed the metric
# checks below as ground truth (gt_pos), ranking (reco_ind) and scores.
gt_pos, reco_ind, scores = model_infer2(
    df_true=apps_true,
    jobsid=jobsid,
    usersid=usersid,
    model=model,
    u_i_matrix=user_job_app,
    n=cfg.params.neg_test + 1,
)

In [14]:
# Number of ground-truth entries for the sampled user, after flattening.
gt_pos.flatten().shape

(85,)

In [15]:
# The ranking should have the same length as the ground-truth array above.
reco_ind.shape

(85,)

In [16]:
# AUC of the model scores against the ground truth for the sampled user.
auc_test = ranking.AUC()
auc_test.compute(gt_pos=gt_pos, pd_scores=scores)

0.8452380952380952

In [17]:
# Precision metric with cutoff k=20, computed a few cells below.
p = ranking.Precision(k=20)

In [18]:
# Shape of the ground-truth array passed to the metric below.
gt_pos.shape

(85,)

In [19]:
# Shape of the ranking passed to the metric below.
reco_ind.shape

(85,)

In [20]:
# Precision@20 for the sampled user.
p.compute(gt_pos=gt_pos, pd_rank=reco_ind)

0.05

In [22]:
# Recall@20 for the sampled user.
r = ranking.Recall(k=20)
r.compute(gt_pos=gt_pos, pd_rank=reco_ind)

1.0

In [61]:
class HitRate():
    """Binary hit indicator over the top-k entries of a ranking.

    Args:
        k: Cutoff applied to the ranking. Any non-positive value (the default
            is -1) means the whole ranking is used.
    """

    def __init__(self, k=-1):
        self.k = k

    def compute(self, gt_pos, pd_rank, **kwargs):
        """Return 1 if the truncated ranking hits a relevant entry, else 0.

        Args:
            gt_pos: Relevance labels, indexed by the entries of ``pd_rank``
                (assumes an indexable array such as a numpy array — TODO confirm).
            pd_rank: Indices into ``gt_pos``, ordered by predicted rank.
            **kwargs: Accepted and ignored.

        Returns:
            0 when the labels selected by the truncated ranking sum to zero,
            otherwise 1.
        """
        top_ranked = pd_rank[: self.k] if self.k > 0 else pd_rank
        label_total = sum(gt_pos[top_ranked])
        return 0 if label_total == 0 else 1

In [48]:
# HitRate@20 for the sampled user. Uses the HitRate class defined in this
# notebook (the old `hit_rate` name no longer exists after a kernel restart)
# and passes the ranking that HitRate.compute requires.
hr = HitRate(k=20)
hr.compute(gt_pos=gt_pos, pd_rank=reco_ind)

1

In [23]:
# test_items[test_items_sort_ind]

In [30]:
def cal_p(apps_true, jobsid, usersid, model, u_i_matrix, n, metric):
    """Run inference for one group of test rows and score the ranking.

    Intended for ``groupby(...).progress_apply``, which passes each group's
    rows as ``apps_true``.

    Args:
        apps_true: Test rows to evaluate, forwarded as ``df_true``.
        jobsid: Forwarded to ``model_infer2``.
        usersid: Forwarded to ``model_infer2``.
        model: Trained model, forwarded to ``model_infer2``.
        u_i_matrix: Interaction matrix, forwarded to ``model_infer2``.
        n: Forwarded to ``model_infer2`` as ``n``.
        metric: Object exposing ``compute(gt_pos=..., pd_rank=...)``.

    Returns:
        Whatever ``metric.compute`` returns for this group.
    """
    # Forward the caller's matrix and n rather than reading notebook globals,
    # so the function evaluates exactly what it was given.
    gt_item, reco_ind, _ = model_infer2(
        df_true=apps_true,
        jobsid=jobsid,
        usersid=usersid,
        model=model,
        u_i_matrix=u_i_matrix,
        n=n,
    )
    return metric.compute(gt_pos=gt_item, pd_rank=reco_ind)
    

In [24]:
def cal_auc(apps_true, jobsid, usersid, model, u_i_matrix, n, metric):
    """Run inference for one group of test rows and score the predictions.

    Intended for ``groupby(...).progress_apply``, which passes each group's
    rows as ``apps_true``.

    Args:
        apps_true: Test rows to evaluate, forwarded as ``df_true``.
        jobsid: Forwarded to ``model_infer2``.
        usersid: Forwarded to ``model_infer2``.
        model: Trained model, forwarded to ``model_infer2``.
        u_i_matrix: Interaction matrix, forwarded to ``model_infer2``.
        n: Forwarded to ``model_infer2`` as ``n``.
        metric: Object exposing ``compute(gt_pos=..., pd_scores=...)``.

    Returns:
        Whatever ``metric.compute`` returns for this group.
    """
    # Forward the caller's matrix and n rather than reading notebook globals.
    # The ranking returned by model_infer2 is not needed for score-based metrics.
    gt_item, _, pd_scores = model_infer2(
        df_true=apps_true,
        jobsid=jobsid,
        usersid=usersid,
        model=model,
        u_i_matrix=u_i_matrix,
        n=n,
    )
    return metric.compute(gt_pos=gt_item, pd_scores=pd_scores)
    

In [63]:
def cal_hit(apps_true, jobsid, usersid, model, u_i_matrix, n, metric):
    """Run inference for one group of test rows and score the ranking for hits.

    Intended for ``groupby(...).progress_apply``, which passes each group's
    rows as ``apps_true``.

    Args:
        apps_true: Test rows to evaluate, forwarded as ``df_true``.
        jobsid: Forwarded to ``model_infer2``.
        usersid: Forwarded to ``model_infer2``.
        model: Trained model, forwarded to ``model_infer2``.
        u_i_matrix: Interaction matrix, forwarded to ``model_infer2``.
        n: Forwarded to ``model_infer2`` as ``n``.
        metric: Object exposing ``compute(gt_pos=..., pd_rank=...)``.

    Returns:
        Whatever ``metric.compute`` returns for this group.
    """
    # Forward the caller's matrix and n rather than reading notebook globals.
    # The raw scores returned by model_infer2 are not needed for rank-based metrics.
    gt_item, reco_ind, _ = model_infer2(
        df_true=apps_true,
        jobsid=jobsid,
        usersid=usersid,
        model=model,
        u_i_matrix=u_i_matrix,
        n=n,
    )
    return metric.compute(gt_pos=gt_item, pd_rank=reco_ind)

In [25]:
# Register tqdm with pandas so that the groupby(...).progress_apply calls in
# the evaluation cells below display a progress bar.
tqdm.pandas()


## Precision

In [81]:
# Precision@10, averaged over every user in the test frame.
metric = ranking.Precision(k=10)

users_grouped = df_test_ori.groupby('userid')
metric_result = users_grouped.progress_apply(
    cal_p,
    jobsid=jobsid,
    usersid=usersid,
    model=model,
    u_i_matrix=user_job_app,
    n=cfg.params.neg_test + 1,
    metric=metric,
)

print(sum(metric_result) / len(metric_result))

100%|██████████| 74091/74091 [09:19<00:00, 132.46it/s]

0.05736864126544292





## Recall

In [57]:
# Recall@10, averaged over every user in the test frame.
metric = ranking.Recall(k=10)

users_grouped = df_test_ori.groupby('userid')
metric_result = users_grouped.progress_apply(
    cal_p,
    jobsid=jobsid,
    usersid=usersid,
    model=model,
    u_i_matrix=user_job_app,
    n=cfg.params.neg_test + 1,
    metric=metric,
)

print(sum(metric_result) / len(metric_result))

100%|██████████| 74091/74091 [08:24<00:00, 146.84it/s]

0.8477142972830708





## NDCG 

In [82]:
# NDCG@10, averaged over every user in the test frame.
metric = ranking.NDCG(k=10)

users_grouped = df_test_ori.groupby('userid')
metric_result = users_grouped.progress_apply(
    cal_p,
    jobsid=jobsid,
    usersid=usersid,
    model=model,
    u_i_matrix=user_job_app,
    n=cfg.params.neg_test + 1,
    metric=metric,
)

print(sum(metric_result) / len(metric_result))

100%|██████████| 74091/74091 [09:30<00:00, 129.79it/s]

0.45821665254537375





# AUC

In [83]:
# AUC, averaged over every user in the test frame (score-based, no cutoff).
metric = ranking.AUC()

users_grouped = df_test_ori.groupby('userid')
metric_result = users_grouped.progress_apply(
    cal_auc,
    jobsid=jobsid,
    usersid=usersid,
    model=model,
    u_i_matrix=user_job_app,
    n=cfg.params.neg_test + 1,
    metric=metric,
)

print(sum(metric_result) / len(metric_result))

100%|██████████| 74091/74091 [14:43<00:00, 83.87it/s]  

0.5127078043264857





# Hit Rate

In [64]:
# HitRate@10, averaged over every user in the test frame.
# Uses the HitRate class defined in this notebook; the previous `hit_rate`
# name is not defined anywhere, so the cell failed on a fresh kernel.
metric = HitRate(k=10)

metric_result = df_test_ori.groupby('userid').progress_apply(cal_hit, jobsid=jobsid, usersid=usersid, 
                            model=model, u_i_matrix=user_job_app, 
                           n=cfg.params.neg_test+1, metric=metric)

print(sum(metric_result)/len(metric_result))

100%|██████████| 74091/74091 [09:11<00:00, 134.27it/s]

0.12826119231755545



