In [1]:
import os
import sys
sys.path.append('../../src')
from hydra import initialize, compose
import pathlib
import pandas as pd
import numpy as np
import pickle
from tqdm.auto import tqdm

# from datasets.jobdataset import generate_dataset, _hfd5_from_dataframe
from datasets.job_hdf5 import hdf5_from_dataframe, get_career
import data_process.neg_sample as ng_sample
from utils.constants import DEFAULT_USER_COL,DEFAULT_ITEM_COL,DEFAULT_RATING_COL, DEFAULT_PREDICTION_COL
from implicit_eval import microsoft_eval,model_infer_df
# from implicit.als import AlternatingLeastSquares
# from implicit.bpr import BayesianPersonalizedRanking
# from implicit.lmf import LogisticMatrixFactorization
from implicit_build import bpr
from metrics import ranking

# Compose the Hydra config named "config" from ../conf with no overrides.
# `cfg` supplies the data paths (cfg.path / cfg.file) and parameters (cfg.params) used below.
with initialize(version_base=None, config_path="../conf"):
    cfg = compose(config_name="config", overrides=[])

# Create the data pipe

In [295]:
# Read the training interactions from the feather file configured as cfg.file.train_pos
# (presumably positive-only interactions, going by the config key -- confirm).
df_train_pos  = ng_sample.read_feather(pathlib.Path(cfg.path.root, cfg.file.train_pos))

In [296]:
# Assign a constant rating of 1 to every row of the training frame.
df_train_pos[DEFAULT_RATING_COL] = 1

In [301]:
# Normalise both id columns to strings first, then store them as pandas categoricals.
df_train_pos[DEFAULT_USER_COL] = (
    df_train_pos[DEFAULT_USER_COL].astype(str).astype("category")
)
df_train_pos[DEFAULT_ITEM_COL] = (
    df_train_pos[DEFAULT_ITEM_COL].astype(str).astype("category")
)

In [304]:
# df_train_pos[DEFAULT_ITEM_COL].cat.categories

In [303]:
# df_train_pos[DEFAULT_ITEM_COL].cat.categories

In [5]:
# hdf5_from_dataframe(df_train_pos, pathlib.Path(cfg.path.root, cfg.file.hdf5))

In [6]:
# df_train_pos[DEFAULT_ITEM_COL].unique()

## Read data & Train

In [8]:
# Load the id arrays and the interaction matrix from the HDF5 file configured as cfg.file.hdf5.
# Later cells index `user_job_app` by user row position and `jobsid` by item position
# (exact shapes/dtypes come from datasets.job_hdf5.get_career -- confirm there).
jobsid, usersid, user_job_app = get_career(pathlib.Path(cfg.path.root, cfg.file.hdf5))
# Directory holding the pickled models.
model_path = "./models"

In [9]:
# user_job_app

In [10]:
# user_job_app = job_user_app.T.tocsr()
# bpr(model_path, user_job_app)
# als(model_path, user_job_app)
# lmf(model_path, user_job_app)

In [11]:
# df_train_pos.userid.unique()

In [12]:
def read_train_gd_csv(data_testgd_path, usecols):
    """Read an interaction CSV and return it with string-typed id columns.

    Args:
        data_testgd_path: Path or file-like object accepted by ``pd.read_csv``.
        usecols: Columns to load; must include ``DEFAULT_USER_COL`` and
            ``DEFAULT_ITEM_COL``.

    Returns:
        DataFrame with the requested columns, where the user and item id
        columns have been cast to ``str``.
    """
    frame = pd.read_csv(data_testgd_path, usecols=usecols)
    for id_column in (DEFAULT_USER_COL, DEFAULT_ITEM_COL):
        frame[id_column] = frame[id_column].astype('str')
    return frame

In [13]:
# Read the test interactions from the feather file configured as cfg.file.test.
df_test_ori = pd.read_feather(pathlib.Path(cfg.path.root, cfg.file.test))


In [14]:
# Training interactions (user, item, rating) from the leave-one-out CSV with negatives.
df_train = read_train_gd_csv(
    '../../data/jobs/leave_one_train_neg.csv',
    usecols=[DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL],
)

In [15]:
# Test interactions (user, item, rating) from the leave-one-out CSV.
df_test = read_train_gd_csv(
    '../../data/jobs/leave_one_test.csv',
    usecols=[DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL],
)


In [16]:
# Load the fitted BPR model from the models directory defined above (`model_path`)
# instead of repeating the directory literal.
# SECURITY: pickle.load executes arbitrary code embedded in the file -- only load
# model files produced by this project, never ones from an untrusted source.
with open(pathlib.Path(model_path, 'model_bpr.sav'), 'rb') as pickle_in:
    model = pickle.load(pickle_in)
#     microsoft_eval(model, user_job_app, test_gddf=df_test, usersid=usersid, jobsid=jobsid, k=10, logger=None)
    
#     model.recommend(test_user_indexes, df_train[test_user_indexes], N=10)

In [17]:
# Cast the test-set id columns to str so they compare equal to the string ids used
# when filtering by user below.
df_test_ori[DEFAULT_USER_COL] = df_test_ori[DEFAULT_USER_COL].astype('str')
df_test_ori[DEFAULT_ITEM_COL] = df_test_ori[DEFAULT_ITEM_COL].astype('str')

In [18]:
# Test rows of one sample user, used to try out the inference function below.
apps_true = df_test_ori[df_test_ori['userid'].isin(['1472090'])]

In [19]:
def model_infer2(apps_true, jobsid, usersid, model, u_i_matrix, n):
    """Rank a user's candidate test items with a fitted recommender.

    Only the recommendations of the first matched user are used, so
    ``apps_true`` is expected to hold the rows of a single user.

    Args:
        apps_true: DataFrame of test rows with ``itemid`` and ``rating``
            columns plus the ``DEFAULT_USER_COL`` column. Item ids must be
            integer-like because they are cast to ``int32``.
        jobsid: Array of item ids; it is indexed with the item indices the
            model returns.
        usersid: Array of user ids; the positions of matching ids are passed
            to the model as user indices.
        model: Object exposing
            ``recommend(userids, user_items, N=..., items=...)`` that returns
            ``(ids, scores)``.
        u_i_matrix: User-item matrix, indexable by an array of user row
            positions.
        n: Number of recommendations to request.

    Returns:
        Tuple ``(test_items_rating, reco_indices)``: the candidates' ratings
        ordered by ascending item id, and for each recommended item (best
        first) its position within that same ordering.
    """
    test_items = apps_true.itemid.values
    test_items_rating = apps_true.rating.values

    # Restrict scoring to the candidate items present in the test rows.
    candidate_item_indices = np.where(np.isin(jobsid, test_items))[0]
    test_users = apps_true[DEFAULT_USER_COL].unique()
    test_user_indices = np.where(np.isin(usersid, test_users))[0]

    # Fix: use the matrix passed in as `u_i_matrix`; the previous version ignored
    # the parameter and silently read the notebook-global `user_job_app`.
    ids, _scores = model.recommend(test_user_indices,
                                   u_i_matrix[test_user_indices],
                                   N=n,
                                   items=candidate_item_indices)

    # Recommendations of the first (only) user, converted back to item ids.
    reco_jobsid = jobsid[ids][0].astype(np.int32)

    # Sort candidates by item id so searchsorted can map each recommended id
    # to its position in the (sorted) ground-truth rating vector.
    test_items = test_items.astype(np.int32)
    sort_order = np.argsort(test_items)
    test_items_rating = test_items_rating[sort_order]
    reco_indices = np.searchsorted(test_items[sort_order], reco_jobsid)
    return test_items_rating, reco_indices

In [25]:
# Run inference for the sample user: request cfg.params.neg_test + 1 recommendations
# restricted to that user's test items.
gt_item, reco_ind = model_infer2(apps_true=apps_true, jobsid=jobsid, usersid=usersid, 
                                 model=model, u_i_matrix=user_job_app, n=cfg.params.neg_test+1)

In [None]:
# Precision@20 for every test user.
# Fix: the previous loop iterated over an undefined `users` and re-scored the same
# `apps_true` frame on every iteration; iterate over each user's own test rows instead.
# Loop-local names are used so the single-user `gt_item` / `reco_ind` computed above
# stay available to the following cells.
p = ranking.Precision(k=20)
precision_list = []
for _, user_apps in tqdm(df_test_ori.groupby(DEFAULT_USER_COL)):
    user_gt, user_reco = model_infer2(apps_true=user_apps, jobsid=jobsid, usersid=usersid,
                                      model=model, u_i_matrix=user_job_app,
                                      n=cfg.params.neg_test+1)
    precision_list.append(p.compute(gt_pos=user_gt, pd_rank=user_reco))
    

In [28]:
# Precision metric with cutoff k=20.
p = ranking.Precision(k=20)

In [29]:
# Precision@20 for the sample user's ground truth and predicted ranking.
p.compute(gt_pos=gt_item, pd_rank=reco_ind)

0.0

In [30]:
# Recall@20 on the same ground truth and predicted ranking.
r = ranking.Recall(k=20)
r.compute(gt_pos=gt_item, pd_rank=reco_ind)

0.0

In [384]:
# test_items[test_items_sort_ind]

In [25]:
# Metric instance shared by all per-user evaluations below (precision with cutoff k=80).
metric = ranking.Precision(k=80)
def cal_p(apps_true, jobsid, usersid, model, u_i_matrix, n, metric):
    """Compute a ranking metric for one user's test rows.

    Intended for ``groupby(...).apply``: ``apps_true`` is the group frame and
    the remaining arguments are forwarded as keyword arguments.

    Args:
        apps_true: DataFrame with the test rows of a single user.
        jobsid: Array of item ids, forwarded to ``model_infer2``.
        usersid: Array of user ids, forwarded to ``model_infer2``.
        model: Fitted recommender, forwarded to ``model_infer2``.
        u_i_matrix: User-item matrix, forwarded to ``model_infer2``.
        n: Number of recommendations to request.
        metric: Object exposing ``compute(gt_pos=..., pd_rank=...)``.

    Returns:
        Whatever ``metric.compute`` returns for this user.
    """
    # Fix: forward the `u_i_matrix` and `n` arguments; the previous version ignored
    # them and read the notebook globals `user_job_app` and `cfg` instead.
    gt_item, reco_ind = model_infer2(apps_true=apps_true, jobsid=jobsid, usersid=usersid,
                                     model=model, u_i_matrix=u_i_matrix, n=n)
    return metric.compute(gt_pos=gt_item, pd_rank=reco_ind)
    

In [26]:
# Ad-hoc two-user slice of the test set; `aa` is not referenced again in the visible cells.
aa= df_test_ori[df_test_ori['userid'].isin(['7','999368'])]
#  '1327949',
#  '543701',
#  '860552',
#  '298359' ])]
# Register tqdm's pandas integration so `progress_apply` is available below.
tqdm.pandas()


In [27]:
# set(df_test_ori.userid)

In [None]:
# Evaluate every test user: `cal_p` is applied to each user's rows with a progress bar
# (needs the earlier `tqdm.pandas()` call), producing one metric value per userid.
metric_result = df_test_ori.groupby('userid').progress_apply(cal_p, jobsid=jobsid, usersid=usersid, 
                            model=model, u_i_matrix=user_job_app, 
                           n=cfg.params.neg_test+1, metric=metric)

  0%|          | 0/74091 [00:00<?, ?it/s]

In [None]:
# Average of the per-user metric values.
sum(metric_result)/len(metric_result)

In [73]:
# Toy example: binary relevance per item index and a model score per item index.
gt_pos = np.asarray([0, 1, 0, 0, 0]) 
item_score = np.asarray([.5, .2, .3, .4, .6])
# Item indices ordered from highest to lowest score.
item_rank = item_score.argsort()[::-1]
item_rank

array([4, 0, 3, 2, 1])

In [70]:
# Use the toy ranking as the predicted ranking in the walkthrough below.
pd_rank = item_rank

In [71]:
# gt_pos = np.asarray([0, 0, 0, 1, 1])  # [1, 0, 1]
# pd_rank = np.asarray([4, 3, 0,1,2])  # [1, 1, 1]

In [72]:
# Step-by-step walkthrough of how top-k precision/recall counts are derived from
# `gt_pos` (binary relevance per item index) and `pd_rank` (item indices, best first).
# Fix: `k` was never defined anywhere in the notebook (hidden kernel state); the
# recorded output `[4 0 3]` shows the cutoff in use was 3, so define it here.
k = 3

# Keep only the first k ranked items; a non-positive k means "no truncation".
if k > 0:
    truncated_pd_rank = pd_rank[:k]
else:
    truncated_pd_rank = pd_rank

print(truncated_pd_rank)
# Indicator vector over item indices: 1 where the item is among the kept predictions.
pred = np.zeros_like(gt_pos)
pred[truncated_pd_rank] = 1
print('pred', pred)
print('choose the topk', pred*gt_pos)
# True positives: items that are both predicted and relevant.
tp = np.sum(pred * gt_pos)
print('tp', tp)
# All relevant items (recall denominator).
tp_fn = np.sum(gt_pos)
print('tp_fn', tp_fn)
# All predicted items (precision denominator).
tp_fp = np.sum(pred)
print('tp_fp', tp_fp)

[4 0 3]
pred [1 0 0 1 1]
choose the topk [0 0 0 0 0]
tp 0
tp_fn 1
tp_fp 3


In [37]:
# Small score vector used to demonstrate descending argsort.
score = np.asarray([0.3, 0.4, 0.1])

In [24]:
# Indices of `score` ordered from highest to lowest value.
rank = score.argsort()[::-1]

In [25]:
# Display the resulting ranking.
rank

array([1, 0, 2])