In [1]:
%load_ext autoreload
%aimport -torch, pandas, hydra, numpy, pickle, os
# %aimport src, src.metrics
%autoreload 2
import os
from hydra import initialize, compose
import pathlib
import pandas as pd
import numpy as np
import pickle
from tqdm.auto import tqdm

# from datasets.jobdataset import generate_dataset, _hfd5_from_dataframe
from src.data_process.job_hdf5 import hdf5_from_dataframe, get_career
import src.data_process.neg_sample as ng_sample
from src.utils.constants import DEFAULT_USER_COL,DEFAULT_ITEM_COL,DEFAULT_RATING_COL, DEFAULT_PREDICTION_COL
# from implicit_eval import microsoft_eval,model_infer_df
# from implicit.als import AlternatingLeastSquares
# from implicit.bpr import BayesianPersonalizedRanking
# from implicit.lmf import LogisticMatrixFactorization
from src.implicit_build import bpr, bpr, lmf
from src.metrics import ranking
from src.metrics.evaluate_ignite import model_infer2

with initialize(version_base=None, config_path="../../src/conf"):
    cfg = compose(config_name="config", overrides=['path.root=../../data/jobs'])

In [2]:
import wandb
# wandb.login()

In [34]:
%aimport

Modules to reload:
all-except-skipped

Modules to skip:
torch, pandas, hydra, numpy, pickle, os


# Create the the data pipe

In [36]:
df_train_pos  = ng_sample.read_feather(pathlib.Path(cfg.path.root, cfg.file.train_pos))

In [37]:
df_train_pos[DEFAULT_RATING_COL] = 1

In [4]:
# df_train_pos[DEFAULT_USER_COL] = df_train_pos[DEFAULT_USER_COL].astype(str)
# df_train_pos[DEFAULT_ITEM_COL] = df_train_pos[DEFAULT_ITEM_COL].astype(str)
df_train_pos[DEFAULT_USER_COL] = df_train_pos[DEFAULT_USER_COL].astype("category")
df_train_pos[DEFAULT_ITEM_COL] = df_train_pos[DEFAULT_ITEM_COL].astype("category")

In [11]:
# hdf5_from_dataframe(df_train_pos, pathlib.Path(cfg.path.root, cfg.file.hdf5))

## Read data & Train

In [3]:
jobsid, usersid, user_job_app = get_career(pathlib.Path(cfg.path.leave_one_cf, cfg.leave_one_data.hdf5))
model_path = "./models"

In [4]:
# user_job_app = job_user_app.T.tocsr()
# bpr(model_path, user_job_app)
# als(model_path, user_job_app)
# lmf(model_path, user_job_app)

In [5]:
def read_train_gd_csv(data_testgd_path, usecols):
    test_gddf = pd.read_csv(data_testgd_path, usecols=usecols)
#     test_gddf[DEFAULT_USER_COL] = test_gddf[DEFAULT_USER_COL].astype('str')
#     test_gddf[DEFAULT_ITEM_COL] = test_gddf[DEFAULT_ITEM_COL].astype('str')
    return test_gddf

In [6]:
df_test_ori = pd.read_feather(pathlib.Path(cfg.path.leave_one_cf, cfg.leave_one_data.test_pos_neg))
# df_train =read_train_gd_csv('../../data/jobs/leave_one_train_neg.csv', usecols=[DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL])
# df_test =read_train_gd_csv('../../data/jobs/leave_one_test.csv', usecols=[DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL])

In [7]:
with open('./models/model_als.sav','rb') as pickle_in:
    model_als = pickle.load(pickle_in)
with open('./models/model_bpr.sav','rb') as pickle_in:
    model_bpr = pickle.load(pickle_in)
with open('./models/model_lmf.sav','rb') as pickle_in:
    model_lmf = pickle.load(pickle_in)

In [8]:
def cal_metrics(apps_true, jobsid, usersid,model, u_i_matrix, n, metric:dict):
    gt_item, reco_ind, pd_scores = model_infer2(df_true=apps_true, jobsid=jobsid, usersid=usersid, 
                                 model=model, u_i_matrix=user_job_app, n=cfg.params.neg_test+1) 
    precision = metric['precision'].compute(gt_pos=gt_item, pd_rank=reco_ind)
    recall = metric['recall'].compute(gt_pos=gt_item, pd_rank=reco_ind)
    ndcg = metric['ndcg'].compute(gt_pos=gt_item, pd_rank=reco_ind)
    auc_k = metric['auc_k'].compute(gt_pos=gt_item, pd_scores=pd_scores, pd_rank=reco_ind)
    auc = metric['auc'].compute(gt_pos=gt_item, pd_scores=pd_scores)
    hit = metric['hit'].compute(gt_pos=gt_item, pd_rank=reco_ind)
    map_k = metric['map_k'].compute(gt_pos=gt_item, pd_scores=pd_scores, pd_rank=reco_ind)
    
    return [precision, recall, ndcg, auc_k, auc, hit, map_k]

In [9]:
def get_results(metric_value, wandb_enable: False, model_name:str, project_name:str)->dict:
    precision_top = np.mean(np.array(list(metric_value.flat))[:,0])
    recall_top = np.mean(np.array(list(metric_value.flat))[:,1])
    ndcg_top = np.mean(np.array(list(metric_value.flat))[:,2])
    auc_top_k = np.mean(np.array(list(metric_value.flat))[:,3])
    auc = np.mean(np.array(list(metric_value.flat))[:,4])
    hit = np.mean(np.array(list(metric_value.flat))[:,5])
    map_k = np.mean(np.array(list(metric_value.flat))[:,6])
    print(f'Precision: {precision_top:.4f} \nRecall: {recall_top:.4f} \nNDCG: {ndcg_top:.4f} \nAUC_K: {auc_top_k:.4f} \
    \nAUC: {auc:.4f} \nHitRate: {hit:.4f} \nMAP_K: {map_k:.4f}')
    result_dict = {'Precision': precision_top, 'Recall': recall_top, 'NDCG': ndcg_top, 'AUC_K':auc_top_k, 'AUC':auc,\
                   'HitRate':hit, 'MAP_K':map_k}
    if wandb_enable:
        wandb_log(model_name, project_name, result_dict)

In [10]:
def wandb_log(model_name:str, project_name:str, result:dict):
    wandb.init(project=project_name,
          name = model_name,
           tags= ['jrs','cf'],
           config = dict(cfg.params)
          )
    wandb.log(result)

In [11]:
apps_true = df_test_ori[df_test_ori['userid'].isin([1472090])]


In [12]:
tqdm.pandas()
k=cfg.params.topk
metric={'precision':ranking.Precision(k=k),'recall':ranking.Recall(k=k), 'ndcg':ranking.NDCG(k=k), \
        'auc_k':ranking.AUC_K(k=k),'auc':ranking.AUC(), 'hit':ranking.HitRate(k=k), 'map_k':ranking.MAP_K(k=k)}

In [13]:
metric_result = df_test_ori.groupby('userid').progress_apply(cal_metrics, jobsid=jobsid, usersid=usersid, 
                            model=model_bpr, u_i_matrix=user_job_app, 
                           n=cfg.params.neg_test+1, metric=metric)
metric_value_b = metric_result.values

  0%|          | 0/74091 [00:00<?, ?it/s]

  ans = (L / rank).mean()


In [21]:
restult_dict = get_results(metric_value_b, wandb_enable=True,model_name='bpr', project_name=cfg.name.cf_name)

Error in callback <function _WandbInit._resume_backend at 0x13d3d8620> (for pre_run_cell):


Exception: The wandb backend process has shutdown

Precision: 0.0574 
Recall: 0.5737 
NDCG: 0.4582 
AUC_K: 0.4889     
AUC: 0.7659 
HitRate: 0.5737 
MAP_K: nan


Problem at: <ipython-input-10-24685eef0a15> 5 wandb_log


Traceback (most recent call last):
  File "/Users/hao/.pyenv/versions/3.6.15/envs/torch-cpu5/lib/python3.6/site-packages/wandb/sdk/wandb_init.py", line 999, in init
    run = wi.init()
  File "/Users/hao/.pyenv/versions/3.6.15/envs/torch-cpu5/lib/python3.6/site-packages/wandb/sdk/wandb_init.py", line 494, in init
    self._wl._global_run_stack[-1].finish()
  File "/Users/hao/.pyenv/versions/3.6.15/envs/torch-cpu5/lib/python3.6/site-packages/wandb/sdk/wandb_run.py", line 256, in wrapper
    return func(self, *args, **kwargs)
  File "/Users/hao/.pyenv/versions/3.6.15/envs/torch-cpu5/lib/python3.6/site-packages/wandb/sdk/wandb_run.py", line 222, in wrapper
    return func(self, *args, **kwargs)
  File "/Users/hao/.pyenv/versions/3.6.15/envs/torch-cpu5/lib/python3.6/site-packages/wandb/sdk/wandb_run.py", line 1678, in finish
    return self._finish(exit_code, quiet)
  File "/Users/hao/.pyenv/versions/3.6.15/envs/torch-cpu5/lib/python3.6/site-packages/wandb/sdk/wandb_run.py", line 1684, in 

Exception: problem

Error in callback <function _WandbInit._pause_backend at 0x13d3d8d90> (for post_run_cell):


Exception: The wandb backend process has shutdown

In [15]:
metric_result = df_test_ori.groupby('userid').progress_apply(cal_metrics, jobsid=jobsid, usersid=usersid, 
                            model=model_als, u_i_matrix=user_job_app, 
                           n=cfg.params.neg_test+1, metric=metric)
metric_value_a = metric_result.values

  0%|          | 0/74091 [00:00<?, ?it/s]

  ans = (L / rank).mean()


In [19]:
restult_dict = get_results(metric_value_a, wandb_enable=True,model_name='als', project_name=cfg.name.cf_name)

Precision: 0.0848 
Recall: 0.8477 
NDCG: 0.7233 
AUC_K: 0.7690     
AUC: 0.9213 
HitRate: 0.8477 
MAP_K: nan


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC_K,▁
HitRate,▁
NDCG,▁
Precision,▁
Recall,▁

0,1
AUC,0.76586
AUC_K,0.48895
HitRate,0.57369
MAP_K,
NDCG,0.45822
Precision,0.05737
Recall,0.57369


In [17]:
metric_result = df_test_ori.groupby('userid').progress_apply(cal_metrics, jobsid=jobsid, usersid=usersid, 
                            model=model_lmf, u_i_matrix=user_job_app, 
                           n=cfg.params.neg_test+1, metric=metric)
metric_value_l = metric_result.values

  0%|          | 0/74091 [00:00<?, ?it/s]

  ans = (L / rank).mean()


In [20]:
restult_dict = get_results(metric_value_l, wandb_enable=True,model_name='lmf', project_name=cfg.name.cf_name)

Precision: 0.0786 
Recall: 0.7861 
NDCG: 0.6728 
AUC_K: 0.7111     
AUC: 0.8925 
HitRate: 0.7861 
MAP_K: nan


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC_K,▁
HitRate,▁
NDCG,▁
Precision,▁
Recall,▁

0,1
AUC,0.92129
AUC_K,0.76901
HitRate,0.84771
MAP_K,
NDCG,0.7233
Precision,0.08477
Recall,0.84771


Precision: 0.0786 
Recall: 0.7861 
NDCG: 0.6728 
AUC_K: 0.7111     
AUC: 0.8925 
HitRate: 0.1283