Simnet Results
===

A copy of the ZachEval notebook with the simnet results.

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

import os
import json
import torch
import pickle
from datetime import datetime
import pytz
import dateutil
from dateutil.relativedelta import relativedelta
from tqdm import tqdm
from glob import glob

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# set up logging; only run this cell once
# (if the manual branch below is enabled, every re-run adds another handler to the root logger)
import logging
# When True, the manual configuration below is skipped; logging is instead configured
# by the later call to cbrec.logutils.set_up_logging().
use_cbrec_logging = True
if not use_cbrec_logging:
    # this is a demo of how to set up logging
    # since we use cbrec logging below, this will be done for us when we call set_up_logging.
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    # emit DEBUG-and-above records through a stream handler with a timestamped format
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    stream_handler.setFormatter(formatter)
    root.addHandler(stream_handler)

## Import cbrec

In [None]:
from pathlib import Path
# IPython shell escape: captures the output lines of `git rev-parse --show-toplevel`
git_root_dir = !git rev-parse --show-toplevel
# the first output line is used as the repository root path
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

In [None]:
import sys
sys.path.append(os.path.join(git_root_dir, 'src'))

In [None]:
import cbrec.genconfig

In [None]:
# create a config, which is needed by lots of the components for resolving paths, etc.
config = cbrec.genconfig.Config()

In [None]:
import cbrec.evaluation
import cbrec.reccontext
import cbrec.featuredb
import cbrec.torchmodel
import cbrec.utils
import cbrec.logutils
import cbrec.feature_loader
import cbrec.modeling
import cbrec.modeling.scorer
import cbrec.modeling.manager

In [None]:
cbrec.logutils.set_up_logging()

In [None]:
# turn off matplotlib logging
# which can be quite verbose and usually is not useful
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

## Load the eval data

In [None]:
output_dir = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/torch_experiments/modeling/simnet_20220608203014/outputs/"
assert os.path.exists(output_dir)

In [None]:
def identify_model_filepaths(model_dir):
    """Return the paths of every ``*.json`` file located directly in ``model_dir``.

    Args:
        model_dir: directory to search (not searched recursively).

    Returns:
        A non-empty list of matching file paths, in the order ``glob`` yields them.

    Raises:
        ValueError: if ``model_dir`` does not exist or contains no ``.json`` files.
    """
    log = logging.getLogger("cbrec.modeling.submitEvalFromDirectory.identify_model_filepaths")
    if not os.path.exists(model_dir):
        raise ValueError(f"Dir '{model_dir}' does not exist.")

    json_pattern = os.path.join(model_dir, '*.json')
    found = list(glob(json_pattern))
    if not found:
        raise ValueError(f"No .json files in dir '{model_dir}'.")

    log.info(f"Identified {len(found)} model filepaths in dir {model_dir}.")
    return found


class ModelEval:
    """Collect saved models from an output directory and gather their evaluation artifacts.

    Results accumulate in ``self.models``, a dict keyed by each model's
    ``model_config.output_name``. The methods below add the keys
    ``'train_metrics'``, ``'test_metrics'`` and ``'coverage_scores'`` to the
    per-model dicts.

    Typical use: construct, call ``create_managers()``, then
    ``create_test_metrics()`` and/or ``get_scores()``.
    """

    def __init__(self, model_output_dir):
        """Locate the model ``.json`` files in ``model_output_dir``.

        Raises:
            ValueError: if the directory does not exist or holds no ``.json`` files.
        """
        self.logger = logging.getLogger('eval.ModelEval')
        self.model_output_dir = model_output_dir
        self.model_filepaths = self.identify_model_filepaths()

        self.models = {}
        # Filled by create_managers(); initialised here so the other methods do
        # nothing (instead of raising AttributeError) when called before it.
        self.managers = []

    def identify_model_filepaths(self):
        """Return the paths of the ``*.json`` files directly inside ``self.model_output_dir``.

        Raises:
            ValueError: if the directory does not exist or contains no ``.json`` files.
        """
        if not os.path.exists(self.model_output_dir):
            raise ValueError(f"Dir '{self.model_output_dir}' does not exist.")
        model_filepaths = list(glob(os.path.join(self.model_output_dir, '*.json')))
        if not model_filepaths:
            raise ValueError(f"No .json files in dir '{self.model_output_dir}'.")
        self.logger.info(f"Identified {len(model_filepaths)} model filepaths in dir {self.model_output_dir}.")
        return model_filepaths

    def create_managers(self):
        """Load one ``ModelManager`` per discovered file and register it in ``self.models``.

        Replaces ``self.managers`` and resets each loaded model's entry in
        ``self.models`` to an empty dict.
        """
        self.managers = []
        for model_filepath in self.model_filepaths:
            manager = cbrec.modeling.manager.ModelManager.load_from_filepath(model_filepath)
            self.managers.append(manager)

            self.models[manager.model_config.output_name] = {}

    def create_test_metrics(self):
        """Store each manager's training and test metrics in ``self.models``.

        Each manager is loaded with only ``load_training_metrics`` enabled
        (no preprocessor, no model state dict).
        """
        for manager in self.managers:
            manager.load_model(load_preprocessor=False, load_model_state_dict=False, load_training_metrics=True)
            model_entry = self.models[manager.model_config.output_name]
            model_entry['train_metrics'] = manager.model_trainer.train_metrics
            model_entry['test_metrics'] = manager.model_trainer.test_metrics

    def get_scores(self):
        """Load each model's pickled coverage scores into ``self.models[...]['coverage_scores']``.

        The file is expected at
        ``<output_dir>/<experiment_name>_<output_name>_coverage_scores.pkl``.
        """
        for manager in self.managers:
            model_config = manager.model_config
            scores_filepath = os.path.join(
                model_config.output_dir,
                f'{model_config.experiment_name}_{model_config.output_name}_coverage_scores.pkl',
            )
            assert os.path.exists(scores_filepath), scores_filepath

            # NOTE: pickle.load can execute arbitrary code; only point this at
            # score files produced by a trusted pipeline.
            with open(scores_filepath, 'rb') as scores_infile:
                scores = pickle.load(scores_infile)
            self.models[model_config.output_name]['coverage_scores'] = scores

In [None]:
ev = ModelEval(output_dir)
ev.create_managers()
ev.create_test_metrics()

In [None]:
# final validation loss and accuracy after training for all loaded models
# NOTE(review): assumes test_metrics is a 2-D array with loss in row 1 and accuracy in row 2,
# with the last column holding the final evaluation — confirm against the model trainer.
print(f"{'Model':>35} ValLoss ValAcc")
print("="*60)
for model_name in ev.models.keys():
    final_validation_loss = ev.models[model_name]['test_metrics'][1, -1]
    final_validation_acc = ev.models[model_name]['test_metrics'][2, -1]
    print(f"{model_name:>35}  {final_validation_loss:.4f} {final_validation_acc:.2%}")

In [None]:
ev.get_scores()

In [None]:
ev.models['simnet_all']['coverage_scores'][0]

In [None]:
# Cutoff separating validation from test data, in milliseconds since the epoch.
# NOTE(review): strptime returns a naive datetime, so .timestamp() interprets it in the
# machine's local timezone — confirm this matches how md['timestamp'] was produced.
VALIDATION_END_TIMESTAMP = datetime.strptime("2021-07-01", "%Y-%m-%d").timestamp() * 1000
# keep only the 'test' and 'predict' metadata entries
md_list = [md for md in cbrec.utils.stream_metadata_list(config.metadata_filepath) if md['type'] == 'test' or md['type'] == 'predict']
# test set: entries that have a target and fall strictly after the validation cutoff
test_md_list = [md for md in md_list if md['has_target'] and md['timestamp'] > VALIDATION_END_TIMESTAMP]
len(test_md_list)

In [None]:
test_metadata_ids = set([md['metadata_id'] for md in test_md_list])
len(test_metadata_ids)

In [None]:
# For each loaded model, read its per-context test metadata and compute ranking metrics
# (MRR, hit rate @1 and @5) over the contexts in test_metadata_ids.
save_ranks = True  # also keep the raw target-rank array alongside the summary metrics
for manager in ev.managers:
    metadata_filepath = os.path.join(manager.model_config.output_dir, f'{manager.model_config.experiment_name}_{manager.model_config.output_name}_test_metadata.ndjson')
    assert os.path.exists(metadata_filepath)
    target_ranks = []
    with open(metadata_filepath, 'r') as metadata_file:
        # `total` only sizes the progress bar;
        # NOTE(review): the + 1000 presumably accounts for the coverage entries in the file — confirm.
        for line in tqdm(metadata_file, total=len(test_md_list) + 1000, desc=f'Reading metrics {manager.model_config.output_name}'):
            md = json.loads(line)
            # skip entries outside the selected test set
            if md['metadata_id'] not in test_metadata_ids:
                continue
            metrics = md[manager.model_config.output_name + "_metrics"]
            target_rank = metrics['target_rank']
            target_ranks.append(target_rank)
            
    target_ranks = np.array(target_ranks)
    # mrr is a fraction; hr1 and hr5 are percentages (scaled by 100)
    # NOTE(review): the == 1 test for hr1 implies ranks are 1-based — confirm.
    mrr = (1 / target_ranks).mean()
    hr1 = (target_ranks == 1).sum() / len(target_ranks) * 100
    hr5 = (target_ranks <= 5).sum() / len(target_ranks) * 100
    ev.models[manager.model_config.output_name]['metrics'] = {
        'mrr': mrr,
        'hr1': hr1,
        'hr5': hr5,
    }
    if save_ranks:
        ev.models[manager.model_config.output_name]['metrics']['ranks'] = target_ranks

In [None]:
pd.DataFrame([{'model_name': model_name, **ev.models[model_name]['metrics']} for model_name in ev.models.keys()])

### Sidebar: Create the needed coverage data

Coverage is computed over the sites that were eligible for recommendation at the time the coverage recommendations were generated.

In [None]:
class CoverageHelper:
    """Bare attribute container for coverage-evaluation data.

    Instances start with no attributes; callers attach whatever they need
    after construction.
    """

    def __init__(self):
        """Create an empty helper; nothing is initialised."""

In [None]:
md_list = [md for md in cbrec.utils.stream_metadata_list(config.metadata_filepath) if md['type'] == 'test' or md['type'] == 'predict']
len(md_list)

In [None]:
coverage_md_list = [md for md in md_list if not md['has_target']]
len(coverage_md_list)

In [None]:
def load_coverage_predictions(config, coverage_md_list):
    """Build one ``RecContext`` per coverage metadata entry.

    Args:
        config: cbrec config; its ``feature_db_filepath`` locates the feature database.
        coverage_md_list: metadata dicts, each with a ``'metadata_id'`` key.

    Returns:
        A list of ``RecContext`` objects in the same order as ``coverage_md_list``.
    """
    feature_db = cbrec.featuredb.get_db_by_filepath(config.feature_db_filepath)
    with feature_db:
        rec_contexts = []
        # look up the stored test context for each entry, then wrap it in a RecContext
        for md in tqdm(coverage_md_list, desc="Loading coverage data"):
            stored_context = cbrec.featuredb.get_test_context_by_metadata_id(
                feature_db, md['metadata_id'], config
            )
            rec_contexts.append(
                cbrec.reccontext.RecContext.create_from_test_context(config, md, stored_context)
            )
    return rec_contexts


# Populate cov_helper with the site information derived from the coverage contexts.
cov_helper = CoverageHelper()
    
coverage_rcs = load_coverage_predictions(config, coverage_md_list)
assert len(coverage_rcs) == 1000  # exactly 1000 coverage contexts are expected

# union of the values in column 1 of every context's candidate array
# NOTE(review): presumably column 1 of candidate_usp_arr is the site_id — confirm.
coverage_sites = set()
for coverage_rc in coverage_rcs:
    coverage_sites.update(set(coverage_rc.candidate_usp_arr[:,1]))
coverage_sites = sorted(list(coverage_sites))
print(f"# eligible coverage sites: {len(coverage_sites)}")
cov_helper.coverage_sites = coverage_sites

# same sites as coverage_sites, as a set for fast membership and set arithmetic
eligible_sites = set(coverage_sites)
len(eligible_sites)  # no visible effect: only a cell's last expression is displayed
cov_helper.eligible_sites = eligible_sites

# per-context sorted unique site ids (np.unique sorts; the returned index array is discarded)
site_id_arr_map = {}
for coverage_rc in coverage_rcs:
    site_id_arr, _ = np.unique(coverage_rc.candidate_usp_arr[:,1], return_index=True)
    assert len(site_id_arr) <= len(coverage_sites)
    site_id_arr_map[coverage_rc.metadata_id] = site_id_arr
cov_helper.site_id_arr_map = site_id_arr_map
# after the loop, coverage_rc stays bound to the last context; the timestamp check in a later cell uses it

In [None]:
cov_helper.timestamp = 1609502404437  # this is the timestamp when recommendations were generated for coverage
# coverage_rc is the loop variable left over from iterating coverage_rcs, i.e. the last coverage context.
# NOTE(review): only that one context is checked — presumably all coverage contexts share this timestamp.
assert cov_helper.timestamp == coverage_rc.timestamp
# the timestamp is divided by 1000 (milliseconds to seconds) and displayed as an ISO-8601 UTC string
datetime.utcfromtimestamp(cov_helper.timestamp/1000).isoformat()

In [None]:
# load the journal metadata
s = datetime.now()
journal_metadata_dir = "/home/lana/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.feather")
journal_df = pd.read_feather(journal_metadata_filepath)
print(datetime.now() - s)
len(journal_df)

In [None]:
# read interactions dataframe
s = datetime.now()
model_data_dir = '/home/lana/shared/caringbridge/data/projects/recsys-peer-match/model_data'
ints_df = pd.read_feather(os.path.join(model_data_dir, 'ints_df.feather'))
print(f"Read {len(ints_df)} rows ({len(set(ints_df.user_id))} unique users) in {datetime.now() - s}.")
ints_df.head()

In [None]:
# Set of (user_id, site_id) pairs that appear in the journal metadata.
# Zipping the two columns yields the same tuples as itertuples() without
# building a namedtuple over every column of every row.
author_usp_set = set(zip(journal_df.user_id, journal_df.site_id))
len(author_usp_set)

In [None]:
inits_df = ints_df.sort_values(by='created_at').drop_duplicates(subset=['user_id', 'site_id'], keep='first').copy()
len(inits_df)

In [None]:
# (user_id, site_id) tuple per row; zip over the two columns avoids the per-row
# namedtuple construction that itertuples() performs across all columns.
inits_df['usp'] = list(zip(inits_df.user_id, inits_df.site_id))

In [None]:
inits_df = inits_df[~inits_df.usp.isin(author_usp_set)]
len(inits_df)

In [None]:
inits_df = inits_df[inits_df.created_at < cov_helper.timestamp]
len(inits_df)

In [None]:
previous_int_site_ids = set(inits_df.site_id)
len(previous_int_site_ids)

In [None]:
sites_with_previous_ints = previous_int_site_ids & cov_helper.eligible_sites
len(sites_with_previous_ints)

In [None]:
cov_helper.sites_with_previous_ints = sites_with_previous_ints

In [None]:
print(f"num eligible sites with > 0 indegree: {len(sites_with_previous_ints)}")
print(f"num eligible sites: {len(eligible_sites)}")
print(f"pct > 0 indegree: {len(sites_with_previous_ints) / len(eligible_sites):.3%}")
print(f"pct zero indegree: {1 - (len(sites_with_previous_ints) / len(eligible_sites)):.3%}")

In [None]:
# One row per site: the journal with the earliest published_at.
# Rows with a missing or non-positive published_at are dropped before sorting.
sdf = journal_df[(journal_df.published_at.notna())&(journal_df.published_at > 0)].sort_values(by='published_at').drop_duplicates(subset='site_id', keep='first')
len(sdf)

In [None]:
# Map each eligible site_id to the created_at value of its row in sdf.
# NOTE(review): sdf rows were selected by earliest published_at, but the value stored here is
# created_at — confirm the two are interchangeable for this purpose.
site_first_journal_timestamp_map = sdf[sdf.site_id.isin(eligible_sites)].set_index('site_id').created_at.to_dict()
len(site_first_journal_timestamp_map)

In [None]:
# Age of every coverage site at cov_helper.timestamp, measured from its mapped first-journal timestamp.
# Raises KeyError if a coverage site has no entry in site_first_journal_timestamp_map.
ages = np.array([cov_helper.timestamp - site_first_journal_timestamp_map[site_id] for site_id in coverage_sites])
ages = ages / 1000 / 60 / 60 / 24 / 7  # convert to weeks
len(ages)

In [None]:
# median eligible site has been around for 93 weeks
# displayed tuple order: (min, mean, std, median, max) of the site ages
ages.min(), ages.mean(), ages.std(), np.median(ages), ages.max()

In [None]:
cov_helper.site_first_journal_timestamp_map = site_first_journal_timestamp_map

In [None]:
cov_helper.n = 5  # number of recs to make in each batch

In [None]:
# save cov_helper to pickle
# NOTE: pickle stores the CoverageHelper class by reference (module and name), so loading this
# file later requires a CoverageHelper class to be defined in the loading session.
coverage_dir = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/feature_data/coverage"
with open(os.path.join(coverage_dir, 'cov_helper.pkl'), 'wb') as coverage_helper_file:
    pickle.dump(cov_helper, coverage_helper_file)
print("Finished.")

## Make coverage predictions

In [None]:
# load cov_helper from pickle
# NOTE: pickle.load can execute arbitrary code; only load a file written by this notebook.
# The CoverageHelper class must already be defined in this session for unpickling to succeed.
coverage_dir = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/feature_data/coverage"
with open(os.path.join(coverage_dir, 'cov_helper.pkl'), 'rb') as coverage_helper_file:
    cov_helper = pickle.load(coverage_helper_file)
# list the attributes that were restored
cov_helper.__dict__.keys()

In [None]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def compute_coverage_metrics(model_coverage_scores, cov_helper):
    """Summarise which sites a model recommends across the coverage contexts.

    For every scored context the ``cov_helper.n`` highest-scoring sites form one
    recommendation batch. The batches are then compared against the eligible
    sites, the sites with previous interactions, and site ages.

    Args:
        model_coverage_scores: iterable of dicts with keys ``'metadata_id'`` and
            ``'scores'`` (an array with the same shape as the matching entry of
            ``cov_helper.site_id_arr_map``).
        cov_helper: object providing ``n``, ``timestamp``, ``site_id_arr_map``,
            ``eligible_sites``, ``sites_with_previous_ints`` and
            ``site_first_journal_timestamp_map``.

    Returns:
        A dict of coverage metrics. Any ratio whose denominator is zero (for
        example when there are no recommendations, or every eligible site was
        recommended) is NaN instead of raising ``ZeroDivisionError``.
    """
    recs = []
    for scores_md in model_coverage_scores:
        metadata_id = scores_md['metadata_id']
        y_score_site = scores_md['scores']
        site_id_arr = cov_helper.site_id_arr_map[metadata_id]
        assert y_score_site.shape == site_id_arr.shape

        # Ascending sort, so the last n indices are the n highest scores.
        # A stable sort keeps the selection deterministic when scores tie
        # (among equal scores, the later positions are selected).
        sort_inds = np.argsort(y_score_site, kind='stable')
        highest_score_site_ids = site_id_arr[sort_inds[-cov_helper.n:]]
        recs.append(list(highest_score_site_ids))

    recced_sites = set()
    for rec in recs:
        recced_sites.update(rec)
    # read the eligible sites from the helper, not from a notebook-level global
    eligible_sites = cov_helper.eligible_sites
    nonrecced_sites = eligible_sites - recced_sites
    sites_with_previous_ints = cov_helper.sites_with_previous_ints

    recced_inted = _safe_ratio(len(recced_sites & sites_with_previous_ints), len(recced_sites))
    nonrecced_inted = _safe_ratio(len(nonrecced_sites & sites_with_previous_ints), len(nonrecced_sites))

    min_ages = []
    median_ages = []
    for rec in recs:
        ages = np.array([cov_helper.timestamp - cov_helper.site_first_journal_timestamp_map[site_id] for site_id in rec])
        ages = ages / 1000 / 60 / 60 / 24 / 7  # convert to weeks
        assert np.all(ages > 0)
        min_ages.append(ages.min())
        median_ages.append(np.median(ages))
    mean_min_age = np.mean(min_ages) if min_ages else float('nan')
    mean_median_age = np.mean(median_ages) if median_ages else float('nan')

    # Total number of recommendations actually made (n per context when every
    # context has at least n candidate sites), instead of a hard-coded 5 * 1000.
    n_total_recs = sum(len(rec) for rec in recs)

    return {
        'n_recced_sites': len(recced_sites),
        'n_nonrecced_sites': len(nonrecced_sites),
        'pct_eligible_recced': _safe_ratio(len(recced_sites), len(eligible_sites)),
        'pct_unique_recs': _safe_ratio(len(recced_sites), n_total_recs),
        'pct_recced_with_int': recced_inted,
        'pct_nonrecced_with_int': nonrecced_inted,
        'pct_recced_without_int': 1 - recced_inted,
        'pct_nonrecced_without_int': 1 - nonrecced_inted,
        'ratio_int': _safe_ratio(recced_inted, nonrecced_inted),
        'ratio_noint': _safe_ratio(1 - recced_inted, 1 - nonrecced_inted),
        'mean_min_age': mean_min_age,
        'mean_median_age': mean_median_age,
    }

In [None]:
# Compute the coverage metrics for every loaded model and show them as one table (one row per model).
# Requires ev.get_scores() to have run, since it fills the 'coverage_scores' entries read here.
coverage_metrics_list = []
for model_name in ev.models.keys():
    model_coverage_scores = ev.models[model_name]['coverage_scores']
    coverage_metrics = compute_coverage_metrics(model_coverage_scores, cov_helper)
    coverage_metrics_list.append({
        'model_name': model_name,
        **coverage_metrics
    })
pd.DataFrame(coverage_metrics_list)