Baseline Compute
===

Code to compute test and coverage metrics for the baselines.

Note this approach assumes that the baselines have no hyperparameters to tune.

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

In [None]:
import os
import re
import json
import sys
import pickle
from tqdm import tqdm

import sklearn
import sklearn.linear_model
import sklearn.preprocessing
from sklearn.pipeline import Pipeline

import dateutil.parser
from dateutil.relativedelta import relativedelta
from datetime import datetime, timedelta
import pytz

In [None]:
# HuggingFace packages
import transformers
import tokenizers
import torch

# more torch imports
import torchvision
import torchvision.transforms
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# evaluation
from scipy.stats import rankdata

In [None]:
from pathlib import Path
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

In [None]:
import sys
sys.path.append(os.path.join(git_root_dir, 'src'))
import cbrec.genconfig

In [None]:
config = cbrec.genconfig.Config()
#config.metadata_filepath += "_old"
#config.feature_db_filepath += "_old"

In [None]:
import cbrec.featuredb
import cbrec.utils
import cbrec.data
import cbrec.reccontext
import cbrec.evaluation
import cbrec.torchmodel
import cbrec.text.embeddingdb
import cbrec.text.journalid

In [None]:
import cbrec.logutils
cbrec.logutils.set_up_logging()

In [None]:
# turn off matplotlib logging
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

In [None]:
import sys
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
sys.path.append(caringbridge_core_path)
import cbcore.data.paths

## Load data

In [None]:
# load the journal metadata
s = datetime.now()
journal_metadata_dir = "/home/lana/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.feather")
journal_df = pd.read_feather(journal_metadata_filepath)
print(datetime.now() - s)
len(journal_df)

In [None]:
# read interactions dataframe
s = datetime.now()
model_data_dir = '/home/lana/shared/caringbridge/data/projects/recsys-peer-match/model_data'
ints_df = pd.read_feather(os.path.join(model_data_dir, 'ints_df.feather'))
print(f"Read {len(ints_df)} rows ({len(set(ints_df.user_id))} unique users) in {datetime.now() - s}.")
ints_df.head()

In [None]:
# keep only the earliest interaction (by created_at) for each (user_id, site_id) pair
inits_df = (
    ints_df
    .sort_values(by='created_at')
    .drop_duplicates(subset=['user_id', 'site_id'], keep='first')
    .copy()
)
len(inits_df)

In [None]:
# a "usp" is the (user_id, site_id) tuple for the row
inits_df['usp'] = list(zip(inits_df.user_id, inits_df.site_id))

In [None]:
# every (user_id, site_id) pair that appears in the journal metadata
author_usp_set = set(zip(journal_df.user_id, journal_df.site_id))
len(author_usp_set)

In [None]:
inits_df = inits_df[~inits_df.usp.isin(author_usp_set)]
len(inits_df)

## Create fast look-ups

In [None]:
usp_journal_timestamp_map = {}

# Only journals with a positive published_at are indexed. Sorting by (user_id, site_id, published_at)
# puts each USP's rows next to each other with timestamps ascending, so one pass can build the lists.
published_journal_df = journal_df[journal_df.published_at > 0].sort_values(by=['user_id', 'site_id', 'published_at'])

current_usp = None
current_timestamp_list = []
# total is the number of rows actually iterated (the filtered frame), so the progress bar can reach 100%
for row in tqdm(published_journal_df.itertuples(), total=len(published_journal_df), desc="JournalIdLookup map construction"):
    usp = (row.user_id, row.site_id)
    if usp != current_usp:
        # first row of a new USP: start a fresh list and register it in the map
        current_usp = usp
        current_timestamp_list = []
        usp_journal_timestamp_map[usp] = current_timestamp_list
    current_timestamp_list.append(row.published_at)
# report the number of journals that were actually translated, not the unfiltered total
logging.info(f"Translated {len(published_journal_df)} journals into a map of {len(usp_journal_timestamp_map)} USPs.")

In [None]:
def get_journal_data(usps, timestamp):
    """
    Build one journal-data dict per USP.

    :usps -- iterable of usp keys; only the number of entries matters at present
    :timestamp -- currently unused

    :returns
        journal_data_list -- list with one dict per entry of usps, each holding
            'n_recent_journals' (always 0 for now: the count is a placeholder that is not yet computed)
    """
    # each entry gets its own dict so callers can fill them in independently
    return [{'n_recent_journals': 0} for _ in usps]

def get_journal_updates_before(self, usp, timestamp):
    """
    Return the most recent journal ids for usp that were published at or before timestamp.

    :self -- object providing usp_journal_timestamp_map, usp_journal_id_map, and config.journal_update_memory
    :usp -- key into the two maps
    :timestamp -- upper bound (inclusive) on the publication timestamp

    :returns
        list of at most config.journal_update_memory journal ids, or [] if usp has no entry

    Relies on the timestamp list for usp being sorted ascending and index-aligned with the journal id list.
    """
    import bisect  # local import: bisect is not imported at the top of this notebook

    if usp not in self.usp_journal_timestamp_map:
        return []
    timestamp_list = self.usp_journal_timestamp_map[usp]
    # bisect_right puts end_ind just past the last entry <= timestamp; it is 0 (never None) when nothing qualifies
    end_ind = bisect.bisect_right(timestamp_list, timestamp)
    start_ind = max(end_ind - self.config.journal_update_memory, 0)
    journal_id_list = self.usp_journal_id_map[usp]
    return journal_id_list[start_ind:end_ind]

In [None]:
#self.compute_metrics(y_score_site_count, 'MostInitiatedWithRecently')
#self.compute_metrics(y_score_site_recent, 'MostRecentlyInitiatedWith')

In [None]:
import cbrec.recentActivityCounter
initiation_counter = cbrec.recentActivityCounter.RecentActivityCounter(config.activity_count_duration_ms)

In [None]:
# materialize the full metadata stream
md_list = list(cbrec.utils.stream_metadata_list(config.metadata_filepath))
len(md_list)

In [None]:
# print the first metadata entry that is neither initiation-eligible nor a self-initiation (c ends as 0 or 1)
c = 0
for md in md_list:
    if md['is_initiation_eligible'] or md['is_self_initiation']:
        continue
    c += 1
    print(md)
    break
c

In [None]:
# count the initiation-eligible metadata entries
c = 0
for md in md_list:
    c += 1 if md['is_initiation_eligible'] else 0
c

## Compute baselines

Score all test_contexts (including test and predict contexts) with the baselines.

In [None]:
try:
    import cbrec
except ImportError:
    # cbrec is not importable from the current path; add the source checkout so the imports that follow resolve.
    # Only ImportError is handled so that unrelated failures (e.g. KeyboardInterrupt) are not swallowed.
    sys.path.append("/home/lana/levon003/repos/recsys-peer-match/src")

import cbrec.featuredb
import cbrec.genconfig
import cbrec.utils
import cbrec.evaluation
import cbrec.reccontext
import cbrec.recentActivityCounter
import cbrec.modeling.text_loader
import cbrec.modeling.reccontext_builder
import cbrec.modeling.scorer
import cbrec.modeling.manager

In [None]:
# metadata for the 'test' and 'predict' contexts, plus a lookup keyed by metadata_id
test_md_list = [
    md
    for md in cbrec.utils.stream_metadata_list(config.metadata_filepath)
    if md['type'] in ('test', 'predict')
]
test_md_map = {md['metadata_id']: md for md in test_md_list}
len(test_md_map)

In [None]:
# all metadata entries (every type)
md_list = list(cbrec.utils.stream_metadata_list(config.metadata_filepath))
print(f"Tracking initiations from {len(md_list)} captured initiations.")

In [None]:
# Validation cutoff as milliseconds since the epoch (2021-07-01 00:00).
# NOTE(review): strptime returns a naive datetime, so .timestamp() interprets it in the machine's
# local timezone -- confirm this matches how the metadata timestamps were produced.
VALIDATION_END_TIMESTAMP = datetime.strptime("2021-07-01", "%Y-%m-%d").timestamp() * 1000

def compute_scores(rc, scorer, rac):
    """
    Score one recommendation context with every baseline and record the metrics on the scorer.

    :rc -- context exposing candidate_usp_arr, candidate_usp_mat, and user_pair_mat
    :scorer -- provides get_empty_score_arr, merge_multisource_rankings, reduce_usp_ranking_to_site, and compute_metrics
    :rac -- activity counter handed to get_scores_from_site_counter

    Nothing is returned; each baseline's scores are passed to scorer.compute_metrics under the baseline's name.
    """
    # --- baseline scored per source/candidate usp pair ---
    # NaiveNetwork: one column per source usp, holding the sum of the first three pair features of each candidate
    pair_score_mat = scorer.get_empty_score_arr('full')
    n_candidates = len(rc.candidate_usp_arr)
    for source_ind in range(pair_score_mat.shape[1]):
        first_row = source_ind * n_candidates
        pair_feats = rc.user_pair_mat[first_row:first_row + n_candidates, 0:3]
        pair_score_mat[:, source_ind] = pair_feats.sum(axis=1)
    merged_scores = scorer.merge_multisource_rankings(pair_score_mat)
    scorer.compute_metrics(scorer.reduce_usp_ranking_to_site(merged_scores), 'NaiveNetwork')

    # --- baselines scored per candidate usp ---
    candidate_feats = rc.candidate_usp_mat
    merged_template = scorer.get_empty_score_arr('merged')
    assert merged_template.shape == candidate_feats[:, 0].shape

    # NewestAuthor: column 11 (time to first update); the magnitude is negated
    first_update_scores = np.abs(candidate_feats[:, 11].copy()) * -1
    scorer.compute_metrics(scorer.reduce_usp_ranking_to_site(first_update_scores), "NewestAuthor")

    # MostJournalsRecently: column 3 (n_recent journal), used as-is
    journal_count_scores = candidate_feats[:, 3].copy()
    scorer.compute_metrics(scorer.reduce_usp_ranking_to_site(journal_count_scores), "MostJournalsRecently")

    # MostRecentJournal: column 4 (time_to_most_recent journal); zeros are first replaced by (max + 1)
    # so that after negation they score lowest
    journal_recency_scores = candidate_feats[:, 4].copy()
    journal_recency_scores[journal_recency_scores == 0] = journal_recency_scores.max() + 1
    journal_recency_scores = np.abs(journal_recency_scores) * -1
    scorer.compute_metrics(scorer.reduce_usp_ranking_to_site(journal_recency_scores), "MostRecentJournal")

    # MostInteractiveAuthorRecently: columns 5, 7, 9 (n_recent amp + comment + guestbook)
    interaction_scores = candidate_feats[:, [5, 7, 9]].sum(axis=1)
    scorer.compute_metrics(scorer.reduce_usp_ranking_to_site(interaction_scores), "MostInteractiveAuthorRecently")

    # Not computed here: a "ClosestToStart" baseline and a global "MostInitiatedWith" counter baseline.

    # --- baselines scored per site ---
    site_count_scores, site_recent_scores = get_scores_from_site_counter(scorer, rac)
    scorer.compute_metrics(site_count_scores, 'MostInitiatedWithRecently')
    scorer.compute_metrics(site_recent_scores, 'MostRecentlyInitiatedWith')

    # Random: uniform draws in the shape of the reduced (per-site) score array
    random_shape = scorer.get_empty_score_arr('reduced').shape
    scorer.compute_metrics(config.rng.uniform(0, 1, size=random_shape), 'Random')


def get_scores_from_site_counter(scorer, rac):
    """
    Generate y_score_site arrays from the given RecentActivityCounter, which is assumed to be tracking site_ids.

    :scorer -- provides get_empty_score_arr, site_id_arr, test_context.timestamp, and config.ms_per_hour
    :rac -- recentActivityCounter.RecentActivityCounter

    :returns
        y_score_site_count -- count of recent activity
        y_score_site_recent -- negated number of hours between the site's most recent activity and the
            context timestamp, so that more recent activity scores higher
    """
    y_score_site_count = scorer.get_empty_score_arr('reduced')
    y_score_site_recent = scorer.get_empty_score_arr('reduced')
    context_timestamp = scorer.test_context.timestamp
    ms_per_hour = scorer.config.ms_per_hour
    # sites without recent activity are scored as if their last activity happened at timestamp 0
    no_recent_score = context_timestamp / ms_per_hour
    for i, site_id in enumerate(scorer.site_id_arr):
        n_recent = rac.get_count(site_id)
        most_recent = rac.get_most_recent_activity(site_id) if n_recent > 0 else None
        if most_recent is None:
            # no recent activity, or the counter has a count but no timestamp: fall back to the worst score
            time_to_most_recent = no_recent_score
        else:
            # convert the difference from ms to hours
            time_to_most_recent = (context_timestamp - most_recent) / ms_per_hour

        y_score_site_count[i] = n_recent
        y_score_site_recent[i] = time_to_most_recent
    # invert so that the highest possible value is 0 (activity exactly at the context timestamp)
    y_score_site_recent *= -1
    return y_score_site_count, y_score_site_recent
    
config = cbrec.genconfig.Config()

# Stream every test context, score it with the baselines, and write the results:
#  - contexts with a target -> one JSON line of metrics per context in baseline_metrics.ndjson
#  - contexts without a target (prediction targets) -> raw scores, pickled in batches of 1000
db = cbrec.featuredb.get_db_by_filepath(config.feature_db_filepath)
with db:
    baseline_test_filepath = os.path.join(config.feature_data_dir, 'baseline_metrics.ndjson')
    baseline_scores_filepath = os.path.join(config.feature_data_dir, 'baseline_coverage_scores.pkl')
    scores = []
    with open(baseline_test_filepath, 'w') as metrics_outfile, open(baseline_scores_filepath, 'wb') as scores_outfile:
        curr_timestamp = 0
        md_list_counter = 0
        initiation_counter = cbrec.recentActivityCounter.RecentActivityCounter(config.activity_count_duration_ms)
        
        for test_context in tqdm(cbrec.featuredb.stream_test_contexts(db, config), desc='Streaming test contexts', total=len(test_md_map)):
            test_context_md = test_md_map[test_context['metadata_id']]
            if test_context_md['timestamp'] > curr_timestamp:
                curr_timestamp = test_context_md['timestamp']
                # feed the counter every initiation strictly before the current context's timestamp
                while md_list_counter < len(md_list) and md_list[md_list_counter]['timestamp'] < curr_timestamp:
                    md = md_list[md_list_counter]
                    if md['has_target'] and not md['is_self_initiation']:
                        initiation_counter.add_interaction(md['target_site_id'], md['timestamp'])
                    md_list_counter += 1

            rc = cbrec.reccontext.RecContext.create_from_test_context(config, test_context_md, test_context)
            has_target = rc.md['has_target']

            # Compare the *current context's* timestamp against the cutoff. The loop variable `md` from the
            # counter update above refers to an earlier metadata entry and must not be used here.
            if has_target and test_context_md['timestamp'] <= VALIDATION_END_TIMESTAMP:
                continue  # don't process validation timestamps
            
            save_scores = not has_target  # save scores if this is a prediction target
            scorer = cbrec.evaluation.Scorer(config, rc, save_scores=save_scores)

            if has_target:
                initiation_counter.update_counts(curr_timestamp)
            compute_scores(rc, scorer, initiation_counter)

            rc.md['metrics'] = scorer.metrics_dict
            if save_scores:
                rc.md['scores'] = scorer.scores_dict
                scores.append(rc.md)
                if len(scores) == 1000:
                    pickle.dump(scores, scores_outfile)
                    logging.info(f"Saved pickle with {len(scores)} scores.")
                    scores = []
            else:
                line = json.dumps(rc.md) + "\n"
                metrics_outfile.write(line)

        # Write any final partial batch so its scores are not silently dropped. Each dump is a separate
        # pickle record in the file; a reader needs one pickle.load call per batch.
        if scores:
            pickle.dump(scores, scores_outfile)
            logging.info(f"Saved pickle with {len(scores)} scores.")
            scores = []

In [None]:
!head /home/lana/shared/caringbridge/data/projects/recsys-peer-match/feature_data/baseline_metrics.ndjson

In [None]:
md_list[md_list_counter]

In [None]:
VALIDATION_END_TIMESTAMP = datetime.strptime("2021-07-01", "%Y-%m-%d").timestamp() * 1000
# restrict to 'test' and 'predict' metadata ...
md_list = [
    md
    for md in cbrec.utils.stream_metadata_list(config.metadata_filepath)
    if md['type'] in ('test', 'predict')
]
# ... then to entries that have a target and fall after the validation cutoff
test_md_list = [
    md
    for md in md_list
    if md['has_target'] and md['timestamp'] > VALIDATION_END_TIMESTAMP
]
print(len(test_md_list))
test_metadata_ids = {md['metadata_id'] for md in test_md_list}
len(test_metadata_ids)

In [None]:
from collections import defaultdict

# collect, per baseline, the rank assigned to the target in each selected test context
target_ranks = defaultdict(list)
baseline_test_filepath = os.path.join(config.feature_data_dir, 'baseline_metrics.ndjson')
with open(baseline_test_filepath, 'r') as metadata_file:
    for line in tqdm(metadata_file, total=len(test_md_list), desc='Reading baseline metrics', disable=False):
        md = json.loads(line)
        if md['metadata_id'] in test_metadata_ids:
            for model_name, metrics in md["metrics"].items():
                target_ranks[model_name].append(metrics['target_rank'])
assert len(target_ranks) > 0
print(target_ranks.keys())
len(target_ranks)

In [None]:
# summarize each baseline's target ranks: mean reciprocal rank and hit rate (in percent) at 1 and 5
eval_rows = []
for model_name, rank_list in target_ranks.items():
    ranks = np.array(rank_list)
    n_ranks = len(ranks)
    eval_rows.append({
        'model': model_name,
        'n': n_ranks,
        'mrr': (1 / ranks).mean(),
        'hr1': (ranks == 1).sum() / n_ranks * 100,
        'hr5': (ranks <= 5).sum() / n_ranks * 100,
    })
eval_df = pd.DataFrame(eval_rows).sort_values(by='mrr', ascending=False)
eval_df

In [None]:
class CoverageHelper:
    """
    Attribute-only container: construction takes no arguments and sets no state.

    NOTE(review): presumably declared here so that the pickled cov_helper object can be resolved
    when it is loaded in this notebook -- confirm against the code that wrote the pickle.
    """

    def __init__(self):
        """Create an instance with no attributes."""

# load cov_helper from pickle
coverage_dir = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/feature_data/coverage"
cov_helper_filepath = os.path.join(coverage_dir, 'cov_helper.pkl')
with open(cov_helper_filepath, 'rb') as coverage_helper_file:
    cov_helper = pickle.load(coverage_helper_file)
# show which attributes the loaded helper carries
vars(cov_helper).keys()

In [None]:
len(cov_helper.sites_with_previous_ints)

In [None]:
def break_ties(site_id_arr, y_score_site, sort_inds, n=5):
    """
    Select the n highest-scoring sites, resolving ties that straddle the cut-off at random.

    Walks the scores from highest to lowest. A group of tied scores that fits in the remaining
    slots is taken whole; a tied group that does not fit is sampled without replacement
    (via np.random.choice) to fill the remaining slots.

    :site_id_arr -- array of site ids, index-aligned with y_score_site
    :y_score_site -- array of scores
    :sort_inds -- ascending argsort of y_score_site
    :n -- number of sites to select (default 5); capped at the number of available scores

    :returns
        highest_scores -- array of the selected scores, highest first
        highest_score_site_ids -- array of the site ids selected for those scores
        ties_broken -- True if a random draw was needed to resolve a tie
    """
    # never ask for more entries than exist; this also keeps every index below in range
    n = min(n, len(sort_inds))
    highest_scores = []
    highest_score_site_ids = []
    ties_broken = False
    pos = 0  # 1-based offset from the end of sort_inds, i.e. position in descending score order
    while len(highest_scores) < n:
        pos += 1
        score = y_score_site[sort_inds[-pos]]
        # only compare with the next-lower score when one exists (the lowest score has no successor)
        is_tied = pos < len(sort_inds) and score == y_score_site[sort_inds[-(pos + 1)]]
        if not is_tied:
            highest_scores.append(score)
            highest_score_site_ids.append(site_id_arr[sort_inds[-pos]])
            continue

        tied_inds = np.flatnonzero(y_score_site == score)
        n_remaining = n - len(highest_scores)
        if len(tied_inds) <= n_remaining:
            # the whole tied group fits: take all of it and skip past the group
            highest_scores.extend([score] * len(tied_inds))
            highest_score_site_ids.extend(site_id_arr[tied_inds])
            pos += len(tied_inds) - 1
        else:
            # the tied group overflows the remaining slots: fill them with a random subset
            highest_scores.extend([score] * n_remaining)
            chosen_inds = np.random.choice(tied_inds, size=n_remaining, replace=False)
            highest_score_site_ids.extend(site_id_arr[chosen_inds])
            ties_broken = True
    return np.array(highest_scores), np.array(highest_score_site_ids), ties_broken

def compute_coverage_metrics(model_coverage_scores, cov_helper):
    """
    Compute coverage statistics for each model's top recommendations.

    :model_coverage_scores -- iterable of dicts with a 'metadata_id' and a 'scores' dict
        mapping model name -> array of per-site scores
    :cov_helper -- object providing n, site_id_arr_map, eligible_sites, sites_with_previous_ints,
        timestamp, and site_first_journal_timestamp_map

    :returns
        cov_data -- list with one dict of coverage metrics per model

    :raises ValueError if the tie-broken scores disagree with the plain top-5 scores
    """
    model_recs = defaultdict(list)
    n_ties_broken = 0
    for scores_md in model_coverage_scores:
        metadata_id = scores_md['metadata_id']
        site_id_arr = cov_helper.site_id_arr_map[metadata_id]
        for model_name, y_score_site in scores_md['scores'].items():
            assert y_score_site.shape == site_id_arr.shape

            # create rec batch
            sort_inds = np.argsort(y_score_site)

            # look one past the cut-off so that a tie straddling the top-n boundary is detected
            highest_scores = y_score_site[sort_inds[-(cov_helper.n+1):]]
            if len(set(highest_scores)) != len(highest_scores):
                highest_scores, highest_score_site_ids, ties_broken = break_ties(site_id_arr, y_score_site, sort_inds)
                # sanity check: tie-breaking may change which sites are picked, never which scores
                expected_scores = np.flip(y_score_site[sort_inds[-5:]])
                if not np.all(highest_scores == expected_scores):
                    # fail loudly rather than returning a raw score array in place of the metrics list
                    raise ValueError(
                        f"Tie-breaking produced unexpected scores for model {model_name!r} "
                        f"(metadata_id={metadata_id!r}): got {highest_scores}, expected {expected_scores}"
                    )
                if ties_broken:
                    n_ties_broken += 1
            else:
                highest_score_site_ids = site_id_arr[sort_inds[-cov_helper.n:]]
            model_recs[model_name].append(list(highest_score_site_ids))
    print(f"{n_ties_broken=}")
    
    cov_data = []
    for model_name, recs in model_recs.items():
        recced_sites = set()
        for rec in recs:
            recced_sites.update(rec)
        nonrecced_sites = cov_helper.eligible_sites - recced_sites

        # fraction of recommended / non-recommended sites that have previous interactions
        recced_inted = len(recced_sites & cov_helper.sites_with_previous_ints) / len(recced_sites)
        nonrecced_inted = len(nonrecced_sites & cov_helper.sites_with_previous_ints) / len(nonrecced_sites)

        site_ages = []
        for rec in recs:
            ages = np.array([cov_helper.timestamp - cov_helper.site_first_journal_timestamp_map[site_id] for site_id in rec])
            ages = ages / 1000 / 60 / 60 / 24 / 7  # convert ms to weeks
            assert np.all(ages > 0)
            site_ages.append({
                'min': ages.min(),
                'median': np.median(ages),
            })
        mean_min_age = np.mean([a['min'] for a in site_ages])
        mean_median_age = np.mean([a['median'] for a in site_ages])

        # total number of recommendations actually made for this model (rather than an assumed 5 * 1000)
        n_total_recs = sum(len(rec) for rec in recs)

        cov_data.append({
            'model': model_name,
            'n_recced_sites': len(recced_sites),
            'n_nonrecced_sites': len(nonrecced_sites),
            'pct_eligible_recced': len(recced_sites) / len(cov_helper.eligible_sites),
            'pct_unique_recs': len(recced_sites) / n_total_recs,
            'pct_recced_with_int': recced_inted,
            'pct_nonrecced_with_int': nonrecced_inted,
            'pct_recced_without_int': 1 - recced_inted,
            'pct_nonrecced_without_int': 1 - nonrecced_inted,
            'ratio_int': recced_inted / nonrecced_inted,
            'ratio_noint': (1 - recced_inted) / (1 - nonrecced_inted),
            'mean_min_age': mean_min_age,
            'mean_median_age': mean_median_age,
        })
    return cov_data

In [None]:
baseline_scores_filepath = os.path.join(config.feature_data_dir, 'baseline_coverage_scores.pkl')
with open(baseline_scores_filepath, 'rb') as scores_file:
    scores = pickle.load(scores_file)
len(scores)

In [None]:
coverage_metrics = compute_coverage_metrics(scores, cov_helper)
len(coverage_metrics)

In [None]:
pd.DataFrame(coverage_metrics)

In [None]:
edf = eval_df.merge(pd.DataFrame(coverage_metrics), on='model')
edf

In [None]:
# emit one LaTeX table row per model, escaping percent signs for LaTeX
for r in edf.itertuples():
    latex_row = f"{r.model} & {r.mrr:.3f} & {r.hr1:.2f}% & {r.hr5:.2f}% & {r.n_recced_sites} & {r.pct_unique_recs:.1%} & {r.mean_min_age:.1f} weeks & {r.pct_recced_without_int:.1%} / {r.pct_nonrecced_without_int:.1%} = {r.ratio_noint:.2f} \\\\"
    print(latex_row.replace("%", "\\%"))