Eval Demo
===

Generate the evaluation data using the evalModels.py script.

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

import os
import json
import torch
import pickle
from datetime import datetime
import pytz
import dateutil
from dateutil.relativedelta import relativedelta
from tqdm import tqdm
from glob import glob

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Logging bootstrap; run this cell only once, because the manual path below
# attaches a new handler to the root logger every time it executes.
import logging

use_cbrec_logging = True
if not use_cbrec_logging:
    # Demo of a manual setup. It is skipped by default because a later cell
    # calls cbrec.logutils.set_up_logging() instead.
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    stream_handler.setLevel(logging.DEBUG)

    root = logging.getLogger()
    root.addHandler(stream_handler)
    root.setLevel(logging.DEBUG)

## Import cbrec

In [None]:
from pathlib import Path
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

In [None]:
import sys
sys.path.append(os.path.join(git_root_dir, 'src'))

In [None]:
import cbrec.genconfig

In [None]:
# create a config, which is needed by lots of the components for resolving paths, etc.
config = cbrec.genconfig.Config()

In [None]:
import cbrec.evaluation
import cbrec.reccontext
import cbrec.featuredb
import cbrec.torchmodel
import cbrec.utils
import cbrec.logutils
import cbrec.feature_loader
import cbrec.modeling
import cbrec.modeling.scorer
import cbrec.modeling.manager

In [None]:
cbrec.logutils.set_up_logging()

In [None]:
# turn off matplotlib logging
# which can be quite verbose and usually is not useful
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

## Load the eval data

In [None]:
output_dir = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/torch_experiments/modeling/field_study_model_experiment_20220609032420/outputs/"
assert os.path.exists(output_dir)

In [None]:
def identify_model_filepaths(model_dir):
    """Return the paths of the ``*.json`` files located directly in ``model_dir``.

    Raises:
        ValueError: if ``model_dir`` does not exist or contains no ``.json`` file.
    """
    log = logging.getLogger("cbrec.modeling.submitEvalFromDirectory.identify_model_filepaths")
    if not os.path.exists(model_dir):
        raise ValueError(f"Dir '{model_dir}' does not exist.")

    json_pattern = os.path.join(model_dir, '*.json')
    found = list(glob(json_pattern))
    if not found:
        raise ValueError(f"No .json files in dir '{model_dir}'.")

    log.info(f"Identified {len(found)} model filepaths in dir {model_dir}.")
    return found


class ModelEval:
    """Collects per-model evaluation artifacts for every model config in a directory.

    ``self.models`` maps a model's ``output_name`` to a dict that the methods
    below fill in ('train_metrics', 'test_metrics', 'coverage_scores').
    """

    def __init__(self, model_output_dir):
        """Locate the model config files in ``model_output_dir``.

        Raises:
            ValueError: if the directory does not exist or holds no ``.json`` file.
        """
        self.logger = logging.getLogger('eval.ModelEval')
        self.model_output_dir = model_output_dir
        self.model_filepaths = self.identify_model_filepaths()

        # Populated by create_managers(); defined here so the other methods
        # are safe no-ops (rather than AttributeErrors) when called first.
        self.managers = []
        self.models = {}

    def identify_model_filepaths(self):
        """Return the ``*.json`` paths directly inside ``self.model_output_dir``.

        Raises:
            ValueError: if the directory does not exist or holds no ``.json`` file.
        """
        if not os.path.exists(self.model_output_dir):
            raise ValueError(f"Dir '{self.model_output_dir}' does not exist.")
        model_filepaths = list(glob(os.path.join(self.model_output_dir, '*.json')))
        if not model_filepaths:
            # The message must use the instance attribute: there is no local
            # or global `model_dir` in this scope.
            raise ValueError(f"No .json files in dir '{self.model_output_dir}'.")
        self.logger.info(f"Identified {len(model_filepaths)} model filepaths in dir {self.model_output_dir}.")
        return model_filepaths

    def create_managers(self):
        """Load one ModelManager per model filepath and register an empty entry in ``self.models``."""
        self.managers = []
        for model_filepath in self.model_filepaths:
            manager = cbrec.modeling.manager.ModelManager.load_from_filepath(model_filepath)
            self.managers.append(manager)
            self.models[manager.model_config.output_name] = {}

    def create_test_metrics(self):
        """Store each manager's ``train_metrics`` and ``test_metrics`` in ``self.models``.

        Only the training metrics are requested from ``load_model``; the
        preprocessor and model state dict are not loaded.
        """
        for manager in self.managers:
            manager.load_model(load_preprocessor=False, load_model_state_dict=False, load_training_metrics=True)
            model_entry = self.models[manager.model_config.output_name]
            model_entry['train_metrics'] = manager.model_trainer.train_metrics
            model_entry['test_metrics'] = manager.model_trainer.test_metrics

    def get_scores(self, subset=None):
        """Load the pickled coverage scores for each model into ``self.models``.

        Args:
            subset: optional container of ``output_name`` values; when given,
                models whose name is not in it are skipped.
        """
        for manager in self.managers:
            output_name = manager.model_config.output_name
            if subset is not None and output_name not in subset:
                continue
            scores_filepath = os.path.join(manager.model_config.output_dir, f'{manager.model_config.experiment_name}_{manager.model_config.output_name}_coverage_scores.pkl')
            assert os.path.exists(scores_filepath)

            # NOTE: pickle.load executes arbitrary code from the file; only
            # point this at score files produced by a trusted run.
            with open(scores_filepath, 'rb') as scores_infile:
                scores = pickle.load(scores_infile)
            self.models[output_name]['coverage_scores'] = scores

In [None]:
ev = ModelEval(output_dir)
logging.disable(level=logging.INFO)
ev.create_managers()
logging.disable(logging.NOTSET)
len(ev.models)

In [None]:
logging.disable(level=logging.INFO)
ev.create_test_metrics()
logging.disable(logging.NOTSET)

In [None]:
# Group model names by their hyperparameter combination: models sharing the
# same values for every grouping key land in the same list.
grouping_keys = ['train_max_lr', 'LinearNet_n_hidden', 'train_weight_decay', 'LinearNet_dropout_p']
model_group_map = {}
for manager in ev.managers:
    mc = manager.model_config.as_dict()
    model_name = manager.model_config.output_name
    group_key = tuple(mc[key] for key in grouping_keys)
    model_group_map.setdefault(group_key, []).append(model_name)
len(model_group_map)

In [None]:
# Abbreviations used when rendering a group's hyperparameters as a label.
key_short_name_map = {
    'LinearNet_n_hidden': 'n_hidden',
    'train_weight_decay': 'wd',
    'train_max_lr': 'max_lr',
    'LinearNet_dropout_p': 'do',
}

# One summary row per hyperparameter group. Going by the variable names,
# rows 1 and 2 of 'test_metrics' are treated as loss and accuracy over training.
group_metrics_list = []
for group, model_names in model_group_map.items():
    group_test_metrics = [ev.models[model_name]['test_metrics'] for model_name in model_names]
    best_validation_losses = [tm[1,:].min() for tm in group_test_metrics]
    best_validation_accs = [tm[2,:].max() for tm in group_test_metrics]
    final_validation_losses = [tm[1, -1] for tm in group_test_metrics]
    final_validation_accs = [tm[2, -1] for tm in group_test_metrics]
    group_name = "; ".join(
        f"{key_short_name_map.get(key, key)}={value}"
        for key, value in zip(grouping_keys, group)
    )

    group_row = {'group_name': group_name}
    group_row.update(zip(grouping_keys, group))
    group_row.update({
        'val_loss_min': np.min(best_validation_losses),
        'val_loss_median': np.median(best_validation_losses),
        'val_loss_max': np.max(best_validation_losses),
        'val_loss_ptp': np.ptp(best_validation_losses),
        'val_acc_min': np.min(best_validation_accs),
        'val_acc_median': np.median(best_validation_accs),
        'val_acc_max': np.max(best_validation_accs),
        'val_acc_ptp': np.ptp(best_validation_accs),
    })
    group_metrics_list.append(group_row)
eval_df = pd.DataFrame(group_metrics_list)
len(eval_df)

In [None]:
eval_df.sample(n=1)

In [None]:
eval_df.sort_values(by='val_loss_min', ascending=True)

In [None]:
fig, ax = plt.subplots(1, 1)

ax.hist(eval_df.val_loss_ptp, bins=20)
ax.set_title("Distribution of within-group variance in validation loss")

plt.show()
eval_df.sort_values(by='val_loss_ptp', ascending=False)[['group_name', 'val_loss_min', 'val_acc_max', 'val_loss_ptp']].head(6)

In [None]:
# Show min / median / max of the best validation loss for each value of every
# hyperparameter. The aggregation is named by string ('median') rather than
# passed as np.median: recent pandas versions deprecate NumPy callables here,
# and the resulting column name is the same.
for key in grouping_keys:
    display(eval_df.groupby(key).val_loss_min.agg(['min', 'median', 'max']))

In [None]:
eval_df.sort_values(by='val_loss_min', ascending=True)[['group_name', 'val_loss_min', 'val_acc_max']].head(10)

In [None]:
# Best validation accuracy for each max_lr (rows) x (n_hidden, dropout) (columns) cell.
# aggfunc is given as the string 'max'; passing np.max is deprecated in recent pandas.
pd.crosstab(eval_df.train_max_lr, [eval_df.LinearNet_n_hidden, eval_df.LinearNet_dropout_p], values=eval_df.val_acc_max, aggfunc='max')

In [None]:
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
formula = """
val_acc_max ~ C(train_max_lr)*C(train_weight_decay)

"""
md = smf.ols(formula=formula, data=eval_df)
res = md.fit()
res.summary()

In [None]:
# Final validation loss and accuracy after training.
# The loop breaks after its first iteration, so only one model's row is printed.
print(f"{'Model':>35} ValLoss ValAcc")
print("="*60)
for model_name in ev.models:
    model_test_metrics = ev.models[model_name]['test_metrics']
    final_validation_loss = model_test_metrics[1, -1]
    final_validation_acc = model_test_metrics[2, -1]
    print(f"{model_name:>35}  {final_validation_loss:.4f} {final_validation_acc:.2%}")
    break

## Validation metrics

In [None]:
# Boundary between the validation and test periods, in milliseconds since the epoch.
# NOTE(review): strptime() yields a naive datetime, so .timestamp() interprets
# "2021-07-01" in the machine's local timezone, while the weekly bins built
# further down use UTC -- confirm which timezone is intended.
VALIDATION_END_TIMESTAMP = datetime.strptime("2021-07-01", "%Y-%m-%d").timestamp() * 1000
# keep only the 'test' and 'predict' metadata entries
md_list = [md for md in cbrec.utils.stream_metadata_list(config.metadata_filepath) if md['type'] == 'test' or md['type'] == 'predict']
# validation contexts: entries with a target whose timestamp is at or before the boundary
valid_md_list = [md for md in md_list if md['has_target'] and md['timestamp'] <= VALIDATION_END_TIMESTAMP]
len(valid_md_list)

In [None]:
valid_metadata_ids = set([md['metadata_id'] for md in valid_md_list])
len(valid_metadata_ids)

In [None]:
# For every model, read its per-context validation metadata file and summarize
# the rank of the target (MRR, HR@1 and HR@5 as percentages) over the contexts
# whose metadata_id is in valid_metadata_ids.
for manager in tqdm(ev.managers, desc='Loading validation metrics'):
    metadata_filepath = os.path.join(manager.model_config.output_dir, f'{manager.model_config.experiment_name}_{manager.model_config.output_name}_validation_metadata.ndjson')
    assert os.path.exists(metadata_filepath)
    metrics_key = manager.model_config.output_name + "_metrics"
    with open(metadata_filepath, 'r') as metadata_file:
        line_progress = tqdm(metadata_file, total=len(valid_md_list), desc=f'Reading metrics {manager.model_config.output_name}', disable=True)
        parsed_lines = (json.loads(line) for line in line_progress)
        target_ranks = np.array([
            parsed[metrics_key]['target_rank']
            for parsed in parsed_lines
            if parsed['metadata_id'] in valid_metadata_ids
        ])

    n_ranks = len(target_ranks)
    ev.models[manager.model_config.output_name]['metrics'] = {
        'n': n_ranks,
        'mrr': (1 / target_ranks).mean(),
        'hr1': (target_ranks == 1).sum() / n_ranks * 100,
        'hr5': (target_ranks <= 5).sum() / n_ranks * 100,
    }

In [None]:
valid_df = pd.DataFrame([{'model_name': model_name, **ev.models[model_name]['metrics']} for model_name in ev.models.keys()])
print(len(valid_df))
(valid_df.n != len(valid_metadata_ids)).sum(), (valid_df.n - len(valid_metadata_ids)).value_counts().rename("n_missing")

In [None]:
grouping_keys = ['train_max_lr', 'LinearNet_n_hidden', 'train_weight_decay', 'LinearNet_dropout_p']
key_short_name_map = {
    'LinearNet_n_hidden': 'n_hidden',
    'train_weight_decay': 'wd',
    'train_max_lr': 'max_lr',
    'LinearNet_dropout_p': 'do',
}

In [None]:
# One summary row per hyperparameter group, combining the validation ranking
# metrics stored under 'metrics' with the training-time hold-out loss/accuracy.
group_metrics_list = []
for group, model_names in model_group_map.items():
    group_models = [ev.models[model_name] for model_name in model_names]
    best_validation_losses = [m['test_metrics'][1,:].min() for m in group_models]
    best_validation_accs = [m['test_metrics'][2,:].max() for m in group_models]
    final_validation_losses = [m['test_metrics'][1, -1] for m in group_models]
    final_validation_accs = [m['test_metrics'][2, -1] for m in group_models]
    group_name = "; ".join(
        f"{key_short_name_map.get(key, key)}={value}"
        for key, value in zip(grouping_keys, group)
    )

    group_metrics = {'group_name': group_name}
    group_metrics.update(zip(grouping_keys, group))
    group_metrics['n_models'] = len(model_names)

    # min / median / max of each ranking metric across the group's models
    for metric in ['n', 'mrr', 'hr1', 'hr5']:
        metric_values = [m['metrics'][metric] for m in group_models]
        group_metrics.update({
            metric + "_min": np.min(metric_values),
            metric + "_median": np.median(metric_values),
            metric + "_max": np.max(metric_values),
        })

    group_metrics['val_loss_min'] = np.min(best_validation_losses)
    group_metrics['val_loss_median'] = np.median(best_validation_losses)
    group_metrics['val_loss_max'] = np.max(best_validation_losses)
    group_metrics['val_loss_ptp'] = np.ptp(best_validation_losses)
    group_metrics['val_acc_min'] = np.min(best_validation_accs)
    group_metrics['val_acc_median'] = np.median(best_validation_accs)
    group_metrics['val_acc_max'] = np.max(best_validation_accs)
    group_metrics['val_acc_ptp'] = np.ptp(best_validation_accs)

    group_metrics_list.append(group_metrics)
eval_df = pd.DataFrame(group_metrics_list)
len(eval_df)

In [None]:
eval_df.columns

In [None]:
eval_df.head()

In [None]:
# Correlation, across hyperparameter groups, between the training-time hold-out
# metrics (val_loss, val_acc) and the validation ranking metrics (MRR, HR@1, HR@5).
for comparison_type in ['max', 'median']:
    for valid_comparison_metric in ['mrr', 'hr1', 'hr5']:
        valid_key = valid_comparison_metric + "_" + comparison_type
        for train_comparison_metric in ['val_loss', 'val_acc']:
            # for the 'max' comparison, the best loss is the minimum one
            compare_best_loss = comparison_type == 'max' and 'loss' in train_comparison_metric
            train_suffix = "_min" if compare_best_loss else "_" + comparison_type
            train_key = train_comparison_metric + train_suffix
            corr = eval_df[valid_key].corr(eval_df[train_key])
            print(f"{comparison_type} {valid_comparison_metric} {train_comparison_metric} {corr:.4f}")

In [None]:
# very strong correlations between validation metrics
import itertools
for comparison_type in ['max', 'median']:
    for valid_comparison_metric1, valid_comparison_metric2 in itertools.combinations(['mrr', 'hr1', 'hr5'], 2):
        corr = eval_df[valid_comparison_metric1 + "_" + comparison_type].corr(eval_df[valid_comparison_metric2 + "_" + comparison_type])
        print(f"{comparison_type} {valid_comparison_metric1} {valid_comparison_metric2} {corr:.4f}")

In [None]:
eval_df.sort_values(by='mrr_median', ascending=False)[['group_name', 'mrr_median', 'hr5_median', 'hr1_median']]

In [None]:
# this is the configuration as used for the model
# see code from the time of b0: https://github.com/umncs-caringbridge/recsys-peer-match/blob/33d258d8c514f6fb14930a034e8a9c7e2270f745/src/cbrec/torchmodel.py
eval_df[(eval_df.LinearNet_n_hidden == 100)&(eval_df.LinearNet_dropout_p == 0.1)&(eval_df.train_weight_decay == 0)].sort_values(by='mrr_median', ascending=False)[['group_name', 'mrr_median', 'hr5_median', 'hr1_median']]

In [None]:
grouping_keys

In [None]:
tuned_models = model_group_map[(0.01, 300, 0.0001, 0.5)]
sorted(tuned_models)

In [None]:
study_models = model_group_map[(0.012, 100, 0, 0.1)]
sorted(study_models)

In [None]:
for model_name in tuned_models + study_models:
    model_filepath = os.path.join(output_dir, f"{model_name}.json")
    username = "levon003"
    script_path = f"/home/lana/{username}/repos/recsys-peer-match/src/cbrec/modeling/submitEvalFromDirectory.py"
    print(f"python {script_path} --username {username} --model-filepath {model_filepath} --test-only")

In [None]:
!du -h /home/lana/shared/caringbridge/data/projects/recsys-peer-match/torch_experiments/modeling/field_study_model_experiment_20220609032420/outputs/field_study_model_experiment_164.json

## Test metric computation

In [None]:
# Boundary between the validation and test periods, in milliseconds since the epoch.
# NOTE(review): the naive datetime makes .timestamp() depend on the machine's
# local timezone -- confirm that this is the intended boundary.
VALIDATION_END_TIMESTAMP = datetime.strptime("2021-07-01", "%Y-%m-%d").timestamp() * 1000
# keep only the 'test' and 'predict' metadata entries
md_list = [md for md in cbrec.utils.stream_metadata_list(config.metadata_filepath) if md['type'] == 'test' or md['type'] == 'predict']
# test contexts: entries with a target whose timestamp is strictly after the boundary
test_md_list = [md for md in md_list if md['has_target'] and md['timestamp'] > VALIDATION_END_TIMESTAMP]
len(test_md_list)

In [None]:
test_metadata_ids = set([md['metadata_id'] for md in test_md_list])
len(test_metadata_ids)

In [None]:
# Summarize the target ranks over the test contexts for every model that has a
# test metadata file; models without one are skipped. With save_ranks set, the
# raw rank array is stored next to the summary metrics.
save_ranks = True
for manager in ev.managers:
    metadata_filepath = os.path.join(manager.model_config.output_dir, f'{manager.model_config.experiment_name}_{manager.model_config.output_name}_test_metadata.ndjson')
    if not os.path.exists(metadata_filepath):
        continue
    metrics_key = manager.model_config.output_name + "_metrics"
    with open(metadata_filepath, 'r') as metadata_file:
        line_progress = tqdm(metadata_file, total=len(test_md_list) + 1000, desc=f'Reading metrics {manager.model_config.output_name}')
        parsed_lines = (json.loads(line) for line in line_progress)
        target_ranks = np.array([
            parsed[metrics_key]['target_rank']
            for parsed in parsed_lines
            if parsed['metadata_id'] in test_metadata_ids
        ])

    n_ranks = len(target_ranks)
    model_metrics = {
        'mrr': (1 / target_ranks).mean(),
        'hr1': (target_ranks == 1).sum() / n_ranks * 100,
        'hr5': (target_ranks <= 5).sum() / n_ranks * 100,
    }
    if save_ranks:
        model_metrics['ranks'] = target_ranks
    ev.models[manager.model_config.output_name]['metrics'] = model_metrics

In [None]:
pd.DataFrame([{'model_name': model_name, **ev.models[model_name]['metrics']} for model_name in ev.models.keys()])

In [None]:
# Replace each model's 'metrics' entry with test-period metrics (n, MRR, HR@1,
# HR@5) for every model that has a test metadata file; others are left untouched.
for manager in tqdm(ev.managers, desc='Loading test metrics'):
    metadata_filepath = os.path.join(manager.model_config.output_dir, f'{manager.model_config.experiment_name}_{manager.model_config.output_name}_test_metadata.ndjson')
    if not os.path.exists(metadata_filepath):
        continue
    metrics_key = manager.model_config.output_name + "_metrics"
    with open(metadata_filepath, 'r') as metadata_file:
        line_progress = tqdm(metadata_file, total=len(test_md_list), desc=f'Reading metrics {manager.model_config.output_name}', disable=False)
        parsed_lines = (json.loads(line) for line in line_progress)
        target_ranks = [
            parsed[metrics_key]['target_rank']
            for parsed in parsed_lines
            if parsed['metadata_id'] in test_metadata_ids
        ]
    # a test file without any matching context would make the metrics meaningless
    assert len(target_ranks) > 0

    target_ranks = np.array(target_ranks)
    n_ranks = len(target_ranks)
    ev.models[manager.model_config.output_name]['metrics'] = {
        'n': n_ranks,
        'mrr': (1 / target_ranks).mean(),
        'hr1': (target_ranks == 1).sum() / n_ranks * 100,
        'hr5': (target_ranks <= 5).sum() / n_ranks * 100,
    }

In [None]:
test_df = pd.DataFrame([{
    'model': 'study' if model_name in study_models else 'tuned', 
    'model_name': model_name,
    **ev.models[model_name]['metrics']
} for model_name in study_models + tuned_models])
test_df

In [None]:
# these are the results for the MLP_study and MLP_tuned models
# numeric_only=True is required because test_df also carries the string column
# 'model_name'; pandas >= 2.0 raises on it instead of silently dropping it.
test_df.groupby('model').median(numeric_only=True)

### Sidebar: Metrics over time

In [None]:
# Collect one row per (study model, context with a target) from both the test
# and the validation metadata files, so metrics can be tracked over time.
ranks = []
for manager in ev.managers:
    output_name = manager.model_config.output_name
    test_metadata_filepath = os.path.join(manager.model_config.output_dir, f'{manager.model_config.experiment_name}_{manager.model_config.output_name}_test_metadata.ndjson')
    if not os.path.exists(test_metadata_filepath):
        continue
    if output_name not in study_models:
        continue
    validation_metadata_filepath = os.path.join(manager.model_config.output_dir, f'{manager.model_config.experiment_name}_{manager.model_config.output_name}_validation_metadata.ndjson')
    assert os.path.exists(validation_metadata_filepath)

    # Label from THIS manager's model name. The previous code tested a stale
    # global `model_name` left over from an earlier loop, so the label did not
    # depend on the model actually being read.
    model_label = 'study' if output_name in study_models else 'tuned'
    metrics_key = output_name + "_metrics"
    for metadata_filepath in [test_metadata_filepath, validation_metadata_filepath]:
        with open(metadata_filepath, 'r') as metadata_file:
            # total is only a progress-bar estimate; it is the test-set size for both files
            for line in tqdm(metadata_file, total=len(test_md_list), desc=f'Reading ranks from {os.path.basename(metadata_filepath)}', disable=False):
                md = json.loads(line)
                if not md['has_target']:
                    continue
                ranks.append({
                    'model': model_label,
                    'model_name': output_name,
                    'metadata_id': md['metadata_id'],
                    'timestamp': md['timestamp'],
                    'target_rank': md[metrics_key]['target_rank'],
                })

len(ranks)

In [None]:
rank_df = pd.DataFrame(ranks)
rank_df.sample(n=2)

In [None]:
# Weekly bin edges in milliseconds since the epoch (UTC), starting 2021-01-01
# and ending with the first edge at or past 2022-01-01. Each row's 'week' is
# the index np.digitize assigns to its timestamp.
start_time = dateutil.parser.parse("2021-01-01").replace(tzinfo=pytz.UTC)
end_time = dateutil.parser.parse("2022-01-01").replace(tzinfo=pytz.UTC)
one_week = relativedelta(weeks=1)
curr_time = start_time
bins = [curr_time.timestamp() * 1000]
while curr_time < end_time:
    curr_time += one_week
    bins.append(curr_time.timestamp() * 1000)
print(len(bins))
rank_df['week'] = np.digitize(rank_df.timestamp, bins=bins)
rank_df.week.value_counts().head()

In [None]:
set(rank_df.model_name)

In [None]:
matplotlib.rcParams['figure.dpi'] = 120
matplotlib.rcParams['font.family'] = "serif"

In [None]:
# Plot weekly MRR, HR@1 and HR@5 (pooled over all rows of rank_df) and save the figure as a PDF.
fig, ax = plt.subplots(1, 1, figsize=(5.4, 1.5))
cmap = matplotlib.cm.viridis
#ax.set_title("MRR throughout test period")
# The 'week' column is a bin index against weekly bins that start on
# 2021-01-01, so that is the origin of the x axis.
ax.set_xlabel("Weeks since January 1, 2021", fontsize=8)
ax.set_ylabel("Metric", fontsize=8)

# Weekly metrics: one row per week. This does not depend on the metric being
# plotted, so it is computed once rather than on every loop iteration.
# pandas names multiple lambdas '<lambda_0>', '<lambda_1>', ... in list order.
sdf = rank_df.groupby('week').target_rank.agg([
    lambda r: (1 / r).mean(),
    lambda r: (r == 1).sum() / len(r),
    lambda r: (r <= 5).sum() / len(r),
]).rename(columns={'<lambda_0>': 'MRR', '<lambda_1>': 'HR@1', '<lambda_2>': 'HR@5',}).reset_index()

linestyles = ['-', 'dashed', 'dotted']
for i, metric_name in enumerate(['MRR', 'HR@1', 'HR@5']):
    print(sdf[metric_name].corr(sdf.week))

    # fit a model to check the linear slope over time
    # (is MRR decreasing over time?)
    if metric_name == 'MRR':
        md = smf.ols(formula='MRR ~ week', data=sdf)
        res = md.fit()
        #print(res.summary())
        beta, p = res.params.week, res.pvalues.week
        print(beta, p)
    label = f"{metric_name}"

    # plot the data, and write the metric name just left of its week-1 value
    ax.plot(sdf.week, sdf[metric_name], label=label, color=cmap(i * 0.3), linestyle=linestyles[i])
    ax.text(0.7, sdf.loc[sdf.week == 1, metric_name].iloc[0], metric_name, fontsize=7, ha='right', va='center')

ax.set_xlim((-3.5, 52))

# vertical marker at week 26
validation_end_week = 52 / 2
ax.axvline(validation_end_week, linestyle='dashdot', color='gray')#, label='Start of test period')

ax.tick_params(axis='both', which='major', labelsize=8)

#ax.legend(fontsize=7)

fig.tight_layout()
image_shortfilename = "mlpstudy_metrics_over_time.pdf"
figures_dir = os.path.join(git_root_dir, 'figures')
# make sure the output directory exists so savefig cannot fail on a fresh checkout
os.makedirs(figures_dir, exist_ok=True)
image_filename = os.path.join(figures_dir, image_shortfilename)
fig.savefig(image_filename, format='pdf', dpi=200, pad_inches=0, bbox_inches='tight')


plt.show()

In [None]:
# validating that the validation period ends after 52//2 weeks
(VALIDATION_END_TIMESTAMP - bins[52 // 2]) / 1000 / 60 / 60 / 24

In [None]:
# Plot weekly MRR per model together with the slope of a linear fit of MRR on week.
# NOTE(review): this cell reads a DataFrame named `mdf` (with 'timestamp',
# 'model_name', 'reciprocal_rank' and 'metadata_id' columns) that is not
# defined anywhere in the visible notebook; it presumably comes from a
# different analysis and must exist before this cell can run -- confirm.
fig, ax = plt.subplots(1, 1, figsize=(8,8))
cmap = matplotlib.cm.viridis
ax.set_title("MRR throughout test period")
ax.set_xlabel("Initiation timestamp")
ax.set_ylabel("MRR (by week)")

# weekly bin edges in milliseconds since the epoch (UTC), covering 2021
end_time = dateutil.parser.parse("2022-01-01").replace(tzinfo=pytz.UTC)
start_time = dateutil.parser.parse("2021-01-01").replace(tzinfo=pytz.UTC)
bins = []
curr_time = start_time
while curr_time < end_time:
    bins.append(curr_time.timestamp() * 1000)
    curr_time += relativedelta(weeks=1)
bins.append(curr_time.timestamp() * 1000)
mdf['week'] = np.digitize(mdf.timestamp, bins=bins)

# per (model, week): mean reciprocal rank and number of rows
week_df = mdf.groupby(['model_name', 'week']).agg({'reciprocal_rank': np.mean, 'metadata_id': len}).rename(columns={'reciprocal_rank': 'mrr', 'metadata_id': 'n'}).reset_index().sort_values(by='week')

# 'NaiveNetwork'
for model_name in ['NaiveNetwork', 'MostRecentlyInitiatedWith', 'MostRecentJournal', 'MostInitiatedWithRecently', 'simnet_all']:
    sdf = week_df[week_df.model_name == model_name]
    
    # fit a model to check the linear slope over time
    # (is MRR decreasing over time?)
    md = smf.ols(formula='mrr ~ week', data=sdf)
    res = md.fit()
    #print(res.summary())
    beta, p = res.params.week, res.pvalues.week
    # NOTE(review): the label prints "p<" followed by the p-value itself, not an upper bound
    label = f"{model_name} ($\\beta$={beta:.3f}, p<{p:.3f})"
    
    # plot the data
    plt.plot(sdf.week, sdf.mrr, label=label)
    
    
    
ax.legend()

#ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: f"{datetime.utcfromtimestamp(x / 1000).strftime('%m/%d/%Y')}"))

plt.show()

### Sidebar sidebar: personalized recs for new authors?

Option 1: compare first initiations to subsequent initiations
#Option 2: compare authors with exactly 3 journal updates to authors with > 3 journal updates

In [None]:
# generate rank_df from above
rank_df.head()

In [None]:
sdf = rank_df[rank_df.model_name == 'field_study_model_experiment_219']
len(sdf)

In [None]:
md_list = [md for md in cbrec.utils.stream_metadata_list(config.metadata_filepath)]
len(md_list)

In [None]:
md_list[0]

In [None]:
# Walk md_list in order and record, for every kept entry, whether its source
# user already appeared in an earlier kept entry. Entries of type 'ineligible'
# or 'predict' are ignored entirely.
skipped_types = ('ineligible', 'predict')
already_initiated_user_ids = set()
ds = []
for md in md_list:
    if md['type'] in skipped_types:
        continue
    source_user_id = md['source_user_id']
    has_already_initiated = source_user_id in already_initiated_user_ids
    # adding is a no-op when the user has been seen before
    already_initiated_user_ids.add(source_user_id)
    ds.append({
        'metadata_id': md['metadata_id'],
        'type': md['type'],
        'has_already_initiated': has_already_initiated,
    })
adf = pd.DataFrame(ds)
len(adf)

In [None]:
sdf = sdf.merge(adf, how='left', left_on='metadata_id', right_on='metadata_id')
len(sdf)

In [None]:
sdf.has_already_initiated.value_counts()

In [None]:
sdf.head(1)

In [None]:
ssdf = sdf[(sdf.type == 'test')&(sdf.timestamp > VALIDATION_END_TIMESTAMP)]
len(ssdf)

In [None]:
ssdf.groupby('has_already_initiated').agg({'target_rank': lambda tr: (1 / tr).mean()})

### Sidebar: Create the needed coverage data

Based on the sites available at the time.

In [None]:
class CoverageHelper:
    """Empty attribute container; the cells below attach the coverage data to an instance."""

In [None]:
md_list = [md for md in cbrec.utils.stream_metadata_list(config.metadata_filepath) if md['type'] == 'test' or md['type'] == 'predict']
len(md_list)

In [None]:
coverage_md_list = [md for md in md_list if not md['has_target']]
len(coverage_md_list)

In [None]:
def load_coverage_predictions(config, coverage_md_list):
    """Build one RecContext per entry of ``coverage_md_list``.

    The feature database at ``config.feature_db_filepath`` is opened once and
    used as a context manager while every test context is fetched by its
    ``metadata_id``. Returns the RecContext objects in input order.
    """
    db = cbrec.featuredb.get_db_by_filepath(config.feature_db_filepath)
    with db:
        return [
            cbrec.reccontext.RecContext.create_from_test_context(
                config,
                test_context_md,
                cbrec.featuredb.get_test_context_by_metadata_id(db, test_context_md['metadata_id'], config),
            )
            for test_context_md in tqdm(coverage_md_list, desc="Loading coverage data")
        ]


# Load the coverage contexts and record on cov_helper which sites are eligible
# overall and which are candidates within each individual context.
cov_helper = CoverageHelper()

coverage_rcs = load_coverage_predictions(config, coverage_md_list)
assert len(coverage_rcs) == 1000

# union of column 1 of candidate_usp_arr (used as the site id) over all contexts
coverage_sites = set()
for coverage_rc in coverage_rcs:
    coverage_sites |= set(coverage_rc.candidate_usp_arr[:,1])
coverage_sites = sorted(coverage_sites)
print(f"# eligible coverage sites: {len(coverage_sites)}")
cov_helper.coverage_sites = coverage_sites

eligible_sites = set(coverage_sites)
cov_helper.eligible_sites = eligible_sites

# per-context sorted array of unique candidate site ids, keyed by metadata_id
site_id_arr_map = {}
for coverage_rc in coverage_rcs:
    site_id_arr = np.unique(coverage_rc.candidate_usp_arr[:,1])
    assert len(site_id_arr) <= len(coverage_sites)
    site_id_arr_map[coverage_rc.metadata_id] = site_id_arr
cov_helper.site_id_arr_map = site_id_arr_map

In [None]:
cov_helper.timestamp = 1609502404437  # this is the timestamp when recommendations were generated for coverage
assert cov_helper.timestamp == coverage_rc.timestamp
datetime.utcfromtimestamp(cov_helper.timestamp/1000).isoformat()

In [None]:
# load the journal metadata
s = datetime.now()
journal_metadata_dir = "/home/lana/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.feather")
journal_df = pd.read_feather(journal_metadata_filepath)
print(datetime.now() - s)
len(journal_df)

In [None]:
# read interactions dataframe
s = datetime.now()
model_data_dir = '/home/lana/shared/caringbridge/data/projects/recsys-peer-match/model_data'
ints_df = pd.read_feather(os.path.join(model_data_dir, 'ints_df.feather'))
print(f"Read {len(ints_df)} rows ({len(set(ints_df.user_id))} unique users) in {datetime.now() - s}.")
ints_df.head()

In [None]:
# every (user_id, site_id) pair that appears in the journal metadata
author_usp_set = set(zip(journal_df.user_id, journal_df.site_id))
len(author_usp_set)

In [None]:
inits_df = ints_df.sort_values(by='created_at').drop_duplicates(subset=['user_id', 'site_id'], keep='first').copy()
len(inits_df)

In [None]:
# (user_id, site_id) tuple for each row, used to match against author_usp_set
inits_df['usp'] = list(zip(inits_df.user_id, inits_df.site_id))

In [None]:
inits_df = inits_df[~inits_df.usp.isin(author_usp_set)]
len(inits_df)

In [None]:
inits_df = inits_df[inits_df.created_at < cov_helper.timestamp]
len(inits_df)

In [None]:
previous_int_site_ids = set(inits_df.site_id)
len(previous_int_site_ids)

In [None]:
sites_with_previous_ints = previous_int_site_ids & cov_helper.eligible_sites
len(sites_with_previous_ints)

In [None]:
cov_helper.sites_with_previous_ints = sites_with_previous_ints

In [None]:
print(f"num eligible sites with > 0 indegree: {len(sites_with_previous_ints)}")
print(f"num eligible sites: {len(eligible_sites)}")
print(f"pct > 0 indegree: {len(sites_with_previous_ints) / len(eligible_sites):.3%}")
print(f"pct zero indegree: {1 - (len(sites_with_previous_ints) / len(eligible_sites)):.3%}")

In [None]:
sdf = journal_df[(journal_df.published_at.notna())&(journal_df.published_at > 0)].sort_values(by='published_at').drop_duplicates(subset='site_id', keep='first')
len(sdf)

In [None]:
# Map each eligible site to the created_at value of its row in sdf.
# NOTE(review): sdf keeps one row per site chosen by earliest published_at,
# but the value stored here is that row's created_at -- confirm that
# created_at (rather than published_at) is the intended timestamp.
site_first_journal_timestamp_map = sdf[sdf.site_id.isin(eligible_sites)].set_index('site_id').created_at.to_dict()
len(site_first_journal_timestamp_map)

In [None]:
ages = np.array([cov_helper.timestamp - site_first_journal_timestamp_map[site_id] for site_id in coverage_sites])
ages = ages / 1000 / 60 / 60 / 24 / 7  # convert to weeks
len(ages)

In [None]:
# median eligible site has been around for 93 weeks
ages.min(), ages.mean(), ages.std(), np.median(ages), ages.max()

In [None]:
cov_helper.site_first_journal_timestamp_map = site_first_journal_timestamp_map

In [None]:
cov_helper.n = 5  # number of recs to make in each batch

In [None]:
# save cov_helper to pickle
coverage_dir = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/feature_data/coverage"
with open(os.path.join(coverage_dir, 'cov_helper.pkl'), 'wb') as coverage_helper_file:
    pickle.dump(cov_helper, coverage_helper_file)
print("Finished.")

#### End of sidebar

## Make coverage predictions

In [None]:
class CoverageHelper:
    """Empty attribute container; it must be defined before unpickling a saved instance."""
    def __init__(self):
        pass

# load cov_helper from pickle
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
coverage_dir = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/feature_data/coverage"
with open(os.path.join(coverage_dir, 'cov_helper.pkl'), 'rb') as coverage_helper_file:
    cov_helper = pickle.load(coverage_helper_file)
# show which attributes were restored
cov_helper.__dict__.keys()

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def compute_coverage_metrics(model_coverage_scores, cov_helper):
    """Summarize which eligible sites a model's top-n recommendations cover.

    For each entry of model_coverage_scores, the cov_helper.n highest-scoring
    candidate sites form one rec batch. The batches are then summarized: how many
    distinct sites were recommended, how recommended and non-recommended sites
    differ in having previous interactions, and how old the recommended sites are.

    Args:
        model_coverage_scores: iterable of dicts, each with a 'metadata_id' key and a
            'scores' array of the same shape as cov_helper.site_id_arr_map[metadata_id].
        cov_helper: object providing n, site_id_arr_map, eligible_sites,
            sites_with_previous_ints, timestamp and site_first_journal_timestamp_map.

    Returns:
        dict of metrics. Fractions and ratios whose denominator is zero are NaN
        instead of raising ZeroDivisionError, so one degenerate ratio does not
        discard the remaining metrics.

    Raises:
        AssertionError: if a scores array and its site id array differ in shape, or
            if any recommended site has a non-positive age.
    """
    recs = []
    for scores_md in model_coverage_scores:
        metadata_id = scores_md['metadata_id']
        y_score_site = scores_md['scores']
        site_id_arr = cov_helper.site_id_arr_map[metadata_id]
        assert y_score_site.shape == site_id_arr.shape

        # argsort is ascending, so the last n positions hold the n highest scores
        # TODO need to compute ranks if there are ties; for now, we'll assume there aren't any ties
        # (np.argsort's default sort is not stable, so the order among tied scores is unspecified)
        sort_inds = np.argsort(y_score_site)
        recs.append(list(site_id_arr[sort_inds[-cov_helper.n:]]))

    recced_sites = set().union(*recs)
    nonrecced_sites = cov_helper.eligible_sites - recced_sites
    # count the recs actually made rather than assuming 1000 batches of 5
    n_recs_made = sum(len(rec) for rec in recs)

    # fraction of (non-)recommended sites that have at least one previous interaction
    recced_inted = _safe_ratio(
        len(recced_sites & cov_helper.sites_with_previous_ints), len(recced_sites)
    )
    nonrecced_inted = _safe_ratio(
        len(nonrecced_sites & cov_helper.sites_with_previous_ints), len(nonrecced_sites)
    )

    # per-batch age statistics of the recommended sites
    min_ages = []
    median_ages = []
    for rec in recs:
        ages = np.array([
            cov_helper.timestamp - cov_helper.site_first_journal_timestamp_map[site_id]
            for site_id in rec
        ])
        ages = ages / 1000 / 60 / 60 / 24 / 7  # convert to weeks
        assert np.all(ages > 0)
        min_ages.append(ages.min())
        median_ages.append(np.median(ages))
    mean_min_age = np.mean(min_ages) if min_ages else float('nan')
    mean_median_age = np.mean(median_ages) if median_ages else float('nan')

    return {
        'n_recced_sites': len(recced_sites),
        'n_nonrecced_sites': len(nonrecced_sites),
        'pct_eligible_recced': _safe_ratio(len(recced_sites), len(cov_helper.eligible_sites)),
        'pct_unique_recs': _safe_ratio(len(recced_sites), n_recs_made),
        'pct_recced_with_int': recced_inted,
        'pct_nonrecced_with_int': nonrecced_inted,
        'pct_recced_without_int': 1 - recced_inted,
        'pct_nonrecced_without_int': 1 - nonrecced_inted,
        'ratio_int': _safe_ratio(recced_inted, nonrecced_inted),
        'ratio_noint': _safe_ratio(1 - recced_inted, 1 - nonrecced_inted),
        'mean_min_age': mean_min_age,
        'mean_median_age': mean_median_age,
    }

In [None]:
# score the study and tuned models
# NOTE(review): presumably this populates ev.models[name]['coverage_scores'], which the next cell reads; confirm against the ev implementation
ev.get_scores(subset=study_models + tuned_models)

In [None]:
# one row of coverage metrics per model, labelled by the group ('study' or 'tuned') it came from
coverage_metrics_list = []
for model_name in study_models + tuned_models:
    model_group = 'study' if model_name in study_models else 'tuned'
    row = {'model': model_group, 'model_name': model_name}
    row.update(compute_coverage_metrics(ev.models[model_name]['coverage_scores'], cov_helper))
    coverage_metrics_list.append(row)
pd.DataFrame(coverage_metrics_list)

In [None]:
# median number of distinct recommended sites within each model group
cov_df = pd.DataFrame(coverage_metrics_list)
median_recced_by_group = cov_df.groupby('model')['n_recced_sites'].median()
median_recced_by_group.reset_index()

In [None]:
# pull out a single model's coverage scores for a closer look
model_coverage_scores = ev.models['field_study_model_experiment_219']['coverage_scores']
len(model_coverage_scores)

In [None]:
# inspect one entry; the functions in this section read its 'metadata_id' and 'scores' keys
model_coverage_scores[0]

In [None]:
def create_rec_df(model_coverage_scores, cov_helper):
    """Build a DataFrame with one rec batch (top-n scored site ids) per scored entry.

    Args:
        model_coverage_scores: iterable of dicts, each with a 'metadata_id' key and a
            'scores' array of the same shape as cov_helper.site_id_arr_map[metadata_id].
        cov_helper: object providing n (batch size) and site_id_arr_map.

    Returns:
        DataFrame with columns 'metadata_id' and 'recced_site_ids'; each
        'recced_site_ids' value is a list of the site ids with the n highest scores,
        ordered from lowest to highest score.
    """
    rows = []
    for entry in model_coverage_scores:
        md_id = entry['metadata_id']
        scores = entry['scores']
        candidate_site_ids = cov_helper.site_id_arr_map[md_id]
        assert scores.shape == candidate_site_ids.shape

        # ascending argsort: the trailing n positions index the highest scores
        # TODO ties are assumed not to occur; the order argsort gives tied scores is unclear
        top_positions = np.argsort(scores)[-cov_helper.n:]
        rows.append({
            'metadata_id': md_id,
            'recced_site_ids': list(candidate_site_ids[top_positions]),
        })

    return pd.DataFrame(rows)

In [None]:
# one rec batch per entry of the selected model's coverage scores
rec_df = create_rec_df(model_coverage_scores, cov_helper)
len(rec_df)

In [None]:
# peek at the first row (columns: metadata_id, recced_site_ids)
rec_df.head(1)

In [None]:
# collect the source users of every non-ineligible entry that precedes the first 'predict' entry
already_initiated_user_ids = set()
ds = []  # not populated in this cell
for md in md_list:
    md_type = md['type']
    if md_type == 'ineligible':
        continue
    if md_type == 'predict':
        # stop scanning md_list at the first 'predict' entry
        break
    # set.add is idempotent, so no membership check is needed first
    already_initiated_user_ids.add(md['source_user_id'])
len(already_initiated_user_ids)

In [None]:
# map metadata_id -> source_user_id for the 'predict' entries only
metadata_id_to_user_list = {
    md['metadata_id']: md['source_user_id']
    for md in md_list
    if md['type'] == 'predict'
}

In [None]:
# look up the requesting user for each rec batch; the direct dict lookup raises
# KeyError for an unknown metadata_id instead of silently producing NaN
rec_df['user_id'] = rec_df['metadata_id'].map(metadata_id_to_user_list.__getitem__)

In [None]:
# flag the rec batches whose user is in already_initiated_user_ids
rec_df['has_already_initiated'] = rec_df['user_id'].isin(already_initiated_user_ids)
rec_df['has_already_initiated'].value_counts()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# one panel per has_already_initiated group: True on the left axis, False on the right
for has_already_initiated, group in rec_df.groupby('has_already_initiated'):
    # flatten this group's rec batches into a single list of site ids (repeats kept)
    site_ids = [site_id for batch in group.recced_site_ids for site_id in batch]
    unique_sites_recced = set(site_ids)
    print(has_already_initiated, len(unique_sites_recced))

    # value_counts: site id -> number of times that site was recommended
    value_counts = pd.Series(site_ids).value_counts()
    ax = axes[0] if has_already_initiated else axes[1]
    # bin edges run 0, 5, ..., 95, so counts above 95 are not drawn
    ax.hist(value_counts, bins=np.arange(0, 100, 5), log=True)
    ax.set_xlabel("Number of times recommended")
    ax.set_ylabel("Number of sites")
    ax.set_title(f"{has_already_initiated=}")

    # printed: number of distinct sites, median and mean times recommended, share recommended exactly once
    n_sites = len(value_counts)
    n_recced_once = (value_counts == 1).sum()
    print(f"{n_sites} {value_counts.median()} {value_counts.mean():.2f} {n_recced_once / n_sites:.2%}")
plt.show()