In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

import os
import json
import torch
import pickle
from datetime import datetime
import pytz
import dateutil
from dateutil.relativedelta import relativedelta
from tqdm import tqdm

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Logging bootstrap -- execute this cell a single time per kernel session.
import logging

# True: skip the manual configuration below; the notebook calls
# cbrec.logutils.set_up_logging() in a later cell instead.
use_cbrec_logging = True
if not use_cbrec_logging:
    # Stand-alone demonstration of wiring up logging by hand:
    # a DEBUG-level stream handler attached to a DEBUG-level root logger.
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    stream_handler.setLevel(logging.DEBUG)

    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.addHandler(stream_handler)

In [None]:
import subprocess
from pathlib import Path

# Resolve the top-level directory of the enclosing git checkout.
# check_output raises CalledProcessError when git exits non-zero (e.g. the notebook is
# not inside a repository) instead of silently capturing git's error text as the "path".
git_root_dir = Path(
    subprocess.check_output(['git', 'rev-parse', '--show-toplevel'], text=True).strip()
)
git_root_dir

In [None]:
# Add <git root>/src to the module search path -- presumably where the project-local
# `cbrec` package lives (TODO confirm) -- then import the cbrec submodules.
import sys
sys.path.append(os.path.join(git_root_dir, 'src'))
import cbrec.genconfig
# Instantiate the project's Config object; its contents are defined in cbrec.genconfig.
config = cbrec.genconfig.Config()
import cbrec.evaluation
import cbrec.reccontext
import cbrec.featuredb
import cbrec.torchmodel
import cbrec.utils
import cbrec.logutils
import cbrec.feature_loader
# Delegate logging configuration to the project's helper.
cbrec.logutils.set_up_logging()
# Quiet matplotlib logging (raise its logger's level to WARNING),
# which can be quite verbose and usually is not useful.
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

In [None]:
from glob import glob
# Directory that is scanned for .ndjson metrics files further down in this cell.
# NOTE(review): absolute, machine-specific path; the assert fails fast when it is not present.
output_dir = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/torch_experiments/modeling/concatnet_20220329151435/outputs"
assert os.path.exists(output_dir)
def identify_model_filepaths(model_dir):
    """Return the paths of all ``.ndjson`` files directly inside ``model_dir``, sorted.

    Args:
        model_dir: Directory to search (not searched recursively).

    Returns:
        A lexicographically sorted list of matching file paths.

    Raises:
        ValueError: If ``model_dir`` does not exist or contains no ``.ndjson`` files.
    """
    logger = logging.getLogger("cbrec.modeling.submitEvalFromDirectory.identify_model_filepaths")
    if not os.path.exists(model_dir):
        raise ValueError(f"Dir '{model_dir}' does not exist.")
    # glob yields matches in arbitrary, filesystem-dependent order; sort so that callers
    # which treat the first file specially behave the same on every run and machine.
    model_filepaths = sorted(glob(os.path.join(model_dir, '*.ndjson')))
    if len(model_filepaths) == 0:
        raise ValueError(f"No .ndjson files in dir '{model_dir}'.")
    logger.info(f"Identified {len(model_filepaths)} model filepaths in dir {model_dir}.")
    return model_filepaths
model_filepaths = identify_model_filepaths(output_dir)
all_dfs = []  # not used in this cell; kept so any other cell that references it still finds it
num_files = len(model_filepaths)

METRICS_SUFFIX = '_metrics'
BASELINE_KEY = 'baseline_metrics'


def _read_ndjson(filepath):
    """Parse a newline-delimited JSON file into a list of records.

    Whitespace-only lines are skipped so that a blank (e.g. trailing) line does not
    raise a JSON decode error. The file is closed before the records are returned.
    """
    with open(filepath, 'r') as infile:
        return [json.loads(line) for line in infile if line.strip()]


def _flatten_metrics(md_list, include_baselines):
    """Turn the records of one file into a flat list of per-model metrics dicts.

    Model names are taken from the keys of the first record that end in ``_metrics``
    (excluding ``baseline_metrics``). Each metrics dict is tagged in place with
    ``model_name`` and the record's ``metadata_id``. When ``include_baselines`` is true,
    every entry of each record's ``baseline_metrics`` mapping is tagged and appended too.
    """
    rows = []
    model_names = [
        key[:-len(METRICS_SUFFIX)]
        for key in md_list[0]
        if key.endswith(METRICS_SUFFIX) and key != BASELINE_KEY
    ]
    for model_name in model_names:
        for md in md_list:
            metrics = md[model_name + METRICS_SUFFIX]
            metrics['model_name'] = model_name
            metrics['metadata_id'] = md['metadata_id']
            rows.append(metrics)
    if include_baselines:
        for md in md_list:
            for model_name, metrics in md[BASELINE_KEY].items():
                metrics['model_name'] = model_name
                metrics['metadata_id'] = md['metadata_id']
                rows.append(metrics)
    return rows


metrics_list = []
# Number of files whose baseline metrics were included. Baselines are taken from a
# single file only (the first one that has records).
# NOTE(review): this presumes baseline results are identical across files -- confirm.
count = 0
for metadata_filepath in tqdm(model_filepaths):
    md_list = _read_ndjson(metadata_filepath)
    if not md_list:
        # An empty file has no first record to read model names from; skip it.
        logging.getLogger(__name__).warning(f"No records in '{metadata_filepath}'; skipping.")
        continue
    include_baselines = count == 0
    metrics_list.extend(_flatten_metrics(md_list, include_baselines))
    if include_baselines:
        count = count + 1

if not metrics_list:
    raise ValueError(f"No metrics records found in the {num_files} files under '{output_dir}'.")

mdf = pd.DataFrame(metrics_list)
# Per-row reciprocal rank and hit-rate@k indicators derived from target_rank.
mdf['mrr'] = 1 / mdf.target_rank
for k in [1, 3, 5, 50]:
    mdf[f'hr@{k}'] = mdf.target_rank <= k
# Per-model means, best mean reciprocal rank first (the cell's displayed output).
mdf.groupby('model_name')[['mrr', 'hr@1', 'hr@3', 'hr@5', 'hr@50']].mean().sort_values(by='mrr', ascending=False)

In [None]:
# Scratch inspection: the bare GroupBy object keyed on model_name (no aggregation applied).
mdf.groupby('model_name')

In [None]:
# Select a 'vals' column from the per-model groups.
# NOTE(review): no visible cell creates a 'vals' column on mdf; unless the loaded metrics
# records contain that key, this lookup raises KeyError -- confirm or remove this cell.
mdf.groupby('model_name')["vals"]

In [None]:
# Scratch inspection: the grouped 'mrr' column before any aggregation.
mdf.groupby('model_name').mrr

In [None]:
# Mean reciprocal rank per model, written to stdout.
mean_mrr_by_model = mdf.groupby('model_name')['mrr'].mean()
print(mean_mrr_by_model)

In [None]:
# Mean reciprocal rank per model, ordered from highest to lowest.
mrr_means = mdf.groupby('model_name')['mrr'].mean()
mrr_means.sort_values(ascending=False)
