In [None]:
# Notebook setup: enable the autoreload extension in mode 2 and render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

import os
import json
import torch
import pickle
from datetime import datetime
import pytz
import dateutil
from dateutil.relativedelta import relativedelta
from tqdm import tqdm

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Logging setup -- run this cell only once: each run of the manual branch below
# attaches one more handler to the root logger.
import logging

use_cbrec_logging = True
if not use_cbrec_logging:
    # Demo of a manual configuration. With use_cbrec_logging enabled, logging is
    # instead configured by the set_up_logging call made in a later cell.
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    stream_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    )

    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.addHandler(stream_handler)

In [None]:
from pathlib import Path
# Ask git for the repository root; the IPython shell capture yields a list of output lines.
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())

import sys
# Put <repo root>/src on the import path before the cbrec imports below
# (presumably where the cbrec package lives -- confirm).
sys.path.append(os.path.join(git_root_dir, 'src'))

import cbrec.genconfig

# create a config, which is needed by lots of the components for resolving paths, etc.
config = cbrec.genconfig.Config()

import cbrec.evaluation
import cbrec.reccontext
import cbrec.featuredb
import cbrec.torchmodel
import cbrec.utils
import cbrec.logutils
import cbrec.feature_loader

# Configure logging through the project's helper.
cbrec.logutils.set_up_logging()

# turn off matplotlib logging
# which can be quite verbose and usually is not useful
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

In [None]:
# Newline-delimited JSON file of test metadata for the Adam random-search experiment (per its path).
metadata_filepath = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/torch_experiments/modeling/adam_randomsearch_experiment_20220213194145/outputs/adam_randomsearch_experiment_test_metadata.ndjson"
# Fail early and name the offending path; isfile also rejects a directory, which open() could not read.
assert os.path.isfile(metadata_filepath), f"metadata file not found: {metadata_filepath}"

In [None]:
# Parse the newline-delimited JSON metadata file into a list with one parsed object per line.
md_list = []
with open(metadata_filepath, 'r') as infile:
    for line in infile:
        # Skip blank lines (e.g. an extra empty line at the end), which json.loads would reject.
        if not line.strip():
            continue
        md_list.append(json.loads(line))
len(md_list)

In [None]:
# Columns pulled from each metadata record into the tabular view, in display order.
metadata_columns = [
    'metadata_id',
    'timestamp',
    'source_user_id',
    'target_site_id',
    'is_test_period',
    'n_source_sites',
    'n_target_users',
    'source_user_is_existing',
    'n_existing_users_on_target_site',
    'source_user_is_eligible',
    'target_site_has_eligible_user',
    'is_self_initiation',
    'is_initiation_eligible',
    # and the features that come with being initiation eligible...
    'n_eligible_users',
    'n_eligible_coauthors',
    'n_source_usps',
    'n_active_user_ids',
    'source_user_is_active',
    'n_active_target_users',
    'n_target_usps',
    'n_eligible_inactive_users',
    'n_existing_initiations_from_source_user_id',
    'n_candidate_user_ids',
    'n_candidate_usps',
    # test-only features
    'test_target_usp_adjustment',
    'source_user_initiated_in_train_period',
    'target_site_initiated_with_in_train_period',
]

# One row per metadata record, restricted to the columns listed above.
df = pd.DataFrame(md_list, columns=metadata_columns)
print(len(df))
df.head()

In [None]:
# Flatten the per-model metrics into one record per (model, metadata record) pair.
# Each record is built as a new dict, so the dicts held in md_list are not modified
# and this cell can be re-run without side effects on the loaded data.
METRICS_SUFFIX = '_metrics'
BASELINE_KEY = 'baseline_metrics'

# Model names come from the keys of the first record only: every '<name>_metrics'
# key except the baseline container.
model_names = [
    key[:-len(METRICS_SUFFIX)]
    for key in md_list[0].keys()
    if key.endswith(METRICS_SUFFIX) and key != BASELINE_KEY
]

metrics_list = []
for model_name in model_names:
    for md in md_list:
        metrics_list.append({
            **md[model_name + METRICS_SUFFIX],
            'model_name': model_name,
            'metadata_id': md['metadata_id'],
        })

# also include the baseline results, stored as {baseline name: metrics} under a single key
for md in tqdm(md_list):
    for model_name, metrics in md[BASELINE_KEY].items():
        metrics_list.append({
            **metrics,
            'model_name': model_name,
            'metadata_id': md['metadata_id'],
        })

mdf = pd.DataFrame(metrics_list)
len(mdf)

In [None]:
# NOTE(review): unlike the earlier definition of model_names, the '_metrics' suffix is
# kept here, so this holds metric keys rather than bare model names -- confirm intended.
model_names = [
    key
    for key in md_list[0]
    if key.endswith('_metrics') and key != 'baseline_metrics'
]

In [None]:
# Per-row ranking metrics derived from target_rank: its reciprocal, and
# hit indicators (rank within the top k) for several cutoffs k.
target_rank = mdf['target_rank']
mdf['reciprocal_rank'] = 1 / target_rank
for cutoff in (1, 3, 5, 50):
    mdf[f'hr@{cutoff}'] = target_rank.le(cutoff)

In [None]:
# Number of metric rows per model.
mdf['model_name'].value_counts()

In [None]:
# Leaderboard: mean of every metric per model, highest MRR first.
metric_columns = ['reciprocal_rank', 'hr@1', 'hr@3', 'hr@5', 'hr@50']
(
    mdf.groupby('model_name')[metric_columns]
    .mean()
    .rename(columns={'reciprocal_rank': 'mrr'})
    .sort_values(by='mrr', ascending=False)
)

In [None]:
# best model: adam_randomsearch_experiment_40
# configs:
#   train_Adam_beta1: 0.8114556965716483
#   train_Adam_beta2: 0.9027771890512277
#   train_Adam_eps: 0.037702454558030354
#   train_lr_init: 0.09156343147932729

In [None]:
# Per-model mean of every metric, left in groupby order (indexed by model_name).
metric_columns = ['reciprocal_rank', 'hr@1', 'hr@3', 'hr@5', 'hr@50']
new = (
    mdf.groupby('model_name')[metric_columns]
    .mean()
    .rename(columns={'reciprocal_rank': 'mrr'})
)

In [None]:
# Histogram of the per-model mean reciprocal rank (one value per model in `new`).
num_bins = 20
new_list = new['mrr']
n, bins, patches = plt.hist(new_list, num_bins, facecolor='blue', alpha=0.5)
# Label the figure so it can be told apart from the other metric histograms.
plt.xlabel('mrr')
plt.ylabel('number of models')
plt.title('Distribution of per-model mrr');

In [None]:
# Histogram of the per-model mean hr@1 (one value per model in `new`).
num_bins = 20
new_list = new['hr@1']
n, bins, patches = plt.hist(new_list, num_bins, facecolor='blue', alpha=0.5)
# Label the figure so it can be told apart from the other metric histograms.
plt.xlabel('hr@1')
plt.ylabel('number of models')
plt.title('Distribution of per-model hr@1');

In [None]:
# Histogram of the per-model mean hr@3 (one value per model in `new`).
num_bins = 20
new_list = new['hr@3']
n, bins, patches = plt.hist(new_list, num_bins, facecolor='blue', alpha=0.5)
# Label the figure so it can be told apart from the other metric histograms.
plt.xlabel('hr@3')
plt.ylabel('number of models')
plt.title('Distribution of per-model hr@3');

In [None]:
# Histogram of the per-model mean hr@5 (one value per model in `new`).
num_bins = 20
new_list = new['hr@5']
n, bins, patches = plt.hist(new_list, num_bins, facecolor='blue', alpha=0.5)
# Label the figure so it can be told apart from the other metric histograms.
plt.xlabel('hr@5')
plt.ylabel('number of models')
plt.title('Distribution of per-model hr@5');

In [None]:
# Histogram of the per-model mean hr@50 (one value per model in `new`).
num_bins = 20
new_list = new['hr@50']
n, bins, patches = plt.hist(new_list, num_bins, facecolor='blue', alpha=0.5)
# Label the figure so it can be told apart from the other metric histograms.
plt.xlabel('hr@50')
plt.ylabel('number of models')
plt.title('Distribution of per-model hr@50');

In [None]:
# MRR of each Adam random-search model, to be plotted against its config parameters.
# Rows are picked by model name instead of a hard-coded positional slice, so the
# selection cannot silently shift when the number or names of the other models change.
# `new` is indexed by model_name in groupby (sorted) order, and that order is preserved.
# Assumes the Adam models are exactly those whose name starts with this prefix,
# matching the config file names -- confirm.
adam_model_prefix = 'adam_randomsearch_experiment_'
adam_models = new.loc[new.index.str.startswith(adam_model_prefix), 'mrr']
print(adam_models)

In [None]:
# Read the Adam hyperparameters of every random-search run from its JSON config file.
config_path = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/torch_experiments/modeling/adam_randomsearch_experiment_20220213194145/configs"
beta1_lst = []
beta2_lst = []
eps_lst = []
lr_init_lst = []

# Visit the run indexes in the lexicographic order of their string form
# (0, 1, 10, 11, ..., 19, 2, 20, ...), i.e. the order in which the model names sort,
# so that each list lines up position-by-position with rows sorted by model_name.
sorted_indexes = sorted(range(100), key=str)

for i in sorted_indexes:
    with open(f"{config_path}/adam_randomsearch_experiment_{i}.json", 'r') as fp:
        # Deliberately not named `config`: that name holds the cbrec Config object
        # created earlier in the notebook and must not be overwritten with a dict.
        model_config = json.load(fp)
    beta1_lst.append(model_config['train_Adam_beta1'])
    beta2_lst.append(model_config['train_Adam_beta2'])
    eps_lst.append(model_config['train_Adam_eps'])
    lr_init_lst.append(model_config['train_lr_init'])

In [None]:
# MRR of each Adam model against its train_Adam_beta1 setting.
plt.scatter(beta1_lst, adam_models)
# Label the axes so the figure can be told apart from the other parameter plots.
plt.xlabel('train_Adam_beta1')
plt.ylabel('mrr')
plt.title('mrr vs. train_Adam_beta1');

In [None]:
# MRR of each Adam model against its train_Adam_beta2 setting.
plt.scatter(beta2_lst, adam_models)
# Label the axes so the figure can be told apart from the other parameter plots.
plt.xlabel('train_Adam_beta2')
plt.ylabel('mrr')
plt.title('mrr vs. train_Adam_beta2');

In [None]:
# MRR of each Adam model against its train_Adam_eps setting.
plt.scatter(eps_lst, adam_models)
# Label the axes so the figure can be told apart from the other parameter plots.
plt.xlabel('train_Adam_eps')
plt.ylabel('mrr')
plt.title('mrr vs. train_Adam_eps');

In [None]:
# MRR of each Adam model against its train_lr_init setting.
plt.scatter(lr_init_lst, adam_models)
# Label the axes so the figure can be told apart from the other parameter plots.
plt.xlabel('train_lr_init')
plt.ylabel('mrr')
plt.title('mrr vs. train_lr_init');