Debug Eval
===


In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

In [None]:
import os
import json
import sys
from tqdm import tqdm

import dateutil.parser
from dateutil.relativedelta import relativedelta
from datetime import datetime, timedelta
import pytz

In [None]:
from pathlib import Path
import subprocess

# Ask git for the repository root. check_output raises CalledProcessError if git
# exits non-zero (e.g. the notebook is run outside a checkout), so a failure is
# reported here rather than surfacing later as a confusing path/import error.
git_root_dir = Path(
    subprocess.check_output(['git', 'rev-parse', '--show-toplevel']).decode().strip()
)
git_root_dir

In [None]:
import sys
# Add the repository's `src` directory to the import path; this must happen
# before the project package import on the next line.
sys.path.append(os.path.join(git_root_dir, 'src'))
import cbrec.genconfig

In [None]:
# Instantiate the project's TestConfig; later cells read file paths and
# timestamp boundaries from it.
config = cbrec.genconfig.TestConfig()

In [None]:
# Read the metadata file one JSON document per line, keeping every record in
# md_list and, additionally, the records flagged as both test-period and
# initiation-eligible in test_md_list.
md_list = []
test_md_list = []
with open(config.metadata_filepath, 'r') as metadata_file:
    for raw_line in tqdm(metadata_file):
        record = json.loads(raw_line)
        md_list.append(record)

        is_eligible_test_record = record['is_test_period'] and record['is_initiation_eligible']
        if is_eligible_test_record:
            test_md_list.append(record)
len(md_list), len(test_md_list)

In [None]:
# Show the field names of one arbitrarily chosen metadata record.
sample_md = md_list[5458]
list(sample_md)

In [None]:
# Fields to pull out of each metadata record, in display order.
metadata_columns = [
    'metadata_id',
    'timestamp',
    'source_user_id',
    'target_site_id',
    'is_test_period',
    'n_source_sites',
    'n_target_users',
    'source_user_is_existing',
    'n_existing_users_on_target_site',
    'source_user_is_eligible',
    'target_site_has_eligible_user',
    'is_self_initiation',
    'is_initiation_eligible',
    # and the features that come with being initiation eligible...
    'n_eligible_users',
    'n_source_usps',
    'n_active_user_ids',
    'source_user_is_active',
    'n_active_target_users',
    'n_target_usps',
    'n_eligible_inactive_users',
    'n_existing_initiations_from_source_user_id',
    'n_candidate_user_ids',
    'n_candidate_usps',
    # test-only features
    'test_target_usp_adjustment',
]

# One row per metadata record, restricted to the columns listed above.
df = pd.DataFrame(md_list, columns=metadata_columns)
print(len(df))
df.head()

In [None]:
# Source-user eligibility (rows) against the combination of
# "target site has an eligible user" and "is self-initiation" (columns).
pd.crosstab(
    df.source_user_is_eligible,
    [df.target_site_has_eligible_user, df.is_self_initiation],
)

In [None]:
# Compare "site has any existing user" with "site has any target user".
has_existing_user = (df.n_existing_users_on_target_site > 0).rename("At least one existing user on site?")
has_target_user = (df.n_target_users > 0).rename("At least one eligible user on site?")
pd.crosstab(has_existing_user, has_target_user)

In [None]:
# sdf: only the initiation-eligible rows; report its size and how many of
# those rows fall in the test period.
eligible_mask = df.is_initiation_eligible
sdf = df[eligible_mask]
n_eligible, n_eligible_test = len(sdf), np.sum(sdf.is_test_period)
n_eligible, n_eligible_test

In [None]:
# Quick look at the generated triples: five random eligible rows that are
# not in the test period.
not_test_period = ~sdf.is_test_period
sdf[not_test_period].sample(n=5)

In [None]:
# Among eligible initiations: more than one source site vs. more than one target user.
multiple_source_sites = (sdf.n_source_sites > 1).rename('multiple_source_sites')
multiple_target_users = (sdf.n_target_users > 1).rename('multiple_target_users')
pd.crosstab(multiple_source_sites, multiple_target_users)

In [None]:
# Log-scaled histogram of n_target_usps over eligible initiations,
# using unit-width bins with edges 1 through 9.
fig, ax = plt.subplots(figsize=(8, 8))

unit_bins = np.arange(1, 10)
ax.hist(sdf.n_target_usps, bins=unit_bins, log=True)

ax.set(
    title="Distribution of number of target USPs",
    xlabel="# of Target USPs for Initiation",
    ylabel="Initiation Count",
)

plt.show()

In [None]:
# Histogram (20 bins) of n_candidate_usps over eligible initiations.
fig, ax = plt.subplots(figsize=(8, 8))

ax.hist(sdf.n_candidate_usps, bins=20)

ax.set(
    title="Distribution of number of candidate USPs",
    xlabel="# of Candidate USPs for Initiation",
    ylabel="Initiation Count",
)

plt.show()

In [None]:
# Per initiation: candidate USPs minus candidate users, shown as a log-scaled
# histogram with the median marked by a dashed line.
diff = sdf.n_candidate_usps - sdf.n_candidate_user_ids
median_diff = np.median(diff)

fig, ax = plt.subplots(figsize=(8, 8))
ax.hist(diff, bins=20, log=True, align='right')
ax.axvline(median_diff, color='black', linestyle='--', alpha=0.8, label=f'Median ({median_diff})')

ax.set_title("Distribution of total candidate USPs - total candidate users i.e. basically the number of multi-site authors")
ax.set_xlabel("# of extra USPs generated from user candidates")
ax.set_ylabel("Initiation Count")
ax.legend()

plt.show()

In [None]:
# Histogram of sdf.timestamp between 2014 and 2019, with the configured
# generation/test boundaries overlaid as dashed vertical lines.
fig, ax = plt.subplots(1, 1, figsize=(8,8))
cmap = matplotlib.cm.viridis
# Title and x-label describe what is actually plotted (timestamps); the previous
# text was left over from the candidate-USP histogram.
ax.set_title("Distribution of initiation timestamps")
ax.set_xlabel("Year")
ax.set_ylabel("Initiation Count")

end_time = dateutil.parser.parse("2019-01-01").replace(tzinfo=pytz.UTC)
start_time = dateutil.parser.parse("2014-01-01").replace(tzinfo=pytz.UTC)
# Bin edges are epoch seconds * 1000, i.e. this assumes sdf.timestamp and the
# config boundaries are epoch milliseconds -- TODO confirm against the generator.
bins = np.linspace(start_time.timestamp() * 1000, end_time.timestamp() * 1000)

# One tick per year boundary. Labels come straight from the timezone-aware
# datetimes, which avoids datetime.utcfromtimestamp (deprecated in Python 3.12).
tick_times = [start_time + relativedelta(years=year) for year in range(6)]
xticks = [tick_time.timestamp() * 1000 for tick_time in tick_times]
xticklabels = [tick_time.strftime("%Y") for tick_time in tick_times]
ax.set_xticks(xticks)
ax.set_xticklabels(xticklabels)

ax.hist(sdf.timestamp, bins=bins)

ax.axvline(config.generation_start_timestamp, color=cmap(0), linestyle='--', alpha=0.8, label='Start of Training Period')
ax.axvline(config.generation_stop_timestamp, color=cmap(0.5), linestyle='--', alpha=0.8, label='End of Data')
ax.axvline(config.test_generation_start_timestamp, color='black', linestyle='--', alpha=0.8, label='Start of Test Period')
ax.legend()

plt.show()

## Test initiation analysis

In [None]:
# test_df: rows that are both initiation-eligible and in the test period.
eligible_test_mask = df.is_initiation_eligible & df.is_test_period
test_df = df[eligible_test_mask]
len(test_df)

In [None]:
# Preview the first rows of the eligible test-period frame.
test_df.head()

In [None]:
# Tabulate how often each test_target_usp_adjustment value occurs in the test rows.
adjustment_counts = test_df.test_target_usp_adjustment.value_counts()
adjustment_counts = adjustment_counts.rename("Were test targets forced into candidates due to inactivity? (test_target_usp_adjustment)")
pd.DataFrame(adjustment_counts)

In [None]:

# Log-scaled histogram of n_existing_initiations_from_source_user_id over
# eligible initiations, with unit-width bins whose edges run 0 through 14.
fig, ax = plt.subplots(figsize=(8, 8))

count_bins = np.arange(0, 15)
ax.hist(sdf.n_existing_initiations_from_source_user_id, bins=count_bins, log=True)

ax.set(
    title="Distribution of previous initiation counts",
    xlabel="# of existing initiations",
    ylabel="Initiation Count",
)

plt.show()

In [None]:
# Test rows: is the source user active vs. is at least one target user active (with totals).
source_is_active = test_df.source_user_is_active.rename("Source is active")
any_target_active = (test_df.n_active_target_users > 0).rename("At least one target is active")
pd.crosstab(source_is_active, any_target_active, margins=True)

In [None]:
# Confusion matrix of median candidate users, for active + eligible axes
# Users already initiated with by the source are added back onto the candidate count.
candidates_plus_existing = test_df.n_candidate_user_ids + test_df.n_existing_initiations_from_source_user_id

median_noneligible_active = np.median(
    test_df.n_active_user_ids - test_df.n_candidate_user_ids - test_df.n_existing_initiations_from_source_user_id
)
median_eligible_inactive = np.median(test_df.n_eligible_inactive_users)
median_eligible_active = np.median(candidates_plus_existing)

pd.DataFrame(
    [
        [np.nan, median_noneligible_active],
        [median_eligible_inactive, median_eligible_active],
    ],
    index=['Non-eligible', 'Eligible'],
    columns=['Inactive', 'Active'],
)

In [None]:
# Log-scaled histogram of the number of candidate users per test initiation.
fig, ax = plt.subplots(1, 1, figsize=(8,8))
ax.set_title("Distribution of candidate users (for test initiations)")
ax.set_xlabel("# of candidate user ids")
ax.set_ylabel("Initiation Count")

# Plot test_df (eligible AND test-period rows) so the data matches the title;
# sdf would also include the non-test eligible initiations.
ax.hist(test_df.n_candidate_user_ids, bins=20, log=True)

plt.show()

### Baselines

In [None]:
# Inspect the metrics stored for the 'NaiveNetwork' baseline on the first test record.
test_md_list[0]['baseline_metrics']['NaiveNetwork']

In [None]:
# Baseline names are taken from the keys of the first test record's metrics.
first_test_md = test_md_list[0]
baselines = first_test_md['baseline_metrics'].keys()
baselines

In [None]:
# Build one DataFrame per baseline: a row per test record holding that
# baseline's metrics plus the record's metadata_id.
baselines = test_md_list[0]['baseline_metrics'].keys()
print(baselines)
baseline_df_dict = {}
for baseline in tqdm(baselines):
    # Build a new dict per row instead of writing 'metadata_id' into the
    # metrics dict itself, so the records in test_md_list are left untouched.
    metrics_list = [
        {**md['baseline_metrics'][baseline], 'metadata_id': md['metadata_id']}
        for md in test_md_list
    ]
    bdf = pd.DataFrame(metrics_list)
    baseline_df_dict[baseline] = bdf
    print(baseline, len(bdf))
len(baseline_df_dict)

In [None]:
# Print min / median / max of the target raw score for each baseline.
for baseline in baselines:
    raw_scores = baseline_df_dict[baseline].target_raw_score
    lowest, middle, highest = np.min(raw_scores), np.median(raw_scores), np.max(raw_scores)
    print(f"{baseline:>25} {lowest:10.2f} {middle:10.2f} {highest:10.2f}")

In [None]:
# Display the per-record metrics frame for the 'MostJournalsRecently' baseline.
baseline_df_dict['MostJournalsRecently']

In [None]:
# 3x3 grid, one panel per baseline: histogram of the target's raw score with
# the median marked by a dashed line.
fig, axes = plt.subplots(3, 3, figsize=(14, 16))

for ax, baseline in zip(axes.ravel(), baselines):
    raw_scores = baseline_df_dict[baseline].target_raw_score
    median_score = np.median(raw_scores)

    ax.hist(raw_scores, bins=20)
    ax.axvline(median_score, label=f'Median ({median_score:.1f})', color='black', linestyle='--', alpha=0.9)

    ax.set_title(baseline)
    ax.set_xlabel("Raw Score")
    ax.legend()

plt.show()

In [None]:
# 3x3 grid, one panel per baseline: histogram of the target's rank with the
# median marked by a dashed line.
fig, axes = plt.subplots(3, 3, figsize=(14, 16))

for ax, baseline in zip(axes.ravel(), baselines):
    ranks = baseline_df_dict[baseline].target_rank
    median_rank = np.median(ranks)

    ax.hist(ranks, bins=20)
    ax.axvline(median_rank, label=f'Median ({median_rank:.1f})', color='black', linestyle='--', alpha=0.9)

    ax.set_title(baseline)
    ax.legend()

plt.show()

In [None]:
# 3x3 grid, one panel per baseline: log-scaled histogram of reciprocal rank
# on [0, 1], with the mean (labelled MRR) marked by a dashed line.
fig, axes = plt.subplots(3, 3, figsize=(14, 16))

for ax, baseline in zip(axes.ravel(), baselines):
    reciprocal_ranks = baseline_df_dict[baseline].reciprocal_rank
    mean_rr = np.mean(reciprocal_ranks)

    ax.hist(reciprocal_ranks, bins=np.linspace(0, 1, num=20), log=True)
    ax.axvline(mean_rr, label=f'MRR ({mean_rr:.3f})', color='black', linestyle='--', alpha=0.9)

    ax.set_title(baseline)
    ax.legend()

plt.show()

In [None]:
# 3x3 grid, one panel per baseline: overlaid log-scaled histograms of
# NDCG@1/5/10/50, each with its mean marked in the matching colour.
fig, axes = plt.subplots(3, 3, figsize=(14, 16))
cmap = matplotlib.cm.Set1

for ax, baseline in zip(axes.ravel(), baselines):
    bdf = baseline_df_dict[baseline]
    bins = np.linspace(0, 1, num=20)
    ax.set_title(baseline)

    for color_index, k in enumerate([1, 5, 10, 50]):
        ndcg_values = bdf[f'ndcg_{k}']
        mean_ndcg = np.mean(ndcg_values)
        series_color = cmap(color_index)

        ax.hist(ndcg_values, bins=bins, log=True, alpha=0.9, color=series_color)
        ax.axvline(mean_ndcg, label=f'NDCG@{k} ({mean_ndcg:.3f})', color=series_color, linestyle='--', alpha=0.9)

    ax.legend()

plt.show()

In [None]:
# 3x3 grid, one panel per baseline: log-count hexbin of target raw score (x)
# against target rank (y).
fig, axes = plt.subplots(3, 3, figsize=(14, 16))

for ax, baseline in zip(axes.ravel(), baselines):
    bdf = baseline_df_dict[baseline]
    raw_scores = bdf.target_raw_score
    low_quantile = np.quantile(raw_scores, 0.1)

    # When the 10th-percentile score is more than 4x the median in magnitude,
    # raise every score below that percentile up to it so the long low tail
    # does not dominate the x-axis.
    is_clipped = np.abs(low_quantile / np.median(raw_scores)) > 4
    x = np.maximum(raw_scores, low_quantile) if is_clipped else raw_scores
    y = bdf.target_rank

    ax.hexbin(x, y, gridsize=15, bins='log')
    ax.set_title(baseline + " (low end clipped)" if is_clipped else baseline)
    ax.set_xlabel("Score")
    ax.set_ylabel("Rank")

plt.show()