PyTorch Training
===

Experiments with PyTorch optimization of rec models.

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

In [None]:
import os
import re
import json
import sys
import pickle
from tqdm import tqdm

import sklearn
import sklearn.linear_model
import sklearn.preprocessing
from sklearn.pipeline import Pipeline

import dateutil.parser
from dateutil.relativedelta import relativedelta
from datetime import datetime, timedelta
import pytz

In [None]:
# HuggingFace packages
import transformers
import tokenizers
import torch

# more torch imports
import torchvision
import torchvision.transforms
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# evaluation
from scipy.stats import rankdata

In [None]:
from pathlib import Path
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

In [None]:
import sys
sys.path.append(os.path.join(git_root_dir, 'src'))
import cbrec.genconfig

In [None]:
config = cbrec.genconfig.Config()
#config.metadata_filepath += "_old"
#config.feature_db_filepath += "_old"

In [None]:
import cbrec.featuredb
import cbrec.utils
import cbrec.data
import cbrec.reccontext
import cbrec.evaluation
import cbrec.torchmodel
import cbrec.text.embeddingdb
import cbrec.text.journalid

In [None]:
import cbrec.logutils
cbrec.logutils.set_up_logging()

In [None]:
# turn off matplotlib logging
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

In [None]:
import sys
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
sys.path.append(caringbridge_core_path)
import cbcore.data.paths

## One-time startup: identifying journal_oids

### Training triples

In [None]:
import cbrec.feature_loader
fl = cbrec.feature_loader.FeatureLoader(config)

In [None]:
# Collect the journal ids needed by every usable training triple and write them
# (one id per line) to train_journal_oids.txt.
# A triple is usable only when the source, target and alt users each have at
# least 3 journal updates before the interaction timestamp.
jil = fl.journal_id_lookup

journal_oids = set()
n_not_enough_source = n_not_enough_target = n_not_enough_alt = n_skipped = 0
db = cbrec.featuredb.get_db_by_filepath(fl.config.feature_db_filepath)
with db:
    for triple_dict in tqdm(cbrec.featuredb.stream_triples(db), desc='Streaming train triples'):
        source_usp = (triple_dict['source_user_id'], triple_dict['source_site_id'])
        target_usp = (triple_dict['target_user_id'], triple_dict['target_site_id'])
        alt_usp = (triple_dict['alt_user_id'], triple_dict['alt_site_id'])
        source_journal_ids = jil.get_journal_updates_before(source_usp, triple_dict['interaction_timestamp'])
        target_journal_ids = jil.get_journal_updates_before(target_usp, triple_dict['interaction_timestamp'])
        alt_journal_ids = jil.get_journal_updates_before(alt_usp, triple_dict['interaction_timestamp'])

        # each role is counted independently, so one triple can bump several counters
        source_short = len(source_journal_ids) < 3
        target_short = len(target_journal_ids) < 3
        alt_short = len(alt_journal_ids) < 3
        n_not_enough_source += int(source_short)
        n_not_enough_target += int(target_short)
        n_not_enough_alt += int(alt_short)

        error = source_short or target_short or alt_short
        if error:
            n_skipped += 1
            continue
        for role_journal_ids in (source_journal_ids, target_journal_ids, alt_journal_ids):
            journal_oids.update(role_journal_ids)
logging.info(f"{len(journal_oids)} journal ids identified for training triples.")
logging.info(f"Skipped {n_skipped} triples with insufficient journals available. (source missing = {n_not_enough_source}; target missing = {n_not_enough_target}; alt missing = {n_not_enough_alt})")
required_journal_ids_filepath = os.path.join(config.model_data_dir, 'train_journal_oids.txt')
with open(required_journal_ids_filepath, 'w') as outfile:
    outfile.writelines(journal_oid + "\n" for journal_oid in journal_oids)
logging.info(f"Wrote {len(journal_oids)} journal ids to '{required_journal_ids_filepath}'.")

### Test RecContexts

Note: this takes about 40 minutes. It adds all required journal updates to the file.

Note that eval processes assume the availability of these texts.

In [None]:
# Identify every journal update needed to score the test/predict contexts and
# write the ids (one id per line) to test_journal_oids.txt.
test_md_list = [md for md in cbrec.utils.stream_metadata_list(config.metadata_filepath) if md['type'] in ('test', 'predict')]
test_md_map = {md['metadata_id']: md for md in test_md_list}

required_journal_ids = set()
n_invalid = 0  # users (source or candidate) with fewer than 3 journal updates before the interaction
n_error = 0  # test contexts that contained at least one such user
n_contexts = 0  # test contexts streamed from the feature db

db = cbrec.featuredb.get_db_by_filepath(config.feature_db_filepath)
with db:
    # `total` is a hard-coded count; it only sizes the tqdm progress bar
    for test_context in tqdm(cbrec.featuredb.stream_test_contexts(db, config), desc='Streaming test contexts', total=33592):
        n_contexts += 1
        test_context_md = test_md_map[test_context['metadata_id']]
        interaction_timestamp = int(test_context_md['timestamp'])
        # each row of a usp array is read as a (column 0, column 1) pair
        source_usp_arr = test_context['source_usp_arr']
        source_usps = [(source_usp_arr[i,0], source_usp_arr[i,1]) for i in range(source_usp_arr.shape[0])]
        candidate_usp_arr = test_context['candidate_usp_arr']
        candidate_usps = [(candidate_usp_arr[i,0], candidate_usp_arr[i,1]) for i in range(candidate_usp_arr.shape[0])]
        error = False
        for usp in source_usps + candidate_usps:
            journal_ids = fl.journal_id_lookup.get_journal_updates_before(usp, interaction_timestamp)
            if len(journal_ids) < 3:
                n_invalid += 1
                error = True
            else:
                required_journal_ids.update(journal_ids)
        if error:
            n_error += 1
# The counts refer to different things (journal ids / contexts / users), so name each one explicitly.
logging.info(
    f"Identified {len(required_journal_ids)} required journal ids from {n_contexts} test contexts; "
    f"{n_error} contexts had 1+ users with fewer than 3 journal updates ({n_invalid} such users in total)."
)

required_journal_ids_filepath = os.path.join(config.model_data_dir, 'test_journal_oids.txt')
with open(required_journal_ids_filepath, 'w') as outfile:
    for journal_oid in required_journal_ids:
        outfile.write(journal_oid + "\n")
logging.info(f"Wrote {len(required_journal_ids)} journal ids to '{required_journal_ids_filepath}'.")

## Create training data

In [None]:
import cbrec.feature_loader

In [None]:
fl = cbrec.feature_loader.FeatureLoader(config)

In [None]:
X, y_true, missing_journal_id_list = fl.get_pointwise_training_triples()
# 2021-09-30 11:43:20,364 - cbrec.feature_loader.FeatureLoader.get_input_arrs_from_triple_dicts - DEBUG - After processing 254776 triple dicts, identified 226 invalid (and an additional 4 invalid due to missing text features)
X.shape, y_true.shape, len(missing_journal_id_list)

In [None]:
X.shape, y_true.shape, len(missing_journal_id_list)

In [None]:
# save train features
feature_cache_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'torch_experiments', 'feature_cache')
with open(os.path.join(feature_cache_dir, 'X_train_raw.pkl'), 'wb') as outfile:
    pickle.dump(X, outfile, protocol=pickle.HIGHEST_PROTOCOL)
with open(os.path.join(feature_cache_dir, 'y_train_raw.pkl'), 'wb') as outfile:
    pickle.dump(y_true, outfile, protocol=pickle.HIGHEST_PROTOCOL)

## Load test data

In [None]:
test_md_list = [md for md in cbrec.utils.stream_metadata_list(config.metadata_filepath) if md['type'] == 'test']
len(test_md_list)

### Create test2train triples

In [None]:
# Select the test contexts that will be converted into training triples.
# Only entries with metadata_id >= first_metadata_id are read.
# TODO(review): the original comment here was cut off mid-sentence ("the first_metadata_id
# should be the name of the checkpoint that started generating the ...") - restore the rest.
# note for Sept 22: 734780 - 847406 should be the metadata_ids generated during the test period
first_metadata_id = 734780
test2train_md_list = []
timestamp = 0  # timestamp of the most recently kept entry
for md in test_md_list:
    # skip everything before the starting id, plus the id range [863252, 865053)
    if md['metadata_id'] < first_metadata_id or 863252 <= md['metadata_id'] < 865053:
        continue
    # timestamps going backwards while fewer than 1000 entries are held: start over
    went_backwards = md['timestamp'] < timestamp
    if went_backwards and len(test2train_md_list) < 1000:
        print(f"reset, dropping {len(test2train_md_list)} test contexts")
        test2train_md_list = []
    timestamp = md['timestamp']
    test2train_md_list.append(md)
len(test2train_md_list)

In [None]:
test2train_md_list[0].keys()

In [None]:
datetime.utcfromtimestamp(test2train_md_list[0]['timestamp'] / 1000).isoformat(), datetime.utcfromtimestamp(test2train_md_list[-1]['timestamp'] / 1000).isoformat()

In [None]:
# originally: 36307 triples, 223979 required journals
triple_dicts, required_journal_ids = fl.create_train_triples_from_test_contexts(test2train_md_list)
len(triple_dicts), len(required_journal_ids)

In [None]:
triple_dicts[0]

In [None]:
with open(os.path.join(config.model_data_dir, 'test2train_triple_dicts.pkl'), 'wb') as outfile:
    pickle.dump(triple_dicts, outfile, protocol=pickle.HIGHEST_PROTOCOL)
with open(os.path.join(config.model_data_dir, 'test2train_required_journal_oids.txt'), 'w') as outfile:
    for journal_oid in required_journal_ids:
        outfile.write(str(journal_oid) + "\n")

Can top up on the fly:

````
python cbrec/text/createTextFeatureSqlite.py --text-id-txt /home/lana/shared/caringbridge/data/projects/recsys-peer-match/model_data/test2train_required_journal_oids.txt --n-processes 3
````

OR

````
sbatch -p amdsmall make_text_features_test2train.sh
````

In [None]:
with open(os.path.join(config.model_data_dir, 'test2train_triple_dicts.pkl'), 'rb') as infile:
    triple_dicts = pickle.load(infile)
len(triple_dicts)

In [None]:
# Featurize the test2train triples and stack the per-triple arrays into one matrix.
feature_arrs, ys, missing_journal_id_list = fl.get_input_arrs_from_triple_dicts(triple_dicts)
y_true = np.array(ys)
# np.vstack rejects an empty sequence, so an empty result becomes a plain empty array instead
X = np.vstack(feature_arrs) if len(feature_arrs) > 0 else np.array(feature_arrs)
X.shape, y_true.shape, len(missing_journal_id_list)

In [None]:
# save train features
feature_cache_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'torch_experiments', 'feature_cache')
with open(os.path.join(feature_cache_dir, 'X_test2train_raw.pkl'), 'wb') as outfile:
    pickle.dump(X, outfile, protocol=pickle.HIGHEST_PROTOCOL)
with open(os.path.join(feature_cache_dir, 'y_test2train_raw.pkl'), 'wb') as outfile:
    pickle.dump(y_true, outfile, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# load train features
feature_cache_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'torch_experiments', 'feature_cache')
filenames = [
    ('X_train_raw.pkl', 'y_train_raw.pkl'),
    ('X_test2train_raw.pkl', 'y_test2train_raw.pkl'),
]
Xs = []
ys = []
for x_filename, y_filename in filenames:
    with open(os.path.join(feature_cache_dir, x_filename), 'rb') as infile:
        X = pickle.load(infile)
        Xs.append(X)
    with open(os.path.join(feature_cache_dir, y_filename), 'rb') as infile:
        y = pickle.load(infile)
        ys.append(y)


In [None]:
X = np.concatenate(Xs, axis=0)
y_true = np.concatenate(ys, axis=0)
X.shape, y_true.shape

In [None]:
# shuffle the data
inds = np.arange(len(X))
np.random.shuffle(inds)
X = X[inds]
y_true = y_true[inds]

In [None]:
import cbrec.modeling.modelconfig
import cbrec.modeling.scorer
import cbrec.modeling.manager

In [None]:
model_config = cbrec.modeling.modelconfig.ModelConfig()
model_config.train_n_epochs = 700
model_config.experiment_name = 'main'
model_config.train_weight_decay = 0.0001
model_config.LinearNet_dropout_p = 0.5
model_config

In [None]:
model_manager = cbrec.modeling.manager.ModelManager(model_config, config=config)
model_manager.model_config.output_basename

In [None]:
#model_manager.train_model(X, y_true)

In [None]:
model_manager.save_model()

In [None]:
model_manager.model_trainer.load_model_state_dict(description='e560')

In [None]:
model_manager = cbrec.modeling.manager.ModelManager.load_from_model_name('LinearNet', 'main')
model_manager.load_model(load_training_metrics=True)

In [None]:
model_manager.model_trainer.load_model_state_dict(description='e750')

In [None]:
# now that we've loaded the saved model data, we can plot the stored train/test metric curves
train_metrics, test_metrics = model_manager.model_trainer.get_train_metrics()

fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# after transposing, column 0 is plotted on the x axis and column 1 on the y axis
for curve_label, curve_metrics in (('Test', test_metrics), ('Train', train_metrics)):
    xs = curve_metrics.T[:,0]
    ys = curve_metrics.T[:,1]
    ax.plot(xs, ys, label=curve_label)

ax.legend()

plt.show()

In [None]:
# TODO get the interaction_timestamp of one of the metadata ids generated last week
# later edit: why?

In [None]:
# Keep the 'predict' metadata entries whose metadata_id lies in
# [first_metadata_id, last_metadata_id] (both ends inclusive).
first_metadata_id = 866854
last_metadata_id = 866933
md_list = []
for md in cbrec.utils.stream_metadata_list(config.metadata_filepath):
    if md['type'] == 'predict':
        metadata_id = md['metadata_id']
        if first_metadata_id <= metadata_id <= last_metadata_id:
            md_list.append(md)
len(md_list)

In [None]:
# get participant data
participant_id_filepath = os.path.join(git_root_dir, 'data/email/participant_ids.tsv')
participant_df = pd.read_csv(participant_id_filepath, sep='\t', header=0)
print(len(participant_df))
participant_df.head()

In [None]:
# identify unsubscribed people
email_address = 'unsubscribed@example.com'
participant_df[participant_df.real_email_address == email_address]

In [None]:
# remove any unsubscribed people
# NOTE(review): the ids in the exclusion list are all 0 - presumably placeholders
# standing in for real user ids; confirm before relying on this filter.
new_md_list = []
for md in md_list:
    if md['source_user_id'] not in [0, 0, 0, 0, 0, 0]:
        new_md_list.append(md)
md_list = new_md_list
len(md_list)

In [None]:
# restrict which sites are considered

In [None]:
# load the site data
s = datetime.now()
site_metadata_dir = "/home/lana/shared/caringbridge/data/derived/site_metadata"
site_metadata_filepath = os.path.join(site_metadata_dir, "site_metadata.feather")
site_df = pd.read_feather(site_metadata_filepath)
print(f"Read {len(site_df)} site_df rows in {datetime.now() - s}.")
site_df.head()

In [None]:
# load the journal metadata
s = datetime.now()
journal_metadata_dir = "/home/lana/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.feather")
journal_df = pd.read_feather(journal_metadata_filepath)
print(datetime.now() - s)
len(journal_df)

In [None]:
journal_df.published_at.notna().value_counts()

In [None]:
# Plot the number of journals created per day between 2021-05-01 and 2021-12-01 (UTC).
# Timestamps are handled as integer milliseconds since the epoch.
start_time = datetime(2021, 5, 1, tzinfo=pytz.UTC)
end_time = datetime(2021, 12, 1, tzinfo=pytz.UTC)
start_timestamp = int(start_time.timestamp() * 1000)
end_timestamp = int(end_time.timestamp() * 1000)
created_in_window = (journal_df.created_at >= start_timestamp) & (journal_df.created_at <= end_timestamp)
sdf = journal_df[created_in_window]

# one bin edge per day, plus a closing right-hand edge
bins = []
curr_time = start_time
while curr_time < end_time:
    bins.append(int(curr_time.timestamp() * 1000))
    curr_time += relativedelta(days=1)
bins.append(int(curr_time.timestamp() * 1000))
print(f'{len(bins)} bins from {start_time} to {end_time}')
print(f'(actual from {datetime.utcfromtimestamp(bins[0] / 1000)} to {datetime.utcfromtimestamp(bins[-1] / 1000)})')

fig, ax = plt.subplots(1, 1, figsize=(8, 2))

x = sdf.created_at
total_counts, bin_edges = np.histogram(x, bins=bins)
# plot each bin's count at the bin's left edge
ax.plot(bin_edges[:-1], total_counts, linestyle='-', linewidth=2)

ax.set_title(f"{len(sdf):,} journals")


def format_ms_tick_as_month_day(tick_value, tick_position):
    """Render a millisecond-timestamp tick value as a UTC 'MM-DD' label."""
    return datetime.utcfromtimestamp(tick_value / 1000).strftime('%m-%d')


ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(format_ms_tick_as_month_day))

plt.show()

In [None]:
most_recent_journal = journal_df.groupby('site_id').published_at.max()
len(most_recent_journal)

In [None]:
most_recent_journal.value_counts(dropna=False).head(3)

In [None]:

# Build eligible_site_ids: the set of site ids that may be recommended.
# we allow only sites that have had a journal update in the last 12 days (to account for the delay between generating and sending out)
recency_required_days = 12
# Cutoff in epoch milliseconds: the reference time minus recency_required_days.
# NOTE(review): strptime returns a naive datetime here, so .timestamp() interprets it in the
# machine's local timezone, unlike the explicitly UTC-aware datetimes used for the journal
# histogram in this notebook - confirm local time is intended.
required_recent_journal_timestamp = int(datetime.strptime('2021-11-24 08:55', '%Y-%m-%d %H:%M').timestamp() * 1000) - (recency_required_days * 1000 * 60 * 60 * 24)
print(required_recent_journal_timestamp)

# otherwise, we insist on low-privacy, searchable, non-spam sites
# The filters actually applied are: not deactivated, privacy == 'low', and isGoogleable /
# isSearchable equal to the string '1'.
# NOTE(review): no spam-related column is filtered in this expression - confirm whether
# "non-spam" is covered by one of these flags or handled elsewhere.
eligible_site_df = site_df[(~site_df.isDeactivated)&(site_df.privacy == 'low')&(site_df.isGoogleable == '1')&(site_df.isSearchable == '1')]
print(len(eligible_site_df))

# most_recent_journal is indexed by site_id, so the index of the filtered series is the set of
# sites whose latest published_at is at or after the cutoff.
sites_with_recent_updates = set(most_recent_journal[most_recent_journal >= required_recent_journal_timestamp].index)
eligible_site_df = eligible_site_df[eligible_site_df.site_id.isin(sites_with_recent_updates)]
print(len(eligible_site_df))

eligible_site_ids = set(eligible_site_df.site_id)
len(eligible_site_ids)

In [None]:
np.sum(most_recent_journal >= required_recent_journal_timestamp), len(sites_with_recent_updates)

In [None]:
fl.rec_input_matrix_cache = {}

In [None]:
print(len(md_list))
rc_list = fl.get_reccontexts_from_test_contexts(md_list, site_allowlist=eligible_site_ids)
len(rc_list)

In [None]:
predictions = []
for rc in rc_list:
    scorer = model_manager.score_reccontext(rc)
    predictions.append(scorer)
len(predictions)

In [None]:
# create a map of participant_id -> list of (site_id, score) tuples
# (site ids and scores are converted to plain Python int / float)
model_name = model_manager.model_config.model_name
site_scores_map = {}
for rc, scorer in zip(rc_list, predictions):
    participant_id = rc.source_user_id
    site_id_arr = scorer.site_id_arr
    y_score_site = scorer.scores_dict[model_name]
    site_score_pairs = []
    for site_id, score in zip(site_id_arr, y_score_site):
        site_score_pairs.append((int(site_id), float(score)))
    site_scores_map[participant_id] = site_score_pairs

In [None]:
predictions[0].scores_dict[model_manager.model_config.model_name].shape

In [None]:
site_id_arr = scorer.site_id_arr
y_score_site = scorer.scores_dict[model_manager.model_config.model_name]

sort_inds = np.argsort(y_score_site)

ranks = rankdata(-1 * y_score_site, method='max')

n=10
highest_scores = y_score_site[sort_inds[-n:]]
highest_score_site_ids = site_id_arr[sort_inds[-n:]]
for site_id, score in zip(highest_score_site_ids, highest_scores):
    print(f"{site_id:>10} {score:.4f}")

In [None]:
ranks

In [None]:
pred_arrs = []
for scorer in predictions:
    pred_arr = scorer.scores_dict[model_manager.model_config.model_name]
    pred_arrs.append(pred_arr)
len(pred_arrs)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(8, 8))

for pred_arr in pred_arrs:
    ys = sorted(list(pred_arr), reverse=True)
    xs = range(len(ys))
    ax.plot(xs, ys, linestyle='-', color='black', alpha=0.2)
    
ax.set_title(f"Distribution of predicted scores for ~{np.mean([len(pred_arr) for pred_arr in pred_arrs]):,.1f} sites")
ax.set_xlabel("Site rank")
ax.set_ylabel("Site score")
plt.show()

In [None]:
# Gather, for every site, the rank it received in each scored recommendation context.
site_rank_dict = {}
for scorer in predictions:
    site_id_arr = scorer.site_id_arr
    y_score_site = scorer.scores_dict[model_manager.model_config.model_name]
    sort_inds = np.argsort(y_score_site)
    # scores are negated so the highest score gets rank 1; method='max' gives tied
    # scores the largest rank of their group
    ranks = rankdata(-1 * y_score_site, method='max')
    for site_id, rank in zip(site_id_arr, ranks):
        site_rank_dict.setdefault(site_id, []).append(rank)


In [None]:
# enable fast look-ups of site features
site_index = site_df.set_index('site_id')
site_index.head(1)

In [None]:
# One summary row per site, laid out as:
# [site_id, name, n_ranks<=1, n_ranks<=5, n_ranks<=10, mean rank (string, 1 decimal), visits]
s = []
for site_id, ranks in site_rank_dict.items():
    ranks = np.array(ranks)
    summary_row = [
        site_id,
        site_index.at[site_id, 'name'],
        np.sum(ranks <= 1),
        np.sum(ranks <= 5),
        np.sum(ranks <= 10),
        f"{np.mean(ranks):.1f}",
        site_index.at[site_id, 'visits'],
    ]
    s.append(summary_row)
# order by the number of top-5 appearances (index 3), largest first
s.sort(key=lambda summary_row: summary_row[3], reverse=True)

In [None]:
s[:20]

In [None]:
# lowest-ranking are spam and relatively inactive users
sorted(s, key=lambda t: float(t[-2]), reverse=True)[:20]

In [None]:
# eligible sites with fewer than 1918 visits are in the bottom-25% by visit count
np.quantile([t[-1] for t in s], 0.25)

In [None]:
np.sum(np.array([t[-1] for t in s]) < 1000) / len(s)

In [None]:
np.sum(np.array([t[-1] for t in s]) < 10000) / len(s)

In [None]:
# verify that there are no ties
def get_ties(ranks, y_score_site):
    """Return how many scores share a rank with an earlier-counted score.

    The result is the number of scores minus the number of distinct ranks, so
    it is 0 exactly when every score received its own rank. (Comparing against
    ``len(ranks)`` instead would always give 0, because there is one rank per
    score whether or not ranks repeat.)

    Args:
        ranks: array-like of ranks, one per score; tied scores are expected to
            carry the same rank value.
        y_score_site: array-like of the scores that were ranked.

    Returns:
        int: number of surplus entries caused by repeated ranks.
    """
    #unique, counts = np.unique(ranks, return_counts=True)
    #return np.sum(counts > 1)
    return len(y_score_site) - len(np.unique(ranks))
# For every recommendation context, rank its scores and record what get_ties reports.
ties_list = []
for rc, scorer in zip(rc_list, predictions):
    # participant_id, site_id_arr and sort_inds are assigned but not used in this loop
    participant_id = rc.source_user_id
    site_id_arr = scorer.site_id_arr
    y_score_site = scorer.scores_dict[model_manager.model_config.model_name]
    sort_inds = np.argsort(y_score_site)
    # negated scores: the highest score gets rank 1; method='max' gives ties a shared rank
    ranks = rankdata(-1 * y_score_site, method='max')
    n_ties = get_ties(ranks, y_score_site)
    ties_list.append(n_ties)
ties_list = np.array(ties_list)
# (mean tie count per context, number of contexts with at least one tie)
np.mean(ties_list), np.sum(ties_list > 0)

In [None]:
# load in recommendations from previous rounds
# Reads participant_rec_data_b0 ... b9 (one JSON object per line), drops each record's
# 'site_scores' entry, and tags the record with the batch it came from.
participant_data_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant')

d = []
for batch_id in range(10):
    participant_data_filepath = os.path.join(participant_data_dir, f'participant_rec_data_b{batch_id}.ndjson')
    with open(participant_data_filepath, 'r') as infile:
        for line in infile:
            participant = json.loads(line)
            participant.pop('site_scores')  # no default: a missing key still raises KeyError
            participant['batch_id'] = batch_id
            d.append(participant)

batch_df = pd.DataFrame(d)

# participant_id -> de-duplicated list of every site id recommended to them across batches
participant_recced_site_map = {}
for participant_id, group in batch_df.groupby('participant_id'):
    recced_site_ids = [
        site['site_id']
        for sse_site_list in group.sse_site_list
        for site in sse_site_list
    ]
    unique_site_ids = set(recced_site_ids)
    assert len(recced_site_ids) == len(unique_site_ids), "Duplicate rec was given."
    recced_site_ids = list(unique_site_ids)
    participant_recced_site_map[participant_id] = recced_site_ids
len(participant_recced_site_map)

In [None]:
n_recced_sites = [len(v) for v in participant_recced_site_map.values()]

fig, ax = plt.subplots(1, 1, figsize=(5,5))

ax.hist(n_recced_sites)

plt.show()

In [None]:
# every site id that appears in any previous batch's recommendation list
previously_recced_site_ids = {
    site['site_id']
    for row in batch_df.itertuples()
    for site in row.sse_site_list
}
len(previously_recced_site_ids)

In [None]:
# approach for building rec lists: use greedy algorithm
# "draft" sites, with a maximum of 10 top-5 appearances for any site with < 1918 visits
# to do this, just choose a random traversal order over the participants

In [None]:
# Greedy "draft": in repeated rounds, participants are visited in a random order and each
# unfinished participant takes one pick per round - their highest-scored site that is not
# already theirs, not previously recommended to them, not manually disallowed, and not out
# of budget - until everyone holds n_picks sites.
import random  # not used in this cell (randomness comes from rng below)
from collections import defaultdict
participant_site_dict = {}  # map of participant_id -> list of site_ids
n_picks = 5
rng = np.random.default_rng(13)  # fixed seed, so the traversal orders are reproducible

# Remaining pick budget per site: every ranked site starts at 10.
# The trailing comment shows a visits-based condition that is NOT applied here.
restricted_site_count = {site_id: 10
    for site_id in site_rank_dict.keys()
}  # if site_index.at[site_id, 'visits'] < 100000
# NOTE(review): the ids below are all 0 - presumably placeholders for real site ids; verify.
disallowed_sites = [
    #0,  # sept17th, deleted journal update
    0,  # oct1, health misinformation https://www.caringbridge.org/visit/prayforkainoa/journal
    0,  # oct8, weird self-promotion, possibly mirroring posts from LinkedIn (markkageyama)
]
participant_incurred_loss = defaultdict(float)  # how much did participants "give up" by not getting their top picks?

n_missed_picks = 0  # candidates skipped because the site's budget was used up
n_previous_rec_attempts = 0  # candidates skipped because they were recommended in an earlier batch
n_disallowed_picks = 0  # candidates skipped because they are in disallowed_sites

# NOTE(review): finished_drafting holds participant ids but is compared against
# len(predictions); if two entries of rc_list share a source_user_id this loop could never
# terminate - confirm the ids are unique.
finished_drafting = set()
while len(finished_drafting) < len(predictions):
    # one round: a fresh random traversal order over all participants
    inds = np.arange(len(predictions))
    inds = rng.permutation(inds)
    for ind in inds:
        rc = rc_list[ind]
        scorer = predictions[ind]
        participant_id = rc.source_user_id
        if participant_id not in participant_site_dict:
            participant_site_dict[participant_id] = []
        participant_sites = participant_site_dict[participant_id]
        # a participant is only marked finished on the round after their last pick
        if len(participant_sites) >= n_picks:
            finished_drafting.add(participant_id)
            continue
        slot_to_fill = len(participant_sites) # e.g. 0 if the first pick, 4 if the 5th pick
    
        site_id_arr = scorer.site_id_arr
        y_score_site = scorer.scores_dict[model_manager.model_config.model_name]

        # argsort is ascending, so sort_inds[-(k) - 1] is the index of the (k+1)-th highest score
        sort_inds = np.argsort(y_score_site)
        pick_made = False
        max_score = 0
        # Walk down the participant's ranking until an acceptable site is found.
        # NOTE(review): slot_to_fill is never bounds-checked; if every candidate is rejected
        # the indexing below raises IndexError.
        while not pick_made:
            preferred_site = site_id_arr[sort_inds[-(slot_to_fill) - 1]]
            if preferred_site in participant_sites:  # attempted duplicate pick
                slot_to_fill += 1
                continue
                
            # check if site has previously been recced
            if participant_id in participant_recced_site_map and preferred_site in participant_recced_site_map[participant_id]:
                slot_to_fill += 1
                n_previous_rec_attempts += 1
                continue
            # check if site is manually disallowed
            if preferred_site in disallowed_sites:
                slot_to_fill += 1
                n_disallowed_picks += 1
                continue
            # check if preferred_site is available
            if preferred_site in restricted_site_count:
                if restricted_site_count[preferred_site] <= 0:
                    # budget exhausted: the count keeps decreasing, so values below zero
                    # record rejected attempts for this site
                    restricted_site_count[preferred_site] -= 1
                    n_missed_picks += 1
                    slot_to_fill += 1
                    # NOTE(review): slot_to_fill was already advanced, so this reads the score
                    # of the NEXT candidate rather than the site just rejected - confirm intent.
                    max_score = max(max_score, y_score_site[sort_inds[-(slot_to_fill) - 1]])
                else:
                    restricted_site_count[preferred_site] -= 1
                    pick_made = True
            else:
                pick_made = True
        # loss is only recorded when a positive score was captured after a budget rejection
        if max_score > 0:
            incurred_loss = max_score - y_score_site[sort_inds[-(slot_to_fill) - 1]]
            participant_incurred_loss[participant_id] += incurred_loss
        participant_sites.append(preferred_site)
n_missed_picks, n_disallowed_picks, n_previous_rec_attempts

In [None]:
# distribution of number of times picked for the "restricted distribution" sites
# 10 sites were picked the maximum number of times (and thus could trigger conflicts)
# note this doesn't show number of ATTEMPTED picks, which could be higher (and thus the number of sites with 11+ attempted picks could be lower than the number below)
pd.Series(data=[10 - v for k, v in restricted_site_count.items() if v < 10], index=[k for k, v in restricted_site_count.items() if v < 10]).value_counts()

In [None]:
pd.Series(data=[v for k, v in restricted_site_count.items() if v < 10], index=[k for k, v in restricted_site_count.items() if v < 10]).sort_values().head(20)

In [None]:
# how much are participants "giving up" due to the duplication restriction?
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

ax.hist([v for v in participant_incurred_loss.values()], bins=len(participant_incurred_loss), log=False)

plt.show()
np.max(list(participant_incurred_loss.values()))

In [None]:
# Count how many participants were assigned each site.
site_counts = defaultdict(int)
for site_ids in participant_site_dict.values():
    for site_id in site_ids[:5]:  # include all sites that appear in the top 5
        site_counts[site_id] += 1
# (site_id, count) pairs, most-assigned first
site_counts = sorted(site_counts.items(), key=lambda pair: pair[1], reverse=True)
site_counts[:5]

In [None]:
for site_id, count in site_counts:
    site_name = site_index.at[site_id, 'name']
    print(f"{count:>3} {site_id:>9}{'*' if site_id in previously_recced_site_ids else ' '} {site_index.at[site_id, 'visits']:>7} https://www.caringbridge.org/visit/{site_name}/journal")

In [None]:
# reload the participant_df
# get participant data
participant_id_filepath = os.path.join(git_root_dir, 'data/email/participant_ids.tsv')
participant_df = pd.read_csv(participant_id_filepath, sep='\t', header=0)
print(len(participant_df))
participant_df.head()

In [None]:
# verify that the participant data contains entries for every person we are generating recs for
assert len(set(participant_site_dict.keys()) - set(participant_df.user_id)) == 0

In [None]:
# set the batch_id
batch_id = 10

In [None]:
# get customized survey links

survey_link_filepath = os.path.join(git_root_dir, f'data/survey/CaringBridge_Author_Recommendations_Feedback__b{batch_id}-Distribution_History.csv')
survey_link_df = pd.read_csv(survey_link_filepath)
print(len(survey_link_df))
survey_link_df.head()

In [None]:
# merge feedback_survey_link into the participant_df
survey_link_df['real_email_address'] = survey_link_df.Email.map(lambda e: e.strip().lower())
survey_link_df['feedback_survey_link'] = survey_link_df.Link
print(len(participant_df))
participant_df = participant_df.merge(survey_link_df[['real_email_address', 'feedback_survey_link']], how='left', on='real_email_address', validate='one_to_one')
print(len(participant_df))
participant_df.sample(n=3)

In [None]:
participant_index = participant_df.set_index('user_id')
participant_index.head(1)

In [None]:
import cbsend.compose

In [None]:
import cbsend.templates

In [None]:
from cbrec.text import textdb

def truncate_body(body, max_length=175):
    """Shorten `body` to a preview of at most `max_length` characters.

    Args:
        body: the text to shorten.
        max_length: maximum number of characters kept from `body`
            (defaults to 175, the limit previously hard-coded here).

    Returns:
        `body` unchanged when it is no longer than `max_length`; otherwise the
        first `max_length` characters followed by " ...".
    """
    if len(body) > max_length:
        return body[:max_length] + " ..."
    return body

# Build one recommendation email per participant, plus a JSON-serializable record of
# exactly what went into it (participant_data_list), for later logging.
generated_messages = []
participant_data_list = []

# Loop invariants: neither the eligible-journal filter nor the patterns depend on the site.
eligible_recent_journal = most_recent_journal[most_recent_journal >= required_recent_journal_timestamp]
# a URL followed by whitespace OR by the end of the text, so a link that ends the body is caught too
link_pattern = re.compile(r'https?:\/\/\S*(?:\s|$)')
# short dotted tokens (e.g. "abc.xyz") that a mail client might auto-link
maybe_link_pattern = re.compile(r'(^|\s|\/)[^\.\s][^\.\s]?[^\.\s]?\.[^\.\s][^\s]*')

td = textdb.TextDatabase(config)
text_db = td.get_text_db()
try:
    for participant_id, site_ids in tqdm(participant_site_dict.items(), total=len(participant_site_dict)):
        email_address = participant_index.at[participant_id, 'real_email_address']
        first_name = participant_index.at[participant_id, 'first_name']
        feedback_survey_link = participant_index.at[participant_id, 'feedback_survey_link']
        rec_list = []

        for site_id in site_ids[:5]:  # include all sites that appear in the top 5
            site_name = site_index.at[site_id, 'name']
            site_title = site_index.at[site_id, 'title']
            # timestamp of the site's most recent eligible journal, then the journal published at that time
            recent_update = int(eligible_recent_journal[site_id])
            journal_oid = journal_df[(journal_df.site_id == site_id)&(journal_df.published_at == recent_update)].journal_oid.iloc[0]
            raw_title, raw_body = td.get_raw_journal_text_from_db(text_db, journal_oid)
            title = cbrec.text.textdb.clean_text(raw_title)
            # add a space after each closing div so adjacent blocks do not run together once cleaned
            body = cbrec.text.textdb.clean_text(raw_body.replace("</div>", "</div> "))

            body = body.replace("\n", " ").replace("\t", " ").replace("\r", " ")
            body = truncate_body(body)
            # Evaluated per site: previously these flags were set once per participant and never
            # cleared, so one empty body disabled link scrubbing for every later site.
            empty_title = title.strip() == ""
            empty_body = body.strip() == ""

            # identify links and replace them with "[link] "
            link_replaced = False
            if not empty_body:
                # search for links
                delinked_body = link_pattern.sub('[link] ', body)
                if delinked_body != body:
                    print(f"Removed link from '{body}', new text '{delinked_body}'")
                    body = delinked_body
                    link_replaced = True
                # look for things that google might think are links...
                if maybe_link_pattern.search(body):
                    print(f"Suspicious maybe-link: '{body}'")
            if link_replaced:
                # TODO search for a new body eligible body text?
                pass

            rec = {
                'site_id': int(site_id),  # convert from int64
                'site_name': site_name,
                'site_title': site_title,
                'journal_oid': journal_oid,
                'journal_timestamp': recent_update,
                'journal_body': body,
                'journal_title': title,
            }
            rec_list.append(rec)

        #email_address = 'zwlevonian@gmail.com'  # override recipient email during testing
        msg = cbsend.compose.create_email(participant_id, batch_id, email_address, first_name, feedback_survey_link, rec_list)
        generated_messages.append((email_address, msg))

        d = {
            'participant_id': participant_id,
            'real_email_address': email_address,
            'first_name': first_name,
            'feedback_survey_link': feedback_survey_link,
            'site_scores': site_scores_map[participant_id],
            'sse_site_list': rec_list,
            'sse_sent_timestamp': -1,  # placeholder; overwritten once the email has been sent
        }
        participant_data_list.append(d)
finally:
    # always release the text database handle, even if generation fails part-way
    text_db.close()
len(generated_messages)

In [None]:
email_address, msg = generated_messages[4]
cbsend.compose.send_email("zwlevonian@gmail.com", msg)

In [None]:
# check: is this what you expect it to be?
batch_id

In [None]:
emails_sent = set()

In [None]:
# Send every generated message, recording a millisecond epoch timestamp per address.
email_address_send_time = {}
for email_address, msg in tqdm(generated_messages, desc='Sending emails'):
    # guard against double-sending: emails_sent is created in its own cell, so it survives re-runs of this one
    if email_address in emails_sent:
        raise ValueError(email_address)
    result = cbsend.compose.send_email(email_address, msg)
    # the send time is recorded even when `result` is falsy, so a difference between the
    # two counts displayed below points at sends that did not report success
    email_address_send_time[email_address] = int(datetime.now().timestamp() * 1000)
    if result:
        emails_sent.add(email_address)
len(emails_sent), len(email_address_send_time)

In [None]:
# can manually verify that all emails were sent by looking in Sent folder
email_address, msg = generated_messages[36]
email_address

In [None]:
# if one wasn't sent, attempt resending manually
email_address, msg = generated_messages[36]
cbsend.compose.send_email(email_address, msg)
email_address_send_time[email_address] = int(datetime.now().timestamp() * 1000)

In [None]:
# add sent times to the participant_data_list
for participant_data in participant_data_list:
    participant_id = participant_data['participant_id']
    email_address = participant_data['real_email_address']
    participant_data['sse_sent_timestamp'] = email_address_send_time[email_address]

In [None]:
batch_id

In [None]:
participant_data_list[0].keys()

In [None]:
# Persist this batch's participant records as newline-delimited JSON (one record per line).
participant_data_filepath = os.path.join(
    cbcore.data.paths.projects_data_dir,
    'recsys-peer-match',
    'participant',
    f'participant_rec_data_b{batch_id}.ndjson',
)
with open(participant_data_filepath, 'w') as outfile:
    outfile.writelines(json.dumps(record) + "\n" for record in participant_data_list)
print(f"Finished writing {participant_data_filepath}.")

In [None]:
# note: we accidentally used batch_id 3 for batch 4...

In [None]:
# first batch: hard-coded
# The hard-coded string is taken to be US/Central wall-clock time (NOTE(review): confirm).
# localize() attaches that zone directly; astimezone() on a naive datetime would first
# interpret the string in whatever time zone the machine running the notebook uses.
email_send_date = pytz.timezone('US/Central').localize(datetime.strptime("2021-09-02 14:57:26", "%Y-%m-%d %H:%M:%S"))
email_send_timestamp = int(email_send_date.timestamp() * 1000)  # milliseconds since the epoch
email_send_date.isoformat()

In [None]:
# tmp
# save everything we can for the future
# this was replaced with the code above...

What do we want to save about a batch?

 - For each participant:
   - All site ids and their scores
   - The sites included in the email, in the order they were shown
     - The journal update (title and body text) included in the email for each site

In [None]:
# Map each participant id to the (site_id, score) pairs taken from their prediction,
# converted to plain int/float values.
site_scores_map = {}
for prediction in predictions:
    rec_context, site_scorer = prediction[0], prediction[1]
    model_scores = site_scorer.scores_dict['PointwiseLinearTorchModel']
    site_scores_map[rec_context.source_user_id] = [
        (int(scored_site_id), float(site_score))
        for scored_site_id, site_score in zip(site_scorer.site_id_arr, model_scores)
    ]

In [None]:
sent_rec_emails_filepath = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant', 'sent_rec_emails.tsv')
email_send_timestamps = pd.read_csv(sent_rec_emails_filepath, sep='\t', header=None, names=['email_send_timestamp', 'email_address']).set_index('email_address').email_send_timestamp
email_send_timestamps.head()

In [None]:
# save the data that was generated for each participant
# Writes one JSON object per participant (newline-delimited) describing the sites and journal
# text selected for them, along with the recorded send time of their email.
td = textdb.TextDatabase(config)
text_db = td.get_text_db()
try:
    participant_data_filepath = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant', 'participant_rec_data.ndjson')
    with open(participant_data_filepath, 'w') as outfile:
        for participant_id, site_ids in tqdm(participant_site_dict.items(), total=len(participant_site_dict)):
            email_address = participant_index.at[participant_id, 'real_email_address']
            first_name = participant_index.at[participant_id, 'first_name']
            feedback_survey_link = participant_index.at[participant_id, 'feedback_survey_link']

            rec_list = []
            for site_id in site_ids[:5]:  # include all sites that appear in the top 5
                site_name = site_index.at[site_id, 'name']
                site_title = site_index.at[site_id, 'title']
                # timestamp of the site's most recent eligible journal, then the journal published at that time
                recent_update = int(most_recent_journal[most_recent_journal >= required_recent_journal_timestamp][site_id])
                journal_oid = journal_df[(journal_df.site_id == site_id)&(journal_df.published_at == recent_update)].journal_oid.iloc[0]

                raw_title, raw_body = td.get_raw_journal_text_from_db(text_db, journal_oid)
                title = cbrec.text.textdb.clean_text(raw_title)
                body = cbrec.text.textdb.clean_text(raw_body.replace("</div>", "</div> "))
                body = body.replace("\n", " ").replace("\t", " ")
                body = truncate_body(body)

                rec = {
                    # convert from int64: json.dumps cannot serialize numpy integers
                    'site_id': int(site_id),
                    'site_name': site_name,
                    'site_title': site_title,
                    'journal_oid': journal_oid,
                    'journal_timestamp': recent_update,
                    'cleaned_journal_body': body,
                    'cleaned_journal_title': title,
                }
                rec_list.append(rec)
            d = {
                'participant_id': participant_id,
                'real_email_address': email_address,
                'first_name': first_name,
                'feedback_survey_link': feedback_survey_link,
                'site_scores': site_scores_map[participant_id],
                'sse_site_list': rec_list,
                'sse_sent_timestamp': int(email_send_timestamps.at[email_address]),
            }
            outfile.write(json.dumps(d) + "\n")
finally:
    # always release the text database handle, even if writing fails part-way
    text_db.close()


### Original training implementation

This section may still contain some useful snippets, but it is generally defunct.

In [None]:
test_md_list = cbrec.utils.get_test_metadata(md_list)
torch_model.test_model(test_md_list)

In [None]:
def get_triples():
    """Load every training triple as two labelled feature vectors.

    Each streamed row contributes a positive example (source + target +
    source-target features, label 1) followed by a negative example
    (source + alt + source-alt features, label 0).

    Returns:
        (arrs, ys): a list of concatenated feature arrays and the parallel
        list of 0/1 labels.
    """
    feature_db = cbrec.featuredb.get_db_by_filepath(config.feature_db_filepath)
    feature_rows = []
    labels = []
    try:
        for triple in cbrec.featuredb.stream_triples(feature_db):
            positive = np.concatenate([
                triple['source_feature_arr'],
                triple['target_feature_arr'],
                triple['source_target_feature_arr'],
            ])
            negative = np.concatenate([
                triple['source_feature_arr'],
                triple['alt_feature_arr'],
                triple['source_alt_feature_arr'],
            ])
            feature_rows.extend((positive, negative))
            labels.extend((1, 0))
    finally:
        # close the database handle whether or not streaming succeeded
        feature_db.close()
    return feature_rows, labels
        
feature_arrs, ys = get_triples()
len(ys)

In [None]:
X = np.vstack(feature_arrs)
y_true = np.array(ys)
X.shape, y_true.shape

In [None]:
# what numbers evenly divide out the total?
# exact integer test; comparing float quotients with a tolerance can mis-report divisors for very large totals
n_total = len(y_true)
for i in range(2, n_total + 1):
    if n_total % i == 0:
        print(i, n_total / i)

In [None]:
# how much memory is being used?
import resource
kbytes = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
f"{kbytes / 1024 / 1024:.2f}GB"

In [None]:

class LinearNet(nn.Module):
    """Feed-forward network with two hidden ReLU layers and a single logit output.

    Args:
        n_hidden: width of both hidden layers.
        dropout_p: dropout probability applied after each hidden layer.
        n_input: number of input features per row. Defaults to 27, the width
            that was previously hard-coded here.
    """
    def __init__(self, n_hidden, dropout_p=0.2, n_input=27):
        super(LinearNet, self).__init__()
        self.fc1 = nn.Linear(n_input, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, 1, bias=False)
        self.dropout1 = nn.Dropout(p=dropout_p)
        self.dropout2 = nn.Dropout(p=dropout_p)

    def forward(self, x):
        """Return one raw logit per input row (no sigmoid applied)."""
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)  # note: not using F.sigmoid here, as the loss used includes the Sigmoid transformation
        return x


def train_pytorch_model(X_train, y_train, X_test, y_test, *, n_hidden=100, n_epochs=100,
                        lr_init=0.01, max_lr=0.1, dropout_p=0.1, minibatch_size=None, verbose=True):
    """Train a LinearNet as a pointwise binary classifier and report test accuracy.

    Args:
        X_train: numpy array of training features, one row per example.
        y_train: 1-D numpy array of 0/1 training labels.
        X_test: numpy array of test features.
        y_test: 1-D numpy array of 0/1 test labels.
        n_hidden: hidden layer width passed to LinearNet.
        n_epochs: maximum number of passes over the training data.
        lr_init: learning rate handed to Adam (the OneCycleLR schedule then drives the rate).
        max_lr: peak learning rate of the OneCycleLR schedule.
        dropout_p: dropout probability passed to LinearNet.
        minibatch_size: examples per optimizer step; None (the default) uses the
            whole training set as a single batch.
        verbose: print tensor sizes and periodic loss/accuracy lines.

    Returns:
        The trained LinearNet, left in eval mode.
    """
    n_train = len(y_train)

    if minibatch_size is None:
        minibatch_size = n_train
    minibatch_size = min(n_train, minibatch_size)  # if minibatch_size is larger than n_train, force it to n_train
    n_minibatches = int(np.ceil(n_train / minibatch_size))

    # size the input layer from the data instead of assuming a fixed feature count
    net = LinearNet(n_hidden, dropout_p, n_input=X_train.shape[-1])

    #optimizer = optim.SGD(net.parameters(), lr=lr_init, momentum=0.9)
    optimizer = optim.Adam(net.parameters(), lr=lr_init)
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=max_lr,
        steps_per_epoch=n_minibatches,
        epochs=n_epochs,
    )

    criterion = nn.BCEWithLogitsLoss()  # pointwise loss function

    # cast to float32 so float64 inputs (numpy's default) match the network's weights
    X_train_tensor = torch.from_numpy(X_train).float()
    y_train_tensor = torch.from_numpy(y_train)
    y_train_tensor = y_train_tensor.view(-1, 1)  # make labels 2-dimensional
    y_train_tensor = y_train_tensor.type_as(X_train_tensor)
    if verbose:
        print(X_train_tensor.size(), y_train_tensor.size())

    net.train()
    for epoch in range(n_epochs):
        s = datetime.now()

        # shuffle the training data
        # I am not sure if this matters at all
        epoch_order = torch.randperm(n_train)

        mb_metrics = []  # store the minibatch_metrics, then average after
        for minibatch in range(n_minibatches):
            minibatch_start = minibatch * minibatch_size
            minibatch_end = min(minibatch_start + minibatch_size, n_train)
            if verbose and epoch == 0:
                print(f"    Minibatch for inds in {minibatch_start} - {minibatch_end}.")
            minibatch_inds = epoch_order[minibatch_start:minibatch_end]

            inputs = X_train_tensor[minibatch_inds]
            labels = y_train_tensor[minibatch_inds]

            # reset gradients before every step so they never accumulate across minibatches
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            # compute and log the loss
            y_train_pred = torch.sigmoid(outputs.detach()).view((-1,)).numpy()
            y_train_pred = (y_train_pred >= 0.5).astype(int)  # binarize predictions with a 0.5 decision boundary
            y_train_minibatch = y_train[minibatch_inds.numpy()]
            acc = np.sum(y_train_pred == y_train_minibatch) / len(y_train_minibatch)

            mb_metrics.append((loss.item(), acc))
        epoch_loss, epoch_acc = np.mean(np.array(mb_metrics), axis=0)

        should_stop_early = epoch_loss < 0.001
        if verbose and (epoch < 5 or epoch == n_epochs - 1 or epoch % 10 == 0 or should_stop_early):
            print(f"{epoch:>3} ({datetime.now() - s}): loss={epoch_loss:.4f} accuracy={epoch_acc*100:.2f}% LR={optimizer.param_groups[0]['lr']:.2E}")
        if should_stop_early:
            break
    final_train_loss = epoch_loss
    final_epoch_count = epoch + 1
    if verbose:
        print(f"Completed {final_epoch_count} epochs with a final train loss of {final_train_loss:.4f}.")

    net.eval()
    if len(y_test) > 0:  # nothing to evaluate (and a division by zero) on an empty test set
        with torch.no_grad():
            X_test_tensor = torch.from_numpy(X_test).float()
            outputs = net(X_test_tensor)
            y_test_pred = torch.sigmoid(outputs.detach()).view((-1,)).numpy()
            y_test_pred = (y_test_pred >= 0.5).astype(int)
            acc = np.sum(y_test_pred == y_test) / len(y_test)
            print(f"Test acc: {acc*100:.2f}%")
    return net



In [None]:
X_full = X

In [None]:
# standardize the feature matrix (X_full keeps a reference to the unscaled version)
# NOTE(review): the scaler is fit on every row, including the rows that the later split assigns to X_test
X = sklearn.preprocessing.StandardScaler().fit_transform(X)

In [None]:
n_train = int(np.ceil(len(y_true) * 0.99))
X_train = X[:n_train,:]
X_test = X[n_train:,:]
y_train = y_true[:n_train]
y_test = y_true[n_train:]
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
y_test_pred = train_pytorch_model(X_train, y_train, X_test, y_test)

In [None]:
# Synthetic sanity-check data with the same shapes as the real data: standard-normal noise
# where every other row is labelled 1 and has its first feature shifted by +3.
X_synth = np.random.normal(size=X.shape).astype(np.float32)
y_true_synth = np.zeros(y_true.shape).astype(int)
y_true_synth[::2] = 1
print(np.sum(y_true_synth) / len(y_true_synth))
is_positive = y_true_synth == 1
X_synth[is_positive, 0] += 3

# reuse the train/test boundary computed for the real data
X_train_synth, X_test_synth = X_synth[:n_train, :], X_synth[n_train:, :]
y_train_synth, y_test_synth = y_true_synth[:n_train], y_true_synth[n_train:]
X_train_synth.shape, X_test_synth.shape, y_train_synth.shape, y_test_synth.shape

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(7,7))
ax.hexbin(X_synth[:,0], X_synth[:,1])
plt.show()

In [None]:
y_test_pred = train_pytorch_model(X_train_synth, y_train_synth, X_test_synth, y_test_synth)

In [None]:
test_md_list = cbrec.utils.get_test_metadata(md_list)
len(test_md_list)

In [None]:
tdf = df[df.type == 'test']
len(tdf)

In [None]:
def get_test_contexts(config, test_md_list, clf):
    """Score every test context with `clf` and store the metrics on its metadata.

    For each entry of `test_md_list`, the test context is loaded from the
    feature database, wrapped in a RecContext, and scored through a
    SklearnModelScorer; the resulting metric dict is written to
    md['baseline_metrics']['PointwiseLogreg'] in place. Returns None.
    """
    model_name = "PointwiseLogreg"
    feature_db = cbrec.featuredb.get_db_by_filepath(config.feature_db_filepath)

    try:
        for test_md in test_md_list:
            loaded_context = cbrec.featuredb.get_test_context_by_metadata_id(
                feature_db, test_md['metadata_id'], config
            )
            rec_context = cbrec.reccontext.RecContext.create_from_test_context(config, test_md, loaded_context)
            model_scorer = cbrec.evaluation.SklearnModelScorer(config, rec_context, clf, model_name)
            test_md['baseline_metrics'][model_name] = model_scorer.score_proba()
    finally:
        # close the database handle whether or not scoring succeeded
        feature_db.close()
        
get_test_contexts(config, test_md_list, clf)

In [None]:
# Build one DataFrame of per-test-context metrics for each model named in baseline_metrics.
models = test_md_list[0]['baseline_metrics'].keys()
print(models)
model_df_dict = {}
for model in tqdm(models):
    # copy each metrics dict before adding the metadata columns, so the dicts stored in
    # test_md_list are not modified as a side effect of building the table
    metrics_list = [
        {
            **md['baseline_metrics'][model],
            'metadata_id': md['metadata_id'],
            'source_user_initiated_in_train_period': md['source_user_initiated_in_train_period'],
            'target_site_initiated_with_in_train_period': md['target_site_initiated_with_in_train_period'],
        }
        for md in test_md_list
    ]
    mdf = pd.DataFrame(metrics_list)
    mdf['reciprocal_rank'] = 1 / mdf.target_rank
    #mdf['reciprocal_rank_10'] = 1 / mdf.target_rank
    model_df_dict[model] = mdf
    print(model, len(mdf))
len(model_df_dict)

In [None]:
# Summarize each model as one row: its name, the share of targets at rank 1 and at rank <= 5,
# and the mean of each selected metric column.
scores = []
for model in models:
    mdf = model_df_dict[model][['target_raw_score', 'target_rank', 'reciprocal_rank', 'ndcg_1', 'ndcg_5', 'ndcg_10', 'ndcg_50']]
    means = mdf.mean()
    # each concat prepends an entry, so the final order is: model, % rank 1, % <= rank 5, then the column means
    means = pd.concat([pd.Series([np.sum(mdf.target_rank <= 5) / len(mdf),], index=['% <= rank 5',]), means])
    means = pd.concat([pd.Series([np.sum(mdf.target_rank <= 1) / len(mdf),], index=['% rank 1',]), means])
    means = pd.concat([pd.Series([model,], index=['model',]), means])
    scores.append(means)
# rename the averaged columns and sort ascending by mean rank
score_df = pd.DataFrame(scores).rename(columns={'target_rank': 'mean_rank', 'reciprocal_rank': 'mrr', 'target_raw_score': 'mean_raw_score'}).sort_values(by='mean_rank')
score_df

In [None]:
mdf = model_df_dict['PointwiseLinearTorchModel']
print(len(mdf))
mdf.head()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(7, 7))

ax.hist(mdf.acc, bins=np.linspace(0, 1, 20), log=True)
ax.axvline(np.mean(mdf.acc), color='black', linestyle='--', alpha=0.8, label=f'Mean ({np.mean(mdf.acc):.2f})')
ax.legend()

plt.show()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(7, 7))

ax.set_title("Distribution of predictions for targets")
ax.hist(mdf.target_raw_score, bins=np.linspace(0, 1, 20), log=True)
ax.axvline(np.mean(mdf.target_raw_score), color='black', linestyle='--', alpha=0.8, label=f'Mean ({np.mean(mdf.target_raw_score):.2f})')
ax.legend()

plt.show()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(7, 7))

ax.set_title("Distribution of predictions for targets")
for source_in_train in [False, True]:
    for target_in_train in [False, True]:
        sdf = mdf[(mdf.source_user_initiated_in_train_period == source_in_train)&(mdf.target_site_initiated_with_in_train_period == target_in_train)]
        ax.hist(sdf.target_raw_score, bins=np.linspace(0, 1, 20), log=True, alpha=0.5)
        ax.axvline(np.mean(sdf.target_raw_score), color='black', linestyle='--', alpha=0.8, label=f'Mean (n={len(sdf)},s={source_in_train},t={target_in_train}) ({np.mean(sdf.target_raw_score):.2f})')
ax.legend()

plt.show()

In [None]:
# Overlaid log-scale histograms of target rank, one per combination of the two
# "initiated in train period" flags, each with a dashed line at its mean rank.
fig, ax = plt.subplots(1, 1, figsize=(7, 7))

ax.set_title("Distribution of target ranks")
bins = np.linspace(0, 5000, 20)
for source_in_train in [False, True]:
    for target_in_train in [False, True]:
        sdf = mdf[(mdf.source_user_initiated_in_train_period == source_in_train)&(mdf.target_site_initiated_with_in_train_period == target_in_train)]
        _, _, patches = ax.hist(sdf.target_rank, bins=bins, log=True, alpha=0.5)
        # draw the mean line in the histogram's own color, read through the public accessor
        # rather than the private _facecolor attribute
        ax.axvline(np.mean(sdf.target_rank), color=patches[0].get_facecolor(), linestyle='--', alpha=0.8, label=f'Mean Rank = {np.mean(sdf.target_rank):.1f} (n={len(sdf)},s={source_in_train},t={target_in_train})')
ax.legend()

plt.show()

In [None]:
pd.crosstab(mdf.source_user_initiated_in_train_period, mdf.target_site_initiated_with_in_train_period, margins=True)

In [None]:
# mean target rank for each combination of the two "initiated in train period" flags
# (the string 'mean' replaces np.mean, which recent pandas versions warn about as an aggfunc)
pd.crosstab(mdf.source_user_initiated_in_train_period, mdf.target_site_initiated_with_in_train_period, margins=True, values=mdf.target_rank, aggfunc='mean')