Zach's Torch Experiments Demo
===

Notes from Sept 28 and Oct 1, 2021 meetings about using PyTorch and loading feature data.

What we did:
 - Installed the pytorch-cpuonly kernel so that we could use it with Jupyter.
 - Imported some stuff from the notebook/eval/PytorchTraining notebook
 - Loaded feature data from a directory
 - Used that feature data to train a model
 
 
## Evaluation

https://scikit-learn.org/stable/modules/model_evaluation.html#metrics-and-scoring-quantifying-the-quality-of-predictions

## Neural Net training tips

Training neural nets (although some of this is vision-specific): 
http://karpathy.github.io/2019/04/25/recipe/

https://twitter.com/jmhessel/status/1111715093404884992?s=21

## Random Q&A

 - How are texts tokenized? https://huggingface.co/transformers/tokenizer_summary.html#byte-pair-encoding
 - How can we combine word embeddings? https://arxiv.org/abs/1805.09843

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import torch
import pickle
from datetime import datetime
from tqdm import tqdm

In [None]:
# set up logging; only run this cell once
import logging
use_cbrec_logging = True
if not use_cbrec_logging:
    # this is a demo of how to set up logging
    # since we use cbrec logging below, this will be done for us when we call set_up_logging.
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    stream_handler.setFormatter(formatter)
    root.addHandler(stream_handler)

## Import cbcore

Only necessary for paths and some utility functions, so you may not need this.

In [None]:
import sys
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
sys.path.append(caringbridge_core_path)
import cbcore.data.paths

## Import cbrec

In [None]:
from pathlib import Path
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

In [None]:
import sys
sys.path.append(os.path.join(git_root_dir, 'src'))

In [None]:
import cbrec.genconfig

In [None]:
# create a config, which is needed by lots of the components for resolving paths, etc.
config = cbrec.genconfig.Config()

In [None]:
import cbrec.evaluation
import cbrec.reccontext
import cbrec.featuredb
import cbrec.torchmodel
import cbrec.utils
import cbrec.logutils
import cbrec.feature_loader

In [None]:
cbrec.logutils.set_up_logging()

In [None]:
# turn off matplotlib logging
# which can be quite verbose and usually is not useful
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

# Load feature matrices

In [None]:
# load train features
feature_cache_dir = os.path.join(config.torch_experiments_dir, 'feature_cache')
filenames = [
    ('X_train_raw.pkl', 'y_train_raw.pkl'),
    ('X_test2train_raw.pkl', 'y_test2train_raw.pkl'),
]

def get_features(x_filename, y_filename):
    """
    Load a pickled feature matrix and its pickled labels from feature_cache_dir.

    Both filenames are resolved relative to the module-level feature_cache_dir.
    Returns the two unpickled objects as an (X, y) tuple.

    NOTE: pickle.load can execute arbitrary code; only use this on cache files you trust.
    """
    loaded = []
    for filename in (x_filename, y_filename):
        filepath = os.path.join(feature_cache_dir, filename)
        with open(filepath, 'rb') as infile:
            loaded.append(pickle.load(infile))
    X, y = loaded
    return X, y

x_filename, y_filename = filenames[0]
X_train, y_train = get_features(x_filename, y_filename)
    
x_filename, y_filename = filenames[1]
X_test, y_test = get_features(x_filename, y_filename)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
y_train[:40]

## Feature details

Why does each row have 1563 columns?

Each row represents TWO USPs.
Each USP is composed of activity features (9), network features (3), and text features (768).
In addition, the two USPs have 3 SHARED (pair-level) features.

The two USPs are:
 - the SOURCE
 - the CANDIDATE
 
For the non-text features, you can find the code that generates them in cbrec.feature_extraction.

In [None]:
(9 + 3 + 768) * 2 + 3

In [None]:
# half of the training data is 1s, the other half is 0s
np.sum(y_train)

In [None]:
# shuffle the data
inds = np.arange(len(X_train))
np.random.shuffle(inds)
X_train = X_train[inds]
y_train = y_train[inds]

In [None]:
np.mean(X_train, axis=0)[:5]

In [None]:
# scale the data to speed up convergence
import sklearn.preprocessing
scaler = sklearn.preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
np.mean(X_train, axis=0)[:5]

In [None]:
import torch
import torchvision
import torchvision.transforms
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class LinearNet(nn.Module):
    """
    Small feed-forward net: two ReLU hidden layers, each followed by dropout,
    then a bias-free linear layer producing a single output per row.

    The output is a raw logit; no sigmoid is applied in forward(), since the
    loss used with this net includes the sigmoid transformation.
    """
    def __init__(self, n_input, n_hidden, dropout_p=0.2):
        super().__init__()
        # the attribute names below become the keys of state_dict()
        self.fc1 = nn.Linear(in_features=n_input, out_features=n_hidden)
        self.fc2 = nn.Linear(in_features=n_hidden, out_features=n_hidden)
        self.fc3 = nn.Linear(in_features=n_hidden, out_features=1, bias=False)
        self.dropout1 = nn.Dropout(p=dropout_p)
        self.dropout2 = nn.Dropout(p=dropout_p)

    def forward(self, x):
        hidden = self.dropout1(F.relu(self.fc1(x)))
        hidden = self.dropout2(F.relu(self.fc2(hidden)))
        # return the logit directly; the sigmoid is left to the loss function
        return self.fc3(hidden)

In [None]:
# Train a LinearNet on (X_train, y_train) with Adam + a one-cycle LR schedule,
# log train loss/accuracy per epoch, then report accuracy on (X_test, y_test).
logger = logging.getLogger("notebook.ZachTorchExperimentsDemo")

n_train = len(y_train)
n_test = len(y_test)

verbose = True
n_hidden = 100
n_epochs = 100
lr_init = 0.01
max_lr = 0.02  # 0.0155
dropout_p = 0.1
# full-batch training by default; set a smaller value to train on minibatches
minibatch_size = len(y_train)
minibatch_size = min(n_train, minibatch_size)  # if minibatch_size is larger than n_train, force it to n_train
n_minibatches = int(np.ceil(n_train / minibatch_size))

n_input = X_train.shape[1]
# note: with text features included, input dim is 27 non-text features (2 * (9 + 3) + 3)
# plus 2 * 768 text features (see "Feature details")
net = LinearNet(n_input, n_hidden, dropout_p)

#optimizer = optim.SGD(net.parameters(), lr=lr_init, momentum=0.9)
optimizer = optim.Adam(net.parameters(), lr=lr_init)
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=max_lr,
    steps_per_epoch=n_minibatches,
    epochs=n_epochs,
)

criterion = nn.BCEWithLogitsLoss()  # pointwise loss function

# nn.Linear parameters are float32 by default; cast explicitly so that a float64
# feature matrix does not trigger a dtype mismatch in the forward pass
X_train_tensor = torch.from_numpy(X_train).float()
y_train_tensor = torch.from_numpy(y_train)
y_train_tensor = y_train_tensor.view(-1, 1)  # make labels 2-dimensional
y_train_tensor = y_train_tensor.type_as(X_train_tensor)
if verbose:
    logger.info(f"Input tensor sizes: {X_train_tensor.size()}, {y_train_tensor.size()}")

net.train()
for epoch in range(n_epochs):
    s = datetime.now()

    # shuffle the training data
    # I am not sure if this matters at all
    epoch_order = torch.randperm(n_train)

    mb_metrics = []  # store the minibatch_metrics, then average after
    for minibatch in range(n_minibatches):
        minibatch_start = minibatch * minibatch_size
        minibatch_end = min(minibatch_start + minibatch_size, n_train)
        if verbose and epoch == 0:
            logger.info(f"    Minibatch for inds in {minibatch_start} - {minibatch_end}.")
        minibatch_inds = epoch_order[minibatch_start:minibatch_end]

        inputs = X_train_tensor[minibatch_inds]
        labels = y_train_tensor[minibatch_inds]

        # reset gradients before EVERY optimizer step; otherwise gradients from
        # earlier minibatches in the epoch accumulate into later steps
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # the one-cycle schedule advances once per minibatch

        # compute and log the loss
        y_train_pred = torch.sigmoid(outputs.detach()).view((-1,)).numpy()
        y_train_pred = (y_train_pred >= 0.5).astype(int)  # binarize predictions with a 0.5 decision boundary
        y_train_minibatch = y_train[minibatch_inds.numpy()]
        acc = np.sum(y_train_pred == y_train_minibatch) / len(y_train_minibatch)

        mb_metrics.append((loss.item(), acc))
    # average the per-minibatch (loss, accuracy) pairs into epoch-level values
    loss, acc = np.mean(np.array(mb_metrics), axis=0)

    should_stop_early = loss < 0.001
    if verbose and (epoch < 5 or epoch == n_epochs - 1 or epoch % 10 == 0 or should_stop_early):
        # TODO we should compute loss and accuracy based on the validation set here
        logger.info(f"{epoch:>3} ({datetime.now() - s}): loss={loss:.4f} accuracy={acc*100:.2f}% LR={optimizer.param_groups[0]['lr']:.2E}")
    if should_stop_early:
        break
# keep the final training statistics around for later inspection
final_train_loss = loss
final_epoch_count = epoch + 1
if verbose:
    logger.info(f"Completed {final_epoch_count} epochs with a final train loss of {final_train_loss:.4f}.")

# evaluate on the held-out matrix with dropout disabled and no gradient tracking
net.eval()
with torch.no_grad():
    X_test_tensor = torch.from_numpy(X_test).float()
    outputs = net(X_test_tensor)
    y_test_pred = torch.sigmoid(outputs.detach()).view((-1,)).numpy()
    y_test_pred = (y_test_pred >= 0.5).astype(int)
    acc = np.sum(y_test_pred == y_test) / len(y_test)
    logger.info(f"Test acc: {acc*100:.2f}%")

In [None]:
# save the model
model_cache_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'torch_experiments', 'model_cache')
torch.save(net.state_dict(), os.path.join(model_cache_dir, 'ZachTestNet.pt'))

# Create test RecContexts

## Create RecContexts from scratch

None of this code needs to be run; you can just skip to the next section, which loads from the pickle file containing the instantiated RecContext objects.

Note that the FULL RecContext file is 117 GB!  That's why you should load the random set of 2000 instead.

In [None]:
test_md_list = [md for md in cbrec.utils.stream_metadata_list(config.metadata_filepath) if md['type'] == 'test']
len(test_md_list)

In [None]:
fl = cbrec.feature_loader.FeatureLoader(config)

In [None]:
import cbrec.text.embeddingdb

In [None]:
journal_embedding_map = {}

config.text_feature_db_filepath = os.path.join(config.feature_data_dir, 'test_text_feature.sqlite')
db = cbrec.text.embeddingdb.get_text_feature_db(config)
with db:
    for text in tqdm(cbrec.text.embeddingdb.stream_text_features(db), total=998905):
        journal_id = text['text_id']
        journal_embedding_map[journal_id] = text['feature_arr']
len(journal_embedding_map)

In [None]:
# index the test metadata by id so each streamed context can find its timestamp
test_md_map = {md['metadata_id']: md for md in test_md_list}

required_journal_ids = set()
n_invalid = 0  # USPs with fewer than 3 journal updates before the interaction
n_error = 0  # test contexts containing at least one such USP

db = cbrec.featuredb.get_db_by_filepath(config.feature_db_filepath)
with db:
    context_stream = cbrec.featuredb.stream_test_contexts(db, config)
    for test_context in tqdm(context_stream, desc='Streaming test contexts', total=32612):
        test_context_md = test_md_map[test_context['metadata_id']]
        interaction_timestamp = int(test_context_md['timestamp'])

        # the first two columns of each row are used as the USP key
        source_usp_arr = test_context['source_usp_arr']
        candidate_usp_arr = test_context['candidate_usp_arr']
        source_usps = [(row[0], row[1]) for row in source_usp_arr]
        candidate_usps = [(row[0], row[1]) for row in candidate_usp_arr]

        n_invalid_before = n_invalid
        for usp in source_usps + candidate_usps:
            journal_ids = fl.journal_id_lookup.get_journal_updates_before(usp, interaction_timestamp)
            if len(journal_ids) >= 3:
                required_journal_ids.update(journal_ids)
            else:
                n_invalid += 1
        # a context counts as an error if any of its USPs was invalid
        error = n_invalid > n_invalid_before
        if error:
            n_error += 1
len(required_journal_ids), n_error, n_invalid

In [None]:
# write one journal id per line
required_journal_ids_filepath = os.path.join(config.model_data_dir, 'test_journal_oids.txt')
with open(required_journal_ids_filepath, 'w') as outfile:
    outfile.writelines(journal_oid + "\n" for journal_oid in required_journal_ids)
logging.info(f"Wrote {len(required_journal_ids)} journal ids to '{required_journal_ids_filepath}'.")

In [None]:
# Stream every test context and build a RecContext for each one.
# NOTE: the created RecContext is not stored; each iteration overwrites `rc`.
test_md_map = {md['metadata_id']: md for md in test_md_list}

db = cbrec.featuredb.get_db_by_filepath(config.feature_db_filepath)
with db:
    for test_context in tqdm(cbrec.featuredb.stream_test_contexts(db, config), desc='Streaming test contexts'):
        # use the metadata that belongs to THIS test context (previously a stale/undefined `md` was passed)
        test_context_md = test_md_map[test_context['metadata_id']]
        rc = cbrec.reccontext.RecContext.create_from_test_context(config, test_context_md, test_context)
        
        

In [None]:
rng = np.random.default_rng(13)
subset_md_list = rng.choice(test_md_list, size=1000, replace=False)
len(subset_md_list)

In [None]:
rc_list = []
db = cbrec.featuredb.get_db_by_filepath(config.feature_db_filepath)
with db:
    for md in tqdm(subset_md_list, desc="Creating test RecContexts"):
        metadata_id = md['metadata_id']
        test_context = cbrec.featuredb.get_test_context_by_metadata_id(db, metadata_id, config)
        rc = cbrec.reccontext.RecContext.create_from_test_context(config, md, test_context)
        rc_list.append(rc)

In [None]:
fl = cbrec.feature_loader.FeatureLoader(config)

In [None]:
# identify how many journal ids are required to create the appropriate feature matrices WITH text data
required_journal_ids = fl.identify_required_journal_ids(subset_md_list)
len(required_journal_ids)

In [None]:
# compare against the previously written required journal ids
# note: this only checks that every id in test1000_required_journal_oids.txt is in required_journal_ids;
# it does not detect ids in required_journal_ids that are missing from the file
previous_ids_filepath = os.path.join(config.model_data_dir, 'test1000_required_journal_oids.txt')
with open(previous_ids_filepath, 'r') as infile:
    for line in infile:
        journal_id = line.strip()
        if journal_id == "":
            continue  # ignore blank lines
        assert journal_id in required_journal_ids

In [None]:
with open(os.path.join(config.model_data_dir, 'test1000_required_journal_oids.txt'), 'w') as outfile:
    for journal_id in required_journal_ids:
        outfile.write(journal_id + "\n")

To generate the feature database:

    sbatch -p amdsmall make_text_features_test.sh

Which runs: 

    python cbrec/text/createTextFeatureSqlite.py --text-id-txt /home/lana/shared/caringbridge/data/projects/recsys-peer-match/model_data/test1000_required_journal_oids.txt --text-feature-db-filename test_text_feature.sqlite --n-processes 3



In [None]:
rc = rc_list[0]
rc.target_inds

In [None]:
rc.candidate_usp_arr.shape

In [None]:
# this is the target USP, from within the list of candidates
rc.candidate_usp_arr[rc.target_inds]

In [None]:
# note that this takes almost 2 hours!
# it also uses a ton of RAM
# load from the pickle instead
for rc in tqdm(rc_list, desc='Creating feature matrices'):
    # pair features are stored source-major: row (i * n_candidates + j) holds the (source i, candidate j) pair
    n_candidates = len(rc.candidate_usp_arr)
    rows = []
    for i, source_feature_arr in enumerate(rc.source_usp_mat):
        for j, candidate_feature_arr in enumerate(rc.candidate_usp_mat):
            source_candidate_feature_arr = rc.user_pair_mat[i * n_candidates + j, :]
            # column order: source features, candidate features, pair features
            rows.append(np.concatenate([
                source_feature_arr,
                candidate_feature_arr,
                source_candidate_feature_arr,
            ]))
    X = np.vstack(rows)
    rc.X_test = X

In [None]:
# save rc_list to pickle
s = datetime.now()
feature_cache_dir = os.path.join(config.torch_experiments_dir, 'feature_cache')
with open(os.path.join(feature_cache_dir, 'rc_test_notext.pkl'), 'wb') as outfile:
    pickle.dump(rc_list, outfile, protocol=pickle.HIGHEST_PROTOCOL)
print(f"Saved {len(rc_list)} to pickle in {datetime.now() - s}.")

In [None]:
# save subset of rc_list to pickle
rng = np.random.default_rng(12)
subset_rc_list = rng.choice(rc_list, size=2000, replace=False)

s = datetime.now()
feature_cache_dir = os.path.join(config.torch_experiments_dir, 'feature_cache')
with open(os.path.join(feature_cache_dir, 'rc_test_notext_2000.pkl'), 'wb') as outfile:
    pickle.dump(subset_rc_list, outfile, protocol=pickle.HIGHEST_PROTOCOL)
print(f"Saved {len(subset_rc_list)} to pickle in {datetime.now() - s}.")

In [None]:
fl.config.text_feature_db_filepath = os.path.join(fl.config.feature_data_dir, 'test_text_feature.sqlite')
fl.config.text_feature_db_filepath

In [None]:
for rc in tqdm(rc_list):
    rc.X_test = fl.get_input_matrix_from_test_context(rc)

In [None]:
dm = cbrec.data.DataManager(config, load_ints=False, load_journals=True)

In [None]:
journal_df = dm.get_filtered_journals().sort_values(by=['user_id', 'site_id', 'published_at'])
len(journal_df)

In [None]:
timestamp = rc_list[0].timestamp
usp = (0, 0)

In [None]:
rc = rc_list[0]
int_created_at = int(rc.timestamp)
int_user_id = rc.source_user_id
int_site_id = rc.target_site_id
int_created_at, int_user_id, int_site_id

In [None]:
journal_df[(journal_df.user_id == usp[0])&(journal_df.site_id == usp[1])]

In [None]:
journal_df[journal_df.site_id == usp[1]].user_id.value_counts()

In [None]:
journal_df[journal_df.user_id == usp[0]].site_id.value_counts()

In [None]:
# TODO write a function to identify invalid usps... but more importantly, investigate why those usps are in the candidate list?

In [None]:
n_not_present = 0
n_candidate_usps = 0
for usp in rc.candidate_usp_arr:
    usp = (usp[0], usp[1])
    n_candidate_usps += 1 
    if usp not in fl.journal_id_lookup.usp_journal_timestamp_map:
        n_not_present += 1
n_not_present, n_candidate_usps

In [None]:
assert usp in fl.journal_id_lookup.usp_journal_timestamp_map
fl.journal_id_lookup.get_journal_updates_before(usp, timestamp)

In [None]:
# if the rc_list isn't already generated, can create it WITH text data using the following
rc_list = fl.get_reccontexts_from_test_contexts(subset_md_list)
len(rc_list)

## Score and inspect scores for RecContexts

Loads the subset of 2000 RecContexts from pickle.

In [None]:
# load rc_list from pickle
s = datetime.now()
feature_cache_dir = os.path.join(config.torch_experiments_dir, 'feature_cache')
with open(os.path.join(feature_cache_dir, 'rc_test_notext_2000.pkl'), 'rb') as infile:
    rc_list = pickle.load(infile)
print(f"Loaded {len(rc_list)} RecContexts in {datetime.now() - s}.")

In [None]:
for rc in rc_list:
    if len(rc.source_usp_arr) > 2:
        print(rc.X_test.shape, len(rc.source_usp_arr), len(rc.candidate_usp_arr))

In [None]:
# see the feature description above!
rc.X_test[0,:]

In [None]:
import cbrec.evaluation
class CustomModelScorer(cbrec.evaluation.Scorer):
    """
    Demo scorer that uses the last column of the RecContext's X_test matrix as the score.
    """
    def __init__(self, config, test_context: cbrec.reccontext.RecContext,
                 # TODO pass in a model object here, if appropriate
                 model_name="CustomModel"):
        super().__init__(config, test_context, coverage_tracker=None, save_scores=True)
        self.model_name = model_name

    def score(self):
        """
        Score the RecContext.

        Use self.test_context to produce a y_score_site list, and return a dictionary of metrics.

        """
        features = self.test_context.X_test
        # alternative (random model): y_score = np.random.random(size=(features.shape[0]))
        # in the rc arrays, the last feature corresponds to "is_reciprocal"
        y_score = features[:, -1].astype(int)

        # the empty 'full' score array is only consulted for its shape
        full_shape = self.get_empty_score_arr('full').shape
        y_score_mat = y_score.reshape((full_shape[1], full_shape[0])).T

        merged_ranking = self.merge_multisource_rankings(y_score_mat)
        y_score_site = self.reduce_usp_ranking_to_site(merged_ranking)
        self.compute_metrics(y_score_site, model_name=self.model_name)

        return self.metrics_dict[self.model_name]

In [None]:
scorer = CustomModelScorer(config, rc_list[0])
scorer

In [None]:
scorer.score()

In [None]:
metric_dicts = []
for rc in tqdm(rc_list):
    scorer = CustomModelScorer(config, rc)
    metric_dict = scorer.score()
    metric_dicts.append(metric_dict)
len(metric_dicts)

In [None]:
# convert the metrics into a Pandas dataframe for easier management
df = pd.DataFrame(metric_dicts)
df.sample(n=5)

In [None]:
# compute "Mean Reciprocal Rank"
df['mrr'] = 1 / df.target_rank

In [None]:
# this is a key model metric!
df.mrr.mean()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(5, 5))

ax.hist(df.target_raw_score)
ax.set_title("Distribution of target scores")

plt.show()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

ax = axes[0]
sdf = df[df.target_raw_score == 0]
ax.hist(1 - (sdf.target_rank / sdf.n), bins=20)

ax = axes[1]
sdf = df[df.target_raw_score == 1]
ax.hist(1 - (sdf.target_rank / sdf.n), bins=50)

plt.show()

## Demo: Inspecting the test data

In [None]:
# these are all of the objects defined on the test RecContexts
[v for v in rc.__dir__() if not v.startswith("_")]

In [None]:
# we can create a dataframe from the metadata that describes these RecContexts
md_df = pd.DataFrame([rc.md for rc in rc_list])
len(md_df)

In [None]:
md_df.columns

In [None]:
# value_counts is a very useful function
md_df.n_target_usps.value_counts()

In [None]:
# every RC has a target
md_df.has_target.value_counts()

In [None]:
# these are cases where the target user was not an active user
md_df.test_target_usp_adjustment.value_counts()

In [None]:
# we can add new columns to the dataframe to aid in our analysis
# for example, we add the length of target_inds
n_target_inds = [len(rc.target_inds) for rc in rc_list]
md_df['n_target_inds'] = n_target_inds
md_df.n_target_inds.value_counts()

In [None]:
# uh oh! this looks like a bug: target_inds should have the same length as n_target_usps
# but it looks like it is incorrectly empty when there are multiple target_usps
# for now, we'll have to just ignore those cases where n_target_inds == 0
pd.crosstab(md_df.n_target_inds, md_df.n_target_usps)

In [None]:
# create a matrix that contains the features for all targets
arrs = []
for rc in rc_list:
    if len(rc.target_inds) == 0:
        continue
    target_feature_arr = rc.candidate_usp_mat[rc.target_inds]  # shape: 12 x 1
    source_target_feature_arr = rc.user_pair_mat[rc.target_inds]  # shape: 3 x 1
    arr = np.concatenate([target_feature_arr.reshape(-1), source_target_feature_arr.reshape(-1)])
    arrs.append(arr)
X_target = np.vstack(arrs)
X_target.shape

In [None]:
# in this example, we inspect the distributions of the features that are computed, in terms of their value for the TARGET usp
fig, axes = plt.subplots(4, 4, figsize=(14, 12))
axes = np.array(axes).reshape(-1)

# this is a custom list of feature names, which you may find useful to use
usp_feature_ind2name_map = {
    0: 'indegree',
    1: 'outdegree',
    2: 'component_size',
    3: 'journal_count',
    4: 'journal_time_to_most_recent',
    5: 'amp_count',
    6: 'amp_time_to_most_recent',
    7: 'comment_count',
    8: 'comment_time_to_most_recent',
    9: 'guestbook_count',
    10: 'guestbook_time_to_most_recent',
    11: 'time_to_first_update',
}
pair_feature_ind2name_map = {
    0: 'are_weakly_connected',
    1: 'is_fof',
    2: 'is_reciprocal',
}

for i in range(X_target.shape[1]):  # for each feature column....
    x = X_target[:,i]
    ax = axes[i]
    ax.hist(x, bins=20, log=True)
    if i in usp_feature_ind2name_map:
        ax.set_title(usp_feature_ind2name_map[i])
    else:
        ax.set_title(pair_feature_ind2name_map[i - 12])
    

fig.tight_layout()
plt.show()

## Demo: Inspecting the training data

In [None]:
# column order in the training data is source_feature_arr, candidate_feature_arr, source_candidate_feature_arr, source_text_arr, candidate_text_arr
# column order in the test data (as defined in this notebook) is source_feature_arr, candidate_feature_arr, source_candidate_feature_arr
X_train.shape, y_train.shape

In [None]:
# a quick-and-dirty plot with Matplotlib, revealing that indegree follows a power law distribution
plt.hist(X_train[:,0], log=True, bins=np.arange(0, 100))
plt.title("Distribution of source user's indegree")
plt.show()

In [None]:
# a more complex Matplotlib plot, using the Pythonic API
# we compare targets and alts
fig, ax = plt.subplots(1, 1, figsize=(6, 5))
ax.hist(X_train[y_train==1,12], log=True, bins=np.arange(0, 100), alpha=0.5, label='Target')
ax.hist(X_train[y_train==0,12], log=True, bins=np.arange(0, 100), alpha=0.5, label='Alts')
ax.set_title("Distribution of target and alt indegree")
ax.legend()
plt.show()

In [None]:
sdf = pd.DataFrame(data={'y': y_train, 'indegree': X_train[:,12]})
len(sdf)

In [None]:
pd.crosstab(sdf.y, (sdf.indegree > 0).rename('previously received initiation?'), normalize='index' )

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(6, 5))
ax.hist(X_train[y_train==1,13], log=True, bins=np.arange(0, 100), alpha=0.5, label='Target')
ax.hist(X_train[y_train==0,13], log=True, bins=np.arange(0, 100), alpha=0.5, label='Alts')
ax.set_title("Distribution of target and alt outdegree")
ax.legend()
plt.show()

In [None]:
sdf = pd.DataFrame(data={'y': y_train, 'outdegree': X_train[:,13]})
len(sdf)

In [None]:
pd.crosstab(sdf.y, (sdf.outdegree > 0).rename('previous initiation?'), normalize='index')

## Bugfix: the target_inds bug identified above

Here's the original buggy snippet of code (with an added assertion that will fail when n_target_inds > 1):

```python
sort_inds = self.candidate_usp_arr[:,1].argsort()
self.candidate_usp_arr = self.candidate_usp_arr[sort_inds]
# update which inds contain the target (if any)
n_target_inds = len(self.target_inds)
self.target_inds = np.argwhere(self.target_inds == sort_inds)
assert len(self.target_inds) == n_target_inds
```


In [None]:
candidate_usp_arr = np.array([
    [2, 3],
    [1, 1],
    [1, 2],
    [2, 4],
    [3, 5],
])
target_inds = np.array([
    0, 3
])

In [None]:
sort_inds = candidate_usp_arr[:,1].argsort()
candidate_usp_arr = candidate_usp_arr[sort_inds]
candidate_usp_arr

In [None]:
sort_inds

In [None]:
np.argwhere(np.isin(sort_inds, target_inds))

In [None]:
# update which inds contain the target (if any)
n_target_inds = len(target_inds)
target_inds = np.argwhere(np.isin(sort_inds, target_inds))
assert len(target_inds) == n_target_inds

In [None]:
target_inds

In [None]:
# test with a single target ind
# which revealed that we should also be calling .ravel() to keep target_inds 1-dimensional
candidate_usp_arr = np.array([[2, 3], [1, 1], [1, 2], [3, 5]])
target_inds = np.array([0])

# reorder the candidates by their second column
sort_inds = np.argsort(candidate_usp_arr[:, 1])
candidate_usp_arr = candidate_usp_arr[sort_inds]

# update which inds contain the target (if any):
# a sorted position holds a target if the original index it came from was a target index
n_target_inds = len(target_inds)
is_target_position = np.isin(sort_inds, target_inds)
target_inds = np.argwhere(is_target_position).ravel()
assert len(target_inds) == n_target_inds

candidate_usp_arr, target_inds

## cbrec.modeling Demo

In [None]:
import cbrec.modeling.modelconfig
import cbrec.modeling.scorer
import cbrec.modeling.manager

In [None]:
model_config = cbrec.modeling.modelconfig.ModelConfig()
model_config

In [None]:
# override default configuration values here
model_config.train_n_epochs = 20

In [None]:
model_manager = cbrec.modeling.manager.ModelManager(model_config, config=config)
model_manager

In [None]:
model_manager.train_model(X_train, y_train)

In [None]:
# use the model_manager to score the data
y_test_score = model_manager.score_test_matrix(X_test)
y_test_score.shape, y_test.shape

In [None]:
# compute test accuracy from the scores
y_pred = (y_test_score >= 0.5).astype(int)
np.sum(y_pred == y_test) / len(y_test)

In [None]:
model_manager.save_model()

In [None]:
mm = cbrec.modeling.manager.ModelManager.load_from_model_name('LinearNet')

In [None]:
mm.load_model(load_model_state_dict=True, load_training_metrics=True)

In [None]:
# now that we've loaded the saved model data, we can plot the model's metrics
train_metrics, test_metrics = mm.model_trainer.get_train_metrics()

fig, ax = plt.subplots(1, 1, figsize=(6, 6))

xs = test_metrics.T[:,0]
ys = test_metrics.T[:,1]
ax.plot(xs, ys, label='Test')

xs = train_metrics.T[:,0]
ys = train_metrics.T[:,1]
ax.plot(xs, ys, label='Train')

ax.legend()

plt.show()