In [22]:
%load_ext autoreload
%autoreload 2

# add src to path
import sys
sys.path.append('/cluster/home/kheuto01/code/play-with-learning-army/src')
import numpy as np
# change directory to this files directory
import os
os.chdir('/cluster/home/kheuto01/code/play-with-learning-army')
from data_loader import load_processed, make_dataset
from embedder_registry import initialize_embedding, initialize_criteria_embedding, initialize_combiner
from domain_models import initialize_domain_models
from loss_opt import initialize_loss, initialize_optimizer
import torch
import yaml
import transformers
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, average_precision_score

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
experiment_directory = '/cluster/tufts/hugheslab/kheuto01/sensemaking/bertfinetune_test/test15_lr5e-07_alpha0.001_beta0.1'
problem_config_path = '/cluster/home/kheuto01/code/play-with-learning-army/config/problem_config.yaml'
test_metrics_path = os.path.join(experiment_directory, 'test_metrics.csv')
config_path = os.path.join(experiment_directory, 'config.yaml')
model_path = os.path.join(experiment_directory, 'final_model.pth')
hyper_config = yaml.load(open(config_path, 'r'), Loader=yaml.FullLoader)
problem_config = yaml.load(open(problem_config_path, 'r'), Loader=yaml.FullLoader)
num_domains = problem_config['num_domains']

In [4]:
save_dict = torch.load(model_path, weights_only=False, map_location=torch.device('cpu'))
domain_model_dict = save_dict['domain_model_dict']
embedder = save_dict['embed_func']
embedder.device = 'cpu'

In [5]:
processed_test_features, processed_test_labels = load_processed(hyper_config['test_x_file'], 
                                                                hyper_config['test_y_file'])
(xs, ys, problem_ids, student_ids) = make_dataset(processed_test_features, processed_test_labels)
xs, ys, problem_ids, student_ids = embedder.preprocess_data((xs, ys, torch.tensor(problem_ids), torch.tensor(student_ids)), hyper_config)

In [45]:
all_preds, all_labels, all_weights, all_domains = [], [], [], []

if isinstance(xs, transformers.BatchEncoding):
    batch_length = len(next(iter(xs.values())))
else:
    batch_length = len(xs)
    
for i in range(batch_length):
    if isinstance(xs, transformers.BatchEncoding):
        x = {k: v[i].unsqueeze(0) for k, v in xs.items()}
    else:
        x = xs[i].unsqueeze(0)
    y = ys[i]
    p = problem_ids[i]
    s = student_ids[i]

    x_embed = embedder.forward(x)

    criteria_counter = 0
    for d in range(num_domains):
        num_criteria = problem_config['problems'][p]['domains'][d]["num_criteria"]
        for c in range(num_criteria):
            c_embed = torch.tensor([c])
            final_representation = torch.cat((x_embed, c_embed.unsqueeze(0)), dim=1)
            y_pred = domain_model_dict[d](final_representation)
            weight = 1/num_criteria

            all_preds.append(y_pred.detach().cpu().numpy())
            all_labels.append(y[criteria_counter].cpu().numpy())
            all_weights.append(weight)
            all_domains.append(d)
            criteria_counter += 1

all_preds_np = np.concatenate(all_preds).flatten()
all_labels_np = np.array(all_labels).flatten()
all_labels = torch.tensor(np.array(all_labels)).squeeze()
all_preds = torch.tensor(all_preds).squeeze()


In [63]:
def bootstrap_metric_across_domains(y_pred, y_true, metric_func, domain_ids):
    domain_metrics = {}
    unique_domains = np.unique(domain_ids)
    print(unique_domains)
    for d in unique_domains:
        d_idx = np.where(domain_ids == d)[0]
        domain_metrics[d] = {}
        _,_,_,domain_metrics[d]['bootstrap_scores'] = bootstrap_metric(y_pred[d_idx], y_true[d_idx], metric_func)
        domain_metrics[d]['mean'] = domain_metrics[d]['bootstrap_scores'].mean()
        domain_metrics[d]['std_err'] =domain_metrics[d]['bootstrap_scores'].std()/np.sqrt(len(domain_metrics[d]['bootstrap_scores']))
    return domain_metrics

#https://github.com/tufts-ml/SupContrast/blob/tmlr/bootstrap_lin_acc.py
def bootstrap_metric(y_pred, y_true, metric_func,
                     n_bootstraps=1000, rng_seed=123):
    """Compute test set boostrapping of a metric
    Args:
        y_pred (tensor): Model predictions for some output y
        y_true (tensor): True value of output y
        metric_func (function): function with parameters (y_pred, y_true)
                                returning a Tensor castable metric
        n_bootstraps (int, optional): Number of bootstrap samples to take.
                                      Defaults to 200.
        rng_seed (int, optional): Random seed for reproducibility.
                                  Defaults to 123.
    Returns:
        tuple: metric_mean: Tensor with bootstrapped mean of metric
               ci_low: Low value from 95% confidence interval
               ci_high: High value from 95% confidence interval
               b_scores: Bootstrapped metric outputs
    """
    b_scores = None
    rng = torch.random.manual_seed(rng_seed)
    # bootstrap
    for _ in range(n_bootstraps):
        sample_idx = torch.randint(y_pred.shape[0], size=(y_pred.shape[0],), generator=rng)
        score = torch.Tensor([metric_func(y_pred[sample_idx], y_true[sample_idx])])
        # store results from each run along axis 0, with other axes' shape determined by metric
        if b_scores is None:
            b_scores = score.unsqueeze(0)
        else:
            b_scores = torch.vstack((b_scores, score))
    # compute mean and confidence interval
    metric_mean = torch.mean(b_scores, dim=0)
    ci_low = torch.quantile(b_scores, 0.025, dim=0)
    ci_high = torch.quantile(b_scores, 0.975, dim=0)
    return (metric_mean, ci_low, ci_high, b_scores)


def bootstrap_dif(b_scores_1, b_scores_2):
    """Examine the difference of two bootstrapped metrics

    Args:
        b_scores_1 (Tensor): Bootstrapped metric outputs, with same seed as 2
        b_scores_2 (Tensor): Bootstrapped metric outputs, with same seed as 1
    Returns:
        tensor: True if 95% CI does not contain 0 so result is statistically significant
                False if 95% CI contains 0 so result is not statistically significant
    """
    dif_scores = b_scores_1 - b_scores_2
    # compute confidence interval of the difference
    ci_low = torch.quantile(dif_scores, 0.025, dim=0)
    ci_high = torch.quantile(dif_scores, 0.975, dim=0)
    return ~torch.logical_and(ci_low <= 0, ci_high >= 0)

In [70]:
domain_auprc = bootstrap_metric_across_domains(all_labels, all_preds, average_precision_score, all_domains)
micro_auprc, _, _, micro_auprc_scores = bootstrap_metric(all_labels, all_preds, average_precision_score)

[0 1 2 3 4 5 6 7]




In [68]:
all_domain_properties = problem_config['problems'][0]['domains']
for d in range(num_domains):
    print(f"Domain {all_domain_properties[d]['name']}")
    print(f"AUPRC: {domain_auprc[d]['mean']} +/- {domain_auprc[d]['std_err']}")
    print(f"Properties: {all_domain_properties[d]}")
    print("\n")

Domain Objects
AUPRC: 0.982384204864502 +/- 0.0003669500001706183
Properties: {'name': 'Objects', 'num_criteria': 1}


Domain Influences
AUPRC: 0.832855761051178 +/- 0.002788752783089876
Properties: {'name': 'Influences', 'num_criteria': 0}


Domain Properties
AUPRC: 0.7330579161643982 +/- 0.00782721396535635
Properties: {'name': 'Properties', 'num_criteria': 0}


Domain Positioning
AUPRC: 0.7180802226066589 +/- 0.0022123451344668865
Properties: {'name': 'Positioning', 'num_criteria': 1}


Domain Movements
AUPRC: 0.751388669013977 +/- 0.0019306187750771642
Properties: {'name': 'Movements', 'num_criteria': 3}


Domain Interactions
AUPRC: 0.8882994651794434 +/- 0.0014369270065799356
Properties: {'name': 'Interactions', 'num_criteria': 0}


Domain Descriptive Relationships
AUPRC: 0.6425483822822571 +/- 0.0036470675840973854
Properties: {'name': 'Descriptive Relationships', 'num_criteria': 2}


Domain Mechanistic Relationships
AUPRC: 0.647122323513031 +/- 0.004285923205316067
Properties: {

In [71]:
print(f'micro_auprc: {micro_auprc} +/- {micro_auprc_scores.std()/np.sqrt(len(micro_auprc_scores))}')

micro_auprc: tensor([0.8618]) +/- 0.0006026943447068334
