In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
import json
import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import torch.nn as nn
from sklearn.cluster import KMeans

from helpers.utils import deferral_metrics, get_metrics, set_matplotlib_params
from networks.nonlinearnet_aihuman import optimize_alpha, test_time_prediction

set_matplotlib_params()
warnings.filterwarnings('ignore')
seed = 11
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.manual_seed(seed)
rng = np.random.default_rng(seed) 
torch.set_default_dtype(torch.double)
torch.set_default_tensor_type(torch.DoubleTensor)



In [3]:
### From the undersampled training set, create two training sets: one for clf, one for human, based on K-means (K=2)

drd2_train_undersampled = pd.read_csv("datasets/drd2_train_undersampled_ECFP_counts.csv")
ndrop = 0
drop = rng.choice(range(len(drd2_train_undersampled)), ndrop, replace=False)
drd2_train_undersampled.drop(drop, inplace=True)
d = 2048

X_train_all = torch.tensor(drd2_train_undersampled[[f"bit{i}" for i in range(d)]].values, dtype=torch.double)
y_train_all = torch.tensor(drd2_train_undersampled.activity.values, dtype=torch.double)

algo = KMeans(n_clusters=2, random_state=seed, n_init=20, max_iter=5000, init="k-means++")
algo.fit(X_train_all)
c1_train = np.where(algo.labels_)[0]
c2_train = [i for i in range(len(X_train_all)) if i not in c1_train]

X_train = X_train_all[c1_train, :]
X_train_h = X_train_all[c2_train,:]

y_train = y_train_all[c1_train].unsqueeze(1)
y_train_h = y_train_all[c2_train].unsqueeze(1)

### Test set, making it balanced

drd2_test = pd.read_csv("datasets/drd2_test_ECFP_counts.csv")
X_test_all = torch.tensor(drd2_test[[f"bit{i}" for i in range(d)]].values, dtype=torch.double)
y_test_all = torch.tensor(drd2_test.activity.values, dtype=torch.double)

idx_active = torch.where(y_test_all)[0].numpy()
idx_inactive = [i for i in range(len(y_test_all)) if i not in idx_active]
idx_inactive = rng.choice(range(len(y_test_all)), 600, replace=False)
idx = np.r_[idx_active,idx_inactive]
y_test_all = y_test_all[idx]
X_test_all = X_test_all[idx]

### Validation set as a fraction of the test set

frac = .2
ntest = len(X_test_all)
nval = int(ntest * frac)
idx = rng.choice(range(ntest), nval, replace=False)
X_val = X_test_all[idx]
y_val = y_test_all[idx].unsqueeze(1)

notidx = [i for i in range(ntest) if i not in idx]
X_test_all = X_test_all[notidx]
y_test_all = y_test_all[notidx].unsqueeze(1)
print(f"Global train set size: {len(X_train_all)}")
print(f"Clf train set size: {len(X_train)}")
print(f"Human train set size: {len(X_train_h)}\n")
print(f"Validation set size: {len(X_val)}\n")
print(f"Global test set size: {len(X_test_all)}")

### We split the test set to evaluate the accuracy of clf1 and clf2 using the learned clustering

c1 = np.where(algo.predict(X_test_all))[0]
c2 = [i for i in range(len(X_test_all)) if i not in c1]

X_test = X_test_all[c1, :]
X_test_h = X_test_all[c2,:]

y_test = y_test_all[c1]
y_test_h = y_test_all[c2]

print(f"Clf test set size: {len(X_test)}")
print(f"Human test set size: {len(X_test_h)}")

Global train set size: 2420
Clf train set size: 1415
Human train set size: 1005

Validation set size: 186

Global test set size: 744
Clf test set size: 466
Human test set size: 278


In [4]:
# example usage:
num_features = X_train.shape[1]  # number of input features
dropout = 0.2
num_epochs = 150
lr = 0.1

# define the loss function and optimizer for the l2d_model
criterion = nn.BCEWithLogitsLoss()  # use BCEWithLogitsLoss for binary classification

In [5]:
best_alpha, l2d_model = optimize_alpha([.1,.3,.5,.8,1.], lr, num_features, dropout, num_epochs, X_val, y_val, X_train, X_train_h, y_train, y_train_h, criterion)

Epoch [10/150], Loss: 0.0005914840885259426
Epoch [20/150], Loss: 0.0005916699729256175
Epoch [30/150], Loss: 0.0005910379166022064
Epoch [40/150], Loss: 0.0005912674163563718
Epoch [50/150], Loss: 0.0005910384467973082
Epoch [60/150], Loss: 0.0005911965491318176
Epoch [70/150], Loss: 0.0005911877461539912
Epoch [80/150], Loss: 0.0005912143691048357
Epoch [90/150], Loss: 0.0005910306354741356
Epoch [100/150], Loss: 0.0005910399706228473
Epoch [110/150], Loss: 0.000590974022806421
Epoch [120/150], Loss: 0.0005910340422956789
Epoch [130/150], Loss: 0.0005909850552665349
Epoch [140/150], Loss: 0.0005908852096537817
Epoch [150/150], Loss: 0.0005910314526315853
0.1 0.6007604562737643
Epoch [10/150], Loss: 0.000723754001502766
Epoch [20/150], Loss: 0.0007241374512846534
Epoch [30/150], Loss: 0.0007239572641477162
Epoch [40/150], Loss: 0.0007240153128168552
Epoch [50/150], Loss: 0.0007240287457027897
Epoch [60/150], Loss: 0.0007238417218035907
Epoch [70/150], Loss: 0.0007240246502108092
Epoch

In [6]:
metrics = {}
test_labels = [y_test, y_test_h]
test_features = [X_test, X_test_h]

print('Metrics computed using same distributions for train and test set')
for i in range(2):
    final_predictions, pred_clf, boolean, combined_outputs, decision_outputs = test_time_prediction(l2d_model, test_features[i])
    metrics[f"clf_{i+1}"] = get_metrics(test_labels[i], pred_clf[:, i])
print (json.dumps(metrics, indent=2, default=str))
metrics = {}
# for the system, we use the global test set
print('Metrics computed using the whole test set')
final_predictions, pred_clf, boolean, combined_outputs, decision_outputs = test_time_prediction(l2d_model, X_test_all)
for i in range(2):
    metrics[f"clf_{i+1}"] = get_metrics(y_test_all, pred_clf[:, i])
metrics[f"system"] = get_metrics(y_test_all, final_predictions)
print (json.dumps(metrics, indent=2, default=str))
print(f"Percentage of deferral: {boolean.mean()}")
for i in range(2):
     print(f'--- For label {i} ---')
     ndefer, ndefersuccess, ndeferuseful = deferral_metrics(y_test_all.squeeze(), pred_clf, boolean, i)
     print(f"Deferral: {ndefer} / {(y_test_all == i).sum()}")
     print(f"Successful deferrals: {ndefersuccess} / {ndefer}")
     print(f"Useful deferrals: {ndeferuseful} / {ndefersuccess}\n")
# print(f"of which {} were successful defers")

# print(f"Deferral for negative samples: {boolean[(y_test_all == 0).squeeze()].mean()}")

Metrics computed using same distributions for train and test set
{
  "clf_1": {
    "Accuracy": 0.5665236051502146,
    "Precision": 0.5227802956162575,
    "Recall": 0.5665236051502146,
    "F1-Score": 0.5060729833778393
  },
  "clf_2": {
    "Accuracy": 0.3345323741007194,
    "Precision": 0.11191190932146368,
    "Recall": 0.3345323741007194,
    "F1-Score": 0.16771703930656012
  }
}
Metrics computed using the whole test set
{
  "clf_1": {
    "Accuracy": 0.6061827956989247,
    "Precision": 0.5620847423331478,
    "Recall": 0.6061827956989247,
    "F1-Score": 0.5428001563621314
  },
  "clf_2": {
    "Accuracy": 0.3817204301075269,
    "Precision": 0.1457104867614753,
    "Recall": 0.3817204301075269,
    "F1-Score": 0.2109116773356763
  },
  "system": {
    "Accuracy": 0.6061827956989247,
    "Precision": 0.5620847423331478,
    "Recall": 0.6061827956989247,
    "F1-Score": 0.5428001563621314
  }
}
Percentage of deferral: 0.13037633895874023
--- For label 0 ---
Deferral: 53 / 460
S

In [7]:
## Individual metrics

for i in range(len(X_test_all)):
    print(
        f"y = {y_test_all[i].item()},", 
        f"classifier pred = {combined_outputs[i,0]:.3f},", 
        f"hum model pred = {combined_outputs[i,1]:.3f}",
        "deferred" if boolean[i] else "NOT deferred"
    )

y = 1.0, classifier pred = 0.485, hum model pred = 0.562 NOT deferred
y = 1.0, classifier pred = 0.478, hum model pred = 0.528 NOT deferred
y = 1.0, classifier pred = 0.489, hum model pred = 0.556 NOT deferred
y = 1.0, classifier pred = 0.494, hum model pred = 0.567 NOT deferred
y = 1.0, classifier pred = 0.494, hum model pred = 0.555 NOT deferred
y = 1.0, classifier pred = 0.488, hum model pred = 0.571 NOT deferred
y = 1.0, classifier pred = 0.476, hum model pred = 0.564 NOT deferred
y = 1.0, classifier pred = 0.495, hum model pred = 0.546 NOT deferred
y = 1.0, classifier pred = 0.478, hum model pred = 0.566 NOT deferred
y = 1.0, classifier pred = 0.487, hum model pred = 0.553 NOT deferred
y = 1.0, classifier pred = 0.472, hum model pred = 0.567 NOT deferred
y = 1.0, classifier pred = 0.483, hum model pred = 0.548 NOT deferred
y = 1.0, classifier pred = 0.502, hum model pred = 0.556 deferred
y = 1.0, classifier pred = 0.495, hum model pred = 0.541 NOT deferred
y = 1.0, classifier pred

In [8]:
best_alpha

0.1

In [None]:
torch.save(l2d_model.state_dict(), "models/l2d_model_demo2.pt")

In [None]:
drd2_train_undersampled.iloc[c1].rename(columns = {"activity": "activity_y"}).to_csv("datasets/drd2_train_undersampled_y_ECFP_counts.csv")
drd2_train_undersampled.iloc[c2].rename(columns = {"activity": "activity_h"}).to_csv("datasets/drd2_train_undersampled_h_ECFP_counts.csv")