In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
import json
import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import torch.nn as nn
from sklearn.cluster import KMeans

from helpers.utils import get_metrics, set_matplotlib_params
from networks.nonlinearnet_aihuman import optimize_alpha, test_time_prediction

set_matplotlib_params()
warnings.filterwarnings('ignore')
seed = 12
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.manual_seed(seed)
rng = np.random.default_rng(seed) 
torch.set_default_dtype(torch.double)
torch.set_default_tensor_type(torch.DoubleTensor)



In [3]:
### From the undersampled training set, create two training sets: one for clf, one for human, based on K-means (K=2)

drd2_train_undersampled = pd.read_csv("datasets/drd2_train_undersampled_ECFP_counts.csv")
d = 2048

X_train_all = torch.tensor(drd2_train_undersampled[[f"bit{i}" for i in range(d)]].values, dtype=torch.double)
y_train_all = torch.tensor(drd2_train_undersampled.activity.values, dtype=torch.double)

algo = KMeans(n_clusters=2, random_state=seed, n_init=20, max_iter=5000, init="k-means++")
algo.fit(X_train_all)
c1_train = np.where(algo.labels_)[0]
c2_train = [i for i in range(len(X_train_all)) if i not in c1_train]

X_train = X_train_all[c1_train, :]
X_train_h = X_train_all[c2_train,:]

y_train = y_train_all[c1_train].unsqueeze(1)
y_train_h = y_train_all[c2_train].unsqueeze(1)

### Test set, making it balanced

drd2_test = pd.read_csv("datasets/drd2_test_ECFP_counts.csv")
X_test_all = torch.tensor(drd2_test[[f"bit{i}" for i in range(d)]].values, dtype=torch.double)
y_test_all = torch.tensor(drd2_test.activity.values, dtype=torch.double)

idx_active = torch.where(y_test_all)[0].numpy()
idx_inactive = [i for i in range(len(y_test_all)) if i not in idx_active]
idx_inactive = rng.choice(range(len(y_test_all)), 600, replace=False)
idx = np.r_[idx_active,idx_inactive]
y_test_all = y_test_all[idx]
X_test_all = X_test_all[idx]

### Validation set as a fraction of the test set

frac = .2
ntest = len(X_test_all)
nval = int(ntest * frac)
idx = rng.choice(range(ntest), nval, replace=False)
X_val = X_test_all[idx]
y_val = y_test_all[idx].unsqueeze(1)

notidx = [i for i in range(ntest) if i not in idx]
X_test_all = X_test_all[notidx]
y_test_all = y_test_all[notidx].unsqueeze(1)
print(f"Global test set size: {len(X_train_all)}")
print(f"Clf train set size: {len(X_train)}")
print(f"Human train set size: {len(X_train_h)}\n")
print(f"Validation set size: {len(X_val)}\n")
print(f"Global test set size: {len(X_test_all)}")

### We split the test set to evaluate the accuracy of clf1 and clf2 using the learned clustering

c1 = np.where(algo.predict(X_test_all))[0]
c2 = [i for i in range(len(X_test_all)) if i not in c1]

X_test = X_test_all[c1, :]
X_test_h = X_test_all[c2,:]

y_test = y_test_all[c1]
y_test_h = y_test_all[c2]

print(f"Clf test set size: {len(X_test)}")
print(f"Human test set size: {len(X_test_h)}")

Global test set size: 2420
Clf train set size: 1012
Human train set size: 1408

Validation set size: 186

Global test set size: 744
Clf test set size: 300
Human test set size: 444


In [4]:
# example usage:
num_features = X_train.shape[1]  # number of input features
dropout = 0.2
num_epochs = 200
lr = 0.1

# define the loss function and optimizer for the l2d_model
criterion = nn.BCEWithLogitsLoss()  # use BCEWithLogitsLoss for binary classification

In [5]:
best_alpha, l2d_model = optimize_alpha([.1,.3,.5,.8,1.], lr, num_features, dropout, num_epochs, X_val, y_val, X_train, X_train_h, y_train, y_train_h, criterion)

Epoch [10/200], Loss: 0.7643599052138833
Epoch [20/200], Loss: 0.7441517724432181
Epoch [30/200], Loss: 0.6396574377686103
Epoch [40/200], Loss: 0.6124412517352997
Epoch [50/200], Loss: 0.6053559315815038
Epoch [60/200], Loss: 0.602081245684523
Epoch [70/200], Loss: 0.6002247143188766
Epoch [80/200], Loss: 0.5992114086860408
Epoch [90/200], Loss: 0.5983495294508554
Epoch [100/200], Loss: 0.5976883897425262
Epoch [110/200], Loss: 0.5971472037732188
Epoch [120/200], Loss: 0.5964232997609825
Epoch [130/200], Loss: 0.5958011621436834
Epoch [140/200], Loss: 0.5949454879862559
Epoch [150/200], Loss: 0.5938390543117975
Epoch [160/200], Loss: 0.5925029179041997
Epoch [170/200], Loss: 0.5909528399295685
Epoch [180/200], Loss: 0.5893787683982141
Epoch [190/200], Loss: 0.5879261790706111
Epoch [200/200], Loss: 0.5867168636245716
0.1 0.0
Epoch [10/200], Loss: 0.8694318812716406
Epoch [20/200], Loss: 0.8510250335209577
Epoch [30/200], Loss: 0.8369244236563651
Epoch [40/200], Loss: 0.828210573923751

In [15]:
def deferral_metrics(y_test, pred_clf, boolean, label):
    ndefer = boolean[(y_test == label)].sum()
    ndefersuccess = ((boolean == 1) * (y_test == label) * (pred_clf[:, 1] == y_test)).float().sum()
    ndeferuseful = ((boolean == 1) * (y_test == label) * (pred_clf[:, 1] == y_test) * (pred_clf[:,1] != pred_clf[:, 0])).float().sum()
    return int(ndefer), int(ndefersuccess), int(ndeferuseful)


metrics = {}
test_labels = [y_test, y_test_h]
test_features = [X_test, X_test_h]

print('Metrics computed using same distributions for train and test set')
for i in range(2):
    final_predictions, pred_clf, boolean, combined_outputs, decision_outputs = test_time_prediction(l2d_model, test_features[i])
    metrics[f"clf_{i+1}"] = get_metrics(test_labels[i], pred_clf[:, i])
print (json.dumps(metrics, indent=2, default=str))

# for the system, we use the global test set
print('Metrics computed using the whole test set')
final_predictions, pred_clf, boolean, combined_outputs, decision_outputs = test_time_prediction(l2d_model, X_test_all)
metrics[f"clf_{i+1}"] = get_metrics(y_test_all, pred_clf[:, i])
metrics[f"system"] = get_metrics(y_test_all, final_predictions)
print (json.dumps(metrics, indent=2, default=str))
print(f"Percentage of deferral: {boolean.mean()}")
for i in range(2):
     print(f'--- For label {i} ---')
     ndefer, ndefersuccess, ndeferuseful = deferral_metrics(y_test_all.squeeze(), pred_clf, boolean, i)
     print(f"Deferral: {ndefer} / {(y_test_all == i).sum()}")
     print(f"Successful deferrals: {ndefersuccess} / {ndefer}")
     print(f"Useful deferrals: {ndeferuseful} / {ndefersuccess}\n")
# print(f"of which {} were successful defers")

# print(f"Deferral for negative samples: {boolean[(y_test_all == 0).squeeze()].mean()}")

Metrics computed using same distributions for train and test set
{
  "clf_1": {
    "Accuracy": 0.94,
    "Precision": 0.9485039370078739,
    "Recall": 0.94,
    "F1-Score": 0.9408046191097039
  },
  "clf_2": {
    "Accuracy": 0.9617117117117117,
    "Precision": 0.9635093952139869,
    "Recall": 0.9617117117117117,
    "F1-Score": 0.9617852473834543
  }
}
Metrics computed using the whole test set
{
  "clf_1": {
    "Accuracy": 0.94,
    "Precision": 0.9485039370078739,
    "Recall": 0.94,
    "F1-Score": 0.9408046191097039
  },
  "clf_2": {
    "Accuracy": 0.8696236559139785,
    "Precision": 0.8983570193247613,
    "Recall": 0.8696236559139785,
    "F1-Score": 0.8703038803179055
  },
  "system": {
    "Accuracy": 0.9543010752688172,
    "Precision": 0.9574746650343468,
    "Recall": 0.9543010752688172,
    "F1-Score": 0.9545267489711933
  }
}
Percentage of deferral: 0.24596774578094482
--- For label 0 ---
Deferral: 181 / 434
Successful deferrals: 179 / 181
Useful deferrals: 0 / 179


In [16]:
## Individual metrics

for i in range(len(X_test_all)):
    print(
        f"y = {y_test_all[i].item()},", 
        f"classifier pred = {combined_outputs[i,0]:.3f},", 
        f"hum model pred = {combined_outputs[i,1]:.3f}",
        "deferred" if boolean[i] else "NOT deferred"
    )

y = 1.0, classifier pred = 0.997, hum model pred = 0.999 NOT deferred
y = 1.0, classifier pred = 0.989, hum model pred = 0.993 NOT deferred
y = 1.0, classifier pred = 0.938, hum model pred = 0.856 NOT deferred
y = 1.0, classifier pred = 0.889, hum model pred = 0.985 NOT deferred
y = 1.0, classifier pred = 0.933, hum model pred = 0.963 NOT deferred
y = 1.0, classifier pred = 0.975, hum model pred = 0.993 NOT deferred
y = 1.0, classifier pred = 0.973, hum model pred = 0.994 NOT deferred
y = 1.0, classifier pred = 0.989, hum model pred = 0.995 NOT deferred
y = 1.0, classifier pred = 0.968, hum model pred = 0.985 NOT deferred
y = 1.0, classifier pred = 0.983, hum model pred = 0.996 NOT deferred
y = 1.0, classifier pred = 0.850, hum model pred = 0.991 NOT deferred
y = 1.0, classifier pred = 0.981, hum model pred = 0.988 NOT deferred
y = 1.0, classifier pred = 0.755, hum model pred = 0.534 NOT deferred
y = 1.0, classifier pred = 0.915, hum model pred = 0.983 NOT deferred
y = 1.0, classifier 

In [None]:
torch.save(l2d_model.state_dict(), "models/l2d_model_demo2.pt")

In [None]:
drd2_train_undersampled.iloc[c1].rename(columns = {"activity": "activity_y"}).to_csv("datasets/drd2_train_undersampled_y_ECFP_counts.csv")
drd2_train_undersampled.iloc[c2].rename(columns = {"activity": "activity_h"}).to_csv("datasets/drd2_train_undersampled_h_ECFP_counts.csv")