In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
import json
import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import torch.nn as nn
import torch.optim as optim
from sklearn.cluster import BisectingKMeans

from helpers.utils import get_metrics, set_matplotlib_params
from networks.nonlinearnet_aihuman import NonLinearNetDefer, optimization_loop

set_matplotlib_params()
warnings.filterwarnings('ignore')
seed = 12
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.manual_seed(seed)
rng = np.random.default_rng(seed) 
torch.set_default_dtype(torch.double)
torch.set_default_tensor_type(torch.DoubleTensor)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
drd2_train = pd.read_csv("datasets/drd2_train_ECFP_counts.csv")
drd2_train_undersampled = pd.read_csv("datasets/drd2_train_undersampled_ECFP_counts.csv")
drd2_test = pd.read_csv("datasets/drd2_test_ECFP_counts.csv")
CLUSTERING = True

# keeping some of the samples to create the AL pool. npool is the number of samples we REMOVE from the training set
# setting it to 0, as you get additional human samples from Reinvent.
npool = 0
idxpool = rng.choice(range(len(drd2_train_undersampled)), npool, replace=False)
pool = drd2_train_undersampled.iloc[idxpool]
drd2_train_undersampled.drop(idxpool, axis=0, inplace=True)
d = 2048

npts_more_human = 0
keep_pool = np.random.choice(pool.index, npts_more_human, replace=False)
drd2_train_undersampled_h = pd.concat((drd2_train_undersampled, pool.loc[keep_pool]))

print(f"Train size: {drd2_train.shape}")
print(f"Train undersampled size: {drd2_train_undersampled.shape}")
print(f"Train undersampled human size: {drd2_train_undersampled.shape}")

Train size: (21302, 2052)
Train undersampled size: (2420, 2053)
Train undersampled human size: (2420, 2053)


In [4]:
drd2_train_undersampled["activity_y"] = drd2_train_undersampled.activity.values.tolist()
drd2_train_undersampled_h["activity_h"] = drd2_train_undersampled_h.activity.values.tolist()

drd2_test["activity_y"] = drd2_test.activity.values.tolist()
drd2_test["activity_h"] = drd2_test.activity.values.tolist()

In [5]:
train_features = drd2_train_undersampled[[f"bit{i}" for i in range(d)]].values
train_features_h = drd2_train_undersampled_h[[f"bit{i}" for i in range(d)]].values
train_labels = drd2_train_undersampled[["activity_y"]].values
train_labels_h = drd2_train_undersampled_h[["activity_h"]].values

test_features = drd2_test[[f"bit{i}" for i in range(d)]].values
test_labels = drd2_test[["activity_y", "activity_h"]].values

X_train = torch.tensor(train_features, dtype=torch.double)
X_train_h = torch.tensor(train_features_h, dtype=torch.double)
y_train = torch.tensor(train_labels, dtype=torch.double)
h_train = torch.tensor(train_labels_h, dtype=torch.double)

X_test = torch.tensor(test_features, dtype=torch.double)
y_test = torch.tensor(test_labels, dtype=torch.double)

idx_active = torch.where(y_test[:, 0])[0].numpy()
idx_inactive = [i for i in range(len(y_test)) if i not in idx_active]
idx_inactive = np.random.choice(range(len(y_test)), 600, replace=False)
idx = np.r_[idx_active,idx_inactive]
y_test = y_test[idx]
X_test = X_test[idx]
print(f"Test size: {X_test.shape}")

Test size: torch.Size([930, 2048])


In [6]:
# example usage:
num_features = train_features.shape[1]  # number of input features
dropout = 0.2
num_epochs = 200
lr = 0.1

# create an instance of the NonLinearNetDefer
l2d_model = NonLinearNetDefer(num_features, dropout)

# define the loss function and optimizer for the l2d_model
criterion = nn.BCEWithLogitsLoss()  # use BCEWithLogitsLoss for binary classification
optimizer = optim.SGD(l2d_model.parameters(), lr=lr)

In [7]:
X = X_train
X_h = X_train_h

# binary labels for classifier 1 and classifier 2 (or human model)
y_train = y_train[:,0].unsqueeze(1)
h_train = h_train[:,0].unsqueeze(1)

In [8]:
if CLUSTERING:
    algo = BisectingKMeans(n_clusters=2, random_state=seed, n_init=20, max_iter=5000, init="k-means++")
    algo.fit(X)
    c1 = np.where(algo.labels_)[0]
    c2 = [i for i in range(len(X_train)) if i not in c1]

    X_train = X_train[c1, :]
    X_train_h = X_train_h[c2,:]

    y_train = y_train[c1,0].unsqueeze(1)
    h_train = h_train[c2,0].unsqueeze(1)

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f3dcc7b03a0>
Traceback (most recent call last):
  File "/home/klgx638/miniconda3/envs/deferral/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/klgx638/miniconda3/envs/deferral/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/klgx638/miniconda3/envs/deferral/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/klgx638/miniconda3/envs/deferral/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f3dcc7b03a0>
Traceback (most recent call last):
  File "/home/klgx638/miniconda3/envs/deferral/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/klgx638/miniconda3/envs/deferral/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/klgx638/miniconda3/envs/deferral/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/klgx638/miniconda3/envs/deferral/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f3dcc7b03a0>
Traceback (most recent call last):
  File "/home/klgx638/miniconda3/envs/deferral/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/klgx638/miniconda3/envs/deferral/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/klgx638/miniconda3/envs/deferral/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/klgx638/miniconda3/envs/deferral/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function

In [10]:
# # add random noise to h
# oldh = torch.clone(h)
# p =torch.bernoulli(0.75 *torch.ones(len(h))).unsqueeze(1)
# h = p*h + (1-p)*(1-h)

In [9]:
optimization_loop(num_epochs, optimizer, l2d_model, X_train, X_train_h, y_train, h_train, criterion)

Epoch [10/200], Loss: 2.5256675985156045
Epoch [20/200], Loss: 2.4397184287079146
Epoch [30/200], Loss: 2.3826507492201285
Epoch [40/200], Loss: 2.347770090995159
Epoch [50/200], Loss: 2.3236257983767246
Epoch [60/200], Loss: 2.3077316098528167
Epoch [70/200], Loss: 2.295738894564296
Epoch [80/200], Loss: 2.2854990321041067
Epoch [90/200], Loss: 2.2786942040212432
Epoch [100/200], Loss: 2.2724456162019284
Epoch [110/200], Loss: 2.2691005712077854
Epoch [120/200], Loss: 2.2645606375157636
Epoch [130/200], Loss: 2.2618875307331265
Epoch [140/200], Loss: 2.2590909780306614
Epoch [150/200], Loss: 2.256724721045802
Epoch [160/200], Loss: 2.2544739582339273
Epoch [170/200], Loss: 2.2528981287220233
Epoch [180/200], Loss: 2.2508799137470237
Epoch [190/200], Loss: 2.249451420673546
Epoch [200/200], Loss: 2.248472963832273


In [10]:
y_test = y_test[:,0].unsqueeze(1)
h_test = y_test[:,0].unsqueeze(1) # y = h
metrics = {}
labels = [y_test, h_test]

with torch.no_grad():
    l2d_model.eval()
    combined_outputs, decision_outputs = l2d_model(X_test)
    pred_clf = (combined_outputs > 0.5).float()
    for i in range(2):
        metrics[f"clf_{i+1}"] = get_metrics(labels[i], pred_clf[:, i])
    boolean = (labels[0]==1).reshape(-1) * (
        decision_outputs[:,0] > combined_outputs[:, 0]
    ) * (
        combined_outputs[:, 1] > combined_outputs[:, 0]
    ) + (labels[0]==0).reshape(-1) * (
        decision_outputs[:,0] > combined_outputs[:, 0]
    ) * (
        combined_outputs[:, 1] < combined_outputs[:, 0]
    ) * 1.
    boolean = torch.tensor(boolean, dtype=torch.float32)

    final_predictions = (boolean * pred_clf[:, 1]) + (1 - boolean) * pred_clf[:, 0]
    metrics[f"system"] = get_metrics(labels[0], final_predictions)
print (json.dumps(metrics, indent=2, default=str))
print(f"Percentage of deferral: {boolean.mean()}")

{
  "clf_1": {
    "Accuracy": 0.6053763440860215,
    "Precision": 0.3664805179789571,
    "Recall": 0.6053763440860215,
    "F1-Score": 0.4565664858947489
  },
  "clf_2": {
    "Accuracy": 0.8623655913978494,
    "Precision": 0.8969570842262878,
    "Recall": 0.8623655913978494,
    "F1-Score": 0.8637771442860716
  },
  "system": {
    "Accuracy": 0.9989247311827957,
    "Precision": 0.9989266376877908,
    "Recall": 0.9989247311827957,
    "F1-Score": 0.9989244747607345
  }
}
Percentage of deferral: 0.39462366700172424


In [None]:
for i in range(len(X_test)):
    if (y_test[i]==1) and (decision_outputs[i] > combined_outputs[i, 0]) and (combined_outputs[i, 1] > combined_outputs[i, 0]) or (y_test[i]==0) and (decision_outputs[i] > combined_outputs[i, 0]) and (combined_outputs[i, 1] < combined_outputs[i, 0]):
        print(
            f"y = {y_test[i].item()},", 
            f"h = {h_test[i].item()},", 
            f"classifier pred = {combined_outputs[i,0]:.3f},", 
            f"hum model pred = {combined_outputs[i,1]:.3f}",
            "--> deferred"
        )
    else:
        print(
            f"y = {y_test[i].item()},", 
            f"h = {h_test[i].item()},", 
            f"classifier pred = {combined_outputs[i,0]:.3f},", 
            f"hum model pred = {combined_outputs[i,1]:.3f}",
            "--> NOT deferred"
        )

In [None]:
torch.save(l2d_model.state_dict(), "models/l2d_model_demo2.pt")

In [None]:
drd2_train_undersampled.iloc[c1].rename(columns = {"activity": "activity_y"}).to_csv("datasets/drd2_train_undersampled_y_ECFP_counts.csv")
drd2_train_undersampled.iloc[c2].rename(columns = {"activity": "activity_h"}).to_csv("datasets/drd2_train_undersampled_h_ECFP_counts.csv")