In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import warnings
import json
import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import torch.nn as nn
import torch.optim as optim
from sklearn.cluster import KMeans

from helpers.utils import get_metrics, set_matplotlib_params
from networks.nonlinearnet_aihuman import NonLinearNetDefer, optimization_loop

set_matplotlib_params()
warnings.filterwarnings('ignore')
seed = 10
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.manual_seed(seed)
rng = np.random.default_rng(seed) 
torch.set_default_dtype(torch.double)
torch.set_default_tensor_type(torch.DoubleTensor)

In [4]:
drd2_train = pd.read_csv("datasets/drd2_train_ECFP_counts.csv")
drd2_train_undersampled = pd.read_csv("datasets/drd2_train_undersampled_ECFP_counts.csv")
drd2_test = pd.read_csv("datasets/drd2_test_ECFP_counts.csv")
CLUSTERING = True

# keeping some of the samples to create the AL pool. npool is the number of samples we REMOVE from the training set
# setting it to 0, as you get additional human samples from Reinvent.
npool = 1000
idxpool = rng.choice(range(len(drd2_train_undersampled)), npool, replace=False)
pool = drd2_train_undersampled.iloc[idxpool]
drd2_train_undersampled.drop(idxpool, axis=0, inplace=True)
d = 2048

npts_more_human = 0
keep_pool = np.random.choice(pool.index, npts_more_human, replace=False)
drd2_train_undersampled_h = pd.concat((drd2_train_undersampled, pool.loc[keep_pool]))

print(f"Train size: {drd2_train.shape}")
print(f"Train undersampled size: {drd2_train_undersampled.shape}")
print(f"Train undersampled human size: {drd2_train_undersampled.shape}")

Train size: (21302, 2052)
Train undersampled size: (1420, 2053)
Train undersampled human size: (1420, 2053)


In [5]:
drd2_train_undersampled["activity_y"] = drd2_train_undersampled.activity.values.tolist()
drd2_train_undersampled_h["activity_h"] = drd2_train_undersampled_h.activity.values.tolist()

drd2_test["activity_y"] = drd2_test.activity.values.tolist()
drd2_test["activity_h"] = drd2_test.activity.values.tolist()

In [6]:
train_features = drd2_train_undersampled[[f"bit{i}" for i in range(d)]].values
train_features_h = drd2_train_undersampled_h[[f"bit{i}" for i in range(d)]].values
train_labels = drd2_train_undersampled[["activity_y"]].values
train_labels_h = drd2_train_undersampled_h[["activity_h"]].values

test_features = drd2_test[[f"bit{i}" for i in range(d)]].values
test_labels = drd2_test[["activity_y", "activity_h"]].values

X_train = torch.tensor(train_features, dtype=torch.double)
X_train_h = torch.tensor(train_features_h, dtype=torch.double)
y_train = torch.tensor(train_labels, dtype=torch.double)
h_train = torch.tensor(train_labels_h, dtype=torch.double)

X_test = torch.tensor(test_features, dtype=torch.double)
y_test = torch.tensor(test_labels, dtype=torch.double)

idx_active = torch.where(y_test[:, 0])[0].numpy()
idx_inactive = [i for i in range(len(y_test)) if i not in idx_active]
idx_inactive = np.random.choice(range(len(y_test)), 600, replace=False)
idx = np.r_[idx_active,idx_inactive]
y_test = y_test[idx]
X_test = X_test[idx]
print(f"Test size: {X_test.shape}")

Test size: torch.Size([930, 2048])


In [7]:
# example usage:
num_features = train_features.shape[1]  # number of input features
dropout = 0.2
num_epochs = 200
lr = 0.1

# create an instance of the NonLinearNetDefer
l2d_model = NonLinearNetDefer(num_features, dropout)

# define the loss function and optimizer for the l2d_model
criterion = nn.BCEWithLogitsLoss()  # use BCEWithLogitsLoss for binary classification
optimizer = optim.SGD(l2d_model.parameters(), lr=lr)

In [8]:
X = X_train
X_h = X_train_h

# binary labels for classifier 1 and classifier 2 (or human model)
y_train = y_train[:,0].unsqueeze(1)
h_train = h_train[:,0].unsqueeze(1)

In [9]:
if CLUSTERING:
    algo = KMeans(n_clusters=2, random_state=seed, n_init=20, max_iter=5000, init="k-means++")
    algo.fit(X)
    c1 = np.where(algo.labels_)[0]
    c2 = [i for i in range(len(X_train)) if i not in c1]

    X_train = X_train[c1, :]
    X_train_h = X_train_h[c2,:]

    y_train = y_train[c1,0].unsqueeze(1)
    h_train = h_train[c2,0].unsqueeze(1)

In [10]:
# # add random noise to h
# oldh = torch.clone(h)
# p =torch.bernoulli(0.75 *torch.ones(len(h))).unsqueeze(1)
# h = p*h + (1-p)*(1-h)

In [11]:
optimization_loop(num_epochs, optimizer, l2d_model, X_train, X_train_h, y_train, h_train, criterion)

Epoch [10/200], Loss: 2.5956365553312883
Epoch [20/200], Loss: 2.5187038253311966
Epoch [30/200], Loss: 2.455695939768825
Epoch [40/200], Loss: 2.404498006713122
Epoch [50/200], Loss: 2.3712669032138787
Epoch [60/200], Loss: 2.351223642155281
Epoch [70/200], Loss: 2.3365356985125785
Epoch [80/200], Loss: 2.3223381665877847
Epoch [90/200], Loss: 2.317269460886917
Epoch [100/200], Loss: 2.307540491607222
Epoch [110/200], Loss: 2.3063412124676033
Epoch [120/200], Loss: 2.3003223968840727
Epoch [130/200], Loss: 2.296475956895193
Epoch [140/200], Loss: 2.2914843964024927
Epoch [150/200], Loss: 2.29005039541522
Epoch [160/200], Loss: 2.2890985082053894
Epoch [170/200], Loss: 2.2851762731533647
Epoch [180/200], Loss: 2.282904700500135
Epoch [190/200], Loss: 2.2810526653570777
Epoch [200/200], Loss: 2.276876294013864


In [13]:
y_test = y_test[:,0].unsqueeze(1)
h_test = y_test[:,0].unsqueeze(1) # y = h
metrics = {}
labels = [y_test, h_test]

with torch.no_grad():
    l2d_model.eval()
    combined_outputs, decision_outputs = l2d_model(X_test)
    pred_clf = (combined_outputs > 0.5).float()
    for i in range(2):
        metrics[f"clf_{i+1}"] = get_metrics(labels[i], pred_clf[:, i])
    boolean = (
            decision_outputs[:, 0] > combined_outputs[:, 0]
        ) * (
            combined_outputs[:, 1] > combined_outputs[:, 0]
        ) + (
            decision_outputs[:,0] > combined_outputs[:, 0]
        ) * (
            combined_outputs[:, 1] < combined_outputs[:, 0]
        ) * 1.
    boolean = torch.tensor(boolean, dtype=torch.float32)

    final_predictions = (boolean * pred_clf[:, 1]) + (1 - boolean) * pred_clf[:, 0]
    metrics[f"system"] = get_metrics(labels[0], final_predictions)
print (json.dumps(metrics, indent=2, default=str))
print(f"Percentage of deferral: {boolean.mean()}")
print(f"Deferral for positive samples: {boolean[(labels[0] == 1).squeeze()].mean()}")
print(f"Deferral for negative samples: {boolean[(labels[0] == 0).squeeze()].mean()}")

{
  "clf_1": {
    "Accuracy": 0.9344086021505377,
    "Precision": 0.9432180598525441,
    "Recall": 0.9344086021505377,
    "F1-Score": 0.9350351898406438
  },
  "clf_2": {
    "Accuracy": 0.6053763440860215,
    "Precision": 0.3664805179789571,
    "Recall": 0.6053763440860215,
    "F1-Score": 0.4565664858947489
  },
  "system": {
    "Accuracy": 0.946236559139785,
    "Precision": 0.9522303935441108,
    "Recall": 0.946236559139785,
    "F1-Score": 0.946689628292985
  }
}
Percentage of deferral: 0.5537634491920471
Deferral for positive samples: 0.00272479560226202
Deferral for negative samples: 0.912966251373291


In [None]:
for i in range(len(X_test)):
    if (y_test[i]==1) and (decision_outputs[i] > combined_outputs[i, 0]) and (combined_outputs[i, 1] > combined_outputs[i, 0]) or (y_test[i]==0) and (decision_outputs[i] > combined_outputs[i, 0]) and (combined_outputs[i, 1] < combined_outputs[i, 0]):
        print(
            f"y = {y_test[i].item()},", 
            f"h = {h_test[i].item()},", 
            f"classifier pred = {combined_outputs[i,0]:.3f},", 
            f"hum model pred = {combined_outputs[i,1]:.3f}",
            "--> deferred"
        )
    else:
        print(
            f"y = {y_test[i].item()},", 
            f"h = {h_test[i].item()},", 
            f"classifier pred = {combined_outputs[i,0]:.3f},", 
            f"hum model pred = {combined_outputs[i,1]:.3f}",
            "--> NOT deferred"
        )

In [None]:
torch.save(l2d_model.state_dict(), "models/l2d_model_demo2.pt")

In [None]:
drd2_train_undersampled.iloc[c1].rename(columns = {"activity": "activity_y"}).to_csv("datasets/drd2_train_undersampled_y_ECFP_counts.csv")
drd2_train_undersampled.iloc[c2].rename(columns = {"activity": "activity_h"}).to_csv("datasets/drd2_train_undersampled_h_ECFP_counts.csv")