In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project's own modules
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
import json
import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import torch.nn as nn
from sklearn.cluster import KMeans

from helpers.utils import get_metrics, set_matplotlib_params
from networks.nonlinearnet_aihuman import optimize_alpha, test_time_prediction

# --- Global configuration ---------------------------------------------------
# Apply the project's matplotlib settings (helper imported from helpers.utils).
set_matplotlib_params()
# NOTE: this silences *every* warning for the rest of the session, including
# deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

# One seed drives torch's RNG, the NumPy generator `rng`, and the KMeans
# clustering further down (passed there as random_state).
seed = 12
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.manual_seed(seed)
rng = np.random.default_rng(seed)

# Use float64 as the default floating-point dtype for newly created tensors.
# The former extra call to torch.set_default_tensor_type(torch.DoubleTensor)
# was removed: it is deprecated since PyTorch 2.1 and set_default_dtype
# already selects the same default dtype.
torch.set_default_dtype(torch.double)



In [3]:
### From the undersampled training set, create two training sets: one for clf, one for human, based on K-means (K=2)

drd2_train_undersampled = pd.read_csv("datasets/drd2_train_undersampled_ECFP_counts.csv")
d = 2048  # number of feature columns, named bit0 ... bit{d-1}
bit_columns = [f"bit{i}" for i in range(d)]

X_train_all = torch.tensor(drd2_train_undersampled[bit_columns].values, dtype=torch.double)
y_train_all = torch.tensor(drd2_train_undersampled.activity.values, dtype=torch.double)

algo = KMeans(n_clusters=2, random_state=seed, n_init=20, max_iter=5000, init="k-means++")
algo.fit(X_train_all)
# With K=2 the labels are 0/1: rows labelled 1 go to the classifier's training
# set, rows labelled 0 to the human model's. Both index arrays are ascending.
# The complement is taken with a vectorised comparison instead of a Python
# `i not in <ndarray>` scan per row, which was quadratic in the dataset size.
c1_train = np.where(algo.labels_)[0]
c2_train = np.where(algo.labels_ == 0)[0]

X_train = X_train_all[c1_train, :]
X_train_h = X_train_all[c2_train,:]

# Targets are reshaped to column vectors of shape (n, 1).
y_train = y_train_all[c1_train].unsqueeze(1)
y_train_h = y_train_all[c2_train].unsqueeze(1)

### Test set

drd2_test = pd.read_csv("datasets/drd2_test_ECFP_counts.csv")
X_test_all = torch.tensor(drd2_test[bit_columns].values, dtype=torch.double)
y_test_all = torch.tensor(drd2_test.activity.values, dtype=torch.double)

### Validation set as a fraction of the test set

frac = .25
ntest = len(X_test_all)
nval = int(ntest * frac)
# Draw nval distinct row positions; these rows become the validation set.
idx = rng.choice(range(ntest), nval, replace=False)
X_val = X_test_all[idx]
y_val = y_test_all[idx].unsqueeze(1)

# Remaining rows (sorted ascending, same order as before) form the global
# test set. np.setdiff1d replaces the quadratic `i not in idx` list scan.
notidx = np.setdiff1d(np.arange(ntest), idx)
X_test_all = X_test_all[notidx]
y_test_all = y_test_all[notidx].unsqueeze(1)

print(f"Clf train set size: {len(X_train)}")
print(f"Human train set size: {len(X_train_h)}")
print(f"Validation set size: {len(X_val)}")
print(f"Global test set size: {len(X_test_all)}")

### We split the test set to evaluate the accuracy of clf1 and clf2 using the learned clustering

# Assign every remaining test row to one of the two learned clusters once,
# then derive both index sets from that single prediction.
test_clusters = algo.predict(X_test_all)
c1 = np.where(test_clusters)[0]
c2 = np.where(test_clusters == 0)[0]

X_test = X_test_all[c1, :]
X_test_h = X_test_all[c2,:]

# y_test_all is already a column vector here, so no unsqueeze is needed.
y_test = y_test_all[c1]
y_test_h = y_test_all[c2]

print(f"Clf test set size: {len(X_test)}")
print(f"Human test set size: {len(X_test_h)}")

NameError: name 'c1' is not defined

In [None]:
# Hyperparameters handed to optimize_alpha in the next cell.
lr = 0.1
num_epochs = 200
dropout = 0.2
# Input width of the model = number of feature columns in the training matrix.
num_features = X_train.shape[1]

# Loss for binary classification; BCEWithLogitsLoss is applied to raw logits.
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Try each candidate alpha in the list and keep the pair returned by
# optimize_alpha: the selected alpha and the corresponding model.
# NOTE(review): presumably selection is done on (X_val, y_val) — confirm in
# networks.nonlinearnet_aihuman.
best_alpha, l2d_model = optimize_alpha(
    [.1, .3, .5, .8, 1.],
    lr,
    num_features,
    dropout,
    num_epochs,
    X_val,
    y_val,
    X_train,
    X_train_h,
    y_train,
    y_train_h,
    criterion,
)

In [None]:
# Collect evaluation metrics in one dict, keyed by what was evaluated.
metrics = {}
test_labels = [y_test, y_test_h]
test_features = [X_test, X_test_h]

# Per-cluster evaluation: column i of pred_clf is scored on the test rows
# that fall into the cluster matching its training partition.
print('Metrics computed using same distributions for train and test set')
for i in range(2):
    final_predictions, pred_clf, boolean, combined_outputs, decision_outputs = test_time_prediction(l2d_model, test_features[i])
    metrics[f"clf_{i+1}"] = get_metrics(test_labels[i], pred_clf[:, i])
print (json.dumps(metrics, indent=2, default=str))

# for the system, we use the global test set
print('Metrics computed using the whole test set')
final_predictions, pred_clf, boolean, combined_outputs, decision_outputs = test_time_prediction(l2d_model, X_test_all)
# Whole-test-set scores go under their own "*_global" keys. Previously this
# step reused the leaked loop variable `i` (== 1) and silently overwrote the
# per-cluster "clf_2" entry computed above.
for k in range(2):
    metrics[f"clf_{k+1}_global"] = get_metrics(y_test_all, pred_clf[:, k])
metrics["system"] = get_metrics(y_test_all, final_predictions)
print (json.dumps(metrics, indent=2, default=str))
# `boolean` flags, per sample, whether the decision was deferred (it is
# printed as "deferred"/"NOT deferred" in the per-sample listing below).
print(f"Percentage of deferral: {boolean.mean()}")
print(f"Deferral for positive samples: {boolean[(y_test_all == 1).squeeze()].mean()}")
print(f"Deferral for negative samples: {boolean[(y_test_all == 0).squeeze()].mean()}")

In [None]:
## Individual metrics

# One output line per sample of the global test set: true label, the two
# per-model outputs from combined_outputs, and the deferral decision.
n_samples = len(X_test_all)
for sample_idx in range(n_samples):
    if boolean[sample_idx]:
        status = "deferred"
    else:
        status = "NOT deferred"
    print(
        f"y = {y_test_all[sample_idx].item()},",
        f"classifier pred = {combined_outputs[sample_idx,0]:.3f},",
        f"hum model pred = {combined_outputs[sample_idx,1]:.3f}",
        status,
    )

In [None]:
# Persist the model's state_dict (not the whole model object) to disk.
torch.save(
    l2d_model.state_dict(),
    "models/l2d_model_demo2.pt",
)

In [None]:
# Export the two K-means partitions of the *training* frame, renaming the
# label column per partition. The rows must be selected with the training
# cluster indices (c1_train / c2_train); c1 / c2 are positions in the test
# set and would pick unrelated rows here.
drd2_train_undersampled.iloc[c1_train].rename(columns = {"activity": "activity_y"}).to_csv("datasets/drd2_train_undersampled_y_ECFP_counts.csv")
drd2_train_undersampled.iloc[c2_train].rename(columns = {"activity": "activity_h"}).to_csv("datasets/drd2_train_undersampled_h_ECFP_counts.csv")