### Import libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

### Load data

In [4]:
# Load the featurized HIV IC50 table (path is relative to the notebook's working directory).
csv_path = "../data/processed/hiv_ic50_featurized.csv"
df = pd.read_csv(csv_path)

In [5]:
# Binary activity label: 1 when standard_value is at most 1000, otherwise 0.
# NOTE(review): presumably standard_value is an IC50 in nM (1000 nM = 1 uM) — confirm units.
df["active"] = df["standard_value"].le(1000).astype(int)

### Preparation of data

In [6]:
# Descriptor columns fed to the model; their order here is the feature order of X.
feature_cols = [
    "MolWt",
    "TPSA",
    "NumRotatableBonds",
    "NumHDonors",
    "NumHAcceptors",
    "NumAromaticRings",
    "LogP",
]
# Feature matrix and float32 label vector as plain NumPy arrays.
X = df[feature_cols].to_numpy()
y = df["active"].to_numpy().astype(np.float32)

In [7]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# balance the same in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [8]:
# Estimate the scaling statistics on the training split only, then apply the
# same fitted transformation to both splits so no test information leaks in.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [9]:
# Convert the NumPy splits to float32 tensors. Labels get a trailing axis
# (shape (N, 1)) so they line up with the model's single-logit output.
X_train_tensor, X_test_tensor = (
    torch.tensor(split, dtype=torch.float32) for split in (X_train, X_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(split, dtype=torch.float32)[:, None] for split in (y_train, y_test)
)

In [10]:
# Pair features with labels and serve them in mini-batches.
# Only the training loader shuffles; the test loader keeps a fixed order.
BATCH_SIZE = 32
train_dataset, test_dataset = (
    TensorDataset(X_train_tensor, y_train_tensor),
    TensorDataset(X_test_tensor, y_test_tensor),
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

### Model creation

In [11]:
class Net(nn.Module):
    """Small fully connected binary classifier producing one raw logit per sample.

    Architecture: Linear(in_features, hidden1) -> ReLU -> Linear(hidden1, hidden2)
    -> ReLU -> Linear(hidden2, 1). ``forward`` applies no sigmoid, so the output
    must be paired with a logit-based loss or passed through ``torch.sigmoid``
    to obtain probabilities.

    Args:
        in_features: number of input features per sample (default 7).
        hidden1: width of the first hidden layer (default 32).
        hidden2: width of the second hidden layer (default 16).
    """

    def __init__(self, in_features=7, hidden1=32, hidden2=16):
        super().__init__()
        # Attribute names fc1/fc2/fc3 are kept stable: they are the state_dict keys.
        self.fc1 = nn.Linear(in_features, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 1)

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` logits."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [12]:
# Seed PyTorch's global RNG before building the model so that weight
# initialization (and the shuffling order of the training DataLoader, which
# draws its seed from the same global RNG) is reproducible on Restart & Run All.
torch.manual_seed(42)
model = Net()
print(model)

Net(
  (fc1): Linear(in_features=7, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=1, bias=True)
)


### Model training

In [13]:
# Binary cross-entropy computed directly on raw logits, matching the model's
# un-activated output; parameters are optimized with Adam.
LEARNING_RATE = 0.01
Loss = nn.BCEWithLogitsLoss()
optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [14]:
N_EPOCHS = 50

# Switch to training mode explicitly: the evaluation cells call model.eval(),
# so re-running this cell after them would otherwise keep training in eval mode.
model.train()
for epoch in range(N_EPOCHS):
    running_loss = 0.0
    for features, label in train_loader:
        optim.zero_grad()                  # clear gradients left from the previous step
        logits = model(features)
        batch_loss = Loss(logits, label)
        batch_loss.backward()
        optim.step()
        running_loss += batch_loss.item()
    # Reported value is the mean of the per-batch losses for this epoch.
    print(f"Epoch: {epoch+1}, loss= {running_loss/len(train_loader)}")

Epoch: 1, loss= 0.6617494556139101
Epoch: 2, loss= 0.6341731093964487
Epoch: 3, loss= 0.6172705720055778
Epoch: 4, loss= 0.6083100947569001
Epoch: 5, loss= 0.6027014841448586
Epoch: 6, loss= 0.5954178247811659
Epoch: 7, loss= 0.5923750219480046
Epoch: 8, loss= 0.587052671189578
Epoch: 9, loss= 0.584821982878559
Epoch: 10, loss= 0.5796572366975389
Epoch: 11, loss= 0.5744554560139494
Epoch: 12, loss= 0.5695765769706582
Epoch: 13, loss= 0.5648425472232531
Epoch: 14, loss= 0.561719904868108
Epoch: 15, loss= 0.560495189675745
Epoch: 16, loss= 0.5563845715432797
Epoch: 17, loss= 0.5555643990354718
Epoch: 18, loss= 0.5519083750697802
Epoch: 19, loss= 0.5483724992230253
Epoch: 20, loss= 0.5428025704509807
Epoch: 21, loss= 0.5387425988350274
Epoch: 22, loss= 0.5392518049141146
Epoch: 23, loss= 0.5344523821236953
Epoch: 24, loss= 0.5393730684271398
Epoch: 25, loss= 0.5303383236786104
Epoch: 26, loss= 0.5288423777751203
Epoch: 27, loss= 0.5265977959587889
Epoch: 28, loss= 0.5226476987577834
Epoch

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import torch

def evaluate_model(model, data_loader, threshold=0.5):
    """Score a binary classifier on ``data_loader`` and print the metrics.

    The model is switched to eval mode and run without gradient tracking. Its
    raw outputs are passed through a sigmoid, and probabilities greater than or
    equal to ``threshold`` are counted as the positive class.

    Args:
        model: module whose output for a batch is treated as logits.
        data_loader: iterable yielding ``(features, labels)`` batches.
        threshold: probability cut-off for predicting class 1.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc)``.
    """
    model.eval()
    label_rows, prob_rows = [], []

    with torch.no_grad():
        for batch_features, batch_labels in data_loader:
            batch_probs = torch.sigmoid(model(batch_features))  # logits -> probabilities
            label_rows += list(batch_labels.cpu().numpy())
            prob_rows += list(batch_probs.cpu().numpy())

    # Flatten the per-sample rows into 1-D vectors.
    y_true = np.array(label_rows).reshape(-1)
    y_prob = np.array(prob_rows).reshape(-1)
    y_pred = (y_prob >= threshold).astype(int)

    # ROC AUC uses the probabilities; the other metrics use the thresholded predictions.
    metrics = (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        roc_auc_score(y_true, y_prob),
    )
    acc, prec, rec, f1, auc = metrics

    print(f"Threshold: {threshold}")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print(f"ROC AUC:   {auc:.4f}")

    return metrics


evaluate_model(model, test_loader, threshold=0.5)


Threshold: 0.5
Accuracy:  0.7540
Precision: 0.7427
Recall:    0.7871
F1-score:  0.7643
ROC AUC:   0.8352


(0.7540208136234626,
 0.7427312775330397,
 0.7871148459383753,
 0.7642792384406165,
 0.8351680717029542)

In [19]:
import numpy as np
from sklearn.metrics import f1_score

def best_f1_threshold(probs, labels):
    """Return ``(threshold, f1)`` for the cut-off with the highest F1 score.

    Nineteen evenly spaced thresholds from 0.05 to 0.95 are tried; a sample is
    predicted positive when its probability is >= the threshold. On ties the
    lowest threshold wins.

    Args:
        probs: array of predicted probabilities.
        labels: array of true binary labels, aligned with ``probs``.
    """
    candidates = np.linspace(0.05, 0.95, 19)
    f1_values = [f1_score(labels, (probs >= cut).astype(int)) for cut in candidates]
    best_idx = int(np.argmax(f1_values))  # first maximum, i.e. lowest threshold on ties
    return candidates[best_idx], f1_values[best_idx]

# Recompute the test-set probabilities and labels (evaluate_model does not
# return them), then scan for the threshold that maximizes F1.
# NOTE(review): the threshold is selected on the test set, so metrics reported
# at best_t are optimistically biased; tune on a validation split for an
# unbiased estimate.
model.eval()  # no `_ =` here: rebinding `_` stops IPython from tracking the last output
all_probs, all_labels = [], []
with torch.no_grad():
    for Xb, yb in test_loader:
        all_probs.extend(torch.sigmoid(model(Xb)).cpu().numpy().ravel())
        all_labels.extend(yb.cpu().numpy().ravel())
best_t, best_f1 = best_f1_threshold(np.array(all_probs), np.array(all_labels))
print(best_t, best_f1)

0.35 0.7697262479871175


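Let's re-evaluate the model on the test set using the F1-maximizing threshold found above. Note that this threshold was selected on the same test set, so the metrics below are optimistically biased; a separate validation split should be used to choose the threshold.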

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import torch

# evaluate_model is already defined in an earlier cell; an identical second
# copy here would silently shadow it and let the two versions drift apart,
# so the existing definition is reused.
# The threshold comes from best_f1_threshold (best_t) rather than a hard-coded
# copy of its value, so this cell stays correct when the model is retrained.
evaluate_model(model, test_loader, threshold=best_t)


Threshold: 0.35
Accuracy:  0.7294
Precision: 0.6766
Recall:    0.8926
F1-score:  0.7697
ROC AUC:   0.8352


(0.7294228949858089,
 0.6765746638358103,
 0.892623716153128,
 0.7697262479871175,
 0.8351680717029542)