In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Here we use extracted features for neural network modeling using tensorflow-keras

In [3]:
df = pd.read_csv('desc_final_with_more_small_molecules_filtered_with_good_label_07292024.csv')

In [4]:
df.shape

(1929, 627)

In [5]:
df.head()

Unnamed: 0,nAcid,nBase,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,nBridgehead,nHetero,nH,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb2,good
0,0,2,24,24,120,65,0,0,18,55,...,11.168504,104.83104,973.29553,8.110796,24144.0,109,350,409,13.743055,0
1,0,2,30,30,101,56,0,0,14,45,...,10.873376,94.71096,812.2581,8.04216,17075.0,89,292,338,12.208333,0
2,0,0,21,22,46,28,0,0,7,18,...,10.266149,77.441505,375.14954,8.155425,2310.0,44,150,177,6.055555,0
3,0,1,9,10,34,18,0,0,5,16,...,9.988334,67.54874,244.13242,7.180365,565.0,28,100,121,3.833333,0
4,0,0,21,22,46,28,0,0,6,18,...,10.1625,77.168495,386.12012,8.393916,2325.0,42,146,171,6.277778,0


In [6]:
# split features and target variable
X_1 = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [7]:
# Normalization of data
X = pd.DataFrame(MinMaxScaler().fit_transform(X_1), columns=X_1.columns)

In [8]:
X.head()

Unnamed: 0,nAcid,nBase,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,nBridgehead,nHetero,nH,...,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb2
0,0.0,0.2,0.521739,0.521739,0.59596,0.636364,0.0,0.0,0.391304,0.495495,...,0.0,0.955494,0.654525,0.626908,0.042983,1.124848e-07,0.61236,0.688976,0.668301,0.613073
1,0.0,0.2,0.652174,0.652174,0.5,0.545455,0.0,0.0,0.304348,0.405405,...,0.0,0.930245,0.59011,0.51527,0.042105,7.952914e-08,0.5,0.574803,0.552288,0.54461
2,0.0,0.0,0.456522,0.478261,0.222222,0.262626,0.0,0.0,0.152174,0.162162,...,0.708128,0.878296,0.480189,0.212248,0.043554,1.069464e-08,0.247191,0.295276,0.289216,0.270136
3,0.0,0.1,0.195652,0.217391,0.161616,0.161616,0.0,0.0,0.108696,0.144144,...,0.771535,0.854528,0.417221,0.121421,0.031082,2.559441e-09,0.157303,0.19685,0.197712,0.171004
4,0.0,0.0,0.456522,0.478261,0.222222,0.262626,0.0,0.0,0.130435,0.162162,...,0.708128,0.869428,0.478451,0.219853,0.046604,1.076457e-08,0.235955,0.287402,0.279412,0.28005


In [9]:
# split original dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [10]:
def neural_networks_model(x, y, x_test, y_test, neurons):
    """
    Neural network model for binary classification

    Inputs
    x: descriptors values for training and validation
    y: binary labels for training and validation
    x_test: descriptors values for test
    y_test: binary labels for test

    Outputs
    model: trained neural network model
    scores: a dictionary containing recall, precision, and F1 score for each fold
    """
    np.random.seed(1)
    scores = {'precision': [], 'recall': [], 'f1': []}
    kfold = KFold(n_splits = 5, shuffle=True)

    model = Sequential()
    model.add(Dense(neurons, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(neurons, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Output layer with sigmoid for binary classification

    opt = Adam(learning_rate=0.001)
    model.compile(loss='binary_crossentropy',  # Loss function for binary classification
                  optimizer=opt,
                  metrics=['accuracy'])   # Here just keep accuracy for monitoring purposes

    rlrop = ReduceLROnPlateau(monitor='val_loss', factor=0.01, patience=10)

    for train, validation in kfold.split(x, y):

        model.fit(x.iloc[train], y.iloc[train],
                  epochs = 100,
                  batch_size=128,
                  callbacks=[rlrop],
                  verbose=0,
                  validation_data=(x.iloc[validation], y.iloc[validation]))

        y_pred = (model.predict(x_test) > 0.5).astype(int)

        # Calculate precision, recall, and f1 score
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Store scores for each fold
        scores['precision'].append(precision)
        scores['recall'].append(recall)
        scores['f1'].append(f1)

    return model, scores

In [11]:
# Train the model and evaluate the performance
model, scores = neural_networks_model(X_train, y_train, X_test, y_test, neurons=64)

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [12]:
# Print the scores for each fold
print(f"Precision for each fold: {scores['precision']}")
print(f"Recall for each fold: {scores['recall']}")
print(f"F1 Score for each fold: {scores['f1']}")

# Calculate average scores across all folds
avg_precision = np.mean(scores['precision'])
avg_recall = np.mean(scores['recall'])
avg_f1 = np.mean(scores['f1'])

print(f"Average Precision: {avg_precision}")
print(f"Average Recall: {avg_recall}")
print(f"Average F1 Score: {avg_f1}")

Precision for each fold: [0.0, 0.0, 0.0, 0.0, 0.0]
Recall for each fold: [0.0, 0.0, 0.0, 0.0, 0.0]
F1 Score for each fold: [0.0, 0.0, 0.0, 0.0, 0.0]
Average Precision: 0.0
Average Recall: 0.0
Average F1 Score: 0.0


### When dealing with imbalanced datasets, especially in binary classification, neural networks can struggle to correctly predict the minority class (positive class in my case). I can try several strategies: use class weight, Oversampling (such as SMOTE) for minority class, Undersampling for the majority class (lead to a loss of important information), Use different loss function (such as Focal loss), Use ensemble methods (bagging, boosting).

#### Try combine class weight and Focal Loss

In [18]:
import tensorflow as tf
from sklearn.utils.class_weight import compute_class_weight

In [21]:
# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

In [22]:
def focal_loss(gamma=2., alpha=0.25):
    def focal_loss_fixed(y_true, y_pred):
        epsilon = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)
        y_true = tf.cast(y_true, tf.float32)
        alpha_t = y_true * alpha + (tf.keras.backend.ones_like(y_true) - y_true) * (1 - alpha)
        p_t = y_true * y_pred + (tf.keras.backend.ones_like(y_true) - y_true) * (1 - y_pred)
        f1 = - alpha_t * tf.keras.backend.pow((tf.keras.backend.ones_like(y_true) - p_t), gamma) * tf.keras.backend.log(p_t)
        return tf.keras.backend.mean(f1)
    return focal_loss_fixed

In [25]:
def neural_networks_model(x, y, x_test, y_test, neurons):
    """
    Neural network model for binary classification

    Inputs
    x: descriptors values for training and validation
    y: binary labels for training and validation
    x_test: descriptors values for test
    y_test: binary labels for test

    Outputs
    model: trained neural network model
    scores: a dictionary containing recall, precision, and F1 score for each fold
    """
    np.random.seed(1)
    scores = {'precision': [], 'recall': [], 'f1': []}
    kfold = KFold(n_splits = 5, shuffle=True)

    # Convert data to NumPy arrays if they aren't already
    x = np.array(x)
    y = np.array(y)
    x_test = np.array(x_test)
    y_test = np.array(y_test)

    model = Sequential()
    model.add(Dense(neurons, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(neurons, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Output layer with sigmoid for binary classification

    opt = Adam(learning_rate=0.001)
    model.compile(loss=focal_loss(gamma=2., alpha=0.25),  # Loss function for binary classification
                  optimizer=opt,
                  metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])   # Here just keep accuracy for monitoring purposes

    rlrop = ReduceLROnPlateau(monitor='val_loss', factor=0.01, patience=10)

    for train, validation in kfold.split(x, y):
        model.fit(x[train], y[train],  # The use of NumPy-stype indexing
                  epochs = 100,
                  batch_size=128,
                  callbacks=[rlrop],
                  verbose=0,
                  validation_data=(x[validation], y[validation]),
                  class_weight=class_weight_dict)  # Use class weights

        y_pred = (model.predict(x_test) > 0.5).astype(int)

        # Calculate precision, recall, and f1 score
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Store scores for each fold
        scores['precision'].append(precision)
        scores['recall'].append(recall)
        scores['f1'].append(f1)

    return model, scores

In [26]:
# Train the model and evaluate the performance
model, scores = neural_networks_model(X_train, y_train, X_test, y_test, neurons=64)

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [27]:
print("Precision: ", np.mean(scores['precision']))
print("Recall: ", np.mean(scores['recall']))
print("F1 Score: ", np.mean(scores['f1']))

Precision:  0.0
Recall:  0.0
F1 Score:  0.0


## Nothing gets improved.