In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
import numpy as np
import tensorflow as tf
from scipy.stats import sem
import matplotlib.pyplot as plt

# Assuming 'data' is your DataFrame containing the dataset
# Split the data into features and target variable
X = data.drop(columns=['Diabetes'])
y = data['Diabetes']
y = y.apply(lambda x: 1 if x > 0 else 0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data to TensorFlow datasets
def get_train_dataset(X_train, y_train):
    return tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(buffer_size=1024).batch(32)

# Define Neural Network model
class NeuralNetworkModel(tf.keras.Model):
    def __init__(self, input_dim):
        super(NeuralNetworkModel, self).__init__()
        self.dense1 = tf.keras.layers.Dense(64, activation='relu')
        self.dense2 = tf.keras.layers.Dense(32, activation='relu')
        self.output_layer = tf.keras.layers.Dense(1, activation='sigmoid')
    
    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return self.output_layer(x)

# Initialize model, loss function, and optimizer
input_dim = X_train.shape[1]
binary_crossentropy = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

def train_step(model, X, y):
    with tf.GradientTape() as tape:
        predictions = model(X)
        loss = binary_crossentropy(y, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

# Training the model
def train_model(model, train_dataset, epochs=20):
    for epoch in range(epochs):
        epoch_loss = 0
        for step, (X_batch, y_batch) in enumerate(train_dataset):
            print(f"Batch {step + 1}, X_batch shape: {X_batch.shape}, y_batch shape: {y_batch.shape}")
            loss = train_step(model, X_batch, y_batch)
            epoch_loss += loss
        print(f'Epoch {epoch + 1}, Loss: {epoch_loss / (step + 1):.4f}')

# Evaluate the model
def evaluate_model(model, X_test, y_test):
    logits = model(X_test)
    predictions = tf.round(logits)
    predictions = tf.squeeze(predictions)
    y_test_numpy = y_test.numpy()
    predictions_numpy = predictions.numpy()
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test_numpy, predictions_numpy)
    
    # Calculate precision
    precision = precision_score(y_test_numpy, predictions_numpy)
    
    # Calculate sensitivity (recall)
    recall = recall_score(y_test_numpy, predictions_numpy)
    
    # Calculate F1 score
    f1 = f1_score(y_test_numpy, predictions_numpy)
    
    # Calculate confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_test_numpy, predictions_numpy).ravel()

    # Calculate ROC AUC
    probabilities = logits.numpy()
    roc_auc = roc_auc_score(y_test_numpy, probabilities)
    
    return accuracy, precision, recall, f1, roc_auc, tn, fp, fn, tp

# Initialize metrics list and best model variables
nn_metrics = []
best_nn_model = None
best_nn_score = -np.inf

# Function to perform bootstrap resampling and calculate metrics
def bootstrap_evaluation(X_train, y_train, X_test, y_test, n_iterations=3):
    global best_nn_model, best_nn_score
    
    for i in range(n_iterations):
        print(f'Iteration {i + 1}/{n_iterations}')
        
        # Resample the test dataset
        X_test_resampled, y_test_resampled = resample(X_test, y_test, n_samples=len(X_test), random_state=None)

        # Convert resampled data to TensorFlow tensors
        X_test_resampled_tensor = tf.convert_to_tensor(X_test_resampled, dtype=tf.float32)
        y_test_resampled_tensor = tf.convert_to_tensor(y_test_resampled.values, dtype=tf.float32)
        
        # Initialize a new instance of the model for each iteration
        model = NeuralNetworkModel(input_dim)
        
        # Get the training dataset
        train_dataset = get_train_dataset(X_train, y_train)
        
        # Train the model
        train_model(model, train_dataset, epochs=10)
        
        # Evaluate the model on the resampled dataset
        nn_metric_evaluation = evaluate_model(model, X_test_resampled_tensor, y_test_resampled_tensor)
    
        # Store the best model based on accuracy
        if nn_metric_evaluation[0] > best_nn_score:
            best_nn_score = nn_metric_evaluation[0]
            best_nn_model = model
            
        nn_metrics.append(nn_metric_evaluation)
    
    # Calculate mean and SEM for each metric
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'tn', 'fp', 'fn', 'tp']
    for metric_idx, metric_name in enumerate(metrics):
        metric_values = [metric[metric_idx] for metric in nn_metrics]
        mean = np.mean(metric_values)
        sem_value = sem(metric_values)
        ci_lower = mean - 1.96 * sem_value
        ci_upper = mean + 1.96 * sem_value
        print(f'{metric_name.capitalize()} - Mean: {mean:.4f}, 95% CI: ({ci_lower:.4f}, {ci_upper:.4f})')

# Perform bootstrap evaluation
bootstrap_evaluation(X_train, y_train, X_test, y_test)


Iteration 1/3
Batch 1, X_batch shape: (32, 17), y_batch shape: (32,)
Batch 2, X_batch shape: (32, 17), y_batch shape: (32,)
Batch 3, X_batch shape: (32, 17), y_batch shape: (32,)
Batch 4, X_batch shape: (32, 17), y_batch shape: (32,)
Batch 5, X_batch shape: (32, 17), y_batch shape: (32,)
Batch 6, X_batch shape: (32, 17), y_batch shape: (32,)
Batch 7, X_batch shape: (32, 17), y_batch shape: (32,)
Batch 8, X_batch shape: (32, 17), y_batch shape: (32,)
Batch 9, X_batch shape: (32, 17), y_batch shape: (32,)
Batch 10, X_batch shape: (32, 17), y_batch shape: (32,)
Batch 11, X_batch shape: (32, 17), y_batch shape: (32,)
Batch 12, X_batch shape: (32, 17), y_batch shape: (32,)
Batch 13, X_batch shape: (32, 17), y_batch shape: (32,)
Batch 14, X_batch shape: (32, 17), y_batch shape: (32,)
Batch 15, X_batch shape: (32, 17), y_batch shape: (32,)
Batch 16, X_batch shape: (32, 17), y_batch shape: (32,)
Batch 17, X_batch shape: (32, 17), y_batch shape: (32,)
Batch 18, X_batch shape: (32, 17), y_batch 

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
import tensorflow as tf

# Load and preprocess dataset
url = 'https://p16-bot-sign-sg.ciciai.com/tos-alisg-i-b2l6bve69y-sg/0a542996a2e74931b9a17cc3fb593feb.csv~tplv-b2l6bve69y-image.image?rk3s=68e6b6b5&x-expires=1719816096&x-signature=hvoff8lKbCLE4lp7Nfk5Cir9yn4%3D'
data = pd.read_csv(url)

# Handling Missing Values
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
ordinal_binary_cols = ['HighChol', 'CholCheck', 'Smoker', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'DiffWalk', 'Stroke', 'HighBP', 'Diabetes']

for col in numerical_cols:
    if col not in ordinal_binary_cols:
        data[col].fillna(data[col].mean(), inplace=True)

# Normalizing Numerical Features
scaler = StandardScaler()
for col in numerical_cols:
    if col not in ordinal_binary_cols:
        data[col] = scaler.fit_transform(data[[col]])

data.to_csv('preprocessed_data.csv', index=False)

# Load preprocessed dataset
data = pd.read_csv('preprocessed_data.csv')

X = data.drop(columns=['Diabetes'])
y = data['Diabetes']
y = y.apply(lambda x: 1 if x > 0 else 0)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(buffer_size=1024).batch(32)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32)

# Define Neural Network model
class NeuralNetworkModel(tf.keras.Model):
    def __init__(self, input_dim):
        super(NeuralNetworkModel, self).__init__()
        self.dense1 = tf.keras.layers.Dense(64, activation='relu')
        self.dense2 = tf.keras.layers.Dense(32, activation='relu')
        self.output_layer = tf.keras.layers.Dense(1, activation='sigmoid')
    
    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return self.output_layer(x)

# Initialize model, loss function, and optimizer
input_dim = X_train.shape[1]
model = NeuralNetworkModel(input_dim)
binary_crossentropy = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Training function
def train_step(model, X, y):
    with tf.GradientTape() as tape:
        predictions = model(X, training=True)
        loss = binary_crossentropy(y, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

def train_model(model, train_dataset, epochs=10):
    for epoch in range(epochs):
        epoch_loss = 0
        for step, (X_batch, y_batch) in enumerate(train_dataset):
            loss = train_step(model, X_batch, y_batch)
            epoch_loss += loss
        print(f'Epoch {epoch + 1}, Loss: {epoch_loss / (step + 1):.4f}')

# Evaluation function
def evaluate_model(model, X_test, y_test, threshold=0.5):
    probabilities = model(X_test, training=False)
    predictions = tf.cast(probabilities >= threshold, dtype=tf.float32)
    predictions = tf.squeeze(predictions)
    
    y_test_numpy = y_test.numpy()
    predictions_numpy = predictions.numpy()
    
    accuracy = accuracy_score(y_test_numpy, predictions_numpy)
    precision = precision_score(y_test_numpy, predictions_numpy)
    recall = recall_score(y_test_numpy, predictions_numpy)
    f1 = f1_score(y_test_numpy, predictions_numpy)
    tn, fp, fn, tp = confusion_matrix(y_test_numpy, predictions_numpy).ravel()
    roc_auc = roc_auc_score(y_test_numpy, probabilities)
    
    return accuracy, precision, recall, f1, roc_auc, tn, fp, fn, tp

# Train and evaluate with resampling
def bootstrap_evaluation(X_train, y_train, X_test, y_test, n_iterations=3):
    best_nn_model = None
    best_nn_score = -np.inf
    nn_metrics = []

    for i in range(n_iterations):
        print(f'Iteration {i + 1}/{n_iterations}')
        
        X_test_resampled, y_test_resampled = resample(X_test, y_test, n_samples=len(X_test), random_state=None)
        
        X_test_resampled_tensor = tf.convert_to_tensor(X_test_resampled, dtype=tf.float32)
        y_test_resampled_tensor = tf.convert_to_tensor(y_test_resampled.values, dtype=tf.float32)
        
        model = NeuralNetworkModel(input_dim)
        train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(buffer_size=1024).batch(32)
        
        train_model(model, train_dataset, epochs=10)
        
        nn_metric_evaluation = evaluate_model(model, X_test_resampled_tensor, y_test_resampled_tensor)
        
        if nn_metric_evaluation[0] > best_nn_score:
            best_nn_score = nn_metric_evaluation[0]
            best_nn_model = model
            
        nn_metrics.append(nn_metric_evaluation)

    return best_nn_model, nn_metrics

best_nn_model, nn_metrics = bootstrap_evaluation(X_train, y_train, X_test, y_test)

# Evaluate the best model on the original test set with adjusted threshold
best_nn_model_metrics = evaluate_model(best_nn_model, tf.convert_to_tensor(X_test, dtype=tf.float32), tf.convert_to_tensor(y_test.values, dtype=tf.float32), threshold=0.3)

print(f'Best Model Metrics: Accuracy: {best_nn_model_metrics[0]}, Precision: {best_nn_model_metrics[1]}, Recall: {best_nn_model_metrics[2]}, F1 Score: {best_nn_model_metrics[3]}, ROC AUC: {best_nn_model_metrics[4]}')
print(f'Confusion Matrix: TN: {best_nn_model_metrics[5]}, FP: {best_nn_model_metrics[6]}, FN: {best_nn_model_metrics[7]}, TP: {best_nn_model_metrics[8]}')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)


Iteration 1/3
Epoch 1, Loss: 0.5152
Epoch 2, Loss: 0.5051
Epoch 3, Loss: 0.5030
Epoch 4, Loss: 0.5020
Epoch 5, Loss: 0.5011
Epoch 6, Loss: 0.5004
Epoch 7, Loss: 0.4997
Epoch 8, Loss: 0.4990
Epoch 9, Loss: 0.4983
Epoch 10, Loss: 0.4979
Iteration 2/3
Epoch 1, Loss: 0.5106
Epoch 2, Loss: 0.5037
Epoch 3, Loss: 0.5022
Epoch 4, Loss: 0.5012
Epoch 5, Loss: 0.5003
Epoch 6, Loss: 0.4992
Epoch 7, Loss: 0.4988
Epoch 8, Loss: 0.4983
Epoch 9, Loss: 0.4976
Epoch 10, Loss: 0.4969
Iteration 3/3
Epoch 1, Loss: 0.5105
Epoch 2, Loss: 0.5034
Epoch 3, Loss: 0.5018
Epoch 4, Loss: 0.5009
Epoch 5, Loss: 0.4999
Epoch 6, Loss: 0.4991
Epoch 7, Loss: 0.4988
Epoch 8, Loss: 0.4979
Epoch 9, Loss: 0.4973
Epoch 10, Loss: 0.4969
Best Model Metrics: Accuracy: 0.7226112172006507, Precision: 0.6574997481615795, Recall: 0.92594694282877, F1 Score: 0.7689679547596607, ROC AUC: 0.8278420790513153
Confusion Matrix: TN: 3690, FP: 3400, FN: 522, TP: 6527
