In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Load your uploaded dataset
file_path = "diabetes.csv"
data = pd.read_csv(file_path)

# Inspect the first few rows to ensure the data is loaded correctly
print(data.head())

# Splitting features and target variable
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Standardize the feature data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define base learners
base_learners = {
    "random_forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "svm": SVC(kernel='linear', probability=True, random_state=42),
    "knn": KNeighborsClassifier(n_neighbors=5),
    "logistic_regression": LogisticRegression(max_iter=1000, random_state=42)
}

# Train base learners and get their predictions on the training data for stacking
base_predictions_train = np.zeros((X_train.shape[0], len(base_learners)))
base_predictions_test = np.zeros((X_test.shape[0], len(base_learners)))

for i, (name, model) in enumerate(base_learners.items()):
    # Train each base learner
    model.fit(X_train, y_train)

    # Cross-validated predictions on training data (for meta-learner input)
    base_predictions_train[:, i] = cross_val_predict(
        model, X_train, y_train, cv=5, method='predict_proba'
    )[:, 1]

    # Predictions on the test data
    base_predictions_test[:, i] = model.predict_proba(X_test)[:, 1]

# Define the meta-learners (XGBoost and Gradient Boosting)
meta_model_xgb = xgb.XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
meta_model_gb = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Train meta-learners using the predictions from base learners
meta_model_xgb.fit(base_predictions_train, y_train)
meta_model_gb.fit(base_predictions_train, y_train)

# Predict on the test data using both meta-learners
xgb_preds = meta_model_xgb.predict(base_predictions_test)
gb_preds = meta_model_gb.predict(base_predictions_test)

# Combine predictions from both meta-learners (Majority Vote)
final_predictions = (xgb_preds + gb_preds) / 2
final_predictions = np.where(final_predictions >= 0.5, 1, 0)

# Evaluate the model's performance
print("Accuracy:", accuracy_score(y_test, final_predictions))
print("\nClassification Report:\n", classification_report(y_test, final_predictions))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, final_predictions))


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.6688311688311688

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.70      0.73        99
           1       0.53      0.62      0.57        55

    accuracy                           0.67       154
   macro avg       0.65      0.66      0.65       154
weighted avg       0.68      0.67      0.67       154


Confusion Matrix:
 [[69 30]
 [21 34]]


In [3]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import xgboost as xgb

# Load the dataset
file_path = "diabetes.csv"
data = pd.read_csv(file_path)

# Inspect the data (optional)
print(data.head())

# Split features and target
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Handle imbalanced data with SMOTE (Synthetic Minority Over-sampling Technique)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_resampled, test_size=0.2, random_state=42)

# Define base learners with optimized hyperparameters
base_learners = {
    "random_forest": RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42),
    "svm": SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42),
    "knn": KNeighborsClassifier(n_neighbors=7),
    "logistic_regression": LogisticRegression(C=1.0, max_iter=1000, random_state=42)
}

# Train base learners and collect predictions
base_predictions_train = np.zeros((X_train.shape[0], len(base_learners)))
base_predictions_test = np.zeros((X_test.shape[0], len(base_learners)))

for i, (name, model) in enumerate(base_learners.items()):
    # Train the model
    model.fit(X_train, y_train)
    
    # Cross-validated predictions on training data (for meta-learner)
    base_predictions_train[:, i] = cross_val_predict(
        model, X_train, y_train, cv=5, method='predict_proba'
    )[:, 1]
    
    # Predictions on test data
    base_predictions_test[:, i] = model.predict_proba(X_test)[:, 1]

# Meta-learners with optimized parameters
meta_model_xgb = xgb.XGBClassifier(
    n_estimators=200, learning_rate=0.1, max_depth=4, random_state=42, eval_metric='logloss'
)
meta_model_gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)

# Train the meta-learners on the base learners' predictions
meta_model_xgb.fit(base_predictions_train, y_train)
meta_model_gb.fit(base_predictions_train, y_train)

# Get predictions from both meta-learners
xgb_preds = meta_model_xgb.predict(base_predictions_test)
gb_preds = meta_model_gb.predict(base_predictions_test)

# Combine predictions using majority voting
final_predictions = (xgb_preds + gb_preds) / 2
final_predictions = np.where(final_predictions >= 0.5, 1, 0)

# Evaluate the optimized model
print("Accuracy:", accuracy_score(y_test, final_predictions))
print("\nClassification Report:\n", classification_report(y_test, final_predictions))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, final_predictions))


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
Accuracy: 0.8

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.75      0.79        99
           1       0.77      0.85      0.81       101

    accuracy                           0.80       200
   macro avg       0.80      0.80      0.80   