In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler
import pickle



In [14]:
# Read the diabetes dataset from 'diabetes.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [15]:
# Feature matrix (X): the eight predictor columns, kept in this order.
X = data.loc[:, [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]]
# Target vector (y): the 'Outcome' column, assumed to be a binary 0/1 label.
y = data.loc[:, 'Outcome']

In [16]:

# Oversample the minority class to create a balanced dataset
oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)


In [17]:
# Feature Selection using SelectKBest and chi-squared
selector = SelectKBest(chi2, k=5)  # Adjust the number of features (k) as needed
X_resampled = selector.fit_transform(X_resampled, y_resampled)

In [18]:
# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


In [19]:
# Standardize features: the scaler's statistics are learned from the training
# split only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Candidate base models as an ordered list of (display name, estimator) pairs.
# The dict literal preserves insertion order, so the list order matches the
# order written here.
models = list({
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    # probability=True so the SVM can supply class probabilities
    'SVM': SVC(kernel='linear', C=1, probability=True, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB(),
}.items())


In [20]:
# Hyperparameter tuning using GridSearchCV.
# Only SVM, KNN and Extra Trees have a search grid; every other model is run
# through the same search with an empty grid, i.e. with its preset parameters.
param_grids = {
    'SVM': {'C': [0.1, 1, 10]},
    'KNN': {'n_neighbors': [3, 5, 7]},
    'Extra Trees': {'n_estimators': [50, 100, 200]},
}

best_models = {}
for name, model in models:
    param_grid = param_grids.get(name, {})
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        scoring='accuracy',
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)
    # Keep the estimator the search selected for this model name
    best_models[name] = grid_search.best_estimator_

# Weighted soft-voting ensemble of the tuned base models.
# Each weight is written next to the model it applies to; dict order fixes
# both the estimator order and the matching weight order.
ensemble_weights = {
    'Random Forest': 2,
    'Extra Trees': 1,
    'XGBoost': 2,
    'Gradient Boosting': 2,
    'SVM': 1,
    'KNN': 1,
    'Naive Bayes': 1,
}
ensemble_model = VotingClassifier(
    estimators=[(label, best_models[label]) for label in ensemble_weights],
    voting='soft',
    weights=list(ensemble_weights.values()),
)

In [23]:
# Cross-validate the ensemble on the training split (5 stratified, shuffled folds)
cv_scores = cross_val_score(
    ensemble_model,
    X_train,
    y_train,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
)
print(f'Ensemble Model: Cross-Validation Accuracy: {cv_scores.mean():.2f} (+/- {cv_scores.std():.2f})')

# Train the ensemble on the full training split, then predict the test split
ensemble_model.fit(X_train, y_train)
y_pred = ensemble_model.predict(X_test)

# Overall accuracy on the test split
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Ensemble Model Test Accuracy: {test_accuracy:.2f}')

# Per-class precision / recall / f1 on the test split
print(classification_report(y_true=y_test, y_pred=y_pred))


Ensemble Model: Cross-Validation Accuracy: 0.82 (+/- 0.03)
Ensemble Model Test Accuracy: 0.82
              precision    recall  f1-score   support

           0       0.86      0.77      0.81        99
           1       0.79      0.88      0.84       101

    accuracy                           0.82       200
   macro avg       0.83      0.82      0.82       200
weighted avg       0.83      0.82      0.82       200

