In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [None]:
#splitting data into training and testing sets
from sklearn.model_selection import train_test_split

#predicting the probability of a binary outcome using the logistic function
from sklearn.linear_model import LogisticRegression

# measuring the proportion of correctly classified instances in a classification model
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler

In [None]:
# Load the dataset from a CSV file in the current working directory
data = pd.read_csv("diabetes.csv")

In [None]:
# Preview only the first rows instead of dumping the whole frame into the output
data.head()

In [None]:
# Visual check for missing values: heatmap of the boolean "is null" mask
missing_mask = data.isna()
sns.heatmap(data=missing_mask)

In [None]:
# Correlation matrix: pairwise Pearson correlation between all columns
correlation = data.corr(method="pearson")
print(correlation)

In [None]:
# Annotated heatmap of the correlation matrix, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation, annot=True, fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Train/test split: every column except "Outcome" is a feature, "Outcome" is the target.
# 20% of the rows are held out for testing. random_state is fixed so the split
# (and every score computed from it) is reproducible after a kernel restart.
X = data.drop(columns="Outcome")
Y = data["Outcome"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Logistic Regression

In [None]:
# Baseline logistic regression on the raw features;
# max_iter is raised to 1000 iterations for the solver
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=Y_train)

In [None]:
# Predicted class label for every held-out test row
prediction = model.predict(X=X_test)

In [None]:
# Show the predicted labels for the test rows
print(prediction)

In [None]:
# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, but the
# conventional order keeps this call consistent with the other metric calls.
accuracy = accuracy_score(Y_test, prediction)

In [None]:
# Show the accuracy of the baseline model on the test split
print(accuracy)

In [None]:
# Persist the fitted baseline model to disk so it can be reloaded later
joblib.dump(value=model, filename='trained_model.joblib')

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Logistic Regression with 10-fold cross-validation
lr_model = LogisticRegression(max_iter=1000)
lr_accuracies = cross_val_score(lr_model, X, Y, cv=10)

# Per-fold accuracies
print("Logistic Regression Cross-Validation Accuracies:", lr_accuracies)

# The mean (with its spread) is the performance estimate to report.
print(f"Mean Logistic Regression Accuracy: {lr_accuracies.mean():.4f} (+/- {lr_accuracies.std():.4f})")

# The maximum is only the single most favourable fold: an optimistic upper
# bound, not an estimate of how the model performs on unseen data.
print("Best Logistic Regression Accuracy:", max(lr_accuracies))


# Logistic Regression with improvements

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import pandas as pd

# Loading dataset
data = pd.read_csv("diabetes.csv")

# Train-test split (fixed seed for reproducibility)
X = data.drop("Outcome", axis=1)
Y = data['Outcome']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Scaling lives inside a Pipeline so that, during the grid search, the scaler is
# re-fit on the training part of each CV fold only. Scaling the whole training
# set up front would let every validation fold influence the scaling statistics.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000)),
])

# Hyperparameter tuning (the 'clf__' prefix routes each parameter to the classifier step)
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10],
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['liblinear'],
}
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, Y_train)

# Best model: scaler + classifier, refit by GridSearchCV on the full training split
best_model = grid.best_estimator_
prediction = best_model.predict(X_test)

# Kept for backward compatibility with code that expects the standalone scaler
# and the scaled arrays; the scaler here was fit on X_train only.
scaler = best_model.named_steps['scaler']
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Evaluate on the held-out test split
accuracy = accuracy_score(Y_test, prediction)

# Output (strip the pipeline step prefix so the parameter names read as before)
best_params = {name.split('__', 1)[1]: value for name, value in grid.best_params_.items()}
print("Best Hyperparameters:", best_params)
print("Test Accuracy:", accuracy)


# Random Forest

In [None]:
# # Importing the Random Forest classifier
# from sklearn.ensemble import RandomForestClassifier

# # Training the Random Forest model
# rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_model.fit(X_train, Y_train)

# # Making predictions
# rf_predictions = rf_model.predict(X_test)

# # Calculating accuracy
# rf_accuracy = accuracy_score(rf_predictions, Y_test)

# # Displaying predictions and accuracy
# print(rf_predictions)
# print(rf_accuracy)

# # Save the trained Random Forest model to a file
# # joblib.dump(rf_model, 'rf_trained_model.joblib')


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score  # imported here so the cell does not rely on an earlier cell

# Random Forest with 10-fold cross-validation
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_accuracies = cross_val_score(rf_model, X, Y, cv=10)

# Per-fold accuracies
print("Random Forest Cross-Validation Accuracies:", rf_accuracies)

# The mean (with its spread) is the performance estimate to report.
print(f"Mean Random Forest Accuracy: {rf_accuracies.mean():.4f} (+/- {rf_accuracies.std():.4f})")

# The maximum is only the single most favourable fold: an optimistic upper
# bound, not an estimate of how the model performs on unseen data.
print("Best Random Forest Accuracy:", max(rf_accuracies))


# Decision Tree

In [None]:
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import accuracy_score

# # Training the Decision Tree model
# dt_model = DecisionTreeClassifier(random_state=42)
# dt_model.fit(X_train, Y_train)

# # Making predictions
# dt_predictions = dt_model.predict(X_test)

# # Calculating accuracy
# dt_accuracy = accuracy_score(dt_predictions, Y_test)

# # Displaying predictions and accuracy
# print("Decision Tree Predictions:", dt_predictions)
# print("Decision Tree Accuracy:", dt_accuracy)

# # Save the trained Decision Tree model
# # joblib.dump(dt_model, 'dt_trained_model.joblib')


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Decision Tree with 10-fold cross-validation
dt_model = DecisionTreeClassifier(random_state=42)
dt_accuracies = cross_val_score(dt_model, X, Y, cv=10)

# Per-fold accuracies
print("Decision Tree Cross-Validation Accuracies:", dt_accuracies)

# The mean (with its spread) is the performance estimate to report.
print(f"Mean Decision Tree Accuracy: {dt_accuracies.mean():.4f} (+/- {dt_accuracies.std():.4f})")

# The maximum is only the single most favourable fold: an optimistic upper
# bound, not an estimate of how the model performs on unseen data.
print("Best Decision Tree Accuracy:", max(dt_accuracies))


# Stack Ensemble (Final Model)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier

# Load the Pima Indian Diabetes dataset
data = pd.read_csv("diabetes.csv")

# Features and target variable
X = data.drop("Outcome", axis=1)
Y = data['Outcome']

# Base learners whose predictions feed the meta-model
base_learners = [
    ('logreg', LogisticRegression(max_iter=1000)),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
]

# Meta-model (XGBClassifier) trained on the base learners' outputs
meta_model = XGBClassifier(eval_metric='logloss')

# Stacking ensemble
stacking_model = StackingClassifier(estimators=base_learners, final_estimator=meta_model)

# Apply 20-fold cross-validation
cv_scores = cross_val_score(stacking_model, X, Y, cv=20, scoring='accuracy')

# Per-fold accuracies
print("Cross-validation Accuracy Scores for Stacking Model:", cv_scores)

# The mean (with its spread) is the performance estimate to report. With 20
# folds each fold is small, so individual fold scores vary widely.
print(f"Mean Stacking Model Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# The maximum is only the single most favourable fold: an optimistic upper
# bound, not an estimate of how the model performs on unseen data.
print("Best Stacking Model Accuracy:", max(cv_scores))


# Evaluation Metrics

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv("diabetes.csv")

# Count the occurrences of each value in the 'Outcome' column
outcome_counts = data['Outcome'].value_counts()

# Display the counts
print("Count of each class in the Outcome column:")
print(outcome_counts)

# value_counts() orders by frequency, not by class label. Sort by label before
# plotting so that bar position, colour and value label always refer to the same
# class (green = 0, red = 1), whichever class happens to be more frequent.
plot_counts = outcome_counts.sort_index()

# Plot the bar chart with specified bar colors
plt.bar(plot_counts.index, plot_counts.values, color=['#4CAF50', '#F44336'], edgecolor='black')
plt.xticks([0, 1], ['Non-Diabetic (0)', 'Diabetic (1)'])
plt.ylabel("Count")
plt.title("Distribution of Outcome Column")

# Add value labels inside the bars, positioned at the class label the bar is drawn at
for label, value in plot_counts.items():
    plt.text(label, value - 20, str(value), color='white', ha='center', va='center', fontsize=10)

plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions: each row is predicted by a copy of the model that was
# trained without that row. Predicting on the same rows the model was fit on
# would measure memorisation and overstate performance.
y_pred = cross_val_predict(stacking_model, X, Y, cv=10)

# Fit on all data so stacking_model is usable afterwards and exposes classes_
# for the axis labels; this fit is NOT used for the evaluation above.
stacking_model.fit(X, Y)

# Generate confusion matrix from the out-of-fold predictions
cm = confusion_matrix(Y, y_pred)

# Display confusion matrix with a title
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=stacking_model.classes_)
cm_display.plot(cmap='Blues', values_format='d')
plt.title("Confusion Matrix for Stacking Model")
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_predict

# Out-of-fold probabilities for the positive class: each row is scored by a copy
# of the model trained without that row. Scoring the rows the model was fit on
# would overstate the AUC. cross_val_predict fits clones, so stacking_model
# itself is left as it was.
y_prob = cross_val_predict(stacking_model, X, Y, cv=10, method='predict_proba')[:, 1]

# Compute ROC curve
fpr, tpr, thresholds = roc_curve(Y, y_prob)

# Compute AUC score
auc_score = roc_auc_score(Y, y_prob)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='b', label=f'ROC curve (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # Diagonal line (no skill)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Stacking Model')
plt.legend(loc='lower right')
plt.show()


In [None]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions: each row is predicted by a copy of the model trained
# without that row. Predicting the rows the model was fit on would overstate
# every metric below. cross_val_predict fits clones, so stacking_model itself
# is left as it was.
y_pred = cross_val_predict(stacking_model, X, Y, cv=10)

# Calculate Precision, Recall, and F1-score for the positive class
precision = precision_score(Y, y_pred)
recall = recall_score(Y, y_pred)
f1 = f1_score(Y, y_pred)

# Create a DataFrame to display the results
metrics_data = {
    'Metric': ['Precision', 'Recall', 'F1-score'],
    'Value': [precision, recall, f1]
}

metrics_df = pd.DataFrame(metrics_data)

# Display the table
print(metrics_df)


In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions: each row is predicted by a copy of the model trained
# without that row. Predicting the rows the model was fit on would overstate
# the report. cross_val_predict fits clones, so stacking_model itself is left
# as it was.
y_pred = cross_val_predict(stacking_model, X, Y, cv=10)

# Generate the classification report with meaningful labels
report = classification_report(Y, y_pred, target_names=['Non-Diabetic', 'Diabetic'])

# Display the classification report
print("Classification Report for Ensemble Model:")
print(report)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions: each row is predicted by a copy of the model trained
# without that row. Predicting the rows the model was fit on would overstate
# the plotted scores. cross_val_predict fits clones, so stacking_model itself
# is left as it was.
y_pred = cross_val_predict(stacking_model, X, Y, cv=10)

# Calculate Precision, Recall, and F1-score for the positive class
precision = precision_score(Y, y_pred)
recall = recall_score(Y, y_pred)
f1 = f1_score(Y, y_pred)

# Metrics for plotting
metrics = ['Precision', 'Recall', 'F1-score']
values = [precision, recall, f1]

# Plot the metrics in a bar graph
plt.figure(figsize=(8, 6))
bars = plt.bar(metrics, values, color=['darkblue', 'orange', 'green'], width=0.4)  # Reduced bar width

# Add values inside the bars, centred horizontally and at half the bar height
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, yval / 2, f'{yval:.2f}', ha='center', va='center', color='white', fontsize=12)

# Adjust spacing between bars
plt.subplots_adjust(left=0.2, right=0.8)

plt.xlabel('Metrics')
plt.ylabel('Scores')
plt.title('Precision, Recall, and F1-score for Stacking Model')
plt.ylim([0, 1])
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Data: taken from the cross-validation scores computed in the earlier cells
# instead of hand-typed numbers, so the chart cannot drift from the results.
# Each bar is the mean fold accuracy in percent.
# NOTE(review): the ensemble was scored with 20 folds, the others with 10.
cv_results = {
    'Logistic Regression': lr_accuracies,
    'Random Forest': rf_accuracies,
    'Decision Tree': dt_accuracies,
    'Ensemble Model': cv_scores,
}
models = list(cv_results)
accuracies = [scores.mean() * 100 for scores in cv_results.values()]

colors = ["#FF6F61", "#FFD54F", "#FFCC80", "#4CAF50"]

# Plotting with adjusted bar width
plt.figure(figsize=(8, 5))
bars = plt.bar(models, accuracies, color=colors, width=0.5)

# Adding titles and labels
plt.title('Model Accuracy Comparison', fontsize=14)
plt.xlabel('Models', fontsize=12)
plt.ylabel('Mean CV Accuracy (%)', fontsize=12)

plt.ylim(0, 100)

# Display values in the middle of the bars
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, yval / 2, f'{yval:.2f}',
             ha='center', va='center', color='black', fontsize=12)

# Display the plot
plt.show()
