In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw diabetes dataset from a CSV file in the working directory.
data = pd.read_csv("diabetes(2).csv")

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Display basic information: column dtypes / non-null counts, then summary statistics.
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
data.info()
print("\nSummary Statistics:")
print(data.describe())

# Handle missing values: a zero in these columns is treated as a missing
# measurement and replaced by the median of the column's non-zero entries.
columns_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for column in columns_with_zeros:
    zeros_as_nan = data[column].replace(0, np.nan)
    # Assign the filled Series back to the frame. Calling
    # fillna(..., inplace=True) on data[column] acts on an intermediate object:
    # pandas emits a FutureWarning for it and, from pandas 3.0 on, the original
    # frame is no longer updated. median() skips NaN by default, so the fill
    # value is computed from the remaining (non-zero) entries only.
    data[column] = zeros_as_nan.fillna(zeros_as_nan.median())

# Check for any remaining missing values
print("\nMissing Values After Imputation:")
print(data.isnull().sum())

# Exploratory Data Analysis (EDA)
# Every figure below is written to a PNG file and then closed, so nothing is
# rendered inline.

# 1. Correlation heatmap of all columns, annotated with two-decimal coefficients.
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(data.corr(), annot=True, fmt='.2f', cmap='coolwarm', ax=corr_ax)
corr_ax.set_title('Correlation Heatmap')
corr_fig.savefig('correlation_heatmap.png')
plt.close(corr_fig)

# 2. One histogram per column; DataFrame.hist creates its own figure, which
#    becomes the current figure.
data.hist(bins=20, figsize=(12, 10))
hist_fig = plt.gcf()
hist_fig.suptitle('Histograms of Features')
hist_fig.tight_layout()
hist_fig.savefig('histograms.png')
plt.close(hist_fig)

# 3. Box plots of every column on a shared axis, for spotting outliers.
box_fig, box_ax = plt.subplots(figsize=(12, 8))
data.boxplot(ax=box_ax)
box_ax.set_title('Box Plots of Features')
plt.xticks(rotation=45)
box_fig.savefig('boxplots.png')
plt.close(box_fig)

# 4. Number of rows per Outcome class.
class_fig, class_ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Outcome', data=data, ax=class_ax)
class_ax.set_title('Class Distribution of Outcome')
class_fig.savefig('class_distribution.png')
plt.close(class_fig)

# Feature Scaling: separate the target from the predictors, then standardize
# the predictors with StandardScaler.
# NOTE(review): the scaler is fitted on every row here, while the later cells
# split preprocessed_diabetes.csv into train/test afterwards, so the held-out
# rows contribute to the scaling statistics. Consider fitting on the training
# split only.
y = data['Outcome']
X = data.drop(columns='Outcome')
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save preprocessed data: scaled features under their original column names,
# with the unscaled Outcome column appended last.
preprocessed_data = pd.DataFrame(X_scaled, columns=X.columns).assign(Outcome=y)
preprocessed_data.to_csv('preprocessed_diabetes.csv', index=False)

print("\nPreprocessing and EDA completed. Preprocessed data saved as 'preprocessed_diabetes.csv'.")

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None

Summary Statistics:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469   

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].median(), inplace=True)



Preprocessing and EDA completed. Preprocessed data saved as 'preprocessed_diabetes.csv'.


# Model Training and Evaluation

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Load the preprocessed dataset and separate the target from the predictors.
data = pd.read_csv('preprocessed_diabetes.csv')
y = data['Outcome']
X = data.drop(columns='Outcome')

# Hold out 20% of the rows as a test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Kernels to compare; `results` maps each kernel name to its metric dict.
kernels = ['linear', 'rbf', 'poly']
results = {}

for kernel in kernels:
    # Fit one SVM per kernel and predict on the held-out rows.
    svm = SVC(kernel=kernel, random_state=42)
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)

    # Test-set metrics for this kernel.
    results[kernel] = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred)
    }

    # Confusion matrix heatmap, saved to one PNG per kernel.
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - SVM ({kernel} kernel)')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.savefig(f'svm_{kernel}_confusion_matrix.png')
    plt.close()

# Report the metrics of every kernel, four decimals each.
print("SVM Results with Different Kernels:")
for kernel, metrics in results.items():
    report_lines = [
        f"\nKernel: {kernel}",
        f"Accuracy: {metrics['accuracy']:.4f}",
        f"Precision: {metrics['precision']:.4f}",
        f"Recall: {metrics['recall']:.4f}",
        f"F1-Score: {metrics['f1']:.4f}",
    ]
    print("\n".join(report_lines))

SVM Results with Different Kernels:

Kernel: linear
Accuracy: 0.7532
Precision: 0.6667
Recall: 0.6182
F1-Score: 0.6415

Kernel: rbf
Accuracy: 0.7468
Precision: 0.6667
Recall: 0.5818
F1-Score: 0.6214

Kernel: poly
Accuracy: 0.7403
Precision: 0.7273
Recall: 0.4364
F1-Score: 0.5455


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Load the preprocessed dataset and separate the target from the predictors.
data = pd.read_csv('preprocessed_diabetes.csv')
y = data['Outcome']
X = data.drop(columns='Outcome')

# Hold out 20% of the rows as a test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 5-nearest-neighbours classifier (fit returns the estimator itself)
# and predict on the held-out rows.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Test-set metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Confusion matrix heatmap, saved to disk and closed without being displayed.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - KNN')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('knn_confusion_matrix.png')
plt.close()

# Report the metrics, four decimals each.
knn_report = [
    "KNN Results:",
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-Score: {f1:.4f}",
]
print("\n".join(knn_report))

KNN Results:
Accuracy: 0.7338
Precision: 0.6129
Recall: 0.6909
F1-Score: 0.6496


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Load the preprocessed dataset and separate the target from the predictors.
data = pd.read_csv('preprocessed_diabetes.csv')
y = data['Outcome']
X = data.drop(columns='Outcome')

# Hold out 20% of the rows as a test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest (fit returns the estimator itself) and predict
# on the held-out rows.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Test-set metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Confusion matrix heatmap, saved to disk and closed without being displayed.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Random Forest')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('rf_confusion_matrix.png')
plt.close()

# Horizontal bar chart of the fitted forest's feature importances, one bar per
# predictor column, in the column order of X.
plt.figure(figsize=(8, 6))
sns.barplot(x=rf.feature_importances_, y=X.columns)
plt.title('Feature Importance - Random Forest')
plt.savefig('rf_feature_importance.png')
plt.close()

# Report the metrics, four decimals each.
rf_report = [
    "Random Forest Results:",
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-Score: {f1:.4f}",
]
print("\n".join(rf_report))

Random Forest Results:
Accuracy: 0.7403
Precision: 0.6316
Recall: 0.6545
F1-Score: 0.6429


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Load the preprocessed dataset and separate the target from the predictors.
data = pd.read_csv('preprocessed_diabetes.csv')
y = data['Outcome']
X = data.drop(columns='Outcome')

# Hold out 20% of the rows as a test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit logistic regression (iteration cap raised to 1000; fit returns the
# estimator itself) and predict on the held-out rows.
lr = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Test-set metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Confusion matrix heatmap, saved to disk and closed without being displayed.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Logistic Regression')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('lr_confusion_matrix.png')
plt.close()

# Report the metrics, four decimals each.
lr_report = [
    "Logistic Regression Results:",
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-Score: {f1:.4f}",
]
print("\n".join(lr_report))

Logistic Regression Results:
Accuracy: 0.7532
Precision: 0.6667
Recall: 0.6182
F1-Score: 0.6415


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def _save_confusion_heatmap(matrix, title, filename):
    """Draw `matrix` as an annotated heatmap titled `title` and save it to `filename`."""
    plt.figure(figsize=(6, 4))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.title(title)
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(filename)
    plt.close()


# Load the preprocessed dataset and separate the target from the predictors.
data = pd.read_csv('preprocessed_diabetes.csv')
y = data['Outcome']
X = data.drop(columns='Outcome')

# Hold out 20% of the rows as a test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Base models shared by both ensembles. The SVM is built with
# probability=True so it can take part in soft voting.
svm = SVC(kernel='rbf', probability=True, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
lr = LogisticRegression(random_state=42, max_iter=1000)

# --- Soft-voting ensemble over all four base models ---
voting_clf = VotingClassifier(
    estimators=[('svm', svm), ('knn', knn), ('rf', rf), ('lr', lr)],
    voting='soft'
)
voting_clf.fit(X_train, y_train)
y_pred_voting = voting_clf.predict(X_test)

accuracy_voting = accuracy_score(y_test, y_pred_voting)
precision_voting = precision_score(y_test, y_pred_voting)
recall_voting = recall_score(y_test, y_pred_voting)
f1_voting = f1_score(y_test, y_pred_voting)

cm_voting = confusion_matrix(y_test, y_pred_voting)
_save_confusion_heatmap(
    cm_voting, 'Confusion Matrix - Voting Classifier', 'voting_confusion_matrix.png'
)

# --- Stacking ensemble: SVM, KNN and random forest feed a logistic-regression
# final estimator ---
stacking_clf = StackingClassifier(
    estimators=[('svm', svm), ('knn', knn), ('rf', rf)],
    final_estimator=LogisticRegression(random_state=42, max_iter=1000)
)
stacking_clf.fit(X_train, y_train)
y_pred_stacking = stacking_clf.predict(X_test)

accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
precision_stacking = precision_score(y_test, y_pred_stacking)
recall_stacking = recall_score(y_test, y_pred_stacking)
f1_stacking = f1_score(y_test, y_pred_stacking)

cm_stacking = confusion_matrix(y_test, y_pred_stacking)
_save_confusion_heatmap(
    cm_stacking, 'Confusion Matrix - Stacking Classifier', 'stacking_confusion_matrix.png'
)

# Report both ensembles, four decimals per metric.
voting_report = [
    "Voting Classifier Results:",
    f"Accuracy: {accuracy_voting:.4f}",
    f"Precision: {precision_voting:.4f}",
    f"Recall: {recall_voting:.4f}",
    f"F1-Score: {f1_voting:.4f}",
]
print("\n".join(voting_report))

stacking_report = [
    "\nStacking Classifier Results:",
    f"Accuracy: {accuracy_stacking:.4f}",
    f"Precision: {precision_stacking:.4f}",
    f"Recall: {recall_stacking:.4f}",
    f"F1-Score: {f1_stacking:.4f}",
]
print("\n".join(stacking_report))

Voting Classifier Results:
Accuracy: 0.7468
Precision: 0.6481
Recall: 0.6364
F1-Score: 0.6422

Stacking Classifier Results:
Accuracy: 0.7468
Precision: 0.6481
Recall: 0.6364
F1-Score: 0.6422
