In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Relative path to the input CSV; it is resolved against the kernel's working
# directory.
file_path = 'diabetes.csv'

# Load the CSV into a DataFrame.
pima = pd.read_csv(file_path)

# Preview through the notebook's rich display (bare last expression) rather
# than print(): long frames are still truncated by pandas, but the result is
# rendered as a table instead of wrapped plain text.
pima

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                  

In [3]:
# List every column label of the loaded frame. Note that this Index also
# contains the label column 'Outcome', not only the predictor columns.
column_index = pima.columns
print("Features:\n", column_index)

Features:
 Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [4]:
pip install imbalanced-learn

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.




[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Data Sampling

In [5]:
# SMOTE and Counter are already imported in the notebook's first cell, so they
# are not imported a second time here.

# Class counts in the loaded data, printed to show the imbalance.
print("Class distribution before SMOTE:", Counter(pima['Outcome']))

# Predictors are every column except the label; the label is 'Outcome'.
X = pima.drop(columns='Outcome')
y = pima['Outcome']

# Oversample with SMOTE so both classes end up with the same number of rows;
# the fixed random_state makes the generated rows repeatable between runs.
# NOTE(review): resampling runs before the zero -> NaN cleaning and before the
# train/test split done in later cells, so synthetic rows are built from
# uncleaned values and from rows that may later land in the test set.
# Confirm this ordering is intended.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Class counts after resampling.
print("Class distribution after SMOTE:", Counter(y_smote))

Class distribution before SMOTE: Counter({0: 500, 1: 268})
Class distribution after SMOTE: Counter({1: 500, 0: 500})


## Missing Values

In [6]:
# Count NaN/None entries per column of the loaded frame. Zeros are not counted
# here; a later cell treats zeros in some columns as missing values.
null_mask = pima.isna()
missing_values = null_mask.sum()
print("Missing values in each column:\n", missing_values)

Missing values in each column:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [7]:
# Columns in which a value of 0 is treated as a missing measurement.
columns_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Convert those zeros to NaN. A new frame is bound to X_smote (instead of
# writing into the existing one) so the cell does not mutate an object that an
# earlier cell produced.
zeros_as_nan = X_smote[columns_to_replace].replace(0, np.nan)
X_smote = X_smote.assign(**{name: zeros_as_nan[name] for name in columns_to_replace})

# Per-column NaN counts after the conversion.
missing_values_resampled_updated = X_smote.isnull().sum()
print("\nMissing values in each column after replacing 0 with NaN:")
print(missing_values_resampled_updated)

# Overall NaN count across all columns.
total_missing_resampled_updated = missing_values_resampled_updated.sum()
print(f"\nTotal missing values in the updated resampled dataset: {total_missing_resampled_updated}")

# Only the columns that actually contain NaN values.
missing_values_only_resampled_updated = missing_values_resampled_updated[missing_values_resampled_updated > 0]
if not missing_values_only_resampled_updated.empty:
    print("\nColumns with missing values in the updated resampled dataset:")
    print(missing_values_only_resampled_updated)
else:
    print("\nNo missing values in the updated resampled dataset.")

# Column means (NaN entries are skipped by DataFrame.mean by default).
# NOTE(review): the means are taken over all resampled rows, including rows
# that later end up in the test split.
mean_values = X_smote.mean()

# Fill each NaN with its column mean; reassignment replaces inplace=True.
X_smote = X_smote.fillna(mean_values)

# Per-column NaN counts after imputation.
missing_values_after = X_smote.isnull().sum()
print("Missing values in each column after replacement:\n", missing_values_after)

# A column that was entirely NaN has a NaN mean and would stay NaN after the
# fill; fail here with a clear message instead of inside a later estimator.
assert not X_smote.isnull().values.any(), "NaN values remain after mean imputation"


Missing values in each column after replacing 0 with NaN:
Pregnancies                   0
Glucose                       5
BloodPressure                51
SkinThickness               295
Insulin                     488
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
dtype: int64

Total missing values in the updated resampled dataset: 850

Columns with missing values in the updated resampled dataset:
Glucose            5
BloodPressure     51
SkinThickness    295
Insulin          488
BMI               11
dtype: int64
Missing values in each column after replacement:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64


In [8]:
# Show the resampled feature frame after zero replacement and mean imputation.
print("\nDataset after data cleaning:")

# Bare last expression: rendered by the notebook's rich display (pandas still
# truncates long frames) instead of being wrapped in print().
X_smote


Dataset after data cleaning:
     Pregnancies  Glucose  BloodPressure  SkinThickness     Insulin  \
0              6    148.0           72.0      35.000000  165.414062   
1              1     85.0           66.0      29.000000  165.414062   
2              8    183.0           64.0      29.680851  165.414062   
3              1     89.0           66.0      23.000000   94.000000   
4              0    137.0           40.0      35.000000  168.000000   
..           ...      ...            ...            ...         ...   
995            5    164.0           64.0      29.680851  165.414062   
996            5    107.0           69.0      31.000000  165.414062   
997            4    171.0           83.0      27.000000  154.000000   
998            8    111.0           81.0      32.000000  175.000000   
999            4    144.0           79.0      32.000000  165.414062   

           BMI  DiabetesPedigreeFunction  Age  
0    33.600000                  0.627000   50  
1    26.600000       

## Feature Selection

In [9]:
# Chi-square feature selection (k=6) followed by a linear SVM and a decision
# tree, both scored on a 20% hold-out split.

# Split first, so the selector and the scaler below are fitted on training rows
# only. With the same row count, test_size and random_state, the rows assigned
# to each side are the same as when the selected array is split.
# NOTE(review): X_smote was resampled and mean-imputed on all rows in earlier
# cells, so the hold-out scores can still be optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_smote, y_smote, test_size=0.2, random_state=42
)

# Score features against the label on the training rows and keep the best 6.
chi2_selector = SelectKBest(chi2, k=6)
X_train = chi2_selector.fit_transform(X_train_raw, y_train)
X_test = chi2_selector.transform(X_test_raw)

# Selected-feature view of the full resampled data, kept under its original name.
X_kbest = chi2_selector.transform(X_smote)

# Read the labels from the frame the selector was fitted on (X_smote's columns).
selected_features = X_smote.columns[chi2_selector.get_support()]
print("Selected features by chi-square test:\n", selected_features)

# Standardize using statistics from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Linear-kernel SVM.
svm_model = SVC(kernel='linear', C=1)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Hold-out scores for the SVM.
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_recall = recall_score(y_test, y_pred_svm)
svm_precision = precision_score(y_test, y_pred_svm)
svm_f1 = f1_score(y_test, y_pred_svm)

print("\nSVM Model Evaluation:")
print(f"Accuracy: {svm_accuracy:.2f}")
print(f"Recall: {svm_recall:.2f}")
print(f"Precision: {svm_precision:.2f}")
print(f"F1-Score: {svm_f1:.2f}")

# Decision tree. random_state is fixed because the tree's tie-breaking is
# randomized; without it, re-running the cell can print different scores.
tree_model = DecisionTreeClassifier(criterion='gini', random_state=42)
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

# Hold-out scores for the decision tree.
tree_accuracy = accuracy_score(y_test, y_pred_tree)
tree_recall = recall_score(y_test, y_pred_tree)
tree_precision = precision_score(y_test, y_pred_tree)
tree_f1 = f1_score(y_test, y_pred_tree)

print("\nDecision Tree Model Evaluation:")
print(f"Accuracy: {tree_accuracy:.2f}")
print(f"Recall: {tree_recall:.2f}")
print(f"Precision: {tree_precision:.2f}")
print(f"F1-Score: {tree_f1:.2f}")


Selected features by chi-square test:
 Index(['Pregnancies', 'Glucose', 'SkinThickness', 'Insulin', 'BMI', 'Age'], dtype='object')

SVM Model Evaluation:
Accuracy: 0.74
Recall: 0.73
Precision: 0.75
F1-Score: 0.74

Decision Tree Model Evaluation:
Accuracy: 0.79
Recall: 0.81
Precision: 0.77
F1-Score: 0.79


In [14]:
# Sweep the number of chi-square-selected features (k = 7 down to 2) and, for
# each k, train and score a linear SVM and a decision tree on a 20% hold-out
# split. One loop replaces six copy-pasted blocks; the printed layout is kept.


def print_model_metrics(title, y_true, y_pred):
    """Print accuracy, recall, precision and F1 for one model and return them.

    Parameters
    ----------
    title : str
        Heading printed above the four scores.
    y_true : array-like
        True labels of the evaluated rows.
    y_pred : array-like
        Labels predicted by the model for the same rows.

    Returns
    -------
    tuple of float
        ``(accuracy, recall, precision, f1)`` as computed by scikit-learn.
    """
    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f"\n{title}:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"F1-Score: {f1:.2f}")
    return accuracy, recall, precision, f1


# Feature counts to try, in the order the results are printed.
K_VALUES = (7, 6, 5, 4, 3, 2)
SEPARATOR = "-----------------------------------------------------------------------------------------------"

# The split does not depend on k, so it is done once. Splitting before feature
# selection keeps the selector and scaler from seeing the hold-out rows.
# NOTE(review): X_smote was resampled and mean-imputed on all rows in earlier
# cells, so the hold-out scores can still be optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_smote, y_smote, test_size=0.2, random_state=42
)

for run_index, k in enumerate(K_VALUES):
    # Separator between runs (none before the first or after the last).
    if run_index > 0:
        print(SEPARATOR)

    # Score features on the training rows and keep the k best.
    chi2_selector = SelectKBest(chi2, k=k)
    X_train = chi2_selector.fit_transform(X_train_raw, y_train)
    X_test = chi2_selector.transform(X_test_raw)

    # Selected-feature view of the full resampled data, kept under its original name.
    X_kbest = chi2_selector.transform(X_smote)

    # Read the labels from the frame the selector was fitted on.
    selected_features = X_smote.columns[chi2_selector.get_support()]
    print("Selected features by chi-square test:\n", selected_features)

    # Standardize using statistics from the training rows only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Linear-kernel SVM.
    svm_model = SVC(kernel='linear', C=1)
    svm_model.fit(X_train, y_train)
    y_pred_svm = svm_model.predict(X_test)
    svm_accuracy, svm_recall, svm_precision, svm_f1 = print_model_metrics(
        "SVM Model Evaluation", y_test, y_pred_svm
    )

    # Decision tree; random_state is fixed so repeated runs print the same scores.
    tree_model = DecisionTreeClassifier(criterion='gini', random_state=42)
    tree_model.fit(X_train, y_train)
    y_pred_tree = tree_model.predict(X_test)
    tree_accuracy, tree_recall, tree_precision, tree_f1 = print_model_metrics(
        "Decision Tree Model Evaluation", y_test, y_pred_tree
    )

# As with the original copy-pasted blocks, the names assigned in the loop keep
# the values from the final run (k=2) once the cell has finished.


Selected features by chi-square test:
 Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'Age'],
      dtype='object')

SVM Model Evaluation:
Accuracy: 0.76
Recall: 0.75
Precision: 0.76
F1-Score: 0.76

Decision Tree Model Evaluation:
Accuracy: 0.79
Recall: 0.84
Precision: 0.77
F1-Score: 0.80
-----------------------------------------------------------------------------------------------
Selected features by chi-square test:
 Index(['Pregnancies', 'Glucose', 'SkinThickness', 'Insulin', 'BMI', 'Age'], dtype='object')

SVM Model Evaluation:
Accuracy: 0.74
Recall: 0.73
Precision: 0.75
F1-Score: 0.74

Decision Tree Model Evaluation:
Accuracy: 0.80
Recall: 0.82
Precision: 0.79
F1-Score: 0.81
-----------------------------------------------------------------------------------------------
Selected features by chi-square test:
 Index(['Glucose', 'SkinThickness', 'Insulin', 'BMI', 'Age'], dtype='object')

SVM Model Evaluation:
Accuracy: 0.75
Recall: 0.73
P