In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Path to your CSV file
file_path = 'diabetes.csv'

# Read the CSV file into a DataFrame
pima = pd.read_csv(file_path)

# Display the contents of the DataFrame
print(pima)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                  

In [3]:
# Display features
print("Features:\n", pima.columns)

Features:
 Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [4]:
pip install imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Data Sampling

In [5]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Display the class distribution before SMOTE
print("Class distribution before SMOTE:", Counter(pima['Outcome']))

# Separate the features and the target variable
X = pima.drop('Outcome', axis=1)
y = pima['Outcome']

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Display the class distribution after SMOTE
print("Class distribution after SMOTE:", Counter(y_smote))

Class distribution before SMOTE: Counter({0: 500, 1: 268})
Class distribution after SMOTE: Counter({1: 500, 0: 500})


## Missing Values

In [6]:
# Check for missing values in the dataset
missing_values = pima.isnull().sum()
print("Missing values in each column:\n", missing_values)

Missing values in each column:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [7]:
# Replace zero values with NaN for the specified columns
columns_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
X_smote[columns_to_replace] = X_smote[columns_to_replace].replace(0, np.nan)

# Check for missing values again in the resampled dataset after replacing 0 with NaN
missing_values_resampled_updated = X_smote.isnull().sum()
print("\nMissing values in each column after replacing 0 with NaN:")
print(missing_values_resampled_updated)

# Display a summary of the total missing values in the updated resampled dataset
total_missing_resampled_updated = missing_values_resampled_updated.sum()
print(f"\nTotal missing values in the updated resampled dataset: {total_missing_resampled_updated}")

# Optionally, display columns with missing values only in the updated resampled dataset
missing_values_only_resampled_updated = missing_values_resampled_updated[missing_values_resampled_updated > 0]
if not missing_values_only_resampled_updated.empty:
    print("\nColumns with missing values in the updated resampled dataset:")
    print(missing_values_only_resampled_updated)
else:
    print("\nNo missing values in the updated resampled dataset.")

# Calculate the mean of each column
mean_values = X_smote.mean()

# Replace NaN values with the mean of each column
X_smote.fillna(mean_values, inplace=True)

# Check again for missing values in the dataset
missing_values_after = X_smote.isnull().sum()
print("Missing values in each column after replacement:\n", missing_values_after)


Missing values in each column after replacing 0 with NaN:
Pregnancies                   0
Glucose                       5
BloodPressure                51
SkinThickness               295
Insulin                     488
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
dtype: int64

Total missing values in the updated resampled dataset: 850

Columns with missing values in the updated resampled dataset:
Glucose            5
BloodPressure     51
SkinThickness    295
Insulin          488
BMI               11
dtype: int64
Missing values in each column after replacement:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64


In [8]:
# Print the current dataset after data cleaning
print("\nDataset after data cleaning:")
print(X_smote)


Dataset after data cleaning:
     Pregnancies  Glucose  BloodPressure  SkinThickness     Insulin  \
0              6    148.0           72.0      35.000000  165.414062   
1              1     85.0           66.0      29.000000  165.414062   
2              8    183.0           64.0      29.680851  165.414062   
3              1     89.0           66.0      23.000000   94.000000   
4              0    137.0           40.0      35.000000  168.000000   
..           ...      ...            ...            ...         ...   
995            5    164.0           64.0      29.680851  165.414062   
996            5    107.0           69.0      31.000000  165.414062   
997            4    171.0           83.0      27.000000  154.000000   
998            8    111.0           81.0      32.000000  175.000000   
999            4    144.0           79.0      32.000000  165.414062   

           BMI  DiabetesPedigreeFunction  Age  
0    33.600000                  0.627000   50  
1    26.600000       

## Feature Selection

In [14]:
# Feature selection using chi-square test
chi2_selector = SelectKBest(chi2, k=7)
X_kbest = chi2_selector.fit_transform(X_smote, y_smote)

# Get the columns selected by the chi-square test
selected_features = X.columns[chi2_selector.get_support()]
print("Selected features by chi-square test:\n", selected_features)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_kbest, y_smote, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# SVM model
svm_model = SVC(kernel='linear', C=1)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Evaluate the SVM model
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_recall = recall_score(y_test, y_pred_svm)
svm_precision = precision_score(y_test, y_pred_svm)
svm_f1 = f1_score(y_test, y_pred_svm)

print("\nSVM Model Evaluation:")
print(f"Accuracy: {svm_accuracy:.2f}")
print(f"Recall: {svm_recall:.2f}")
print(f"Precision: {svm_precision:.2f}")
print(f"F1-Score: {svm_f1:.2f}")

# Decision Tree model
# Decision Tree Model
tree_model = DecisionTreeClassifier(criterion='gini')
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

# Evaluate the Decision Tree model
tree_accuracy = accuracy_score(y_test, y_pred_tree)
tree_recall = recall_score(y_test, y_pred_tree)
tree_precision = precision_score(y_test, y_pred_tree)
tree_f1 = f1_score(y_test, y_pred_tree)

print("\nDecision Tree Model Evaluation:")
print(f"Accuracy: {tree_accuracy:.2f}")
print(f"Recall: {tree_recall:.2f}")
print(f"Precision: {tree_precision:.2f}")
print(f"F1-Score: {tree_f1:.2f}")


Selected features by chi-square test:
 Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'Age'],
      dtype='object')

SVM Model Evaluation:
Accuracy: 0.76
Recall: 0.75
Precision: 0.76
F1-Score: 0.76

Decision Tree Model Evaluation:
Accuracy: 0.79
Recall: 0.83
Precision: 0.77
F1-Score: 0.80


## Comparison between Kernels

In [10]:
# List of kernels to evaluate
kernels = ['linear', 'sigmoid', 'poly', 'rbf']

# Dictionary to store the evaluation results
svm_results = {}

for kernel in kernels:
    # Initialize and train the SVM model with the specified kernel
    svm_model = SVC(kernel=kernel, C=1)
    svm_model.fit(X_train, y_train)
    y_pred_svm = svm_model.predict(X_test)
    
    # Evaluate the SVM model
    accuracy = accuracy_score(y_test, y_pred_svm)
    recall = recall_score(y_test, y_pred_svm)
    precision = precision_score(y_test, y_pred_svm)
    f1 = f1_score(y_test, y_pred_svm)
    
    # Store the results in the dictionary
    svm_results[kernel] = {
        'Accuracy': round(accuracy, 2),
        'Recall': round(recall, 2),
        'Precision': round(precision, 2),
        'F1-Score': round(f1, 2)
    }

# Print the evaluation results for each kernel
print("\nSVM Model Evaluation with Different Kernels:")
for kernel, metrics in svm_results.items():
    print(f"\nKernel: {kernel.capitalize()}")
    print(f"Accuracy: {metrics['Accuracy']}")
    print(f"Recall: {metrics['Recall']}")
    print(f"Precision: {metrics['Precision']}")
    print(f"F1-Score: {metrics['F1-Score']}")



SVM Model Evaluation with Different Kernels:

Kernel: Linear
Accuracy: 0.74
Recall: 0.73
Precision: 0.75
F1-Score: 0.74

Kernel: Sigmoid
Accuracy: 0.64
Recall: 0.63
Precision: 0.65
F1-Score: 0.64

Kernel: Poly
Accuracy: 0.76
Recall: 0.85
Precision: 0.72
F1-Score: 0.78

Kernel: Rbf
Accuracy: 0.78
Recall: 0.86
Precision: 0.74
F1-Score: 0.8


## Grid Search

In [11]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for the RBF kernel
param_grid = {'C': [0.1, 1, 10, 100]}

# Initialize the SVM model with RBF kernel
svm_rbf = SVC(kernel='rbf')

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=svm_rbf, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Print the best parameters found by grid search
print("Best Parameters found by Grid Search:", best_params)

# Predict using the best estimator
y_pred_rbf = best_estimator.predict(X_test)

# Evaluate the best estimator
rbf_accuracy = accuracy_score(y_test, y_pred_rbf)
rbf_recall = recall_score(y_test, y_pred_rbf)
rbf_precision = precision_score(y_test, y_pred_rbf)
rbf_f1 = f1_score(y_test, y_pred_rbf)

print("\nRBF SVM Model Evaluation with Best Parameters:")
print(f"Accuracy: {rbf_accuracy:.2f}")
print(f"Recall: {rbf_recall:.2f}")
print(f"Precision: {rbf_precision:.2f}")
print(f"F1-Score: {rbf_f1:.2f}")

# Iterate through the C values and print the evaluation metrics
for C_value in param_grid['C']:
    svm_model = SVC(kernel='rbf', C=C_value)
    svm_model.fit(X_train, y_train)
    y_pred = svm_model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"\nEvaluation Metrics for RBF SVM with C={C_value}:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"F1-Score: {f1:.2f}")


Best Parameters found by Grid Search: {'C': 10}

RBF SVM Model Evaluation with Best Parameters:
Accuracy: 0.77
Recall: 0.83
Precision: 0.74
F1-Score: 0.78

Evaluation Metrics for RBF SVM with C=0.1:
Accuracy: 0.76
Recall: 0.87
Precision: 0.72
F1-Score: 0.79

Evaluation Metrics for RBF SVM with C=1:
Accuracy: 0.78
Recall: 0.86
Precision: 0.74
F1-Score: 0.80

Evaluation Metrics for RBF SVM with C=10:
Accuracy: 0.77
Recall: 0.83
Precision: 0.74
F1-Score: 0.78

Evaluation Metrics for RBF SVM with C=100:
Accuracy: 0.77
Recall: 0.83
Precision: 0.74
F1-Score: 0.78


## Random Search

In [12]:
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grid for C
param_dist = {'C': np.logspace(-1, 2, 100)}

# Initialize the SVM model with RBF kernel
svm_model_rbf = SVC(kernel='rbf')

# Use RandomizedSearchCV to search for the best C value
random_search = RandomizedSearchCV(svm_model_rbf, param_distributions=param_dist, n_iter=4, random_state=42, cv=3, scoring='accuracy')
random_search.fit(X_train, y_train)

# Print the best C value found
print("\nBest C value found:", random_search.best_params_['C'])

# Get the results for different C values
results = random_search.cv_results_

# Print the evaluation metrics for each C value
print("\nEvaluation metrics for different C values:")
for mean_score, params in zip(results['mean_test_score'], results['params']):
    # Train and predict using the best estimator for the current C value
    svm_model_rbf = SVC(kernel='rbf', C=params['C'])
    svm_model_rbf.fit(X_train, y_train)
    y_pred_svm_rbf = svm_model_rbf.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred_svm_rbf)
    recall = recall_score(y_test, y_pred_svm_rbf)
    precision = precision_score(y_test, y_pred_svm_rbf)
    f1 = f1_score(y_test, y_pred_svm_rbf)
    
    print(f"\nC: {params['C']:.2f}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"F1-Score: {f1:.2f}")



Best C value found: 4.037017258596556

Evaluation metrics for different C values:

C: 32.75
Accuracy: 0.76
Recall: 0.81
Precision: 0.73
F1-Score: 0.77

C: 4.04
Accuracy: 0.77
Recall: 0.83
Precision: 0.74
F1-Score: 0.78

C: 13.22
Accuracy: 0.76
Recall: 0.82
Precision: 0.73
F1-Score: 0.77

C: 2.31
Accuracy: 0.78
Recall: 0.85
Precision: 0.75
F1-Score: 0.80


## Bayesian Optimization

In [13]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Define the parameter grid for Bayesian optimization
param_grid = {'C': [0.1, 1.0, 10.0, 100.0]}

# Initialize the SVM model with 'rbf' kernel
svm_rbf = SVC(kernel='rbf')

# Perform Bayesian optimization for hyperparameter tuning
bayes_search = BayesSearchCV(estimator=svm_rbf, search_spaces=param_grid, n_iter=10, cv=5, scoring='f1', random_state=42)
bayes_search.fit(X_train, y_train)

# Get the best parameters
best_params = bayes_search.best_params_
print("\nBest parameters found by Bayesian optimization:")
print(best_params)

# Evaluate the SVM model with 'rbf' kernel and different C values
C_values = [0.1, 1.0, 10.0, 100.0]
results = {}

for C in C_values:
    svm_rbf = SVC(kernel='rbf', C=C)
    svm_rbf.fit(X_train, y_train)
    y_pred_rbf = svm_rbf.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred_rbf)
    recall = recall_score(y_test, y_pred_rbf)
    precision = precision_score(y_test, y_pred_rbf)
    f1 = f1_score(y_test, y_pred_rbf)
    
    results[C] = {
        'Accuracy': round(accuracy, 2),
        'Recall': round(recall, 2),
        'Precision': round(precision, 2),
        'F1-Score': round(f1, 2)
    }

# Print the evaluation results for each C value
print("\nSVM Model Evaluation with 'rbf' Kernel and Different C Values:")
for C, metrics in results.items():
    print(f"\nC value: {C}")
    print(f"Accuracy: {metrics['Accuracy']}")
    print(f"Recall: {metrics['Recall']}")
    print(f"Precision: {metrics['Precision']}")
    print(f"F1-Score: {metrics['F1-Score']}")



Best parameters found by Bayesian optimization:
OrderedDict([('C', 10.0)])

SVM Model Evaluation with 'rbf' Kernel and Different C Values:

C value: 0.1
Accuracy: 0.76
Recall: 0.87
Precision: 0.72
F1-Score: 0.79

C value: 1.0
Accuracy: 0.78
Recall: 0.86
Precision: 0.74
F1-Score: 0.8

C value: 10.0
Accuracy: 0.76
Recall: 0.83
Precision: 0.74
F1-Score: 0.78

C value: 100.0
Accuracy: 0.76
Recall: 0.83
Precision: 0.74
F1-Score: 0.78
