In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from category_encoders import WOEEncoder
import category_encoders as ce

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.utils import resample
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Use matplotlib's 'ggplot' style sheet for every figure drawn in this notebook.
plt.style.use('ggplot')


1. EDA

In [None]:
# Read the raw dataset from the CSV file in the working directory.
df = pd.read_csv('diabetes_prediction_dataset.csv')
# Preview the first rows; head() returns five rows by default.
df.head()

In [None]:
# Print pandas' built-in summary of the loaded frame.
df.info()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Count the rows in each target class; label 1 is reported as "Yes", label 0 as "No".
is_diabetic = df["diabetes"].value_counts()
for caption, class_label in (("Yes: ", 1), ("No: ", 0)):
    print(caption, is_diabetic[class_label])

The dataset is imbalanced: the two classes of the target variable `diabetes` are not equally represented.

In [None]:
# Data-quality check: NaN count per column, then the number of rows that
# exactly repeat an earlier row.
missing_per_column = df.isna().sum()
duplicate_row_count = df.duplicated().sum()

print("Missing Values")
print(missing_per_column)
print("Duplicates:", duplicate_row_count)

In [None]:
# Descriptive statistics for every column; include="all" also covers the
# non-numeric columns.
df.describe(include="all")

2. Data Visualization

In [None]:
# Pie chart of the target classes.
is_diabetic = df["diabetes"].value_counts()
plt.figure(figsize=(10, 6))
plt.subplot(1, 2, 1)  # Subplot for the pie chart
# value_counts() orders its result by frequency, not by class label. Pin the
# order to [0, 1] so the hard-coded "No"/"YES" labels always land on the
# matching slices, whichever class happens to be larger.
plt.pie(is_diabetic.reindex([0, 1], fill_value=0), labels=["No", "YES"], autopct="%0.0f%%")
plt.title("is_diabetic Counts")
plt.tight_layout()  # Adjust layout to prevent overlapping
plt.show()


Gender Distribution

In [None]:
fig, axb = plt.subplots(ncols=2, nrows=1, figsize=(15, 8))

# Gender distribution: share of rows per gender (left panel) and row counts
# split by diabetic status (right panel).
gender_counts = df.groupby('gender')['diabetes'].count()
# One explode offset per slice, so the pie still renders when the number of
# distinct gender values is not exactly three.
explode = [0.1] * len(gender_counts)
gender_counts.plot.pie(explode=explode, autopct="%1.1f%%", ax=axb[0])

ax = sns.countplot(x="gender", hue="diabetes", data=df, ax=axb[1], palette="viridis")

# Add values on top of each bar. Patches without a positive height are skipped:
# these are NaN bars for empty gender/status combinations, or zero-sized
# placeholder patches that some seaborn versions add to the Axes. Heights are
# shown as whole numbers because they are counts.
for p in ax.patches:
    height = p.get_height()
    if not height > 0:
        continue
    ax.annotate(f'{int(height)}', (p.get_x() + p.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

# Set labels and title directly on the count-plot Axes instead of relying on
# pyplot's implicit "current axes".
ax.set_title("Distribution of Gender with Diabetic Status")
ax.set_xlabel("Gender")
ax.set_ylabel("Count")

# Show the plot
plt.show()

Age Distribution

In [None]:
# Age distribution: one histogram per target class, overlaid on the same axes.
# Each entry is (diabetes value, opacity, legend label, bar colour).
age_series_specs = (
    (1, 1, 'Diabetic', 'red'),
    (0, 0.5, 'Non-Diabetic', 'blue'),
)
for status, opacity, legend_label, colour in age_series_specs:
    ages = df[df['diabetes'] == status]['age']
    plt.hist(ages, bins=20, alpha=opacity, label=legend_label, color=colour)

plt.title('Distribution of Age with Diabetic Status')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.legend()
plt.show()

Hypertension Distribution

In [None]:
fig, axb = plt.subplots(ncols=2, nrows=1, figsize=(15, 8))

# Hypertension distribution: share of rows per hypertension value (left panel)
# and row counts split by diabetic status (right panel).
hypertension_counts = df.groupby('hypertension')['diabetes'].count()
# One explode offset per slice instead of a hard-coded two-element list.
explode = [0.1] * len(hypertension_counts)
hypertension_counts.plot.pie(explode=explode, autopct="%1.1f%%", ax=axb[0])

ax = sns.countplot(x="hypertension", hue="diabetes", data=df, ax=axb[1], palette="viridis")

# Add values on top of each bar. Patches without a positive height (NaN or
# zero-sized placeholder patches) are skipped so no stray labels are drawn,
# and heights are shown as whole numbers because they are counts.
for p in ax.patches:
    height = p.get_height()
    if not height > 0:
        continue
    ax.annotate(f'{int(height)}', (p.get_x() + p.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

# Set labels and title directly on the count-plot Axes.
ax.set_title("Distribution of Hypertension with Diabetic Status")
ax.set_xlabel("Hypertension")
ax.set_ylabel("Count")

# Show the plot
plt.show()

Heart Disease Distribution

In [None]:
fig, axb = plt.subplots(ncols=2, nrows=1, figsize=(15, 8))

# Heart disease distribution: share of rows per heart_disease value (left
# panel) and row counts split by diabetic status (right panel).
heart_disease_counts = df.groupby('heart_disease')['diabetes'].count()
# One explode offset per slice instead of a hard-coded two-element list.
explode = [0.1] * len(heart_disease_counts)
heart_disease_counts.plot.pie(explode=explode, autopct="%1.1f%%", ax=axb[0])

ax = sns.countplot(x="heart_disease", hue="diabetes", palette="viridis", data=df, ax=axb[1])

# Add values on top of each bar. Patches without a positive height (NaN or
# zero-sized placeholder patches) are skipped so no stray labels are drawn,
# and heights are shown as whole numbers because they are counts.
for p in ax.patches:
    height = p.get_height()
    if not height > 0:
        continue
    ax.annotate(f'{int(height)}', (p.get_x() + p.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

# Set labels and title directly on the count-plot Axes.
ax.set_title("Distribution of Heart Disease with Diabetic Status")
ax.set_xlabel("Heart Disease")
ax.set_ylabel("Count")

# Show the plot
plt.show()

Smoking History Distribution

In [None]:
# Smoking history bar chart: row counts per category, split by diabetic status.
plt.figure(figsize=(10, 6))
# Bind the Axes that countplot draws on. Without this assignment the loop
# below would use whatever `ax` an earlier cell left in the kernel and put the
# labels on that old plot (or raise NameError on a fresh kernel).
ax = sns.countplot(x="smoking_history", data=df, hue="diabetes", palette="viridis")

# Add values on top of each bar. Patches without a positive height (NaN or
# zero-sized placeholder patches) are skipped so no stray labels are drawn,
# and heights are shown as whole numbers because they are counts.
for p in ax.patches:
    height = p.get_height()
    if not height > 0:
        continue
    ax.annotate(f'{int(height)}', (p.get_x() + p.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

# Set labels and title
ax.set_title("Smoking History Category with Diabetic Status")
ax.set_xlabel("Smoking History")
ax.set_ylabel("Count")

# Show the plot
plt.show()

BMI Distribution

In [None]:
# BMI distribution: one histogram per target class, overlaid on the same axes.
diabetic_bmi = df.loc[df['diabetes'] == 1, 'bmi']
non_diabetic_bmi = df.loc[df['diabetes'] == 0, 'bmi']
plt.hist(diabetic_bmi, bins=20, alpha=1, label='Diabetic', color='red')
plt.hist(non_diabetic_bmi, bins=20, alpha=0.5, label='Non-Diabetic', color='blue')

plt.title('Distribution of BMI with Diabetic Status')
plt.xlabel('BMI')
plt.ylabel('Frequency')
plt.legend()
# Render the figure explicitly, as the age-distribution cell does. Without it
# the cell's last expression (the Legend object) is echoed as text output.
plt.show()

Hemoglobin A1c Distribution

Hemoglobin A1c (HbA1c):

Definition: HbA1c is a measure of the average blood glucose levels over the past two to three months.

How it works: When glucose enters the bloodstream, it binds to hemoglobin in red blood cells. The higher the blood glucose levels, the more glucose binds to hemoglobin. HbA1c reflects the percentage of hemoglobin that has glucose attached to it.

Usage: It is a long-term marker of blood sugar control and is commonly used for monitoring and managing diabetes. It provides an average of blood sugar levels over time, offering a more stable indicator than daily or frequent glucose measurements.


In [None]:
# HbA1c_level distribution: one histogram per target class, overlaid on the
# same axes.
diabetic_hba1c = df.loc[df['diabetes'] == 1, 'HbA1c_level']
non_diabetic_hba1c = df.loc[df['diabetes'] == 0, 'HbA1c_level']
plt.hist(diabetic_hba1c, bins=10, alpha=1, label='Diabetic', color='red')
plt.hist(non_diabetic_hba1c, bins=10, alpha=0.5, label='Non-Diabetic', color='blue')

# Title spelling corrected ("Hemglobin" -> "Hemoglobin").
plt.title('Distribution of Hemoglobin A1c level over the past 2 to 3 months with Diabetic Status')
plt.xlabel('HbA1c Level')
plt.ylabel('Frequency')
plt.legend()
# Render the figure explicitly, as the age-distribution cell does. Without it
# the cell's last expression (the Legend object) is echoed as text output.
plt.show()

Blood Glucose Level Distribution

Blood Glucose Level:

Definition: Blood glucose level represents the concentration of glucose in the blood at a specific point in time.

How it works: Glucose is the primary source of energy for cells, and blood glucose levels can fluctuate throughout the day based on factors like food intake, physical activity, and insulin production.

Usage: Blood glucose levels are often measured through daily monitoring, especially for people with diabetes. It provides immediate information about how the body is handling glucose at a particular moment.

In [None]:
# Blood glucose level distribution: one histogram per target class, overlaid
# on the same axes.
diabetic_glucose = df.loc[df['diabetes'] == 1, 'blood_glucose_level']
non_diabetic_glucose = df.loc[df['diabetes'] == 0, 'blood_glucose_level']
plt.hist(diabetic_glucose, bins=10, alpha=1, label='Diabetic', color='red')
plt.hist(non_diabetic_glucose, bins=10, alpha=0.5, label='Non-Diabetic', color='blue')

plt.title('Distribution of Blood Glucose Level at a Specific Point of Time with Diabetic Status')
plt.xlabel('Blood Glucose Level')
plt.ylabel('Frequency')
plt.legend()
# Render the figure explicitly, as the age-distribution cell does. Without it
# the cell's last expression (the Legend object) is echoed as text output.
plt.show()

Box Plots

In [None]:
# 2x2 grid of box plots: each panel compares one numeric column between the
# two diabetes classes.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

# (column to plot, panel title), listed in row-major panel order.
box_plot_panels = [
    ('age', 'Age Distribution by Diabetes Status'),
    ('bmi', 'BMI Distribution by Diabetes Status'),
    ('HbA1c_level', 'HbA1c Level Distribution by Diabetes Status'),
    ('blood_glucose_level', 'Blood Glucose Level Distribution by Diabetes Status'),
]
for panel_ax, (column, panel_title) in zip(axes.flat, box_plot_panels):
    sns.boxplot(x='diabetes', y=column, data=df, ax=panel_ax)
    panel_ax.set_title(panel_title)

# Adjust layout so neighbouring panels do not overlap, then render.
plt.tight_layout()
plt.show()


Correlation Matrix

In [None]:
# Names of the numeric columns only; non-numeric columns are left out of the
# correlation.
numerical_columns = df.select_dtypes(include=['number']).columns

# Pairwise correlation coefficients between the numeric columns.
corr_matrix = df.loc[:, numerical_columns].corr()

# Heatmap of the coefficients, each cell annotated to two decimals.
plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
heatmap_ax.set_title('Correlation Coefficients Heatmap')
plt.show()


3. Data Pre-processing

3.1 Handling Missing Data

In [None]:
# Report the NaN count of every column, one line per column.
nan_counts = df.isna().sum()
for column_name, total_na_rows in nan_counts.items():
    print(f"Total NaN rows in column {column_name}: {total_na_rows}")

Since there is no missing data, no imputation or row removal is needed.

3.2 Handling Outliers

In [None]:
# Create an empty DataFrame to store outliers. It starts with the same column
# labels as df and no rows; the rows flagged as outliers are stored under this
# name further down in this cell.
df_outliers_only = pd.DataFrame(columns=df.columns)

def three_sd_range(series):
    """Return the bounds three standard deviations either side of the mean.

    Parameters
    ----------
    series : object exposing ``mean()`` and ``std()``
        In this notebook, a numeric column of ``df``.

    Returns
    -------
    tuple
        ``(mean - 3 * std, mean + 3 * std)``.
    """
    centre = series.mean()
    spread = 3 * series.std()
    return (centre - spread, centre + spread)

# Iterate through numerical columns without binary values
# columns - age, bmi, HbA1c_level, blood_glucose_level
outlier_masks = {}
for col_name in ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']:
    lower, upper = three_sd_range(df[col_name])
    # True for rows whose value lies outside the 3-standard-deviation band.
    outliers_mask = (df[col_name] < lower) | (df[col_name] > upper)
    outlier_masks[col_name] = outliers_mask

    # Print information about outliers in each column
    print(f"{col_name} has outliers: {outliers_mask.any()}")

# Select every flagged row in one indexing step instead of concatenating onto
# an empty frame inside the loop. Each outlier row now appears once, even when
# it is an outlier in several columns, and the column dtypes of df are kept.
is_outlier_row = pd.DataFrame(outlier_masks).any(axis=1)
df_outliers_only = df[is_outlier_row]

# Display the DataFrame with outliers only
print("\nDataFrame with Outliers Only:")
df_outliers_only


3.3 Data Encoding

In [None]:
# Check the data type of every column before encoding the categorical
# columns below.
df.dtypes

Gender Encoding

WOEEncoder

`WOEEncoder` is a categorical encoding technique used in machine learning, particularly for handling categorical variables in predictive modeling tasks such as binary classification. WOE stands for "Weight of Evidence." It is designed for categorical variables, which can include both ordinal and nominal data, and it is a popular encoding technique in credit scoring and fraud detection.

In summary, while label encoding simply assigns numerical labels to categories, WOE encoding calculates numerical values based on the relationship between each category and the target variable, providing more meaningful representations for categorical variables in certain modeling contexts, especially those where the predictive power of categorical variables is crucial.

In [None]:
# List the distinct values of the gender column.
df['gender'].unique()

In [None]:
# Weight-of-Evidence encoding of the gender column against the diabetes target.
# NOTE(review): the encoder is fitted on the full dataset, i.e. before any
# train/test split, so target information can leak into the feature; confirm
# this is acceptable for the later evaluation.
woe_encoder = ce.WOEEncoder(cols=['gender'])

# Fit on (gender, diabetes) and encode the same column in one call.
encoded_gender_df = woe_encoder.fit_transform(df['gender'], df['diabetes'])

# Append the encoded column(s) to the right of the original frame.
# NOTE(review): the encoded output presumably keeps the label 'gender', which
# would leave the combined frame with two columns of that name; verify and
# rename or replace if downstream cells select by label.
df_encoded_gender = pd.concat([df, encoded_gender_df], axis="columns")

print(df_encoded_gender.head())

Smoking History Encoding

In [None]:
# List the distinct smoking-history categories. The column is named
# 'smoking_history' (with an underscore), as in every other cell of this
# notebook; the space-separated spelling raises KeyError.
df['smoking_history'].unique()

In [None]:
# Weight-of-Evidence encoding of the smoking_history column against the
# diabetes target.
# NOTE(review): the encoder is fitted on the full dataset, i.e. before any
# train/test split, so target information can leak into the feature; confirm
# this is acceptable for the later evaluation.
woe_encoder = ce.WOEEncoder(cols=['smoking_history'])

# Fit on (smoking_history, diabetes) and encode the same column in one call.
encoded_smoking_history_df = woe_encoder.fit_transform(df['smoking_history'], df['diabetes'])

# Append the encoded column(s) to the frame that already carries the encoded
# gender.
# NOTE(review): the encoded output presumably keeps the label
# 'smoking_history', which would duplicate that column name in df_encoded;
# verify and rename or replace if downstream cells select by label.
df_encoded = pd.concat([df_encoded_gender, encoded_smoking_history_df], axis="columns")

print(df_encoded.head())

3.4 Downsampling 

Downsampling involves reducing the number of instances in the majority class to balance it with the number of instances in the minority class.

This helps prevent the machine learning model from being biased towards the majority class and improves its ability to learn patterns from the minority class.

In [None]:
# Re-print the class sizes of the target before balancing them.
is_diabetic = df["diabetes"].value_counts()
diabetic_rows = is_diabetic[1]
non_diabetic_rows = is_diabetic[0]
print(f"Yes:  {diabetic_rows}")
print(f"No:  {non_diabetic_rows}")

We have assessed that the dataset is imbalanced: the two values of the target variable `diabetes` are not equally represented.

In [None]:
# Split the encoded frame by target class.
No_class = df_encoded[df_encoded["diabetes"] == 0]
Yes_class = df_encoded[df_encoded["diabetes"] == 1]

# Draw, without replacement, as many "No" rows as there are "Yes" rows.
# A fixed random_state makes the draw repeatable, so the notebook gives the
# same downsampled frame on every Restart & Run All.
DOWNSAMPLE_SEED = 42
No_class = resample(No_class, replace=False, n_samples=len(Yes_class), random_state=DOWNSAMPLE_SEED)
down_samples = pd.concat([Yes_class, No_class], axis=0)

# Class counts before and after downsampling. value_counts() orders by
# frequency, and the order for tied counts (the balanced frame) is not tied to
# the class label. Sorting by label keeps bar positions, colours and tick
# labels consistent between the two charts.
original_class_counts = df_encoded["diabetes"].value_counts().sort_index()
downsampled_class_counts = down_samples["diabetes"].value_counts().sort_index()

# Calculate the percentage of each class
original_percentages = original_class_counts / len(df_encoded) * 100
downsampled_percentages = downsampled_class_counts / len(down_samples) * 100


def _plot_class_distribution(target_ax, class_counts, class_percentages, title):
    """Draw one bar per class on target_ax, labelled with the class percentage.

    Parameters
    ----------
    target_ax : matplotlib Axes to draw on.
    class_counts : Series of row counts indexed by class label.
    class_percentages : Series of percentages in the same order as class_counts.
    title : str, title of the panel.

    Returns
    -------
    The bar container created by ``Axes.bar``.
    """
    bars = target_ax.bar(class_counts.index, class_counts.values, color=['orange', 'green'])
    for bar, percentage in zip(bars, class_percentages):
        target_ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 5,
                       f'{percentage:.2f}%', ha='center', va='bottom')
    target_ax.set_title(title)
    target_ax.set_xlabel('Class')
    target_ax.set_ylabel('Count')
    # Derive the tick labels from the class labels themselves so a tick can
    # never be captioned with the other class.
    target_ax.set_xticks(list(class_counts.index))
    target_ax.set_xticklabels([str(class_label) for class_label in class_counts.index])
    return bars


# Plotting: original distribution on the left, downsampled on the right.
fig, (ax_original, ax_downsampled) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

bars_1 = _plot_class_distribution(ax_original, original_class_counts, original_percentages,
                                  'Original Diabetes Class Distribution')
bars_2 = _plot_class_distribution(ax_downsampled, downsampled_class_counts, downsampled_percentages,
                                  'Downsampled Diabetes Class Distribution')

plt.tight_layout()  # keep the two panels and their labels from overlapping
plt.show()

In [None]:
# Display the downsampled frame built in the previous cell.
down_samples