In [1]:
!pip install xlsxwriter
!pip install imbalanced-learn
!pip install category_encoders
!pip install -U imbalanced-learn



In [2]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [3]:
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from category_encoders import WOEEncoder
import category_encoders as ce

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.utils import resample
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv('../diabetes_prediction_dataset.csv')

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [6]:
#checking empty rows in the dataframe
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

Removing Outliers

In [7]:
# Create an empty DataFrame to store outliers
df_outliers_only = pd.DataFrame(columns=df.columns)

# Dictionary to store the percentage of outliers for each column
percent_outliers = {}

# Function to calculate the 3-standard deviation range
def three_sd_range(series):
    mean = series.mean()
    sd = series.std()
    low = mean - 3 * sd
    high = mean + 3 * sd
    
    return (low, high)

# Iterate through numerical columns without binary values
# columns - age, bmi, HbA1c_level, blood_glucose_level
for col_name in ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']:
    lower, upper = three_sd_range(df[col_name])
    outliers_mask = (df[col_name] < lower) | (df[col_name] > upper)

     # Calculate the percentage of outliers
    num_outliers = outliers_mask.sum()
    total_values = len(df[col_name])
    percent_outliers[col_name] = (num_outliers / total_values) * 100
    
    # Print information about outliers in each column
    print(f"{col_name} has outliers: {outliers_mask.any()}")
    
    # Collect outliers into the DataFrame
    df_outliers_only = pd.concat([df_outliers_only, df[outliers_mask]])

# Print percentage of outliers for each column
print("\nPercentage of Outliers for Each Column:")
for col, percentage in percent_outliers.items():
    print(f"{col}: {percentage}%")

# Display the DataFrame with outliers only
print("\nDataFrame with Outliers Only:")
df_outliers_only

age has outliers: False
bmi has outliers: True
HbA1c_level has outliers: True
blood_glucose_level has outliers: True

Percentage of Outliers for Each Column:
age: 0.0%
bmi: 1.294%
HbA1c_level: 1.315%
blood_glucose_level: 1.403%

DataFrame with Outliers Only:


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
11,Female,54.0,0,0,former,54.70,6.0,100,0
39,Female,34.0,0,0,never,56.43,6.2,200,0
59,Female,67.0,0,0,never,63.48,8.8,155,1
93,Male,38.0,0,0,never,55.61,6.5,130,0
98,Female,30.0,0,0,No Info,50.13,6.0,100,0
...,...,...,...,...,...,...,...,...,...
99763,Female,50.0,0,0,never,35.91,6.0,280,1
99826,Male,63.0,0,1,No Info,27.32,6.6,300,1
99867,Male,64.0,1,0,former,33.12,5.7,300,1
99938,Male,55.0,0,1,former,30.42,6.2,300,1


In [8]:
# Create an empty DataFrame to store records without outliers
df_copy = df.copy()
df_without_outliers = pd.DataFrame(columns=df.columns)

# Exclude outliers from the copied DataFrame
df_without_outliers = df_copy.loc[~df_copy.index.isin(df_outliers_only.index)]

df_without_outliers

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [11]:
# Checking Unique Variables in Gender Column
unique_gender_initial = df_without_outliers['gender'].unique()
print(unique_gender_initial)

['Female' 'Male' 'Other']


In [13]:
# Remove 'Other' from column
df = df_without_outliers[df_without_outliers['gender'] != 'Other']

In [19]:
# Checking Unique Variables in Gender Column after removal of 'other'
unique_gender_new= df_without_outliers['gender'].unique()
print(unique_gender_new)

['Female' 'Male' 'Other']


In [24]:
# Checking Unique Variables in smoking_history Column
unique_smoking_history_initial = df_without_outliers['smoking_history'].unique()
print(unique_smoking_history_initial)

['never' 'No Info' 'current' 'former' 'ever' 'not current']


In [28]:
#checking for NaN values in smoking_history Column
nan_values = df_without_outliers['smoking_history'].isna().sum()

nan_values

0

In [31]:
df_without_outliers['smoking_history'].isnull().sum()

0

We have categorized “never” , “No info” as “no”, “current”, “former”, “ever” and “not current” have been categorized as “yes”.

In [35]:
new_values = {"never": "no", "No Info":"no", "current": "yes",  "former": "yes", "ever": "yes", "not current": "yes", }
df_without_outliers['smoking_history'] = df_without_outliers['smoking_history'].map(new_values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_without_outliers['smoking_history'] = df_without_outliers['smoking_history'].map(new_values)


In [37]:
unique_smoking_history_new = df_without_outliers['smoking_history'].unique()

unique_smoking_history_new

array(['no', 'yes'], dtype=object)

In [38]:
df_without_outliers

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,no,25.19,6.6,140,0
1,Female,54.0,0,0,no,27.32,6.6,80,0
2,Male,28.0,0,0,no,27.32,5.7,158,0
3,Female,36.0,0,0,yes,23.45,5.0,155,0
4,Male,76.0,1,1,yes,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,no,27.32,6.2,90,0
99996,Female,2.0,0,0,no,17.37,6.5,100,0
99997,Male,66.0,0,0,yes,27.83,5.7,155,0
99998,Female,24.0,0,0,no,35.42,4.0,100,0


eNCODING gender and smoking history using WOE Encoding

In [39]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from category_encoders import WOEEncoder
import category_encoders as ce

from sklearn.preprocessing import LabelEncoder

from sklearn.utils import resample
plt.style.use('ggplot')

Applying WOE Encoding for Smoking History

In [40]:
#applying WOE encoding for smoking history
# Applying WOE encoding for smoking history
woe_encoder_smoking_history = ce.WOEEncoder(cols=['smoking_history'])


In [42]:
# Fit and transform the encoder on the data
encoded_smoking_history_df = pd.DataFrame()

encoded_smoking_history_df['smoking_history_encoded'] = woe_encoder_smoking_history.fit_transform(df_without_outliers['smoking_history'], df_without_outliers['diabetes'])

In [43]:
# Concatenate the encoded data with the original DataFrame
df_without_outliers_smoking_history_encoded = pd.concat([df_without_outliers, encoded_smoking_history_df], axis=1)

applying woe encoding for gender

In [44]:
#applying WOE encoding for gender
# Applying WOE encoding for gender
woe_encoder_gender_and_smoking = ce.WOEEncoder(cols=['gender'])

In [45]:
# Fit and transform the encoder on the data
encoded_smoking_history__gender_df = pd.DataFrame()

encoded_smoking_history__gender_df['gender_encoded'] = woe_encoder_gender_and_smoking.fit_transform(df_without_outliers_smoking_history_encoded['gender'], df_without_outliers_smoking_history_encoded['diabetes'])

In [46]:
# Concatenate the encoded data with the original DataFrame
df_with_outliers_encoded_gender_smoking = pd.concat([df_without_outliers_smoking_history_encoded,encoded_smoking_history__gender_df], axis=1)

In [47]:
df_with_outliers_encoded_gender_smoking

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_encoded,gender_encoded
0,Female,80.0,0,1,no,25.19,6.6,140,0,-0.247356,-0.128959
1,Female,54.0,0,0,no,27.32,6.6,80,0,-0.247356,-0.128959
2,Male,28.0,0,0,no,27.32,5.7,158,0,-0.247356,0.160772
3,Female,36.0,0,0,yes,23.45,5.0,155,0,0.452953,-0.128959
4,Male,76.0,1,1,yes,20.14,4.8,155,0,0.452953,0.160772
...,...,...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,no,27.32,6.2,90,0,-0.247356,-0.128959
99996,Female,2.0,0,0,no,17.37,6.5,100,0,-0.247356,-0.128959
99997,Male,66.0,0,0,yes,27.83,5.7,155,0,0.452953,0.160772
99998,Female,24.0,0,0,no,35.42,4.0,100,0,-0.247356,-0.128959


In [48]:
columns_to_remove = ['gender','smoking_history']

In [49]:
# Make a copy of the DataFrame without the specified columns
df_for_normalization = df_with_outliers_encoded_gender_smoking.drop(columns_to_remove, axis=1).copy()

In [50]:
df_for_normalization

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_encoded,gender_encoded
0,80.0,0,1,25.19,6.6,140,0,-0.247356,-0.128959
1,54.0,0,0,27.32,6.6,80,0,-0.247356,-0.128959
2,28.0,0,0,27.32,5.7,158,0,-0.247356,0.160772
3,36.0,0,0,23.45,5.0,155,0,0.452953,-0.128959
4,76.0,1,1,20.14,4.8,155,0,0.452953,0.160772
...,...,...,...,...,...,...,...,...,...
99995,80.0,0,0,27.32,6.2,90,0,-0.247356,-0.128959
99996,2.0,0,0,17.37,6.5,100,0,-0.247356,-0.128959
99997,66.0,0,0,27.83,5.7,155,0,0.452953,0.160772
99998,24.0,0,0,35.42,4.0,100,0,-0.247356,-0.128959


In [51]:
df_for_normalization.to_csv('Original Dataset without Outliers.csv', index=False)

Normalization of the above Dataset

In [52]:
from sklearn.preprocessing import StandardScaler

# Select numerical columns for standardization
numerical_columns = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'smoking_history_encoded', 'gender_encoded']

# Create a StandardScaler instance
scaler = StandardScaler()

df_for_normalization[numerical_columns] = scaler.fit_transform(df_for_normalization[numerical_columns])

df_for_normalization

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_encoded,gender_encoded
0,1.713008,0,1,-0.286437,1.134061,0.126046,0,-0.633042,-0.841116
1,0.560337,0,0,0.072849,1.134061,-1.523079,0,-0.633042,-0.841116
2,-0.592335,0,0,0.072849,0.232946,0.620784,0,-0.633042,1.188683
3,-0.237667,0,0,-0.579938,-0.467921,0.538328,0,1.579675,-0.841116
4,1.535674,1,1,-1.138266,-0.668169,0.538328,0,1.579675,1.188683
...,...,...,...,...,...,...,...,...,...
99995,1.713008,0,0,0.072849,0.733566,-1.248225,0,-0.633042,-0.841116
99996,-1.745006,0,0,-1.605507,1.033937,-0.973371,0,-0.633042,-0.841116
99997,1.092339,0,0,0.158875,0.232946,0.538328,0,1.579675,1.188683
99998,-0.769669,0,0,1.439149,-1.469161,-0.973371,0,-0.633042,-0.841116


In [53]:
df_for_normalization.to_csv('Original Dataset without Outliers Normalized.csv', index=False)