In [1]:
!pip install xlsxwriter
!pip install imbalanced-learn
!pip install category_encoders




In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from category_encoders import WOEEncoder
import category_encoders as ce

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.utils import resample
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [3]:
# Read the diabetes prediction dataset from the parent directory and preview the first rows
df = pd.read_csv(filepath_or_buffer='../diabetes_prediction_dataset.csv')

df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [4]:
# Print the column names, dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [5]:
# Count the missing (NaN) entries in every column of the dataframe
df.isna().sum(axis=0)

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [6]:
# List the distinct categories found in the gender column before any filtering
unique_gender_initial = df.loc[:, 'gender'].unique()
print(unique_gender_initial)

['Female' 'Male' 'Other']


In [7]:
# Remove 'Other' from column
# Keep only the rows whose gender is not 'Other'.
# .copy() makes the filtered result an independent frame, so later column
# assignments on df do not raise SettingWithCopyWarning.
df = df[df['gender'] != 'Other'].copy()

In [8]:
# List the distinct gender categories again, now that the 'Other' rows are gone
unique_gender_new = df.loc[:, 'gender'].unique()
print(unique_gender_new)

['Female' 'Male']


In [9]:
# List the distinct categories found in the smoking_history column before recoding
unique_smoking_history_initial = df.loc[:, 'smoking_history'].unique()
print(unique_smoking_history_initial)

['never' 'No Info' 'current' 'former' 'ever' 'not current']


In [10]:
# Count the missing entries in the smoking_history column
nan_values = df['smoking_history'].isnull().sum()

nan_values


0

In [11]:
# Count the null entries in smoking_history once more, this time through isnull().
df['smoking_history'].isnull().sum()

0

We recode `smoking_history` into a binary variable: “never” and “No Info” are categorized as “no”, while “current”, “former”, “ever” and “not current” are categorized as “yes”.

In [12]:
# Collapse the six smoking_history categories into a binary "no"/"yes" flag.
new_values = {
    "never": "no",
    "No Info": "no",
    "current": "yes",
    "former": "yes",
    "ever": "yes",
    "not current": "yes",
}
# replace() leaves values that are not keys of new_values untouched, so running this
# cell a second time keeps the already-recoded "no"/"yes" values; map() would turn
# them into NaN. assign() returns a new frame instead of writing into a filtered slice.
df = df.assign(smoking_history=df['smoking_history'].replace(new_values))


In [13]:
# Confirm which categories remain in smoking_history after the recoding
unique_smoking_history_new = df.loc[:, 'smoking_history'].unique()

unique_smoking_history_new

array(['no', 'yes'], dtype=object)

In [14]:
# Display the frame after the gender filter and the smoking_history recoding.
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,no,25.19,6.6,140,0
1,Female,54.0,0,0,no,27.32,6.6,80,0
2,Male,28.0,0,0,no,27.32,5.7,158,0
3,Female,36.0,0,0,yes,23.45,5.0,155,0
4,Male,76.0,1,1,yes,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,no,27.32,6.2,90,0
99996,Female,2.0,0,0,no,17.37,6.5,100,0
99997,Male,66.0,0,0,yes,27.83,5.7,155,0
99998,Female,24.0,0,0,no,35.42,4.0,100,0


Encoding gender and smoking history using Weight of Evidence (WOE) encoding

In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from category_encoders import WOEEncoder
import category_encoders as ce

from sklearn.preprocessing import LabelEncoder

from sklearn.utils import resample
# Switch matplotlib to the 'ggplot' style sheet for the figures drawn after this point.
plt.style.use('ggplot')

Applying WOE Encoding for Smoking History

In [16]:
# Build a Weight of Evidence encoder restricted to the smoking_history column
woe_encoder_smoking_history = WOEEncoder(cols=['smoking_history'])


In [17]:
# Fit the encoder against the diabetes target and keep the encoded values in a
# one-column frame whose column is named smoking_history_encoded.
# NOTE(review): the encoder is fitted on every row of df together with the target;
# if a train/test split follows, confirm this does not leak target information.
encoded_smoking_history_df = woe_encoder_smoking_history.fit_transform(
    df[['smoking_history']], df['diabetes']
).rename(columns={'smoking_history': 'smoking_history_encoded'})



In [18]:
# Append the encoded smoking_history column to the cleaned frame, side by side
df_with_outliers = pd.concat(objs=[df, encoded_smoking_history_df], axis='columns')

In [19]:
# Preview the first rows to check that smoking_history_encoded was appended.
df_with_outliers.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_encoded
0,Female,80.0,0,1,no,25.19,6.6,140,0,-0.246527
1,Female,54.0,0,0,no,27.32,6.6,80,0,-0.246527
2,Male,28.0,0,0,no,27.32,5.7,158,0,-0.246527
3,Female,36.0,0,0,yes,23.45,5.0,155,0,0.450465
4,Male,76.0,1,1,yes,20.14,4.8,155,0,0.450465


Applying WOE Encoding for Gender

In [20]:
# Build a Weight of Evidence encoder restricted to the gender column
# (despite the "_and_smoking" suffix in its name, only gender is listed in cols)
woe_encoder_gender_and_smoking = WOEEncoder(cols=['gender'])


In [21]:
# Fit the gender encoder against the diabetes target and keep the encoded values in a
# one-column frame whose column is named gender_encoded.
# NOTE(review): fitted on every row together with the target; if a train/test split
# follows, confirm this does not leak target information.
encoded_smoking_history__gender_df = woe_encoder_gender_and_smoking.fit_transform(
    df_with_outliers[['gender']], df_with_outliers['diabetes']
).rename(columns={'gender': 'gender_encoded'})

In [22]:
# Append the encoded gender column next to the frame that already holds the encoded smoking_history
df_with_outliers_encoded_gender_smoking = pd.concat(
    objs=[df_with_outliers, encoded_smoking_history__gender_df],
    axis='columns',
)

In [23]:
# Display the frame that now carries both smoking_history_encoded and gender_encoded.
df_with_outliers_encoded_gender_smoking

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_encoded,gender_encoded
0,Female,80.0,0,1,no,25.19,6.6,140,0,-0.246527,-0.119227
1,Female,54.0,0,0,no,27.32,6.6,80,0,-0.246527,-0.119227
2,Male,28.0,0,0,no,27.32,5.7,158,0,-0.246527,0.150651
3,Female,36.0,0,0,yes,23.45,5.0,155,0,0.450465,-0.119227
4,Male,76.0,1,1,yes,20.14,4.8,155,0,0.450465,0.150651
...,...,...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,no,27.32,6.2,90,0,-0.246527,-0.119227
99996,Female,2.0,0,0,no,17.37,6.5,100,0,-0.246527,-0.119227
99997,Male,66.0,0,0,yes,27.83,5.7,155,0,0.450465,0.150651
99998,Female,24.0,0,0,no,35.42,4.0,100,0,-0.246527,-0.119227


In [24]:
# Raw categorical columns to drop now that their WOE-encoded counterparts exist
columns_to_remove = [
    'gender',
    'smoking_history',
]

In [25]:
# Drop the raw categorical columns and keep an independent copy of the result for scaling
df_for_normalization = (
    df_with_outliers_encoded_gender_smoking
    .drop(columns=columns_to_remove)
    .copy()
)

In [26]:
# Display the frame after the raw gender and smoking_history columns were dropped.
df_for_normalization

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_encoded,gender_encoded
0,80.0,0,1,25.19,6.6,140,0,-0.246527,-0.119227
1,54.0,0,0,27.32,6.6,80,0,-0.246527,-0.119227
2,28.0,0,0,27.32,5.7,158,0,-0.246527,0.150651
3,36.0,0,0,23.45,5.0,155,0,0.450465,-0.119227
4,76.0,1,1,20.14,4.8,155,0,0.450465,0.150651
...,...,...,...,...,...,...,...,...,...
99995,80.0,0,0,27.32,6.2,90,0,-0.246527,-0.119227
99996,2.0,0,0,17.37,6.5,100,0,-0.246527,-0.119227
99997,66.0,0,0,27.83,5.7,155,0,0.450465,0.150651
99998,24.0,0,0,35.42,4.0,100,0,-0.246527,-0.119227


In [35]:
# Write the encoded, not-yet-scaled dataset to CSV without the row index
df_for_normalization.to_csv(
    path_or_buf='Original Dataset with Outliers Included.csv',
    index=False,
)

Normalization (z-score standardization) of the above dataset

In [37]:
from sklearn.preprocessing import StandardScaler

# Columns to standardize; hypertension, heart_disease and the diabetes target
# are not listed here and therefore stay unchanged
numerical_columns = [
    'age',
    'bmi',
    'HbA1c_level',
    'blood_glucose_level',
    'smoking_history_encoded',
    'gender_encoded',
]

# Fit the scaler on the selected columns, then overwrite them with the scaled values
scaler = StandardScaler()
scaler.fit(df_for_normalization[numerical_columns])
df_for_normalization[numerical_columns] = scaler.transform(df_for_normalization[numerical_columns])

df_for_normalization

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_encoded,gender_encoded
0,1.692577,0,1,-0.321051,1.001692,0.047709,0,-0.640425,-0.841175
1,0.537899,0,0,-0.000114,1.001692,-1.426157,0,-0.640425,-0.841175
2,-0.616779,0,0,-0.000114,0.161089,0.489869,0,-0.640425,1.188813
3,-0.261494,0,0,-0.583225,-0.492714,0.416175,0,1.561464,-0.841175
4,1.514935,1,1,-1.081957,-0.679515,0.416175,0,1.561464,1.188813
...,...,...,...,...,...,...,...,...,...
99995,1.692577,0,0,-0.000114,0.628091,-1.180513,0,-0.640425,-0.841175
99996,-1.771458,0,0,-1.499326,0.908292,-0.934869,0,-0.640425,-0.841175
99997,1.070828,0,0,0.076730,0.161089,0.416175,0,1.561464,1.188813
99998,-0.794422,0,0,1.220350,-1.426718,-0.934869,0,-0.640425,-0.841175


In [38]:
# Write the standardized dataset to its own CSV file without the row index
df_for_normalization.to_csv(
    path_or_buf='Original Dataset with Outliers Included Normalized.csv',
    index=False,
)