In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# importing necessary librarires
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# read the dataset
pd.set_option('display.max_columns',None)
df = pd.read_csv("C:/Users/User/Downloads/diabetes_prediction_dataset.csv")
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


# Dive Into Data

In [4]:
# information of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [5]:
# statistical summary of numeric columns
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,100000.0,41.885856,22.51684,0.08,24.0,43.0,60.0,80.0
hypertension,100000.0,0.07485,0.26315,0.0,0.0,0.0,0.0,1.0
heart_disease,100000.0,0.03942,0.194593,0.0,0.0,0.0,0.0,1.0
bmi,100000.0,27.320767,6.636783,10.01,23.63,27.32,29.58,95.69
HbA1c_level,100000.0,5.527507,1.070672,3.5,4.8,5.8,6.2,9.0
blood_glucose_level,100000.0,138.05806,40.708136,80.0,100.0,140.0,159.0,300.0
diabetes,100000.0,0.085,0.278883,0.0,0.0,0.0,0.0,1.0


In [6]:
# summary of categorical column
df.describe(include='object')

Unnamed: 0,gender,smoking_history
count,100000,100000
unique,3,6
top,Female,No Info
freq,58552,35816


In [7]:
# details about categorical columns
print(df['gender'].value_counts())
print("\033[1mNOTE: 'gender' column have an unathentic value labeled as 'Other' in number of 18.\033[0m")

Female    58552
Male      41430
Other        18
Name: gender, dtype: int64
[1mNOTE: 'gender' column have an unathentic value labeled as 'Other' in number of 18.[0m


In [8]:
print(df['smoking_history'].value_counts())
print("\033[1mNote: 'smoking_history' have 6 value where a value is labeled as 'No Info' in number of 35816003.\033[0m")

No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: smoking_history, dtype: int64
[1mNote: 'smoking_history' have 6 value where a value is labeled as 'No Info' in number of 35816003.[0m


# Handling Missing Values

**Both "gender" and "smoking_history" columns have missing values respectively labeled(missing count) as 'Other(18)' and 'No Info(35816)'**

In [9]:
# droping 'Other' from ['gender'] column
df.drop(df[df['gender']=='Other'].index,inplace=True)
df['gender'].value_counts()

Female    58552
Male      41430
Name: gender, dtype: int64

**Missing values of "somiking_history" column would imputed by KNN Imputer**

In [10]:
# copying the dataframe to another object
df1 = df.copy()

In [11]:
# label encoding on categorical columns
df1['gender'] = df1['gender'].apply(lambda x:1 if x=='Male' else 0)
df1['smoking_history'] = df1['smoking_history'].replace({'never':0,
                                                         'No Info':np.nan,
                                                         'current':1,
                                                         'former':2,
                                                         'ever':3,
                                                         'not current':4})

In [12]:
df1.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,0.0,25.19,6.6,140,0
1,0,54.0,0,0,,27.32,6.6,80,0
2,1,28.0,0,0,0.0,27.32,5.7,158,0
3,0,36.0,0,0,1.0,23.45,5.0,155,0
4,1,76.0,1,1,1.0,20.14,4.8,155,0
