# Generate Dataset Summary Report using Pandas Profiling

## Load the Dataset

In [1]:
import pandas as pd

df = pd.read_csv('diabetes_prediction_dataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


## Install pandas-profiling
⚠️ pandas-profiling package naming was changed. To continue profiling data use ydata-profiling instead!

https://pypi.org/project/pandas-profiling/

In [2]:
!pip install pandas-profiling

Collecting pandas-profiling
  Downloading pandas_profiling-3.6.6-py2.py3-none-any.whl (324 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.4/324.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ydata-profiling (from pandas-profiling)
  Downloading ydata_profiling-4.7.0-py2.py3-none-any.whl (357 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m357.9/357.9 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting visions[type_image_path]<0.7.7,>=0.7.5 (from ydata-profiling->pandas-profiling)
  Downloading visions-0.7.6-py3-none-any.whl (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.8/104.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting htmlmin==0.1.12 (from ydata-profiling->pandas-profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting phik<0.13,>=0.11.1 (from ydata-profiling->pandas-profiling)
  Downloading phik-0.12.4-cp310-cp3

In [3]:
from ydata_profiling import ProfileReport

# Initialize profile report instance
profile = ProfileReport(df, title="Profile Report")
# Generate dataset report
profile.to_file(output_file="Diabetes Dataset Report.html")



Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Solve dataset imbalance issues using SMOTE

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [5]:
df.diabetes.value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

## Label Encoding

In [6]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [7]:
from sklearn.preprocessing import LabelEncoder

cat = ['gender', 'smoking_history']
for i in cat:
  encoder = LabelEncoder()
  encoded_data = encoder.fit_transform(df[i])
  df[i] = encoded_data

In [8]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,4,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,4,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0


## SMOTE (Synthetic Minority Over-sampling Technique)

In [10]:
!pip install imbalanced-learn



In [11]:
from sklearn.model_selection import train_test_split

y = df['diabetes']
x = df.drop(columns='diabetes')

In [12]:
from imblearn.over_sampling import SMOTE

# Create a SMOTE object
smote = SMOTE(sampling_strategy='auto', random_state=42)
# Apply SMOTE on X Y
x_resampled, y_resampled = smote.fit_resample(x, y)
# Convert back to pandas types
x_resampled_df = pd.DataFrame(x_resampled, columns=x.columns)
y_resampled_df = pd.Series(y_resampled, name=y.name)

In [19]:
x_resampled

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
0,0,80.000000,0,1,4,25.190000,6.600000,140
1,0,54.000000,0,0,0,27.320000,6.600000,80
2,1,28.000000,0,0,4,27.320000,5.700000,158
3,0,36.000000,0,0,1,23.450000,5.000000,155
4,1,76.000000,1,1,1,20.140000,4.800000,155
...,...,...,...,...,...,...,...,...
182995,0,59.503975,1,1,3,28.064037,6.401590,155
182996,0,80.000000,0,0,4,29.462296,6.489903,159
182997,0,65.671103,0,0,4,27.458384,8.865779,200
182998,0,70.170151,0,0,3,37.724239,6.207462,280


In [13]:
# Combine df
resampled_df = pd.concat([x_resampled_df, y_resampled_df], axis=1)

In [15]:
resampled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183000 entries, 0 to 182999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               183000 non-null  int64  
 1   age                  183000 non-null  float64
 2   hypertension         183000 non-null  int64  
 3   heart_disease        183000 non-null  int64  
 4   smoking_history      183000 non-null  int64  
 5   bmi                  183000 non-null  float64
 6   HbA1c_level          183000 non-null  float64
 7   blood_glucose_level  183000 non-null  int64  
 8   diabetes             183000 non-null  int64  
dtypes: float64(3), int64(6)
memory usage: 12.6 MB


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  int64  
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  int64  
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(6)
memory usage: 6.9 MB


In [17]:
df.diabetes.value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

In [18]:
resampled_df.diabetes.value_counts()

diabetes
0    91500
1    91500
Name: count, dtype: int64

In [20]:
resampled_df.to_csv("SMOTE_Diabetes_Dataset.csv")