In [2]:
import pandas as pd
import numpy as np
import sklearn 
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import warnings
# Install a global filter that ignores every warning category for this session.
warnings.simplefilter('ignore')

# Seed NumPy's legacy global random state so later np.random calls are repeatable.
SEED = 32
np.random.seed(SEED)

# Diabetes dataset
## This dataset contains health and demographic data for 100,000 individuals

In [3]:
# Location of the source CSV, relative to the notebook's working directory.
DATA_PATH = "diabetes_dataset_with_notes.csv"

# Load the untouched dataset; later cells derive cleaned frames from it.
data_raw = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first five rows of the raw data.
data_raw.head(n=5)

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes,clinical_notes
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,0,never,27.32,5.0,100,0,"Overweight, advised dietary and exercise modif..."
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,0,never,19.95,5.0,90,0,Healthy BMI range.
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,0,never,23.76,4.8,160,0,"Young patient, generally lower risk but needs ..."
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,0,never,27.32,4.0,159,0,"Overweight, advised dietary and exercise modif..."
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,0,never,23.75,6.5,90,0,"Healthy BMI range. High HbA1c level, indicativ..."


In [5]:
# Row count of the raw dataset.
data_raw.shape[0]

100000

In [6]:
# Print each column's non-null count and dtype to check for missing values.
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 17 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   year                  100000 non-null  int64  
 1   gender                100000 non-null  object 
 2   age                   100000 non-null  float64
 3   location              100000 non-null  object 
 4   race:AfricanAmerican  100000 non-null  int64  
 5   race:Asian            100000 non-null  int64  
 6   race:Caucasian        100000 non-null  int64  
 7   race:Hispanic         100000 non-null  int64  
 8   race:Other            100000 non-null  int64  
 9   hypertension          100000 non-null  int64  
 10  heart_disease         100000 non-null  int64  
 11  smoking_history       100000 non-null  object 
 12  bmi                   100000 non-null  float64
 13  hbA1c_level           100000 non-null  float64
 14  blood_glucose_level   100000 non-null  int64  
 15  d

There are no null values in the dataset: every column has 100,000 non-null entries.

For ethical reasons, we removed the race columns.

In [12]:
# Drop the one-hot race columns by name instead of by position. Selecting on the
# 'race:' prefix keeps this cell correct if the CSV's column order ever changes,
# and passes plain column labels to drop() rather than a DataFrame slice (which
# only worked because iterating a DataFrame yields its column names).
race_columns = [col for col in data_raw.columns if str(col).startswith('race:')]
df = data_raw.drop(columns=race_columns)

In [13]:
# List the remaining column labels to confirm the race columns are gone.
df.columns

Index(['year', 'gender', 'age', 'location', 'hypertension', 'heart_disease',
       'smoking_history', 'bmi', 'hbA1c_level', 'blood_glucose_level',
       'diabetes', 'clinical_notes'],
      dtype='object')

We converted the `year` column from integer to datetime.

In [14]:
# Parse the integer year values with the '%Y' format into pandas datetimes,
# then rebind df to a frame carrying the converted column.
year_as_datetime = pd.to_datetime(df['year'], format='%Y')
df = df.assign(year=year_as_datetime)

In [15]:
# Inspect dtypes and non-null counts of the working frame (year should now be datetime64[ns]).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   year                 100000 non-null  datetime64[ns]
 1   gender               100000 non-null  object        
 2   age                  100000 non-null  float64       
 3   location             100000 non-null  object        
 4   hypertension         100000 non-null  int64         
 5   heart_disease        100000 non-null  int64         
 6   smoking_history      100000 non-null  object        
 7   bmi                  100000 non-null  float64       
 8   hbA1c_level          100000 non-null  float64       
 9   blood_glucose_level  100000 non-null  int64         
 10  diabetes             100000 non-null  int64         
 11  clinical_notes       100000 non-null  object        
dtypes: datetime64[ns](1), float64(3), int64(4), object(4)
memory usage: 9.2+ 

In [16]:
# Frequency of each smoking_history category, most common first.
smoking_history_counts = df['smoking_history'].value_counts()
smoking_history_counts

smoking_history
No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: count, dtype: int64

As can be seen, the smoking_history column has 6 distinct categories. 'former' and 'not current' may mean the same thing, so they could be merged into one category. 'ever' could mean 'former', 'current', or 'not current'; since we cannot tell which, we leave it unchanged for now.

In [17]:
# Frequency of each gender value, most common first.
gender_counts = df['gender'].value_counts()
gender_counts

gender
Female    58552
Male      41430
Other        18
Name: count, dtype: int64

In [20]:
# Number of rows recorded for each location, most common first.
location_counts = df['location'].value_counts()
location_counts

location
Iowa                    2038
Nebraska                2038
Kentucky                2038
Hawaii                  2038
Florida                 2037
Minnesota               2037
New Jersey              2037
Arkansas                2037
Delaware                2036
Kansas                  2036
Michigan                2036
Massachusetts           2036
Maine                   2036
District of Columbia    2036
Louisiana               2036
Georgia                 2036
Oregon                  2036
Pennsylvania            2036
Alabama                 2036
Illinois                2036
Rhode Island            2035
Colorado                2035
Maryland                2035
New York                2035
Connecticut             2035
Mississippi             2035
Missouri                2035
Alaska                  2035
North Carolina          2035
New Hampshire           2035
North Dakota            2035
South Dakota            2033
Montana                 2033
New Mexico              2033
Idaho

In [21]:
# Number of distinct location values (NaN, if present, is not counted).
df['location'].nunique()

55

There are 55 different locations. All of them are relevant.

In [22]:
# Summary statistics for `year` and the numeric columns; the object-typed columns
# (gender, location, smoking_history, clinical_notes) do not appear in the output.
df.describe()

Unnamed: 0,year,age,hypertension,heart_disease,bmi,hbA1c_level,blood_glucose_level,diabetes
count,100000,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,2018-05-12 12:34:55.200000,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
min,2015-01-01 00:00:00,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,2019-01-01 00:00:00,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,2019-01-01 00:00:00,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,2019-01-01 00:00:00,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,2022-01-01 00:00:00,80.0,1.0,1.0,95.69,9.0,300.0,1.0
std,,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883


In [23]:
# Class balance of the diabetes flag (0 vs 1), most common first.
diabetes_counts = df['diabetes'].value_counts()
diabetes_counts

diabetes
0    91500
1     8500
Name: count, dtype: int64

In [32]:
# Percentage of rows whose diabetes flag is 0.
rows_without_diabetes = df.loc[df['diabetes'] == 0]
len(rows_without_diabetes) / len(df) * 100

91.5

The vast majority of respondents (91.5%) don't have diabetes.

100000