# Thực hiện thống kê mô tả trên tập dữ liệu về bệnh tiểu đường.

### 1. Import libraries

In [1]:
import pandas as pd  #for data manipulation operations
import numpy as np  #for numeric operations on data
from scipy import stats  

### 2. Load dataset

In [14]:
data = pd.read_csv("data/diabetes.csv")
df = data.copy()
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


### 3. Overview

In [16]:
# Take a quick look at the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [17]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [18]:
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [19]:
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [20]:
df.shape

(768, 9)

### 4. Khám phá dữ liệu

In [22]:
# Get the mean of the data
data_mean = np.mean(df["Glucose"])
print("Mean of Glucose: ", data_mean)

Mean of Glucose:  120.89453125


In [23]:
# Get the median of the data
data_median = np.median(df["Glucose"])
print("Median of Glucose: ", data_median)

Median of Glucose:  117.0


In [24]:
# Get the mode of the data
data_mode = stats.mode(df["Glucose"])
print("Mode of Glucose: ", data_mode)

Mode of Glucose:  ModeResult(mode=99, count=17)


In [25]:
# Obtain the variance of the data
data_variance = np.var(df["Glucose"])
print("Variance of Glucose: ", data_variance)

Variance of Glucose:  1020.9172617594401


In [26]:
# Obtain the standard deviation of the data
data_sd = np.std(df["Glucose"])
print("Standard Deviation of Glucose: ", data_sd)

Standard Deviation of Glucose:  31.95179590820272


In [27]:
# Compute the maximum and minimum values of the data
data_max = np.max(df["Glucose"])
data_min = np.min(df["Glucose"])
print("Max of Glucose: ", data_max)
print("Min of Glucose: ", data_min)

Max of Glucose:  199
Min of Glucose:  0


In [28]:
# Obtain the 60th percentile of the data
data_percentile = np.percentile(df["Glucose"],60)
print("60th Percentile of Glucose: ", data_percentile)

60th Percentile of Glucose:  125.0


In [29]:
# Obtain the quartiles of the data
data_quartile = np.quantile(df["Glucose"],0.75)
print("75th Percentile (Q3) of Glucose: ", data_quartile)

75th Percentile (Q3) of Glucose:  140.25


In [31]:
# Get the IQR of the data
data_IQR = stats.iqr(df["Glucose"])
print("IQR of Glucose: ", data_IQR)

IQR of Glucose:  41.25


### 5. Loại bỏ dữ liệu trùng lặp

In [33]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [34]:
# Remove duplicates across the columns in our dataset:
df_duplicate = df.drop_duplicates()
df_duplicate.shape

(768, 9)

In [35]:
# Delete a specified row at index value 1:
df.drop(labels=[1], axis=0)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [None]:
# Delete a single column
# df.drop(labels=['Year_Birth'], axis=1)

### 6. Thay thế dữ liệu và thay đổi định dạng của dữ liệu

In [38]:
# thay thế 0 thành NaN cho các cột quan trọng
cols_with_zeros = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[cols_with_zeros] = df[cols_with_zeros].replace(0, np.nan)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [40]:
# Điền missing bằng median
for col in cols_with_zeros:
    df[col] = df[col].fillna(df[col].median())
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [42]:
df["Pregnancies"] = df["Pregnancies"].astype(int)
df["Outcome"] = df["Outcome"].astype(int)
df.dtypes

Pregnancies                   int32
Glucose                     float64
BloodPressure               float64
SkinThickness               float64
Insulin                     float64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int32
dtype: object

### 7. Xử lý dữ liệu thiếu

In [43]:
# Check for missing values using the isnull and sum methods
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [44]:
# Drop missing values using the dropna method
df_withoutna = df.dropna(how = 'any')
df_withoutna.shape

(768, 9)

Kết thúc