# Data Cleaning and Feature Engineering

This notebook performs structured cleaning and preprocessing of the dataset to prepare it for exploratory analysis and modeling.

### Data Load and Inspection

In [394]:
import pandas as pd
import numpy as np

# The raw export uses ";" as its field delimiter, so it has to be passed explicitly.
raw_csv_path = "../data/cardio_train.csv"
df = pd.read_csv(raw_csv_path, sep=";")

In [395]:
# Preview the first five rows to confirm the file parsed into separate columns.
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [368]:
# (rows, columns) of the loaded frame.
df.shape

(70000, 13)

**Renaming Columns**

In [369]:
# Swap the terse source headers for descriptive names; columns not listed keep their name.
new_names = {
    "ap_hi": "systolic blood pressure",
    "ap_lo": "diastolic blood pressure",
    "gluc": "glucose",
    "smoke": "smoking",
    "alco": "alcohol",
    "active": "physical activity",
    "cardio": "disease",
}

df = df.rename(columns=new_names)

In [370]:
# Column names, non-null counts and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        70000 non-null  int64  
 1   age                       70000 non-null  int64  
 2   gender                    70000 non-null  int64  
 3   height                    70000 non-null  int64  
 4   weight                    70000 non-null  float64
 5   systolic blood pressure   70000 non-null  int64  
 6   diastolic blood pressure  70000 non-null  int64  
 7   cholesterol               70000 non-null  int64  
 8   glucose                   70000 non-null  int64  
 9   smoking                   70000 non-null  int64  
 10  alcohol                   70000 non-null  int64  
 11  physical activity         70000 non-null  int64  
 12  disease                   70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [371]:
# Number of missing values in each column.
df.isna().sum()

id                          0
age                         0
gender                      0
height                      0
weight                      0
systolic blood pressure     0
diastolic blood pressure    0
cholesterol                 0
glucose                     0
smoking                     0
alcohol                     0
physical activity           0
disease                     0
dtype: int64

#### **Observations**

- The dataset is very clean: there are no missing values, which is to be expected since it comes from a curated Kaggle dataset.
- There is an `id` column, which I will drop from the dataset since it provides no value.

In [373]:
# Drop the row identifier; it is not a feature.
# errors="ignore" keeps the cell re-runnable: a second execution is a no-op
# instead of raising KeyError because "id" has already been removed.
df = df.drop(columns=["id"], errors="ignore")

In [374]:
# Summary statistics; every column is still numeric at this point, so all of them are included.
df.describe()

Unnamed: 0,age,gender,height,weight,systolic blood pressure,diastolic blood pressure,cholesterol,glucose,smoking,alcohol,physical activity,disease
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0



#### **Observations**

- Numeric attributes: age, height, weight, systolic blood pressure, diastolic blood pressure
- Categorical attributes: gender, cholesterol, glucose, smoking, alcohol, physical activity, disease
- Let's change the categorical attributes from integer codes to string labels.
- Age is given in days; let's divide by 365 and round to convert it to whole years.

In [375]:
# Decode the integer-coded categorical columns into readable labels and convert
# age from days to whole years.
#
# The cell is guarded so that running it twice is harmless. `Series.map` turns
# every value that is missing from the mapping into NaN, so re-mapping columns
# that already hold labels would wipe them, and dividing age by 365 a second
# time would collapse every age to 0.
level_labels = {1: "normal", 2: "above normal", 3: "well above normal"}
yes_no_labels = {1: "Yes", 0: "No"}
label_maps = {
    "cholesterol": level_labels,
    "glucose": level_labels,
    "smoking": yes_no_labels,
    "alcohol": yes_no_labels,
    "physical activity": yes_no_labels,
    "disease": yes_no_labels,
    "gender": {1: "Women", 2: "Men"},
}

for column, labels in label_maps.items():
    # Only decode columns that still hold the raw integer codes.
    if pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(labels)

# Ages expressed in days are in the tens of thousands; a maximum at or below
# this bound means the column is already in years and must not be divided again.
max_plausible_age_years = 150
if df["age"].max() > max_plausible_age_years:
    # Vectorised form of rounding age / 365 to the nearest integer for each row.
    df["age"] = (df["age"] / 365).round().astype("int64")

In [376]:
# Check the dtypes after decoding: mapped columns should be object, age should be integer.
df.dtypes

age                           int64
gender                       object
height                        int64
weight                      float64
systolic blood pressure       int64
diastolic blood pressure      int64
cholesterol                  object
glucose                      object
smoking                      object
alcohol                      object
physical activity            object
disease                      object
dtype: object

In [377]:
# Distribution of age after converting it to years.
df["age"].describe()

count    70000.000000
mean        53.338686
std          6.765294
min         30.000000
25%         48.000000
50%         54.000000
75%         58.000000
max         65.000000
Name: age, dtype: float64

In [378]:
# Split the columns by measurement type so each group can be summarised on its own.
num_cols = [
    "age",
    "height",
    "weight",
    "systolic blood pressure",
    "diastolic blood pressure",
]
cat_cols = [
    "gender",
    "cholesterol",
    "glucose",
    "smoking",
    "alcohol",
    "physical activity",
    "disease",
]
num_df, cat_df = df[num_cols], df[cat_cols]
num_df.describe()

Unnamed: 0,age,height,weight,systolic blood pressure,diastolic blood pressure
count,70000.0,70000.0,70000.0,70000.0,70000.0
mean,53.338686,164.359229,74.20569,128.817286,96.630414
std,6.765294,8.210126,14.395757,154.011419,188.47253
min,30.0,55.0,10.0,-150.0,-70.0
25%,48.0,159.0,65.0,120.0,80.0
50%,54.0,165.0,72.0,120.0,80.0
75%,58.0,170.0,82.0,140.0,90.0
max,65.0,250.0,200.0,16020.0,11000.0


In [379]:
# Count, number of distinct labels, most frequent label and its frequency per categorical column.
cat_df.describe()

Unnamed: 0,gender,cholesterol,glucose,smoking,alcohol,physical activity,disease
count,70000,70000,70000,70000,70000,70000,70000
unique,2,3,3,2,2,2,2
top,Women,normal,normal,No,No,Yes,No
freq,45530,52385,59479,63831,66236,56261,35021


In [380]:
# Lowest weights: sort ascending by weight and show the first five rows
# to inspect implausibly small values.
df.sort_values(by = "weight").head()

Unnamed: 0,age,gender,height,weight,systolic blood pressure,diastolic blood pressure,cholesterol,glucose,smoking,alcohol,physical activity,disease
57858,52,Men,165,10.0,180,1100,above normal,above normal,No,No,Yes,Yes
33817,59,Men,178,11.0,130,90,normal,normal,No,No,Yes,Yes
60188,60,Women,162,21.0,120,80,above normal,normal,No,No,Yes,Yes
29488,56,Men,177,22.0,120,80,normal,normal,Yes,Yes,Yes,No
26806,64,Women,157,23.0,110,80,normal,normal,No,No,Yes,No


#### **Observations**

- There are outliers in height, weight, systolic blood pressure and diastolic blood pressure — for example, a height of 55 cm, weights under 30 kg, and blood pressures of 16020 and 11000 (as well as negative readings). We will therefore keep only entries within biologically plausible ranges, since we are inferring for average humans.

In [381]:
# Filtered frame: keep only rows whose measurements fall inside plausible
# ranges. Every bound is inclusive (`Series.between` defaults to inclusive="both").
#
# Height now has an upper bound like every other measurement; previously the
# 250 limit next to the height check was applied to weight only, so height was
# unbounded above.
plausible_ranges = {
    "height": (100, 250),
    "weight": (30, 250),
    "systolic blood pressure": (70, 250),
    "diastolic blood pressure": (40, 140),
}

keep = pd.Series(True, index=df.index)
for column, (lower, upper) in plausible_ranges.items():
    keep &= df[column].between(lower, upper)

# Copy after filtering so df_f is an independent frame: columns added to it
# later do not touch `df`.
df_f = df.loc[keep].copy()

In [382]:
# Re-check the numeric ranges after filtering.
df_f.describe()

Unnamed: 0,age,height,weight,systolic blood pressure,diastolic blood pressure
count,68714.0,68714.0,68714.0,68714.0,68714.0
mean,53.326571,164.39644,74.121284,126.616701,81.34923
std,6.767432,7.985136,14.306273,16.739756,9.526771
min,30.0,100.0,30.0,70.0,40.0
25%,48.0,159.0,65.0,120.0,80.0
50%,54.0,165.0,72.0,120.0,80.0
75%,58.0,170.0,82.0,140.0,90.0
max,65.0,250.0,200.0,240.0,140.0


## Feature Engineering

- BMI is a significant risk factor for heart disease: https://pmc.ncbi.nlm.nih.gov/articles/PMC10132081/

In [383]:
# BMI = weight / height^2 with height in metres; the height column is divided by 100 first.
height_m = df_f["height"] / 100
df_f["bmi"] = df_f["weight"] / height_m ** 2

#### **Association Hypotheses**

Using https://www.cdc.gov/heart-disease/risk-factors/index.html as a reference:
1. The older a person is, the more likely they are to have CVD (cardiovascular disease).
2. The higher the BMI and weight, the more likely the person is to have CVD.
3. Higher systolic and diastolic blood pressure are associated with CVD.
4. High cholesterol is associated with CVD.
5. Less physical activity is associated with CVD.
6. Smokers are more likely to have CVD.

In [384]:
# Summary statistics again, now including the derived bmi column.
df_f.describe()

Unnamed: 0,age,height,weight,systolic blood pressure,diastolic blood pressure,bmi
count,68714.0,68714.0,68714.0,68714.0,68714.0,68714.0
mean,53.326571,164.39644,74.121284,126.616701,81.34923,27.473472
std,6.767432,7.985136,14.306273,16.739756,9.526771,5.351101
min,30.0,100.0,30.0,70.0,40.0,10.726644
25%,48.0,159.0,65.0,120.0,80.0,23.875115
50%,54.0,165.0,72.0,120.0,80.0,26.346494
75%,58.0,170.0,82.0,140.0,90.0,30.119376
max,65.0,250.0,200.0,240.0,140.0,152.551775


#### **Duplicates**

In [385]:
# Count rows that exactly repeat an earlier row across every column.
# There are 3815 of them; they are removed further down.
df_f.duplicated().sum()

np.int64(3815)

In [391]:
# Examples of duplicates.
# keep=False -> every row that belongs to a duplicate group is marked True
# keep="first" (the default) -> every duplicate except the first occurrence is marked True
# Sorting on all columns places identical rows next to each other.
duplicates = df_f[df_f.duplicated(keep=False)].sort_values(by=df_f.columns.tolist())
duplicates.head()

Unnamed: 0,age,gender,height,weight,systolic blood pressure,diastolic blood pressure,cholesterol,glucose,smoking,alcohol,physical activity,disease,bmi
5984,39,Men,165,65.0,120,80,normal,normal,No,No,No,No,23.875115
15114,39,Men,165,65.0,120,80,normal,normal,No,No,No,No,23.875115
37423,39,Women,170,72.0,120,80,normal,normal,No,No,Yes,No,24.913495
66714,39,Women,170,72.0,120,80,normal,normal,No,No,Yes,No,24.913495
36905,40,Men,160,60.0,120,80,normal,normal,No,No,Yes,No,23.4375


In [392]:
# Keep the first occurrence of each fully identical row and discard the repeats.
df_f = df_f.drop_duplicates()
df_f.shape

(64899, 13)

In [398]:
# Save the cleaned data; index=False leaves the row index out of the file.
df_f.to_csv("../data/cleaned_data.csv", index=False)

In [400]:
# Check it's saved: read the file back and display its shape,
# which should match df_f.shape from the de-duplication step.
df_check = pd.read_csv("../data/cleaned_data.csv")
df_check.shape

(64899, 13)

/Users/kelvinzhou/Downloads/disease prediction/notebooks
