In [1]:
import pandas as pd
import numpy as np

## Read data

In [2]:
np.random.seed(seed=42)
df_data = pd.read_csv("./cardiovascular-disease-dataset/original/cardio_train.csv", sep=';', index_col="id")
display(df_data.describe())
display(df_data.head())

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


## Fill 5% random values with NaNs

In [3]:
to_na_indices = np.random.randint(low=0, high=df_data.shape[0], size=int(0.05 * df_data.shape[0]))
df_data.iloc[to_na_indices, df_data.columns.get_loc("height")] = np.nan

to_na_indices = np.random.randint(low=0, high=df_data.shape[0], size=int(0.05 * df_data.shape[0]))
df_data.iloc[to_na_indices, df_data.columns.get_loc("weight")] = np.nan

to_na_indices = np.random.randint(low=0, high=df_data.shape[0], size=int(0.05 * df_data.shape[0]))
df_data.iloc[to_na_indices, df_data.columns.get_loc("cholesterol")] = np.nan

## Decode categorical values with text

In [4]:
df_data["gender"] = df_data["gender"].replace({
    1: "women",
    2: "men"
})

df_data["cholesterol"]  = df_data["cholesterol"].replace({
    1: "normal",
    2: "above_normal",
    3: "well_above_normal"
})

df_data["gluc"]  = df_data["gluc"].replace({
    1: "normal",
    2: "above_normal",
    3: "well_above_normal"
})

display(df_data.describe(include="all"))
display(df_data.head())

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000,66591.0,66578.0,70000.0,70000.0,66589,70000,70000.0,70000.0,70000.0,70000.0
unique,,2,,,,,3,3,,,,
top,,women,,,,,normal,normal,,,,
freq,,45530,,,,,49789,59479,,,,
mean,19468.865814,,164.361205,74.210467,128.817286,96.630414,,,0.088129,0.053771,0.803729,0.4997
std,2467.251667,,8.226411,14.397678,154.011419,188.47253,,,0.283484,0.225568,0.397179,0.500003
min,10798.0,,55.0,10.0,-150.0,-70.0,,,0.0,0.0,0.0,0.0
25%,17664.0,,159.0,65.0,120.0,80.0,,,0.0,0.0,1.0,0.0
50%,19703.0,,165.0,72.0,120.0,80.0,,,0.0,0.0,1.0,0.0
75%,21327.0,,170.0,82.0,140.0,90.0,,,0.0,0.0,1.0,1.0


Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,men,168.0,62.0,110,80,normal,normal,0,0,1,0
1,20228,women,156.0,85.0,140,90,well_above_normal,normal,0,0,1,1
2,18857,women,165.0,64.0,130,70,,normal,0,0,0,1
3,17623,men,169.0,82.0,150,100,normal,normal,0,0,1,1
4,17474,women,156.0,56.0,100,60,normal,normal,0,0,0,0


# Write messy csv to disk

In [5]:
df_data.to_csv("./cardiovascular-disease-dataset/messy/cardio_train.csv", sep=';', index_label="id")