In [152]:
import pandas as pd

# Load the raw dataset from a CSV file located relative to the notebook's working directory.
csv_path = 'cardio_dataset-original.csv'
data = pd.read_csv(csv_path)

# Step 1 - Handling Outliers

In [153]:
# Number of standard deviations around the mean used as the outlier cut-off
# (also read by the following outlier cells).
factor = 4

# Keep only the rows whose AGEIR lies strictly between mean - factor*std and mean + factor*std.
ageir = data['AGEIR']
ageir_mean = ageir.mean()
ageir_spread = ageir.std() * factor
upper_lim = ageir_mean + ageir_spread
lower_lim = ageir_mean - ageir_spread

data_new = data[(ageir > lower_lim) & (ageir < upper_lim)]

In [154]:
# Apply the same mean +/- factor*std cut-off to TC; the statistics are computed
# on the frame that has already been filtered on AGEIR.
tc_mean = data_new['TC'].mean()
tc_spread = data_new['TC'].std() * factor
upper_lim = tc_mean + tc_spread
lower_lim = tc_mean - tc_spread

data_new = data_new[(data_new['TC'] > lower_lim) & (data_new['TC'] < upper_lim)]

In [155]:
# Filter HDL with the same rule: keep values strictly inside mean +/- factor*std,
# using statistics of the current (already filtered) data_new.
hdl = data_new['HDL']
upper_lim = hdl.mean() + hdl.std() * factor
lower_lim = hdl.mean() - hdl.std() * factor

hdl_in_range = (hdl < upper_lim) & (hdl > lower_lim)
data_new = data_new.loc[hdl_in_range]

In [156]:
# Final outlier pass, this time on the RISK column, with the same
# strict mean +/- factor*std bounds computed from the current data_new.
risk_mean, risk_std = data_new['RISK'].mean(), data_new['RISK'].std()
upper_lim = risk_mean + risk_std * factor
lower_lim = risk_mean - risk_std * factor

below_upper = data_new['RISK'] < upper_lim
above_lower = data_new['RISK'] > lower_lim
data_new = data_new[below_upper & above_lower]

# Step 2 - Scaling

In [157]:
# Everything except the four categorical columns and the RISK column is kept for scaling.
non_scaled_columns = ['SEX', 'SMOKE_', 'BPMED', 'DIAB_noyes', 'RISK']
numerical_data = data_new.drop(columns=non_scaled_columns)
numerical_data.head()

Unnamed: 0,AGEIR,TC,HDL
0,48,236,66
1,48,260,51
2,44,187,49
3,42,216,57
4,56,156,42


In [158]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column of numerical_data.
minmax_scaler = MinMaxScaler()

# fit_transform returns a plain array, so the row labels are lost. Re-attach
# numerical_data's own index (and column names): the outlier filtering left gaps
# in the row labels, and any later index-aligned operation (e.g. DataFrame.join)
# would otherwise pair scaled values with the wrong rows and create NaN rows.
transformed_numerical_data = pd.DataFrame(
    minmax_scaler.fit_transform(numerical_data),
    columns=numerical_data.columns,
    index=numerical_data.index,
)

# Step 3 - Categorical Encoding

In [159]:
# Convert each of the four categorical columns of data_new to pandas' 'category' dtype,
# then print the resulting dtypes to confirm the conversion.
for category_column in ("SEX", "SMOKE_", "BPMED", "DIAB_noyes"):
    data_new[category_column] = data_new[category_column].astype('category')
print(data_new.dtypes)

SEX           category
AGEIR            int64
TC               int64
HDL              int64
SMOKE_        category
BPMED         category
DIAB_noyes    category
RISK           float64
dtype: object


### 1 - Label Encoding

In [160]:
# Remove the numeric columns and RISK from data_new; what remains is the categorical part.
numeric_and_target = ['AGEIR', 'TC', 'HDL', 'RISK']
categorical_data = data_new.drop(columns=numeric_and_target)
categorical_data.head()

Unnamed: 0,SEX,SMOKE_,BPMED,DIAB_noyes
0,female,no smoking,taking,no
1,male,no smoking,taking,yes
2,male,smoking,taking,no
3,female,smoking,taking,no
4,female,no smoking,taking,no


In [161]:
# Label-encode the four categorical columns as their integer category codes.
# The codes are derived from data_new instead of overwriting categorical_data's
# own columns in place: after an in-place overwrite the columns are integers and
# no longer expose the .cat accessor, so re-running the cell would raise
# AttributeError. Building from data_new keeps the cell re-runnable.
# Requires these columns of data_new to already have the 'category' dtype.
label_columns = ["SEX", "SMOKE_", "BPMED", "DIAB_noyes"]
categorical_data = data_new[label_columns].apply(lambda column: column.cat.codes)
categorical_data.head()

Unnamed: 0,SEX,SMOKE_,BPMED,DIAB_noyes
0,0,0,1,0
1,1,0,1,1
2,1,1,1,0
3,0,1,1,0
4,0,0,1,0


### 2 - One-Hot Encoding

In [None]:
# Start again from data_new: drop the numeric columns and RISK so that only the
# categorical columns remain as input for one-hot encoding.
categorical_data = data_new.drop(columns=['AGEIR', 'TC', 'HDL', 'RISK'])
categorical_data.head()

In [None]:
# Build one indicator frame per categorical column (one column per category value).
encoded_sex, encoded_smoke, encoded_bpmed, encoded_diab = (
    pd.get_dummies(categorical_data[source_column])
    for source_column in ('SEX', 'SMOKE_', 'BPMED', 'DIAB_noyes')
)

# Append the indicator columns next to the original categorical columns;
# each join aligns on the row index.
categorical_data = (
    categorical_data
    .join(encoded_sex)
    .join(encoded_smoke)
    .join(encoded_bpmed)
    .join(encoded_diab)
)

categorical_data.head()

In [None]:
# The original categorical columns are no longer needed once their indicator columns exist.
encoded_sources = ['SEX', 'SMOKE_', 'BPMED', 'DIAB_noyes']
categorical_data = categorical_data.drop(columns=encoded_sources)
categorical_data.head()

# Step 4 - Reforming the dataset

In [162]:
# Reassemble the modelling table: encoded categorical columns, then the scaled
# numeric columns, then the RISK column taken from data_new.
# NOTE(review): both joins align on the row index, so all three inputs must
# carry the same row labels for the rows to line up.
dataset = (
    categorical_data
    .join(transformed_numerical_data)
    .join(data_new['RISK'])
)
dataset.head()

Unnamed: 0,SEX,SMOKE_,BPMED,DIAB_noyes,AGEIR,TC,HDL,RISK
0,0,0,1,0,0.205128,0.531532,0.519231,1.1
1,1,0,1,1,0.205128,0.603604,0.375,7.0
2,1,1,1,0,0.102564,0.384384,0.355769,7.0
3,0,1,1,0,0.051282,0.471471,0.432692,0.4
4,0,0,1,0,0.410256,0.291291,0.288462,2.2


In [165]:
# Drop every row that contains at least one missing value.
# NOTE(review): confirm where the NaNs originate; if they come from index
# misalignment in the preceding joins rather than from the source data,
# this step hides the problem instead of fixing it.
dataset = dataset.dropna(axis=0, how='any')

In [166]:
# Report the number of missing values per column.
print(dataset.isna().sum(axis=0))

SEX           0
SMOKE_        0
BPMED         0
DIAB_noyes    0
AGEIR         0
TC            0
HDL           0
RISK          0
dtype: int64


# Step 5 - Training