### Importing modules

In [1]:
import pandas as pd
import numpy as np
import sklearn

### Loading CSV

In [2]:
# Read the raw dataset (path is relative to the notebook's working directory)
# and show its (rows, columns) shape as the cell output.
DATASET_PATH = "heart disease classification dataset.csv"
heartDiseaseData = pd.read_csv(DATASET_PATH)
heartDiseaseData.shape

(303, 15)

In [25]:
# NOTE(review): this cell's execution count (25) is out of sequence, and its
# saved output shows numeric 'sex'/'target' values and an already-imputed
# 'trestbps', i.e. the frame as it was after later cells ran. Restart the
# kernel and run all cells so this preview reflects the freshly loaded CSV.
heartDiseaseData.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145.0,233.0,1,0,150.0,0,2.3,0,0,1,1
1,37,1,2,130.0,250.0,0,1,187.0,0,3.5,0,0,2,1
2,41,0,1,130.0,204.0,0,0,172.0,0,1.4,2,0,2,1
3,56,1,1,120.0,236.0,0,1,178.0,0,0.8,2,0,2,1
4,57,0,0,131.712375,354.0,0,1,163.0,1,0.6,2,0,2,1


### Dropping the "Unnamed: 0" column

In [3]:
# Remove the "Unnamed: 0" column that came in with the CSV.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been dropped is a no-op instead of raising KeyError.
heartDiseaseData = heartDiseaseData.drop(columns="Unnamed: 0", errors="ignore")

In [4]:
# Preview the first rows to confirm the "Unnamed: 0" column is gone.
heartDiseaseData.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,male,3,145.0,233.0,1,0,150.0,0,2.3,0,0,1,yes
1,37,male,2,130.0,250.0,0,1,187.0,0,3.5,0,0,2,yes
2,41,female,1,130.0,204.0,0,0,172.0,0,1.4,2,0,2,yes
3,56,male,1,120.0,236.0,0,1,178.0,0,0.8,2,0,2,yes
4,57,female,0,,354.0,0,1,163.0,1,0.6,2,0,2,yes


### Inspecting missing (NaN) values

In [5]:
# Number of missing (NaN) entries in each column.
heartDiseaseData.isna().sum()

age         0
sex         0
cp          0
trestbps    4
chol        1
fbs         0
restecg     0
thalach     5
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

### Importing SimpleImputer and setting its strategy to mean

In [6]:
from sklearn.impute import SimpleImputer
# Imputer that treats NaN as "missing" and fills it with the mean of the
# column it was fitted on. The same instance is refitted for each column below.
impute = SimpleImputer(strategy="mean", missing_values=np.nan)

### Imputing 'trestbps' missing values

In [7]:
# Fill missing 'trestbps' values with the column mean.
# The double brackets keep the selection 2-D, which the imputer requires.
# NOTE(review): the mean is learned from the whole frame, before the
# train/test split performed later in the notebook, so held-out rows
# contribute to the imputed value.
trestbps_frame = heartDiseaseData[['trestbps']]
heartDiseaseData['trestbps'] = impute.fit_transform(trestbps_frame)

### Imputing 'chol' missing values

In [8]:
# Fill missing 'chol' values with the column mean (imputer is refitted on
# this column; the 2-D selection is what the imputer expects).
chol_frame = heartDiseaseData[['chol']]
heartDiseaseData['chol'] = impute.fit_transform(chol_frame)

### Imputing 'thalach' missing values

In [9]:
# Fill missing 'thalach' values with the column mean (imputer is refitted on
# this column; the 2-D selection is what the imputer expects).
thalach_frame = heartDiseaseData[['thalach']]
heartDiseaseData['thalach'] = impute.fit_transform(thalach_frame)

In [10]:
# Re-count NaN entries per column to verify the imputed columns are complete.
heartDiseaseData.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [11]:
# Spot-check the first rows after imputation.
heartDiseaseData.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,male,3,145.0,233.0,1,0,150.0,0,2.3,0,0,1,yes
1,37,male,2,130.0,250.0,0,1,187.0,0,3.5,0,0,2,yes
2,41,female,1,130.0,204.0,0,0,172.0,0,1.4,2,0,2,yes
3,56,male,1,120.0,236.0,0,1,178.0,0,0.8,2,0,2,yes
4,57,female,0,131.712375,354.0,0,1,163.0,1,0.6,2,0,2,yes


### Encoding

In [12]:
# Inspect column dtypes to find the non-numeric columns that still need encoding.
heartDiseaseData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    object 
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    float64
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    object 
dtypes: float64(4), int64(8), object(2)
memory usage: 33.3+ KB


In [13]:
# Distinct values of 'sex', to see which strings have to be encoded.
heartDiseaseData['sex'].unique()

array(['male', 'female'], dtype=object)

In [14]:
# Distinct values of 'target', to see which strings have to be encoded.
heartDiseaseData['target'].unique()

array(['yes', 'no'], dtype=object)

In [15]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps string categories to integer codes.
enc = LabelEncoder()

In [16]:
# Encode the two string columns as integers. LabelEncoder assigns codes in
# sorted class order, so 'female' -> 0, 'male' -> 1 and 'no' -> 0, 'yes' -> 1.
#
# 'sex' gets its own encoder: refitting a single shared encoder on 'target'
# afterwards would overwrite its classes_, making it impossible to
# inverse-transform 'sex' later. `enc` still ends up fitted on 'target',
# exactly as before.
sex_enc = LabelEncoder()
heartDiseaseData['sex'] = sex_enc.fit_transform(heartDiseaseData['sex'])
heartDiseaseData['target'] = enc.fit_transform(heartDiseaseData['target'])

### Feature and label selection, then scaling to the 0-1 range

In [17]:
# Separate the label vector ('target') from the feature matrix
# (every remaining column).
y = heartDiseaseData['target']
x = heartDiseaseData.drop(columns=['target'])

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [26]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
# The test labels were originally bound to the misspelled name `y_text`;
# keep it as an alias so any later cell that still uses it continues to work.
y_text = y_test

In [20]:
# Learn each feature's min/max from the training split only, then rescale the
# training features with those statistics.
scaler = MinMaxScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)

In [27]:
# Reuse the scaler already fitted on x_train (no refit), so the test split is
# rescaled with the training split's statistics.
x_test_scaled = scaler.transform(x_test)