In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [23]:
df = pd.read_csv('heart.csv')

# **Know Data**

In [24]:
df.head(4)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [26]:
df['RestingECG'].unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [27]:
df['ST_Slope'].unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [28]:
df['Sex'].unique()

array(['M', 'F'], dtype=object)

In [29]:
df['ChestPainType'].unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

# **train_test_split**

In [30]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['HeartDisease'])
y = df['HeartDisease']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# **Encoding**

**Ordinal Encoding**

In [31]:
from sklearn.preprocessing import OrdinalEncoder

In [32]:
from sklearn.preprocessing import OrdinalEncoder

# Define the encoder with categories
oe = OrdinalEncoder(categories=[
    ['M', 'F'],
    ['ATA', 'ASY', 'NAP', 'TA'],
    ['Normal', 'ST', 'LVH'],
    ['N', 'Y'],
    ['Down', 'Flat', 'Up']
])
# Select the categorical columns to encode
categorical_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# Fit the encoder on training data
oe.fit(X_train[categorical_columns])

# Transform both train and test data
X_train_encoded = oe.transform(X_train[categorical_columns])
X_test_encoded = oe.transform(X_test[categorical_columns])
X_train[categorical_columns] = X_train_encoded
X_test[categorical_columns] = X_test_encoded

In [33]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [34]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 734 entries, 795 to 102
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             734 non-null    int64  
 1   Sex             734 non-null    float64
 2   ChestPainType   734 non-null    float64
 3   RestingBP       734 non-null    int64  
 4   Cholesterol     734 non-null    int64  
 5   FastingBS       734 non-null    int64  
 6   RestingECG      734 non-null    float64
 7   MaxHR           734 non-null    int64  
 8   ExerciseAngina  734 non-null    float64
 9   Oldpeak         734 non-null    float64
 10  ST_Slope        734 non-null    float64
dtypes: float64(6), int64(5)
memory usage: 68.8 KB


In [35]:
X_train.head(4)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
795,42,0.0,2.0,120,240,1,0.0,194,0.0,0.8,0.0
25,36,0.0,2.0,130,209,0,0.0,178,0.0,0.0,2.0
84,56,0.0,1.0,150,213,1,0.0,125,1.0,1.0,1.0
10,37,1.0,2.0,130,211,0,0.0,142,0.0,0.0,2.0


In [36]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

In [37]:
from sklearn.metrics import accuracy_score
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.8695652173913043

In [38]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train)

In [39]:
accuracy_score(y_test,knn.predict(X_test))

0.7065217391304348

**One Hot Encoding**

In [40]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [41]:
encoded_df = pd.get_dummies(df,columns=['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope'])
encoded_df.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,False,True,False,...,False,False,False,True,False,True,False,False,False,True
1,49,160,180,0,156,1.0,1,True,False,False,...,True,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,0,False,True,False,...,False,False,False,False,True,True,False,False,False,True
3,48,138,214,0,108,1.5,1,True,False,True,...,False,False,False,True,False,False,True,False,True,False
4,54,150,195,0,122,0.0,0,False,True,False,...,True,False,False,True,False,True,False,False,False,True


In [42]:
X_train,X_test,y_train,y_test = train_test_split(encoded_df.drop(columns=['HeartDisease']),encoded_df['HeartDisease'],test_size=0.2,random_state=42)

In [43]:
rf_2 = RandomForestClassifier()
rf_2.fit(X_train,y_train)

In [44]:
y_pred = rf_2.predict(X_test)
accuracy_score(y_test,y_pred)

0.875

In [45]:
knn_2 = KNeighborsClassifier(n_neighbors=5)
knn_2.fit(X_train,y_train)

In [46]:
accuracy_score(y_test,knn_2.predict(X_test))

0.7065217391304348