# import packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# preprocessing

In [7]:
trainData = pd.read_csv('train.csv')
print(pd.DataFrame(trainData))

      id  age sex  ... oldpeak     ST slope  heart_disease_present
0      0   48   M  ...     1.0  Downsloping                      1
1      1   46   M  ...     0.0    Upsloping                      0
2      2   65   F  ...     0.8    Upsloping                      0
3      3   51   F  ...     0.6    Upsloping                      0
4      4   58   F  ...     1.0    Upsloping                      0
..   ...  ...  ..  ...     ...          ...                    ...
828  828   59   M  ...     1.2         Flat                      1
829  829   48   M  ...     0.0    Upsloping                      0
830  830   44   M  ...     0.0    Upsloping                      0
831  831   69   M  ...     1.5         Flat                      1
832  832   54   M  ...     2.8         Flat                      1

[833 rows x 13 columns]


In [8]:
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 833 entries, 0 to 832
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     833 non-null    int64  
 1   age                    833 non-null    int64  
 2   sex                    833 non-null    object 
 3   chest pain type        833 non-null    object 
 4   resting bps            833 non-null    int64  
 5   cholesterol            833 non-null    int64  
 6   fasting blood sugar    833 non-null    int64  
 7   resting ecg            833 non-null    object 
 8   max heart rate         833 non-null    int64  
 9   exercise angina        833 non-null    object 
 10  oldpeak                833 non-null    float64
 11  ST slope               833 non-null    object 
 12  heart_disease_present  833 non-null    int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 84.7+ KB


In [9]:
trainData.drop('id',axis=1,inplace=True)
print(trainData.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 833 entries, 0 to 832
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   age                    833 non-null    int64  
 1   sex                    833 non-null    object 
 2   chest pain type        833 non-null    object 
 3   resting bps            833 non-null    int64  
 4   cholesterol            833 non-null    int64  
 5   fasting blood sugar    833 non-null    int64  
 6   resting ecg            833 non-null    object 
 7   max heart rate         833 non-null    int64  
 8   exercise angina        833 non-null    object 
 9   oldpeak                833 non-null    float64
 10  ST slope               833 non-null    object 
 11  heart_disease_present  833 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 78.2+ KB
None


In [10]:
trainData['exercise angina'].replace({'NO':0, 'YES':1}, inplace=True)
trainData['sex'].replace({'M':0, 'F':1}, inplace=True)
trainData['chest pain type'].replace({'typical':1, 'typical angina':2, 'non-anginal pain':3, 'asymptomatic':4}, inplace=True)
trainData['resting ecg'].replace({'Normal':1, 'Abnormality in T-T wave':2, 'Left ventricular hypertrophy':3}, inplace=True)
trainData['ST slope'].replace({'Normal.':1, 'Upsloping':2, 'Flat':3, 'Downsloping':4}, inplace=True)

In [11]:
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 833 entries, 0 to 832
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   age                    833 non-null    int64  
 1   sex                    833 non-null    int64  
 2   chest pain type        833 non-null    int64  
 3   resting bps            833 non-null    int64  
 4   cholesterol            833 non-null    int64  
 5   fasting blood sugar    833 non-null    int64  
 6   resting ecg            833 non-null    int64  
 7   max heart rate         833 non-null    int64  
 8   exercise angina        833 non-null    int64  
 9   oldpeak                833 non-null    float64
 10  ST slope               833 non-null    int64  
 11  heart_disease_present  833 non-null    int64  
dtypes: float64(1), int64(11)
memory usage: 78.2 KB


In [12]:
trainData.isna()

Unnamed: 0,age,sex,chest pain type,resting bps,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,heart_disease_present
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
828,False,False,False,False,False,False,False,False,False,False,False,False
829,False,False,False,False,False,False,False,False,False,False,False,False
830,False,False,False,False,False,False,False,False,False,False,False,False
831,False,False,False,False,False,False,False,False,False,False,False,False


In [13]:
x = trainData.drop("heart_disease_present", axis=1)
y = trainData["heart_disease_present"]

In [14]:
print(x)

     age  sex  chest pain type  ...  exercise angina  oldpeak  ST slope
0     48    0                2  ...                0      1.0         4
1     46    0                4  ...                0      0.0         2
2     65    1                3  ...                0      0.8         2
3     51    1                3  ...                0      0.6         2
4     58    1                1  ...                0      1.0         2
..   ...  ...              ...  ...              ...      ...       ...
828   59    0                4  ...                1      1.2         3
829   48    0                3  ...                0      0.0         2
830   44    0                3  ...                0      0.0         2
831   69    0                4  ...                1      1.5         3
832   54    0                4  ...                1      2.8         3

[833 rows x 11 columns]


In [15]:
print(y)

0      1
1      0
2      0
3      0
4      0
      ..
828    1
829    0
830    0
831    1
832    1
Name: heart_disease_present, Length: 833, dtype: int64


# Building Modal

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

## Decision Tree

In [23]:
decisionTree = DecisionTreeClassifier(criterion='gini')

In [24]:
decisionTree.fit(x_train, y_train)

DecisionTreeClassifier()

### Prediction

In [25]:
y_predict = decisionTree.predict(x_test)

In [26]:
metrics.accuracy_score(y_test, y_predict)

0.828

## K-Nearest Neighbors modal

In [20]:
CLf = KNeighborsClassifier(n_neighbors=38, weights='distance', algorithm='kd_tree', metric='manhattan')

In [None]:
CLf.fit(x_train, y_train)

KNeighborsClassifier(algorithm='kd_tree', metric='manhattan', n_neighbors=55,
                     weights='distance')

### Prediction and accuracy section

In [None]:
y_predict = CLf.predict(x_test)

In [None]:
metrics.accuracy_score(y_test, y_predict)

0.808

## Support Vector Machines modal

In [None]:
s_v_c = SVC(kernel = 'linear')

In [None]:
s_v_c.fit(x_train, y_train)

SVC(kernel='linear')

### Prediction and accuracy section

In [None]:
y_predict = s_v_c.predict(x_test)

In [None]:
metrics.accuracy_score(y_test, y_predict)

0.804

## Logistic Regression modal

In [None]:
scaler = StandardScaler()
lr = LogisticRegression(penalty='l1', class_weight='balanced', solver='liblinear')
reg = Pipeline([('standardize', scaler),
                    ('log_reg', lr)])

In [None]:
reg.fit(x_train, y_train)

Pipeline(steps=[('standardize', StandardScaler()),
                ('log_reg',
                 LogisticRegression(class_weight='balanced', penalty='l1',
                                    solver='liblinear', warm_start=True))])

### Prediction and accuracy section

In [None]:
y_predict = reg.predict(x_test)

In [None]:
metrics.accuracy_score(y_test, y_predict)

0.832

# Submission file section

In [27]:
testData = pd.read_csv('test.csv')
testData

Unnamed: 0,id,age,sex,chest pain type,resting bps,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope
0,0,54,M,asymptomatic,110,239,0,Normal,126,YES,2.8,Flat
1,1,63,M,asymptomatic,150,0,0,Normal,86,YES,2.0,Flat
2,2,50,M,non-anginal pain,140,233,0,Normal,163,NO,0.6,Flat
3,3,65,M,asymptomatic,120,177,0,Normal,140,NO,0.4,Upsloping
4,4,53,M,asymptomatic,123,282,0,Normal,95,YES,2.0,Flat
...,...,...,...,...,...,...,...,...,...,...,...,...
353,353,67,M,non-anginal pain,152,212,0,Left ventricular hypertrophy,150,NO,0.8,Flat
354,354,62,F,asymptomatic,140,394,0,Left ventricular hypertrophy,157,NO,1.2,Flat
355,355,56,M,typical angina,126,166,0,Abnormality in T-T wave,140,NO,0.0,Upsloping
356,356,57,M,non-anginal pain,150,168,0,Normal,174,NO,1.6,Upsloping


In [28]:
testData['exercise angina'].replace({'NO':0, 'YES':1}, inplace=True)
testData['sex'].replace({'M':0, 'F':1}, inplace=True)
testData['chest pain type'].replace({'typical':1, 'typical angina':2, 'non-anginal pain':3, 'asymptomatic':4}, inplace=True)
testData['resting ecg'].replace({'Normal':1, 'Abnormality in T-T wave':2, 'Left ventricular hypertrophy':3}, inplace=True)
testData['ST slope'].replace({'Normal.':1, 'Upsloping':2, 'Flat':3, 'Downsloping':4}, inplace=True)
testData.drop('id',axis=1,inplace=True)

In [29]:
testData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358 entries, 0 to 357
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  358 non-null    int64  
 1   sex                  358 non-null    int64  
 2   chest pain type      358 non-null    int64  
 3   resting bps          358 non-null    int64  
 4   cholesterol          358 non-null    int64  
 5   fasting blood sugar  358 non-null    int64  
 6   resting ecg          358 non-null    int64  
 7   max heart rate       358 non-null    int64  
 8   exercise angina      358 non-null    int64  
 9   oldpeak              358 non-null    float64
 10  ST slope             358 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 30.9 KB


## csv file

In [None]:
#y_predict = decisionTree.predict(x_test)
#y_predict = CLf.predict(x_test)
#y_predict = s_v_c.predict(testData)
y_predict = reg.predict(x_test)

In [None]:
id = np.arange(len(y_predict))
dataset = pd.DataFrame({'id': id, 'heart_disease_present': y_predict})

In [None]:
dataset

Unnamed: 0,id,heart_disease_present
0,0,1
1,1,1
2,2,0
3,3,0
4,4,1
...,...,...
353,353,1
354,354,1
355,355,0
356,356,0


In [None]:
dataset.to_csv('predictied.csv', index=False)