# Heart Disease Classification with SVM

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')


## Loading the Data

[UCI Heart-Disease Data](https://archive.ics.uci.edu/ml/datasets/Heart+Disease)

- age
- sex
- chest pain type (4 values)
- resting blood pressure
- serum cholesterol in mg/dl
- fasting blood sugar > 120 mg/dl
- resting electrocardiographic results (values 0,1,2)
- maximum heart rate achieved
- exercise induced angina
- oldpeak = ST depression induced by exercise relative to rest
- the slope of the peak exercise ST segment
- number of major vessels (0-3) colored by fluoroscopy
- thal: 3 = normal; 6 = fixed defect; 7 = reversible defect

Targets:
- 0: Has Disease
- 1: No Disease

In [2]:
# NOTE(review): `df` is not referenced by any cell visible in this notebook
# section — confirm whether this load is still needed.
df=pd.read_csv('heart.csv')
# Separate train and test files; the cells below split each into features and target.
train_df = pd.read_csv('Heart_train.csv')
test_df = pd.read_csv('Heart_test.csv')

# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,67,1,2,152,212,0,0,150,0,0.8,1,0,3,0
1,53,1,2,130,246,1,0,173,0,0.0,2,3,2,1
2,61,1,3,134,234,0,1,145,0,2.6,1,2,2,0
3,45,1,1,128,308,0,0,170,0,0.0,2,0,2,1
4,50,1,0,144,200,0,0,126,1,0.9,1,0,3,0


## Train/Test Split

In [3]:
# Labels are the 'target' column of each frame; features are everything else.
y_train, y_test = train_df['target'], test_df['target']
X_train, X_test = (frame.drop(columns='target') for frame in (train_df, test_df))

In [4]:
# Columns that will be one-hot encoded.
cat_columns = ['cp', 'exang', 'slope', 'thal']
# Every other feature column, kept in the training frame's column order.
num_columns = [name for name in X_train.columns if name not in cat_columns]

print(num_columns)
cat_columns

['age', 'sex', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'oldpeak', 'ca']


['cp', 'exang', 'slope', 'thal']

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the training features only. With handle_unknown='ignore',
# a category first seen at transform time is encoded as an all-zero row for
# that feature instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_train[cat_columns])

# Names of the generated one-hot columns.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older releases.
if hasattr(encoder, 'get_feature_names_out'):
    column_names = encoder.get_feature_names_out(cat_columns)
else:
    column_names = encoder.get_feature_names(input_features=cat_columns)

# Full column order of the encoded feature matrix: numeric first, then one-hot.
num_columns + list(column_names)

['age',
 'sex',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'oldpeak',
 'ca',
 'cp_0',
 'cp_1',
 'cp_2',
 'cp_3',
 'exang_0',
 'exang_1',
 'slope_0',
 'slope_1',
 'slope_2',
 'thal_0',
 'thal_1',
 'thal_2',
 'thal_3']

In [6]:
def dataset_to_encode(X, y=None):
    """Return ``X`` with its categorical columns replaced by one-hot columns.

    Relies on the notebook-level names ``encoder`` (already fitted),
    ``cat_columns``, ``num_columns`` and ``column_names``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing every column listed in ``num_columns`` and
        ``cat_columns``.
    y : optional
        Unused. Accepted (and now optional) only so existing
        ``dataset_to_encode(X, y)`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        The ``num_columns`` of ``X`` followed by the one-hot columns named by
        ``column_names``, carrying the same index as ``X``.
    """
    X_cat_encoded = encoder.transform(X[cat_columns])

    # Sparse results are densified with toarray(), which yields a plain
    # ndarray; todense() would return the np.matrix type that NumPy no longer
    # recommends. An encoder configured for dense output is passed through.
    if hasattr(X_cat_encoded, 'toarray'):
        X_cat_encoded = X_cat_encoded.toarray()

    # Reuse X's index so the column-wise concat aligns row for row.
    X_cat_encoded_df = pd.DataFrame(X_cat_encoded, columns=column_names, index=X.index)

    return pd.concat([X[num_columns], X_cat_encoded_df], axis=1)

# Encode both splits with the same fitted encoder, training split first.
X_train_encoded, X_test_encoded = (
    dataset_to_encode(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

# Train Data

In [7]:
# Sanity check: shape and first rows of the encoded training features.
print(X_train_encoded.shape)
X_train_encoded.head()

(242, 22)


Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,oldpeak,ca,cp_0,...,cp_3,exang_0,exang_1,slope_0,slope_1,slope_2,thal_0,thal_1,thal_2,thal_3
0,67,1,152,212,0,0,150,0.8,0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,53,1,130,246,1,0,173,0.0,3,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,61,1,134,234,0,1,145,2.6,2,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,45,1,128,308,0,0,170,0.0,0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,50,1,144,200,0,0,126,0.9,0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


# Test Data

In [8]:
# Sanity check: shape and first rows of the encoded test features.
print(X_test_encoded.shape)
X_test_encoded.head()

(61, 22)


Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,oldpeak,ca,cp_0,...,cp_3,exang_0,exang_1,slope_0,slope_1,slope_2,thal_0,thal_1,thal_2,thal_3
0,58,1,120,284,0,0,160,1.8,0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,52,1,112,230,0,1,160,0.0,1,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,42,0,120,209,0,1,173,0.0,0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,55,1,130,262,0,1,155,0.0,0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,53,0,130,264,0,0,143,0.4,0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


# Model Building

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, f1_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# Hyper-parameter grid for the SVC search: candidate kernels and C values.
grid_param={'kernel':('linear','sigmoid','poly','rbf'),'C':(0.1,1,10)}



In [10]:
def SVM_Model(X_train,X_test,y_train,y_test,grid):
    """Tune an SVC by grid search, refit the best setting, and report test metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels; used for the grid search and the refit.
    X_test, y_test
        Held-out features and labels; used only for the final report.
    grid : dict
        Parameter grid handed to ``GridSearchCV`` (for example
        ``{'kernel': (...), 'C': (...)}``).

    Returns
    -------
    dict
        ``classification_report(..., output_dict=True)`` for the test split.

    Side effects
    ------------
    Prints the best search score with its parameters, then the text
    classification report.
    """
    # Candidates are ranked by micro-averaged F1.
    scorer = make_scorer(f1_score, average='micro')

    # Search the grid supplied by the caller. Previously the global
    # `grid_param` was used here and the `grid` argument was silently ignored.
    clf = GridSearchCV(SVC(), grid, scoring=scorer)
    clf.fit(X_train, y_train)
    print('Best Score',clf.best_score_,'with',clf.best_params_,'\n')
    best_params = clf.best_params_

    # Refit on the full training split with every tuned parameter, so grids
    # that tune keys other than 'kernel'/'C' are honoured. random_state=1 is
    # kept from the original unless the grid itself tunes it.
    model = SVC(**{'random_state': 1, **best_params})
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print('\tClassification Report of best feature')
    print(metrics.classification_report(y_test,y_pred))
    clf_report = metrics.classification_report(y_test,y_pred,output_dict=True)

    return clf_report

In [11]:
# Run the grid search and evaluation; keep the returned report dict for inspection.

dict_values=SVM_Model(X_train_encoded,X_test_encoded,y_train,y_test,grid_param)

Best Score 0.8306972789115645 with {'C': 0.1, 'kernel': 'linear'} 

Classification Report of best feature
              precision    recall  f1-score   support

           0       0.85      0.79      0.81        28
           1       0.83      0.88      0.85        33

    accuracy                           0.84        61
   macro avg       0.84      0.83      0.83        61
weighted avg       0.84      0.84      0.84        61



In [12]:
# Display the classification report dict returned by SVM_Model.
dict_values

{'0': {'precision': 0.8461538461538461,
  'recall': 0.7857142857142857,
  'f1-score': 0.8148148148148148,
  'support': 28},
 '1': {'precision': 0.8285714285714286,
  'recall': 0.8787878787878788,
  'f1-score': 0.8529411764705883,
  'support': 33},
 'accuracy': 0.8360655737704918,
 'macro avg': {'precision': 0.8373626373626374,
  'recall': 0.8322510822510822,
  'f1-score': 0.8338779956427016,
  'support': 61},
 'weighted avg': {'precision': 0.836642046478112,
  'recall': 0.8360655737704918,
  'f1-score': 0.835440551448266,
  'support': 61}}