In [278]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import metrics 



Data contains;

    age - age in years
    sex - (1 = male; 0 = female)
    cp - chest pain type
    trestbps - resting blood pressure (in mm Hg on admission to the hospital)
    chol - serum cholestoral in mg/dl
    fbs - (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
    restecg - resting electrocardiographic results
    thalach - maximum heart rate achieved
    exang - exercise induced angina (1 = yes; 0 = no)
    oldpeak - ST depression induced by exercise relative to rest
    slope - the slope of the peak exercise ST segment
    ca - number of major vessels (0-3) colored by flourosopy
    thal - 3 = normal; 6 = fixed defect; 7 = reversable defect
    ADH - have disease or not (1=yes, 0=no)



In [279]:
df = pd.read_csv("Heart.csv", usecols=[1,2,3,4,5,6,7,8,9,10,11,12,13,14])
df.head()


Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [280]:
df.dropna()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,57,0,asymptomatic,140,241,0,0,123,1,0.2,2,0.0,reversable,Yes
298,45,1,typical,110,264,0,0,132,0,1.2,2,0.0,reversable,Yes
299,68,1,asymptomatic,144,193,1,0,141,0,3.4,2,2.0,reversable,Yes
300,57,1,asymptomatic,130,131,0,0,115,1,1.2,2,1.0,reversable,Yes


In [281]:
numeric_features = ['Age', 'RestBP','Chol','RestECG','MaxHR','Oldpeak','Slope','Ca']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['Sex', 'ChestPain', 'Fbs','ExAng','Thal']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [282]:
df.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [283]:
feature_cols = ['Age', 'Sex', 'ChestPain', 'RestBP', 'Chol', 'Fbs','RestECG','MaxHR','ExAng','Oldpeak','Slope','Ca','Thal']
X = df[feature_cols] # Features
y = df['AHD'] # Target variable
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

Compare accuracy with varied values of C.

In [284]:
for i in range(1,10,1):

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', svm.SVC(kernel='linear', C=i,random_state=1))])

    # Train Decision Tree Classifer
    clf = clf.fit(X_train,y_train)

    #Predict the response for test dataset
    y_pred = clf.predict(X_test)

    print("Accuracy with C is",i,":",metrics.accuracy_score(y_test, y_pred))


Accuracy with C is 1 : 0.8681318681318682
Accuracy with C is 2 : 0.8681318681318682
Accuracy with C is 3 : 0.8681318681318682
Accuracy with C is 4 : 0.8571428571428571
Accuracy with C is 5 : 0.8571428571428571
Accuracy with C is 6 : 0.8571428571428571
Accuracy with C is 7 : 0.8571428571428571
Accuracy with C is 8 : 0.8571428571428571
Accuracy with C is 9 : 0.8571428571428571


Conclusion: The larger C number, the smaller accuracy rate is

Compare Accuracy Between three type of kernel

In [292]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', svm.SVC(kernel='linear', C=10,random_state=1))])

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

print("Accuracy with linear kernel:",metrics.accuracy_score(y_test, y_pred))

Accuracy with linear kernel: 0.8571428571428571


In [291]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', svm.SVC(kernel='poly', C=10,random_state=1))])

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

print("Accuracy with polynomial kernel:",metrics.accuracy_score(y_test, y_pred))

Accuracy with polynomial kernel: 0.7472527472527473


In [290]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', svm.SVC(kernel='rbf', C=10,random_state=1))])

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

print("Accuracy with radial kernel:",metrics.accuracy_score(y_test, y_pred))

Accuracy with radial kernel: 0.7582417582417582


Conclusion: The best technique to use for support vector classifier on this dataset is linear kernel