In [73]:
import pandas as pd # pandas is used to load and manipulate data and for One-Hot Encoding
import numpy as np # numpy is used to calculate the mean and standard deviation
import matplotlib.pyplot as plt # matplotlib is for drawing graphs
import matplotlib.colors as colors
from sklearn.model_selection import train_test_split # split  data into training and testing sets
from sklearn.preprocessing import scale,MinMaxScaler # scale and center data
from sklearn.svm import SVC # this will make a support vector machine for classificaiton
from sklearn.model_selection import GridSearchCV # this will do cross validation
from sklearn.metrics import confusion_matrix,accuracy_score # this creates a confusion matrix
from sklearn.metrics import ConfusionMatrixDisplay # draws a confusion matrix
from sklearn.decomposition import PCA # to perform PCA to plot the data

In [74]:
df = pd.read_csv('/content/processed.cleveland.data')

In [75]:
df.columns = ['age',
              'sex',
              'cp',
              'restbp',
              'chol',
              'fbs',
              'restecg',
              'thalach',
              'exang',
              'oldpeak',
              'slope',
              'ca',
              'thal',
              'hd']

In [76]:
df.head()

Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,hd
0,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
1,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
2,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
3,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
4,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0


In [77]:
df.drop(index=np.where(df.iloc[:,:]=='?')[0],inplace=True)

In [78]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [79]:
X

Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0
1,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0
2,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0
3,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0
4,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,57.0,0.0,4.0,140.0,241.0,0.0,0.0,123.0,1.0,0.2,2.0,0.0,7.0
297,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0
298,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0
299,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0


In [80]:
X = pd.get_dummies(X,columns=['cp','restecg','slope','ca','thal'])
y = y.apply(lambda x:0 if x==0 else 1)

In [81]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [82]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [83]:
svc = SVC(random_state=42)
svc.fit(X_train,y_train)
accuracy_score(y_test,svc.predict(X_test))

0.8918918918918919

In [117]:
params = {
    'C' : [0.001,0.01,1,2,3,4,5,6,7,8,9,10,100,5],
    'kernel' : ['linear','rbf','poly','sigmoid'],
    'degree' : [1,2,3,5,10,100],
}
grid = GridSearchCV(estimator=SVC(),param_grid=params,scoring='accuracy')

In [118]:
grid.fit(X_train,y_train)
grid.best_params_

{'C': 1, 'degree': 1, 'kernel': 'rbf'}

In [119]:
grid.best_score_

np.float64(0.828989898989899)

In [121]:
svc = SVC(C= 2, degree= 1, kernel= 'rbf')
svc.fit(X_train,y_train)
accuracy_score(y_test,svc.predict(X_test))

0.9054054054054054

In [122]:
confusion_matrix(y_test,svc.predict(X_test))

array([[41,  2],
       [ 5, 26]])