<a href="https://colab.research.google.com/github/kubadomi/DataScience_MachineLearning/blob/master/Echocardiogram/Echocardiogram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Topic: Optimization of sensitivity hyperparameter in SVM classification on echocardiogram dataset with missing objects

In [0]:
import keras
keras.__version__
import numpy as np
import tensorflow as tf
import pandas as pd
import sklearn
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

In [19]:
#Loading file
csv_file = open("echocardiogram.data")
df = pd.read_csv(csv_file)
df.replace('?',np.NaN,inplace=True)
df.isna().sum()

survival                  2
still-alive               1
age                       5
pericardial-effusion      1
fractional-shortening     8
epss                     15
lvdd                     11
wall-motion-score         4
wall-motion-index         1
mult                      4
name                      0
group                    22
alive-at-1               58
dtype: int64

In [20]:
df

Unnamed: 0,survival,still-alive,age,pericardial-effusion,fractional-shortening,epss,lvdd,wall-motion-score,wall-motion-index,mult,name,group,alive-at-1
0,11,0,71,0,0.260,9,4.600,14,1,1,name,1,0
1,19,0,72,0,0.380,6,4.100,14,1.700,0.588,name,1,0
2,16,0,55,0,0.260,4,3.420,14,1,1,name,1,0
3,57,0,60,0,0.253,12.062,4.603,16,1.450,0.788,name,1,0
4,19,1,57,0,0.160,22,5.750,18,2.250,0.571,name,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,7.5,1,64,0,0.24,12.9,4.72,12,1,0.857,name,,
128,41,0,64,0,0.28,5.40,5.47,11,1.10,0.714,name,,
129,36,0,69,0,0.20,7.00,5.05,14.5,1.21,0.857,name,,
130,22,0,57,0,0.14,16.1,4.36,15,1.36,0.786,name,,


In [21]:
df['alive-at-1'].value_counts()

0    50
1    24
Name: alive-at-1, dtype: int64

In [22]:
#Missing data
#For numeric data, missing objects are replaced by avg. of existing data
#For categorical data, the most common category has been put into missing places
imr = SimpleImputer(missing_values=np.NaN, strategy='mean')
imr = imr.fit(df[['survival']])
df['survival'] = imr.transform(df[['survival']])

imr = imr.fit(df[['age']])
df['age'] = imr.transform(df[['age']])
imr = imr.fit(df[['fractional-shortening']])
df['fractional-shortening'] = imr.transform(df[['fractional-shortening']])
imr = imr.fit(df[['epss']])
df['epss'] = imr.transform(df[['epss']])
imr = imr.fit(df[['lvdd']])
df['lvdd'] = imr.transform(df[['lvdd']])
imr = imr.fit(df[['wall-motion-score']])
df['wall-motion-score'] = imr.transform(df[['wall-motion-score']])
imr = imr.fit(df[['wall-motion-index']])
df['wall-motion-index'] = imr.transform(df[['wall-motion-index']])

imr = SimpleImputer(missing_values=np.NaN, strategy='most_frequent')

imr = imr.fit(df[['still-alive']])
df['still-alive'] = imr.transform(df[['still-alive']])
imr = imr.fit(df[['pericardial-effusion']])
df['pericardial-effusion'] = imr.transform(df[['pericardial-effusion']])
imr = imr.fit(df[['alive-at-1']])
df['alive-at-1'] = imr.transform(df[['alive-at-1']])
df.isna().sum()

survival                  0
still-alive               0
age                       0
pericardial-effusion      0
fractional-shortening     0
epss                      0
lvdd                      0
wall-motion-score         0
wall-motion-index         0
mult                      4
name                      0
group                    22
alive-at-1                0
dtype: int64

In [23]:
df

Unnamed: 0,survival,still-alive,age,pericardial-effusion,fractional-shortening,epss,lvdd,wall-motion-score,wall-motion-index,mult,name,group,alive-at-1
0,11.0,0,71.0,0,0.260,9.000,4.600,14.0,1.000,1,name,1,0
1,19.0,0,72.0,0,0.380,6.000,4.100,14.0,1.700,0.588,name,1,0
2,16.0,0,55.0,0,0.260,4.000,3.420,14.0,1.000,1,name,1,0
3,57.0,0,60.0,0,0.253,12.062,4.603,16.0,1.450,0.788,name,1,0
4,19.0,1,57.0,0,0.160,22.000,5.750,18.0,2.250,0.571,name,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,7.5,1,64.0,0,0.240,12.900,4.720,12.0,1.000,0.857,name,,0
128,41.0,0,64.0,0,0.280,5.400,5.470,11.0,1.100,0.714,name,,0
129,36.0,0,69.0,0,0.200,7.000,5.050,14.5,1.210,0.857,name,,0
130,22.0,0,57.0,0,0.140,16.100,4.360,15.0,1.360,0.786,name,,0


In [0]:
#dividing dataset into 2 collections
target = df.iloc[:,12]
data = df.iloc[:, 0:9]
target = target.to_numpy()
data = data.to_numpy()

In [0]:
target = target.astype(int)
data = data.astype(float)

In [0]:
from sklearn.model_selection import train_test_split

data = np.array(data)
#spliting data into training and testing data for first model
(X_train, X_test, y_train, y_test) = train_test_split(data,
	target, test_size=0.2)


In [0]:
#normalization - avoiding higher number weight
mean = X_train.mean(axis=0)
X_train -= mean
std = X_train.std(axis=0)
X_train /= std

X_test -= mean
X_test /= std

In [28]:
from matplotlib.colors import ListedColormap
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC

#comparing parameter gamma and C
clf = SVC(gamma='auto')
for C in [1.0, 100.0, 0.01]:
  clf.set_params(C=C, kernel='linear')
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print("[linear, C=", C, "]\n")
  clf.kernel = 'rbf'
  for gamma in ['scale', 'auto', 1.0, 10.0, 0.1]:
    clf.set_params(C=C, kernel='rbf', gamma=gamma)
    clf.fit(X_train, y_train)
    print("gamma = ",gamma)
    print("ACC = ", metrics.accuracy_score(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))

[linear, C= 1.0 ]

gamma =  scale
ACC =  0.8888888888888888
              precision    recall  f1-score   support

           0       0.91      0.95      0.93        21
           1       0.80      0.67      0.73         6

    accuracy                           0.89        27
   macro avg       0.85      0.81      0.83        27
weighted avg       0.88      0.89      0.89        27

gamma =  auto
ACC =  0.8888888888888888
              precision    recall  f1-score   support

           0       0.91      0.95      0.93        21
           1       0.80      0.67      0.73         6

    accuracy                           0.89        27
   macro avg       0.85      0.81      0.83        27
weighted avg       0.88      0.89      0.89        27

gamma =  1.0
ACC =  0.8888888888888888
              precision    recall  f1-score   support

           0       0.91      0.95      0.93        21
           1       0.80      0.67      0.73         6

    accuracy                           0.89

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [0]:
from sklearn import svm

#training model with best parameters
model = svm.SVC(C=0.01, gamma = 'auto')
model = model.fit(X_train,y_train)

pred_labels = model.predict(X_test)

In [30]:
from sklearn.metrics import classification_report, confusion_matrix

mask = pred_labels==y_test
correct = np.count_nonzero(mask)
print(correct*100.0/pred_labels.size)

cm = confusion_matrix(y_test, pred_labels)
print(cm)
print(classification_report(y_test, pred_labels))

77.77777777777777
[[21  0]
 [ 6  0]]
              precision    recall  f1-score   support

           0       0.78      1.00      0.88        21
           1       0.00      0.00      0.00         6

    accuracy                           0.78        27
   macro avg       0.39      0.50      0.44        27
weighted avg       0.60      0.78      0.68        27



  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
from sklearn.model_selection import KFold
from sklearn.svm import SVC

#evaluation of classifier with C and kernel given
def evaluate_classifier(C, kernel):
  kf = KFold(n_splits=10)
  X = data
  y = target
  score = 0
  for train_index, test_index in kf.split(X, y):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf = SVC(gamma='auto', kernel=kernel, C=C)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    partial_score = metrics.recall_score(y_test, y_pred, average = "weighted")
    print(partial_score)
    score += partial_score
  return score
    
    
for C in [1.0, 100.0, 0.1, 0.01, 10.0, 50.0]:
    max = 0
    parameters = ""
    for kernel in ["rbf","poly", "linear"]:
      print(f"\n C = {C}", f"kernel = {kernel}")
      score = evaluate_classifier(C, kernel)/10.0
      if (score > max): 
        max = score
        parameters = f"C = {C}", f"kernel = {kernel}"
      print("\n average score for parameters:")
      print(score)
print("\n",max)
print(parameters)


 C = 1.0 kernel = rbf
0.9285714285714286
0.42857142857142855
0.9230769230769231
0.7692307692307693
0.9230769230769231
0.8461538461538461
0.8461538461538461
0.6923076923076923
0.8461538461538461
1.0

 average score for parameters:
0.8203296703296703

 C = 1.0 kernel = poly
0.9285714285714286
0.7142857142857143
0.9230769230769231
0.6923076923076923
1.0
0.9230769230769231
0.7692307692307693
0.7692307692307693
0.9230769230769231
1.0

 average score for parameters:
0.8642857142857142

 C = 1.0 kernel = linear
1.0
0.7857142857142857
0.6923076923076923
0.7692307692307693
0.8461538461538461
0.9230769230769231
0.8461538461538461
0.8461538461538461
0.8461538461538461
1.0

 average score for parameters:
0.8554945054945055

 C = 100.0 kernel = rbf
0.9285714285714286
0.42857142857142855
0.9230769230769231
0.6923076923076923
0.9230769230769231
0.8461538461538461
0.8461538461538461
0.6923076923076923
0.8461538461538461
1.0

 average score for parameters:
0.8126373626373626

 C = 100.0 kernel = poly


In [32]:
#training model for parameters C and kernel that give best sensitivity (recall)
model = svm.SVC(C=50.0, gamma = 'auto', kernel = "poly")
model = model.fit(X_train,y_train)

pred_labels = model.predict(X_test)

mask = pred_labels==y_test
correct = np.count_nonzero(mask)
print(correct*100.0/pred_labels.size)

cm = confusion_matrix(y_test, pred_labels)
print(cm)
print(classification_report(y_test, pred_labels))

96.29629629629629
[[21  0]
 [ 1  5]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        21
           1       1.00      0.83      0.91         6

    accuracy                           0.96        27
   macro avg       0.98      0.92      0.94        27
weighted avg       0.96      0.96      0.96        27

