In [124]:
#TITLE: A Machine Learning Approach for the Classification of Cardiac Arrhythmia
#STUDENT NAME: João Pedro da Silva Esteves
#JMBAG: 

In [125]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data'

First, get data frame from the .data file of arrhythmia, with features named from 0 to 279, using pandas' read_csv()
Feature 279 is actually the 'targets'

In [126]:
lst = range(0,280)
raw_data = pd.read_csv(url, names = lst, na_values='?')

Separating the features from the targets...

In [127]:
ft_data = raw_data.loc[:, lst[:-1]].values
target_data = raw_data.loc[:, lst[-1]].values

Standardizing the features' numerical data...

In [128]:
ft_data = StandardScaler().fit_transform(ft_data)


Replacing NaN values using imputation with mean values.

In [129]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
ft_data = imp.fit_transform(ft_data)



Determine the number of features needed to get a variance rate of 99%.

In [130]:
pca = PCA(0.99)
pca.fit_transform(ft_data)
len(pca.explained_variance_ratio_)

154

According to the paper, reducing the number of features to 150 features is enough (150 is close to 154 features previously obtained).


In [131]:
pca = PCA(n_components = 150)

principal_components = pca.fit_transform(ft_data)

lst = range(150)

principal_ft_data = pd.DataFrame(data = principal_components, columns= lst)


Concatenating the reduced feature components with the targets in a new data frame...
Then, a training set and a test set are acquired.

In [132]:
target_data_df = pd.DataFrame(target_data)
final_df = pd.concat([principal_ft_data, target_data_df], axis = 1)

training_ft_set, test_ft_set, training_target_set, test_target_set = train_test_split(principal_ft_data, np.ravel(target_data_df), test_size = 0.2, random_state = 25)

Now, 4 algorithms for data classification will be used and compared later on:

In [133]:
#SVM model
#Start by training without tuning hyperparameters

model = SVC()
model.fit(training_ft_set, training_target_set)

#print prediction results

predictions = model.predict(test_ft_set)
#print(classification_report(test_target_set, predictions))

#Now we should use the model with hyperparameters turning

param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']}
grid = GridSearchCV(SVC(), param_grid, refit=True, cv=5, verbose=0)

grid.fit(training_ft_set, training_target_set)

print(grid.best_params_)
print(grid.best_estimator_)



{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
SVC(C=10, gamma=0.001)


In [134]:
grid_predictions = grid.predict(test_ft_set)
print(classification_report(test_target_set, grid_predictions))
print("Accuracy:",metrics.accuracy_score(test_target_set, grid_predictions))

              precision    recall  f1-score   support

           1       0.72      0.94      0.81        51
           2       0.57      0.44      0.50         9
           3       1.00      1.00      1.00         1
           4       0.50      1.00      0.67         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         7
           8       0.00      0.00      0.00         1
           9       1.00      0.67      0.80         3
          10       1.00      0.67      0.80        15
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1

    accuracy                           0.73        91
   macro avg       0.44      0.43      0.42        91
weighted avg       0.67      0.73      0.68        91

Accuracy: 0.7252747252747253


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [135]:
#Logistic Regression Model
#Start by training without tuning hyperparameters

model = LogisticRegression()
model.fit(training_ft_set, training_target_set)

#print prediction results

predictions = model.predict(test_ft_set)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(model.score(test_ft_set, test_target_set)))

#Now we should use the model with hyperparameters tuning

param_grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]} # l1 lasso l2 ridge
grid = GridSearchCV(LogisticRegression(), param_grid, refit=True, cv=5, verbose=0)

grid.fit(training_ft_set, training_target_set)

print(grid.best_params_)
print(grid.best_estimator_)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy of logistic regression classifier on test set: 0.70


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 0.1, 'penalty': 'l2'}
LogisticRegression(C=0.1)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [136]:
grid_predictions = grid.predict(test_ft_set)
print(classification_report(test_target_set, grid_predictions))
print("Accuracy:",metrics.accuracy_score(test_target_set, grid_predictions))

              precision    recall  f1-score   support

           1       0.73      0.96      0.83        51
           2       0.43      0.33      0.38         9
           3       1.00      1.00      1.00         1
           4       0.50      1.00      0.67         1
           5       0.00      0.00      0.00         1
           6       0.50      0.14      0.22         7
           8       0.00      0.00      0.00         1
           9       1.00      0.33      0.50         3
          10       0.90      0.60      0.72        15
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1

    accuracy                           0.71        91
   macro avg       0.46      0.40      0.39        91
weighted avg       0.69      0.71      0.67        91

Accuracy: 0.7142857142857143


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [137]:
#K-Nearest Neighbors Model
#Start by training without tuning hyperparameters

classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(training_ft_set, training_target_set)

#print prediction results
predictions = classifier.predict(test_ft_set)
print(classification_report(test_target_set, predictions))

#Now we should use the model with hyperparameters tuning

              precision    recall  f1-score   support

           1       0.58      1.00      0.73        51
           2       1.00      0.11      0.20         9
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         7
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         3
          10       1.00      0.13      0.24        15
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1

    accuracy                           0.59        91
   macro avg       0.23      0.11      0.11        91
weighted avg       0.59      0.59      0.47        91



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
