# Simple Modelling on Phonocardiogram Murmurs

Author: Jake Dumbauld <br>
Contact: jacobmilodumbauld@gmail.com<br>
Date: 3.15.22

## Importing Data

In [1]:
#imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import time
import re

#Sklearn modules
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [2]:
df = pd.DataFrame(data = np.load('/Users/jmd/Documents/BOOTCAMP/Capstone/signal_murmur_prelogreg_4k.npy'))

In [3]:
df.rename(columns={0:'Murmur'}, inplace=True)
df['Murmur'] = df['Murmur'].astype('int')

In [4]:
y = df['Murmur']
X = df.drop('Murmur', axis=1)

In [5]:
del df

In [6]:
y.value_counts()

0    2391
1     616
Name: Murmur, dtype: int64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = 0.3)

## Finding a baseline for a Simple LinReg

In [8]:
# instantiate
logit = LogisticRegression(solver='lbfgs', max_iter=100000)

# fit
logit.fit(X_train, y_train)

#score
train_acc = logit.score(X_train, y_train)
test_acc = logit.score(X_test,y_test)

#report
print(f"Train Accuracy: {(train_acc*100).round(2)}%")
print(f"Test Accuracy: {(test_acc*100).round(2)}%")

Train Accuracy: 99.81%
Test Accuracy: 77.52%


In [9]:
print(f"Chance of guessing correctly if you guess no every time: \
{((1 - y_test.value_counts()[1]/y_test.value_counts()[0]) * 100).round(2)} %")

Chance of guessing correctly if you guess no every time: 74.23 %


This is....not great.

## Finding a baseline for a Simple KNN

In [10]:
#instantiate
knnc = KNeighborsClassifier()

#fit
knnc.fit(X_train, y_train)

#score
train_acc = knnc.score(X_train, y_train)
test_acc = knnc.score(X_test,y_test)

#report
print(f"Train Accuracy: {(train_acc*100).round(2)}%")
print(f"Test Accuracy: {(test_acc*100).round(2)}%")

Train Accuracy: 79.61%
Test Accuracy: 79.51%


In [11]:
knnc.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

## Finding a Baseline for a Simple SVM

In [12]:
#instantiate
SVM = SVC()

#fit
SVM.fit(X_train, y_train)

#score
train_acc = SVM.score(X_train, y_train)
test_acc = SVM.score(X_test,y_test)

#report
print(f"Train Accuracy: {(train_acc*100).round(2)}%")
print(f"Test Accuracy: {(test_acc*100).round(2)}%")

Train Accuracy: 87.69%
Test Accuracy: 79.51%


Train acc improved, but our test is the the same as our KNN. 

## Pursuing KNN Further

In [13]:
estimators = [('normalise', StandardScaler()),
              ('reduce_dim', PCA()),
              ('knn', KNeighborsClassifier())]

pipe = Pipeline(estimators)

param_grid = [
            {'normalise': [StandardScaler(), None],
             'reduce_dim': [PCA()],
             'reduce_dim__n_components': [0.9],
             'knn': [KNeighborsClassifier()], 
             'knn__n_neighbors': [5, 10, 25],
             'knn__n_jobs': [-2]},
            {'normalise': [StandardScaler(), None],
             'knn': [KNeighborsClassifier()], 
             'knn__n_neighbors': [5, 10, 25],
             'knn__n_jobs': [-2]
            }
]

In [14]:
grid = GridSearchCV(pipe, param_grid, cv=5, verbose=4)
fittedgrid = grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END knn=KNeighborsClassifier(), knn__n_jobs=-2, knn__n_neighbors=5, normalise=StandardScaler(), reduce_dim=PCA(), reduce_dim__n_components=0.9;, score=0.784 total time=  19.4s
[CV 2/5] END knn=KNeighborsClassifier(), knn__n_jobs=-2, knn__n_neighbors=5, normalise=StandardScaler(), reduce_dim=PCA(), reduce_dim__n_components=0.9;, score=0.779 total time=  22.3s
[CV 3/5] END knn=KNeighborsClassifier(), knn__n_jobs=-2, knn__n_neighbors=5, normalise=StandardScaler(), reduce_dim=PCA(), reduce_dim__n_components=0.9;, score=0.793 total time=  21.7s
[CV 4/5] END knn=KNeighborsClassifier(), knn__n_jobs=-2, knn__n_neighbors=5, normalise=StandardScaler(), reduce_dim=PCA(), reduce_dim__n_components=0.9;, score=0.796 total time=  20.4s
[CV 5/5] END knn=KNeighborsClassifier(), knn__n_jobs=-2, knn__n_neighbors=5, normalise=StandardScaler(), reduce_dim=PCA(), reduce_dim__n_components=0.9;, score=0.776 total time=  18.4s
[CV 1/5] END k

In [15]:
fittedgrid.best_estimator_

Pipeline(steps=[('normalise', StandardScaler()),
                ('reduce_dim', PCA(n_components=0.9)),
                ('knn', KNeighborsClassifier(n_jobs=-2, n_neighbors=10))])

### Best Simple Model

In [17]:
#best estimator
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

#PCA
pca = PCA(n_components=0.9)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

#KNN
best_knn = KNeighborsClassifier(n_neighbors=10, n_jobs=-2)
best_knn.fit(X_train, y_train)

KNeighborsClassifier(n_jobs=-2, n_neighbors=10)

In [18]:
#score
train_acc = best_knn.score(X_train, y_train)
test_acc = best_knn.score(X_test,y_test)

#report
print(f"Train Accuracy: {(train_acc*100).round(2)}%")
print(f"Test Accuracy: {(test_acc*100).round(2)}%")

Train Accuracy: 79.52%
Test Accuracy: 79.51%


No appreciable performance jump over the base model. 