In [1]:
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.signal import find_peaks

import sklearn.metrics
import sklearn.model_selection
# enable_halving_search_cv has to be imported before HalvingRandomSearchCV.
from sklearn.experimental import enable_halving_search_cv
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.svm import LinearSVC

import algo_ecg.data
import algo_ecg.preprocessing
from algo_ecg.feature_transformer import calculate_hrv_based_on_peak_intervals, FindPeaksCustomTransformer, select_cols


In [4]:
# Read the PhysioNet recordings under ../data together with their labels.
# num_files_to_read=1000 presumably caps how many files are read (TODO confirm
# in algo_ecg.data); the uncapped variant used previously was
# import_physionet_data('../data').
X_in, y = algo_ecg.data.import_physionet_data('../data', num_files_to_read=1000)
# Project-specific preprocessing; see algo_ecg.preprocessing for what it does.
X = algo_ecg.preprocessing.preprocess(X_in)

In [5]:
# Hold out 30% of the samples for testing. The split is seeded so that a
# "Restart & Run All" reproduces the same partition, and with it the same
# metrics further down; without random_state every run shuffles differently.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, train_size=.7, random_state=42
)

In [6]:
# Stack the per-sample slices into arrays shaped (n_samples, slice_length, 1).
# The slice length is measured on the first training sample and reused as-is
# for the test split.
slice_length = len(X_train[0])
X_train = np.reshape(np.concatenate(X_train, axis=0), (-1, slice_length, 1))
X_test = np.reshape(np.concatenate(X_test, axis=0), (-1, slice_length, 1))

In [7]:
# Wrap both splits in DataFrames (one row per sample) for the sklearn pipeline.
# NOTE(review): X_train / X_test are 3-D at this point (reshaped to
# (-1, slice_length, 1)); confirm what each DataFrame cell holds after
# from_records on such an array and that the 'peaks' step expects it.
X_train_pd = pd.DataFrame.from_records(X_train)
X_test_pd = pd.DataFrame.from_records(X_test)

In [8]:
# Sanity check: (rows, columns) of the train frame, then of the test frame.
print(X_train_pd.shape, X_test_pd.shape)

(482, 9000) (208, 9000)


In [9]:

# Classification pipeline; steps run in the order listed:
#   peaks             - project transformer FindPeaksCustomTransformer(axis=1)
#   feature_selection - keep the features picked by an L1-penalised LinearSVC
#   standardscaler    - standardise the features that survived selection
#   lr                - logistic regression with class_weight="balanced"
# NOTE(review): scaling is applied after the L1-based selection, so the
# selection itself sees unscaled features - confirm this is intended.
pipeline_steps = [
    ('peaks', FindPeaksCustomTransformer(axis=1)),
    ('feature_selection', SelectFromModel(LinearSVC(dual="auto", penalty="l1", C=0.01))),
    ('standardscaler', StandardScaler()),
    ("lr", LogisticRegression(max_iter=1000, tol=0.1, class_weight="balanced")),
]
pipe = Pipeline(steps=pipeline_steps)

In [10]:
# Search space. Pipeline parameters are addressed as '<step>__<parameter>',
# so "lr__C" is the C of the "lr" (LogisticRegression) step.
param_grid = {
    "lr__C": np.logspace(-4, 4, 4),  # four log-spaced values, 1e-4 ... 1e4
}

# Successive-halving random search, 3-fold CV, 2 parallel jobs.
# random_state pins the search's internal randomness so that the selected
# parameters and CV score are reproducible across kernel restarts.
# (GridSearchCV / RandomizedSearchCV take the same `pipe` and `param_grid`
# if an exhaustive or plain random search is preferred.)
search = HalvingRandomSearchCV(pipe, param_grid, n_jobs=2, cv=3, random_state=42)

search.fit(X_train_pd, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)




Best parameter (CV score=0.889):
{'lr__C': 10000.0}


In [13]:
from datetime import datetime

# Output file name carries today's date as YYYY-MM-DD.
date_stamp = datetime.now().strftime('%Y-%m-%d')
filename = 'model_pipeline_all_data_{}.pkl'.format(date_stamp)

In [14]:
import joblib
# Persist the whole fitted search object (not only its best estimator) under
# the date-stamped `filename`; the value returned by dump is shown as the
# cell output.
# NOTE: joblib files are pickle-based - only load them from trusted sources.
joblib.dump(search, filename)

['model_pipeline_all_data_2024-01-12.pkl']

In [15]:
# Predictions of the fitted search object on both splits, so train and test
# performance can be compared in the cells below.
y_train_pred = search.predict(X_train_pd)
y_test_pred = search.predict(X_test_pd)


In [16]:
# Threshold the test predictions once, then derive four summary numbers.
CUTOFF = .5
y_test_flag = y_test_pred > CUTOFF
y_test_flag_1d = y_test_flag.reshape((-1,))

acc = sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_test_flag)
prec = sklearn.metrics.precision_score(y_true=y_test, y_pred=y_test_flag)
# Assuming 0/1 labels: share of label-1 samples that were flagged.
rec = np.sum(y_test * y_test_flag_1d) / np.sum(y_test)
# Assuming 0/1 labels: share of label-0 samples that were NOT flagged
# (i.e. the true-negative rate, although the variable is called `sens`).
sens = np.sum((y_test + y_test_flag_1d) == 0) / np.sum(y_test == 0)
acc, prec, rec, sens

(0.8221153846153846,
 0.4230769230769231,
 0.7586206896551724,
 0.8324022346368715)

In [23]:
def output_performance_metrics(cutoff, y_true, y_pred):
    """Summarise binary-classification performance at a decision threshold.

    Args:
        cutoff: Decision threshold; a sample counts as predicted-positive
            when ``y_pred > cutoff``.
        y_true: Ground-truth labels. The hand-computed rates below assume
            the labels are encoded as 0 (negative) / 1 (positive).
        y_pred: Predicted labels or scores, one per sample. Array-likes of
            shape (n,) or (n, 1) are accepted; the values are flattened.

    Returns:
        Tuple ``(acc, prec, rec, sens)``:
            acc  - accuracy (sklearn.metrics.accuracy_score)
            prec - precision (sklearn.metrics.precision_score)
            rec  - share of label-1 samples predicted positive (recall)
            sens - share of label-0 samples predicted negative, i.e. the
                   true-negative rate; the position and historical name
                   are kept so existing callers keep unpacking four values.

        ``rec`` / ``sens`` are NaN (with a NumPy warning) when ``y_true``
        contains no positives / no negatives respectively.
    """
    y_true = np.asarray(y_true).reshape((-1,))
    # Threshold with the caller's `cutoff`, not the notebook-level CUTOFF
    # global, so that different thresholds can actually be compared.
    y_flag = np.asarray(y_pred).reshape((-1,)) > cutoff

    acc = sklearn.metrics.accuracy_score(y_true=y_true, y_pred=y_flag)
    prec = sklearn.metrics.precision_score(y_true=y_true, y_pred=y_flag)
    # label 1 and flagged  -> product is 1; divide by number of positives
    rec = np.sum(y_true * y_flag) / np.sum(y_true)
    # label 0 and unflagged -> sum is 0; divide by number of negatives
    true_negative_rate = np.sum((y_true + y_flag) == 0) / np.sum(y_true == 0)
    return acc, prec, rec, true_negative_rate

In [24]:
# Held-out (test) split: (acc, prec, rec, sens) at a 0.5 threshold.
output_performance_metrics(cutoff=.5, y_true=y_test, y_pred=y_test_pred)

(0.8221153846153846,
 0.4230769230769231,
 0.7586206896551724,
 0.8324022346368715)

In [25]:
# Training split: (acc, prec, rec, sens) at a 0.5 threshold, for comparison
# with the held-out numbers.
output_performance_metrics(cutoff=.5, y_true=y_train, y_pred=y_train_pred)

(0.7946058091286307, 0.381294964028777, 0.803030303030303, 0.7932692307692307)