In [4]:
import ast
import os

import imblearn
import imblearn.under_sampling
import pandas as pd
import sklearn
import sklearn.preprocessing
from scipy import stats

# Input and output locations, relative to the notebook's working directory.
# os.path.join picks the platform's separator, so the notebook is no longer
# Windows-only, and it avoids the invalid escape sequences ('\p', '\c', '\m',
# '\D') that the hand-written backslash literals contained.
train_data_dir = os.path.join('data', 'physionet_dbs', 'afdb', 'clean_segments_training_dft_125hz')
test_data_dir = os.path.join('data', 'physionet_dbs', 'afdb', 'clean_segments_testing_dft_125hz')
write_data_dir = os.path.join('data', 'models_data', 'afdb', 'DS2')

# exist_ok=True makes the cell safely re-runnable without a separate
# exists() check (which could race with another process creating the folder).
os.makedirs(write_data_dir, exist_ok=True)

In [5]:
def undersampling_dict_generation(y_):
    """Build a balanced per-class target-count mapping from a label vector.

    Intended to be passed as a callable ``sampling_strategy`` to an
    imbalanced-learn undersampler.

    Parameters
    ----------
    y_ : array-like
        Class labels, one entry per sample.

    Returns
    -------
    dict
        ``{class_label: n}`` with one entry per distinct label in ``y_``,
        where ``n`` is the number of samples of the least frequent class.
        Empty when ``y_`` is empty.
    """
    # Count the classes once; the minority size is the same for every key,
    # so there is no need to recompute value_counts() per class label.
    class_counts = pd.Series(y_).value_counts()
    minority_count = class_counts.min()
    return {class_label: minority_count for class_label in class_counts.index}

In [7]:

# Fixed seed for the random undersampler so that Restart & Run All regenerates
# the same train/test files on every run.
RANDOM_SEED = 42
# Integer encoding of the two rhythm classes that are kept for modelling.
EPISODE_LABELS = {'N': 0, 'AFIB': 1}
# Minimum |r| between a feature and the label for the feature to be kept.
MIN_ABS_CORRELATION = 0.1


def _load_labelled_segments(csv_path):
    """Read one segments CSV and expand its DFT list into numeric columns.

    Rows whose ``episode`` is not a key of ``EPISODE_LABELS`` are dropped and
    the remaining labels are integer-encoded. The stringified list stored in
    ``segment_fft`` is parsed and replaced by ``dft_coefficient_<i>`` columns.
    The number of coefficient columns is taken from the first remaining row;
    longer rows are truncated to that length.

    Raises
    ------
    ValueError
        If the file contains no row labelled with one of ``EPISODE_LABELS``.
    """
    dataset = pd.read_csv(csv_path)
    # .copy() detaches the filtered rows so the assignments below never write
    # into a slice of the freshly read frame (SettingWithCopyWarning).
    dataset = dataset[dataset['episode'].isin(list(EPISODE_LABELS))].copy()
    if dataset.empty:
        raise ValueError(f'no N/AFIB segments found in {csv_path}')

    # Encode only the label column: a frame-wide replace() would also rewrite
    # any other cell that happens to equal 'N' or 'AFIB'.
    dataset['episode'] = dataset['episode'].map(EPISODE_LABELS)

    # ast.literal_eval parses the stored list literal without executing it;
    # eval() would run whatever expression the CSV cell contains.
    spectra = dataset.pop('segment_fft').apply(ast.literal_eval)
    n_coefficients = len(spectra.iloc[0])

    # Build every coefficient column in a single frame instead of inserting
    # them one by one, which fragments the DataFrame and is much slower.
    coefficients = pd.DataFrame(
        [spectrum[:n_coefficients] for spectrum in spectra],
        index=dataset.index,
        columns=[f'dft_coefficient_{i}' for i in range(n_coefficients)],
    )
    return pd.concat([dataset, coefficients], axis=1)


for segments_length_seconds in [3, 4, 5, 6, 7, 8, 9, 10]:
    file = f'ft_segments_{segments_length_seconds}s.csv'
    print(file)

    train_dataset = _load_labelled_segments(os.path.join(train_data_dir, file))
    test_dataset = _load_labelled_segments(os.path.join(test_data_dir, file))

    # Feature selection is fitted on the training split only: keep every
    # column whose correlation with the binary label (Pearson r against a 0/1
    # variable, i.e. point-biserial) is at least MIN_ABS_CORRELATION in
    # absolute value. A NaN correlation (e.g. constant column) fails the
    # comparison and is therefore dropped.
    # NOTE(review): every non-label column of the CSV is a candidate here, not
    # only dft_coefficient_* columns - confirm the files carry no id/index
    # columns that could leak into the feature set.
    selected_features = ['episode']
    for candidate_feature in train_dataset.columns:
        if candidate_feature == 'episode':
            continue  # the label itself is not a candidate feature

        pbc_r, _ = stats.pearsonr(train_dataset[candidate_feature], train_dataset['episode'])
        if abs(pbc_r) >= MIN_ABS_CORRELATION:
            selected_features.append(candidate_feature)

    # Apply the training-derived selection to both splits.
    train_dataset = train_dataset[selected_features]
    test_dataset = test_dataset[selected_features]

    # knn undersampler did not accept a dictionary as sampling_strategy even though
    # the docs say it does, thus opting for random undersampling for initial models
    random_undersampler = imblearn.under_sampling.RandomUnderSampler(
        sampling_strategy=undersampling_dict_generation,
        random_state=RANDOM_SEED,
    )

    y_train = train_dataset['episode']
    X_train = train_dataset.drop(columns='episode')

    y_test = test_dataset['episode']
    X_test = test_dataset.drop(columns='episode')

    X_train, y_train = random_undersampler.fit_resample(X_train, y_train)
    # NOTE(review): the test split is balanced by undersampling as well, so
    # metrics computed on it reflect a 50/50 class prior - confirm this is intended.
    X_test, y_test = random_undersampler.fit_resample(X_test, y_test)

    # The scaler is fitted on the training split only and reused for the test split.
    standard_scaler = sklearn.preprocessing.StandardScaler()

    X_train = pd.DataFrame(standard_scaler.fit_transform(X_train), columns=X_train.columns)
    X_test = pd.DataFrame(standard_scaler.transform(X_test), columns=X_test.columns)

    # The scaled frames carry a fresh 0..n-1 index, so the labels are
    # re-attached by position after dropping their resampled index.
    X_train['episode'] = y_train.reset_index(drop=True)
    X_test['episode'] = y_test.reset_index(drop=True)

    train = X_train
    test = X_test

    train.to_csv(
        os.path.join(write_data_dir, f'segments_{segments_length_seconds}s_train.csv'),
        index=False,
    )
    test.to_csv(
        os.path.join(write_data_dir, f'segments_{segments_length_seconds}s_test.csv'),
        index=False,
    )


ft_segments_3s.csv
ft_segments_4s.csv
ft_segments_5s.csv
ft_segments_6s.csv
ft_segments_7s.csv
ft_segments_8s.csv
ft_segments_9s.csv
ft_segments_10s.csv
