In [1]:
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
# plt.style.use(['dark_background'])
# plt.rcParams["figure.figsize"] = (10,7)
from typing import List, Union
import os

# Directory holding the `ft_segments_<N>s.csv` inputs. Built with os.path.join
# so the literal no longer relies on invalid escape sequences (`\p`, `\c`) and
# the separator matches the host OS; on Windows the resulting string is
# unchanged.
data_dir = os.path.join('data', 'physionet_dbs', 'afdb', 'clean_segments_training_dft_125hz')

# Directory receiving the generated Excel tables; created on first run.
tables_dir = 'Tables'
os.makedirs(tables_dir, exist_ok=True)


In [3]:
def sns_palette_map(input_string: str):
    """Translate a one-letter colour code into a seaborn palette name.

    The lookup is case-insensitive: 'g' -> 'Greens', 'r' -> 'Reds'.
    Any other code yields None.
    """
    palette_by_code = {'g': 'Greens', 'r': 'Reds'}
    return palette_by_code.get(input_string.lower())

def distribution_histogram_box_plots(df: pd.DataFrame,
                                     ft_feature_name: str,
                                     class_feature_name: str,
                                     classes: List[Union[str, int]],
                                     classes_colors: List[str],
                                     save_path: str = None
                                     ):
    """Draw per-class distributions of a feature above a notched box plot.

    Parameters
    ----------
    df : DataFrame containing `ft_feature_name` and `class_feature_name`.
    ft_feature_name : column whose values are plotted.
    class_feature_name : column holding the class label of each row.
    classes : class labels to plot, in display order.
    classes_colors : one colour code per entry of `classes`, in the same
        order; each code is resolved through `sns_palette_map`.
    save_path : where to write the figure. When None (the default) the figure
        is only shown, not saved.

    Raises
    ------
    ValueError : if `classes` is empty.
    """
    if not classes:
        raise ValueError('classes must contain at least one class label')

    # Reorder the rows so they follow the order of `classes`; the box plot
    # assigns palette colours in order of appearance, so this keeps each class
    # paired with its colour. pd.concat replaces DataFrame.append, which was
    # removed in pandas 2.0.
    df = pd.concat(
        [df[df[class_feature_name].isin([class_label])] for class_label in classes],
        ignore_index=True,
    )

    # Resolve the colour of every class once instead of on each loop iteration.
    class_colors = [
        sns.color_palette(sns_palette_map(color_code))[3]
        for color_code in classes_colors
    ]

    # The axes created by plt.subplots is only a placeholder; it is hidden
    # below and the real panels are laid out on the gridspec.
    fig1, ax = plt.subplots(constrained_layout=False)
    gs = fig1.add_gridspec(2, 1, height_ratios=[5,1])

    ax1 = fig1.add_subplot(gs[0, 0])

    # NOTE(review): sns.distplot is deprecated in seaborn; it is kept here so
    # the rendered figure does not change. Consider migrating to histplot.
    for class_position, class_label in enumerate(classes):
        ax1 = sns.distplot(
            df[df[class_feature_name] == class_label][ft_feature_name],
            color = class_colors[class_position],
            rug = False,
            kde_kws = {'alpha': .7},
            ax = ax1
        )

    ax2 = fig1.add_subplot(gs[1, 0])
    sns.boxplot(
        data=df,
        x=ft_feature_name,
        y=class_feature_name,
        notch=True,
        ax = ax2,
        palette = class_colors
    )

    sns.despine()
    ax.set_axis_off()
    plt.tight_layout()
    # save_path is optional: only write the file when a path was supplied.
    if save_path is not None:
        fig1.savefig(save_path)
    plt.show()
    plt.close(fig1)


In [2]:
# NOTE: ft_coefficients features start from 0 to match with the dataframe index and simplify operations

# Maps segment length (seconds) -> {ft_coefficient index: pbc_r} for the five
# coefficients with the highest signed correlation in that dataset.
best_pbc_corrs_by_dataset = dict()

for segments_length_seconds in [3,4,5,6,7,8,9,10]:
    file = f'ft_segments_{segments_length_seconds}s.csv'
    print(file)

    dataset = pd.read_csv(os.path.join(data_dir, file))

    # Keep only rows labelled 'N' or 'AFIB' and encode the label as 0/1.
    # The explicit copy avoids mutating a filtered slice, and the mapping is
    # restricted to the 'episode' column so other columns are left untouched.
    dataset = dataset[dataset['episode'].isin(['N', 'AFIB'])].copy()
    dataset['episode'] = dataset['episode'].map({'N': 0, 'AFIB': 1})

    # NOTE(review): eval() executes whatever the CSV cell contains. It is kept
    # because the serialisation format of 'segment_fft' is not visible here;
    # switch to ast.literal_eval if the cells are plain Python literals.
    dataset['segment_fft'] = dataset['segment_fft'].apply(eval)

    # Expand the per-row coefficient list into one column per coefficient.
    # All columns are built in a single frame and concatenated once, instead
    # of inserting them one at a time (which fragments the DataFrame).
    n_coefficients = len(dataset.iloc[0]['segment_fft'])
    coefficient_names = [f'ft_coefficient_{i}' for i in range(n_coefficients)]
    coefficients = pd.DataFrame(
        dataset['segment_fft'].tolist(), index = dataset.index
    ).iloc[:, :n_coefficients]
    coefficients.columns = coefficient_names
    dataset = pd.concat(
        [dataset.drop(columns = coefficient_names, errors = 'ignore'), coefficients],
        axis = 1
    )

    pbc_corrs_list_to_df = list()

    for ft_coefficient_iter, coefficient_name in enumerate(coefficient_names):

        # Pearson's r against a 0/1 variable is the point-biserial correlation.
        pbc_r, p_val = stats.pearsonr(dataset[coefficient_name], dataset['episode'])

        pbc_corrs_list_to_df.append(dict(
                ft_coefficient = ft_coefficient_iter,
                pbc_r = round(pbc_r, 4),
                p_val = round(p_val, 4),
        ))

    # NOTE(review): the ranking uses the signed coefficient, so strong negative
    # correlations sort last — confirm this is intended.
    pbc_df = pd.DataFrame(pbc_corrs_list_to_df).sort_values(by = 'pbc_r', ascending = False)
    print(pbc_df.head())

    # The index labels equal the coefficient numbers (see NOTE at the top).
    best_pbc_corrs_by_dataset[segments_length_seconds] = pbc_df['pbc_r'].iloc[:5].to_dict()

    pbc_df.to_excel(
        os.path.join(tables_dir, f'afdb_{file.replace(".csv","")}_pbc.xlsx'),
        index = False
    )

ft_segments_3s.csv
    ft_coefficient   pbc_r  p_val
51              51  0.3798    0.0
50              50  0.3578    0.0
49              49  0.3361    0.0
48              48  0.3170    0.0
25              25  0.3054    0.0
ft_segments_4s.csv
    ft_coefficient   pbc_r  p_val
67              67  0.3778    0.0
66              66  0.3613    0.0
65              65  0.3419    0.0
64              64  0.3254    0.0
46              46  0.3197    0.0
ft_segments_5s.csv
    ft_coefficient   pbc_r  p_val
83              83  0.3798    0.0
82              82  0.3615    0.0
81              81  0.3439    0.0
58              58  0.3316    0.0
59              59  0.3312    0.0
ft_segments_6s.csv
    ft_coefficient   pbc_r  p_val
99              99  0.3773    0.0
98              98  0.3615    0.0
97              97  0.3485    0.0
69              69  0.3398    0.0
70              70  0.3389    0.0
ft_segments_7s.csv
     ft_coefficient   pbc_r  p_val
119             119  0.4011    0.0
118             118

In [8]:
# Collect, for every per-dataset correlation table, the mean absolute
# correlation across all coefficients.
avg_pbc_corrs_list_to_df = list()

# sorted() makes the row order independent of the file system's listing order.
for file in sorted(os.listdir(tables_dir)):
    # Only read the per-dataset tables (`afdb_..._pbc.xlsx`). A bare 'afdb'
    # prefix check would also match the summary workbook written at the end of
    # this cell, which has no 'pbc_r' column and would break a re-run.
    if not (file.startswith('afdb') and file.endswith('_pbc.xlsx')):
        continue

    pbc_table_df = pd.read_excel(os.path.join(tables_dir, file))
    avg_pbc_corrs_list_to_df.append(dict(
            # e.g. 'afdb_ft_segments_3s_pbc.xlsx' -> '3s'
            segments_length = file.replace('.xlsx', '').split('_')[-2],
            avg_pbc_r = pbc_table_df['pbc_r'].abs().mean()
    ))

pd.DataFrame(avg_pbc_corrs_list_to_df).to_excel(
    os.path.join(tables_dir, 'afdb_avg_pbc_corrs_per_segment_length.xlsx'),
    index = False
)


