In [1]:
import ast
import os

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import wfdb

from app.ecg.ecg import Datasets, EcgSignal

# 1. Исследовательский анализ данных



## 1.1 Данные: PTBXL 
[Источник](https://physionet.org/content/ptb-xl/1.0.3/)

- `ptbxl_data` — общий набор данных со ссылками на записи ЭКГ
- `ptbxl_scp_statements` — набор данных с описанием диагнозов

In [2]:
# Use seaborn's "whitegrid" style for the figures that follow.
sns.set_style("whitegrid")


In [3]:
# NOTE(review): this replaces the "whitegrid" style chosen in the previous cell;
# the plotting cells further down switch back to "whitegrid" themselves.
sns.set_style("darkgrid")

In [4]:
# Read both PTB-XL CSV tables; the first CSV column becomes the index of each frame.
ptbxl_data = pd.read_csv(Datasets.ptbxl.path, index_col=0)
ptbxl_statements = pd.read_csv(Datasets.ptbxl_scp_statements.path, index_col=0)

In [5]:
# Preview the first rows of the record table.
ptbxl_data.head()

In [6]:
# Display the SCP statements table.
ptbxl_statements

In [7]:
# Row count of the statements table, reported as the number of unique heart conditions.
print("Всего уникальных состояний здоровья сердца в датасете:", len(ptbxl_statements))

In [8]:
# Distinct values of the diagnostic_class column (shown as the cell output).
print("Общие уникальные классы:")
ptbxl_statements['diagnostic_class'].unique()

#### Основные классы:
- STTC (ST/T Change - **изменения в ST/T-сегменте**), 5235 записей
- NORM (Normal ECG - **нормальная ЭКГ**), 9514 записей
- MI (Myocardial Infarction - **инфаркт миокарда**), 5469 записей
- HYP (Hypertrophy - **гипертрофия**), 2649 записей
- CD (Conduction Disturbance - **нарушение проводимости**), 4898 записей




In [9]:
# Print column dtypes and non-null counts of the record table.
ptbxl_data.info()

In [10]:
# Raw scp_codes column, before a later cell parses it with ast.literal_eval.
ptbxl_data['scp_codes']

In [11]:
# Number of distinct scp_codes values.
ptbxl_data['scp_codes'].nunique()

In [12]:
# Scratch check on a hand-written dict: print the 'NORM' entry only when its value exceeds 80.
d = {'NORM': 100.0, 'SR': 0.0}
for k, v in d.items():
    if k == 'NORM' and v > 80:
        print(k, v)

In [13]:
# Print column dtypes and non-null counts of the statements table.
ptbxl_statements.info()

In [14]:
# NOTE(review): repeats the output of the earlier cell that displays ptbxl_statements.
ptbxl_statements

In [15]:
# Cast the columns to the types used by the rest of the analysis.

# scp_codes holds Python-literal text that is parsed into objects here. Only
# string values are parsed, so re-running this cell after the column has
# already been converted no longer raises inside ast.literal_eval.
ptbxl_data['scp_codes'] = ptbxl_data['scp_codes'].apply(
    lambda codes: ast.literal_eval(codes) if isinstance(codes, str) else codes
)
ptbxl_data['patient_id'] = ptbxl_data['patient_id'].astype(int)
# Nullable Int64 keeps missing entries as <NA> instead of forcing a float column.
ptbxl_data['nurse'] = ptbxl_data['nurse'].astype('Int64')
ptbxl_data['site'] = ptbxl_data['site'].astype('Int64')
ptbxl_data['validated_by'] = ptbxl_data['validated_by'].astype('Int64')

# Keep only the statements whose `diagnostic` flag equals 1.
ptbxl_scp_data = ptbxl_statements[ptbxl_statements['diagnostic'] == 1]

In [16]:
# DataFrame.info() prints its report and returns None, so passing its result to
# display() rendered a stray "None". Print the summary first, then show the preview.
ptbxl_data.info()
display(ptbxl_data.head())

In [17]:
# Number of records per strat_fold value.
ptbxl_data['strat_fold'].value_counts()

In [18]:
def diagnostic_class(scp, statements=None):
    """Collect the diagnostic classes referenced by one record's SCP codes.

    Args:
        scp: Mapping whose keys are SCP statement codes (the values are ignored).
        statements: Table indexed by SCP code with a ``diagnostic_class``
            column. Defaults to the notebook-level ``ptbxl_scp_data``.

    Returns:
        Sorted list of the distinct ``diagnostic_class`` values of the codes
        present in ``statements``; codes missing from the table are skipped.
    """
    if statements is None:
        statements = ptbxl_scp_data
    classes = {
        statements.loc[code, 'diagnostic_class']
        for code in scp.keys()
        if code in statements.index
    }
    # A set of strings iterates in a hash-dependent order that changes between
    # kernel sessions; sorting makes the result reproducible.
    return sorted(classes, key=str)
                    
# Store, for every record, the list returned by diagnostic_class for its scp_codes.
ptbxl_data['scp_classes'] = ptbxl_data.scp_codes.apply(diagnostic_class)

In [19]:
# Re-check the table summary now that the scp_classes column has been added.
ptbxl_data.info()

In [20]:
# Inspect the derived scp_classes column.
ptbxl_data['scp_classes']

In [21]:
def load_raw_data(df, sampling_rate, path):
    """Read the signal of every record listed in ``df`` into one array.

    Args:
        df: Table providing the ``filename_lr`` and ``filename_hr`` columns.
        sampling_rate: ``100`` selects the ``filename_lr`` column; any other
            value selects ``filename_hr``.
        path: Folder that the file names in ``df`` are joined onto.

    Returns:
        ``np.ndarray`` built from the first element of each ``wfdb.rdsamp``
        result, in the order of the rows of ``df``.
    """
    filenames = df.filename_lr if sampling_rate == 100 else df.filename_hr
    # wfdb.rdsamp returns a pair; only its first element (the signal) is kept.
    signals = [wfdb.rdsamp(os.path.join(path, name))[0] for name in filenames]
    return np.array(signals)

sampling_rate = 100
# Folder that the record file names are resolved against. The path used to be
# hard-coded to one developer's machine; it can now be overridden through the
# PTBXL_ROOT environment variable, while the previous location stays the default.
ptbxl_root = os.environ.get(
    'PTBXL_ROOT',
    r'C:\Users\User\PycharmProjects\ecg-service\data\ptbxl\ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3',
)
ptbxl_ecg_data = load_raw_data(ptbxl_data, sampling_rate, ptbxl_root)
ptbxl_ecg_data.shape

In [22]:
# Display the loaded signal array (NumPy abbreviates the repr of large arrays).
ptbxl_ecg_data

In [23]:
# Plot every column of the record at index 2 on its own subplot.
sample = ptbxl_ecg_data[2]
# One row of axes per column of `sample`; `bar` receives the Figure object.
bar, axes = plt.subplots(sample.shape[1], 1, figsize=(30,20))
for i in range(sample.shape[1]):
    # x is the row position within the record, y the values of column i.
    sns.lineplot(x=np.arange(sample.shape[0]), y=sample[:, i], ax=axes[i])
# plt.tight_layout()
plt.show()

In [24]:
# Export the plotted record as plain text. The target folder used to be a fixed
# path on one machine; it can now be overridden through the ECG_EXPORT_DIR
# environment variable (the previous folder stays the default) and is created
# when missing so the cell does not fail on a fresh checkout.
ecg_export_dir = os.environ.get('ECG_EXPORT_DIR', r'C:\Users\redmi\Desktop\ecgs')
os.makedirs(ecg_export_dir, exist_ok=True)
np.savetxt(os.path.join(ecg_export_dir, 'ecg_2.txt'), sample)


In [None]:
# NOTE(review): `sample` is not reassigned after it was set to ptbxl_ecg_data[2],
# so this appears to write that same record under the name ecg_3.txt — confirm
# which record was intended. The absolute path is also machine-specific.
np.savetxt(r'C:\Users\redmi\Desktop\ecgs\ecg_3.txt', sample)

In [None]:
# Draw missingno's matrix plot for the record table to inspect missing values.
import missingno as msno

msno.matrix(ptbxl_data)
plt.show()

In [25]:
# Load scp_statements.csv for diagnostic aggregation and keep only the rows
# whose `diagnostic` flag equals 1.
# NOTE(review): this reads the same file and applies the same filter as the
# earlier ptbxl_statements / ptbxl_scp_data cells.
agg_df = pd.read_csv(Datasets.ptbxl_scp_statements.path, index_col=0)
agg_df = agg_df[agg_df.diagnostic == 1]
print(agg_df.shape)
agg_df.head()

In [26]:
def aggregate_supclass_diagnostic(y_dic, statements=None):
    """Map one record's SCP codes to its diagnostic superclasses.

    Args:
        y_dic: Mapping whose keys are SCP statement codes (the values are ignored).
        statements: Table indexed by SCP code with a ``diagnostic_class``
            column. Defaults to the notebook-level ``agg_df``.

    Returns:
        Sorted list of the distinct ``diagnostic_class`` values of the codes
        present in ``statements``; codes missing from the table are skipped.
    """
    if statements is None:
        statements = agg_df
    superclasses = {
        statements.loc[code, 'diagnostic_class']
        for code in y_dic.keys()
        if code in statements.index
    }
    # list(set(...)) yields a hash-dependent order that differs between kernel
    # sessions; sorting makes the labels (and everything derived from their
    # order) reproducible.
    return sorted(superclasses, key=str)
    
# Apply diagnostic superclass: one list of superclasses per record, plus its length.
ptbxl_data['diagnostic_superclass'] = ptbxl_data.scp_codes.apply(aggregate_supclass_diagnostic)
ptbxl_data['diagnostic_superclass_len'] = ptbxl_data['diagnostic_superclass'].apply(len)
# Show the records that carry more than one superclass.
ptbxl_data.loc[ptbxl_data.diagnostic_superclass_len > 1, 'diagnostic_superclass']

In [27]:
vc = ptbxl_data['diagnostic_superclass_len'].value_counts()
sns.set_style("whitegrid")
bar,ax = plt.subplots(figsize=(10,6))
# Vertical bars: x carries the number of superclasses per record, y the share of records in percent.
ax = sns.barplot(x=vc.index, y=vc.values/vc.values.sum()*100., legend=False, errorbar=None, palette="crest",orient='v' )
ax.set_title("Распределение длины суперкласса", fontsize=20)
# The two axis labels were swapped relative to the data plotted on each axis.
ax.set_xlabel("Длина суперклассов")
ax.set_ylabel("Проценты по всем записям")

In [28]:
def aggregate_subclass_diagnostic(y_dic, statements=None):
    """Map one record's SCP codes to its prefixed diagnostic subclasses.

    Args:
        y_dic: Mapping whose keys are SCP statement codes (the values are ignored).
        statements: Table indexed by SCP code with a ``diagnostic_subclass``
            column. Defaults to the notebook-level ``agg_df``.

    Returns:
        Sorted list of the distinct ``diagnostic_subclass`` values of the codes
        present in ``statements``, each prefixed with ``'sub_'``.
    """
    if statements is None:
        statements = agg_df
    subclasses = {
        statements.loc[code, 'diagnostic_subclass']
        for code in y_dic.keys()
        if code in statements.index
    }
    # Sorting replaces the hash-dependent order of list(set(...)), which changed
    # between kernel sessions. The 'sub_' prefix distinguishes subclass columns
    # from superclass columns.
    return ['sub_' + name for name in sorted(subclasses)]

# Apply diagnostic subclass: one list of 'sub_'-prefixed subclasses per record, plus its length.
ptbxl_data['diagnostic_subclass'] = ptbxl_data.scp_codes.apply(aggregate_subclass_diagnostic)
ptbxl_data['diagnostic_subclass_len'] = ptbxl_data['diagnostic_subclass'].apply(len)
# Show the records that carry more than one subclass.
ptbxl_data.loc[ptbxl_data.diagnostic_subclass_len > 1, 'diagnostic_subclass']

In [29]:
vc = ptbxl_data['diagnostic_subclass_len'].value_counts()

sns.set_style("whitegrid")
bar,ax = plt.subplots(figsize=(10,6))
# `ci` is deprecated in seaborn; `errorbar=None` is the current spelling and
# matches the superclass-length plot in this notebook.
ax = sns.barplot(x=vc.values/vc.values.sum()*100., y=vc.index, errorbar=None, palette="crest",orient='h' )
ax.set_title("Распределение длины подклассов", fontsize=20)
ax.set_xlabel ("percentage over all samples")
ax.set_ylabel ("diagnostic_subclass_len")
# Write each bar's percentage at the end of the bar.
for rect in ax.patches:
    ax.text (rect.get_width(), rect.get_y() + rect.get_height() / 2,"%.1f%%"% rect.get_width(), weight='bold' )

In [30]:
# Flatten the per-record class lists into one long Series each.
all_superclass = pd.Series(np.concatenate(ptbxl_data['diagnostic_superclass'].values))
all_subclass = pd.Series(np.concatenate(ptbxl_data['diagnostic_subclass'].values))
# Distinct labels in order of first appearance.
superclass_cols = all_superclass.unique()
subclass_cols = all_subclass.unique()
update_cols = np.concatenate([superclass_cols, subclass_cols]) # superclass labels followed by subclass labels
meta_cols = ['age', 'sex', 'height', 'weight', 'nurse', 'site', 'device',] # could add more columns as features


In [31]:
class ClassUpdate():
    """Row-wise callable that turns a record's class lists into indicator values.

    For every label found in the row's ``diagnostic_superclass`` and
    ``diagnostic_subclass`` lists, the row entry with that label is set to 1.
    """

    def __init__(self, cols):
        # Kept on the instance for reference; __call__ does not consult it.
        self.cols = cols

    def __call__(self, row):
        """Set ``row[label] = 1`` for each listed super-/subclass and return the row."""
        for sc in row['diagnostic_superclass']:
            row[sc] = 1
        for sc in row['diagnostic_subclass']:
            row[sc] = 1

        return row

def get_data_by_folds(folds, x, y, update_cols, feature_cols):
    """Select the records of the given folds and build their indicator columns.

    Args:
        folds: Non-empty collection of ``strat_fold`` values to keep.
        x: Array indexable with a boolean mask aligned with the rows of ``y``.
        y: DataFrame with ``strat_fold``, ``diagnostic_superclass`` and
            ``diagnostic_subclass`` columns plus the columns in ``feature_cols``.
        update_cols: Labels that become 0/1 indicator columns.
        feature_cols: Columns of ``y`` to carry over unchanged.

    Returns:
        Tuple ``(x_selected, y_selected)`` where ``y_selected`` holds
        ``feature_cols``, then ``update_cols``, then ``strat_fold``.

    Raises:
        AssertionError: If ``folds`` is empty.
    """
    # The previous message asked for "longer than 1" although the check only
    # requires at least one fold.
    assert len(folds) > 0, 'at least one fold must be provided'
    filt = np.isin(y.strat_fold.values, folds)
    x_selected = x[filt]
    # Take an explicit copy: the indicator columns are added to this frame, and
    # writing into a filtered selection triggers SettingWithCopyWarning.
    y_selected = y[filt].copy()

    # Start every indicator at 0; ClassUpdate flips the relevant ones to 1.
    for sc in update_cols:
        y_selected[sc] = 0

    cls_updt = ClassUpdate(update_cols)

    y_selected = y_selected.apply(cls_updt, axis=1)

    return x_selected, y_selected[list(feature_cols)+list(update_cols)+['strat_fold']]

In [32]:
# Build the signal array and the label/feature table for folds 1 through 10.
x_all, y_all = get_data_by_folds(np.arange(1, 11), ptbxl_ecg_data, ptbxl_data, update_cols, meta_cols)


In [33]:
# Inspect the resulting label/feature table.
y_all

In [34]:
# Share of all records (in percent) that carry each diagnostic superclass.
vc = y_all[superclass_cols].sum(axis=0)
sns.set_style("whitegrid")
bar, ax = plt.subplots(figsize=(10, 6))
ax = sns.barplot(
    x=vc.values / y_all.shape[0] * 100.,
    y=vc.index,
    palette="crest",
    orient='h',
)
ax.set_title("Распределение суперклассов", fontsize=20)
ax.set_xlabel("Процент записей")
ax.set_ylabel("Диагностический суперкласс")
# Write each bar's percentage at the end of the bar, vertically centred.
for rect in ax.patches:
    ax.text(
        rect.get_width(),
        rect.get_y() + rect.get_height() / 2,
        f"{rect.get_width():.1f}%",
        weight='bold',
    )

In [42]:
# Share of all records (in percent) that carry each diagnostic subclass.
vc = y_all[subclass_cols].sum(axis=0)

sns.set_style("whitegrid")
bar, ax = plt.subplots(figsize=(10, 6))
ax = sns.barplot(
    x=vc.values / y_all.shape[0] * 100.,
    y=vc.index,
    palette="crest",
    orient='h',
)
ax.set_title("Распределение диагностических подклассов", fontsize=20)
ax.set_xlabel("Процент записей")
ax.set_ylabel("Подкласс")
# Write each bar's percentage at the end of the bar, vertically centred.
for rect in ax.patches:
    ax.text(
        rect.get_width(),
        rect.get_y() + rect.get_height() / 2,
        f"{rect.get_width():.1f}%",
        weight='bold',
    )

In [36]:
# One figure per superclass: subclass counts among the records of that
# superclass, expressed as a percentage of ALL records (y_all.shape[0]).
for superclass in superclass_cols:
    
    vc = y_all.loc[y_all[superclass] == 1][subclass_cols].sum(axis=0)
    sns.set_style("whitegrid")
    bar,ax = plt.subplots(figsize=(10,6))
    # `ci` is deprecated in seaborn; `errorbar=None` is the current spelling.
    ax = sns.barplot(x=vc.values/y_all.shape[0]*100., y=vc.index, errorbar=None, palette="muted",orient='h' )
    ax.set_title("{} подклассы".format(superclass), fontsize=20)
    ax.set_xlabel ("Процент записей")
    ax.set_ylabel ("Диагностический подкласс")
    # Write each bar's percentage at the end of the bar.
    for rect in ax.patches:
        ax.text (rect.get_width(), rect.get_y() + rect.get_height() / 2,"%.1f%%"% rect.get_width(), weight='bold' )

In [37]:
fig, ax = plt.subplots(figsize=(10,6))
plt.title('Возрастное распределение суперклассов')

# Overlay the age distribution of the records belonging to each superclass.
for superclass in superclass_cols:
    data = y_all.loc[y_all[superclass] == 1]['age']
    # sns.distplot is deprecated; a density-normalised histplot with a KDE
    # overlay is the replacement described in seaborn's migration guide.
    sns.histplot(data, kde=True, stat="density", kde_kws=dict(cut=3),
                 alpha=.4, edgecolor=(1, 1, 1, .4), label=superclass, ax=ax)


plt.legend(loc='upper left')
plt.show()


In [38]:
fig, ax = plt.subplots(figsize=(10,6))
plt.title('Распределение роста суперклассов')

# Overlay the height distribution of the records belonging to each superclass.
for superclass in superclass_cols:
    # A Series (not a one-column DataFrame) so that the per-call label is the
    # legend entry, the same way the age plot is built.
    data = y_all.loc[y_all[superclass] == 1]['height']
    # sns.distplot is deprecated; a density-normalised histplot with a KDE
    # overlay is the replacement described in seaborn's migration guide.
    sns.histplot(data, kde=True, stat="density", kde_kws=dict(cut=3),
                 alpha=.4, edgecolor=(1, 1, 1, .4), label=superclass, ax=ax)

plt.legend(loc='upper left')
plt.show()

In [39]:

sns.set_style("whitegrid")
bar, ax = plt.subplots(figsize=(10, 6))

ax.set_title("Распределение суперклассов в разрезе пола", fontsize=20)

# For every distinct `sex` value, collect the superclass counts of its records
# as a percentage of all records, in long format (label, percentage, group).
all_index, all_count, all_values = [], [], []
for sex in y_all.sex.unique():
    vc = y_all.loc[y_all.sex == sex][superclass_cols].sum(axis=0)
    all_index.extend(vc.index)
    all_count.extend(vc.values / y_all.shape[0] * 100.)
    all_values.extend([sex] * len(vc))

df = pd.DataFrame({
    'diagnositic superclass': all_index,
    'percentage over all samples': all_count,
    'sex': all_values,
})

ax = sns.barplot(
    data=df,
    x="percentage over all samples",
    y="diagnositic superclass",
    hue="sex",
    orient='h',
)
# Write each bar's percentage at the end of the bar, vertically centred.
for rect in ax.patches:
    ax.text(
        rect.get_width(),
        rect.get_y() + rect.get_height() / 2,
        f"{rect.get_width():.1f}%",
        weight='bold',
    )

In [40]:

sns.set_style("whitegrid")
bar, ax = plt.subplots(figsize=(10, 20))

ax.set_title("Diagnositic Superclass Distribution of Different Nurse", fontsize=20)

# For every distinct `nurse` value, collect the superclass counts of its records
# as a percentage of all records, in long format (label, percentage, group).
# NOTE(review): if `nurse` contains missing values, the equality filter
# presumably matches no rows for that group — confirm how they should be shown.
all_index, all_count, all_values = [], [], []
for nurse in y_all.nurse.unique():
    vc = y_all.loc[y_all.nurse == nurse][superclass_cols].sum(axis=0)
    all_index.extend(vc.index)
    all_count.extend(vc.values / y_all.shape[0] * 100.)
    all_values.extend([nurse] * len(vc))

df = pd.DataFrame({
    'diagnositic superclass': all_index,
    'percentage over all samples': all_count,
    'nurse': all_values,
})

ax = sns.barplot(
    data=df,
    x="percentage over all samples",
    y="diagnositic superclass",
    hue="nurse",
    orient='h',
)
# Write each bar's percentage at the end of the bar, vertically centred.
for rect in ax.patches:
    ax.text(
        rect.get_width(),
        rect.get_y() + rect.get_height() / 2,
        f"{rect.get_width():.1f}%",
        weight='bold',
    )

In [41]:
sns.set_style("whitegrid")
bar, ax = plt.subplots(figsize=(10, 20))

ax.set_title("Распределение суперклассов в зависимости от кардиографа", fontsize=20)

# For every distinct `device` value, collect the superclass counts of its records
# as a percentage of all records, in long format (label, percentage, group).
all_index, all_count, all_values = [], [], []
for device in y_all.device.unique():
    vc = y_all.loc[y_all.device == device][superclass_cols].sum(axis=0)
    all_index.extend(vc.index)
    all_count.extend(vc.values / y_all.shape[0] * 100.)
    all_values.extend([device] * len(vc))

df = pd.DataFrame({
    'diagnositic superclass': all_index,
    'percentage over all samples': all_count,
    'device': all_values,
})

ax = sns.barplot(
    data=df,
    x="percentage over all samples",
    y="diagnositic superclass",
    hue="device",
    orient='h',
)
# Write each bar's percentage (two decimals here) at the end of the bar.
for rect in ax.patches:
    ax.text(
        rect.get_width(),
        rect.get_y() + rect.get_height() / 2,
        f"{rect.get_width():.2f}%",
        weight='bold',
    )

## 1.2 Данные: CODE test
[Источник](https://zenodo.org/records/3765780)

Набор содержит 827 записей ЭКГ разных пациентов, аннотированных несколькими кардиологами, ординаторами и студентами-медиками.
В нём размечены 6 различных отклонений на ЭКГ:
- 1st degree AV block (1dAVb);
- right bundle branch block (RBBB);
- left bundle branch block (LBBB);
- sinus bradycardia (SB);
- atrial fibrillation (AF);
- sinus tachycardia (ST).


In [None]:
# Fetch one CODE-test sample (person index 2) through the project's EcgSignal helper.
# NOTE(review): show=True, zone='II' and prep=False presumably mean "plot lead II
# without preprocessing" — confirm against EcgSignal.take_sample_codetest.
ecg_sample_codetest = EcgSignal.take_sample_codetest(person_idx=2, show=True, zone='II', prep=False)