In [1]:
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import os
import glob
import pandas as pd

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/numpy —
# consider narrowing the filter to the specific category that is noisy.
warnings.filterwarnings('ignore')

In [2]:
# Load the CSV that stores the reference labels (the most recent label file).
# The file is read without a header row, so the two column names are supplied
# directly to the reader.
path_training = './physionet.org/files/challenge-2017/1.0.0/REFERENCE-v3.csv'
df_label = pd.read_csv(path_training, header=None, names=['filename', 'label'])

In [3]:
# Use the file name as the index and keep only the 'label' column as a Series.
# NOTE(review): the hard-coded 2854 keeps just the first 2854 rows — presumably
# the number of records that have extracted features; confirm and name it.
df_label = df_label.set_index('filename')['label'].iloc[:2854]
display(df_label.head())

filename
A00001    N
A00002    N
A00003    N
A00004    A
A00005    A
Name: label, dtype: object

In [4]:
# Encode the labels numerically (done with numpy so the values are explicit):
#   'N' normal ECG            -> 1
#   'A' atrial fibrillation   -> 2
#   'O' other arrhythmia      -> 3
#   '~' recording with noise  -> 4
# Each array holds its code where the label matches and 0 elsewhere, so the
# element-wise sum is the per-record code (0 for any other label).
normal_ecg, af_ecg, other, noise = (
    np.where(df_label == symbol, code, 0)
    for symbol, code in (('N', 1), ('A', 2), ('O', 3), ('~', 4))
)
labels = normal_ecg + af_ecg + other + noise
display(labels)

array([1, 1, 1, ..., 3, 2, 3])

In [5]:
# Wrap the numeric codes in a single-column DataFrame that shares df_label's index.
df_labels = pd.DataFrame({'label_num': labels}, index=df_label.index)
display(df_labels.head())

Unnamed: 0_level_0,label_num
filename,Unnamed: 1_level_1
A00001,1
A00002,1
A00003,1
A00004,2
A00005,2


In [6]:
# Shift the codes so that they start at zero:
#   normal ECG -> 0, atrial fibrillation -> 1, other arrhythmia -> 2, noisy -> 3
df_labels = df_labels - 1

# Put the text label and its numeric code side by side.
df_label = pd.concat(objs=[df_label, df_labels], axis='columns')
display(df_label.tail())

Unnamed: 0_level_0,label,label_num
filename,Unnamed: 1_level_1,Unnamed: 2_level_1
A02850,N,0
A02851,N,0
A02852,O,2
A02853,A,1
A02854,O,2


In [7]:
def count_arrhythmia(df):
    """
    Print how many records fall into each rhythm class.

    Parameters
    ----------
    df : object supporting df['label_num'] (e.g. a pandas DataFrame)
        Iterating over df['label_num'] must yield the numeric class codes:
        0 = normal ECG, 1 = atrial fibrillation, 2 = other arrhythmia,
        3 = noisy recording.

    Returns
    -------
    None
        Four summary lines are always printed. A fifth line is printed only
        when some values are not one of the four known codes (for example NaN
        or -1), so that such values are not reported as noise.
    """
    from collections import Counter

    # iter() guarantees Counter treats the column as a sequence of values.
    tally = Counter(iter(df['label_num']))

    n_normal = tally[0]
    n_af = tally[1]
    n_other = tally[2]
    n_noise = tally[3]
    # Anything that is not 0..3 (NaN, -1, ...) is counted separately.
    n_unknown = sum(tally.values()) - (n_normal + n_af + n_other + n_noise)

    print(f'正常心電図 : {n_normal}個')
    print(f'心房細動 : {n_af}個')
    print(f'その他の不整脈 : {n_other}個')
    print(f'雑音 : {n_noise}個')
    if n_unknown:
        print(f'未定義のラベル : {n_unknown}個')
    
# Class counts for the label table.
count_arrhythmia(df=df_label)

正常心電図 : 1700個
心房細動 : 288個
その他の不整脈 : 771個
雑音 : 95個


In [8]:
# Load the training data and attach the labels.

path_training = './training_DataFrame'
path_training = glob.glob(path_training + '/train*pkl')

# Exactly one pickle must match the pattern: concatenating several matches
# into one string would produce a path that does not exist, and no match
# would produce an empty path.
if len(path_training) != 1:
    raise FileNotFoundError(
        "expected exactly one file matching './training_DataFrame/train*pkl', "
        f"found {len(path_training)}: {path_training}"
    )
path_training_str = path_training[0]

# Read it as a DataFrame.
# NOTE: unpickling can execute arbitrary code — only load files you produced yourself.
df_training = pd.read_pickle(path_training_str)

# Join with the label data. Column-wise concat aligns rows on the index and
# keeps the union of both indexes, so rows present on only one side get NaN.
df_training = pd.concat([df_training, df_label], axis=1)
display(df_training.head())

Unnamed: 0_level_0,rmssd,sdnn,sdsd,nn50,pnn50,mrri,mhr,ln(sdnn),stdhr,cvrr,label,label_num
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A00001,100.643575,65.318577,102.02774,4.0,10.526316,761.666667,79.330866,4.179276,6.89082,8.575743,N,0
A00002,290.7802,222.416471,295.066494,19.0,57.575758,893.434343,72.730171,5.404552,24.004782,24.894551,N,0
A00003,229.549405,155.065837,231.05302,32.0,41.025641,763.760684,82.196182,5.04385,19.738613,20.302935,N,0
A00004,237.096636,189.473773,241.103148,24.0,77.419355,936.021505,66.654705,5.244251,13.312296,20.242459,A,1
A00005,292.018724,211.556696,293.689064,64.0,73.563218,668.084291,98.951104,5.354493,30.106937,31.666168,A,1


In [9]:
# Remove every sample (row) that contains a missing value.
df_training = df_training.dropna()
df_training.tail()

Unnamed: 0_level_0,rmssd,sdnn,sdsd,nn50,pnn50,mrri,mhr,ln(sdnn),stdhr,cvrr,label,label_num
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A02850,501.102616,337.447882,509.972042,22.0,73.333333,975.0,69.39454,5.821411,27.19332,34.610039,N,0
A02851,371.104004,229.615228,376.855538,18.0,52.941176,744.509804,86.569177,5.436405,22.931642,30.841129,N,0
A02852,477.962646,311.771887,486.78806,20.0,74.074074,986.666667,68.833104,5.742272,28.728158,31.598502,O,2
A02853,365.724726,269.362321,371.066149,31.0,88.571429,851.047619,78.090732,5.596057,26.313719,31.650676,A,1
A02854,106.658266,82.414763,108.41202,20.0,62.5,890.520833,67.956077,4.411765,6.497226,9.25467,O,2


In [10]:
# Number of records per ECG class after the rows with missing values were removed.
count_arrhythmia(df=df_training)

正常心電図 : 1700個
心房細動 : 288個
その他の不整脈 : 771個
雑音 : 94個


In [11]:
# Saving the DataFrame

save_path_dir = './training_DataFrame'

def dataframe_save(df, save_path_dir, file_name):
    """
    Save a DataFrame in both CSV and pickle format.

    The pickle copy is written as well because it is fast to read and write.

    Parameters
    ----------
    df : DataFrame to save.
    save_path_dir : path of the destination directory; it is created when it
        does not exist yet. An empty string means the current directory.
    file_name : base name of the output files, without extension.

    Side effects
    ------------
    Writes <save_path_dir>/<file_name>.csv (index and header included,
    comma-separated) and <save_path_dir>/<file_name>.pkl.
    """
    # Make sure the destination exists so the writes below do not fail on a
    # fresh working directory.
    if save_path_dir:
        os.makedirs(save_path_dir, exist_ok=True)

    # Pass the components separately so that trailing separators and an empty
    # directory are handled by os.path.join instead of string concatenation.
    path_save = os.path.join(save_path_dir, file_name)

    # Save as CSV.
    df.to_csv(path_save + '.csv', index=True, header=True, sep=',')
    # Save as pickle.
    df.to_pickle(path_save + '.pkl')

# Write the labelled training set next to the other training DataFrames.
dataframe_save(df_training, save_path_dir, 'labeled_training_dataset')